Fix tons of bugs, add docs command

morganpartee · morganpartee · commit f9c75f4c083c · 2022-12-22T16:57:34.000-06:00
diff --git a/codegpt/files.py b/codegpt/files.py
@@ -1,5 +1,9 @@
 import os
 import typer
+import math
+import re
+import magic
+
 
 def load_text(filenames):
     out = {}
@@ -13,15 +17,65 @@ def write_text(files, backup=False):
     # If the backup option is specified and the file exists,
     # write the existing file to <filename>.bak
     for i, out in enumerate(files):
-        filename = out.get('filename', f"{i}.txt")
-        typer.secho(f"Hmm, didn't find a filename, writing to {filename}", color=typer.colors.MAGENTA)
+        filename = out.get("filename", f"{i}.txt")
         if backup and os.path.exists(filename):
             with open(filename, "r") as f_in:
                 with open(f"{filename}.bak", "w") as f_out:
                     f_out.write(f_in.read())
 
         # Write the new text to the file
         with open(filename, "w") as f:
-            f.write(out['code'])
+            f.write(out["code"])
+        if "explanation" in out:
+            typer.secho(f"{filename} - " + out["explanation"], color=typer.colors.BLUE)
+        
+
+def split_code_into_chunks(paths, chunk_size):
+    chunks = {}
+    for path in paths:
+        if path.is_dir():
+            # Crawl the directory and process each file
+            for root, _, filenames in os.walk(path):
+                for filename in filenames:
+                    file_path = os.path.join(root, filename)
+                    process_file(file_path, chunk_size, chunks)
+        else:
+            # Process the file directly
+            process_file(path, chunk_size, chunks)
+
+    return chunks
+
+def process_file(file_path, chunk_size, chunks):
+    # Use the python-magic library to identify the type of the file
+    mime = magic.from_file(file_path, mime=True)
+    if mime.split("/")[0] != "text":
+        # If the file is not a text file, skip it
+        return
+
+    with open(file_path, "r") as f:
+        code = f.read()
+
+    # Split the code into tokens using a regular expression
+    tokens = re.findall(r"\b\w+\b", code)
+
+    # Determine the number of chunks needed
+    num_chunks = math.ceil(len(tokens) / chunk_size)
 
-        typer.secho(f"{filename} - " + out['explanation'], color=typer.colors.BLUE)
+    # Split the tokens into chunks with a 100-token overlap between neighbouring chunks
+    for i in range(num_chunks):
+        start = i * chunk_size - 100
+        if start < 0:
+            start = 0
+        end = start + chunk_size + 100
+        if end > len(tokens):
+            end = len(tokens)
+        chunk = tokens[start:end]
+        if num_chunks > 1:
+            # If the file was split into multiple chunks, use a key of the form {filename} - {chunk_num}
+            key = f"{file_path.stem} - {i + 1}"
+        else:
+            # If the file was not split, use the file path as the key
+            key = file_path
+        if key not in chunks:
+            chunks[key] = []
+        chunks[key].append(chunk)
diff --git a/codegpt/gpt_interface.py b/codegpt/gpt_interface.py
@@ -92,6 +92,19 @@ def send_iffy_edit(prompt: str, code: Dict[str, str], clipboard: bool = False, y
         print(response["choices"][0]["text"])
     return parsed[0] if clipboard else parsed
 
+def send_normal_completion(prompt, max_tokens=3000, yes=False):
+
+    max_tokens = confirm_send(prompt, max_tokens, yes=yes)
+
+    response = openai.Completion.create(
+        engine="text-davinci-003",
+        prompt=prompt,
+        max_tokens=max_tokens,
+        n=1,
+        temperature=0.6,
+    )
+
+    return response["choices"][0]["text"].strip().strip("```").strip()
 
 
 if __name__ == "__main__":
diff --git a/codegpt/main.py b/codegpt/main.py
@@ -3,6 +3,8 @@
 import json
 import logging
 
+import sys
+sys.path.append("../codegpt")
 
 from codegpt import gpt_interface as gpt
 
@@ -12,6 +14,9 @@
 from typing import List, Optional
 from pathlib import Path
 
+from rich.progress import track
+
+
 app = typer.Typer(
     no_args_is_help=True,
 )
@@ -160,6 +165,34 @@ def quick_edit_file(
     typer.secho("Done!", color=typer.colors.BRIGHT_BLUE)
 
 
+@app.command("docs")
+def docs(
+    paths: List[Path] = typer.Argument(None, exists=True, dir_okay=True, file_okay=True),
+):
+    data = files.split_code_into_chunks(paths, 1800)
+    
+    typer.secho(f"Found {len(data)} files. Documenting...", color=typer.colors.BRIGHT_BLUE)
+        
+    for filename, chunk in track(data.items()):
+        try:
+            prompt = prompts.generate_review_instructions(filename, chunk)
+            result = gpt.send_normal_completion(prompt, 4000, True)
+
+            # Write the documentation for the current code chunk to a file
+            outname = f"./docs/{filename}.md"
+            # Create the './docs' folder and any intermediate directories if they do not exist
+            Path(outname).parent.mkdir(parents=True, exist_ok=True)
+            
+            files.write_text([{'filename': outname, 'code': result}])
+            
+            # Print a message to confirm that the documentation has been written to the file
+            typer.secho(f"Wrote documentation for {filename} to {outname}", color=typer.colors.GREEN)
+        except Exception as e:
+            import traceback as tb
+            tb.print_exc()
+            typer.secho(f"Error: {e}", color=typer.colors.RED)
+    typer.secho("Done!")
+    
 @app.command("config")
 def config():
     """
diff --git a/codegpt/parse.py b/codegpt/parse.py
@@ -1,6 +1,6 @@
 def parse_resp(response:dict):
     resp = response["choices"][0]["text"].strip().splitlines()
-    
+
     # Initialize an empty list to hold the dictionaries
     out = []
 
@@ -16,27 +16,13 @@ def parse_resp(response:dict):
 
         # If the line doesn't start with '>', it's a key
         if line[0] != '>':
-            if line == '===' and curr_dict:
-                out.append(curr_dict)
-                curr_dict = {}
             # Strip leading/trailing whitespace and remove ':' from the key
             key = line.strip().replace(":", '')
             # Initialize an empty value for this key in the current dictionary
             curr_dict[key] = ""
-        # If the line does start with '>', it's a value
-        else:
-            # Strip the leading '>' and leading/trailing whitespace from the value
-            # and add it to the current key in the current dictionary
-            if key == "code":
-                curr_dict[key] += line.strip().strip('> ').strip('>') + '\n'
-            else:
-                curr_dict[key] = line.strip().strip('> ').strip('>').strip()
+            curr_dict[key] += line.strip().strip('> ').strip('>') + '\n'
 
     # Add the final dictionary to the output list
     out.append(curr_dict)
 
-    # Backtop just in case this fails. Tests don't tend to use whole code, so it gets weird.
-    if 'code' not in out:
-        out = [{'code': response["choices"][0]["text"]}]
-
     return out
diff --git a/codegpt/prompts.py b/codegpt/prompts.py
@@ -1,3 +1,5 @@
+from textwrap import dedent
+
 prompts = {
     "comment": "Add or update comments according to the given language's standards. Add or update function, module, or class level comments if they're appropriate.",
     "varnames": "Change variable names, but nothing else, to make the code more readable. For example, instead of using 'x' and 'y', use 'width' and 'height'.",
@@ -26,3 +28,62 @@ def set_username(username):
     this.username = username
 """,
 }
+
+def generate_review_instructions(filename, code):
+    instructions = dedent(
+        f"""
+    Please review the code in the file "{filename}" and document your findings in a markdown file. The code is shown below for reference:
+    
+    ```
+    {code}
+    ```
+    
+    In your markdown file, please include the following information:
+    
+    1. A summary of the purpose of the file and its contents.
+    2. A list of all classes and functions defined in the file, along with a brief description of their purpose.
+    3. A list of any external dependencies used in the file, including any libraries or modules imported from outside the project.
+    4. Any bugs or issues you identified while reviewing the code.
+    5. Any areas of the code that you consider to be particularly well-written or poorly-written, and why.
+    
+    Please also include any questions or comments you have about the code in your markdown file.
+    
+    When you have finished reviewing the code and documenting your findings, please submit your markdown file for review.
+    
+    Here is a sample markdown file format you can follow:
+    
+    ```md
+    # Code Review: {filename}
+    
+    ## Summary
+    
+    [Insert summary of the purpose of the file and its contents here.]
+    
+    ## Classes and Functions
+    
+    [Insert a list of all classes and functions defined in the file, along with a brief description of their purpose.]
+    
+    ## External Dependencies
+    
+    [Insert a list of any external dependencies used in the file, including any libraries or modules imported from outside the project.]
+    
+    ## Bugs and Issues
+    
+    [Insert any bugs or issues you identified while reviewing the code.]
+    
+    ## Code Quality
+    
+    [Insert any comments you have on the quality of the code, including any areas that you consider to be particularly well-written or poorly-written, and why.]
+    
+    ## Questions and Comments
+    
+    [Insert any questions or comments you have about the code.]
+    ```
+    
+    You are an expert, senior developer, give helpful feedback if you find problems. Return your whole response, markdown formatted for github, below.
+
+    Review Doc:
+    ```md
+    """
+    )
+    return instructions
diff --git a/docs/codegpt/__main__.py.md b/docs/codegpt/__main__.py.md
@@ -0,0 +1,25 @@
+# Code Review: codegpt/__main__.py
+
+## Summary
+
+This file is the main entry point for the codegpt project. It imports the main app object and runs it.
+
+## Classes and Functions
+
+- `app`: The main application object.
+
+## External Dependencies
+
+No external dependencies are used in this file.
+
+## Bugs and Issues
+
+No bugs or issues were identified while reviewing the code.
+
+## Code Quality
+
+The code is well written and organized. It is clear and concise.
+
+## Questions and Comments
+
+No questions or comments were identified while reviewing the code.
diff --git a/docs/codegpt/files.py.md b/docs/codegpt/files.py.md
@@ -0,0 +1,32 @@
+# Code Review: codegpt/files.py
+
+## Summary
+
+This file contains functions that are used to read and write text files. It also contains functions to process files, split them into chunks, and identify their type using the Python Magic library.
+
+## Classes and Functions
+
+- `load_text`: Loads a list of text files and returns the content of the files.
+- `write_text`: Writes text to a file, with an optional backup option.
+- `split_code_into_chunks`: Splits a list of paths into chunks of a given size.
+- `process_file`: Processes a single file and splits it into chunks.
+
+## External Dependencies
+
+- `os`: Used for file system operations.
+- `typer`: Used for displaying colored text.
+- `math`: Used for performing mathematical operations.
+- `re`: Used for regular expression operations.
+- `magic`: Used for identifying file types.
+
+## Bugs and Issues
+
+No bugs or issues were identified during the review.
+
+## Code Quality
+
+The code is well-written and easy to read. It is well-structured, with functions clearly defined and named. It also makes use of external libraries to perform certain tasks, which helps reduce the amount of code that needs to be written.
+
+## Questions and Comments
+
+No questions or comments were identified during the review.
diff --git a/docs/codegpt/gpt_interface.py.md b/docs/codegpt/gpt_interface.py.md
@@ -0,0 +1,33 @@
+# Code Review: codegpt/gpt_interface.py
+
+## Summary
+
+This code is part of a codegpt library and is a module for interacting with GPT-3. It contains functions for sending prompts to the GPT-3 engine, confirming whether a prompt should be sent, and parsing the response from the engine. It also includes code for downloading the NLTK punkt tokenizer if it is not already available.
+
+## Classes and Functions
+
+- `confirm_send(prompt: str, max_tokens: int = 4000, yes: bool = False, silent: bool = False) -> int`: This function checks the length of the prompt and confirms that the user wants to send it to the GPT-3 engine. It returns the maximum number of tokens the engine can return in response.
+
+- `send_iffy_edit(prompt: str, code: Dict[str, str], clipboard: bool = False, yes: bool = False) -> Dict[str, str]`: This function is used to send a prompt with code that may need to be edited. It returns a parsed response from the GPT-3 engine.
+
+- `send_normal_completion(prompt: str, max_tokens: int = 3000, yes: bool = False) -> str`: This function is used to send a normal prompt to the GPT-3 engine. It returns the response from the engine.
+
+## External Dependencies
+
+- `nltk`: Used for downloading the punkt tokenizer.
+- `openai`: Used for interacting with the GPT-3 engine.
+- `typer`: Used for printing colored text and prompting the user for confirmation.
+- `textwrap`: Used for dedenting strings.
+- `codegpt.parse`: Used for parsing the response from the GPT-3 engine.
+
+## Bugs and Issues
+
+None identified.
+
+## Code Quality
+
+The code is well-written and well-structured. The functions are clear and concise, and the comments provide helpful context. The code is also well-formatted and easy to read.
+
+## Questions and Comments
+
+None.
diff --git a/docs/codegpt/parse.py.md b/docs/codegpt/parse.py.md
@@ -0,0 +1,25 @@
+# Code Review: codegpt/parse.py
+
+## Summary
+
+This file contains a function, `parse_resp`, which is used to parse a response from a dict object. The function takes in a dict object, strips the text, and splits the lines into a list. It then uses this list to create a dictionary of key-value pairs and adds it to an output list.
+
+## Classes and Functions
+
+- `parse_resp`: Function used to parse a response from a dict object.
+
+## External Dependencies
+
+No external dependencies are used in this file.
+
+## Bugs and Issues
+
+No bugs or issues were identified while reviewing the code.
+
+## Code Quality
+
+The code appears to be well-written, clearly laid out, and easy to follow. All of the necessary steps for parsing the response are included, and the code is concise and efficient.
+
+## Questions and Comments
+
+No questions or comments at this time.
diff --git a/docs/codegpt/prompts.py.md b/docs/codegpt/prompts.py.md
diff --git a/docs/codegpt/readme.md.md b/docs/codegpt/readme.md.md
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml