morganpartee · morganpartee · Dec 22, 2022 · Dec 22, 2022 · Dec 22, 2022 · Dec 22, 2022
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Codegpt
 
-## 0.2.15
+## 0.3
 
 A tool for using GPT just a little quicker. A nearly truly automated footgun. Learn how to revert with git before trying please.
 
@@ -21,7 +21,19 @@ Windows users can also use `setx` like:
 
 from an admin console.
 
-## Be careful! But try this
+## Your first (safe) command
+
+A safe place to start is generating documentation. GPT-3 has a limit of roughly 4000 tokens per completion, so larger files are split into chunks.
+
+```bash
+codegpt docs <paths>
+```
+
+This writes Markdown docs under `./docs/`, one per file (or one per chunk when a file had to be split). This is great when you're coming into a codebase you've never seen before.
+
+## Unsafe Commands
+
+Everything else can modify files. Have someone hold your beer and try some of these (after you've committed your work to git):
 
 Usage
 To try Codegpt, you can run the following command:

diff --git a/codegpt/files.py b/codegpt/files.py
@@ -1,5 +1,9 @@
 import os
 import typer
+import math
+import re
+import magic
+
 
 def load_text(filenames):
     out = {}
@@ -13,15 +17,65 @@ def write_text(files, backup=False):
     # If the backup option is specified and the file exists,
     # write the existing file to <filename>.bak
     for i, out in enumerate(files):
-        filename = out.get('filename', f"{i}.txt")
-        typer.secho(f"Hmm, didn't find a filename, writing to {filename}", color=typer.colors.MAGENTA)
+        filename = out.get("filename", f"{i}.txt")
         if backup and os.path.exists(filename):
             with open(filename, "r") as f_in:
                 with open(f"{filename}.bak", "w") as f_out:
                     f_out.write(f_in.read())
 
         # Write the new text to the file
         with open(filename, "w") as f:
-            f.write(out['code'])
+            f.write(out["code"])
+        if "explanation" in out:
+            typer.secho(f"{filename} - " + out["explanation"], color=typer.colors.BLUE)
+
+
+def split_code_into_chunks(paths, chunk_size):
+    """Run process_file on each path (walking directories) and return the chunk dict."""
+    chunks = {}
+    for path in paths:
+        if not path.is_dir():
+            # A plain file is handed to process_file as-is
+            process_file(path, chunk_size, chunks)
+            continue
+        # Directories are crawled with os.walk; every file found is processed
+        for root, _, filenames in os.walk(path):
+            for filename in filenames:
+                process_file(os.path.join(root, filename), chunk_size, chunks)
+
+    return chunks
+
+def process_file(file_path, chunk_size, chunks):
+    """Split one text file into overlapping token chunks, stored in ``chunks``.
+
+    ``file_path`` may be a ``str`` (as produced by an os.walk crawl) or a
+    ``pathlib.Path``. Files whose python-magic MIME type is not ``text/*`` are
+    skipped. A file that fits in one chunk is keyed by ``file_path``; otherwise
+    each chunk is keyed by "<file stem> - <chunk number>". ``chunks`` is
+    mutated in place and nothing is returned.
+    """
+    # Use the python-magic library to identify the type of the file
+    mime = magic.from_file(file_path, mime=True)
+    if mime.split("/")[0] != "text":
+        # If the file is not a text file, skip it
+        return
+
+    with open(file_path, "r") as f:
+        code = f.read()
+
+    # Split the code into word tokens (punctuation and whitespace are dropped)
+    tokens = re.findall(r"\b\w+\b", code)
+
+    # Determine the number of chunks needed
+    num_chunks = math.ceil(len(tokens) / chunk_size)
+    # os.path handles str and Path alike; ``file_path.stem`` raised on str paths
+    stem = os.path.splitext(os.path.basename(file_path))[0]
 
-        typer.secho(f"{filename} - " + out['explanation'], color=typer.colors.BLUE)
+    # Split the tokens into chunks with a hundred token overlap
+    for i in range(num_chunks):
+        start = max(i * chunk_size - 100, 0)
+        end = min(start + chunk_size + 100, len(tokens))
+        # Multi-chunk files get one numbered key per chunk; others use the path
+        key = f"{stem} - {i + 1}" if num_chunks > 1 else file_path
+        # setdefault creates the list on first use of a key
+        chunks.setdefault(key, []).append(tokens[start:end])
diff --git a/codegpt/gpt_interface.py b/codegpt/gpt_interface.py
@@ -52,9 +52,7 @@ def send_iffy_edit(prompt: str, code: Dict[str, str], clipboard: bool = False, y
         > <the code to be output line 1>
         > <the code to be output, line n...>
 
-        You must include an explanation of what you did, and the code to be output, regardless of the format or file.
-
-        OUTPUT:""")
+        You must include an explanation of what you did, and the code to be output, regardless of the format or file.""")
 
     else:
         full_prompt += dedent("""
@@ -71,9 +69,7 @@ def send_iffy_edit(prompt: str, code: Dict[str, str], clipboard: bool = False, y
         > <code line 1>
         > <code line n...>
 
-        You must include the filename, an explanation of what you did, and the code for the file to be output, regardless of the format or file.
-
-        OUTPUT:""")
+        You must include the filename, an explanation of what you did, and the code for the file to be output, regardless of the format or file.""")
 
     max_tokens = confirm_send(full_prompt, yes=yes, silent=clipboard)
 
@@ -92,6 +88,19 @@ def send_iffy_edit(prompt: str, code: Dict[str, str], clipboard: bool = False, y
         print(response["choices"][0]["text"])
     return parsed[0] if clipboard else parsed
 
+def send_normal_completion(prompt, max_tokens=3000, yes=False):
+    """Send ``prompt`` to text-davinci-003 and return the stripped completion text."""
+    # confirm_send's return value is used as the token limit for the request
+    token_limit = confirm_send(prompt, max_tokens, yes=yes)
+    completion = openai.Completion.create(
+        engine="text-davinci-003",
+        prompt=prompt,
+        max_tokens=token_limit,
+        n=1,
+        temperature=0.6,
+    )
+    # Trim whitespace, then any backticks at either end, then whitespace again
+    return completion["choices"][0]["text"].strip().strip("```").strip()
 
 
 if __name__ == "__main__":

diff --git a/codegpt/main.py b/codegpt/main.py
@@ -3,6 +3,8 @@
 import json
 import logging
 
+import sys
+sys.path.append("../codegpt")
 
 from codegpt import gpt_interface as gpt
 
@@ -12,6 +14,9 @@
 from typing import List, Optional
 from pathlib import Path
 
+from rich.progress import track
+
+
 app = typer.Typer(
     no_args_is_help=True,
 )
@@ -83,7 +88,7 @@ def edit_file(
         return
 
     if raw_out:
-        print(result['code'])
+        print(result.get('code') or result)
         return
 
     files.write_text(result, backup)
@@ -160,6 +165,34 @@ def quick_edit_file(
     typer.secho("Done!", color=typer.colors.BRIGHT_BLUE)
 
 
+@app.command("docs")
+def docs(
+    paths: List[Path] = typer.Argument(None, exists=True, dir_okay=True, file_okay=True),
+):
+    """Generate a markdown review doc under ./docs for each chunk of the given paths."""
+    # ``paths`` defaults to None; fall back to an empty list so the chunker can iterate
+    data = files.split_code_into_chunks(paths or [], 1800)
+
+    # click/typer take the text colour via ``fg``; ``color`` only toggles ANSI output
+    typer.secho(f"Found {len(data)} files. Documenting...", fg=typer.colors.BRIGHT_BLUE)
+
+    for filename, chunk in track(data.items()):
+        try:
+            prompt = prompts.generate_review_instructions(filename, chunk)
+            result = gpt.send_normal_completion(prompt, 4000, True)
+
+            # Write the docs to ./docs/<key>.md, creating parent folders as needed
+            outname = f"./docs/{filename}.md"
+            Path(outname).parent.mkdir(parents=True, exist_ok=True)
+            files.write_text([{'filename': outname, 'code': result}])
+            typer.secho(f"Wrote documentation for {filename} to {outname}", fg=typer.colors.GREEN)
+        except Exception as e:
+            # Best effort: report the failure and carry on with the next chunk
+            import traceback as tb
+            tb.print_exc()
+            typer.secho(f"Error: {e}", fg=typer.colors.RED)
+    typer.secho("Done!")
+
 @app.command("config")
 def config():
     """

diff --git a/codegpt/parse.py b/codegpt/parse.py
@@ -1,6 +1,6 @@
 def parse_resp(response:dict):
     resp = response["choices"][0]["text"].strip().splitlines()
-    
+
     # Initialize an empty list to hold the dictionaries
     out = []
 
@@ -16,27 +16,14 @@ def parse_resp(response:dict):
 
         # If the line doesn't start with '>', it's a key
         if line[0] != '>':
-            if line == '===' and curr_dict:
-                out.append(curr_dict)
-                curr_dict = {}
             # Strip leading/trailing whitespace and remove ':' from the key
-            key = line.strip().replace(":", '')
+            key = line.strip().replace(":", '').lower()
             # Initialize an empty value for this key in the current dictionary
             curr_dict[key] = ""
-        # If the line does start with '>', it's a value
         else:
-            # Strip the leading '>' and leading/trailing whitespace from the value
-            # and add it to the current key in the current dictionary
-            if key == "code":
-                curr_dict[key] += line.strip().strip('> ').strip('>') + '\n'
-            else:
-                curr_dict[key] = line.strip().strip('> ').strip('>').strip()
+            curr_dict[key] += line.strip().strip('> ').strip('>') + '\n'
 
     # Add the final dictionary to the output list
     out.append(curr_dict)
 
-    # Backtop just in case this fails. Tests don't tend to use whole code, so it gets weird.
-    if 'code' not in out:
-        out = [{'code': response["choices"][0]["text"]}]
-
     return out
diff --git a/codegpt/prompts.py b/codegpt/prompts.py
@@ -1,3 +1,5 @@
+from textwrap import dedent
+
 prompts = {
     "comment": "Add or update comments according to the given language's standards. Add or update function, module, or class level comments if they're appropriate.",
     "varnames": "Change variable names, but nothing else, to make the code more readable. For example, instead of using 'x' and 'y', use 'width' and 'height'.",
@@ -26,3 +28,62 @@ def set_username(username):
     this.username = username
 """,
 }
+
+def generate_review_instructions(filename, code):
+    """Build the completion prompt asking for a markdown code review of ``filename``."""
+    return dedent(
+        f"""
+    Please review the code in the file "{filename}" and document your findings in a markdown file. The code is shown below for reference:
+
+    ```
+    {code}
+    ```
+
+    In your markdown file, please include the following information:
+
+    1. A summary of the purpose of the file and its contents.
+    2. A list of all classes and functions defined in the file, along with a brief description of their purpose.
+    3. A list of any external dependencies used in the file, including any libraries or modules imported from outside the project.
+    4. Any bugs or issues you identified while reviewing the code.
+    5. Any areas of the code that you consider to be particularly well-written or poorly-written, and why.
+
+    Please also include any questions or comments you have about the code in your markdown file.
+
+    When you have finished reviewing the code and documenting your findings, please submit your markdown file for review.
+
+    Here is a sample markdown file format you can follow:
+
+    ```md
+    # Code Review: {filename}
+
+    ## Summary
+
+    [Insert summary of the purpose of the file and its contents here.]
+
+    ## Classes and Functions
+
+    [Insert a list of all classes and functions defined in the file, along with a brief description of their purpose.]
+
+    ## External Dependencies
+
+    [Insert a list of any external dependencies used in the file, including any libraries or modules imported from outside the project.]
+
+    ## Bugs and Issues
+
+    [Insert any bugs or issues you identified while reviewing the code.]
+
+    ## Code Quality
+
+    [Insert any comments you have on the quality of the code, including any areas that you consider to be particularly well-written or poorly-written, and why.]
+
+    ## Questions and Comments
+
+    [Insert any questions or comments you have about the code.]
+    ```
+
+    You are an expert, senior developer, give helpful feedback if you find problems. Return your whole response, markdown formatted for github, below.
+
+    Review Doc:
+    ```md
+    """
+    )