
Commit 62dfbd4

Questions-based prompt and response translator
1 parent 0fb9ff0 commit 62dfbd4

File tree

4 files changed (+152, -54 lines)


src/target_tools/ollama/src/prompts.py (+33, -1)

```diff
@@ -471,9 +471,41 @@ def id_func ( arg ):
 
 Analyze the provided Python code and determine the types of various elements. Answer the following questions based on your analysis.
 
+Python Code:
+{code}
+
 Questions:
 {questions}
 
-Python Code:
+Your Answers:
+{answers}
+"""
+
+questions_based_2 = """
+## Task Description
+
+**Objective**: Examine and identify the data types of various elements such as function parameters, local variables, and function return types in the given Python code.
+
+**Instructions**:
+1. For each question below, provide a concise, one-word answer indicating the data type.
+2. For arguments and variables inside a function, list every data type they take within the current program context as a comma separated list.
+3. Do not include additional explanations or commentary in your answers.
+
+**Python Code Provided**:
+```python
 {code}
+```
+
+**Questions**:
+{questions}
+
+**Format for Answers**:
+- Provide your answer next to each question number, using only one word.
+- Example:
+1. int
+2. float
+3. str
+
+**Your Answers**:
+{answers}
 """
```

src/target_tools/ollama/src/runner.py (+69, -49)

```diff
@@ -2,6 +2,7 @@
 import json
 import logging
 import multiprocessing
+import os
 import re
 import shutil
 import sys
@@ -12,6 +13,7 @@
 from typing import List, Optional
 
 import prompts
+import translator
 import utils
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
@@ -22,8 +24,9 @@
 from langchain.pydantic_v1 import BaseModel
 
 AUTOFIX_WITH_OPENAI = False
-ENABLE_STREAMING = False
+ENABLE_STREAMING = True
 REQUEST_TIMEOUT = 60
+USE_MULTIPROCESSING_FOR_TERMINATION = True
 
 
 class TypeEvalPySchema(BaseModel):
@@ -39,6 +42,7 @@ class TypeEvalPySchema(BaseModel):
     "json_based_1": prompts.json_based_1,
     "json_based_2": prompts.json_based_2,
     "questions_based_1": prompts.questions_based_1,
+    "questions_based_2": prompts.questions_based_2,
 }
 
 # Create a logger
@@ -76,17 +80,20 @@ def get_prompt(prompt_id, code_path, json_filepath):
     with open(code_path, "r") as file:
         code = file.read()
 
-    if prompt_id == "questions_based_1":
+    if prompt_id in ["questions_based_1", "questions_based_2"]:
         questions_from_json = utils.generate_questions_from_json(json_filepath)
 
         prompt = PromptTemplate(
             template=PROMPTS_MAP[prompt_id],
-            input_variables=["code", "questions"],
+            input_variables=["code", "questions", "answers"],
         )
 
         prompt_data = {
             "code": code,
-            "questions": "\nResult:\n".join(questions_from_json),
+            "questions": "\n".join(questions_from_json),
+            "answers": "\n".join(
+                [f"{x}." for x in range(1, len(questions_from_json) + 1)]
+            ),
         }
     elif prompt_id in ["json_based_1", "json_based_2"]:
         parser = PydanticOutputParser(pydantic_object=TypeEvalPySchema)
@@ -112,32 +119,35 @@ def process_file(file_path, llm, openai_llm, prompt_id):
     json_filepath = str(file_path).replace(".py", "_gt.json")
     result_filepath = str(file_path).replace(".py", f"_result.json")
 
-    # Queue for communication between processes
-    queue = multiprocessing.Queue()
+    if USE_MULTIPROCESSING_FOR_TERMINATION:
+        # Queue for communication between processes
+        queue = multiprocessing.Queue()
 
-    # Create a process for llm.invoke
-    process = multiprocessing.Process(
-        target=invoke_llm,
-        args=(llm, get_prompt(prompt_id, file_path, json_filepath), queue),
-    )
-    process.start()
+        # Create a process for llm.invoke
+        process = multiprocessing.Process(
+            target=invoke_llm,
+            args=(llm, get_prompt(prompt_id, file_path, json_filepath), queue),
+        )
+        process.start()
 
-    # Wait for the process to finish with a timeout (e.g., 60 seconds)
-    process.join(timeout=REQUEST_TIMEOUT)
+        # Wait for the process to finish with a timeout (e.g., 60 seconds)
+        process.join(timeout=REQUEST_TIMEOUT)
 
-    if process.is_alive():
-        logger.info(f"Timeout occurred for {file_path}")
-        process.terminate()  # Terminate the process if it's still running
-        process.join()
-        logger.info(f"{file_path} failed: Not a valid JSON")
-        raise utils.TimeoutException("json")
+        if process.is_alive():
+            logger.info(f"Timeout occurred for {file_path}")
+            process.terminate()  # Terminate the process if it's still running
+            process.join()
+            logger.info(f"{file_path} failed: Not a valid JSON")
+            raise utils.TimeoutException("json")
 
-    result = queue.get_nowait()
+        result = queue.get_nowait()
 
-    if isinstance(result, Exception):
-        raise result
+        if isinstance(result, Exception):
+            raise result
 
-    output = result
+        output = result
+    else:
+        output = llm.invoke(get_prompt(prompt_id, file_path, json_filepath))
 
     if isinstance(llm, ChatOpenAI):
         output = output.content
@@ -157,7 +167,13 @@ def process_file(file_path, llm, openai_llm, prompt_id):
 
     logger.info(output)
 
-    is_valid_json = utils.generate_json_file(result_filepath, output)
+    if prompt_id == "questions_based_2":
+        answers_json = utils.generate_json_from_answers(json_filepath, output)
+        translated_json = translator.translate_content(answers_json)
+    else:
+        translated_json = translator.translate_content(output)
+
+    is_valid_json = utils.generate_json_file(result_filepath, translated_json)
     if not is_valid_json:
         logger.info(f"{file_path} failed: Not a valid JSON")
         raise utils.JsonException("json")
```
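
The `process_file` change above guards the classic hard-timeout pattern behind `USE_MULTIPROCESSING_FOR_TERMINATION`: run the blocking `llm.invoke` in a child process, `join` with a timeout, and `terminate` the child if it is still alive. A self-contained sketch of the same pattern, with a hypothetical `slow_call` standing in for `invoke_llm`:

```python
import multiprocessing
import time


def slow_call(seconds, queue):
    # Hypothetical stand-in for invoke_llm: does the blocking work and
    # pushes either the result or the raised exception onto the queue.
    try:
        time.sleep(seconds)
        queue.put("done")
    except Exception as e:
        queue.put(e)


if __name__ == "__main__":
    queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=slow_call, args=(120, queue))
    process.start()
    process.join(timeout=5)  # wait at most 5 seconds

    if process.is_alive():
        process.terminate()  # hard-kill the stuck worker, then reap it
        process.join()
        print("timed out")
    else:
        result = queue.get_nowait()
        if isinstance(result, Exception):
            raise result
        print(result)
```

A terminated child can be killed even while stuck in a C-level call, which thread-based timeouts cannot do. It also explains the comment in the next hunk about recreating the llm object for each file.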
```diff
@@ -184,33 +200,37 @@ def main_runner(args):
 
     python_files = list_python_files(results_dst)
 
-    if model.startswith("gpt-"):
-        # OpenAI models
-        llm = ChatOpenAI(
-            model_name=model,
-            temperature=temperature,
-            openai_api_key=args.openai_key,
-        )
-
-    else:
-        llm = Ollama(
-            model=model,
-            callback_manager=(
-                CallbackManager([StreamingStdOutCallbackHandler()])
-                if ENABLE_STREAMING
-                else None
-            ),
-            temperature=temperature,
-            timeout=REQUEST_TIMEOUT,
-        )
-        llm.base_url = args.ollama_url
-        if utils.is_ollama_online(llm.base_url):
+    if not model.startswith("gpt-"):
+        if utils.is_ollama_online(args.ollama_url):
             logger.info("Ollama is online!")
         else:
             logger.error("Ollama server is not online!!!")
             sys.exit(-1)
 
     for file in python_files:
+        # Recreating llm object each iteration since we might force terminate in thread
+        # Maybe there is another better way to do this
+        if model.startswith("gpt-"):
+            # OpenAI models
+            llm = ChatOpenAI(
+                model_name=model,
+                temperature=temperature,
+                openai_api_key=args.openai_key,
+            )
+
+        else:
+            llm = Ollama(
+                model=model,
+                callback_manager=(
+                    CallbackManager([StreamingStdOutCallbackHandler()])
+                    if ENABLE_STREAMING
+                    else None
+                ),
+                temperature=temperature,
+                timeout=REQUEST_TIMEOUT,
+            )
+            llm.base_url = args.ollama_url
+
         prompt_start_time = time.time()
         try:
             logger.info(file)
@@ -229,9 +249,9 @@ def main_runner(args):
 
         files_analyzed += 1
         logger.info(
-            f"Progress: {files_analyzed}/{len(python_files)} | Errors/JSON:"
-            f" {error_count}/{json_count} | PromptTime:"
-            f" {time.time()-prompt_start_time}"
+            f"Progress: {files_analyzed}/{len(python_files)} | Total Errors / JSON"
+            f" Errors / Timeouts: {error_count},{json_count},{timeout_count} |"
+            f" PromptTime: {time.time()-prompt_start_time}"
        )
 
        logger.info(
```
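
The final hunk above reworks the progress line to report total errors, JSON errors, and timeouts as separate counters. A tiny sketch with hypothetical values shows the new format:

```python
# Hypothetical counter values to illustrate the new log line:
files_analyzed, python_files = 5, range(10)
error_count, json_count, timeout_count = 3, 2, 1
prompt_time = 4.2
print(
    f"Progress: {files_analyzed}/{len(python_files)} | Total Errors / JSON"
    f" Errors / Timeouts: {error_count},{json_count},{timeout_count} |"
    f" PromptTime: {prompt_time}"
)
# Progress: 5/10 | Total Errors / JSON Errors / Timeouts: 3,2,1 | PromptTime: 4.2
```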
src/target_tools/ollama/src/translator.py (new file, +19 lines; the filename was dropped by the page extraction and is inferred from `import translator` in runner.py)

```diff
@@ -0,0 +1,19 @@
+import json
+
+
+def translate_content(data):
+    type_mapping = {
+        "integer": "int",
+        "string": "str",
+        "function": "callable",
+        "none": "Nonetype",
+    }
+
+    for entry in data:
+        translated_types = [
+            type_mapping[t.lower()] if t.lower() in type_mapping else t
+            for t in entry["type"]
+        ]
+        entry["type"] = translated_types
+
+    return data
```
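
A quick usage sketch (data hypothetical): `translate_content` canonicalizes common spellings of type names that models produce into the short names the benchmark compares against, passing unknown names through unchanged:

```python
# Assuming the new module is importable from the same directory:
from translator import translate_content

entries = [
    {"function": "add", "type": ["Integer", "String"]},
    {"variable": "f", "type": ["function", "None", "bytes"]},
]
print(translate_content(entries))
# [{'function': 'add', 'type': ['int', 'str']},
#  {'variable': 'f', 'type': ['callable', 'Nonetype', 'bytes']}]
```

The lookup is case-insensitive via `t.lower()`, and the entries are mutated in place as well as returned.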

src/target_tools/ollama/src/utils.py (+31, -4)

```diff
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 import shutil
 import sys
 
@@ -80,6 +81,31 @@ def generate_json_file(filename, type_info):
     return is_valid_json
 
 
+def generate_json_from_answers(gt_json_file, answers):
+    try:
+        with open(gt_json_file, "r") as file:
+            gt_data = json.load(file)
+
+        pattern = re.compile(r"^\s*(\d+)\.\s+(.+)\s*$", re.MULTILINE)
+        parsed_answers = pattern.findall(answers)
+
+        if len(gt_data) != len(parsed_answers):
+            return False
+
+        answers_json_data = []
+        for fact in range(len(gt_data)):
+            _result = gt_data[fact]
+            _result.pop("type")
+            _result["type"] = [x.strip() for x in parsed_answers[fact][1].split(",")]
+            answers_json_data.append(_result)
+
+        return answers_json_data
+    except Exception as e:
+        print("Error generating json from questions")
+        print(e)
+        return False
+
+
 def generate_questions_from_json(json_file):
     # Read and parse the JSON file
     with open(json_file, "r") as file:
```
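
`generate_json_from_answers` pairs the ground-truth facts with the model's numbered reply, and the regex does the heavy lifting: with `re.MULTILINE`, `^\s*(\d+)\.\s+(.+)\s*$` captures one (number, answer) tuple per line, after which comma-separated answers become a list of types. A sketch of just the parsing step, on a hypothetical reply:

```python
import re

reply = """1. int
2. str, int
3. callable"""

pattern = re.compile(r"^\s*(\d+)\.\s+(.+)\s*$", re.MULTILINE)
parsed = pattern.findall(reply)
print(parsed)
# [('1', 'int'), ('2', 'str, int'), ('3', 'callable')]

types = [[t.strip() for t in answer.split(",")] for _, answer in parsed]
print(types)
# [['int'], ['str', 'int'], ['callable']]
```

If the model returns more or fewer numbered lines than there are ground-truth facts, the length check fails and the function returns False.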
```diff
@@ -98,26 +124,26 @@ def generate_questions_from_json(json_file):
             question = (
                 "What is the return type of the function"
                 f" '{entry['function']}' at line {line_number}, column"
-                f" {col_offset}?"
+                f" {col_offset}? Reply in one word."
             )
         # Function Parameter type
         elif "parameter" in entry:
             question = (
                 f"What is the type of the parameter '{entry['parameter']}' at line"
                 f" {line_number}, column {col_offset}, within the function"
-                f" '{entry['function']}'?"
+                f" '{entry['function']}'? Reply in one word."
             )
         # Variable in a function type
         elif "variable" in entry and "function" not in entry:
             question = (
                 f"What is the type of the variable '{entry['variable']}' at line"
-                f" {line_number}, column {col_offset}?"
+                f" {line_number}, column {col_offset}? Reply in one word."
             )
         elif "variable" in entry and "function" in entry:
             question = (
                 f"What is the type of the variable '{entry['variable']}' at line"
                 f" {line_number}, column {col_offset}, within the function"
-                f" '{entry['function']}'?"
+                f" '{entry['function']}'? Reply in one word."
             )
         else:
             print("ERROR! Type could not be converted to types")
@@ -127,4 +153,5 @@ def generate_questions_from_json(json_file):
         print("ERROR! Type questions length does not match json length")
         sys.exit(-1)
 
+    questions = [f"{x}. {y}" for x, y in zip(range(1, len(questions) + 1), questions)]
     return questions
```
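
The final hunk numbers each generated question so the answer parser can match replies back to facts by index. A tiny sketch of that numbering, with hypothetical questions (`enumerate(questions, start=1)` would be the more idiomatic equivalent of zipping over a range):

```python
questions = [
    "What is the return type of the function 'add' at line 1, column 0?"
    " Reply in one word.",
    "What is the type of the parameter 'a' at line 1, column 8, within"
    " the function 'add'? Reply in one word.",
]
questions = [f"{x}. {y}" for x, y in zip(range(1, len(questions) + 1), questions)]
for q in questions:
    print(q)
# 1. What is the return type of the function 'add' at line 1, column 0? Reply in one word.
# 2. What is the type of the parameter 'a' at line 1, column 8, within the function 'add'? Reply in one word.
```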
