Commit 8ffb595

fix: by default, try it in ollama
1 parent 77f3755 commit 8ffb595

2 files changed (+12, -18 lines)

README.md (+8, -8)

@@ -76,28 +76,28 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a
 operate -m claude-3
 ```
 
-#### Try LLaVa Hosted Through Ollama `-m llava`
-If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama!
+#### Try a model Hosted Through Ollama `-m llama3.2-vision`
+If you wish to experiment with the Self-Operating Computer Framework using e.g. LLaVA on your own machine, you can with Ollama!
 *Note: Ollama currently only supports MacOS and Linux. Windows now in Preview*
 
 First, install Ollama on your machine from https://ollama.ai/download.
 
-Once Ollama is installed, pull the LLaVA model:
+Once Ollama is installed, pull the vision model:
 ```
-ollama pull llava
+ollama pull llama3.2-vision
 ```
 This will download the model on your machine which takes approximately 5 GB of storage.
 
-When Ollama has finished pulling LLaVA, start the server:
+When Ollama has finished pulling llama3.2-vision, start the server:
 ```
 ollama serve
 ```
 
-That's it! Now start `operate` and select the LLaVA model:
+That's it! Now start `operate` and select the model:
 ```
-operate -m llava
+operate -m llama3.2-vision
 ```
-**Important:** Error rates when using LLaVA are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
+**Important:** Error rates when using self-hosted models are very high. This is simply intended to be a base to build off of as local multimodal models improve over time.
 
 Learn more about Ollama at its [GitHub Repository](https://www.github.com/ollama/ollama)
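
The README's flow is pull, then serve, then run `operate`. As an optional sanity check between the `ollama serve` and `operate -m llama3.2-vision` steps, the local server can be queried from Python; a minimal sketch, assuming the `ollama` Python client is installed (`pip install ollama`) and the server is running:

```python
import ollama

# Ask the local Ollama server (started with `ollama serve`) which models are
# available; the pulled vision model, e.g. llama3.2-vision, should appear.
# Raises a connection error if the server is not running.
print(ollama.list())
```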

operate/models/apis.py (+4, -10)

@@ -50,14 +50,11 @@ async def get_next_action(model, messages, objective, session_id):
         return "coming soon"
     if model == "gemini-pro-vision":
         return call_gemini_pro_vision(messages, objective), None
-    if "llava" in model:
-        operation = call_ollama_llava(messages, model)
-        return operation, None
     if model == "claude-3":
         operation = await call_claude_3_with_ocr(messages, objective, model)
         return operation, None
-    raise ModelNotRecognizedException(model)
-
+    operation = call_ollama_llava(model, messages)
+    return operation, None
 
 def call_gpt_4o(messages):
     if config.verbose:
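
The practical effect of this hunk: `get_next_action` no longer raises `ModelNotRecognizedException` for unrecognized model names; any name without an explicit branch is handed to the Ollama path. A minimal, self-contained sketch of that fallback pattern, using illustrative stand-in functions rather than the real implementations:

```python
# Illustrative stand-ins only: route_model mirrors the shape of the new
# get_next_action dispatch, not the real implementation.
def call_claude_3(messages):
    return {"backend": "anthropic"}

def call_ollama(model, messages):
    return {"backend": "ollama", "model": model}

def route_model(model, messages):
    if model == "claude-3":
        return call_claude_3(messages)
    # Previously this was: raise ModelNotRecognizedException(model).
    # Now any unmatched name (e.g. "llama3.2-vision", "llava") goes to Ollama.
    return call_ollama(model, messages)

print(route_model("claude-3", []))         # {'backend': 'anthropic'}
print(route_model("llama3.2-vision", []))  # {'backend': 'ollama', 'model': 'llama3.2-vision'}
```
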
@@ -557,10 +554,7 @@ async def call_gpt_4o_labeled(messages, objective, model):
             traceback.print_exc()
         return call_gpt_4o(messages)
 
-
-def call_ollama_llava(messages, model):
-    if model == "":
-        model = "llava"
+def call_ollama_llava(model, messages):
     if config.verbose:
         print(f"[call_ollama_llava] model {model}")
     time.sleep(1)
@@ -635,7 +629,7 @@ def call_ollama_llava(messages, model):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_ollama_llava(messages, model)
+        return call_ollama_llava(model, messages)
 
 
 async def call_claude_3_with_ocr(messages, objective, model):
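
The body of `call_ollama_llava` is not shown in this diff; only its reordered signature `call_ollama_llava(model, messages)` appears. For orientation, a hedged sketch of a local vision call through the `ollama` Python client; the helper `ask_vision_model` and its arguments are illustrative, not taken from the repository:

```python
import ollama

def ask_vision_model(model, prompt, screenshot_path):
    # Send one user turn with an attached screenshot to the local Ollama
    # server (requires `ollama serve` and a pulled vision model).
    response = ollama.chat(
        model=model,  # e.g. "llama3.2-vision" or "llava"
        messages=[{"role": "user", "content": prompt, "images": [screenshot_path]}],
    )
    return response["message"]["content"]

# Example:
# print(ask_vision_model("llama3.2-vision", "Describe this screen.", "screenshot.png"))
```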
