Skip to content

Commit 9f1a50f

Browse files
fix: add default cpu_threads (#1948)
* fix: add default cpu_threads
* fix: use half of the available CPU threads

Co-authored-by: vansangpfiev <[email protected]>
1 parent 510ae28 commit 9f1a50f

File tree

4 files changed

+20
-5
lines changed

4 files changed

+20
-5
lines changed

engine/services/model_service.cc

+9
Original file line numberDiff line numberDiff line change
@@ -945,6 +945,11 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
945945

946946
json_helper::MergeJson(json_data, params_override);
947947

948+
// Set default cpu_threads if it is not configured
949+
if (!json_data.isMember("cpu_threads")) {
950+
json_data["cpu_threads"] = GetCpuThreads();
951+
}
952+
948953
// Set the latest ctx_len
949954
if (ctx_len) {
950955
json_data["ctx_len"] =
@@ -1329,6 +1334,10 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
13291334
return warning;
13301335
}
13311336

1337+
int ModelService::GetCpuThreads() const {
1338+
return std::max(std::thread::hardware_concurrency() / 2, 1u);
1339+
}
1340+
13321341
cpp::result<std::shared_ptr<ModelMetadata>, std::string>
13331342
ModelService::GetModelMetadata(const std::string& model_id) const {
13341343
if (model_id.empty()) {

engine/services/model_service.h

+2
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,8 @@ class ModelService {
112112
const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048,
113113
int n_ubatch = 2048, const std::string& kv_cache_type = "f16");
114114

115+
int GetCpuThreads() const;
116+
115117
std::shared_ptr<DatabaseService> db_service_;
116118
std::shared_ptr<HardwareService> hw_service_;
117119
std::shared_ptr<DownloadService> download_service_;

engine/utils/hardware/gguf/gguf_file.h

+8-4
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
#include <filesystem>
88
#include <iostream>
99
#include <memory>
10+
#include <optional>
1011
#include <string>
1112
#include <unordered_set>
1213
#include <variant>
1314
#include <vector>
14-
#include <optional>
1515

1616
#ifdef _WIN32
1717
#include <io.h>
@@ -23,8 +23,8 @@
2323
#endif
2424

2525
#include "ggml.h"
26-
#include "utils/string_utils.h"
2726
#include "utils/logging_utils.h"
27+
#include "utils/string_utils.h"
2828

2929
// #define GGUF_LOG(msg) \
3030
// do { \
@@ -246,11 +246,15 @@ struct GGUFHelper {
246246
file_size = std::filesystem::file_size(file_path);
247247

248248
int fd = open(file_path.c_str(), O_RDONLY);
249+
if (fd == -1) {
250+
CTL_INF("Failed to open file: " << file_path << ", error: " << errno);
251+
return false;
252+
}
249253
// Memory-map the file
250254
data = static_cast<uint8_t*>(
251255
mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0));
252256
if (data == MAP_FAILED) {
253-
perror("Error mapping file");
257+
CTL_INF("Error mapping file");
254258
close(fd);
255259
return false;
256260
}
@@ -482,7 +486,7 @@ struct GGUFFile {
482486
inline std::optional<GGUFFile> ParseGgufFile(const std::string& path) {
483487
GGUFFile gf;
484488
GGUFHelper h;
485-
if(!h.OpenAndMMap(path)) {
489+
if (!h.OpenAndMMap(path)) {
486490
return std::nullopt;
487491
}
488492

engine/utils/hardware/gguf/gguf_file_estimate.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,14 @@ inline float GetQuantBit(const std::string& kv_cache_t) {
6464

6565
inline std::optional<Estimation> EstimateLLaMACppRun(
6666
const std::string& file_path, const RunConfig& rc) {
67-
Estimation res;
6867
// token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit/16 bytes
6968
//RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? Output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - Output_layer_size) : 0 ) (bytes)
7069

7170
// VRAM = total_file_size - RAM (bytes)
7271
auto gf = ParseGgufFile(file_path);
7372
if (!gf)
7473
return std::nullopt;
74+
Estimation res;
7575
int32_t embedding_length = 0;
7676
int64_t n_vocab = 0;
7777
int32_t num_block = 0;

0 commit comments

Comments
 (0)