This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 58c071c

vansangpfiev, sangjanai, and OHaiiBuzzle authored
chore: sync main to dev (#1978)
* feat: AMD hardware API (#1797)
  * feat: add amd gpu windows
  * chore: remove unused code
  * feat: get amd gpus
  * fix: clean
  * chore: cleanup
  * fix: set activate
  * fix: build windows
  * feat: linux
  * fix: add patches
  * fix: map cuda gpus
  * fix: build
  * chore: docs
  * fix: build
  * chore: clean up
  * fix: build
  * fix: build
  * chore: pack vulkan windows
  * chore: vulkan linux
  Co-authored-by: vansangpfiev <[email protected]>
* fix: add cpu usage (#1868)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: PATCH method for Thread and Messages management (#1923)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: ignore compute_cap if not present (#1866)
  * fix: ignore compute_cap if not present
  * fix: correct gpu info
  * fix: remove check for toolkit version
  Co-authored-by: vansangpfiev <[email protected]>
* fix: models.cc: symlinked model deletion shouldn't remove original file (#1918)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: correct gpu info list (#1944)
  * fix: correct gpu info list
  * chore: cleanup
  Co-authored-by: vansangpfiev <[email protected]>
* fix: gpu: filter out llvmpipe
* fix: add vendor in gpu info (#1952)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: correct get server name method (#1953)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: map nvidia and vulkan uuid (#1954)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: permission issue for default drogon uploads folder (#1870)
  Co-authored-by: vansangpfiev <[email protected]>
* chore: change timeout
* fix: make get hardware info function thread-safe (#1956)
  Co-authored-by: vansangpfiev <[email protected]>
* fix: cache data for gpu information (#1959)
  * fix: wrap vulkan gpu function
  * fix: init
  * fix: cpu usage
  * fix: build windows
  * fix: buld macos
  Co-authored-by: vansangpfiev <[email protected]>
* fix: handle path with space (#1963)
* fix: unload engine before updating (#1970)
  Co-authored-by: sangjanai <[email protected]>
* fix: auto-reload model for remote engine (#1971)
  Co-authored-by: sangjanai <[email protected]>
* fix: use updated configuration for remote model when reload (#1972)
  Co-authored-by: sangjanai <[email protected]>
* fix: correct engine interface order (#1974)
  Co-authored-by: sangjanai <[email protected]>
* fix: improve error handling for remote engine (#1975)
  Co-authored-by: sangjanai <[email protected]>
* fix: temporarily remove model setting recommendation (#1977)
  Co-authored-by: sangjanai <[email protected]>

Co-authored-by: vansangpfiev <[email protected]>
Co-authored-by: OHaiiBuzzle <[email protected]>
1 parent bb6d60b commit 58c071c

22 files changed: +352 −220 lines changed

docs/docs/architecture/cortex-db.mdx

Lines changed: 3 additions & 5 deletions

```diff
@@ -15,15 +15,14 @@ import TabItem from "@theme/TabItem";
 This document outlines Cortex database architecture which is designed to store and manage models, engines,
 files and more.
 
-## Tables Structure
-
+## Table Structure
 ### schema Table
-
 The `schema` table is designed to hold schema version for cortex database. Below is the structure of the table:
 
 | Column Name | Data Type | Description |
 |--------------------|-----------|---------------------------------------------------------|
-| version | INTEGER | A unique schema version for database. |
+| schema_version | INTEGER | A unique schema version for database. |
+
 
 ### models Table
 The `models` table is designed to hold metadata about various AI models. Below is the structure of the table:
@@ -53,7 +52,6 @@ The `hardware` table is designed to hold metadata about hardware information. Be
 | activated | INTEGER | A boolean value (0 or 1) indicating whether the hardware is activated or not. |
 | priority | INTEGER | An integer value representing the priority associated with the hardware. |
 
-
 ### engines Table
 The `engines` table is designed to hold metadata about the different engines available for useage with Cortex.
 Below is the structure of the table:
```

engine/CMakeLists.txt

Lines changed: 0 additions & 1 deletion

```diff
@@ -73,7 +73,6 @@ if(CMAKE_BUILD_INJA_TEST)
   add_subdirectory(examples/inja)
 endif()
 
-
 find_package(jsoncpp CONFIG REQUIRED)
 find_package(Drogon CONFIG REQUIRED)
 find_package(yaml-cpp CONFIG REQUIRED)
```

engine/cli/commands/server_start_cmd.cc

Lines changed: 6 additions & 6 deletions

```diff
@@ -66,16 +66,16 @@ bool ServerStartCmd::Exec(const std::string& host, int port,
   si.cb = sizeof(si);
   ZeroMemory(&pi, sizeof(pi));
   std::wstring params = L"--start-server";
-  params += L" --config_file_path " +
-            file_manager_utils::GetConfigurationPath().wstring();
-  params += L" --data_folder_path " +
-            file_manager_utils::GetCortexDataPath().wstring();
+  params += L" --config_file_path \"" +
+            file_manager_utils::GetConfigurationPath().wstring() + L"\"";
+  params += L" --data_folder_path \"" +
+            file_manager_utils::GetCortexDataPath().wstring() + L"\"";
   params += L" --loglevel " + cortex::wc::Utf8ToWstring(log_level_);
   std::wstring exe_w = cortex::wc::Utf8ToWstring(exe);
   std::wstring current_path_w =
       file_manager_utils::GetExecutableFolderContainerPath().wstring();
-  std::wstring wcmds = current_path_w + L"/" + exe_w + L" " + params;
-  CTL_DBG("wcmds: " << wcmds);
+  std::wstring wcmds = current_path_w + L"\\" + exe_w + L" " + params;
+  CTL_INF("wcmds: " << wcmds);
   std::vector<wchar_t> mutable_cmds(wcmds.begin(), wcmds.end());
   mutable_cmds.push_back(L'\0');
   // Create child process
```
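The quoted-path fix above (PR #1963) wraps the `--config_file_path` and `--data_folder_path` values in double quotes so installation directories containing spaces survive Windows command-line splitting. A minimal sketch of the idea; `QuoteArg` and `BuildParams` are hypothetical helpers, not functions from the Cortex codebase:

```cpp
#include <cassert>
#include <string>

// Hypothetical helper: wrap an argument in double quotes so a path with
// spaces is passed to the child process as a single token.
std::wstring QuoteArg(const std::wstring& arg) {
  return L"\"" + arg + L"\"";
}

// Assemble the server parameters the way the patched code does.
std::wstring BuildParams(const std::wstring& config_path,
                         const std::wstring& data_path) {
  std::wstring params = L"--start-server";
  params += L" --config_file_path " + QuoteArg(config_path);
  params += L" --data_folder_path " + QuoteArg(data_path);
  return params;
}
```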

engine/common/hardware_common.h

Lines changed: 5 additions & 1 deletion

```diff
@@ -79,6 +79,7 @@ struct GPU {
   int64_t total_vram;
   std::string uuid;
   bool is_activated = true;
+  std::string vendor;
 };
 
 inline Json::Value ToJson(const std::vector<GPU>& gpus) {
@@ -100,7 +101,10 @@ inline Json::Value ToJson(const std::vector<GPU>& gpus) {
     gpu["total_vram"] = gpus[i].total_vram;
     gpu["uuid"] = gpus[i].uuid;
     gpu["activated"] = gpus[i].is_activated;
-    res.append(gpu);
+    gpu["vendor"] = gpus[i].vendor;
+    if (gpus[i].total_vram > 0) {
+      res.append(gpu);
+    }
   }
   return res;
 }
```
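The new guard skips GPUs that report zero VRAM when building the JSON list, which filters out software renderers such as llvmpipe (see the `fix: gpu: filter out llvmpipe` commit in this merge). A simplified, jsoncpp-free sketch of the filter; the struct and function here are illustrative stand-ins:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Illustrative stand-in for the GPU struct above.
struct GpuInfo {
  std::string vendor;
  int64_t total_vram = 0;
};

// Keep only GPUs that report usable VRAM, mirroring the
// `if (gpus[i].total_vram > 0)` guard in ToJson.
std::vector<GpuInfo> FilterReportableGpus(const std::vector<GpuInfo>& gpus) {
  std::vector<GpuInfo> res;
  for (const auto& g : gpus) {
    if (g.total_vram > 0) {
      res.push_back(g);
    }
  }
  return res;
}
```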

engine/controllers/engines.cc

Lines changed: 5 additions & 0 deletions

```diff
@@ -375,17 +375,21 @@ void Engines::UpdateEngine(
       metadata = (*exist_engine).metadata;
     }
 
+    (void)engine_service_->UnloadEngine(engine);
+
     auto upd_res =
         engine_service_->UpsertEngine(engine, type, api_key, url, version,
                                       "all-platforms", status, metadata);
     if (upd_res.has_error()) {
       Json::Value res;
       res["message"] = upd_res.error();
+      CTL_WRN("Error: " << upd_res.error());
       auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
       resp->setStatusCode(k400BadRequest);
       callback(resp);
     } else {
       Json::Value res;
+      CTL_INF("Remote Engine update successfully!");
       res["message"] = "Remote Engine update successfully!";
       auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
       resp->setStatusCode(k200OK);
@@ -394,6 +398,7 @@ void Engines::UpdateEngine(
   } else {
     Json::Value res;
     res["message"] = "Request body is empty!";
+    CTL_WRN("Error: Request body is empty!");
     auto resp = cortex_utils::CreateCortexHttpJsonResponse(res);
     resp->setStatusCode(k400BadRequest);
     callback(resp);
```
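The ordering fix here (PR #1970) unloads the engine before upserting its configuration, so a loaded remote engine cannot keep serving with stale settings. A hypothetical sketch of that ordering, with invented names rather than the real service types:

```cpp
#include <cassert>
#include <map>
#include <string>

// Illustrative registry: Update unloads first, then persists the new
// configuration, mirroring UnloadEngine-before-UpsertEngine above.
struct EngineRegistry {
  std::map<std::string, std::string> loaded;  // engine -> in-memory config
  std::map<std::string, std::string> stored;  // persisted config

  void Unload(const std::string& e) { loaded.erase(e); }
  void Upsert(const std::string& e, const std::string& cfg) { stored[e] = cfg; }

  void Update(const std::string& e, const std::string& cfg) {
    Unload(e);       // mirrors (void)engine_service_->UnloadEngine(engine);
    Upsert(e, cfg);  // then persist the new configuration
  }
};
```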

engine/controllers/models.cc

Lines changed: 5 additions & 4 deletions

```diff
@@ -218,10 +218,11 @@ void Models::ListModel(
         obj["id"] = model_entry.model;
         obj["model"] = model_entry.model;
         obj["status"] = "downloaded";
-        auto es = model_service_->GetEstimation(model_entry.model);
-        if (es.has_value() && !!es.value()) {
-          obj["recommendation"] = hardware::ToJson(*(es.value()));
-        }
+        // TODO(sang) Temporarily remove this estimation
+        // auto es = model_service_->GetEstimation(model_entry.model);
+        // if (es.has_value() && !!es.value()) {
+        //   obj["recommendation"] = hardware::ToJson(*(es.value()));
+        // }
         data.append(std::move(obj));
         yaml_handler.Reset();
       } else if (model_config.engine == kPythonEngine) {
```

engine/cortex-common/EngineI.h

Lines changed: 3 additions & 3 deletions

```diff
@@ -59,14 +59,14 @@ class EngineI {
                         const std::string& log_path) = 0;
   virtual void SetLogLevel(trantor::Logger::LogLevel logLevel) = 0;
 
+  // Stop inflight chat completion in stream mode
+  virtual void StopInferencing(const std::string& model_id) = 0;
+
   virtual Json::Value GetRemoteModels() = 0;
   virtual void HandleRouteRequest(
       std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
   virtual void HandleInference(
       std::shared_ptr<Json::Value> json_body,
       std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
-
-  // Stop inflight chat completion in stream mode
-  virtual void StopInferencing(const std::string& model_id) = 0;
 };
```

engine/extensions/remote-engine/remote_engine.cc

Lines changed: 9 additions & 2 deletions

```diff
@@ -29,8 +29,13 @@ size_t StreamWriteCallback(char* ptr, size_t size, size_t nmemb,
     CTL_DBG(chunk);
     Json::Value check_error;
     Json::Reader reader;
-    if (reader.parse(chunk, check_error)) {
+    context->chunks += chunk;
+    if (reader.parse(context->chunks, check_error) ||
+        (reader.parse(chunk, check_error) &&
+         chunk.find("error") != std::string::npos)) {
+      CTL_WRN(context->chunks);
       CTL_WRN(chunk);
+      CTL_INF("Request: " << context->last_request);
       Json::Value status;
       status["is_done"] = true;
       status["has_error"] = true;
@@ -143,7 +148,9 @@ CurlResponse RemoteEngine::MakeStreamingChatCompletionRequest(
       "",
       config.model,
       renderer_,
-      stream_template};
+      stream_template,
+      true,
+      body};
 
   curl_easy_setopt(curl, CURLOPT_URL, full_url.c_str());
   curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
```
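The buffering change accumulates partial stream chunks in `context->chunks`, so an error payload split across several curl write callbacks is still detected once the accumulated text parses. A toy sketch of the accumulate-then-check pattern; the balanced-brace check below is only a stand-in for the real `Json::Reader::parse`:

```cpp
#include <cassert>
#include <string>

// Toy completeness check standing in for Json::Reader::parse: a buffer is
// "complete" once it contains at least one brace and braces balance.
bool LooksComplete(const std::string& s) {
  int depth = 0;
  bool seen = false;
  for (char c : s) {
    if (c == '{') { ++depth; seen = true; }
    else if (c == '}') { --depth; }
  }
  return seen && depth == 0;
}

struct ChunkBuffer {
  std::string chunks;
  // Append a partial chunk and report whether the accumulated buffer now
  // forms a complete document, as the patched StreamWriteCallback does.
  bool Append(const std::string& chunk) {
    chunks += chunk;
    return LooksComplete(chunks);
  }
};
```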

engine/extensions/remote-engine/remote_engine.h

Lines changed: 2 additions & 0 deletions

```diff
@@ -25,6 +25,8 @@ struct StreamContext {
   extensions::TemplateRenderer& renderer;
   std::string stream_template;
   bool need_stop = true;
+  std::string last_request;
+  std::string chunks;
 };
 struct CurlResponse {
   std::string body;
```

engine/services/engine_service.cc

Lines changed: 1 addition & 1 deletion

```diff
@@ -870,10 +870,10 @@ cpp::result<void, std::string> EngineService::UnloadEngine(
     auto unload_opts = EngineI::EngineUnloadOption{};
     e->Unload(unload_opts);
     delete e;
-    engines_.erase(ne);
   } else {
     delete std::get<RemoteEngineI*>(engines_[ne].engine);
   }
+  engines_.erase(ne);
 
   CTL_DBG("Engine unloaded: " + ne);
   return {};
```
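The fix moves `engines_.erase(ne)` out of the local-engine branch so the registry entry is removed for remote engines too; before, a remote engine's entry survived its `delete` and left a dangling pointer in the map. A simplified sketch with illustrative types:

```cpp
#include <cassert>
#include <map>
#include <string>

// Simplified registry: unload must erase the entry no matter which branch
// freed the engine, as the patched UnloadEngine now does.
struct Registry {
  std::map<std::string, int> engines_;

  void Unload(const std::string& ne, bool is_local) {
    if (is_local) {
      // local engine: run unload hooks, then free (elided in this sketch)
    } else {
      // remote engine: just free (elided in this sketch)
    }
    engines_.erase(ne);  // now unconditional
  }
};
```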

engine/services/hardware_service.cc

Lines changed: 12 additions & 9 deletions

```diff
@@ -38,6 +38,7 @@ bool TryConnectToServer(const std::string& host, int port) {
 
 HardwareInfo HardwareService::GetHardwareInfo() {
   // append active state
+  std::lock_guard<std::mutex> l(mtx_);
   auto gpus = cortex::hw::GetGPUInfo();
   auto res = db_service_->LoadHardwareList();
   if (res.has_value()) {
@@ -63,7 +64,8 @@ bool HardwareService::Restart(const std::string& host, int port) {
   namespace luh = logging_utils_helper;
   if (!ahc_)
     return true;
-  auto exe = commands::GetCortexServerBinary();
+  auto exe = file_manager_utils::Subtract(
+      file_manager_utils::GetExecutablePath(), cortex_utils::GetCurrentPath());
   auto get_config_file_path = []() -> std::string {
     if (file_manager_utils::cortex_config_file_path.empty()) {
       return file_manager_utils::GetConfigurationPath().string();
@@ -144,16 +146,17 @@ bool HardwareService::Restart(const std::string& host, int port) {
   ZeroMemory(&pi, sizeof(pi));
   // TODO (sang) write a common function for this and server_start_cmd
   std::wstring params = L"--ignore_cout";
-  params += L" --config_file_path " +
-            file_manager_utils::GetConfigurationPath().wstring();
-  params += L" --data_folder_path " +
-            file_manager_utils::GetCortexDataPath().wstring();
+  params += L" --config_file_path \"" +
+            file_manager_utils::GetConfigurationPath().wstring() + L"\"";
+  params += L" --data_folder_path \"" +
+            file_manager_utils::GetCortexDataPath().wstring() + L"\"";
   params += L" --loglevel " +
            cortex::wc::Utf8ToWstring(luh::LogLevelStr(luh::global_log_level));
-  std::wstring exe_w = cortex::wc::Utf8ToWstring(exe);
+  std::wstring exe_w = exe.wstring();
   std::wstring current_path_w =
       file_manager_utils::GetExecutableFolderContainerPath().wstring();
-  std::wstring wcmds = current_path_w + L"/" + exe_w + L" " + params;
+  std::wstring wcmds = current_path_w + L"\\" + exe_w + L" " + params;
+  CTL_DBG("wcmds: " << wcmds);
   std::vector<wchar_t> mutable_cmds(wcmds.begin(), wcmds.end());
   mutable_cmds.push_back(L'\0');
   // Create child process
@@ -185,7 +188,7 @@ bool HardwareService::Restart(const std::string& host, int port) {
   auto dylib_path_mng = std::make_shared<cortex::DylibPathManager>();
   auto db_srv = std::make_shared<DatabaseService>();
   EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath();
-  std::string p = cortex_utils::GetCurrentPath() + "/" + exe;
+  std::string p = cortex_utils::GetCurrentPath() / exe;
   commands.push_back(p);
   commands.push_back("--ignore_cout");
   commands.push_back("--config_file_path");
@@ -486,7 +489,7 @@ std::vector<int> HardwareService::GetCudaConfig() {
   // Map uuid back to nvidia id
   for (auto const& uuid : uuids) {
     for (auto const& ngpu : nvidia_gpus) {
-      if (uuid == ngpu.uuid) {
+      if (ngpu.uuid.find(uuid) != std::string::npos) {
         res.push_back(std::stoi(ngpu.id));
       }
     }
```
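The UUID comparison in `GetCudaConfig` switches from exact equality to a substring search (PR #1954), presumably because Vulkan and NVIDIA tooling format the same device UUID differently, so one may only appear embedded inside the other. A sketch of the mapping loop with simplified types:

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Map reported UUIDs back to NVIDIA device ids; a match is a substring hit,
// as in the patched GetCudaConfig loop. Types are simplified for the sketch.
std::vector<int> MapUuidsToIds(
    const std::vector<std::string>& uuids,
    const std::vector<std::pair<std::string, int>>& nvidia_gpus) {
  std::vector<int> res;
  for (const auto& uuid : uuids) {
    for (const auto& [ngpu_uuid, id] : nvidia_gpus) {
      if (ngpu_uuid.find(uuid) != std::string::npos) {
        res.push_back(id);
      }
    }
  }
  return res;
}
```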

engine/services/hardware_service.h

Lines changed: 2 additions & 0 deletions

```diff
@@ -2,6 +2,7 @@
 #include <stdint.h>
 #include <string>
 #include <vector>
+#include <mutex>
 
 #include "common/hardware_config.h"
 #include "database_service.h"
@@ -39,4 +40,5 @@ class HardwareService {
  private:
   std::shared_ptr<DatabaseService> db_service_ = nullptr;
   std::optional<cortex::hw::ActivateHardwareConfig> ahc_;
+  std::mutex mtx_;
 };
```
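The added `mtx_` member backs the thread-safety fix to `GetHardwareInfo` (#1956): a `std::lock_guard` serializes concurrent callers for the duration of the call. A minimal sketch of the pattern with illustrative names, not the real service:

```cpp
#include <cassert>
#include <mutex>
#include <string>

// Guard every access to shared state with a std::lock_guard so concurrent
// callers serialize; the lock releases automatically at scope exit.
class InfoCache {
 public:
  void Set(const std::string& v) {
    std::lock_guard<std::mutex> l(mtx_);
    value_ = v;
  }
  std::string Get() {
    std::lock_guard<std::mutex> l(mtx_);
    return value_;
  }

 private:
  std::mutex mtx_;
  std::string value_;
};
```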

engine/services/inference_service.cc

Lines changed: 10 additions & 8 deletions

```diff
@@ -24,8 +24,12 @@ cpp::result<void, InferResult> InferenceService::HandleChatCompletion(
     auto status = std::get<0>(ir)["status_code"].asInt();
     if (status != drogon::k200OK) {
       CTL_INF("Model is not loaded, start loading it: " << model_id);
-      auto res = LoadModel(saved_models_.at(model_id));
-      // ignore return result
+      // For remote engine, we use the updated configuration
+      if (engine_service_->IsRemoteEngine(engine_type)) {
+        (void)model_service_.lock()->StartModel(model_id, {}, false);
+      } else {
+        (void)LoadModel(saved_models_.at(model_id));
+      }
     }
   }
 
@@ -38,7 +42,7 @@ cpp::result<void, InferResult> InferenceService::HandleChatCompletion(
     LOG_WARN << "Engine is not loaded yet";
     return cpp::fail(std::make_pair(stt, res));
   }
-
+
   if (!model_id.empty()) {
     if (auto model_service = model_service_.lock()) {
       auto metadata_ptr = model_service->GetCachedModelMetadata(model_id);
@@ -72,7 +76,6 @@ cpp::result<void, InferResult> InferenceService::HandleChatCompletion(
     }
   }
 
-
   CTL_DBG("Json body inference: " + json_body->toStyledString());
 
   auto cb = [q, tool_choice](Json::Value status, Json::Value res) {
@@ -217,10 +220,9 @@ InferResult InferenceService::LoadModel(
     std::get<RemoteEngineI*>(engine_result.value())
         ->LoadModel(json_body, std::move(cb));
   }
-  if (!engine_service_->IsRemoteEngine(engine_type)) {
-    auto model_id = json_body->get("model", "").asString();
-    saved_models_[model_id] = json_body;
-  }
+  // Save model config to reload if needed
+  auto model_id = json_body->get("model", "").asString();
+  saved_models_[model_id] = json_body;
  return std::make_pair(stt, r);
 }
```
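With the remote-engine guard removed, every successful load now records the request body keyed by model id, so both local and remote models can be auto-reloaded later (#1971). A simplified sketch of that bookkeeping; the real code stores the `Json::Value` request body rather than a string:

```cpp
#include <cassert>
#include <map>
#include <string>

// Save each loaded model's config keyed by model id so a later request can
// reload the model with the same configuration.
struct SavedModels {
  std::map<std::string, std::string> configs;

  void Save(const std::string& id, const std::string& cfg) {
    configs[id] = cfg;
  }
  bool CanReload(const std::string& id) const {
    return configs.count(id) > 0;
  }
};
```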

engine/services/model_service.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -1233,6 +1233,8 @@ cpp::result<std::optional<std::string>, std::string>
 ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
                                int ctx_len, int n_batch, int n_ubatch,
                                const std::string& kv_cache_type) {
+  // TODO(sang) temporary disable this function
+  return std::nullopt;
   assert(hw_service_);
   auto hw_info = hw_service_->GetHardwareInfo();
   assert(!!engine_svc_);
```

engine/services/model_source_service.cc

Lines changed: 1 addition & 2 deletions

```diff
@@ -475,14 +475,13 @@ ModelSourceService::AddCortexsoRepoBranch(const std::string& model_source,
 
 void ModelSourceService::SyncModelSource() {
   while (running_) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
     auto now = std::chrono::system_clock::now();
     auto config = file_manager_utils::GetCortexConfig();
     auto last_check =
         std::chrono::system_clock::time_point(
             std::chrono::milliseconds(config.checkedForSyncHubAt)) +
         std::chrono::hours(1);
-
     if (now > last_check) {
       CTL_DBG("Start to sync cortex.db");
```

engine/utils/file_manager_utils.cc

Lines changed: 9 additions & 4 deletions

```diff
@@ -17,14 +17,15 @@
 #endif
 
 namespace file_manager_utils {
-std::filesystem::path GetExecutableFolderContainerPath() {
+
+std::filesystem::path GetExecutablePath() {
 #if defined(__APPLE__) && defined(__MACH__)
   char buffer[1024];
   uint32_t size = sizeof(buffer);
 
   if (_NSGetExecutablePath(buffer, &size) == 0) {
     // CTL_DBG("Executable path: " << buffer);
-    return std::filesystem::path{buffer}.parent_path();
+    return std::filesystem::path{buffer};
   } else {
     CTL_ERR("Failed to get executable path");
     return std::filesystem::current_path();
@@ -35,7 +36,7 @@ std::filesystem::path GetExecutableFolderContainerPath() {
   if (len != -1) {
     buffer[len] = '\0';
     // CTL_DBG("Executable path: " << buffer);
-    return std::filesystem::path{buffer}.parent_path();
+    return std::filesystem::path{buffer};
   } else {
     CTL_ERR("Failed to get executable path");
     return std::filesystem::current_path();
@@ -44,13 +45,17 @@ std::filesystem::path GetExecutableFolderContainerPath() {
   wchar_t buffer[MAX_PATH];
   GetModuleFileNameW(NULL, buffer, MAX_PATH);
   // CTL_DBG("Executable path: " << buffer);
-  return std::filesystem::path{buffer}.parent_path();
+  return std::filesystem::path{buffer};
 #else
   LOG_ERROR << "Unsupported platform!";
   return std::filesystem::current_path();
 #endif
 }
 
+std::filesystem::path GetExecutableFolderContainerPath() {
+  return GetExecutablePath().parent_path();
+}
+
 std::filesystem::path GetHomeDirectoryPath() {
 #ifdef _WIN32
   const wchar_t* homeDir = _wgetenv(L"USERPROFILE");
```
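The refactor above splits path retrieval: `GetExecutablePath()` now returns the full binary path on each platform, and the folder variant is derived from it with `parent_path()`. The relationship in isolation, with a hypothetical wrapper name:

```cpp
#include <cassert>
#include <filesystem>

// The folder containing an executable is just the parent of its full path,
// which is all GetExecutableFolderContainerPath now computes.
std::filesystem::path FolderOf(const std::filesystem::path& exe) {
  return exe.parent_path();
}
```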

engine/utils/file_manager_utils.h

Lines changed: 2 additions & 0 deletions

```diff
@@ -20,6 +20,8 @@ inline std::string cortex_config_file_path;
 
 inline std::string cortex_data_folder_path;
 
+std::filesystem::path GetExecutablePath();
+
 std::filesystem::path GetExecutableFolderContainerPath();
 
 std::filesystem::path GetHomeDirectoryPath();
```

0 commit comments