1
1
#include " utils.hpp"
2
2
3
3
#include " arg.h"
4
+ #include " chat-memory/chat_memory.h"
4
5
#include " common.h"
5
6
#include " json-schema-to-grammar.h"
6
7
#include " llama.h"
@@ -3911,8 +3912,21 @@ int main(int argc, char ** argv) {
3911
3912
auto completion_id = gen_chatcmplid ();
3912
3913
std::vector<server_task> tasks;
3913
3914
3915
+ std::string conv_id = " " ;
3914
3916
try {
3915
- const auto & prompt = data.at (" prompt" );
3917
+ // Read the optional conv_id from the request JSON; if it is missing or empty, chat memory is disabled for this request.
3918
+ conv_id = data.value (" conv_id" , " " );
3919
+ if (conv_id.empty ()) {
3920
+ SRV_INF (" %s" , " No conv_id provided, chat memory will be disabled.\n " );
3921
+ }
3922
+
3923
+ std::string prefix = " " ;
3924
+ if (!conv_id.empty ()) {
3925
+ auto & mem = get_or_create_chat_memory (conv_id);
3926
+ prefix = mem.format_injection_prompt () + " \n\n " ;
3927
+ }
3928
+ std::string prompt = prefix + data.at (" prompt" ).get <std::string>();
3929
+
3916
3930
// TODO: this log can become very long, put it behind a flag or think about a more compact format
3917
3931
// SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
3918
3932
@@ -3953,12 +3967,24 @@ int main(int argc, char ** argv) {
3953
3967
ctx_server.receive_multi_results (task_ids, [&](std::vector<server_task_result_ptr> & results) {
3954
3968
if (results.size () == 1 ) {
3955
3969
// single result
3956
- res_ok (res, results[0 ]->to_json ());
3970
+ json out = results[0 ]->to_json ();
3971
+ // Parse model output for memory commands
3972
+ if (!conv_id.empty () && !results.empty ()) {
3973
+ auto & mem = get_or_create_chat_memory (conv_id);
3974
+ mem.parse_and_execute_command_json (out);
3975
+ }
3976
+ res_ok (res, out);
3957
3977
} else {
3958
3978
// multiple results (multitask)
3959
3979
json arr = json::array ();
3960
3980
for (auto & res : results) {
3961
- arr.push_back (res->to_json ());
3981
+ json out = res->to_json ();
3982
+ // Parse model output for memory commands from each task
3983
+ if (!conv_id.empty () && !out.empty ()) {
3984
+ auto & mem = get_or_create_chat_memory (conv_id);
3985
+ mem.parse_and_execute_command_json (out);
3986
+ }
3987
+ arr.push_back (out);
3962
3988
}
3963
3989
res_ok (res, arr);
3964
3990
}
@@ -3968,9 +3994,16 @@ int main(int argc, char ** argv) {
3968
3994
3969
3995
ctx_server.queue_results .remove_waiting_task_ids (task_ids);
3970
3996
} else {
3971
- const auto chunked_content_provider = [task_ids, &ctx_server, oaicompat](size_t , httplib::DataSink & sink) {
3997
+ const auto chunked_content_provider = [task_ids, &ctx_server, oaicompat, conv_id ](size_t , httplib::DataSink & sink) {
3972
3998
ctx_server.receive_cmpl_results_stream (task_ids, [&](server_task_result_ptr & result) -> bool {
3973
3999
json res_json = result->to_json ();
4000
+ if (!conv_id.empty ()) {
4001
+ auto & mem = get_or_create_chat_memory (conv_id);
4002
+ mem.process_response (res_json, result->is_stop (),
4003
+ [&sink](const char * data, size_t size) {
4004
+ sink.write (data, size);
4005
+ });
4006
+ }
3974
4007
if (res_json.is_array ()) {
3975
4008
for (const auto & res : res_json) {
3976
4009
if (!server_sent_event (sink, " data" , res)) {
0 commit comments