apache
diff --git a/‎apisix/cli/config.lua
Lines changed: 3 additions & 0 deletions b/‎apisix/cli/config.lua
Lines changed: 3 additions & 0 deletions
diff --git a/‎apisix/cli/ngx_tpl.lua
Lines changed: 13 additions & 1 deletion b/‎apisix/cli/ngx_tpl.lua
Lines changed: 13 additions & 1 deletion
diff --git a/‎apisix/plugins/ai-rate-limiting.lua
Lines changed: 209 additions & 0 deletions b/‎apisix/plugins/ai-rate-limiting.lua
Lines changed: 209 additions & 0 deletions
diff --git a/‎apisix/plugins/limit-count/init.lua
Lines changed: 20 additions & 10 deletions b/‎apisix/plugins/limit-count/init.lua
Lines changed: 20 additions & 10 deletions
diff --git a/‎conf/config.yaml.example
Lines changed: 1 addition & 0 deletions b/‎conf/config.yaml.example
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/en/latest/config.json
Lines changed: 2 additions & 1 deletion b/‎docs/en/latest/config.json
Lines changed: 2 additions & 1 deletion
@@ -160,6 +160,8 @@ local _M = {
         ["plugin-limit-req-redis-cluster-slot-lock"] = "1m",
         ["plugin-limit-count-redis-cluster-slot-lock"] = "1m",
         ["plugin-limit-conn-redis-cluster-slot-lock"] = "1m",
+        ["plugin-ai-rate-limiting"] = "10m",
+        ["plugin-ai-rate-limiting-reset-header"] = "10m",
         tracing_buffer = "10m",
         ["plugin-api-breaker"] = "10m",
         ["etcd-cluster-health-check"] = "10m",
@@ -219,6 +221,7 @@ local _M = {
     "ai-prompt-decorator",
     "ai-prompt-guard",
     "ai-rag",
+    "ai-rate-limiting",
     "ai-proxy-multi",
     "ai-proxy",
     "ai-aws-content-moderation",
 
@@ -287,6 +287,19 @@ http {
     lua_shared_dict tars {* http.lua_shared_dict["tars"] *};
     {% end %}
 
+
+    {% if http.lua_shared_dict["plugin-ai-rate-limiting"] then %}
+    lua_shared_dict plugin-ai-rate-limiting {* http.lua_shared_dict["plugin-ai-rate-limiting"] *};
+    {% else %}
+    lua_shared_dict plugin-ai-rate-limiting 10m;
+    {% end %}
+
+    # sized from its own config key; falls back to 10m when that key is unset
+    {% if http.lua_shared_dict["plugin-ai-rate-limiting-reset-header"] then %}
+    lua_shared_dict plugin-ai-rate-limiting-reset-header {* http.lua_shared_dict["plugin-ai-rate-limiting-reset-header"] *};
+    {% else %}
+    lua_shared_dict plugin-ai-rate-limiting-reset-header 10m;
+    {% end %}
+
     {% if enabled_plugins["limit-conn"] then %}
     lua_shared_dict plugin-limit-conn {* http.lua_shared_dict["plugin-limit-conn"] *};
     lua_shared_dict plugin-limit-conn-redis-cluster-slot-lock {* http.lua_shared_dict["plugin-limit-conn-redis-cluster-slot-lock"] *};
@@ -418,7 +431,6 @@ http {
     {% if ssl.ssl_trusted_certificate ~= nil then %}
     lua_ssl_trusted_certificate {* ssl.ssl_trusted_certificate *};
     {% end %}
-
     # http configuration snippet starts
     {% if http_configuration_snippet then %}
     {* http_configuration_snippet *}
 
@@ -0,0 +1,209 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements.  See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License.  You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+local require = require
+local setmetatable = setmetatable
+local ipairs = ipairs
+local type = type
+local core = require("apisix.core")
+local limit_count = require("apisix.plugins.limit-count.init")
+
+local plugin_name = "ai-rate-limiting"
+
+-- JSON schema for one entry of `instances`: a per-instance quota that
+-- overrides the plugin-wide `limit` / `time_window` for the named instance.
+local instance_limit_schema = {
+    type = "object",
+    required = {"name", "limit", "time_window"},
+    properties = {
+        name = {
+            type = "string",
+        },
+        limit = {
+            type = "integer",
+            minimum = 1,
+        },
+        time_window = {
+            type = "integer",
+            minimum = 1,
+        },
+    },
+}
+
+-- JSON schema of the plugin configuration.
+--   limit / time_window : plugin-wide quota, used for any instance that has
+--                         no entry of its own in `instances`
+--   limit_strategy      : which field of ctx.ai_token_usage is charged
+--   instances           : optional per-instance overrides
+--   rejected_code / rejected_msg / show_limit_quota_header are forwarded
+--   unchanged to the limit-count configuration
+local schema = {
+    type = "object",
+    required = {"limit", "time_window"},
+    properties = {
+        limit = {
+            type = "integer",
+            exclusiveMinimum = 0,
+        },
+        time_window = {
+            type = "integer",
+            exclusiveMinimum = 0,
+        },
+        show_limit_quota_header = {
+            type = "boolean",
+            default = true,
+        },
+        limit_strategy = {
+            type = "string",
+            description = "The strategy to limit the tokens",
+            enum = {"total_tokens", "prompt_tokens", "completion_tokens"},
+            default = "total_tokens",
+        },
+        instances = {
+            type = "array",
+            items = instance_limit_schema,
+        },
+        rejected_code = {
+            type = "integer",
+            minimum = 200,
+            maximum = 599,
+            default = 503,
+        },
+        rejected_msg = {
+            type = "string",
+            minLength = 1,
+        },
+    },
+}
+
+-- Plugin descriptor exported by this module.
+local _M = {
+    name = plugin_name,
+    version = 0.1,
+    priority = 1030,
+    schema = schema,
+}
+
+-- Caches, per plugin conf, the table that maps an instance name to its
+-- derived limit-count configuration (see fetch_limit_conf_kvs).
+local limit_conf_cache = core.lrucache.new({ttl = 300, count = 512})
+
+
+function _M.check_schema(conf)  -- validate a user-supplied plugin conf against `schema`
+    return core.schema.check(schema, conf)  -- result of the checker is returned unchanged
+end
+
+
+-- Build the configuration table handed to limit-count's rate_limit().
+--   plugin_conf   : this plugin's conf (source of rejected_code, rejected_msg,
+--                   show_limit_quota_header and the default quota)
+--   instance_conf : optional entry of `instances`; when given, its name, limit
+--                   and time_window take precedence
+--   instance_name : only used without instance_conf, as the suffix of the
+--                   quota header names (defaults to "")
+local function transform_limit_conf(plugin_conf, instance_conf, instance_name)
+    -- quota values come from the instance entry when there is one
+    local quota = instance_conf or plugin_conf
+
+    local key, header_suffix
+    if instance_conf then
+        key = instance_conf.name
+        header_suffix = instance_conf.name
+    else
+        -- every instance without its own entry shares the "#global" counter
+        key = plugin_name .. "#global"
+        header_suffix = instance_name or ""
+    end
+
+    local limit_conf = {
+        _vid = key,
+        key = key,
+        count = quota.limit,
+        time_window = quota.time_window,
+
+        rejected_code = plugin_conf.rejected_code,
+        rejected_msg = plugin_conf.rejected_msg,
+        show_limit_quota_header = plugin_conf.show_limit_quota_header,
+
+        -- limit-count need these fields
+        policy = "local",
+        key_type = "constant",
+        allow_degradation = false,
+        sync_interval = -1,
+
+        limit_header = "X-AI-RateLimit-Limit-" .. header_suffix,
+        remaining_header = "X-AI-RateLimit-Remaining-" .. header_suffix,
+        reset_header = "X-AI-RateLimit-Reset-" .. header_suffix,
+    }
+    return limit_conf
+end
+
+
+-- Build the instance-name -> limit conf lookup table for one plugin conf.
+-- Instances listed in conf.instances are filled in eagerly; any other name
+-- is resolved lazily to a conf based on the plugin-wide quota and then
+-- memoized in the same table.
+local function fetch_limit_conf_kvs(conf)
+    local kvs = setmetatable({}, {
+        __index = function(self, instance_name)
+            local fallback = transform_limit_conf(conf, nil, instance_name)
+            self[instance_name] = fallback
+            return fallback
+        end
+    })
+
+    for _, instance in ipairs(conf.instances or {}) do
+        kvs[instance.name] = transform_limit_conf(conf, instance)
+    end
+
+    return kvs
+end
+
+
+-- Access phase: check the picked AI instance against its quota.
+-- rate_limit() is called with cost 1 and the dry_run flag set; the real
+-- token cost is charged later in the log phase.
+-- Returns whatever rate_limit() returns (code, msg) and records in
+-- ctx.ai_rate_limiting whether a code was returned.
+function _M.access(conf, ctx)
+    local picked = ctx.picked_ai_instance_name
+    if picked then
+        local kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
+        local picked_conf = kvs[picked]
+        local code, msg = limit_count.rate_limit(picked_conf, ctx, plugin_name, 1, true)
+        ctx.ai_rate_limiting = not not code
+        return code, msg
+    end
+end
+
+
+-- Tell whether an AI instance can currently accept a request.
+-- rate_limit() is called with the dry_run flag set.
+--   conf          : this plugin's conf; when nil it is looked up in ctx.plugins
+--   ctx           : request context
+--   instance_name : instance to check, defaults to ctx.picked_ai_instance_name
+-- Returns true when the plugin is not configured or the instance is not
+-- limited, false when rate_limit() returned a code, and nil plus an error
+-- string when no usable instance name is available.
+function _M.check_instance_status(conf, ctx, instance_name)
+    if conf == nil then
+        -- ctx.plugins is a flat array of pairs: the plugin object sits at an
+        -- odd index and its configuration right after it, so step by two and
+        -- take the following slot (the plugin object itself is not the conf)
+        local plugins = ctx.plugins
+        if plugins then
+            for i = 1, #plugins, 2 do
+                if plugins[i].name == plugin_name then
+                    conf = plugins[i + 1]
+                end
+            end
+        end
+    end
+    if not conf then
+        -- plugin is not enabled for this request: nothing can be limited
+        return true
+    end
+
+    instance_name = instance_name or ctx.picked_ai_instance_name
+    if not instance_name then
+        return nil, "missing instance_name"
+    end
+
+    if type(instance_name) ~= "string" then
+        return nil, "invalid instance_name"
+    end
+
+    local limit_conf_kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
+    local limit_conf = limit_conf_kvs[instance_name]
+    local code, _ = limit_count.rate_limit(limit_conf, ctx, plugin_name, 1, true)
+    if code then
+        core.log.info("rate limit for instance: ", instance_name, " code: ", code)
+        return false
+    end
+    return true
+end
+
+
+-- Return the token count selected by conf.limit_strategy from
+-- ctx.ai_token_usage, or nothing when no usage was recorded on the context.
+local function get_token_usage(conf, ctx)
+    local token_usage = ctx.ai_token_usage
+    if token_usage then
+        return token_usage[conf.limit_strategy]
+    end
+end
+
+
+-- Log phase: charge the tokens used by this request to the picked
+-- instance's counter. Does nothing when no instance was picked or when
+-- ctx.ai_rate_limiting was set in the access phase.
+function _M.log(conf, ctx)
+    local picked = ctx.picked_ai_instance_name
+    if not picked or ctx.ai_rate_limiting then
+        return
+    end
+
+    local tokens = get_token_usage(conf, ctx)
+    if not tokens then
+        core.log.error("failed to get token usage for llm service")
+        return
+    end
+    core.log.info("instance name: ", picked, " used tokens: ", tokens)
+
+    -- cost is the token count; no dry_run flag is passed here
+    local kvs = limit_conf_cache(conf, nil, fetch_limit_conf_kvs, conf)
+    limit_count.rate_limit(kvs[picked], ctx, plugin_name, tokens)
+end
+
+
+return _M
@@ -21,6 +21,7 @@ local ipairs = ipairs
 local pairs = pairs
 local redis_schema = require("apisix.utils.redis-schema")
 local policy_to_additional_properties = redis_schema.schema
+local get_phase = ngx.get_phase
 
 local limit_redis_cluster_new
 local limit_redis_new
@@ -233,8 +234,9 @@ local function gen_limit_obj(conf, ctx, plugin_name)
     return core.lrucache.plugin_ctx(lrucache, ctx, extra_key, create_limit_obj, conf, plugin_name)
 end
 
-function _M.rate_limit(conf, ctx, name, cost)
+function _M.rate_limit(conf, ctx, name, cost, dry_run)
     core.log.info("ver: ", ctx.conf_version)
+    core.log.info("conf: ", core.json.delay_encode(conf, true))
 
     local lim, err = gen_limit_obj(conf, ctx, name)
 
@@ -275,7 +277,7 @@ function _M.rate_limit(conf, ctx, name, cost)
 
     local delay, remaining, reset
     if not conf.policy or conf.policy == "local" then
-        delay, remaining, reset = lim:incoming(key, true, conf, cost)
+        delay, remaining, reset = lim:incoming(key, not dry_run, conf, cost)
     else
         delay, remaining, reset = lim:incoming(key, cost)
     end
@@ -288,14 +290,22 @@ function _M.rate_limit(conf, ctx, name, cost)
     end
     core.log.info("limit-count plugin-metadata: ", core.json.delay_encode(metadata))
 
+    local set_limit_headers = {
+        limit_header = conf.limit_header or metadata.limit_header,
+        remaining_header = conf.remaining_header or metadata.remaining_header,
+        reset_header = conf.reset_header or metadata.reset_header,
+    }
+    local phase = get_phase()
+    local set_header = phase ~= "log"
+
     if not delay then
         local err = remaining
         if err == "rejected" then
             -- show count limit header when rejected
-            if conf.show_limit_quota_header then
-                core.response.set_header(metadata.limit_header, conf.count,
-                    metadata.remaining_header, 0,
-                    metadata.reset_header, reset)
+            if conf.show_limit_quota_header and set_header then
+                core.response.set_header(set_limit_headers.limit_header, conf.count,
+                set_limit_headers.remaining_header, 0,
+                set_limit_headers.reset_header, reset)
             end
 
             if conf.rejected_msg then
@@ -311,10 +321,10 @@ function _M.rate_limit(conf, ctx, name, cost)
         return 500, {error_msg = "failed to limit count"}
     end
 
-    if conf.show_limit_quota_header then
-        core.response.set_header(metadata.limit_header, conf.count,
-            metadata.remaining_header, remaining,
-            metadata.reset_header, reset)
+    if conf.show_limit_quota_header and set_header then
+        core.response.set_header(set_limit_headers.limit_header, conf.count,
+            set_limit_headers.remaining_header, remaining,
+            set_limit_headers.reset_header, reset)
     end
 end
 
 
@@ -480,6 +480,7 @@ plugins:                           # plugin list (sorted by priority)
   - ai-prompt-decorator            # priority: 1070
   - ai-prompt-guard                # priority: 1072
   - ai-rag                         # priority: 1060
+  - ai-rate-limiting               # priority: 1030
   - ai-aws-content-moderation      # priority: 1040 TODO: compare priority with other ai plugins
   - proxy-mirror                   # priority: 1010
   - proxy-rewrite                  # priority: 1008
 
@@ -158,7 +158,8 @@
             "plugins/request-id",
             "plugins/proxy-control",
             "plugins/client-control",
-            "plugins/workflow"
+            "plugins/workflow",
+            "plugins/ai-rate-limiting"
           ]
         },
         {
Original file line number	Diff line number	Diff line change
`@@ -158,7 +158,8 @@`
`158`	`158`	`"plugins/request-id",`
`159`	`159`	`"plugins/proxy-control",`
`160`	`160`	`"plugins/client-control",`
`161`		`- "plugins/workflow"`
	`161`	`+ "plugins/workflow",`
	`162`	`+ "plugins/ai-rate-limiting"`
`162`	`163`	`]`
`163`	`164`	`},`
`164`	`165`	`{`