From e18b8b81207b0491c572ab581de1fdefc8ff7065 Mon Sep 17 00:00:00 2001
From: Filippe Spolti <fspolti@redhat.com>
Date: Thu, 27 Jun 2024 17:04:21 -0300
Subject: [PATCH] chore: Upgrade Golang to 1.21 (#89)

Signed-off-by: Spolti <fspolti@redhat.com>

feat: Update model_config.proto
---
 Dockerfile                               |   2 +-
 go.mod                                   |   2 +-
 go.sum                                   |  14 +-
 internal/proto/triton/model_config.proto | 519 +++++++++++++++++++++--
 4 files changed, 495 insertions(+), 42 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6514930d..8d06c227 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,7 @@
 ###############################################################################
 # Stage 1: Create the developer image for the BUILDPLATFORM only
 ###############################################################################
-ARG GOLANG_VERSION=1.19
+ARG GOLANG_VERSION=1.21
 FROM --platform=$BUILDPLATFORM registry.access.redhat.com/ubi8/go-toolset:$GOLANG_VERSION AS develop
 
 ARG PROTOC_VERSION=21.5
diff --git a/go.mod b/go.mod
index 1eb4dac6..4bb36a32 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/kserve/modelmesh-runtime-adapter
 
-go 1.19
+go 1.21
 
 require (
 	cloud.google.com/go/storage v1.28.1
diff --git a/go.sum b/go.sum
index e15fc1a6..dd31d146 100644
--- a/go.sum
+++ b/go.sum
@@ -8,6 +8,7 @@ cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2Aawl
 cloud.google.com/go/iam v0.13.0 h1:+CmB+K0J/33d0zSQ9SlFWUeCCEn5XJA0ZMZ3pHE9u8k=
 cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0=
 cloud.google.com/go/longrunning v0.4.1 h1:v+yFJOfKC3yZdY6ZUI933pIYdhyhV8S3NpWrXWmg7jM=
+cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo=
 cloud.google.com/go/storage v1.28.1 h1:F5QDG5ChchaAVQhINh24U99OWHURqrW8OmQcGKXcbgI=
 cloud.google.com/go/storage v1.28.1/go.mod h1:Qnisd4CqDdo6BGs2AD5LLnEsmSQ80wQ5ogcBBKhU86Y=
 github.com/Azure/azure-sdk-for-go/sdk/azcore v0.21.0/go.mod h1:fBF9PQNqB8scdgpZ3ufzaLntG0AG7C1WjPMsiFOmfHM=
@@ -50,6 +51,7 @@ github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbV
 github.com/go-logr/zapr v1.2.3 h1:a9vnzlIBPQBBkeaR9IuMUfmVOrQlkoC4YfPoFkX3T7A=
 github.com/go-logr/zapr v1.2.3/go.mod h1:eIauM6P8qSvTw5o2ez6UEAfGjQKrxQTl5EoK+Qa2oG4=
 github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I=
+github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
 github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
 github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
 github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c=
@@ -89,7 +91,9 @@ github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/
 github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g=
 github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/martian/v3 v3.3.2 h1:IqNFLAmvJOgVlpdEBiQbDc2EwKW77amAycfTuWKdfvw=
+github.com/google/martian/v3 v3.3.2/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk=
 github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec=
+github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
@@ -111,6 +115,7 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
 github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
+github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
@@ -125,7 +130,9 @@ github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjY
 github.com/modocache/gover v0.0.0-20171022184752-b58185e213c5/go.mod h1:caMODM3PzxT8aQXRPkAt8xlV/e7d7w8GM5g0fa5F0D8=
 github.com/montanaflynn/stats v0.6.6/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow=
 github.com/onsi/ginkgo/v2 v2.9.1 h1:zie5Ly042PD3bsCvsSOPvRnFwyo3rKe64TJlD6nu0mk=
+github.com/onsi/ginkgo/v2 v2.9.1/go.mod h1:FEcmzVcCHl+4o9bQZVab+4dC9+j+91t2FHSzmGAPfuo=
 github.com/onsi/gomega v1.27.4 h1:Z2AnStgsdSayCMDiCU42qIz+HLqEPcgiOCXjAU/w+8E=
+github.com/onsi/gomega v1.27.4/go.mod h1:riYq/GJKh8hhoM01HN6Vmuy93AarCXCBGpvFDK3q3fQ=
 github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4 h1:Qj1ukM4GlMWXNdMBuXcXfz/Kw9s1qm0CLY32QxuSImI=
 github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4/go.mod h1:N6UoU20jOqggOuDwUaBQpluzLNDqif3kq9z2wpdYEfQ=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
@@ -137,6 +144,7 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:
 github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
 github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
 github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
+github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
@@ -158,6 +166,7 @@ go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
 go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
 go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
 go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk=
+go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo=
 go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
 go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
 go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
@@ -225,6 +234,7 @@ golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
 golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
 golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
 golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
+golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@@ -260,8 +270,6 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD
 google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
-google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I=
-google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
 google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
 google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
@@ -282,6 +290,7 @@ gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
 honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
 k8s.io/api v0.26.1 h1:f+SWYiPd/GsiWwVRz+NbFyCgvv75Pk9NK6dlkZgpCRQ=
+k8s.io/api v0.26.1/go.mod h1:xd/GBNgR0f707+ATNyPmQ1oyKSgndzXij81FzWGsejg=
 k8s.io/apimachinery v0.27.0 h1:vEyy/PVMbPMCPutrssCVHCf0JNZ0Px+YqPi82K2ALlk=
 k8s.io/apimachinery v0.27.0/go.mod h1:5ikh59fK3AJ287GUvpUsryoMFtH9zj/ARfWCo3AyXTM=
 k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw=
@@ -295,3 +304,4 @@ sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h6
 sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE=
 sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=
 sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
+sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8=
diff --git a/internal/proto/triton/model_config.proto b/internal/proto/triton/model_config.proto
index 378bfb6b..31655fce 100644
--- a/internal/proto/triton/model_config.proto
+++ b/internal/proto/triton/model_config.proto
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -71,6 +71,9 @@ enum DataType {
 
   //@@  .. cpp:enumerator:: DataType::STRING = 13
   TYPE_STRING = 13;
+
+  //@@  .. cpp:enumerator:: DataType::BF16 = 14
+  TYPE_BF16 = 14;
 }
 
 //@@
@@ -115,17 +118,18 @@ message ModelRateLimiter
   //@@
   //@@     The resources required to execute the request on a model instance.
   //@@     Resources are just names with a corresponding count. The execution
-  //@@     of the instance will be blocked until the specificied resources are
+  //@@     of the instance will be blocked until the specified resources are
   //@@     available. By default an instance uses no rate-limiter resources.
   //@@
   repeated Resource resources = 1;
 
   //@@  .. cpp:var:: uint32 priority
   //@@
-  //@@     The weighting value to be used for prioritizing across instances.
-  //@@     An instance with priority 2 will be given 1/2 the number of
-  //@@     scheduling chances as an instance_group with priority 1. The
-  //@@     default priority is 1.
+  //@@     The optional weighting value to be used for prioritizing across
+  //@@     instances. An instance with priority 2 will be given 1/2 the
+  //@@     number of scheduling chances as an instance_group with priority
+  //@@     1. The default priority is 1. A priority value of 0 will be
+  //@@     treated as priority 1.
   //@@
   uint32 priority = 2;
 }
@@ -173,11 +177,44 @@ message ModelInstanceGroup
     //@@       CPU and/or GPU(s) as specified by the model or backend itself.
     //@@       The inference server will not override the model/backend
     //@@       settings.
-    //@@       Currently, this option is supported only for Tensorflow models.
     //@@
     KIND_MODEL = 3;
   }
 
+  //@@
+  //@@  .. cpp:var:: message SecondaryDevice
+  //@@
+  //@@     A secondary device required for a model instance.
+  //@@
+  message SecondaryDevice
+  {
+    //@@
+    //@@  .. cpp:enum:: SecondaryDeviceKind
+    //@@
+    //@@     The kind of the secondary device.
+    //@@
+    enum SecondaryDeviceKind {
+      //@@    .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0
+      //@@
+      //@@       An NVDLA core. http://nvdla.org
+      //@@       Currently KIND_NVDLA is only supported by the TensorRT backend.
+      //@@
+      KIND_NVDLA = 0;
+    }
+
+    //@@  .. cpp:var:: SecondaryDeviceKind kind
+    //@@
+    //@@     The secondary device kind.
+    //@@
+    SecondaryDeviceKind kind = 1;
+
+    //@@  .. cpp:var:: int64 device_id
+    //@@
+    //@@     Identifier for the secondary device.
+    //@@
+    int64 device_id = 2;
+  }
+
   //@@  .. cpp:var:: string name
   //@@
   //@@     Optional name of this group of instances. If not specified the
@@ -215,11 +252,18 @@ message ModelInstanceGroup
   //@@
   //@@     GPU(s) where instances should be available. For each GPU listed,
   //@@     'count' instances of the model will be available. Setting 'gpus'
-  //@@     to empty (or not specifying at all) is eqivalent to listing all
+  //@@     to empty (or not specifying at all) is equivalent to listing all
   //@@     available GPUs.
   //@@
   repeated int32 gpus = 3;
 
+  //@@  .. cpp:var:: SecondaryDevice secondary_devices (repeated)
+  //@@
+  //@@     Secondary devices that are required by instances specified by this
+  //@@     instance group. Optional.
+  //@@
+  repeated SecondaryDevice secondary_devices = 8;
+
   //@@  .. cpp:var:: string profile (repeated)
   //@@
   //@@     For TensorRT models containing multiple optimization profile, this
@@ -232,6 +276,23 @@ message ModelInstanceGroup
   //@@     optimization profile by default.
   //@@
   repeated string profile = 5;
+
+  //@@  .. cpp:var:: bool passive
+  //@@
+  //@@     Whether the instances within this instance group will be accepting
+  //@@     inference requests from the scheduler. If true, the instances will
+  //@@     not be added to the scheduler. Default value is false.
+  //@@
+  bool passive = 7;
+
+  //@@  .. cpp:var:: string host_policy
+  //@@
+  //@@     The host policy name that the instance is to be associated with.
+  //@@     The default value is set to reflect the device kind of the instance,
+  //@@     for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and
+  //@@     KIND_GPU is "gpu_<gpu_id>".
+  //@@
+  string host_policy = 9;
 }
 
 //@@
@@ -337,10 +398,26 @@ message ModelInput
   //@@     created batch. Default is false indicating that two requests will
   //@@     only be batched if this tensor has the same shape in both requests.
   //@@     True indicates that two requests can be batched even if this tensor
-  //@@     has a different shape in each request. A true value is currently
-  //@@     supported only for custom models.
+  //@@     has a different shape in each request.
   //@@
   bool allow_ragged_batch = 7;
+
+  //@@  .. cpp:var:: bool optional
+  //@@
+  //@@     Whether or not the input is optional for the model execution.
+  //@@     If true, the input is not required in the inference request.
+  //@@     Default value is false.
+  //@@
+  bool optional = 8;
+
+  //@@  .. cpp:var:: bool is_non_linear_format_io
+  //@@
+  //@@     Indicates whether the input tensor uses a non-linear IO format. This
+  //@@     field is currently supported only for TensorRT models. An error will
+  //@@     be generated if this specification does not comply with the
+  //@@     underlying model.
+  //@@
+  bool is_non_linear_format_io = 9;
 }
 
 //@@
@@ -371,7 +448,7 @@ message ModelOutput
   //@@  .. cpp:var:: ModelTensorReshape reshape
   //@@
   //@@     The shape produced for this output by the backend. The output will
-  //@@     be reshaped from this to the shape specifed in 'dims' before being
+  //@@     be reshaped from this to the shape specified in 'dims' before being
   //@@     returned in the inference response. The reshape must have the same
   //@@     number of elements as the output shape specified by 'dims'. Optional.
   //@@
@@ -393,6 +470,15 @@ message ModelOutput
   //@@     model.
   //@@
   bool is_shape_tensor = 6;
+
+  //@@  .. cpp:var:: bool is_non_linear_format_io
+  //@@
+  //@@     Indicates whether the output tensor uses a non-linear IO format. This
+  //@@     field is currently supported only for TensorRT models. An error will
+  //@@     be generated if this specification does not comply with the
+  //@@     underlying model.
+  //@@
+  bool is_non_linear_format_io = 7;
 }
 
 //@@  .. cpp:var:: message BatchInput
@@ -444,6 +530,28 @@ message BatchInput
     //@@         The data of the tensor will be uninitialized.
     //@@
     BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3;
+
+    //@@      .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4
+    //@@
+    //@@         Among the requests in the batch, the shape of the
+    //@@         'source_input' will be added as input with shape
+    //@@         [batch_size, len(input_dim)]. For example, if one
+    //@@         batch-2 input with shape [3, 1] and batch-1 input
+    //@@         with shape [2, 2] are batched, the batch input will
+    //@@         have shape [3, 2] and value [[3, 1], [3, 1], [2, 2]].
+    //@@
+    BATCH_ITEM_SHAPE = 4;
+
+    //@@      .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5
+    //@@
+    //@@         Among the requests in the batch, the shape of the
+    //@@         'source_input' will be added as input with single dimensional
+    //@@         shape [batch_size * len(input_dim)]. For example, if one
+    //@@         batch-2 input with shape [3, 1] and batch-1 input
+    //@@         with shape [2, 2] are batched, the batch input will
+    //@@         have shape [6] and value [3, 1, 3, 1, 2, 2].
+    //@@
+    BATCH_ITEM_SHAPE_FLATTEN = 5;
   }
 
   //@@    .. cpp:var:: Kind kind
@@ -749,6 +857,17 @@ message ModelOptimizationPolicy
     //@@       Currently only recognized by TensorRT backend.
     //@@
     repeated GraphSpec graph_spec = 3;
+
+    //@@    .. cpp:var:: bool output_copy_stream
+    //@@
+    //@@       Uses a CUDA stream separate from the inference stream to copy the
+    //@@       output to host. However, be aware that setting this option to
+    //@@       true will lead to an increase in the memory consumption of the
+    //@@       model as Triton will allocate twice as much GPU memory for its
+    //@@       I/O tensor buffers. Default value is false.
+    //@@       Currently only recognized by TensorRT backend.
+    //@@
+    bool output_copy_stream = 4;
   }
 
   //@@
@@ -786,7 +905,7 @@ message ModelOptimizationPolicy
 
       //@@    .. cpp:var:: map<string, string> parameters
       //@@
-      //@@       Additional paremeters used to configure the accelerator.
+      //@@       Additional parameters used to configure the accelerator.
       //@@
       map<string, string> parameters = 2;
     }
@@ -903,6 +1022,29 @@ message ModelOptimizationPolicy
   //@@     Default is true.
   //@@
   PinnedMemoryBuffer output_pinned_memory = 6;
+
+  //@@  .. cpp:var:: uint32 gather_kernel_buffer_threshold
+  //@@
+  //@@     The backend may use a gather kernel to gather input data if the
+  //@@     device has direct access to the source buffer and the destination
+  //@@     buffer. In such case, the gather kernel will be used only if the
+  //@@     number of buffers to be gathered is greater than or equal to
+  //@@     the specified value. If 0, the gather kernel will be disabled.
+  //@@     Default value is 0.
+  //@@     Currently only recognized by TensorRT backend.
+  //@@
+  uint32 gather_kernel_buffer_threshold = 7;
+
+  //@@  .. cpp:var:: bool eager_batching
+  //@@
+  //@@     Start preparing the next batch before the model instance is ready
+  //@@     for the next inference. This option can be used to overlap the
+  //@@     batch preparation with model execution, with the trade-off that
+  //@@     the next batch might be smaller than what it could have been.
+  //@@     Default value is false.
+  //@@     Currently only recognized by TensorRT backend.
+  //@@
+  bool eager_batching = 8;
 }
 
 //@@
@@ -1010,7 +1152,7 @@ message ModelDynamicBatching
   //@@
   bool preserve_ordering = 3;
 
-  //@@  .. cpp:var:: uint32 priority_levels
+  //@@  .. cpp:var:: uint64 priority_levels
   //@@
   //@@     The number of priority levels to be enabled for the model,
   //@@     the priority level starts from 1 and 1 is the highest priority.
@@ -1019,14 +1161,14 @@ message ModelDynamicBatching
   //@@     priority 3, etc. Requests with the same priority level will be
   //@@     handled in the order that they are received.
   //@@
-  uint32 priority_levels = 4;
+  uint64 priority_levels = 4;
 
-  //@@  .. cpp:var:: uint32 default_priority_level
+  //@@  .. cpp:var:: uint64 default_priority_level
   //@@
   //@@     The priority level used for requests that don't specify their
   //@@     priority. The value must be in the range [ 1, 'priority_levels' ].
   //@@
-  uint32 default_priority_level = 5;
+  uint64 default_priority_level = 5;
 
   //@@  .. cpp:var:: ModelQueuePolicy default_queue_policy
   //@@
@@ -1037,13 +1179,13 @@ message ModelDynamicBatching
   //@@
   ModelQueuePolicy default_queue_policy = 6;
 
-  //@@  .. cpp:var:: map<uint32, ModelQueuePolicy> priority_queue_policy
+  //@@  .. cpp:var:: map<uint64, ModelQueuePolicy> priority_queue_policy
   //@@
   //@@     Specify the queue policy for the priority level. The default queue
   //@@     policy will be used if a priority level doesn't specify a queue
   //@@     policy.
   //@@
-  map<uint32, ModelQueuePolicy> priority_queue_policy = 7;
+  map<uint64, ModelQueuePolicy> priority_queue_policy = 7;
 }
 
 //@@
@@ -1071,8 +1213,8 @@ message ModelSequenceBatching
       //@@
       //@@         A new sequence is/is-not starting. If true a sequence is
       //@@         starting, if false a sequence is continuing. Must
-      //@@         specify either int32_false_true or fp32_false_true for
-      //@@         this control. This control is optional.
+      //@@         specify either int32_false_true, fp32_false_true or
+      //@@         bool_false_true for this control. This control is optional.
       //@@
       CONTROL_SEQUENCE_START = 0;
 
@@ -1081,17 +1223,18 @@ message ModelSequenceBatching
       //@@         A sequence is/is-not ready for inference. If true the
       //@@         input tensor data is valid and should be used. If false
       //@@         the input tensor data is invalid and inferencing should
-      //@@         be "skipped".  Must specify either int32_false_true or
-      //@@         fp32_false_true for this control. This control is optional.
+      //@@         be "skipped". Must specify either int32_false_true,
+      //@@         fp32_false_true or bool_false_true for this control. This
+      //@@         control is optional.
       //@@
       CONTROL_SEQUENCE_READY = 1;
 
       //@@      .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2
       //@@
       //@@         A sequence is/is-not ending. If true a sequence is
-      //@@         ending, if false a sequence is continuing. Must
-      //@@         specify either int32_false_true or fp32_false_true for
-      //@@         this control. This control is optional.
+      //@@         ending, if false a sequence is continuing. Must specify
+      //@@         either int32_false_true, fp32_false_true or bool_false_true
+      //@@         for this control. This control is optional.
       //@@
       CONTROL_SEQUENCE_END = 2;
 
@@ -1134,6 +1277,16 @@ message ModelSequenceBatching
     //@@
     repeated float fp32_false_true = 3;
 
+    //@@    .. cpp:var:: bool bool_false_true (repeated)
+    //@@
+    //@@       The control's true and false setting is indicated by setting
+    //@@       a value in a bool tensor. The tensor must be a
+    //@@       1-dimensional tensor with size equal to the batch size of
+    //@@       the request. 'bool_false_true' must have two entries: the
+    //@@       first the false value and the second the true value.
+    //@@
+    repeated bool bool_false_true = 5;
+
     //@@    .. cpp:var:: DataType data_type
     //@@
     //@@       The control's datatype.
@@ -1161,6 +1314,128 @@ message ModelSequenceBatching
     repeated Control control = 2;
   }
 
+  //@@
+  //@@  .. cpp:var:: message InitialState
+  //@@
+  //@@     Settings used to initialize data for implicit state.
+  //@@
+  message InitialState
+  {
+    //@@      .. cpp:var:: DataType data_type
+    //@@
+    //@@         The data-type of the state.
+    //@@
+    DataType data_type = 1;
+
+    //@@      .. cpp:var:: int64 dims (repeated)
+    //@@
+    //@@         The shape of the state tensor, not including the batch
+    //@@         dimension.
+    //@@
+    repeated int64 dims = 2;
+
+    //@@      .. cpp:var:: oneof state_data
+    //@@
+    //@@         Specify how the initial state data is generated.
+    //@@
+    oneof state_data
+    {
+      //@@
+      //@@      .. cpp:var:: bool zero_data
+      //@@
+      //@@         The identifier for using zeros as initial state data.
+      //@@         Note that the value of 'zero_data' will not be checked;
+      //@@         instead, zero data will be used as long as the field is set.
+      //@@
+      bool zero_data = 3;
+
+      //@@      .. cpp:var:: string data_file
+      //@@
+      //@@         The file whose content will be used as the initial data for
+      //@@         the state in row-major order. The file must be provided in
+      //@@         sub-directory 'initial_state' under the model directory.
+      //@@
+      string data_file = 4;
+    }
+
+    //@@  .. cpp:var:: string name
+    //@@
+    //@@     The name of the state initialization.
+    //@@
+    string name = 5;
+  }
+
+  //@@  .. cpp:var:: message State
+  //@@
+  //@@     An input / output pair of tensors that carry state for the sequence.
+  //@@
+  message State
+  {
+    //@@    .. cpp:var:: string input_name
+    //@@
+    //@@       The name of the model state input.
+    //@@
+    string input_name = 1;
+
+    //@@    .. cpp:var:: string output_name
+    //@@
+    //@@       The name of the model state output.
+    //@@
+    string output_name = 2;
+
+    //@@    .. cpp:var:: DataType data_type
+    //@@
+    //@@       The data-type of the state.
+    //@@
+    DataType data_type = 3;
+
+    //@@    .. cpp:var:: int64 dims (repeated)
+    //@@
+    //@@       The dimensions of the state.
+    //@@
+    repeated int64 dims = 4;
+
+    //@@  .. cpp:var:: InitialState initial_state (repeated)
+    //@@
+    //@@     The optional field to specify the initial state for the model.
+    //@@
+    repeated InitialState initial_state = 5;
+
+    //@@  .. cpp:var:: bool use_same_buffer_for_input_output
+    //@@
+    //@@     The optional field to use a single buffer for both input and output
+    //@@     state. Without this option, Triton allocates separate buffers
+    //@@     for input and output state
+    //@@     which can be problematic if the state size is
+    //@@     large. This option reduces the memory usage by allocating a single
+    //@@     buffer. Enabling this option is recommended whenever
+    //@@     the input state is processed before the output state is written.
+    //@@     When enabled the state
+    //@@     will always be updated independent of whether
+    //@@     TRITONBACKEND_StateUpdate is called
+    //@@     (however TRITONBACKEND_StateUpdate should still be called for
+    //@@     completeness).
+    //@@
+    //@@     The default value is false.
+    //@@
+    bool use_same_buffer_for_input_output = 6;
+
+    //@@  .. cpp:var:: bool use_growable_memory
+    //@@
+    //@@     The optional field to enable an implicit state buffer to grow
+    //@@     without reallocating or copying existing memory.
+    //@@     Additional memory will be appended to the end of the buffer and
+    //@@     existing data will be preserved.
+    //@@     This option is only available for CUDA memory and requires enabling
+    //@@     use_same_buffer_for_input_output. When using this option,
+    //@@     StateBuffer call will always return CUDA memory even if CPU memory
+    //@@     is requested.
+    //@@
+    //@@     The default value is false.
+    //@@
+    bool use_growable_memory = 7;
+  }
+
   //@@  .. cpp:var:: message StrategyDirect
   //@@
   //@@     The sequence batcher uses a specific, unique batch
@@ -1169,7 +1444,31 @@ message ModelSequenceBatching
   //@@     model instance over the lifetime of the sequence. This
   //@@     is the default strategy.
   //@@
-  message StrategyDirect {}
+  message StrategyDirect
+  {
+    //@@    .. cpp:var:: uint64 max_queue_delay_microseconds
+    //@@
+    //@@       The maximum time, in microseconds, a candidate request
+    //@@       will be delayed in the sequence batch scheduling queue to
+    //@@       wait for additional requests for batching. Default is 0.
+    //@@
+    uint64 max_queue_delay_microseconds = 1;
+
+    //@@    .. cpp:var:: float minimum_slot_utilization
+    //@@
+    //@@       The minimum slot utilization that must be satisfied to
+    //@@       execute the batch before 'max_queue_delay_microseconds' expires.
+    //@@       For example, a value of 0.5 indicates that the batch should be
+    //@@       executed as soon as 50% or more of the slots are ready even if
+    //@@       the 'max_queue_delay_microseconds' timeout has not expired.
+    //@@       The default is 0.0, indicating that a batch will be executed
+    //@@       before 'max_queue_delay_microseconds' timeout expires if at least
+    //@@       one batch slot is ready. 'max_queue_delay_microseconds' will be
+    //@@       ignored unless minimum_slot_utilization is set to a non-zero
+    //@@       value.
+    //@@
+    float minimum_slot_utilization = 2;
+  }
 
   //@@  .. cpp:var:: message StrategyOldest
   //@@
@@ -1187,7 +1486,7 @@ message ModelSequenceBatching
     //@@    .. cpp:var:: int32 max_candidate_sequences
     //@@
     //@@       Maximum number of candidate sequences that the batcher
-    //@@       maintains. Excess seqences are kept in an ordered backlog
+    //@@       maintains. Excess sequences are kept in an ordered backlog
     //@@       and become candidates when existing candidate sequences
     //@@       complete.
     //@@
@@ -1197,7 +1496,7 @@ message ModelSequenceBatching
     //@@
     //@@       Preferred batch sizes for dynamic batching of candidate
     //@@       sequences. If a batch of one of these sizes can be formed
-    //@@       it will be executed immediately.  If not specified a
+    //@@       it will be executed immediately. If not specified a
     //@@       preferred batch size will be chosen automatically
     //@@       based on model and GPU characteristics.
     //@@
@@ -1210,6 +1509,29 @@ message ModelSequenceBatching
     //@@       wait for additional requests for batching. Default is 0.
     //@@
     uint64 max_queue_delay_microseconds = 3;
+
+    //@@    .. cpp:var:: bool preserve_ordering
+    //@@
+    //@@       Should the dynamic batcher preserve the ordering of responses to
+    //@@       match the order of requests received by the scheduler. Default is
+    //@@       false. If true, the responses will be returned in the same order
+    //@@       as the order of requests sent to the scheduler. If false, the
+    //@@       responses may be returned in arbitrary order. This option is
+    //@@       specifically needed when a sequence of related inference requests
+    //@@       (i.e. inference requests with the same correlation ID) are sent
+    //@@       to the dynamic batcher to ensure that the sequence responses are
+    //@@       in the correct order.
+    //@@
+    //@@       When using decoupled models, setting this to true may block the
+    //@@       responses from independent sequences from being returned to the
+    //@@       client until the previous request completes, hurting overall
+    //@@       performance. If using GRPC streaming protocol, the stream
+    //@@       ordering guarantee may be sufficient alone to ensure the
+    //@@       responses for each sequence are returned in sequence-order
+    //@@       without blocking based on independent requests, depending on the
+    //@@       use case.
+    //@@
+    bool preserve_ordering = 4;
   }
 
   //@@  .. cpp:var:: oneof strategy_choice
@@ -1251,6 +1573,28 @@ message ModelSequenceBatching
   //@@     model.
   //@@
   repeated ControlInput control_input = 2;
+
+  //@@  .. cpp:var:: State state (repeated)
+  //@@
+  //@@     The optional state that can be stored in Triton for performing
+  //@@     inference requests on a sequence. Each sequence holds an implicit
+  //@@     state local to itself. The output state tensor provided by the
+  //@@     model in 'output_name' field of the current inference request will
+  //@@     be transferred as an input tensor named 'input_name' in the next
+  //@@     request of the same sequence. The input state of the first request
+  //@@     in the sequence contains garbage data.
+  //@@
+  repeated State state = 5;
+
+  //@@  .. cpp:var:: bool iterative_sequence
+  //@@
+  //@@     Requests for iterative sequences are processed over a number
+  //@@     of iterations. An iterative sequence is initiated by a single
+  //@@     request and is "rescheduled" by the model until completion.
+  //@@     Requests for inflight sequences will be batched together
+  //@@     and can complete independently. Note this feature
+  //@@     requires backend support. Default value is false.
+  bool iterative_sequence = 6;
 }
 
 //@@
@@ -1301,6 +1645,13 @@ message ModelEnsembling
     //@@     can appear in an output map only once.
     //@@
     map<string, string> output_map = 4;
+
+    //@@  .. cpp:var:: string model_namespace
+    //@@
+    //@@     [RESERVED] Currently this field is reserved for internal use. Users
+    //@@     must not set any value to this field to avoid unexpected behavior.
+    //@@
+    string model_namespace = 5;
   }
 
   //@@  .. cpp:var:: Step step (repeated)
@@ -1380,7 +1731,10 @@ message ModelWarmup
       //@@
       //@@       The file whose content will be used as raw input data in
       //@@       row-major order. The file must be provided in a sub-directory
-      //@@       'warmup' under the model directory.
+      //@@       'warmup' under the model directory. The file contents should be
+      //@@       in binary format. For TYPE_STRING data-type, an element is
+      //@@       represented by a 4-byte unsigned integer giving the length
+      //@@       followed by the actual bytes.
       //@@
       string input_data_file = 5;
     }
@@ -1407,6 +1761,20 @@ message ModelWarmup
   //@@     control tensors.
   //@@
   map<string, Input> inputs = 3;
+
+  //@@  .. cpp:var:: uint32 count
+  //@@
+  //@@     The number of iterations that this warmup sample will be executed.
+  //@@     For example, if this field is set to 2, 2 model executions using this
+  //@@     sample will be scheduled for warmup. Default value is 0 which
+  //@@     indicates that this sample will be used only once.
+  //@@     Note that for a sequence model, 'count' may not work well
+  //@@     because the model often expects a valid sequence of requests which
+  //@@     should be represented by a series of warmup samples. 'count > 1'
+  //@@     essentially "resends" one of the samples, which may invalidate the
+  //@@     sequence and result in unexpected warmup failure.
+  //@@
+  uint32 count = 4;
 }
 
 //@@
@@ -1416,7 +1784,7 @@ message ModelWarmup
 //@@
 message ModelOperations
 {
-  //@@  .. cpp:var:: string op_library_filename
+  //@@  .. cpp:var:: string op_library_filename (repeated)
   //@@
   //@@     Optional paths of the libraries providing custom operations for
   //@@     this model. Valid only for ONNX models.
@@ -1444,6 +1812,63 @@ message ModelTransactionPolicy
   bool decoupled = 1;
 }
 
+//@@
+//@@.. cpp:var:: message ModelRepositoryAgents
+//@@
+//@@   The repository agents for the model.
+//@@
+message ModelRepositoryAgents
+{
+  //@@
+  //@@  .. cpp:var:: message Agent
+  //@@
+  //@@     A repository agent that should be invoked for the specified
+  //@@     repository actions for this model.
+  //@@
+  message Agent
+  {
+    //@@    .. cpp:var:: string name
+    //@@
+    //@@       The name of the agent.
+    //@@
+    string name = 1;
+
+    //@@    .. cpp:var:: map<string, string> parameters
+    //@@
+    //@@       The parameters for the agent.
+    //@@
+    map<string, string> parameters = 2;
+  }
+
+  //@@
+  //@@  .. cpp:var:: Agent agents (repeated)
+  //@@
+  //@@     The ordered list of agents for the model. These agents will be
+  //@@     invoked in order to respond to repository actions occurring for the
+  //@@     model.
+  //@@
+  repeated Agent agents = 1;
+}
+
+//@@
+//@@.. cpp:var:: message ModelResponseCache
+//@@
+//@@   The response cache setting for the model.
+//@@
+message ModelResponseCache
+{
+  //@@
+  //@@  .. cpp:var:: bool enable
+  //@@
+  //@@     Whether or not to use response cache for the model. If True, the
+  //@@     responses from the model are cached and when an identical request
+  //@@     is encountered, instead of going through the model execution,
+  //@@     the response from the cache is utilized. By default, response
+  //@@     cache is disabled for the models.
+  //@@
+  bool enable = 1;
+}
+
 //@@
 //@@.. cpp:var:: message ModelConfig
 //@@
@@ -1459,10 +1884,9 @@ message ModelConfig
 
   //@@  .. cpp:var:: string platform
   //@@
-  //@@     The framework for the model. Possible values are
-  //@@     "tensorrt_plan", "tensorflow_graphdef",
-  //@@     "tensorflow_savedmodel", "caffe2_netdef",
-  //@@     "onnxruntime_onnx", "pytorch_libtorch" and "custom".
+  //@@     Additional backend-specific configuration for the model.
+  //@@     Please refer to the backend documentation on whether this field
+  //@@     should be specified.
   //@@
   string platform = 2;
 
@@ -1472,6 +1896,12 @@ message ModelConfig
   //@@
   string backend = 17;
 
+  //@@  .. cpp:var:: string runtime
+  //@@
+  //@@     The name of the backend library file used by the model.
+  //@@
+  string runtime = 25;
+
   //@@  .. cpp:var:: ModelVersionPolicy version_policy
   //@@
   //@@     Policy indicating which version(s) of the model will be served.
@@ -1583,7 +2013,7 @@ message ModelConfig
   //@@     compute-capability specific model is not specified in
   //@@     :cpp:var:`cc_model_filenames`. If not specified the default name
   //@@     is 'model.graphdef', 'model.savedmodel', 'model.plan' or
-  //@@     'model.netdef' depending on the model type.
+  //@@     'model.pt' depending on the model type.
   //@@
   string default_model_filename = 8;
 
@@ -1605,8 +2035,7 @@ message ModelConfig
 
   //@@  .. cpp:var:: map<string,ModelParameter> parameters
   //@@
-  //@@     Optional model parameters. User-specified parameter values that
-  //@@     are made available to custom backends.
+  //@@     Optional model parameters. User-specified parameter values.
   //@@
   map<string, ModelParameter> parameters = 14;
 
@@ -1633,4 +2062,18 @@ message ModelConfig
   //@@     to be expected from the model.
   //@@
   ModelTransactionPolicy model_transaction_policy = 19;
-}
+
+  //@@  .. cpp:var:: ModelRepositoryAgents model_repository_agents
+  //@@
+  //@@     Optional specification of the agent(s) that should be invoked
+  //@@     when repository actions are performed for this model.
+  //@@
+  ModelRepositoryAgents model_repository_agents = 23;
+
+  //@@  .. cpp:var:: ModelResponseCache response_cache
+  //@@
+  //@@     Optional setting for utilizing the response cache for this
+  //@@     model.
+  //@@
+  ModelResponseCache response_cache = 24;
+}
\ No newline at end of file