From e18b8b81207b0491c572ab581de1fdefc8ff7065 Mon Sep 17 00:00:00 2001 From: Filippe Spolti Date: Thu, 27 Jun 2024 17:04:21 -0300 Subject: [PATCH] chore: Upgrade Golang to 1.21 (#89) Signed-off-by: Spolti feat: Update model_config.proto --- Dockerfile | 2 +- go.mod | 2 +- go.sum | 14 +- internal/proto/triton/model_config.proto | 519 +++++++++++++++++++++-- 4 files changed, 495 insertions(+), 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6514930d..8d06c227 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ ############################################################################### # Stage 1: Create the developer image for the BUILDPLATFORM only ############################################################################### -ARG GOLANG_VERSION=1.19 +ARG GOLANG_VERSION=1.21 FROM --platform=$BUILDPLATFORM registry.access.redhat.com/ubi8/go-toolset:$GOLANG_VERSION AS develop ARG PROTOC_VERSION=21.5 diff --git a/go.mod b/go.mod index 1eb4dac6..4bb36a32 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/kserve/modelmesh-runtime-adapter -go 1.19 +go 1.21 require ( cloud.google.com/go/storage v1.28.1 diff --git a/go.sum b/go.sum index e15fc1a6..dd31d146 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,7 @@ cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2Aawl cloud.google.com/go/iam v0.13.0 h1:+CmB+K0J/33d0zSQ9SlFWUeCCEn5XJA0ZMZ3pHE9u8k= cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0= cloud.google.com/go/longrunning v0.4.1 h1:v+yFJOfKC3yZdY6ZUI933pIYdhyhV8S3NpWrXWmg7jM= +cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo= cloud.google.com/go/storage v1.28.1 h1:F5QDG5ChchaAVQhINh24U99OWHURqrW8OmQcGKXcbgI= cloud.google.com/go/storage v1.28.1/go.mod h1:Qnisd4CqDdo6BGs2AD5LLnEsmSQ80wQ5ogcBBKhU86Y= github.com/Azure/azure-sdk-for-go/sdk/azcore v0.21.0/go.mod h1:fBF9PQNqB8scdgpZ3ufzaLntG0AG7C1WjPMsiFOmfHM= @@ -50,6 +51,7 @@ 
github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbV github.com/go-logr/zapr v1.2.3 h1:a9vnzlIBPQBBkeaR9IuMUfmVOrQlkoC4YfPoFkX3T7A= github.com/go-logr/zapr v1.2.3/go.mod h1:eIauM6P8qSvTw5o2ez6UEAfGjQKrxQTl5EoK+Qa2oG4= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I= +github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt v3.2.1+incompatible h1:73Z+4BJcrTC+KczS6WvTPvRGOp1WmfEP4Q1lOd9Z/+c= @@ -89,7 +91,9 @@ github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/ github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian/v3 v3.3.2 h1:IqNFLAmvJOgVlpdEBiQbDc2EwKW77amAycfTuWKdfvw= +github.com/google/martian/v3 v3.3.2/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= @@ -111,6 +115,7 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= 
+github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -125,7 +130,9 @@ github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjY github.com/modocache/gover v0.0.0-20171022184752-b58185e213c5/go.mod h1:caMODM3PzxT8aQXRPkAt8xlV/e7d7w8GM5g0fa5F0D8= github.com/montanaflynn/stats v0.6.6/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/onsi/ginkgo/v2 v2.9.1 h1:zie5Ly042PD3bsCvsSOPvRnFwyo3rKe64TJlD6nu0mk= +github.com/onsi/ginkgo/v2 v2.9.1/go.mod h1:FEcmzVcCHl+4o9bQZVab+4dC9+j+91t2FHSzmGAPfuo= github.com/onsi/gomega v1.27.4 h1:Z2AnStgsdSayCMDiCU42qIz+HLqEPcgiOCXjAU/w+8E= +github.com/onsi/gomega v1.27.4/go.mod h1:riYq/GJKh8hhoM01HN6Vmuy93AarCXCBGpvFDK3q3fQ= github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4 h1:Qj1ukM4GlMWXNdMBuXcXfz/Kw9s1qm0CLY32QxuSImI= github.com/pkg/browser v0.0.0-20210115035449-ce105d075bb4/go.mod h1:N6UoU20jOqggOuDwUaBQpluzLNDqif3kq9z2wpdYEfQ= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -137,6 +144,7 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1: github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -158,6 +166,7 @@ 
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= +go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= @@ -225,6 +234,7 @@ golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4= +golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -260,8 +270,6 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= -google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= google.golang.org/protobuf v1.33.0 
h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -282,6 +290,7 @@ gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= k8s.io/api v0.26.1 h1:f+SWYiPd/GsiWwVRz+NbFyCgvv75Pk9NK6dlkZgpCRQ= +k8s.io/api v0.26.1/go.mod h1:xd/GBNgR0f707+ATNyPmQ1oyKSgndzXij81FzWGsejg= k8s.io/apimachinery v0.27.0 h1:vEyy/PVMbPMCPutrssCVHCf0JNZ0Px+YqPi82K2ALlk= k8s.io/apimachinery v0.27.0/go.mod h1:5ikh59fK3AJ287GUvpUsryoMFtH9zj/ARfWCo3AyXTM= k8s.io/klog/v2 v2.90.1 h1:m4bYOKall2MmOiRaR1J+We67Do7vm9KiQVlT96lnHUw= @@ -295,3 +304,4 @@ sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h6 sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/internal/proto/triton/model_config.proto b/internal/proto/triton/model_config.proto index 378bfb6b..31655fce 100644 --- a/internal/proto/triton/model_config.proto +++ b/internal/proto/triton/model_config.proto @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. +// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -71,6 +71,9 @@ enum DataType { //@@ .. 
cpp:enumerator:: DataType::STRING = 13 TYPE_STRING = 13; + + //@@ .. cpp:enumerator:: DataType::BF16 = 14 + TYPE_BF16 = 14; } //@@ @@ -115,17 +118,18 @@ message ModelRateLimiter //@@ //@@ The resources required to execute the request on a model instance. //@@ Resources are just names with a corresponding count. The execution - //@@ of the instance will be blocked until the specificied resources are + //@@ of the instance will be blocked until the specified resources are //@@ available. By default an instance uses no rate-limiter resources. //@@ repeated Resource resources = 1; //@@ .. cpp:var:: uint32 priority //@@ - //@@ The weighting value to be used for prioritizing across instances. - //@@ An instance with priority 2 will be given 1/2 the number of - //@@ scheduling chances as an instance_group with priority 1. The - //@@ default priority is 1. + //@@ The optional weighting value to be used for prioritizing across + //@@ instances. An instance with priority 2 will be given 1/2 the + //@@ number of scheduling chances as an instance_group with priority + //@@ 1. The default priority is 1. The priority of value 0 will be + //@@ treated as priority 1. //@@ uint32 priority = 2; } @@ -173,11 +177,44 @@ message ModelInstanceGroup //@@ CPU and/or GPU(s) as specified by the model or backend itself. //@@ The inference server will not override the model/backend //@@ settings. - //@@ Currently, this option is supported only for Tensorflow models. //@@ KIND_MODEL = 3; } + //@@ + //@@ .. cpp:var:: message SecondaryDevice + //@@ + //@@ A secondary device required for a model instance. + //@@ + message SecondaryDevice + { + //@@ + //@@ .. cpp:enum:: SecondaryDeviceKind + //@@ + //@@ The kind of the secondary device. + //@@ + enum SecondaryDeviceKind { + //@@ .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0 + //@@ + //@@ An NVDLA core. http://nvdla.org + //@@ Currently KIND_NVDLA is only supported by the TensorRT backend. + //@@ + KIND_NVDLA = 0; + } + + //@@ .. 
cpp:var:: SecondaryDeviceKind kind + //@@ + //@@ The secondary device kind. + //@@ + SecondaryDeviceKind kind = 1; + + //@@ .. cpp:var:: int64 device_id + //@@ + //@@ Identifier for the secondary device. + //@@ + int64 device_id = 2; + } + //@@ .. cpp:var:: string name //@@ //@@ Optional name of this group of instances. If not specified the @@ -215,11 +252,18 @@ message ModelInstanceGroup //@@ //@@ GPU(s) where instances should be available. For each GPU listed, //@@ 'count' instances of the model will be available. Setting 'gpus' - //@@ to empty (or not specifying at all) is eqivalent to listing all + //@@ to empty (or not specifying at all) is equivalent to listing all //@@ available GPUs. //@@ repeated int32 gpus = 3; + //@@ .. cpp:var:: SecondaryDevice secondary_devices (repeated) + //@@ + //@@ Secondary devices that are required by instances specified by this + //@@ instance group. Optional. + //@@ + repeated SecondaryDevice secondary_devices = 8; + //@@ .. cpp:var:: string profile (repeated) //@@ //@@ For TensorRT models containing multiple optimization profile, this @@ -232,6 +276,23 @@ message ModelInstanceGroup //@@ optimization profile by default. //@@ repeated string profile = 5; + + //@@ .. cpp:var:: bool passive + //@@ + //@@ Whether the instances within this instance group will be accepting + //@@ inference requests from the scheduler. If true, the instances will + //@@ not be added to the scheduler. Default value is false. + //@@ + bool passive = 7; + + //@@ .. cpp:var:: string host_policy + //@@ + //@@ The host policy name that the instance is to be associated with. + //@@ The default value is set to reflect the device kind of the instance, + //@@ for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and + //@@ KIND_GPU is "gpu_<gpu_id>". + //@@ + string host_policy = 9; } //@@ @@ -337,10 +398,26 @@ message ModelInput //@@ created batch. Default is false indicating that two requests will //@@ only be batched if this tensor has the same shape in both requests.
//@@ True indicates that two requests can be batched even if this tensor - //@@ has a different shape in each request. A true value is currently - //@@ supported only for custom models. + //@@ has a different shape in each request. //@@ bool allow_ragged_batch = 7; + + //@@ .. cpp:var:: bool optional + //@@ + //@@ Whether or not the input is optional for the model execution. + //@@ If true, the input is not required in the inference request. + //@@ Default value is false. + //@@ + bool optional = 8; + + //@@ .. cpp:var:: bool is_non_linear_format_io + //@@ + //@@ Indicates whether the input tensor uses a non-linear IO format. This + //@@ field is currently supported only for TensorRT models. An error will + //@@ be generated if this specification does not comply with the + //@@ underlying model. + //@@ + bool is_non_linear_format_io = 9; } //@@ @@ -371,7 +448,7 @@ message ModelOutput //@@ .. cpp:var:: ModelTensorReshape reshape //@@ //@@ The shape produced for this output by the backend. The output will - //@@ be reshaped from this to the shape specifed in 'dims' before being + //@@ be reshaped from this to the shape specified in 'dims' before being //@@ returned in the inference response. The reshape must have the same //@@ number of elements as the output shape specified by 'dims'. Optional. //@@ @@ -393,6 +470,15 @@ message ModelOutput //@@ model. //@@ bool is_shape_tensor = 6; + + //@@ .. cpp:var:: bool is_non_linear_format_io + //@@ + //@@ Indicates whether the output tensor uses a non-linear IO format. This + //@@ field is currently supported only for TensorRT models. An error will + //@@ be generated if this specification does not comply with the + //@@ underlying model. + //@@ + bool is_non_linear_format_io = 7; } //@@ .. cpp:var:: message BatchInput @@ -444,6 +530,28 @@ message BatchInput //@@ The data of the tensor will be uninitialized. //@@ BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3; + + //@@ .. 
cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4 + //@@ + //@@ Among the requests in the batch, the shape of the + //@@ 'source_input' will be added as input with shape + //@@ [batch_size, len(input_dim)]. For example, if one + //@@ batch-2 input with shape [3, 1] and batch-1 input + //@@ with shape [2, 2] are batched, the batch input will + //@@ have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]]. + //@@ + BATCH_ITEM_SHAPE = 4; + + //@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5 + //@@ + //@@ Among the requests in the batch, the shape of the + //@@ 'source_input' will be added as input with single dimensional + //@@ shape [batch_size * len(input_dim)]. For example, if one + //@@ batch-2 input with shape [3, 1] and batch-1 input + //@@ with shape [2, 2] are batched, the batch input will + //@@ have shape [6] and value [3, 1, 3, 1, 2, 2]. + //@@ + BATCH_ITEM_SHAPE_FLATTEN = 5; } //@@ .. cpp:var:: Kind kind @@ -749,6 +857,17 @@ message ModelOptimizationPolicy //@@ Currently only recognized by TensorRT backend. //@@ repeated GraphSpec graph_spec = 3; + + //@@ .. cpp:var:: bool output_copy_stream + //@@ + //@@ Uses a CUDA stream separate from the inference stream to copy the + //@@ output to host. However, be aware that setting this option to + //@@ true will lead to an increase in the memory consumption of the + //@@ model as Triton will allocate twice as much GPU memory for its + //@@ I/O tensor buffers. Default value is false. + //@@ Currently only recognized by TensorRT backend. + //@@ + bool output_copy_stream = 4; } //@@ @@ -786,7 +905,7 @@ message ModelOptimizationPolicy //@@ .. cpp:var:: map parameters //@@ - //@@ Additional paremeters used to configure the accelerator. + //@@ Additional parameters used to configure the accelerator. //@@ map parameters = 2; } @@ -903,6 +1022,29 @@ message ModelOptimizationPolicy //@@ Default is true. //@@ PinnedMemoryBuffer output_pinned_memory = 6; + + //@@ .. 
cpp:var:: uint32 gather_kernel_buffer_threshold + //@@ + //@@ The backend may use a gather kernel to gather input data if the + //@@ device has direct access to the source buffer and the destination + //@@ buffer. In such case, the gather kernel will be used only if the + //@@ number of buffers to be gathered is greater or equal to + //@@ the specified value. If 0, the gather kernel will be disabled. + //@@ Default value is 0. + //@@ Currently only recognized by TensorRT backend. + //@@ + uint32 gather_kernel_buffer_threshold = 7; + + //@@ .. cpp:var:: bool eager_batching + //@@ + //@@ Start preparing the next batch before the model instance is ready + //@@ for the next inference. This option can be used to overlap the + //@@ batch preparation with model execution, with the trade-off that + //@@ the next batch might be smaller than what it could have been. + //@@ Default value is false. + //@@ Currently only recognized by TensorRT backend. + //@@ + bool eager_batching = 8; } //@@ @@ -1010,7 +1152,7 @@ message ModelDynamicBatching //@@ bool preserve_ordering = 3; - //@@ .. cpp:var:: uint32 priority_levels + //@@ .. cpp:var:: uint64 priority_levels //@@ //@@ The number of priority levels to be enabled for the model, //@@ the priority level starts from 1 and 1 is the highest priority. @@ -1019,14 +1161,14 @@ message ModelDynamicBatching //@@ priority 3, etc. Requests with the same priority level will be //@@ handled in the order that they are received. //@@ - uint32 priority_levels = 4; + uint64 priority_levels = 4; - //@@ .. cpp:var:: uint32 default_priority_level + //@@ .. cpp:var:: uint64 default_priority_level //@@ //@@ The priority level used for requests that don't specify their //@@ priority. The value must be in the range [ 1, 'priority_levels' ]. //@@ - uint32 default_priority_level = 5; + uint64 default_priority_level = 5; //@@ .. 
cpp:var:: ModelQueuePolicy default_queue_policy //@@ @@ -1037,13 +1179,13 @@ message ModelDynamicBatching //@@ ModelQueuePolicy default_queue_policy = 6; - //@@ .. cpp:var:: map<uint32, ModelQueuePolicy> priority_queue_policy + //@@ .. cpp:var:: map<uint64, ModelQueuePolicy> priority_queue_policy //@@ //@@ Specify the queue policy for the priority level. The default queue //@@ policy will be used if a priority level doesn't specify a queue //@@ policy. //@@ - map<uint32, ModelQueuePolicy> priority_queue_policy = 7; + map<uint64, ModelQueuePolicy> priority_queue_policy = 7; } //@@ @@ -1071,8 +1213,8 @@ message ModelSequenceBatching //@@ //@@ A new sequence is/is-not starting. If true a sequence is //@@ starting, if false a sequence is continuing. Must - //@@ specify either int32_false_true or fp32_false_true for - //@@ this control. This control is optional. + //@@ specify either int32_false_true, fp32_false_true or + //@@ bool_false_true for this control. This control is optional. //@@ CONTROL_SEQUENCE_START = 0; @@ -1081,17 +1223,18 @@ message ModelSequenceBatching //@@ A sequence is/is-not ready for inference. If true the //@@ input tensor data is valid and should be used. If false //@@ the input tensor data is invalid and inferencing should - //@@ be "skipped". Must specify either int32_false_true or - //@@ fp32_false_true for this control. This control is optional. + //@@ be "skipped". Must specify either int32_false_true, + //@@ fp32_false_true or bool_false_true for this control. This + //@@ control is optional. //@@ CONTROL_SEQUENCE_READY = 1; //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2 //@@ //@@ A sequence is/is-not ending. If true a sequence is - //@@ ending, if false a sequence is continuing. Must - //@@ specify either int32_false_true or fp32_false_true for - //@@ this control. This control is optional. + //@@ ending, if false a sequence is continuing. Must specify + //@@ either int32_false_true, fp32_false_true or bool_false_true + //@@ for this control. This control is optional.
//@@ CONTROL_SEQUENCE_END = 2; @@ -1134,6 +1277,16 @@ message ModelSequenceBatching //@@ repeated float fp32_false_true = 3; + //@@ .. cpp:var:: bool bool_false_true (repeated) + //@@ + //@@ The control's true and false setting is indicated by setting + //@@ a value in a bool tensor. The tensor must be a + //@@ 1-dimensional tensor with size equal to the batch size of + //@@ the request. 'bool_false_true' must have two entries: the + //@@ first the false value and the second the true value. + //@@ + repeated bool bool_false_true = 5; + //@@ .. cpp:var:: DataType data_type //@@ //@@ The control's datatype. @@ -1161,6 +1314,128 @@ message ModelSequenceBatching repeated Control control = 2; } + //@@ + //@@ .. cpp:var:: message InitialState + //@@ + //@@ Settings used to initialize data for implicit state. + //@@ + message InitialState + { + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the state. + //@@ + DataType data_type = 1; + + //@@ .. cpp:var:: int64 dims (repeated) + //@@ + //@@ The shape of the state tensor, not including the batch + //@@ dimension. + //@@ + repeated int64 dims = 2; + + //@@ .. cpp:var:: oneof state_data + //@@ + //@@ Specify how the initial state data is generated. + //@@ + oneof state_data + { + //@@ + //@@ .. cpp:var:: bool zero_data + //@@ + //@@ The identifier for using zeros as initial state data. + //@@ Note that the value of 'zero_data' will not be checked, + //@@ instead, zero data will be used as long as the field is set. + //@@ + bool zero_data = 3; + + //@@ .. cpp:var:: string data_file + //@@ + //@@ The file whose content will be used as the initial data for + //@@ the state in row-major order. The file must be provided in + //@@ sub-directory 'initial_state' under the model directory. + //@@ + string data_file = 4; + } + + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the state initialization. + //@@ + string name = 5; + } + + //@@ .. 
cpp:var:: message State + //@@ + //@@ An input / output pair of tensors that carry state for the sequence. + //@@ + message State + { + //@@ .. cpp:var:: string input_name + //@@ + //@@ The name of the model state input. + //@@ + string input_name = 1; + + //@@ .. cpp:var:: string output_name + //@@ + //@@ The name of the model state output. + //@@ + string output_name = 2; + + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the state. + //@@ + DataType data_type = 3; + + //@@ .. cpp:var:: int64 dims (repeated) + //@@ + //@@ The dimensions of the state tensor. + //@@ + repeated int64 dims = 4; + + //@@ .. cpp:var:: InitialState initial_state (repeated) + //@@ + //@@ The optional field to specify the initial state for the model. + //@@ + repeated InitialState initial_state = 5; + + //@@ .. cpp:var:: bool use_same_buffer_for_input_output + //@@ + //@@ The optional field to use a single buffer for both input and output + //@@ state. Without this option, Triton allocates separate buffers + //@@ for input and output state + //@@ which can be problematic if the state size is + //@@ large. This option reduces the memory usage by allocating a single + //@@ buffer. Enabling this option is recommended whenever + //@@ the input state is processed before the output state is written. + //@@ When enabled the state + //@@ will always be updated independent of whether + //@@ TRITONBACKEND_StateUpdate is called + //@@ (however TRITONBACKEND_StateUpdate should still be called for + //@@ completeness). + //@@ + //@@ The default value is false. + //@@ + bool use_same_buffer_for_input_output = 6; + + //@@ .. cpp:var:: bool use_growable_memory + //@@ + //@@ The optional field to enable an implicit state buffer to grow + //@@ without reallocating or copying existing memory. + //@@ Additional memory will be appended to the end of the buffer and + //@@ existing data will be preserved.
+ //@@ This option is only available for CUDA memory and requires enabling + //@@ use_same_buffer_for_input_output. When using this option, + //@@ StateBuffer call will always return CUDA memory even if CPU memory + //@@ is requested. + //@@ + //@@ The default value is false. + //@@ + bool use_growable_memory = 7; + } + //@@ .. cpp:var:: message StrategyDirect //@@ //@@ The sequence batcher uses a specific, unique batch @@ -1169,7 +1444,31 @@ message ModelSequenceBatching //@@ model instance over the lifetime of the sequence. This //@@ is the default strategy. //@@ - message StrategyDirect {} + message StrategyDirect + { + //@@ .. cpp:var:: uint64 max_queue_delay_microseconds + //@@ + //@@ The maximum time, in microseconds, a candidate request + //@@ will be delayed in the sequence batch scheduling queue to + //@@ wait for additional requests for batching. Default is 0. + //@@ + uint64 max_queue_delay_microseconds = 1; + + //@@ .. cpp:var:: float minimum_slot_utilization + //@@ + //@@ The minimum slot utilization that must be satisfied to + //@@ execute the batch before 'max_queue_delay_microseconds' expires. + //@@ For example, a value of 0.5 indicates that the batch should be + //@@ executed as soon as 50% or more of the slots are ready even if + //@@ the 'max_queue_delay_microseconds' timeout has not expired. + //@@ The default is 0.0, indicating that a batch will be executed + //@@ before 'max_queue_delay_microseconds' timeout expires if at least + //@@ one batch slot is ready. 'max_queue_delay_microseconds' will be + //@@ ignored unless minimum_slot_utilization is set to a non-zero + //@@ value. + //@@ + float minimum_slot_utilization = 2; + } //@@ .. cpp:var:: message StrategyOldest //@@ @@ -1187,7 +1486,7 @@ message ModelSequenceBatching //@@ .. cpp:var:: int32 max_candidate_sequences //@@ //@@ Maximum number of candidate sequences that the batcher - //@@ maintains. Excess seqences are kept in an ordered backlog + //@@ maintains. 
Excess sequences are kept in an ordered backlog //@@ and become candidates when existing candidate sequences //@@ complete. //@@ @@ -1197,7 +1496,7 @@ message ModelSequenceBatching //@@ //@@ Preferred batch sizes for dynamic batching of candidate //@@ sequences. If a batch of one of these sizes can be formed - //@@ it will be executed immediately. If not specified a + //@@ it will be executed immediately. If not specified a //@@ preferred batch size will be chosen automatically //@@ based on model and GPU characteristics. //@@ @@ -1210,6 +1509,29 @@ message ModelSequenceBatching //@@ wait for additional requests for batching. Default is 0. //@@ uint64 max_queue_delay_microseconds = 3; + + //@@ .. cpp:var:: bool preserve_ordering + //@@ + //@@ Should the dynamic batcher preserve the ordering of responses to + //@@ match the order of requests received by the scheduler. Default is + //@@ false. If true, the responses will be returned in the same order + //@@ as the order of requests sent to the scheduler. If false, the + //@@ responses may be returned in arbitrary order. This option is + //@@ specifically needed when a sequence of related inference requests + //@@ (i.e. inference requests with the same correlation ID) are sent + //@@ to the dynamic batcher to ensure that the sequence responses are + //@@ in the correct order. + //@@ + //@@ When using decoupled models, setting this to true may block the + //@@ responses from independent sequences from being returned to the + //@@ client until the previous request completes, hurting overall + //@@ performance. If using GRPC streaming protocol, the stream + //@@ ordering guarantee may be sufficient alone to ensure the + //@@ responses for each sequence are returned in sequence-order + //@@ without blocking based on independent requests, depending on the + //@@ use case. + //@@ + bool preserve_ordering = 4; } //@@ .. cpp:var:: oneof strategy_choice @@ -1251,6 +1573,28 @@ message ModelSequenceBatching //@@ model. 
//@@ repeated ControlInput control_input = 2; + + //@@ .. cpp:var:: State state (repeated) + //@@ + //@@ The optional state that can be stored in Triton for performing + //@@ inference requests on a sequence. Each sequence holds an implicit + //@@ state local to itself. The output state tensor provided by the + //@@ model in 'output_name' field of the current inference request will + //@@ be transferred as an input tensor named 'input_name' in the next + //@@ request of the same sequence. The input state of the first request + //@@ in the sequence contains garbage data. + //@@ + repeated State state = 5; + + //@@ .. cpp:var:: bool iterative_sequence + //@@ + //@@ Requests for iterative sequences are processed over a number + //@@ of iterations. An iterative sequence is initiated by a single + //@@ request and is "rescheduled" by the model until completion. + //@@ Requests for inflight requests will be batched together + //@@ and can complete independently. Note this feature + //@@ requires backend support. Default value is false. + bool iterative_sequence = 6; } //@@ @@ -1301,6 +1645,13 @@ message ModelEnsembling //@@ can appear in an output map only once. //@@ map output_map = 4; + + //@@ .. cpp:var:: string model_namespace + //@@ + //@@ [RESERVED] currently this field is reserved for internal use, users + //@@ must not set any value to this field to avoid unexpected behavior. + //@@ + string model_namespace = 5; } //@@ .. cpp:var:: Step step (repeated) @@ -1380,7 +1731,10 @@ message ModelWarmup //@@ //@@ The file whose content will be used as raw input data in //@@ row-major order. The file must be provided in a sub-directory - //@@ 'warmup' under the model directory. + //@@ 'warmup' under the model directory. The file contents should be + //@@ in binary format. For TYPE_STRING data-type, an element is + //@@ represented by a 4-byte unsigned integer giving the length + //@@ followed by the actual bytes. 
//@@ string input_data_file = 5; } @@ -1407,6 +1761,20 @@ message ModelWarmup //@@ control tensors. //@@ map<string, Input> inputs = 3; + + //@@ .. cpp:var:: uint32 count + //@@ + //@@ The number of iterations that this warmup sample will be executed. + //@@ For example, if this field is set to 2, 2 model executions using this + //@@ sample will be scheduled for warmup. Default value is 0 which + //@@ indicates that this sample will be used only once. + //@@ Note that for a sequence model, 'count' may not work well + //@@ because the model often expects a valid sequence of requests which + //@@ should be represented by a series of warmup samples. 'count > 1' + //@@ essentially "resends" one of the samples, which may invalidate the + //@@ sequence and result in unexpected warmup failure. + //@@ + uint32 count = 4; } //@@ @@ -1416,7 +1784,7 @@ message ModelWarmup //@@ message ModelOperations { - //@@ .. cpp:var:: string op_library_filename + //@@ .. cpp:var:: string op_library_filename (repeated) //@@ //@@ Optional paths of the libraries providing custom operations for //@@ this model. Valid only for ONNX models. @@ -1444,6 +1812,63 @@ message ModelTransactionPolicy bool decoupled = 1; } +//@@ +//@@.. cpp:var:: message ModelRepositoryAgents +//@@ +//@@ The repository agents for the model. +//@@ +message ModelRepositoryAgents +{ + //@@ + //@@ .. cpp:var:: message Agent + //@@ + //@@ A repository agent that should be invoked for the specified + //@@ repository actions for this model. + //@@ + message Agent + { + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the agent. + //@@ + string name = 1; + + //@@ .. cpp:var:: map<string, string> parameters + //@@ + //@@ The parameters for the agent. + //@@ + map<string, string> parameters = 2; + } + + //@@ + //@@ .. cpp:var:: Agent agents (repeated) + //@@ + //@@ The ordered list of agents for the model. These agents will be + //@@ invoked in order to respond to repository actions occurring for the + //@@ model. + //@@ + repeated Agent agents = 1; +} + +//@@ +//@@..
cpp:var:: message ModelResponseCache +//@@ +//@@ The response cache setting for the model. +//@@ +message ModelResponseCache +{ + //@@ + //@@ .. cpp:var:: bool enable + //@@ + //@@ Whether or not to use response cache for the model. If True, the + //@@ responses from the model are cached and when identical request + //@@ is encountered, instead of going through the model execution, + //@@ the response from the cache is utilized. By default, response + //@@ cache is disabled for the models. + //@@ + bool enable = 1; +} + //@@ //@@.. cpp:var:: message ModelConfig //@@ @@ -1459,10 +1884,9 @@ message ModelConfig //@@ .. cpp:var:: string platform //@@ - //@@ The framework for the model. Possible values are - //@@ "tensorrt_plan", "tensorflow_graphdef", - //@@ "tensorflow_savedmodel", "caffe2_netdef", - //@@ "onnxruntime_onnx", "pytorch_libtorch" and "custom". + //@@ Additional backend-specific configuration for the model. + //@@ Please refer to the backend documentation on whether this field + //@@ should be specified. //@@ string platform = 2; @@ -1472,6 +1896,12 @@ message ModelConfig //@@ string backend = 17; + //@@ .. cpp:var:: string runtime + //@@ + //@@ The name of the backend library file used by the model. + //@@ + string runtime = 25; + //@@ .. cpp:var:: ModelVersionPolicy version_policy //@@ //@@ Policy indicating which version(s) of the model will be served. @@ -1583,7 +2013,7 @@ message ModelConfig //@@ compute-capability specific model is not specified in //@@ :cpp:var:`cc_model_filenames`. If not specified the default name //@@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or - //@@ 'model.netdef' depending on the model type. + //@@ 'model.pt' depending on the model type. //@@ string default_model_filename = 8; @@ -1605,8 +2035,7 @@ message ModelConfig //@@ .. cpp:var:: map<string, ModelParameter> parameters //@@ - //@@ Optional model parameters. User-specified parameter values that - //@@ are made available to custom backends. + //@@ Optional model parameters.
User-specified parameter values. //@@ map<string, ModelParameter> parameters = 14; @@ -1633,4 +2062,18 @@ message ModelConfig //@@ to be expected from the model. //@@ ModelTransactionPolicy model_transaction_policy = 19; -} + + //@@ .. cpp:var:: ModelRepositoryAgents model_repository_agents + //@@ + //@@ Optional specification of the agent(s) that should be invoked + //@@ when repository actions are performed for this model. + //@@ + ModelRepositoryAgents model_repository_agents = 23; + + //@@ .. cpp:var:: ModelResponseCache response_cache + //@@ + //@@ Optional setting for utilizing the response cache for this + //@@ model. + //@@ + ModelResponseCache response_cache = 24; +} \ No newline at end of file