Commit 2c89e24

Triton backend rebase (#301)
* tritonbackend for lightseq: remove useless type qualifiers; tritonbackend README; format; update psf/black; fix code format; update tritonbackend README; fix README format; adapt README; add empty directories which are needed by triton
* format

Co-authored-by: zhoubofan <[email protected]>
1 parent 9a61730 commit 2c89e24

32 files changed (+1305 -17 lines)

.pre-commit-config.yaml

+1 -1

@@ -12,7 +12,7 @@ repos:
         args: [-style=file]

   - repo: https://github.com/psf/black
-    rev: 21.5b2
+    rev: 22.3.0
     hooks:
       - id: black

CMakeLists.txt

+3

@@ -48,6 +48,8 @@ include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/3rdparty/cub)

 link_directories(${COMMON_LIB_DIRS})

+add_compile_options(-Wno-unknown-pragmas)
+
 if(FP16_MODE)
   add_definitions(-DFP16_MODE)
   message(STATUS "Build using fp16 precision")
@@ -67,5 +69,6 @@ add_subdirectory(lightseq/inference/proto)
 add_subdirectory(lightseq/inference/model)
 add_subdirectory(lightseq/inference/pywrapper)
 add_subdirectory(lightseq/inference/server)
+add_subdirectory(lightseq/inference/triton_backend)

 # add_subdirectory(examples/inference/cpp)

docker/Dockerfile.pypi

+6 -1

@@ -35,7 +35,6 @@ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
 ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs

-
 # install protobuf
 RUN curl -O -L -C - \
     https://github.com/protocolbuffers/protobuf/releases/download/v3.13.0/protobuf-cpp-3.13.0.tar.gz && \
@@ -62,5 +61,11 @@ RUN curl -O -L -C - \
     ./configure --prefix=/usr/local/hdf5 "CFLAGS=-fPIC" "CXXFLAGS=-fPIC" --enable-build-mode=production && \
     make -j$(nproc) && make install && cd .. && rm -rf hdf5-hdf5-1_12_0

+# install rapidjson
+RUN curl -L -o rapidjson.tar.gz -C - https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.tar.gz && \
+    tar xf rapidjson.tar.gz && rm rapidjson.tar.gz && \
+    mkdir rapidjson-1.1.0/build && cd rapidjson-1.1.0/build && \
+    cmake .. && make install && cd ../.. && rm -rf rapidjson-1.1.0
+
 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH}
 ENV PATH /usr/local/cuda/bin:/usr/local/hdf5:${PATH}


examples/inference/python/__init__.py

Whitespace-only changes.

examples/inference/python/export/__init__.py

Whitespace-only changes.

examples/inference/python/export/fairseq/__init__.py

Whitespace-only changes.

examples/inference/python/export/huggingface/__init__.py

Whitespace-only changes.

examples/triton_backend/README.md

+91

@@ -0,0 +1,91 @@
# Tritonbackend Usage

## How To Use Tritonbackend

### How To Compile Tritonbackend

- Execute the commands below:

```
$ cd <lightseq_repository>
$ mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release -DFP16_MODE=ON -DDEBUG_MODE=OFF -DDYNAMIC_API=ON .. && \
    make -j$(nproc)
```

This produces `libliblightseq.so` and `libtriton_lightseq.so`, which are needed by the model repository.

You can find `libliblightseq.so` at

`<lightseq_repository>/build/lightseq/inference/pywrapper/libliblightseq.so`

while `libtriton_lightseq.so` is at

`<lightseq_repository>/build/lightseq/inference/triton_backend/libtriton_lightseq.so`

### How To Organize Model Repository

```
├── <path_to_model_repository>/
│   ├── libliblightseq.so          # lightseq's dynamic link library, which contains almost all of
│   │                                the lightseq implementation and must be on LD_LIBRARY_PATH
│   ├── <model_name_1>/            # the directory of a model, containing parameters and configuration.
│   │   ├── config.pbtxt           # the model config; more details below.
│   │   ├── <model_file>           # the model parameter file.
│   │   ├── 1/                     # this empty directory is required by tritonserver.
│   │   ├── libtriton_lightseq.so  # the dynamic link library of lightseq's tritonbackend
│   ├── <model_name_2>/            # ...
│   │   ├── config.pbtxt           # ...
│   │   ├── <model_file>           # ...
│   │   ├── 1/                     # ...
│   │   ├── libtriton_lightseq.so  # ...
│   ├── <model_name_vid>/          # more models, etc.
```

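- To make this layout concrete, here is a minimal Python sketch (an illustration, not part of this commit) that assembles a repository for the `bert_example` model. The source paths follow the build outputs above; `<lightseq_repository>`, `<path_to_model_repository>`, the local `config.pbtxt`, and the exported `lightseq_bert_base_uncased.hdf5` are placeholders to adjust for your setup.

```
# Sketch only: lay out one model following the tree above.
# <lightseq_repository> and <path_to_model_repository> are placeholders.
import shutil
from pathlib import Path

lightseq = Path("<lightseq_repository>")
repo = Path("<path_to_model_repository>")
model_dir = repo / "bert_example"

# the empty version directory "1/" is required by tritonserver
(model_dir / "1").mkdir(parents=True, exist_ok=True)

# shared lightseq library, found through LD_LIBRARY_PATH=/models at runtime
shutil.copy(lightseq / "build/lightseq/inference/pywrapper/libliblightseq.so", repo)

# per-model backend library, config and exported weights
shutil.copy(lightseq / "build/lightseq/inference/triton_backend/libtriton_lightseq.so", model_dir)
shutil.copy("config.pbtxt", model_dir)                     # see model_repo/bert_example/config.pbtxt
shutil.copy("lightseq_bert_base_uncased.hdf5", model_dir)  # exported model file
```
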
- The meaning of the parameters in config.pbtxt (more information can be found in [Model config of tritonbackend](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto)):

> ${name}: the model name, **which should be the same as <model_name_vid>**
>
> ${backend}: **fixed value "lightseq"**, which is used to locate the tritonbackend's dynamic link library, libtriton_lightseq.so
>
> ${default_model_filename}: the model file name, **which should be the same as <model_file>**
>
> ${parameters - value - string_value}: the model type, which must be supported by lightseq. You can choose `Transformer`|`QuantTransformer`|`Bert`|`Gpt`|`Moe`

- You can see an example in [Example Of Triton Model Config](https://github.com/bytedance/lightseq/tree/master/examples/triton_backend/model_repo), and find more detailed information in [Model Config Of Tritonserver](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md).

- The model files needed by [Example Of Triton Model Config](https://github.com/bytedance/lightseq/tree/master/examples/triton_backend/model_repo) can be exported following [The Guide of Model Export](https://github.com/bytedance/lightseq/blob/master/examples/inference/python/README.md).

### How To Run Tritonserver

#### Run Tritonserver By Docker

- Get the tritonserver Docker image: [Tritonserver Quickstart](https://github.com/triton-inference-server/server/blob/main/docs/quickstart.md#install-triton-docker-image)

```
$ sudo docker pull nvcr.io/nvidia/tritonserver:22.01-py3
```

- Docker command:

```
$ sudo docker run --gpus=<num_of_gpus> --rm -e LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/models" -p8000:8000 -p8001:8001 -p8002:8002 -v<model_repository>:/models nvcr.io/nvidia/tritonserver:22.01-py3 tritonserver --model-repository=/models
```

- <num_of_gpus>: int, the number of GPUs needed by tritonserver.

- <model_repository>: str, the path of the model repository you organized.

- Install the client requirements:

```
$ pip install tritonclient[all]
```

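- A minimal Python client sketch (an illustration, not part of this commit). It assumes the tritonserver container started above is reachable on `localhost:8000`, that the `bert_example` model from `model_repo` is loaded, and that the token IDs below are dummy placeholder values:

```
# Sketch only: query the bert_example model over Triton's HTTP endpoint.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
assert client.is_server_ready() and client.is_model_ready("bert_example")

# one sequence of dummy token ids, shape [batch, seq_len], dtype INT32
token_ids = np.array([[101, 7592, 2088, 102]], dtype=np.int32)

infer_input = httpclient.InferInput("token_ids", list(token_ids.shape), "INT32")
infer_input.set_data_from_numpy(token_ids)
requested_output = httpclient.InferRequestedOutput("encoder_output")

response = client.infer(
    model_name="bert_example",
    inputs=[infer_input],
    outputs=[requested_output],
)
encoder_output = response.as_numpy("encoder_output")  # FP32 encoder states
print(encoder_output.shape)
```

The `gpt_example` and `transformer_example` models can be queried the same way, substituting the input/output names from their config.pbtxt files.
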
## Reference

- [triton-inference-server/backend](https://github.com/triton-inference-server/backend)
- [triton-inference-server/server](https://github.com/triton-inference-server/server)
- [triton-inference-server/client](https://github.com/triton-inference-server/client)
- [triton-inference-server/core](https://github.com/triton-inference-server/core)
- [triton-inference-server/common](https://github.com/triton-inference-server/common)

examples/triton_backend/model_repo/bert_example/1/.gitignore

Whitespace-only changes.

examples/triton_backend/model_repo/bert_example/config.pbtxt

@@ -0,0 +1,32 @@
name: "bert_example"
backend: "lightseq"
max_batch_size: 8
input [
  {
    name: "token_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "encoder_output"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
default_model_filename: "lightseq_bert_base_uncased.hdf5"
parameters: [
  {
    key: "model_type"
    value: {
      string_value: "Bert"
    }
  }
]

examples/triton_backend/model_repo/gpt_example/1/.gitignore

Whitespace-only changes.

examples/triton_backend/model_repo/gpt_example/config.pbtxt

@@ -0,0 +1,32 @@
name: "gpt_example"
backend: "lightseq"
max_batch_size: 8
input [
  {
    name: "token_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "result"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
default_model_filename: "lightseq_gpt2_base.hdf5"
parameters: [
  {
    key: "model_type"
    value: {
      string_value: "Gpt"
    }
  }
]

examples/triton_backend/model_repo/transformer_example/1/.gitignore

Whitespace-only changes.

examples/triton_backend/model_repo/transformer_example/config.pbtxt

@@ -0,0 +1,37 @@
name: "transformer_example"
backend: "lightseq"
max_batch_size: 8
input [
  {
    name: "source_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "target_ids"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "target_scores"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
default_model_filename: "lightseq_bart_base.hdf5"
parameters: [
  {
    key: "model_type"
    value: {
      string_value: "Transformer"
    }
  }
]

lightseq/inference/pywrapper/bert.cc

-1

@@ -7,7 +7,6 @@ Bert::Bert(const std::string weight_path, const int max_batch_size)
     : LSModel({"token_ids"}, {"encoder_output"}),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/gpt.cc

-1

@@ -10,7 +10,6 @@ Gpt::Gpt(const std::string weight_path, const int max_batch_size)
       encoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cudaStreamCreate(&cache_stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));

lightseq/inference/pywrapper/gpt.h

+1 -1

@@ -38,7 +38,7 @@ class Gpt : public LSModel {

   const int* get_result_ptr();
   const float* get_score_ptr();
-  const int get_max_step() { return tw_._max_step; }
+  int get_max_step() { return tw_._max_step; }

   void Infer() override;
   void set_input_ptr(int index, void* input_ptr) override;

lightseq/inference/pywrapper/moe.cc

-1

@@ -12,7 +12,6 @@ Moe::Moe(const std::string weight_path, const int max_batch_size)
       decoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/quant_transformer.cc

-1

@@ -13,7 +13,6 @@ QuantTransformer::QuantTransformer(const std::string weight_path,
       decoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/quant_transformer.h

+2 -2

@@ -37,8 +37,8 @@ class QuantTransformer : public LSModel {

   const int *get_result_ptr();
   const float *get_score_ptr();
-  const int get_max_step() { return tw_._max_step; }
-  const int get_beam_size() { return tw_._beam_size; }
+  int get_max_step() { return tw_._max_step; }
+  int get_beam_size() { return tw_._beam_size; }

  public:
   QuantTransformer(const std::string weight_path, const int max_batch_size);

lightseq/inference/pywrapper/transformer.cc

-1

@@ -13,7 +13,6 @@ Transformer::Transformer(const std::string weight_path,
       decoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/transformer.h

+2 -2

@@ -38,8 +38,8 @@ class Transformer : public LSModel {

   const int *get_result_ptr();
   const float *get_score_ptr();
-  const int get_max_step() { return tw_._max_step; }
-  const int get_beam_size() { return tw_._beam_size; }
+  int get_max_step() { return tw_._max_step; }
+  int get_beam_size() { return tw_._beam_size; }

  public:
   Transformer(const std::string weight_path, const int max_batch_size);
