Commit 2c89e24

Triton backend rebase (#301)
* tritonbackend for lightseq: remove useless type qualifiers; tritonbackend README; format; update psf/black; fix code format; update tritonbackend README; fix README format; adapt README; add empty directories which are needed by triton
* format

Co-authored-by: zhoubofan <[email protected]>
1 parent 9a61730 commit 2c89e24

32 files changed (+1305 -17 lines)

.pre-commit-config.yaml

+1 -1

@@ -12,7 +12,7 @@ repos:
         args: [-style=file]

   - repo: https://github.com/psf/black
-    rev: 21.5b2
+    rev: 22.3.0
     hooks:
       - id: black

CMakeLists.txt

+3

@@ -48,6 +48,8 @@ include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/3rdparty/cub)

 link_directories(${COMMON_LIB_DIRS})

+add_compile_options(-Wno-unknown-pragmas)
+
 if(FP16_MODE)
   add_definitions(-DFP16_MODE)
   message(STATUS "Build using fp16 precision")
@@ -67,5 +69,6 @@ add_subdirectory(lightseq/inference/proto)
 add_subdirectory(lightseq/inference/model)
 add_subdirectory(lightseq/inference/pywrapper)
 add_subdirectory(lightseq/inference/server)
+add_subdirectory(lightseq/inference/triton_backend)

 # add_subdirectory(examples/inference/cpp)

docker/Dockerfile.pypi

+6 -1

@@ -35,7 +35,6 @@ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
 ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs

-
 # install protobuf
 RUN curl -O -L -C - \
     https://github.com/protocolbuffers/protobuf/releases/download/v3.13.0/protobuf-cpp-3.13.0.tar.gz && \
@@ -62,5 +61,11 @@ RUN curl -O -L -C - \
     ./configure --prefix=/usr/local/hdf5 "CFLAGS=-fPIC" "CXXFLAGS=-fPIC" --enable-build-mode=production && \
     make -j$(nproc) && make install && cd .. && rm -rf hdf5-hdf5-1_12_0

+# install rapidjson
+RUN curl -L -o rapidjson.tar.gz -C - https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.tar.gz && \
+    tar xf rapidjson.tar.gz && rm rapidjson.tar.gz && \
+    mkdir rapidjson-1.1.0/build && cd rapidjson-1.1.0/build && \
+    cmake .. && make install && cd ../.. && rm -rf rapidjson-1.1.0
+
 ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs:${LIBRARY_PATH}
 ENV PATH /usr/local/cuda/bin:/usr/local/hdf5:${PATH}


examples/inference/python/__init__.py

Whitespace-only changes.

examples/inference/python/export/__init__.py

Whitespace-only changes.

examples/inference/python/export/fairseq/__init__.py

Whitespace-only changes.

examples/inference/python/export/huggingface/__init__.py

Whitespace-only changes.

examples/triton_backend/README.md

+91

@@ -0,0 +1,91 @@
# Tritonbackend Usage

## How To Use Tritonbackend

### How To Compile Tritonbackend

- Execute the commands below:

```
$ cd <lightseq_repository>
$ mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release -DFP16_MODE=ON -DDEBUG_MODE=OFF -DDYNAMIC_API=ON .. && \
    make -j$(nproc)
```

This produces `libliblightseq.so` and `libtriton_lightseq.so`, which are needed by the model repository.

You can find `libliblightseq.so` at

`<lightseq_repository>/build/lightseq/inference/pywrapper/libliblightseq.so`

while `libtriton_lightseq.so` is at

`<lightseq_repository>/build/lightseq/inference/triton_backend/libtriton_lightseq.so`

### How To Organize Model Repository

```
├── <path_to_model_repository>/
│   ├── libliblightseq.so          # lightseq's dynamic link library, which contains almost all of
│   │                                the lightseq implementation and must be on LD_LIBRARY_PATH
│   ├── <model_name_1>/            # the directory of a model, containing parameters and configuration.
│   │   ├── config.pbtxt           # the model config; more details below.
│   │   ├── <model_file>           # the model parameter file.
│   │   ├── 1/                     # this empty directory is required by tritonserver.
│   │   ├── libtriton_lightseq.so  # the dynamic link library of lightseq's tritonbackend
│   ├── <model_name_2>/            # ...
│   │   ├── config.pbtxt           # ...
│   │   ├── <model_file>           # ...
│   │   ├── 1/                     # ...
│   │   ├── libtriton_lightseq.so  # ...
│   ├── <model_name_vid>/          # more models, etc.
```

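- To make this layout concrete, here is a minimal Python sketch (an illustration, not part of this commit) that assembles a repository for the `bert_example` model. The source paths follow the build outputs above; `<lightseq_repository>`, `<path_to_model_repository>`, the local `config.pbtxt`, and the exported `lightseq_bert_base_uncased.hdf5` are placeholders to adjust for your setup.

```
# Sketch only: lay out one model following the tree above.
# <lightseq_repository> and <path_to_model_repository> are placeholders.
import shutil
from pathlib import Path

lightseq = Path("<lightseq_repository>")
repo = Path("<path_to_model_repository>")
model_dir = repo / "bert_example"

# the empty version directory "1/" is required by tritonserver
(model_dir / "1").mkdir(parents=True, exist_ok=True)

# shared lightseq library, found through LD_LIBRARY_PATH=/models at runtime
shutil.copy(lightseq / "build/lightseq/inference/pywrapper/libliblightseq.so", repo)

# per-model backend library, config and exported weights
shutil.copy(lightseq / "build/lightseq/inference/triton_backend/libtriton_lightseq.so", model_dir)
shutil.copy("config.pbtxt", model_dir)                     # see model_repo/bert_example/config.pbtxt
shutil.copy("lightseq_bert_base_uncased.hdf5", model_dir)  # exported model file
```
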
- The meaning of the parameters in config.pbtxt (more information can be found in [Model config of tritonbackend](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto)):

> ${name}: the model name, **which should be the same as <model_name_vid>**
>
> ${backend}: **fixed value "lightseq"**, which is used to locate the tritonbackend's dynamic link library, libtriton_lightseq.so
>
> ${default_model_filename}: the model file name, **which should be the same as <model_file>**
>
> ${parameters - value - string_value}: the model type, which must be supported by lightseq. You can choose `Transformer`|`QuantTransformer`|`Bert`|`Gpt`|`Moe`

- You can see an example in [Example Of Triton Model Config](https://github.com/bytedance/lightseq/tree/master/examples/triton_backend/model_repo), and find more detailed information in [Model Config Of Tritonserver](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md).

- The model files needed by [Example Of Triton Model Config](https://github.com/bytedance/lightseq/tree/master/examples/triton_backend/model_repo) can be exported following [The Guide of Model Export](https://github.com/bytedance/lightseq/blob/master/examples/inference/python/README.md).

### How To Run Tritonserver

#### Run Tritonserver By Docker

- Get the tritonserver Docker image: [Tritonserver Quickstart](https://github.com/triton-inference-server/server/blob/main/docs/quickstart.md#install-triton-docker-image)

```
$ sudo docker pull nvcr.io/nvidia/tritonserver:22.01-py3
```

- Docker command:

```
$ sudo docker run --gpus=<num_of_gpus> --rm -e LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/models" -p8000:8000 -p8001:8001 -p8002:8002 -v<model_repository>:/models nvcr.io/nvidia/tritonserver:22.01-py3 tritonserver --model-repository=/models
```

- <num_of_gpus>: int, the number of GPUs needed by tritonserver.

- <model_repository>: str, the path of the model repository you organized.

- Install the client requirements:

```
$ pip install tritonclient[all]
```

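- A minimal Python client sketch (an illustration, not part of this commit). It assumes the tritonserver container started above is reachable on `localhost:8000`, that the `bert_example` model from `model_repo` is loaded, and that the token IDs below are dummy placeholder values:

```
# Sketch only: query the bert_example model over Triton's HTTP endpoint.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
assert client.is_server_ready() and client.is_model_ready("bert_example")

# one sequence of dummy token ids, shape [batch, seq_len], dtype INT32
token_ids = np.array([[101, 7592, 2088, 102]], dtype=np.int32)

infer_input = httpclient.InferInput("token_ids", list(token_ids.shape), "INT32")
infer_input.set_data_from_numpy(token_ids)
requested_output = httpclient.InferRequestedOutput("encoder_output")

response = client.infer(
    model_name="bert_example",
    inputs=[infer_input],
    outputs=[requested_output],
)
encoder_output = response.as_numpy("encoder_output")  # FP32 encoder states
print(encoder_output.shape)
```

The `gpt_example` and `transformer_example` models can be queried the same way, substituting the input/output names from their config.pbtxt files.
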
## Reference

- [triton-inference-server/backend](https://github.com/triton-inference-server/backend)
- [triton-inference-server/server](https://github.com/triton-inference-server/server)
- [triton-inference-server/client](https://github.com/triton-inference-server/client)
- [triton-inference-server/core](https://github.com/triton-inference-server/core)
- [triton-inference-server/common](https://github.com/triton-inference-server/common)

examples/triton_backend/model_repo/bert_example/1/.gitignore

Whitespace-only changes.

examples/triton_backend/model_repo/bert_example/config.pbtxt

@@ -0,0 +1,32 @@
name: "bert_example"
backend: "lightseq"
max_batch_size: 8
input [
  {
    name: "token_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "encoder_output"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
default_model_filename: "lightseq_bert_base_uncased.hdf5"
parameters: [
  {
    key: "model_type"
    value: {
      string_value: "Bert"
    }
  }
]

examples/triton_backend/model_repo/gpt_example/1/.gitignore

Whitespace-only changes.

examples/triton_backend/model_repo/gpt_example/config.pbtxt

@@ -0,0 +1,32 @@
name: "gpt_example"
backend: "lightseq"
max_batch_size: 8
input [
  {
    name: "token_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "result"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
default_model_filename: "lightseq_gpt2_base.hdf5"
parameters: [
  {
    key: "model_type"
    value: {
      string_value: "Gpt"
    }
  }
]

examples/triton_backend/model_repo/transformer_example/1/.gitignore

Whitespace-only changes.

examples/triton_backend/model_repo/transformer_example/config.pbtxt

@@ -0,0 +1,37 @@
name: "transformer_example"
backend: "lightseq"
max_batch_size: 8
input [
  {
    name: "source_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
output [
  {
    name: "target_ids"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "target_scores"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]
default_model_filename: "lightseq_bart_base.hdf5"
parameters: [
  {
    key: "model_type"
    value: {
      string_value: "Transformer"
    }
  }
]

lightseq/inference/pywrapper/bert.cc

-1

@@ -7,7 +7,6 @@ Bert::Bert(const std::string weight_path, const int max_batch_size)
     : LSModel({"token_ids"}, {"encoder_output"}),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/gpt.cc

-1

@@ -10,7 +10,6 @@ Gpt::Gpt(const std::string weight_path, const int max_batch_size)
       encoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cudaStreamCreate(&cache_stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));

lightseq/inference/pywrapper/gpt.h

+1 -1

@@ -38,7 +38,7 @@ class Gpt : public LSModel {

   const int* get_result_ptr();
   const float* get_score_ptr();
-  const int get_max_step() { return tw_._max_step; }
+  int get_max_step() { return tw_._max_step; }

   void Infer() override;
   void set_input_ptr(int index, void* input_ptr) override;

lightseq/inference/pywrapper/moe.cc

-1

@@ -12,7 +12,6 @@ Moe::Moe(const std::string weight_path, const int max_batch_size)
       decoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/quant_transformer.cc

-1

@@ -13,7 +13,6 @@ QuantTransformer::QuantTransformer(const std::string weight_path,
       decoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/quant_transformer.h

+2 -2

@@ -37,8 +37,8 @@ class QuantTransformer : public LSModel {

   const int *get_result_ptr();
   const float *get_score_ptr();
-  const int get_max_step() { return tw_._max_step; }
-  const int get_beam_size() { return tw_._beam_size; }
+  int get_max_step() { return tw_._max_step; }
+  int get_beam_size() { return tw_._beam_size; }

  public:
   QuantTransformer(const std::string weight_path, const int max_batch_size);

lightseq/inference/pywrapper/transformer.cc

-1

@@ -13,7 +13,6 @@ Transformer::Transformer(const std::string weight_path,
       decoder_(nullptr),
       _max_batch_size(max_batch_size) {
   /* ---step1. init environment--- */
-  CHECK_GPU_ERROR(cudaSetDevice(0));
   CHECK_GPU_ERROR(cudaStreamCreate(&stream_));
   CHECK_GPU_ERROR(cublasCreate(&hd_));
   CHECK_GPU_ERROR(cublasSetStream(hd_, stream_));

lightseq/inference/pywrapper/transformer.h

+2 -2

@@ -38,8 +38,8 @@ class Transformer : public LSModel {

   const int *get_result_ptr();
   const float *get_score_ptr();
-  const int get_max_step() { return tw_._max_step; }
-  const int get_beam_size() { return tw_._beam_size; }
+  int get_max_step() { return tw_._max_step; }
+  int get_beam_size() { return tw_._beam_size; }

  public:
   Transformer(const std::string weight_path, const int max_batch_size);
