This repository was archived by the owner on Jul 21, 2025. It is now read-only.

Commit cc729e1

Authored by godweiyang, Taka152, and duanrenchong
LightSeq QAT (#307)
* ls embedding support qat
* [WIP] ls transformer qat
* fix fairseq transformer cli shape bug of output projection
* ln_bw_i8 test passed!
* test with_mean of ln_i8
* ls encoder attn add qat
* dropout_relu_bias_i8 passed!
* dropout_gelu_bias unit test passed!
* dropout_relu_bias_bwd_i8 passed!
* dropout_gelu_bias_bwd_i8 unit test passed!
* format
* dropout_gelu_bias_bwd_i8 unit test passed!
* format
* polish unit test
* [WIP] ls encoder qat test
* quant_bias_add_transform_20314, quant_transform4d_0213 unit test passed!
* fix unit test bug
* [WIP] ls encoder qat unit test
* fix bug
* set default module to disable quant, fix bugs in examples
* fix encoder bug
* encoder qat test pass
* decoder qat forward test pass
* fix bug in encoder bw
* fix bug of cmax grad
* fix bug of act mask
* fix bug in tensor quantizer
* fix cmax grad bug
* [WIP] decoder support qat
* ls decoder qat pass
* ls encoder qat pass
* add unit test for quant bert encoder
* fix memory bug
* fix cmax grad bug in huggingface
* quant bert enc fw&bw test passed!
* fix hf cmax export bug
* fix fairseq out_proj bug
* fix fairseq shell bug
* fix decoder mem bug
* modify initial lr of fairseq quant training
* decoupled qat code
* modify huggingface training scripts
* add cmax grad
* delete enc_kv output quant
* modify ffn2gemm quant like inference
* fuse dequantize
* fix post ln mem bug
* add decoder self attn qkv cache quant
* export quant model (stage 1)
* export quant model (stage 2)
* export quant model (stage 3)
* support vit quant train
* add gradient clip
* fix hf export bug
* fix quant gpt bug
* support quant gpt training
* modify huggingface training scripts
* support ls bert, gpt export
* support custom quant transformer export
* optimize ffn fake quant and dcmax
* support quant gpt export
* support quant vit export
* add quant linear layer
* fix quant linear layer bug
* support quant vit infer
* speedup cublas igemm on A100 (by huxingwu)
* optimize ls_quant_dropout_act_bias_bwd_kernel
* polish training gemm algo code
* support gemm best algo search on different GPUs and shapes
* search in the range (min_bsz, 512, 1) and (512, max_bsz, 32)
* add configs_sm75/h512_i2048_b1-10016.json
* support col32 igemm
* add configs_sm75/h768_i3072_b1-10016.json
* add configs_sm80/h512_i2048_b1-10016.json
* add configs_sm75/h1024_i4096_b1-10016.json
* add configs_sm80/h768_i3072_b1-10016.json
* fix syntax error
* add configs_sm80/h1024_i4096_b1-10016.json
* modify gemm test config format
* merge all the configs into one
* support search of all shapes which are not in the config
* polish the merged config
* add cublas_algo_map cpp code
* move get_sm func to lightseq kernels
* move gemm_test to lightseq ops
* modify default config dir, fix algo_map bug
* fix col32 bug
* col major igemm becomes default
* fix dcmax kernel bug
* loosen cuda 11.6 requirement
* add vit cpp example
* fix bug from col32 gemm and a100 tuned col gemm
* support training encoder qkv_linear auto-tune gemm (in comment)
* add required header file
* dynamically use col32 or col4 on different GPUs
* fix multidefinition bug
* fix weight transform col32 bug
* add best algo for inference gemm (in comments)
* support easy benchmark for gpt and transformer
* support benchmark for huggingface
* fix embedding clip_max bug
* ls quant linear support more shapes
* fix quant linear bug
* fix quant linear bug
* update pad function for older torch
* fix quant linear bug
* remove redundant code
* fix export bug
* fix format
* fix custom train&infer bug
* fix quant infer size overflow
* fix ls gpt export bug (extra_decode_length)
* fix hf bart cmax init and state
* fix max-batch-tokens bug of bart predict

Co-authored-by: Ying Xiong <xiongying.taka@bytedance.com>
Co-authored-by: duanrenchong <duanrenchong@bytedance.com>
1 parent ae569c2 commit cc729e1
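
Much of the work listed above revolves around fake quantization with a learnable clip value ("cmax"). As a rough orientation only, here is a minimal PyTorch-style sketch of that idea; the module name, the straight-through trick, and the initial clip value are illustrative assumptions, not LightSeq's actual fused CUDA implementation:

```python
import torch


class FakeQuant(torch.nn.Module):
    """Symmetric int8 fake quantization with a learnable clip max ("cmax").

    Illustrative only: LightSeq fuses this into CUDA kernels such as the
    quantized LayerNorm and dropout-bias-activation ops listed above.
    """

    def __init__(self, init_cmax: float = 16.0):
        super().__init__()
        self.cmax = torch.nn.Parameter(torch.tensor(init_cmax))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        scale = self.cmax / 127.0
        # Clip to [-cmax, cmax]; gradients w.r.t. cmax come from the clipped elements.
        clipped = torch.clamp(x, min=-self.cmax, max=self.cmax)
        # Round to the int8 grid, then dequantize back to float.
        quantized = torch.round(clipped / scale) * scale
        # Straight-through estimator: quantized values forward, clipped gradient backward.
        return clipped + (quantized - clipped).detach()


if __name__ == "__main__":
    fq = FakeQuant()
    y = fq(torch.randn(4, 8, requires_grad=True))
    y.sum().backward()  # both the input and cmax receive gradients
```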

File tree: 149 files changed, +33917 / -922 lines


CMakeLists.txt

Lines changed: 2 additions & 9 deletions

@@ -1,15 +1,8 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 project(LightSeq LANGUAGES C CXX CUDA)
 
-set(CMAKE_CUDA_ARCHITECTURES
-    60
-    61
-    70
-    75
-    80
-    86
-    87)
-find_package(CUDA 11.6 REQUIRED)
+set(CMAKE_CUDA_ARCHITECTURES 60 61 70 75 80 86)
+find_package(CUDA 11 REQUIRED)
 
 option(FP16_MODE "inference with fp16" OFF)
 option(DEBUG_MODE "debug computation result" OFF)
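
The architecture list above (SM 60 through 86) lines up with the per-GPU GEMM configs this commit adds (configs_sm75/..., configs_sm80/...). Below is a hedged sketch of how one might map the running GPU to such a config directory, using PyTorch only to query the compute capability; the directory naming follows the commit message, but the selection logic is a guess, not LightSeq's actual loader:

```python
import torch


def gemm_config_dir(base: str = "configs") -> str:
    """Guess the per-SM GEMM config directory (e.g. configs_sm75, configs_sm80).

    The names mirror the JSON files added in this commit, such as
    configs_sm75/h512_i2048_b1-10016.json; the fallback policy below is
    purely illustrative.
    """
    major, minor = torch.cuda.get_device_capability()
    sm = major * 10 + minor  # 75 on T4, 80 on A100, ...
    for known in (80, 75):  # only these two architectures ship tuned configs here
        if sm >= known:
            return f"{base}_sm{known}"
    return base  # other GPUs: fall back to the untuned / searched configs


if __name__ == "__main__":
    if torch.cuda.is_available():
        print(gemm_config_dir())
    else:
        print("no CUDA device visible")
```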

docs/guide.md

Lines changed: 2 additions & 2 deletions

@@ -123,8 +123,8 @@ The main code is as follows (some parameters are omitted). Complete code is avai
 ```python
 model = Transformer()
 encoder_state_dict, decoder_state_dict = _extract_weight(state_dict)
-export_ls_embedding(model, encoder_state_dict, is_encoder=True)
-export_ls_embedding(model, encoder_state_dict, is_encoder=False)
+export_ls_embedding(model, encoder_state_dict, max_length, emb_dim, is_encoder=True)
+export_ls_embedding(model, encoder_state_dict, max_length, emb_dim, is_encoder=False)
 export_ls_encoder(model, encoder_state_dict)
 export_ls_decoder(model, decoder_state_dict)
 export_fs_weights(model, state_dict)
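
For readers updating their own export scripts, a short sketch of the new call pattern, mirroring the guide lines above. The `max_length` and `emb_dim` values are placeholders, and the exact import path of `export_ls_embedding` depends on the installed LightSeq version, so it is only noted in a comment:

```python
# Assumes export_ls_embedding has been imported from LightSeq's export utilities
# (module path varies by LightSeq version), and that `model` and the state dicts
# are prepared as in the guide above.
max_length = 256  # placeholder: maximum sequence length of the embedding table
emb_dim = 512     # placeholder: hidden size of the exported model

# The embedding shape is now passed explicitly instead of being inferred.
export_ls_embedding(model, encoder_state_dict, max_length, emb_dim, is_encoder=True)
export_ls_embedding(model, encoder_state_dict, max_length, emb_dim, is_encoder=False)
```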

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+#!/bin/bash
+
+SCRIPT=$(realpath "$0")
+CUR_DIR=$(dirname "$SCRIPT")
+
+model_full_name=facebook/bart-base
+model_name=$(echo $model_full_name | cut -d "/" -f 2)
+all_log=$CUR_DIR/${model_name}_bench.log
+res_log=$CUR_DIR/${model_name}_bench.txt
+if [ -f $res_log ]; then
+    rm $res_log
+fi
+if [ -f $all_log ]; then
+    rm $all_log
+fi
+echo "batch_size input_seq_len output_seq_len beam_size latency" >>$res_log
+
+for batch_size in 1 8 32; do
+    for beam_size in 1 4 32; do
+        for input_seq_len in 8 16 32 64; do
+            output_seq_len=$input_seq_len
+            cd $CUR_DIR/python
+
+            python3 generate_model.py --model_name $model_full_name --sampling_method beam_search \
+                --beam_size $beam_size --input_seq_len $input_seq_len --output_seq_len=$output_seq_len
+            model_path=$(realpath lightseq_${model_name}_bench.hdf5)
+
+            cd $CUR_DIR/../../build
+            ./examples/inference/cpp/transformer_example \
+                $model_path $batch_size $input_seq_len |& tee temp.log
+
+            cat temp.log >>$all_log
+            latency=$(tail -n 5 temp.log | head -n 1 | awk '{print $4}')
+            echo "$batch_size $input_seq_len $output_seq_len $beam_size $latency" >>$res_log
+            rm temp.log
+        done
+    done
+done
+pip3 install tabulate
+tabulate --header $res_log
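
The last two lines of the script render the results file with the `tabulate` command-line tool. If you prefer to post-process the benchmark table in Python, a small equivalent is sketched below; the file name assumes the `${model_name}_bench.txt` pattern from the script (here `bart-base_bench.txt`):

```python
from tabulate import tabulate  # installed by the script via `pip3 install tabulate`

# One header row ("batch_size input_seq_len output_seq_len beam_size latency"),
# then one whitespace-separated row per benchmarked configuration.
with open("bart-base_bench.txt") as f:
    header = f.readline().split()
    rows = [line.split() for line in f if line.strip()]

print(tabulate(rows, headers=header, tablefmt="github"))
```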

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+#!/bin/bash
+
+SCRIPT=$(realpath "$0")
+CUR_DIR=$(dirname "$SCRIPT")
+
+model_full_name=gpt2
+model_name=$model_full_name
+all_log=$CUR_DIR/${model_name}_bench.log
+res_log=$CUR_DIR/${model_name}_bench.txt
+if [ -f $res_log ]; then
+    rm $res_log
+fi
+if [ -f $all_log ]; then
+    rm $all_log
+fi
+echo "batch_size input_seq_len output_seq_len topk latency" >>$res_log
+
+for batch_size in 1 8 32; do
+    for topk in 1 4 32; do
+        for input_seq_len in 118 86 22; do
+            output_seq_len=$((150 - $input_seq_len))
+            cd $CUR_DIR/python
+
+            python3 generate_model.py --model_name $model_full_name --sampling_method topk \
+                --topk $topk --input_seq_len $input_seq_len --output_seq_len=$output_seq_len
+            model_path=$(realpath lightseq_${model_name}_bench.hdf5)
+
+            cd $CUR_DIR/../../build
+            ./examples/inference/cpp/gpt_example \
+                $model_path $batch_size $input_seq_len |& tee temp.log
+
+            cat temp.log >>$all_log
+            latency=$(tail -n 3 temp.log | head -n 1 | awk '{print $4}')
+            echo "$batch_size $input_seq_len $output_seq_len $topk $latency" >>$res_log
+            rm temp.log
+        done
+    done
+done
+pip3 install tabulate
+tabulate --header $res_log

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+SCRIPT=$(realpath "$0")
+CUR_DIR=$(dirname "$SCRIPT")
+
+model_full_name=facebook/bart-base
+model_name=$(echo $model_full_name | cut -d "/" -f 2)
+all_log=$CUR_DIR/quant_${model_name}_bench.log
+res_log=$CUR_DIR/quant_${model_name}_bench.txt
+if [ -f $all_log ]; then
+    rm $all_log
+fi
+if [ -f $res_log ]; then
+    rm $res_log
+fi
+echo "batch_size input_seq_len output_seq_len beam_size latency" >>$res_log
+
+for batch_size in 1 8 32; do
+    for beam_size in 1 4 32; do
+        for input_seq_len in 16 32 64; do
+            output_seq_len=$input_seq_len
+            cd $CUR_DIR/python
+
+            python3 generate_model.py --model_name $model_full_name --sampling_method beam_search \
+                --beam_size $beam_size --input_seq_len $input_seq_len --output_seq_len=$output_seq_len
+            model_path=$(realpath lightseq_${model_name}_bench.hdf5)
+
+            cd $CUR_DIR/../../build
+            ./examples/inference/cpp/quant_transformer_example \
+                $model_path $batch_size $input_seq_len |& tee temp.log
+
+            cat temp.log >> $all_log
+            latency=$(tail -n 5 temp.log | head -n 1 | awk '{print $4}')
+            echo "$batch_size $input_seq_len $output_seq_len $beam_size $latency" >>$res_log
+            rm temp.log
+        done
+    done
+done
+
+pip3 install tabulate
+tabulate --header $res_log

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+#!/bin/bash
+
+SCRIPT=$(realpath "$0")
+CUR_DIR=$(dirname "$SCRIPT")
+
+model_full_name=/tmp/quant/test-clm/pytorch_model.bin
+model_name=quant_gpt2
+all_log=$CUR_DIR/${model_name}_bench.log
+res_log=$CUR_DIR/${model_name}_bench.txt
+if [ -f $res_log ]; then
+    rm $res_log
+fi
+if [ -f $all_log ]; then
+    rm $all_log
+fi
+echo "batch_size input_seq_len output_seq_len topk latency" >>$res_log
+
+for batch_size in 1 8 32; do
+    for topk in 1 4 32; do
+        for input_seq_len in 118 86 22; do
+            output_seq_len=$((150 - $input_seq_len))
+            cd $CUR_DIR/python
+
+            python3 generate_model.py --model_name $model_full_name --sampling_method topk \
+                --topk $topk --input_seq_len $input_seq_len --output_seq_len=$output_seq_len --enable_quant true
+            model_path=$(realpath lightseq_${model_name}_bench.hdf5)
+
+            cd $CUR_DIR/../../build
+            ./examples/inference/cpp/quant_gpt_example \
+                $model_path $batch_size $input_seq_len |& tee temp.log
+
+            cat temp.log >>$all_log
+            latency=$(tail -n 3 temp.log | head -n 1 | awk '{print $4}')
+            echo "$batch_size $input_seq_len $output_seq_len $topk $latency" >>$res_log
+            rm temp.log
+        done
+    done
+done
+pip3 install tabulate
+tabulate --header $res_log

examples/inference/cpp/CMakeLists.txt

Lines changed: 6 additions & 0 deletions

@@ -20,3 +20,9 @@ target_link_libraries(quant_gpt_example PUBLIC liblightseq)
 
 add_executable(transformer_decoder_example decoder_example.cc.cu)
 target_link_libraries(transformer_decoder_example PUBLIC transformer_model)
+
+add_executable(vit_example vit_example.cc)
+target_link_libraries(vit_example PUBLIC liblightseq)
+
+add_executable(quant_vit_example quant_vit_example.cc)
+target_link_libraries(quant_vit_example PUBLIC liblightseq)

examples/inference/cpp/gpt_example.cc

Lines changed: 15 additions & 6 deletions

@@ -10,17 +10,16 @@ int main(int argc, char* argv[]) {
   std::string model_weights_path = argv[1];
   std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
   int eg_seq_len = example_input.size();
-  int max_batch_size = 128;
+
   int batch_size = 1;
   int batch_seq_len = eg_seq_len;
 
   if (argc == 4) {
     batch_size = atoi(argv[2]);
     batch_seq_len = atoi(argv[3]);
   }
-  if (batch_size > max_batch_size) {
-    throw std::runtime_error("batch_size exceeds the maximum (128)!");
-  }
+
+  int max_batch_size = std::max(8, batch_size);
 
   std::vector<int> host_input;
   for (int i = 0; i < batch_size; ++i) {
@@ -39,6 +38,7 @@ int main(int argc, char* argv[]) {
       d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
       cudaMemcpyHostToDevice));
 
+  model->benchmark_mode(true);
   model->set_input_ptr(0, d_input);
   model->set_input_shape(0, {batch_size, batch_seq_len});
 
@@ -56,13 +56,22 @@ int main(int argc, char* argv[]) {
   lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
   std::cout << "infer preprocessing finished" << std::endl;
 
+  std::chrono::duration<double> elapsed;
+  int iter = 0;
   /* ---step5. infer and log--- */
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 20; i++) {
     auto start = std::chrono::high_resolution_clock::now();
     model->Infer();
-    lightseq::cuda::print_time_duration(start, "one infer time", 0);
+    auto finish = std::chrono::high_resolution_clock::now();
+    if (i >= 5) {
+      iter++;
+      elapsed += finish - start;
+    }
   }
 
+  std::cout << "lightseq inference latency: " << elapsed.count() * 1000 / iter
+            << " ms" << std::endl;
+
   for (int i = 0; i < model->get_output_size(); i++) {
     const int* d_output;
     d_output = static_cast<const int*>(model->get_output_ptr(i));
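
The timing change above is the same across the example binaries touched by this commit (the two quantized examples below get an identical loop): run 20 iterations, discard the first 5 as warm-up, and report the mean latency of the rest. Here is a small Python sketch of that pattern, handy when measuring through Python bindings rather than the C++ examples; `infer` is a placeholder for whatever callable you are timing:

```python
import time


def benchmark(infer, iters: int = 20, warmup: int = 5) -> float:
    """Mean latency in milliseconds: run `iters` times, skip the first
    `warmup` runs, and average the remainder (mirrors the C++ loop above)."""
    elapsed = 0.0
    timed = 0
    for i in range(iters):
        start = time.perf_counter()
        infer()
        if i >= warmup:
            elapsed += time.perf_counter() - start
            timed += 1
    return elapsed * 1000.0 / timed


if __name__ == "__main__":
    # Placeholder workload standing in for model->Infer().
    print(f"latency: {benchmark(lambda: sum(range(100_000))):.3f} ms")
```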

examples/inference/cpp/quant_gpt_example.cc

Lines changed: 15 additions & 3 deletions

@@ -10,14 +10,16 @@ int main(int argc, char* argv[]) {
   std::string model_weights_path = argv[1];
   std::vector<int> example_input = {40, 1842, 345, 11, 475, 345, 910, 326};
   int eg_seq_len = example_input.size();
-  int max_batch_size = 128;
   int batch_size = 1;
   int batch_seq_len = eg_seq_len;
 
   if (argc == 4) {
     batch_size = atoi(argv[2]);
     batch_seq_len = atoi(argv[3]);
   }
+
+  int max_batch_size = std::max(4, batch_size);
+
   if (batch_size > max_batch_size) {
     throw std::runtime_error("batch_size exceeds the maximum (128)!");
   }
@@ -39,6 +41,7 @@ int main(int argc, char* argv[]) {
       d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
       cudaMemcpyHostToDevice));
 
+  model->benchmark_mode(true);
   model->set_input_ptr(0, d_input);
   model->set_input_shape(0, {batch_size, batch_seq_len});
 
@@ -56,13 +59,22 @@ int main(int argc, char* argv[]) {
   lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
   std::cout << "infer preprocessing finished" << std::endl;
 
+  std::chrono::duration<double> elapsed;
+  int iter = 0;
   /* ---step5. infer and log--- */
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 20; i++) {
     auto start = std::chrono::high_resolution_clock::now();
     model->Infer();
-    lightseq::cuda::print_time_duration(start, "one infer time", 0);
+    auto finish = std::chrono::high_resolution_clock::now();
+    if (i >= 5) {
+      iter++;
+      elapsed += finish - start;
+    }
  }
 
+  std::cout << "lightseq inference latency: " << elapsed.count() * 1000 / iter
+            << " ms" << std::endl;
+
   for (int i = 0; i < model->get_output_size(); i++) {
     const int* d_output;
     d_output = static_cast<const int*>(model->get_output_ptr(i));

examples/inference/cpp/quant_transformer_example.cc

Lines changed: 14 additions & 7 deletions

@@ -12,17 +12,14 @@ int main(int argc, char* argv[]) {
   std::vector<int> example_input = {63, 47, 65, 1507, 88, 74,
                                     10, 2057, 362, 9, 284, 6};
   int eg_seq_len = example_input.size();
-  int max_batch_size = 128;
   int batch_size = 1;
   int batch_seq_len = eg_seq_len;
 
   if (argc == 4) {
     batch_size = atoi(argv[2]);
     batch_seq_len = atoi(argv[3]);
   }
-  if (batch_size > max_batch_size) {
-    throw std::runtime_error("batch_size exceeds the maximum (128)!");
-  }
+  int max_batch_size = std::max(4, batch_size);
 
   std::vector<int> host_input;
   for (int i = 0; i < batch_size; ++i) {
@@ -41,6 +38,7 @@ int main(int argc, char* argv[]) {
      d_input, host_input.data(), sizeof(int) * batch_size * batch_seq_len,
      cudaMemcpyHostToDevice));
 
+  model->benchmark_mode(true);
   model->set_input_ptr(0, d_input);
   model->set_input_shape(0, {batch_size, batch_seq_len});
 
@@ -58,13 +56,22 @@ int main(int argc, char* argv[]) {
   lightseq::cuda::CHECK_GPU_ERROR(cudaStreamSynchronize(0));
   std::cout << "infer preprocessing finished" << std::endl;
 
+  std::chrono::duration<double> elapsed;
+  int iter = 0;
   /* ---step5. infer and log--- */
   for (int i = 0; i < 20; i++) {
     auto start = std::chrono::high_resolution_clock::now();
     model->Infer();
-    lightseq::cuda::print_time_duration(start, "one infer time", 0);
+    auto finish = std::chrono::high_resolution_clock::now();
+    if (i >= 5) {
+      iter++;
+      elapsed += finish - start;
+    }
   }
 
+  std::cout << "lightseq inference latency: " << elapsed.count() * 1000 / iter
+            << " ms" << std::endl;
+
   for (int i = 0; i < model->get_output_size(); i++) {
     const void* d_output;
     d_output = static_cast<const float*>(model->get_output_ptr(i));
@@ -76,9 +83,9 @@ int main(int argc, char* argv[]) {
     std::cout << std::endl;
 
     if (!i)
-      lightseq::cuda::print_vec((int*)d_output, "output", 15);
+      lightseq::cuda::print_vec((int*)d_output, "output", batch_size);
     else
-      lightseq::cuda::print_vec((float*)d_output, "output", 5);
+      lightseq::cuda::print_vec((float*)d_output, "output", batch_size);
   }
 
   // const int* res = model.get_result_ptr();
