Skip to content

Commit c018151

Browse files
committed
Added perf script
1 parent 6caf833 commit c018151

File tree

3 files changed

+123
-0
lines changed

3 files changed

+123
-0
lines changed

Diff for: tools/perf/Flux/benchmark.sh

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Log in to Hugging Face, then run flux_perf.py and write its stdout to
# benchmark_output.txt.
#
# Export your Hugging Face access token as HF_TOKEN before running, e.g.:
#   HF_TOKEN=<your token> bash benchmark.sh

# Stop with a clear message when no token is provided, instead of attempting
# a login with a placeholder string and then running the benchmark anyway.
: "${HF_TOKEN:?Set HF_TOKEN to your Hugging Face access token}"
huggingface-cli login --token "$HF_TOKEN"

python flux_perf.py > benchmark_output.txt

Diff for: tools/perf/Flux/create_env.sh

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Prepare the environment for the Flux perf scripts: install Bazel, install the
# checkout at /home/TensorRT in editable mode, then install the Python packages
# listed below.
# NOTE(review): the original first line was the notebook cell magic `%bash`,
# which is not a shell command; it has been replaced by the shebang above.

git config --global --add safe.directory /home/TensorRT

# Install bazel from the official apt repository.
apt install apt-transport-https curl gnupg -y
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list


# -y keeps these installs non-interactive, matching the first apt call above.
apt update && apt install bazel-7.2.1 -y
apt install bazel -y
# NOTE(review): bare invocation kept from the original; presumably it triggers
# Bazel's first-run setup -- confirm it is still needed.
bazel
cd /home/TensorRT

# Editable install of the current checkout against the nightly cu128 index.
python -m pip install --pre -e . --extra-index-url https://download.pytorch.org/whl/nightly/cu128
pip install tensorrt==10.9.0.34 --force-reinstall

pip3 install --pre torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128


# Pinned versions of the model-side dependencies.
pip install sentencepiece=="0.2.0" transformers=="4.48.2" accelerate=="1.3.0" diffusers=="0.32.2" protobuf=="5.29.3"

pip install notebook
pip install gradio safetensors peft pyinstrument

Diff for: tools/perf/Flux/flux_perf.py

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
from time import time
2+
3+
import torch
4+
import torch_tensorrt
5+
from diffusers import FluxPipeline
6+
7+
# Print the name of every CUDA device visible to this process.
device_count = torch.cuda.device_count()
for device_index in range(device_count):
    device_props = torch.cuda.get_device_properties(device_index)
    print(device_props.name)

DEVICE = "cuda:0"

# Load the FLUX.1-dev pipeline in float32 and move it to the benchmark device.
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.float32)
pipe.to(DEVICE).to(torch.float32)

# Keep a handle on the transformer; it is the component wrapped by
# Torch-TensorRT further down in this script.
backbone = pipe.transformer


batch_size = 2
BATCH = torch.export.Dim("batch", min=1, max=8)

# The min/max values for the img_ids input are the ones recommended by torch
# dynamo during export of the model. To see that recommendation, try exporting
# with min=1, max=4096.
# NOTE(review): no range is declared for img_ids in the mapping below --
# confirm whether this note still applies.
dynamic_shapes = {
    "hidden_states": {0: BATCH},
    "encoder_hidden_states": {0: BATCH},
    "pooled_projections": {0: BATCH},
    "timestep": {0: BATCH},
    "txt_ids": {},
    "img_ids": {},
    "guidance": {0: BATCH},
    "joint_attention_kwargs": {},
    "return_dict": None,
}

# Keyword arguments forwarded to torch_tensorrt.MutableTorchTensorRTModule.
settings = {
    "strict": False,
    "allow_complex_guards_as_runtime_asserts": True,
    "enabled_precisions": {torch.float32},
    "truncate_double": True,
    "min_block_size": 1,
    "use_fp32_acc": True,
    "use_explicit_typing": True,
    "debug": False,
    "use_python_runtime": True,
    "immutable_weights": False,
}
48+
49+
50+
def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
    """Run the module-level ``pipe`` and optionally print timing figures.

    Args:
        prompt: Prompt(s) forwarded unchanged to ``pipe``.
        inference_step: Passed to ``pipe`` as ``num_inference_steps``; also
            the divisor used for the per-step average.
        batch_size: Passed to ``pipe`` as ``num_images_per_prompt``.
        benchmark: When true, print the total elapsed time and the average
            latency per step.
        iterations: Number of back-to-back pipeline calls covered by the
            timing; must be at least 1.

    Returns:
        The ``.images`` attribute of the last pipeline call.

    Raises:
        ValueError: If ``iterations`` is less than 1. Previously this case
            failed with an incidental ``UnboundLocalError`` or
            ``ZeroDivisionError``.
    """
    # Local import: the file's import block only brings in ``time``, and a
    # monotonic clock is the appropriate tool for measuring durations.
    from time import perf_counter

    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations!r}")

    start = perf_counter()
    for _ in range(iterations):
        image = pipe(
            prompt,
            output_type="pil",
            num_inference_steps=inference_step,
            num_images_per_prompt=batch_size,
        ).images
    elapsed = perf_counter() - start

    if benchmark:
        print("Time Elapse for", iterations, "iterations:", elapsed)
        print("Average Latency Per Step:", elapsed / inference_step / iterations)
    return image
65+
66+
67+
def _warm_up():
    """Run a short 2-step generation ahead of a timed run."""
    generate_image(["Test"], 2)


def _timed_run(title):
    """Print ``title``, then run the 50-step, 3-iteration benchmark."""
    print(title)
    generate_image(["Test"], 50, benchmark=True, iterations=3)


# Baseline: the unmodified pipeline in float32.
_warm_up()
_timed_run("Benchmark Original PyTorch Module Latency (float32)")

# Same pipeline after casting to float16.
pipe.to(torch.float16)
_timed_run("Benchmark Original PyTorch Module Latency (float16)")


# Wrap the transformer with Torch-TensorRT and install it in the pipeline.
trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
pipe.transformer = trt_gm

# The first generation through the wrapped transformer is timed separately
# and reported as the compilation time.
compile_started = time()
_warm_up()
compile_elapsed = time() - compile_started
print("Time Elapse compilation:", compile_elapsed)
print()
_timed_run("Benchmark TRT Accelerated Latency")
torch.cuda.empty_cache()


# NOTE(review): SOURCE lost its indentation; the three statements following
# the ``with`` line are assumed to all belong to its body -- confirm.
with torch_tensorrt.runtime.enable_cudagraphs(trt_gm):
    _warm_up()
    _timed_run("Benchmark TRT Accelerated Latency with Cuda Graph")

0 commit comments

Comments
 (0)