Added perf script

cehongwang · cehongwang · commit c01815128404 · 2025-04-07T15:03:33.000Z
diff --git a/tools/perf/Flux/benchmark.sh b/tools/perf/Flux/benchmark.sh
@@ -0,0 +1,4 @@
+# TODO: export HF_TOKEN=<your Hugging Face access token> before running, so no secret is written into this file.
+huggingface-cli login --token "$HF_TOKEN"
+
+python flux_perf.py > benchmark_output.txt  # benchmark stdout is saved to benchmark_output.txt
diff --git a/tools/perf/Flux/create_env.sh b/tools/perf/Flux/create_env.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Set up the Flux benchmark environment: Bazel, an editable install from /home/TensorRT, and Python dependencies.
+git config --global --add safe.directory /home/TensorRT
+
+# Install Bazel from the Bazel apt repository
+apt install apt-transport-https curl gnupg -y
+curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
+mv bazel-archive-keyring.gpg /usr/share/keyrings
+echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
+
+# -y keeps these installs non-interactive, matching the apt call above
+apt update && apt install -y bazel-7.2.1
+apt install -y bazel
+bazel
+cd /home/TensorRT
+
+python -m pip install --pre -e . --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+pip install tensorrt==10.9.0.34 --force-reinstall
+
+pip3 install --pre  torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
+
+# Pinned versions of the libraries used alongside the Flux pipeline
+pip install sentencepiece=="0.2.0" transformers=="4.48.2" accelerate=="1.3.0" diffusers=="0.32.2" protobuf=="5.29.3"
+
+pip install notebook
+pip install gradio safetensors peft pyinstrument
diff --git a/tools/perf/Flux/flux_perf.py b/tools/perf/Flux/flux_perf.py
@@ -0,0 +1,93 @@
+from time import time
+
+import torch
+import torch_tensorrt
+from diffusers import FluxPipeline
+
+for i in range(torch.cuda.device_count()):  # print the name of every visible CUDA device
+    print(torch.cuda.get_device_properties(i).name)
+
+DEVICE = "cuda:0"  # the pipeline is moved to the first GPU
+pipe = FluxPipeline.from_pretrained(  # load FLUX.1-dev, requesting float32 weights
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.float32,
+)
+pipe.to(DEVICE).to(torch.float32)
+backbone = pipe.transformer  # kept by name; wrapped by MutableTorchTensorRTModule further down
+
+
+batch_size = 2  # NOTE(review): never read in this file; generate_image has its own batch_size parameter
+BATCH = torch.export.Dim("batch", min=1, max=8)  # symbolic batch dimension shared by the inputs below
+
+# NOTE(review): the original remark said the min/max values for the img_ids input are recommended by torch dynamo
+# during export (visible when exporting with min=1, max=4096), but "img_ids" declares no dynamic dims below — confirm.
+dynamic_shapes = {
+    "hidden_states": {0: BATCH},  # only dim 0 is marked dynamic for the BATCH-keyed inputs
+    "encoder_hidden_states": {0: BATCH},
+    "pooled_projections": {0: BATCH},
+    "timestep": {0: BATCH},
+    "txt_ids": {},  # empty spec: no dynamic dims declared
+    "img_ids": {},
+    "guidance": {0: BATCH},
+    "joint_attention_kwargs": {},
+    "return_dict": None,
+}
+
+settings = {  # keyword arguments unpacked into torch_tensorrt.MutableTorchTensorRTModule further down
+    "strict": False,
+    "allow_complex_guards_as_runtime_asserts": True,
+    "enabled_precisions": {torch.float32},
+    "truncate_double": True,
+    "min_block_size": 1,
+    "use_fp32_acc": True,
+    "use_explicit_typing": True,
+    "debug": False,
+    "use_python_runtime": True,
+    "immutable_weights": False,
+}
+
+
+def generate_image(prompt, inference_step, batch_size=2, benchmark=False, iterations=1):
+
+    start = time()
+    for i in range(iterations):
+        image = pipe(
+            prompt,
+            output_type="pil",
+            num_inference_steps=inference_step,
+            num_images_per_prompt=batch_size,
+        ).images
+    end = time()
+    if benchmark:
+        print("Time Elapse for", iterations, "iterations:", end - start)
+        print("Average Latency Per Step:", (end - start) / inference_step / iterations)
+    return image
+
+
+generate_image(["Test"], 2)  # short untimed run first (benchmark stays False); presumably a warm-up
+print("Benchmark Original PyTorch Module Latency (float32)")
+generate_image(["Test"], 50, benchmark=True, iterations=3)
+
+pipe.to(torch.float16)  # cast the pipeline to half precision for the second baseline
+print("Benchmark Original PyTorch Module Latency (float16)")
+generate_image(["Test"], 50, benchmark=True, iterations=3)
+
+
+trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)  # NOTE(review): pipe was cast to float16 above while settings enable float32 — confirm intended
+trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)  # empty tuple first, then the per-input-name shape spec
+pipe.transformer = trt_gm  # the pipeline now calls the wrapped transformer
+
+start = time()  # time the first call through the wrapped module; reported below as compilation time
+generate_image(["Test"], 2)
+end = time()
+print("Time Elapse compilation:", end - start)
+print()
+print("Benchmark TRT Accelerated Latency")
+generate_image(["Test"], 50, benchmark=True, iterations=3)
+torch.cuda.empty_cache()  # release cached CUDA memory before the CUDA-graph run that follows
+
+
+with torch_tensorrt.runtime.enable_cudagraphs(trt_gm):  # NOTE(review): the yielded object is not bound; pipe.transformer is still trt_gm — confirm CUDA graphs apply
+    generate_image(["Test"], 2)  # short untimed run first (benchmark stays False)
+    print("Benchmark TRT Accelerated Latency with Cuda Graph")
+    generate_image(["Test"], 50, benchmark=True, iterations=3)