Merge pull request #157 from mit-han-lab/dev

Fix missing third_party & merge main branch of dev repo
mit-han-lab · Mar 8, 2025 · 6772359 · 6772359
2 parents e1c5f3e + 75dad57
commit 6772359
Show file tree

Hide file tree

Showing 12 changed files with 46 additions and 1 deletion.
diff --git a/setup.py b/setup.py
@@ -129,7 +129,7 @@ def cond(s) -> list:
     for target in sm_targets:
         NVCC_FLAGS += ["-gencode", f"arch=compute_{target},code=sm_{target}"]
 
-    NVCC_MSVC_FLAGS = ["-Xcompiler", "/Zc:__cplusplus", "-Xcompiler", "/FS"]
+    NVCC_MSVC_FLAGS = ["-Xcompiler", "/Zc:__cplusplus", "-Xcompiler", "/FS", "-Xcompiler", "/bigobj"]
 
     nunchaku_extension = CUDAExtension(
         name="nunchaku._C",

diff --git a/src/Module.h b/src/Module.h
@@ -215,6 +215,11 @@ struct LayerOffloadHelper {
         if (offload) {
             streamCompute = std::make_unique<CUDAStreamWrapper>();
             streamLoad = std::make_unique<CUDAStreamWrapper>();
+
+            needWorkaround = checkWorkaround();
+            if (needWorkaround) {
+                spdlog::debug("Offloading helper: use WDDM workaround");
+            }
         }
     }
 
@@ -240,6 +245,7 @@ struct LayerOffloadHelper {
                 funcCompute(layer);
                 nextComputeDone = std::make_unique<CUDAEventWrapper>();
                 checkCUDA(cudaEventRecord(nextComputeDone->event, getCurrentCUDAStream()));
+                workaroundFlush();
             }
 
             {
@@ -253,10 +259,13 @@ struct LayerOffloadHelper {
                 }
                 nextLoadDone = std::make_unique<CUDAEventWrapper>();
                 checkCUDA(cudaEventRecord(nextLoadDone->event, getCurrentCUDAStream()));
+                workaroundFlush();
             }
 
             eventComputeDone = std::move(nextComputeDone);
             eventLoadDone = std::move(nextLoadDone);
+
+            workaroundSynchronize();
         }
     }
 
@@ -266,4 +275,35 @@ struct LayerOffloadHelper {
         }
         checkCUDA(cudaStreamWaitEvent(getCurrentCUDAStream(), event->event));
     }
+
+    // WDDM prevents multiple streams from running concurrently;
+    // use flush and synchronize to work around this.
+    bool needWorkaround; // Gates workaroundFlush()/workaroundSynchronize(). NOTE(review): the only assignment visible here is inside the `if (offload)` branch; consider a default initializer — confirm no other path reads it.
+    static bool checkWorkaround() { // Returns whether the workaround should be used: env override first, then platform default.
+        if (char *env = getenv("NUNCHAKU_OFFLOAD_WDDM_WORKAROUND")) { // skipped when the variable is unset
+            if (std::string(env) == "1") {
+                return true; // explicitly enabled
+            } else if (std::string(env) == "0") {
+                return false; // explicitly disabled
+            }
+        } // any other value falls through to the platform default below
+
+    #ifdef _WIN32
+        return true; // default when _WIN32 is defined
+    #else
+        return false; // default on every other platform
+    #endif
+    }
+    void workaroundFlush() { // No-op unless needWorkaround is set.
+        if (!needWorkaround) {
+            return;
+        }
+        cudaStreamQuery(getCurrentCUDAStream()); // result is discarded (not wrapped in checkCUDA); presumably issued only to flush queued work on the current stream — TODO confirm
+    }
+    void workaroundSynchronize() { // No-op unless needWorkaround is set.
+        if (!needWorkaround) {
+            return;
+        }
+        checkCUDA(cudaEventSynchronize(eventComputeDone->event)); // waits on the host for eventComputeDone; dereferences it, so eventComputeDone must be non-null here
+    }
 };
diff --git a/third_party/Block-Sparse-Attention b/third_party/Block-Sparse-Attention
diff --git a/third_party/Block-Sparse-Attention/.gitkeep b/third_party/Block-Sparse-Attention/.gitkeep
diff --git a/third_party/cutlass b/third_party/cutlass
diff --git a/third_party/cutlass/.gitkeep b/third_party/cutlass/.gitkeep
diff --git a/third_party/json b/third_party/json
diff --git a/third_party/json/.gitkeep b/third_party/json/.gitkeep
diff --git a/third_party/mio b/third_party/mio
diff --git a/third_party/mio/.gitkeep b/third_party/mio/.gitkeep
diff --git a/third_party/spdlog b/third_party/spdlog
diff --git a/third_party/spdlog/.gitkeep b/third_party/spdlog/.gitkeep