Update launch_bounds_utils.h for correct compile on Multiple Cuda Arch - PTXAS out of range Warning (#25843)

Signed-off-by: Salvatore Cena <cena@cenas.it> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-12-06 06:53:12 +08:00 · 2025-10-01 04:18:19 +02:00
parent 001e50c92c
commit a73f6491c8
1 changed files with 29 additions and 3 deletions
--- a/csrc/launch_bounds_utils.h
+++ b/csrc/launch_bounds_utils.h
@@ -8,11 +8,37 @@
  #define VLLM_LAUNCH_BLOCKS_CAP 4
 #endif

-// compile-time estimate of max threads per SM for launch bounds.
+// Compile-time estimate of max threads per SM for launch bounds.
+// Architectures fall into three families by per-SM limit: 1024, 1536, or 2048 threads/SM.
 #ifndef VLLM_MAX_THREADS_PER_SM
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
-    #define VLLM_MAX_THREADS_PER_SM 1536
+  #ifdef __CUDA_ARCH__
+
+    /* 1024 threads/SM: Turing (sm_75) */
+    #if (__CUDA_ARCH__ == 750)
+      #define VLLM_MAX_THREADS_PER_SM 1024
+
+    /* 1536 threads/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
+        GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
+    #elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
+        (__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) ||  \
+        (__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
+        (__CUDA_ARCH__ == 1210)
+      #define VLLM_MAX_THREADS_PER_SM 1536
+
+    /* 2048 threads/SM: Volta (sm_70/72), Ampere GA100 (sm_80), Hopper (sm_90),
+        Blackwell (sm_100/103); listed explicitly, same value as the fallback */
+    #elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
+        (__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) ||   \
+        (__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
+      #define VLLM_MAX_THREADS_PER_SM 2048
+
+    /* Fallback: any __CUDA_ARCH__ not matched above (older or future) gets 2048 */
+    #else
+      #define VLLM_MAX_THREADS_PER_SM 2048
+    #endif
+
   #else
+  /* Host pass (__CUDA_ARCH__ undefined): 2048 as a neutral default */
     #define VLLM_MAX_THREADS_PER_SM 2048
   #endif
 #endif