#define WARP_SIZE 64
namespace aiter
{
+    template <typename T, typename F>
+    __device__ constexpr T wave_reduce(T local, F reduce_f)
+    {
+        constexpr int reduce_stage = 6; // 1<<6=64
+        T v_local = local;
+        #pragma unroll
+        for (int i_stage = 0; i_stage < reduce_stage; i_stage++)
+        {
+            int src_lane = __lane_id() ^ (1 << i_stage);
+            int32_t v_remote_tmp =
+                __builtin_amdgcn_ds_bpermute(src_lane << 2, __builtin_bit_cast(int32_t, v_local));
+            T v_remote = __builtin_bit_cast(T, v_remote_tmp);
+            v_local = reduce_f(v_local, v_remote);
+        }
+        return v_local;
+    }
+
    __inline__ __device__ void warpReduceMax(float &val, int &idx)
    {
        static_assert(64 == WARP_SIZE, "WARP_SIZE == 64");
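A quick way to exercise the new wave_reduce helper; this test kernel is illustrative only (not part of the change) and assumes a 64-thread block and a 4-byte element type, since the helper round-trips values through int32_t via ds_bpermute.

// Illustrative smoke test (not in this PR): every lane contributes its lane id,
// so after the butterfly reduction all 64 lanes hold 0 + 1 + ... + 63 = 2016.
__global__ void wave_reduce_smoke_test(int *out)
{
    int v = aiter::wave_reduce(static_cast<int>(__lane_id()),
                               [](int a, int b) { return a + b; });
    if (threadIdx.x == 0)
    {
        out[blockIdx.x] = v; // expected 2016 when launched with block size 64
    }
}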
@@ -63,8 +80,8 @@ namespace aiter
        __syncthreads();
    }

-    template <typename DTYPE_I, typename fvec, int NUM_GRP, bool need_renorm>
-    __global__ void biased_grouped_topk_kernel(
+    template <typename DTYPE_I, typename fvec, int NUM_GRP, bool need_renorm, bool isBiased, bool isSoftmax>
+    __global__ void grouped_topk_kernel(
        const DTYPE_I *__restrict__ gating_output,   // [num_tokens, hidden_size]
        const DTYPE_I *__restrict__ correction_bias, // [num_expert]
        float *__restrict__ topk_weights,            // [num_tokens, topk]
@@ -106,36 +123,112 @@ namespace aiter
        fvec *scores_vec = reinterpret_cast<fvec *>(scores);
        constexpr uint32_t vec_size = sizeof(fvec) / sizeof(float);

-        for (int e = threadIdx.x; e < num_experts; e += blockDim.x)
+        if constexpr (!isSoftmax)
        {
-            float gating = static_cast<float>(gating_output[token_idx * num_experts + e]);
-            float score = 1.0f / (1.0f + expf(-gating));
-            scores[e] = score + correction_bias[e];
+            for (int e = threadIdx.x; e < num_experts; e += blockDim.x)
+            {
+                float gating = static_cast<float>(gating_output[token_idx * num_experts + e]);
+                gating = 1.0f / (1.0f + expf(-gating));
+                if constexpr (isBiased)
+                {
+                    gating += correction_bias[e];
+                }
+                scores[e] = gating;
+            }
+            __syncthreads();
        }
+        else
+        {
+            __shared__ float sdata;
+            float max_val = -INFINITY;
+            for (int e = threadIdx.x; e < num_experts; e += blockDim.x)
+            {

+                float gating = gating_output[token_idx * num_experts + e];
+                scores[e] = gating;
+                if (gating > max_val)
+                {
+                    max_val = gating;
+                }
+            }
+            __syncthreads();
        #pragma unroll
-        for (int g = threadIdx.x; g < NUM_GRP; g += blockDim.x)
-        {
-            float max1 = -INFINITY, max2 = -INFINITY;
-            const int start = g * experts_per_group;
-            const int end = start + experts_per_group;
+            for (int i = 0; i < 6; i++)
+            {
+                int offset = 1 << i;
+                float tmp_val = __shfl_down(max_val, offset);
+                if (tmp_val > max_val)
+                {
+                    max_val = tmp_val;
+                }
+            }
+            if (threadIdx.x == 0)
+            {
+                sdata = max_val;
+            }
+            __syncthreads();
+            max_val = sdata;
+            float thread_sum = 0.0;
+            for (int e = threadIdx.x; e < num_experts; e += blockDim.x)
+            {
+                scores[e] = expf(scores[e] - max_val);
+                thread_sum += scores[e];
+            }
+            __syncthreads();
+            thread_sum = wave_reduce(thread_sum, [](float a, float b) { return a + b; });
+            for (int e = threadIdx.x; e < num_experts; e += blockDim.x)
+            {
+                scores[e] /= thread_sum;
+            }
+            __syncthreads();
+        }

-            for (int e = start; e < end; ++e)
+        if constexpr (isBiased)
+        {
+            #pragma unroll
+            for (int g = threadIdx.x; g < NUM_GRP; g += blockDim.x)
            {
-                if (scores[e] > max1)
+                float max1 = -INFINITY, max2 = -INFINITY;
+                const int start = g * experts_per_group;
+                const int end = start + experts_per_group;
+
+                for (int e = start; e < end; ++e)
                {
-                    max2 = max1;
-                    max1 = scores[e];
+                    if (scores[e] > max1)
+                    {
+                        max2 = max1;
+                        max1 = scores[e];
+                    }
+                    else if (scores[e] > max2)
+                    {
+                        max2 = scores[e];
+                    }
                }
-                else if (scores[e] > max2)
+                group_scores[g] = max1 + max2;
+                group_mask[g] = false;
+            }
+            __syncthreads();
+        }
+        else
+        {
+            #pragma unroll
+            for (int g = threadIdx.x; g < NUM_GRP; g += blockDim.x)
+            {
+                float max1 = -INFINITY;
+                const int start = g * experts_per_group;
+                const int end = start + experts_per_group;
+                for (int e = start; e < end; ++e)
                {
-                    max2 = scores[e];
+                    if (scores[e] > max1)
+                    {
+                        max1 = scores[e];
+                    }
                }
+                group_scores[g] = max1;
+                group_mask[g] = false;
            }
-            group_scores[g] = max1 + max2;
-            group_mask[g] = false;
+            __syncthreads();
        }
-        __syncthreads();

        for (int k = 0; k < topk_group; k++)
        {
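For reference, the new softmax path above is the standard numerically stable softmax over a token's gating row: subtract the row max before exponentiating, then normalize by the sum (computed in the kernel with the shuffle and wave_reduce reductions). A plain host-side sketch of the same math, purely illustrative:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Host-side reference: scores[e] = exp(x[e] - max(x)) / sum_j exp(x[j] - max(x)).
std::vector<float> softmax_reference(const std::vector<float> &gating)
{
    const float max_val = *std::max_element(gating.begin(), gating.end());
    std::vector<float> scores(gating.size());
    float sum = 0.0f;
    for (std::size_t e = 0; e < gating.size(); ++e)
    {
        scores[e] = std::exp(gating[e] - max_val);
        sum += scores[e];
    }
    for (float &s : scores)
        s /= sum;
    return scores;
}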
@@ -205,7 +298,10 @@ namespace aiter
                max_idx = k;
                max_val = scores[max_idx];
            }
-            max_val -= correction_bias[max_idx];
+            if constexpr (isBiased)
+            {
+                max_val -= correction_bias[max_idx];
+            }
            scores[max_idx] = -INFINITY;
            topk_indices[k] = max_idx;
            topk_values[k] = max_val;
@@ -233,7 +329,7 @@ namespace aiter

        for (int k = threadIdx.x; k < topk; k += blockDim.x)
        {
-            topk_weights[token_idx * stride_tk + k] = need_renorm ? topk_values[k] * sum : topk_values[k];
+            topk_weights[token_idx * stride_tk + k] = topk_values[k] * sum;
            topk_ids[token_idx * stride_tk + k] = topk_indices[k];
        }
    }
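The final write now always multiplies by `sum`; how `sum` is produced is outside this hunk, so the following host-side reference is only an assumption about the intended result: with renormalization the top-k weights are divided by their own sum and scaled by routed_scaling_factor, otherwise only the scaling factor is applied.

#include <cstddef>
#include <numeric>
#include <vector>

// Assumed reference for the final top-k weights (the kernel's `sum` is not shown in this diff).
std::vector<float> finalize_topk_weights(const std::vector<float> &topk_values,
                                         bool need_renorm, float routed_scaling_factor)
{
    float scale = routed_scaling_factor;
    if (need_renorm)
    {
        scale /= std::accumulate(topk_values.begin(), topk_values.end(), 0.0f);
    }
    std::vector<float> weights(topk_values.size());
    for (std::size_t k = 0; k < topk_values.size(); ++k)
    {
        weights[k] = topk_values[k] * scale;
    }
    return weights;
}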
@@ -281,18 +377,49 @@ namespace aiter
        LAUNCHER4(VEC_F, NUM_GRP, false) \
    }

-#define LAUNCHER4(VEC_F, NUM_GRP, need_renorm) \
-    VLLM_DISPATCH_FLOATING_TYPES( \
-        gating_output.scalar_type(), "biased_grouped_topk_kernel", [&] \
-        { aiter::biased_grouped_topk_kernel<scalar_t, VEC_F, NUM_GRP, need_renorm> \
-              <<<grid, block, shared_mem_size, stream>>>( \
-                  gating_output.data_ptr<scalar_t>(), \
-                  correction_bias.data_ptr<scalar_t>(), \
-                  topk_weights.data_ptr<float>(), \
-                  topk_ids.data_ptr<int>(), \
-                  stride_tk, \
-                  num_experts, \
-                  topk, \
+#define LAUNCHER4(VEC_F, NUM_GRP, need_renorm) \
+    if constexpr (isBiased) \
+    { \
+        LAUNCHER_biased_grouped_topk_kernel(VEC_F, NUM_GRP, need_renorm, true, false) \
+    } \
+    else \
+    { \
+        if (isSoftmax) \
+        { \
+            LAUNCHER_grouped_topk_kernel(VEC_F, NUM_GRP, need_renorm, false, true) \
+        } \
+        else \
+        { \
+            LAUNCHER_grouped_topk_kernel(VEC_F, NUM_GRP, need_renorm, false, false) \
+        } \
+    }
+
+#define LAUNCHER_biased_grouped_topk_kernel(VEC_F, NUM_GRP, need_renorm, isBiased, isSoftmax) \
+    VLLM_DISPATCH_FLOATING_TYPES( \
+        gating_output.scalar_type(), "biased_grouped_topk_kernel", [&] \
+        { aiter::grouped_topk_kernel<scalar_t, VEC_F, NUM_GRP, need_renorm, isBiased, isSoftmax> \
+              <<<grid, block, shared_mem_size, stream>>>( \
+                  gating_output.data_ptr<scalar_t>(), \
+                  correction_bias.data_ptr<scalar_t>(), \
+                  topk_weights.data_ptr<float>(), \
+                  topk_ids.data_ptr<int>(), \
+                  stride_tk, \
+                  num_experts, \
+                  topk, \
+                  topk_grp, num_tokens, routed_scaling_factor); });
+
+#define LAUNCHER_grouped_topk_kernel(VEC_F, NUM_GRP, need_renorm, isBiased, isSoftmax) \
+    VLLM_DISPATCH_FLOATING_TYPES( \
+        gating_output.scalar_type(), "grouped_topk_kernel", [&] \
+        { aiter::grouped_topk_kernel<scalar_t, VEC_F, NUM_GRP, need_renorm, isBiased, isSoftmax> \
+              <<<grid, block, shared_mem_size, stream>>>( \
+                  gating_output.data_ptr<scalar_t>(), \
+                  nullptr, \
+                  topk_weights.data_ptr<float>(), \
+                  topk_ids.data_ptr<int>(), \
+                  stride_tk, \
+                  num_experts, \
+                  topk, \
                  topk_grp, num_tokens, routed_scaling_factor); });

void biased_grouped_topk(
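The reworked LAUNCHER4 splits dispatch in two steps: the biased/unbiased choice is a compile-time constant in the host wrappers (a `const bool isBiased` feeding `if constexpr`), while softmax vs. sigmoid stays a plain runtime branch. A minimal standalone sketch of that pattern, with purely illustrative names:

#include <cstdio>

template <bool IsBiased, bool IsSoftmax>
void launch_stub() // stands in for the real kernel launch
{
    std::printf("biased=%d softmax=%d\n", IsBiased, IsSoftmax);
}

void dispatch_example(bool isSoftmax)
{
    const bool isBiased = false; // compile-time constant, as in the host wrappers below
    if constexpr (isBiased)
    {
        launch_stub<true, false>();
    }
    else if (isSoftmax) // runtime choice between the two unbiased variants
    {
        launch_stub<false, true>();
    }
    else
    {
        launch_stub<false, false>();
    }
}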
@@ -303,8 +430,10 @@ void biased_grouped_topk(
    int num_expert_group,
    int topk_grp,
    bool need_renorm,
-    const float routed_scaling_factor)
+    const float routed_scaling_factor = 1.)
{
+    const bool isBiased = true;
+    bool isSoftmax = false;
    int num_tokens = gating_output.size(0);
    int num_experts = gating_output.size(1);
    int topk = topk_ids.size(1);
@@ -326,6 +455,42 @@ void biased_grouped_topk(

    LAUNCH_KERNEL()
}
+
+void grouped_topk(
+    torch::Tensor &gating_output, // [num_tokens, num_experts]
+    torch::Tensor &topk_weights,  // [num_tokens, topk]
+    torch::Tensor &topk_ids,      // [num_tokens, topk]
+    int num_expert_group,
+    int topk_grp,
+    bool need_renorm,
+    std::string scoring_func = "softmax",
+    const float routed_scaling_factor = 1.)
+{
+    TORCH_CHECK((scoring_func == "softmax") || (scoring_func == "sigmoid"), "grouped_topk scoring_func only supports softmax or sigmoid");
+    const bool isBiased = false;
+    bool isSoftmax = (scoring_func == "softmax");
+    int num_tokens = gating_output.size(0);
+    int num_experts = gating_output.size(1);
+    int topk = topk_ids.size(1);
+    size_t stride_tk = topk_ids.stride(0);
+    auto correction_bias = topk_ids;
+    TORCH_CHECK(stride_tk == topk_weights.stride(0), "topk_ids.stride(0) == topk_weights.stride(0)");
+
+    dim3 grid(num_tokens);
+    dim3 block(64);
+    size_t shared_mem_size = (num_experts * sizeof(float) +
+                              num_expert_group * sizeof(float) +
+                              num_expert_group * sizeof(bool) +
+                              topk * sizeof(int) +
+                              topk * sizeof(float) + 255) &
+                             ~255;
+
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    LAUNCH_KERNEL()
+}
+
#undef LAUNCHER4
#undef LAUNCHER3
#undef LAUNCHER2
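A hedged usage sketch of the new grouped_topk entry point from C++; the tensor shapes follow the comments in its signature, while the dtypes, expert/group counts, and the header that exposes the declaration are assumptions for illustration only.

#include <torch/torch.h>
// Assumes the grouped_topk declaration from this file is visible
// (e.g. via the extension's ops header, which is not shown in this diff).

void grouped_topk_example()
{
    const int num_tokens = 8, num_experts = 64, topk = 6;
    auto opt_f = torch::dtype(torch::kFloat32).device(torch::kCUDA);
    auto opt_i = torch::dtype(torch::kInt32).device(torch::kCUDA);

    torch::Tensor gating_output = torch::randn({num_tokens, num_experts}, opt_f);
    torch::Tensor topk_weights  = torch::empty({num_tokens, topk}, opt_f);
    torch::Tensor topk_ids      = torch::empty({num_tokens, topk}, opt_i);

    grouped_topk(gating_output, topk_weights, topk_ids,
                 /*num_expert_group=*/8, /*topk_grp=*/4,
                 /*need_renorm=*/true, /*scoring_func=*/"softmax",
                 /*routed_scaling_factor=*/1.0f);
}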