From b1584aec7c26184ca4f361ef482fe5d4a26e1684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E9=91=AB?= <liuxin@modelbest.cn>
Date: Mon, 13 Apr 2026 15:38:53 +0800
Subject: [PATCH] fix: stabilize CPU SDPA mask broadcasting

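Reshape the incremental-decoding attention mask in MiniCPMAttention to
an explicit (1, 1, 1, cache_len) shape before it is used for SDPA. On
CPU, some PyTorch versions raise a dimension error when SDPA is given a
1D mask. The reshaped mask holds the same values and broadcasts over
the leading dimensions, so attention semantics are unchanged.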

Made-with: Cursor
---
 src/voxcpm/modules/minicpm4/model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/voxcpm/modules/minicpm4/model.py b/src/voxcpm/modules/minicpm4/model.py
index 99d6f0b..6075807 100644
--- a/src/voxcpm/modules/minicpm4/model.py
+++ b/src/voxcpm/modules/minicpm4/model.py
@@ -196,7 +196,9 @@ class MiniCPMAttention(nn.Module):
         key_cache[:, :, position_id, :] = key_states
         value_cache[:, :, position_id, :] = value_states
 
-        attn_mask = torch.arange(key_cache.size(2), device=key_cache.device) <= position_id
+        # Reshape the 1D mask to (1, 1, 1, cache_len) so it broadcasts explicitly
+        # in SDPA; a 1D mask can trigger a CPU-side dimension bug in some PyTorch versions.
+        attn_mask = (torch.arange(key_cache.size(2), device=key_cache.device) <= position_id).view(1, 1, 1, -1)
 
         # ref: https://github.com/pytorch/pytorch/issues/163597
         # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous