fix(markdown): fix UTF-8 truncation for CJK characters in snippet generation

The truncateAtWord function was slicing strings by byte position instead of character position. When truncating text with multi-byte UTF-8 characters (like CJK), this could cut in the middle of a character, creating invalid UTF-8 and causing gRPC marshaling errors. Fixed by converting to runes before truncation to ensure we always cut at proper character boundaries. Added test cases for CJK characters. Fixes #5276 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-11 14:46:03 +08:00 · 2025-11-26 07:34:28 +08:00 · 2025-11-26 07:34:28 +08:00 · 68c17469a3
commit 68c17469a3
parent e17cd163c6
2 changed files with 19 additions and 4 deletions
--- a/plugin/markdown/markdown.go
+++ b/plugin/markdown/markdown.go
@@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
 }

 // truncateAtWord truncates a string at the last word boundary before maxLength.
+// maxLength is treated as a rune (character) count to properly handle UTF-8 multi-byte characters.
 func truncateAtWord(s string, maxLength int) string {
-	if len(s) <= maxLength {
+	// Convert to runes to properly handle multi-byte UTF-8 characters
+	runes := []rune(s)
+	if len(runes) <= maxLength {
 		return s
 	}

-	// Truncate to max length
-	truncated := s[:maxLength]
+	// Truncate to max length (by character count, not byte count)
+	truncated := string(runes[:maxLength])

-	// Find last space
+	// Find last space to avoid cutting in the middle of a word
 	lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
 	if lastSpace > 0 {
 		truncated = truncated[:lastSpace]
--- a/plugin/markdown/markdown_test.go
+++ b/plugin/markdown/markdown_test.go
@@ -382,6 +382,18 @@ func TestTruncateAtWord(t *testing.T) {
 			maxLength: 10,
 			expected:  "supercalif ...",
 		},
+		{
+			name:      "CJK characters without spaces",
+			input:     "这是一个很长的中文句子没有空格的情况下也要正确处理",
+			maxLength: 15,
+			expected:  "这是一个很长的中文句子没有空格 ...",
+		},
+		{
+			name:      "mixed CJK and Latin",
+			input:     "这是中文mixed with English文字",
+			maxLength: 10,
+			expected:  "这是中文mixed ...",
+		},
 	}

 	for _, tt := range tests {