fix(markdown): fix UTF-8 truncation for CJK characters in snippet generation

The truncateAtWord function was slicing strings by byte position instead of
character position. When truncating text with multi-byte UTF-8 characters
(like CJK), this could cut in the middle of a character, creating invalid
UTF-8 and causing gRPC marshaling errors.

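Fixed by converting the string to runes before truncation so that the cut
always falls on a character boundary. As a consequence, maxLength is now
interpreted as a rune (character) count rather than a byte count. Added test
cases for CJK-only and mixed CJK/Latin input.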

Fixes #5276

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Steven 2025-11-26 07:34:28 +08:00
parent e17cd163c6
commit 68c17469a3
2 changed files with 19 additions and 4 deletions

View file

@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
}
// truncateAtWord truncates a string at the last word boundary before maxLength.
// maxLength is treated as a rune (character) count to properly handle UTF-8 multi-byte characters.
func truncateAtWord(s string, maxLength int) string {
if len(s) <= maxLength {
// Convert to runes to properly handle multi-byte UTF-8 characters
runes := []rune(s)
if len(runes) <= maxLength {
return s
}
// Truncate to max length
truncated := s[:maxLength]
// Truncate to max length (by character count, not byte count)
truncated := string(runes[:maxLength])
// Find last space
// Find last space to avoid cutting in the middle of a word
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
if lastSpace > 0 {
truncated = truncated[:lastSpace]

View file

@ -382,6 +382,18 @@ func TestTruncateAtWord(t *testing.T) {
maxLength: 10,
expected: "supercalif ...",
},
{
name: "CJK characters without spaces",
input: "这是一个很长的中文句子没有空格的情况下也要正确处理",
maxLength: 15,
expected: "这是一个很长的中文句子没有空格 ...",
},
{
name: "mixed CJK and Latin",
input: "这是中文mixed with English文字",
maxLength: 10,
expected: "这是中文mixed ...",
},
}
for _, tt := range tests {