mirror of
https://github.com/usememos/memos.git
synced 2025-12-11 14:46:03 +08:00
fix(markdown): fix UTF-8 truncation for CJK characters in snippet generation
The truncateAtWord function was slicing strings by byte position instead of character position. When truncating text with multi-byte UTF-8 characters (like CJK), this could cut in the middle of a character, creating invalid UTF-8 and causing gRPC marshaling errors.

Fixed by converting to runes before truncation to ensure we always cut at proper character boundaries. Added test cases for CJK characters.

Fixes #5276

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
e17cd163c6
commit
68c17469a3
2 changed files with 19 additions and 4 deletions
|
|
@ -389,15 +389,18 @@ func uniqueLowercase(strs []string) []string {
|
|||
}
|
||||
|
||||
// truncateAtWord truncates a string at the last word boundary before maxLength.
|
||||
// maxLength is treated as a rune (character) count to properly handle UTF-8 multi-byte characters.
|
||||
func truncateAtWord(s string, maxLength int) string {
|
||||
if len(s) <= maxLength {
|
||||
// Convert to runes to properly handle multi-byte UTF-8 characters
|
||||
runes := []rune(s)
|
||||
if len(runes) <= maxLength {
|
||||
return s
|
||||
}
|
||||
|
||||
// Truncate to max length
|
||||
truncated := s[:maxLength]
|
||||
// Truncate to max length (by character count, not byte count)
|
||||
truncated := string(runes[:maxLength])
|
||||
|
||||
// Find last space
|
||||
// Find last space to avoid cutting in the middle of a word
|
||||
lastSpace := strings.LastIndexAny(truncated, " \t\n\r")
|
||||
if lastSpace > 0 {
|
||||
truncated = truncated[:lastSpace]
|
||||
|
|
|
|||
|
|
@ -382,6 +382,18 @@ func TestTruncateAtWord(t *testing.T) {
|
|||
maxLength: 10,
|
||||
expected: "supercalif ...",
|
||||
},
|
||||
{
|
||||
name: "CJK characters without spaces",
|
||||
input: "这是一个很长的中文句子没有空格的情况下也要正确处理",
|
||||
maxLength: 15,
|
||||
expected: "这是一个很长的中文句子没有空格 ...",
|
||||
},
|
||||
{
|
||||
name: "mixed CJK and Latin",
|
||||
input: "这是中文mixed with English文字",
|
||||
maxLength: 10,
|
||||
expected: "这是中文mixed ...",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue