fix: handling of characters outside the BMP (@fehmer) (#5911)

Handle multi-byte characters outside the [basic multilingual plane](https://en.wikipedia.org/wiki/Plane_(Unicode)) correctly. Fixes #5906
2025-11-09 21:51:29 +08:00 · 2024-09-25 13:49:23 +02:00 · 2024-09-25 13:49:23 +02:00 · f9409e3fcf
commit f9409e3fcf
parent 6bf1cb8672
4 changed files with 54 additions and 21 deletions
--- a/frontend/tests/utils/strings.spec.ts
+++ b/frontend/tests/utils/strings.spec.ts
@ -0,0 +1,12 @@
+import * as Strings from "../../src/ts/utils/strings";
+
+describe("string utils", () => {
+  describe("splitIntoCharacters", () => {
+    it("splits regular characters", () => {
+      expect(Strings.splitIntoCharacters("abc")).toEqual(["a", "b", "c"]);
+    });
+    it("splits characters outside of the bmp", () => {
+      expect(Strings.splitIntoCharacters("t𐑩e")).toEqual(["t", "𐑩", "e"]); // the non-BMP letter must come back as one entry
+    });
+  });
+});
--- a/frontend/src/ts/test/caret.ts
+++ b/frontend/src/ts/test/caret.ts
@ -6,6 +6,7 @@ import * as TestState from "../test/test-state";
 import * as TestWords from "./test-words";
 import { prefersReducedMotion } from "../utils/misc";
 import { convertRemToPixels } from "../utils/numbers";
+import { splitIntoCharacters } from "../utils/strings";

 export let caretAnimating = true;
 const caret = document.querySelector("#caret") as HTMLElement;
@ -133,8 +134,8 @@ export async function updatePosition(noAnim = false): Promise<void> {
    Config.caretStyle
  );

-  let wordLen = TestWords.words.getCurrent().length;
-  const inputLen = TestInput.input.current.length;
+  let wordLen = splitIntoCharacters(TestWords.words.getCurrent()).length;
+  const inputLen = splitIntoCharacters(TestInput.input.current).length;
  if (Config.mode === "zen") wordLen = inputLen;
  const activeWordEl = document?.querySelector("#words .active") as HTMLElement;
  //insert temporary character so the caret will work in zen mode
--- a/frontend/src/ts/test/test-ui.ts
+++ b/frontend/src/ts/test/test-ui.ts
@ -41,13 +41,14 @@ function createHintsHtml(
  activeWordLetters: NodeListOf<Element>,
  inputWord: string
 ): string {
+  const inputChars = Strings.splitIntoCharacters(inputWord);
  let hintsHtml = "";
  for (const adjacentLetters of incorrectLtrIndices) {
    for (const indx of adjacentLetters) {
      const blockLeft = (activeWordLetters[indx] as HTMLElement).offsetLeft;
      const blockWidth = (activeWordLetters[indx] as HTMLElement).offsetWidth;
      const blockIndices = `[${indx}]`;
-      const blockChars = inputWord[indx];
+      const blockChars = inputChars[indx];

      hintsHtml +=
        `<hint data-length=1 data-chars-index=${blockIndices}` +
@ -332,16 +333,17 @@ function getWordHTML(word: string): string {
  const funbox = FunboxList.get(Config.funbox).find(
    (f) => f.functions?.getWordHtml
  );
-  for (let c = 0; c < word.length; c++) {
+  const chars = Strings.splitIntoCharacters(word);
+  for (const char of chars) {
    if (funbox?.functions?.getWordHtml) {
-      retval += funbox.functions.getWordHtml(word.charAt(c), true);
-    } else if (word.charAt(c) === "\t") {
+      retval += funbox.functions.getWordHtml(char, true);
+    } else if (char === "\t") {
      retval += `<letter class='tabChar'><i class="fas fa-long-arrow-alt-right fa-fw"></i></letter>`;
-    } else if (word.charAt(c) === "\n") {
+    } else if (char === "\n") {
      newlineafter = true;
      retval += `<letter class='nlChar'><i class="fas fa-level-down-alt fa-rotate-90 fa-fw"></i></letter>`;
    } else {
-      retval += "<letter>" + word.charAt(c) + "</letter>";
+      retval += "<letter>" + char + "</letter>";
    }
  }
  retval += "</div>";
@ -833,10 +835,12 @@ export async function updateActiveWordLetters(
      (f) => f.functions?.getWordHtml
    );

-    for (let i = 0; i < input.length; i++) {
-      const charCorrect = currentWord[i] === input[i];
+    const inputChars = Strings.splitIntoCharacters(input);
+    const currentWordChars = Strings.splitIntoCharacters(currentWord);
+    for (let i = 0; i < inputChars.length; i++) {
+      const charCorrect = currentWordChars[i] === inputChars[i];

-      let currentLetter = currentWord[i] as string;
+      let currentLetter = currentWordChars[i] as string;
      let tabChar = "";
      let nlChar = "";
      if (funbox?.functions?.getWordHtml) {
@ -862,13 +866,13 @@ export async function updateActiveWordLetters(
      ) {
        ret += `<letter class="dead">${
          Config.indicateTypos === "replace"
-            ? input[i] === " "
+            ? inputChars[i] === " "
              ? "_"
-              : input[i]
+              : inputChars[i]
            : currentLetter
        }</letter>`;
      } else if (currentLetter === undefined) {
-        let letter = input[i];
+        let letter = inputChars[i];
        if (letter === " " || letter === "\t" || letter === "\n") {
          letter = "_";
        }
@ -877,9 +881,9 @@ export async function updateActiveWordLetters(
        ret +=
          `<letter class="incorrect ${tabChar}${nlChar}">` +
          (Config.indicateTypos === "replace"
-            ? input[i] === " "
+            ? inputChars[i] === " "
              ? "_"
-              : input[i]
+              : inputChars[i]
            : currentLetter) +
          "</letter>";
        if (Config.indicateTypos === "below") {
@ -893,15 +897,16 @@ export async function updateActiveWordLetters(
      }
    }

-    for (let i = input.length; i < currentWord.length; i++) {
+    for (let i = inputChars.length; i < currentWordChars.length; i++) {
+      const currentLetter = currentWordChars[i];
      if (funbox?.functions?.getWordHtml) {
-        ret += funbox.functions.getWordHtml(currentWord[i] as string, true);
-      } else if (currentWord[i] === "\t") {
+        ret += funbox.functions.getWordHtml(currentLetter as string, true);
+      } else if (currentLetter === "\t") {
        ret += `<letter class='tabChar'><i class="fas fa-long-arrow-alt-right fa-fw"></i></letter>`;
-      } else if (currentWord[i] === "\n") {
+      } else if (currentLetter === "\n") {
        ret += `<letter class='nlChar'><i class="fas fa-level-down-alt fa-rotate-90 fa-fw"></i></letter>`;
      } else {
-        ret += `<letter>` + currentWord[i] + "</letter>";
+        ret += `<letter>` + currentLetter + "</letter>";
      }
    }
  }
--- a/frontend/src/ts/utils/strings.ts
+++ b/frontend/src/ts/utils/strings.ts
@ -149,3 +149,18 @@ export function cleanTypographySymbols(textToClean: string): string {
    (char) => specials[char as keyof typeof specials] || ""
  );
 }
+
+/**
+ * Split a string into characters. This supports characters outside of the [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)), which take up two UTF-16 code units.
+ * Using `string.length` and `string[index]` does not work for them because both operate on UTF-16 code units.
+ * @param s string to be tokenized into characters
+ * @returns array of characters, one entry per Unicode code point of `s`
+ */
+export function splitIntoCharacters(s: string): string[] {
+  const result: string[] = [];
+  for (const t of s) {
+    result.push(t);
+  }
+
+  return result;
+}