impr(lazy mode): support replacing 2 characters with one

also adds lazy mode to yiddish
closes #6321
This commit is contained in:
Miodec 2025-06-22 10:49:05 +02:00
parent 5ca47e116b
commit ced4b6e162
3 changed files with 67 additions and 14 deletions

View file

@ -6,6 +6,12 @@ let germanAccents = [
["ü", "ue"],
] as [string, string][];
// Fixture rules covering every length combination the replacer must handle:
// one char -> two chars, two chars -> one char, and two chars -> two chars.
const multicharAccents: [string, string][] = [
  ["a", "bc"],
  ["de", "f"],
  ["gh", "ij"],
];
describe("lazy-mode", () => {
describe("replaceAccents", () => {
it("should replace common accents", () => {
@ -38,5 +44,23 @@ describe("lazy-mode", () => {
expect(result).toBe("aesse");
});
});
describe("multicharacter accents", () => {
  it("should correctly replace multicharacter accents", () => {
    // Each entry is [word, expected output] under the multicharAccents rules:
    // every rule on its own, each rule repeated, then all rules mixed with
    // characters that no rule starts at.
    const cases: [string, string][] = [
      ["a", "bc"],
      ["aa", "bcbc"],
      ["de", "f"],
      ["dede", "ff"],
      ["gh", "ij"],
      ["ghgh", "ijij"],
      ["abcdefgh", "bcbcffij"],
    ];

    for (const [word, want] of cases) {
      expect(replaceAccents(word, multicharAccents)).toBe(want);
    }
  });
});
});
});

View file

@ -49,14 +49,21 @@ const accentsMap = new Map<string, string>(
export type Accents = [string, string][];
/**
 * Finds the accent replacement rule that applies at the start of `wordSlice`.
 *
 * Language-specific `additionalAccents` are checked first: the first rule whose
 * source string is a prefix of the lowercased slice wins, so a rule may cover
 * more than one character. If no such rule matches, the first character of the
 * lowercased slice is looked up in the common `accentsMap`.
 *
 * @param wordSlice - remainder of the word, starting at the position to check
 * @param additionalAccents - optional `[source, replacement]` rules that take
 *   precedence over the common accents
 * @returns the matching `[source, replacement]` pair, or `undefined` when
 *   nothing applies (including when `wordSlice` is empty)
 */
function findAccent(
  wordSlice: string,
  additionalAccents?: Accents
): [string, string] | undefined {
  const lookup = wordSlice.toLowerCase();

  // Rules with an empty source are ignored: "" is a prefix of every string,
  // and the caller advances by `source.length - 1`, which would step backwards.
  const found = additionalAccents?.find(
    (rule) => rule[0].length > 0 && lookup.startsWith(rule[0])
  );
  if (found !== undefined) return found;

  // An empty slice has no first character to look up.
  const first = lookup[0];
  if (first === undefined) return undefined;

  const common = accentsMap.get(first);
  return common !== undefined ? [first, common] : undefined;
}
export function replaceAccents(
@ -68,19 +75,24 @@ export function replaceAccents(
const cases = [...word].map((it, i) => it === uppercased[i]);
const newWordArray: string[] = [];
let offset = 0;
for (let i = 0; i < word.length; i++) {
const char = word[i] as string;
const isUpperCase = cases[i];
const accent = findAccent(char, additionalAccents);
const index = i + offset;
if (index >= word.length) break;
const wordSlice = word.slice(index);
const caseSlice = cases.slice(index);
const accent = findAccent(wordSlice, additionalAccents);
if (accent !== undefined) {
if (isUpperCase) {
newWordArray.push(accent.substring(0, 1).toUpperCase());
newWordArray.push(accent.substring(1));
} else {
newWordArray.push(accent);
for (let j = 0; j < accent[1].length; j++) {
const char = accent[1][j] as string;
const isUpperCase = caseSlice[j] ?? false;
newWordArray.push(isUpperCase ? char.toUpperCase() : char);
}
offset += accent[0].length - 1;
} else {
const char = word[index] as string;
const isUpperCase = cases[index];
newWordArray.push(isUpperCase ? char.toUpperCase() : char);
}
}

View file

@ -2,8 +2,25 @@
"name": "yiddish",
"rightToLeft": true,
"ligatures": true,
"noLazyMode": true,
"bcp47": "yi",
"additionalAccents": [
["אַ", "א"],
["אָ", "א"],
["בּ", "ב"],
["בֿ", "ב"],
["וּ", "ו"],
["וֹ", "ו"],
["יִ", "י"],
["כּ", "כ"],
["פּ", "פ"],
["פֿ", "פ"],
["שׂ", "ש"],
["תּ", "ת"],
["ײַ", "יי"],
["ײ", "יי"],
["ױ", "וי"],
["װ", "וו"]
],
"words": [
"קאַווע",
"אויפּס",