fix(zipf): improve approximation of zipf distribution (@NadAlaba) (#5515)

* fix(zipf): improve approximation of zipf distribution

* rename

---------

Co-authored-by: Miodec <jack@monkeytype.com>
This commit is contained in:
Nad Alaba 2024-06-24 15:08:47 +03:00 committed by GitHub
parent d12da37050
commit 211253becb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 15 additions and 9 deletions

View file

@ -1,5 +1,5 @@
import * as FunboxList from "./funbox/funbox-list";
import { dreymarIndex } from "../utils/misc";
import { zipfyRandomArrayIndex } from "../utils/misc";
import { randomElementFromArray, shuffle } from "../utils/arrays";
import Config from "../config";
@ -25,7 +25,7 @@ export class Wordset implements MonkeyTypes.Wordset {
randomWord(mode: MonkeyTypes.FunboxWordsFrequency): string {
if (mode === "zipf") {
return this.words[dreymarIndex(this.words.length)] as string;
return this.words[zipfyRandomArrayIndex(this.words.length)] as string;
} else {
return randomElementFromArray(this.words);
}

View file

@ -579,14 +579,20 @@ export function isDevEnvironment(): boolean {
return envConfig.isDevelopment;
}
export function dreymarIndex(arrayLength: number): number {
const n = arrayLength;
const g = 0.5772156649;
const M = Math.log(n) + g;
export function zipfyRandomArrayIndex(dictLength: number): number {
/**
* Get a random array index following the probability distribution of
* Zipf's law.
*
* The PMF of rank n (1-based) is (1/n)/H_N,
* where H_N is the N-th harmonic number and N is dictLength.
* Harmonic numbers are approximated using the formula:
* H_n = ln(n + 0.5) + gamma
*
* NOTE(review): this block appears to interleave the removed dreymarIndex
* lines with the added zipfyRandomArrayIndex lines (diff markers seem to
* have been lost) - confirm before treating it as compilable code.
*/
const gamma = 0.5772156649015329; // Euler–Mascheroni constant
const H_N = Math.log(dictLength + 0.5) + gamma; // approximation of H_N (normalizes the CDF)
const r = Math.random();
const h = Math.exp(r * M - g);
const W = Math.ceil(h);
return W - 1;
/*
* Inverse of the CDF, where CDF(n) is H_n/H_N.
* Solving ln(n + 0.5) + gamma = r * H_N for n gives
* n = exp(r * H_N - gamma) - 0.5
*/
const inverseCDF = Math.exp(r * H_N - gamma) - 0.5;
// Flooring yields a 0-based index. Mathematically (ignoring floating-point
// rounding) inverseCDF < dictLength whenever r < 1.
return Math.floor(inverseCDF);
}
export async function checkIfLanguageSupportsZipf(