Mirror of https://github.com/monkeytypegame/monkeytype.git (synced 2025-10-23 05:56:10 +08:00)
Use maximum tf norm (#3077) by Bruception
Commit c96956c9f1, parent b25ac99527
1 changed file with 33 additions and 2 deletions
@@ -13,6 +13,8 @@ interface SearchServiceOptions {
 
 interface InternalDocument {
   id: number;
+  maxTermFrequency: number;
+  termFrequencies: Record<string, number>;
 }
 
 interface ReverseIndex {
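For orientation, here is a hypothetical InternalDocument for the three-word text "red red fox" (an illustrative sketch, not part of the commit; it assumes each word stems to itself and uses the interface added above):

const exampleDocument: InternalDocument = {
  id: 0,
  maxTermFrequency: 2, // "red" occurs twice, more than any other term
  termFrequencies: { red: 2, fox: 1 },
};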
@@ -47,6 +49,18 @@ function inverseDocumentFrequency(
   return Math.log10(numberOfDocuments / numberOfDocumentsWithTerm);
 }
 
+const ALPHA = 0.4; // Smoothing term that dampens the contribution of tf/max tf
+
+function normalizedTermFrequency(
+  term: string,
+  document: InternalDocument
+): number {
+  return (
+    ALPHA +
+    (1 - ALPHA) * (document.termFrequencies[term] / document.maxTermFrequency)
+  );
+}
+
 function tokenize(text: string): string[] {
   return text.match(/[^\\\][.,"/#!?$%^&*;:{}=\-_`~()\s]+/g) || [];
 }
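normalizedTermFrequency is maximum tf normalization (also called augmented or double normalization): tfNorm = ALPHA + (1 - ALPHA) * tf / maxTf, so any term that occurs in a document scores at least 0.4 and the document's most frequent term scores exactly 1. A worked sketch with the toy document from above (illustrative values, not repo code):

const ALPHA = 0.4;
const termFrequencies: Record<string, number> = { red: 2, fox: 1 };
const maxTermFrequency = 2;

const tfNorm = (term: string): number =>
  ALPHA + (1 - ALPHA) * (termFrequencies[term] / maxTermFrequency);

console.log(tfNorm("red")); // 0.4 + 0.6 * (2 / 2) = 1
console.log(tfNorm("fox")); // 0.4 + 0.6 * (1 / 2) ≈ 0.7

tokenize, unchanged in spirit here, splits on whitespace and common punctuation, so tokenize("Hello, world!") returns ["Hello", "world"].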
@@ -64,8 +78,12 @@ export const buildSearchService = <T>(
 
     const internalDocument: InternalDocument = {
       id: documentIndex,
+      termFrequencies: {},
+      maxTermFrequency: 0,
     };
 
+    let maxTermFrequency = 0;
+
     rawTokens.forEach((token) => {
       const stemmedToken = stemmer(token);
 
@@ -78,7 +96,19 @@ export const buildSearchService = <T>(
         reverseIndex[stemmedToken] = new Set<InternalDocument>();
       }
       reverseIndex[stemmedToken].add(internalDocument);
+
+      if (!(stemmedToken in internalDocument.termFrequencies)) {
+        internalDocument.termFrequencies[stemmedToken] = 0;
+      }
+
+      internalDocument.termFrequencies[stemmedToken]++;
+      maxTermFrequency = Math.max(
+        maxTermFrequency,
+        internalDocument.termFrequencies[stemmedToken]
+      );
     });
+
+    internalDocument.maxTermFrequency = maxTermFrequency;
   });
 
   const tokenSet = Object.keys(reverseIndex);
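The two hunks above build, per document, a table of term counts and remember the largest count. A stripped-down sketch of that counting logic (identity stemming and a hypothetical token list; not the repo's code):

const termFrequencies: Record<string, number> = {};
let maxTermFrequency = 0;

for (const token of ["red", "red", "fox"]) {
  termFrequencies[token] = (termFrequencies[token] ?? 0) + 1;
  maxTermFrequency = Math.max(maxTermFrequency, termFrequencies[token]);
}

console.log(termFrequencies, maxTermFrequency); // { red: 2, fox: 1 } 2

The running maxTermFrequency is written back onto the document once after the token loop, so the per-term normalization at query time is a cheap field lookup rather than a scan over termFrequencies.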
@@ -113,10 +143,11 @@ export const buildSearchService = <T>(
         documents.length,
         documentMatches.size
       );
 
       documentMatches.forEach((document) => {
         const currentScore = results.get(document.id) ?? 0;
 
+        const termFrequency = normalizedTermFrequency(token, document);
 
         const scoreForExactMatch = matchesSearchToken
           ? options.scoreForExactMatch
           : 0;
@@ -125,7 +156,7 @@ export const buildSearchService = <T>(
           : 0;
         const score = scoreForExactMatch + scoreForSimilarity;
 
-        const scoreForToken = score * idf;
+        const scoreForToken = score * idf * termFrequency;
 
         results.set(document.id, currentScore + scoreForToken);
       });
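Query-time scoring in the last two hunks now multiplies in the normalized term frequency, so the per-token contribution becomes (exact-match score + similarity score) * idf * tfNorm. A sketch with made-up numbers (the option values and idf inputs are hypothetical, not taken from the repo):

const idf = Math.log10(100 / 4); // term appears in 4 of 100 indexed documents
const termFrequency = 0.7;       // e.g. normalizedTermFrequency(...) for this document
const scoreForExactMatch = 1;    // hypothetical stand-in for options.scoreForExactMatch
const scoreForSimilarity = 0;    // hypothetical: no fuzzy match in this example

const score = scoreForExactMatch + scoreForSimilarity;
const scoreForToken = score * idf * termFrequency; // ≈ 1.398 * 0.7 ≈ 0.98

Before this change the token score ignored how often the term occurred in the matched document; with the max-tf factor, documents where the term is relatively frequent rank higher, while the 0.4 floor keeps rare-but-present terms from being wiped out.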