From c96956c9f1c60e598d5a46ffdc9bbbb2cb256c43 Mon Sep 17 00:00:00 2001 From: Bruce Berrios <58147810+Bruception@users.noreply.github.com> Date: Tue, 7 Jun 2022 17:14:21 -0400 Subject: [PATCH] Use maximum tf norm (#3077) Bruception --- frontend/src/ts/utils/search-service.ts | 35 +++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/frontend/src/ts/utils/search-service.ts b/frontend/src/ts/utils/search-service.ts index 8ab0d9440..1ae5aae07 100644 --- a/frontend/src/ts/utils/search-service.ts +++ b/frontend/src/ts/utils/search-service.ts @@ -13,6 +13,8 @@ interface SearchServiceOptions { interface InternalDocument { id: number; + maxTermFrequency: number; + termFrequencies: Record; } interface ReverseIndex { @@ -47,6 +49,18 @@ function inverseDocumentFrequency( return Math.log10(numberOfDocuments / numberOfDocumentsWithTerm); } +const ALPHA = 0.4; // Smoothing term that dampens the contribution of tf/max tf + +function normalizedTermFrequency( + term: string, + document: InternalDocument +): number { + return ( + ALPHA + + (1 - ALPHA) * (document.termFrequencies[term] / document.maxTermFrequency) + ); +} + function tokenize(text: string): string[] { return text.match(/[^\\\][.,"/#!?$%^&*;:{}=\-_`~()\s]+/g) || []; } @@ -64,8 +78,12 @@ export const buildSearchService = ( const internalDocument: InternalDocument = { id: documentIndex, + termFrequencies: {}, + maxTermFrequency: 0, }; + let maxTermFrequency = 0; + rawTokens.forEach((token) => { const stemmedToken = stemmer(token); @@ -78,7 +96,19 @@ export const buildSearchService = ( reverseIndex[stemmedToken] = new Set(); } reverseIndex[stemmedToken].add(internalDocument); + + if (!(stemmedToken in internalDocument.termFrequencies)) { + internalDocument.termFrequencies[stemmedToken] = 0; + } + + internalDocument.termFrequencies[stemmedToken]++; + maxTermFrequency = Math.max( + maxTermFrequency, + internalDocument.termFrequencies[stemmedToken] + ); }); + + internalDocument.maxTermFrequency = maxTermFrequency; }); const tokenSet = Object.keys(reverseIndex); @@ -113,10 +143,11 @@ export const buildSearchService = ( documents.length, documentMatches.size ); - documentMatches.forEach((document) => { const currentScore = results.get(document.id) ?? 0; + const termFrequency = normalizedTermFrequency(token, document); + const scoreForExactMatch = matchesSearchToken ? options.scoreForExactMatch : 0; @@ -125,7 +156,7 @@ export const buildSearchService = ( : 0; const score = scoreForExactMatch + scoreForSimilarity; - const scoreForToken = score * idf; + const scoreForToken = score * idf * termFrequency; results.set(document.id, currentScore + scoreForToken); });