Use maximum tf norm (#3077) Bruception

This commit is contained in:
Bruce Berrios 2022-06-07 17:14:21 -04:00 committed by GitHub
parent b25ac99527
commit c96956c9f1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -13,6 +13,8 @@ interface SearchServiceOptions {
/**
 * Per-document bookkeeping kept by the search index.
 */
interface InternalDocument {
  /** Position of the document in the indexed collection. */
  id: number;
  /** Occurrence count of each stemmed token within this document. */
  termFrequencies: Record<string, number>;
  /** Largest value in `termFrequencies`; used to normalize term frequency. */
  maxTermFrequency: number;
}
interface ReverseIndex {
@ -47,6 +49,18 @@ function inverseDocumentFrequency(
return Math.log10(numberOfDocuments / numberOfDocumentsWithTerm);
}
const ALPHA = 0.4; // Smoothing term that dampens the contribution of tf/max tf

/**
 * Maximum-tf normalized term frequency:
 * `ALPHA + (1 - ALPHA) * (tf / maxTf)`.
 *
 * Returns the floor value `ALPHA` (the formula's result for tf = 0) instead of
 * `NaN` when the document has no usable count for `term` or when its
 * `maxTermFrequency` is not positive, so one bad lookup cannot poison an
 * accumulated score.
 *
 * @param term - Token whose frequency is looked up in the document.
 * @param document - Indexed document holding per-token counts and their maximum.
 * @returns The smoothed, normalized term frequency.
 */
function normalizedTermFrequency(
  term: string,
  document: InternalDocument
): number {
  // The lookup is typed loosely on purpose: a plain-object map can yield
  // `undefined` (missing key) or an inherited member such as `constructor`.
  const rawFrequency: unknown = document.termFrequencies[term];
  const maxFrequency = document.maxTermFrequency;

  if (
    typeof rawFrequency !== "number" ||
    !Number.isFinite(rawFrequency) ||
    !(maxFrequency > 0)
  ) {
    return ALPHA;
  }

  return ALPHA + (1 - ALPHA) * (rawFrequency / maxFrequency);
}
/**
 * Splits text into tokens: maximal runs of characters that are neither
 * whitespace nor one of the punctuation characters listed in the pattern.
 *
 * @param text - Raw text to split.
 * @returns The tokens in order of appearance, or an empty array if none.
 */
function tokenize(text: string): string[] {
  const found = text.match(/[^\\\][.,"/#!?$%^&*;:{}=\-_`~()\s]+/g);
  if (!found) {
    return [];
  }
  return found;
}
@ -64,8 +78,12 @@ export const buildSearchService = <T>(
const internalDocument: InternalDocument = {
id: documentIndex,
termFrequencies: {},
maxTermFrequency: 0,
};
let maxTermFrequency = 0;
rawTokens.forEach((token) => {
const stemmedToken = stemmer(token);
@ -78,7 +96,19 @@ export const buildSearchService = <T>(
reverseIndex[stemmedToken] = new Set<InternalDocument>();
}
reverseIndex[stemmedToken].add(internalDocument);
if (!(stemmedToken in internalDocument.termFrequencies)) {
internalDocument.termFrequencies[stemmedToken] = 0;
}
internalDocument.termFrequencies[stemmedToken]++;
maxTermFrequency = Math.max(
maxTermFrequency,
internalDocument.termFrequencies[stemmedToken]
);
});
internalDocument.maxTermFrequency = maxTermFrequency;
});
const tokenSet = Object.keys(reverseIndex);
@ -113,10 +143,11 @@ export const buildSearchService = <T>(
documents.length,
documentMatches.size
);
documentMatches.forEach((document) => {
const currentScore = results.get(document.id) ?? 0;
const termFrequency = normalizedTermFrequency(token, document);
const scoreForExactMatch = matchesSearchToken
? options.scoreForExactMatch
: 0;
@ -125,7 +156,7 @@ export const buildSearchService = <T>(
: 0;
const score = scoreForExactMatch + scoreForSimilarity;
const scoreForToken = score * idf;
const scoreForToken = score * idf * termFrequency;
results.set(document.id, currentScore + scoreForToken);
});