Improve quote search experience (#2644) by Bruception

* Initial

* Improve

* Fix comment

* Add stemming and idf

* Remove normalization

* Move idf calc outside
This commit is contained in:
Bruce Berrios 2022-03-07 08:17:15 -05:00 committed by GitHub
parent 1622f37efd
commit 538fb9d385
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 291 additions and 59 deletions

View file

@ -14,12 +14,14 @@
"chartjs-plugin-annotation": "^0.5.7",
"chartjs-plugin-trendline": "^0.2.2",
"crypto-browserify": "^3.12.0",
"damerau-levenshtein": "1.0.8",
"dom-to-image": "^2.6.0",
"firebase": "^8.4.2",
"gulp-replace": "^1.1.3",
"howler": "^2.2.1",
"moment-timezone": "^0.5.33",
"node-object-hash": "2.3.10",
"stemmer": "2.0.0",
"tinycolor2": "^1.4.2"
},
"devDependencies": {
@ -27,6 +29,7 @@
"@babel/plugin-transform-modules-commonjs": "^7.16.8",
"@babel/plugin-transform-runtime": "^7.17.0",
"@babel/preset-env": "^7.16.11",
"@types/damerau-levenshtein": "1.0.0",
"@types/grecaptcha": "^3.0.3",
"@types/howler": "^2.2.5",
"@types/jquery": "^3.5.13",
@ -2236,6 +2239,12 @@
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
"integrity": "sha1-p3c2C1s5oaLlEG+OhY8v0tBgxXA="
},
"node_modules/@types/damerau-levenshtein": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@types/damerau-levenshtein/-/damerau-levenshtein-1.0.0.tgz",
"integrity": "sha512-8XQ1jJHlOl6HjZ3/fU9Yrm/14jxM4gXVezPWiwkyiG0GnYROsI6wdh8DwKccAFGDNiNYBooTZkRXVe4du6plKA==",
"dev": true
},
"node_modules/@types/eslint": {
"version": "8.4.1",
"resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-8.4.1.tgz",
@ -4058,6 +4067,11 @@
"type": "^1.0.1"
}
},
"node_modules/damerau-levenshtein": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/damerau-levenshtein/-/damerau-levenshtein-1.0.8.tgz",
"integrity": "sha512-sdQSFB7+llfUcQHUQO3+B8ERRj0Oa4w9POWMI/puGtuf7gFywGmkaLCElnudfTiKZV+NvHqL0ifzdrI8Ro7ESA=="
},
"node_modules/dart-sass": {
"version": "1.25.0",
"resolved": "https://registry.npmjs.org/dart-sass/-/dart-sass-1.25.0.tgz",
@ -10652,6 +10666,18 @@
"node": ">=0.10.0"
}
},
"node_modules/stemmer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/stemmer/-/stemmer-2.0.0.tgz",
"integrity": "sha512-0YS2oMdTZ/wAWUHMMpf7AAJ8Gm6dHXyHddJ0zCu2DIfOfIbdwqAm1bbk4+Vti6gxNIcOrnm5jAP7vYTzQDvc5A==",
"bin": {
"stemmer": "cli.js"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/wooorm"
}
},
"node_modules/stream-browserify": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-3.0.0.tgz",
@ -13742,6 +13768,12 @@
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
"integrity": "sha1-p3c2C1s5oaLlEG+OhY8v0tBgxXA="
},
"@types/damerau-levenshtein": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@types/damerau-levenshtein/-/damerau-levenshtein-1.0.0.tgz",
"integrity": "sha512-8XQ1jJHlOl6HjZ3/fU9Yrm/14jxM4gXVezPWiwkyiG0GnYROsI6wdh8DwKccAFGDNiNYBooTZkRXVe4du6plKA==",
"dev": true
},
"@types/eslint": {
"version": "8.4.1",
"resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-8.4.1.tgz",
@ -15235,6 +15267,11 @@
"type": "^1.0.1"
}
},
"damerau-levenshtein": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/damerau-levenshtein/-/damerau-levenshtein-1.0.8.tgz",
"integrity": "sha512-sdQSFB7+llfUcQHUQO3+B8ERRj0Oa4w9POWMI/puGtuf7gFywGmkaLCElnudfTiKZV+NvHqL0ifzdrI8Ro7ESA=="
},
"dart-sass": {
"version": "1.25.0",
"resolved": "https://registry.npmjs.org/dart-sass/-/dart-sass-1.25.0.tgz",
@ -20387,6 +20424,11 @@
}
}
},
"stemmer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/stemmer/-/stemmer-2.0.0.tgz",
"integrity": "sha512-0YS2oMdTZ/wAWUHMMpf7AAJ8Gm6dHXyHddJ0zCu2DIfOfIbdwqAm1bbk4+Vti6gxNIcOrnm5jAP7vYTzQDvc5A=="
},
"stream-browserify": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-3.0.0.tgz",

View file

@ -20,6 +20,7 @@
"@babel/plugin-transform-modules-commonjs": "^7.16.8",
"@babel/plugin-transform-runtime": "^7.17.0",
"@babel/preset-env": "^7.16.11",
"@types/damerau-levenshtein": "1.0.0",
"@types/grecaptcha": "^3.0.3",
"@types/howler": "^2.2.5",
"@types/jquery": "^3.5.13",
@ -56,12 +57,14 @@
"chartjs-plugin-annotation": "^0.5.7",
"chartjs-plugin-trendline": "^0.2.2",
"crypto-browserify": "^3.12.0",
"damerau-levenshtein": "1.0.8",
"dom-to-image": "^2.6.0",
"firebase": "^8.4.2",
"gulp-replace": "^1.1.3",
"howler": "^2.2.1",
"moment-timezone": "^0.5.33",
"node-object-hash": "2.3.10",
"stemmer": "2.0.0",
"tinycolor2": "^1.4.2"
}
}

View file

@ -7,6 +7,12 @@ import * as QuoteSubmitPopup from "./quote-submit-popup";
import * as QuoteApprovePopup from "./quote-approve-popup";
import * as QuoteReportPopup from "./quote-report-popup";
import * as Misc from "../misc";
import {
buildSearchService,
SearchService,
TextExtractor,
} from "../utils/search-service";
import { debounce } from "../utils/debounce";
export let selectedId = 1;
@ -14,39 +20,65 @@ export function setSelectedId(val: number): void {
selectedId = val;
}
const searchServiceCache: Record<string, SearchService<any>> = {};
/**
 * Returns the search service for `language`, building it on first request.
 *
 * The service is stored in `searchServiceCache` under the language name, so
 * `data` and `textExtractor` are only used the first time a given language is
 * requested; later calls return the cached instance unchanged.
 *
 * @param language - Cache key identifying the data set.
 * @param data - Documents to index when no cached service exists yet.
 * @param textExtractor - Produces the searchable text for one document.
 * @returns The cached or freshly built search service.
 */
function getSearchService<T>(
  language: string,
  data: T[],
  textExtractor: TextExtractor<T>
): SearchService<T> {
  const isCached = language in searchServiceCache;
  if (!isCached) {
    // Building the index is the expensive part, so do it once per language.
    searchServiceCache[language] = buildSearchService<T>(data, textExtractor);
  }
  return searchServiceCache[language];
}
/**
 * Wraps every word of `text` that starts with one of the terms in
 * `matchedText` in a `<span class="highlight">` element.
 *
 * The text is cut at punctuation and whitespace characters; each delimiter
 * and each run of characters between delimiters is checked on its own, and
 * the pieces are joined back together, so all original characters are kept.
 *
 * Note: the pieces are inserted into the markup as-is (no HTML escaping),
 * which matches how the surrounding template renders quote fields.
 *
 * @param text - The text to decorate.
 * @param matchedText - Terms to highlight (compared case-sensitively as prefixes).
 * @returns `text` with highlight spans added, or `text` itself when there is
 *   nothing to highlight.
 */
function highlightMatches(text: string, matchedText: string[]): string {
  // An empty term is a prefix of every word; drop such terms so they can
  // neither highlight everything nor mask a real term that follows them.
  const terms = matchedText.filter((term) => term.length > 0);
  if (terms.length === 0) {
    return text;
  }

  // Splitting on a capturing group keeps each delimiter as its own piece.
  // This avoids lookbehind assertions, which are a syntax error on engines
  // that do not implement them. Adjacent delimiters yield empty pieces,
  // which never match a non-empty term and vanish again on join.
  const pieces = text.split(/([.,'"/#!$%^&*;:{}=\-_`~()\s])/);

  return pieces
    .map((piece) =>
      terms.some((term) => piece.startsWith(term))
        ? `<span class="highlight">${piece}</span>`
        : piece
    )
    .join("");
}
async function updateResults(searchText: string): Promise<void> {
const quotes = await Misc.getQuotes(Config.language);
const reg = new RegExp(searchText, "i");
const found: MonkeyTypes.Quote[] = [];
quotes.quotes.forEach((quote) => {
const quoteText = quote["text"].replace(/[.,'"/#!$%^&*;:{}=\-_`~()]/g, "");
const test1 = reg.test(quoteText);
if (test1) {
found.push(quote);
const { quotes } = await Misc.getQuotes(Config.language);
const quoteSearchService = getSearchService<MonkeyTypes.Quote>(
Config.language,
quotes,
(quote: MonkeyTypes.Quote) => {
return `${quote.text} ${quote.id} ${quote.source}`;
}
});
quotes.quotes.forEach((quote) => {
const quoteSource = quote["source"].replace(
/[.,'"/#!$%^&*;:{}=\-_`~()]/g,
""
);
const quoteId = quote["id"];
const test2 = reg.test(quoteSource);
const test3 = reg.test(quoteId.toString());
if ((test2 || test3) && found.filter((q) => q.id == quote.id).length == 0) {
found.push(quote);
}
});
);
const { results: matches, matchedQueryTerms } =
quoteSearchService.query(searchText);
$("#quoteSearchResults").remove();
$("#quoteSearchPopup").append(
'<div class="quoteSearchResults" id="quoteSearchResults"></div>'
);
const resultsList = $("#quoteSearchResults");
let resultListLength = 0;
const resultsList = $("#quoteSearchResults");
const isNotAuthed = !firebase.auth().currentUser;
found.forEach(async (quote) => {
const quotesToShow = searchText === "" ? quotes : matches;
quotesToShow.slice(0, 100).forEach((quote) => {
let lengthDesc;
if (quote.length < 101) {
lengthDesc = "short";
@ -57,15 +89,21 @@ async function updateResults(searchText: string): Promise<void> {
} else {
lengthDesc = "thicc";
}
if (resultListLength++ < 100) {
resultsList.append(`
resultsList.append(`
<div class="searchResult" id="${quote.id}">
<div class="text">${quote.text}</div>
<div class="id"><div class="sub">id</div><span class="quote-id">${
quote.id
}</span></div>
<div class="text">${highlightMatches(
quote.text,
matchedQueryTerms
)}</div>
<div class="id"><div class="sub">id</div><span class="quote-id">${highlightMatches(
quote.id.toString(),
matchedQueryTerms
)}</span></div>
<div class="length"><div class="sub">length</div>${lengthDesc}</div>
<div class="source"><div class="sub">source</div>${quote.source}</div>
<div class="source"><div class="sub">source</div>${highlightMatches(
quote.source,
matchedQueryTerms
)}</div>
<div class="icon-button report ${
isNotAuthed && "hidden"
}" aria-label="Report quote" data-balloon-pos="left">
@ -73,15 +111,14 @@ async function updateResults(searchText: string): Promise<void> {
</div>
</div>
`);
}
});
if (found.length > 100) {
if (quotesToShow.length > 100) {
$("#extraResults").html(
found.length +
quotesToShow.length +
" results <span style='opacity: 0.5'>(only showing 100)</span>"
);
} else {
$("#extraResults").html(found.length + " results");
$("#extraResults").html(quotesToShow.length + " results");
}
}
@ -158,17 +195,14 @@ export function apply(val: number): boolean {
return ret;
}
$("#quoteSearchPopup .searchBox").keydown((e) => {
if (e.code == "Escape") return;
setTimeout(() => {
let searchText = (<HTMLInputElement>document.getElementById("searchBox"))
.value;
searchText = searchText
.replace(/[.,'"/#!$%^&*;:{}=\-_`~()]/g, "")
.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
const debouncedSearch = debounce(updateResults);
updateResults(searchText);
}, 0.1); //arbitrarily v. small time as it's only to allow text to input before searching
$("#quoteSearchPopup .searchBox").on("keyup", (e) => {
if (e.code === "Escape") return;
const searchText = (<HTMLInputElement>document.getElementById("searchBox"))
.value;
debouncedSearch(searchText);
});
$("#quoteSearchPopupWrapper").click((e) => {
@ -218,17 +252,3 @@ $(document).keydown((event) => {
event.preventDefault();
}
});
// $("#quoteSearchPopup input").keypress((e) => {
// if (e.keyCode == 13) {
// if (!isNaN(document.getElementById("searchBox").value)) {
// apply();
// } else {
// let results = document.getElementsByClassName("searchResult");
// if (results.length > 0) {
// selectedId = parseInt(results[0].getAttribute("id"));
// apply(selectedId);
// }
// }
// }
// });

View file

@ -0,0 +1,8 @@
/**
 * Creates a debounced wrapper around `fn`.
 *
 * Each call to the wrapper cancels the previously scheduled invocation and
 * schedules a new one `ms` milliseconds later, so `fn` only runs once calls
 * have stopped for `ms` milliseconds. `fn` receives the arguments and the
 * `this` value of the most recent call. The wrapper itself returns nothing.
 *
 * @param fn - Function to invoke after the quiet period.
 * @param ms - Quiet period in milliseconds (default 250).
 * @returns The debounced wrapper function.
 */
export function debounce(fn: any, ms = 250): any {
  let pendingTimer: ReturnType<typeof setTimeout>;

  return function (this: any, ...callArgs: any[]) {
    // Arrow function so `this` still refers to the wrapper's caller context.
    const invoke = (): void => {
      fn.apply(this, callArgs);
    };

    clearTimeout(pendingTimer);
    pendingTimer = setTimeout(invoke, ms);
  };
}

View file

@ -0,0 +1,155 @@
import { stemmer } from "stemmer";
import levenshtein from "damerau-levenshtein";
/** Public handle returned by `buildSearchService`. */
export interface SearchService<T> {
// Runs a free-text query and returns the matching documents plus matched terms.
query: (query: string) => SearchResult<T>;
}
/** Tuning knobs for matching and ranking; see `DEFAULT_OPTIONS` for the defaults. */
interface SearchServiceOptions {
// An indexed token counts as similar when its similarity to a query token is >= 1 - this value.
fuzzyMatchSensitivity: number;
// Score added for a token that is similar to a query token.
scoreForSimilarMatch: number;
// Score added for a token that is identical to a query token.
scoreForExactMatch: number;
}
/** Index-side stand-in for a document. */
interface InternalDocument {
// Position of the document in the array passed to `buildSearchService`.
id: number;
}
/** Maps a stemmed token to the documents that contain it. */
interface ReverseIndex {
[key: string]: Set<InternalDocument>;
}
/** Maps a stemmed token to the original (unstemmed) tokens it was derived from. */
interface TokenMap {
[key: string]: Set<string>;
}
/** Outcome of a single query. */
interface SearchResult<T> {
// Matching documents, ordered from highest to lowest score.
results: T[];
// Original document tokens whose stem matched a query token.
matchedQueryTerms: string[];
}
/** Produces the text that should be indexed for a document. */
export type TextExtractor<T> = (document: T) => string;
/**
 * Options used by `buildSearchService` when the caller does not pass any.
 * When a token is both identical and similar to a query token, both scores
 * are added together.
 */
const DEFAULT_OPTIONS: SearchServiceOptions = {
fuzzyMatchSensitivity: 0.2, // Value between 0-1. Higher = more tolerant to spelling mistakes, too high and you get nonsense.
scoreForSimilarMatch: 0.5, // When ranking results, the score a match gets for having a token that is similar to a search token.
scoreForExactMatch: 1, // When ranking results, the score a match gets for having an exact match with a token in the search query.
};
/**
 * Computes the inverse document frequency of a term:
 * `log10(numberOfDocuments / numberOfDocumentsWithTerm)`.
 *
 * Rare terms get a high weight; a term present in every document gets 0.
 *
 * @param numberOfDocuments - Total number of indexed documents.
 * @param numberOfDocumentsWithTerm - Number of documents containing the term.
 * @returns The IDF weight, or 0 when no document contains the term (this
 *   avoids dividing by zero).
 */
function inverseDocumentFrequency(
  numberOfDocuments: number,
  numberOfDocumentsWithTerm: number
): number {
  const termOccurs = numberOfDocumentsWithTerm !== 0;
  return termOccurs
    ? Math.log10(numberOfDocuments / numberOfDocumentsWithTerm)
    : 0;
}
/**
 * Splits `text` into word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks and digits;
 * everything else (whitespace, punctuation, symbols) acts as a separator.
 * Using Unicode categories instead of `[a-zA-Z0-9]` keeps accented words in
 * one piece and lets text in non-Latin scripts produce tokens at all; plain
 * ASCII text tokenizes exactly as before.
 *
 * @param text - The text to tokenize.
 * @returns The tokens in order of appearance, or an empty array if none.
 */
function tokenize(text: string): string[] {
  return text.match(/[\p{L}\p{M}\p{N}]+/gu) ?? [];
}
/**
 * Builds an in-memory search service over `documents`.
 *
 * Indexing: the text returned by `getSearchableText` for each document is
 * tokenized and every token is stemmed. Two lookup tables are filled from
 * that: stem -> documents containing it, and stem -> original spellings seen.
 *
 * Querying: the query is tokenized and stemmed the same way. Every indexed
 * stem that equals a query stem, or whose similarity to it (as reported by
 * `damerau-levenshtein`) is at least `1 - options.fuzzyMatchSensitivity`,
 * adds `(exact score + similar score) * idf` to each document containing it.
 * Documents are returned ordered by descending total score.
 *
 * @param documents - Items to index; results are elements of this array.
 * @param getSearchableText - Produces the text to index for one document.
 * @param options - Matching and ranking settings; defaults to `DEFAULT_OPTIONS`.
 * @returns A service whose `query` yields the ranked documents and the
 *   original (unstemmed) document tokens that matched. A query without any
 *   tokens yields empty results.
 */
export const buildSearchService = <T>(
  documents: T[],
  getSearchableText: TextExtractor<T>,
  options: SearchServiceOptions = DEFAULT_OPTIONS
): SearchService<T> => {
  // Prototype-less dictionaries: with a plain `{}` the `in` checks below would
  // also find inherited members, so a stem such as "constructor" would resolve
  // to Object.prototype.constructor and the following `.add` call would throw.
  const reverseIndex: ReverseIndex = Object.create(null);
  const normalizedTokenToOriginal: TokenMap = Object.create(null);

  documents.forEach((document, documentIndex) => {
    const rawTokens = tokenize(getSearchableText(document));
    // One object per document, shared by all its tokens, so that the Set in
    // the reverse index stores each document at most once per stem.
    const internalDocument: InternalDocument = {
      id: documentIndex,
    };

    rawTokens.forEach((token) => {
      const stemmedToken = stemmer(token);

      if (!(stemmedToken in normalizedTokenToOriginal)) {
        normalizedTokenToOriginal[stemmedToken] = new Set<string>();
      }
      normalizedTokenToOriginal[stemmedToken].add(token);

      if (!(stemmedToken in reverseIndex)) {
        reverseIndex[stemmedToken] = new Set<InternalDocument>();
      }
      reverseIndex[stemmedToken].add(internalDocument);
    });
  });

  // Snapshot of every indexed stem; scanned in full for each query token.
  const tokenSet = Object.keys(reverseIndex);

  const query = (searchQuery: string): SearchResult<T> => {
    const searchResult: SearchResult<T> = {
      results: [],
      matchedQueryTerms: [],
    };

    // A Set so that a stem repeated in the query is only scored once.
    const normalizedSearchQuery = new Set<string>(
      tokenize(searchQuery).map((token) => stemmer(token))
    );
    if (normalizedSearchQuery.size === 0) {
      return searchResult;
    }

    // Document index -> accumulated score.
    const results = new Map<number, number>();
    const matchedTokens = new Set<string>();

    normalizedSearchQuery.forEach((searchToken) => {
      tokenSet.forEach((token) => {
        const { similarity } = levenshtein(searchToken, token);
        const matchesSearchToken = token === searchToken;
        const isSimilar = similarity >= 1 - options.fuzzyMatchSensitivity;
        if (!matchesSearchToken && !isSimilar) {
          return;
        }

        const documentMatches = reverseIndex[token];
        const idf = inverseDocumentFrequency(
          documents.length,
          documentMatches.size
        );

        // The score depends only on the token pair, not on the document, so
        // it is computed once here rather than once per matching document.
        const scoreForExactMatch = matchesSearchToken
          ? options.scoreForExactMatch
          : 0;
        const scoreForSimilarity = isSimilar
          ? options.scoreForSimilarMatch
          : 0;
        const scoreForToken = (scoreForExactMatch + scoreForSimilarity) * idf;

        documentMatches.forEach((document) => {
          const currentScore = results.get(document.id) ?? 0;
          results.set(document.id, currentScore + scoreForToken);
        });

        // Report the spellings as they appear in the documents so callers
        // can locate them in the original text.
        normalizedTokenToOriginal[token].forEach((originalToken) => {
          matchedTokens.add(originalToken);
        });
      });
    });

    // Highest score first; map the stored indices back to the documents.
    const orderedResults = [...results]
      .sort((match1, match2) => {
        return match2[1] - match1[1];
      })
      .map((match) => documents[match[0]]);

    searchResult.results = orderedResults;
    searchResult.matchedQueryTerms = [...matchedTokens];
    return searchResult;
  };

  return {
    query,
  };
};

View file

@ -412,6 +412,10 @@
}
#quoteSearchPopupWrapper {
.highlight {
color: var(--main-color);
}
#quoteSearchPopup {
background: var(--bg-color);
border-radius: var(--roundness);