fix(server): add stemmer filter (#12358)

CLOUD-214

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Improved search functionality to support stemming, so that a search term also matches other forms of the same word (e.g., searching "window" or "windows" matches "Windows", and searching "design" matches "designer").
- **Tests**
  - Added new tests to verify that search results correctly highlight and match stemmed word variations in document titles.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
fengmk2
2025-05-20 03:09:22 +00:00
parent 42d527251a
commit cce66f6107
5 changed files with 180 additions and 2 deletions

View File

@@ -423,6 +423,77 @@ Generated by [AVA](https://avajs.dev).
},
]
## should search doc title support stemmer filter
> Snapshot 1
{
_id: 'workspace-test-doc-title-stemmer-filter/doc-0',
_source: {
doc_id: 'doc-0',
workspace_id: 'workspace-test-doc-title-stemmer-filter',
},
fields: {
doc_id: [
'doc-0',
],
title: [
'Deploy on Windows by a designer',
],
},
highlights: {
title: [
'Deploy on <b>Windows</b> by a designer',
],
},
}
> Snapshot 2
{
_id: 'workspace-test-doc-title-stemmer-filter/doc-0',
_source: {
doc_id: 'doc-0',
workspace_id: 'workspace-test-doc-title-stemmer-filter',
},
fields: {
doc_id: [
'doc-0',
],
title: [
'Deploy on Windows by a designer',
],
},
highlights: {
title: [
'Deploy on <b>Windows</b> by a designer',
],
},
}
> Snapshot 3
{
_id: 'workspace-test-doc-title-stemmer-filter/doc-0',
_source: {
doc_id: 'doc-0',
workspace_id: 'workspace-test-doc-title-stemmer-filter',
},
fields: {
doc_id: [
'doc-0',
],
title: [
'Deploy on Windows by a designer',
],
},
highlights: {
title: [
'Deploy on Windows by a <b>designer</b>',
],
},
}
## should aggregate query work
> Snapshot 1

View File

@@ -1210,6 +1210,99 @@ test('should search query match ref_doc_id work', async t => {
t.snapshot(result.nodes.map(node => pick(node, ['fields'])));
});
test('should search doc title support stemmer filter', async t => {
  const workspaceId = 'workspace-test-doc-title-stemmer-filter';
  const docId = 'doc-0';

  // Index one doc whose title holds a plural ("Windows") and a derived
  // form ("designer") so the stemmed variants below have something to hit.
  await searchProvider.write(
    SearchTable.doc,
    [
      {
        workspace_id: workspaceId,
        doc_id: docId,
        title: 'Deploy on Windows by a designer',
      },
    ],
    { refresh: true }
  );

  // Issues the title query for a single keyword, scoped to this test's
  // workspace, with matched terms wrapped in <b>…</b> in the highlights.
  const searchTitle = keyword =>
    searchProvider.search(SearchTable.doc, {
      _source: ['workspace_id', 'doc_id'],
      query: {
        bool: {
          must: [
            { match: { workspace_id: workspaceId } },
            { match: { title: keyword } },
          ],
        },
      },
      fields: ['doc_id', 'title'],
      highlight: {
        fields: {
          title: {
            pre_tags: ['<b>'],
            post_tags: ['</b>'],
          },
        },
      },
      sort: ['_score'],
    });

  // Keep this order and run sequentially: each iteration records the next
  // numbered snapshot (window -> 1, windows -> 2, design -> 3).
  for (const keyword of ['window', 'windows', 'design']) {
    const result = await searchTitle(keyword);
    t.is(result.total, 1);
    // _score is dropped before snapshotting; only ids, fields and highlights are compared.
    t.snapshot(omit(result.nodes[0], ['_score']));
  }
});
// #endregion
// #region aggregate

View File

@@ -31,7 +31,15 @@ export const blockMapping = {
analyzer: {
standard_with_cjk: {
tokenizer: 'standard',
filter: ['lowercase', 'cjk_bigram_and_unigrams'],
filter: [
'lowercase',
'cjk_bigram_and_unigrams',
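// Index both the original and the stemmed form of each token, e.g.
// `windows designer` => `windows`, `window`, `designer`, `design`.
// `keyword_repeat` emits every token twice (one copy protected from stemming),
// `stemmer` stems the unprotected copy, and `remove_duplicates` drops the
// copy when stemming left the token unchanged.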
// @see https://www.elastic.co/docs/reference/text-analysis/analysis-remove-duplicates-tokenfilter
'keyword_repeat',
'stemmer',
'remove_duplicates',
],
},
autocomplete: {
tokenizer: 'autocomplete_tokenizer',

View File

@@ -24,7 +24,13 @@ export const docMapping = {
analyzer: {
standard_with_cjk: {
tokenizer: 'standard',
filter: ['lowercase', 'cjk_bigram_and_unigrams'],
filter: [
'lowercase',
'cjk_bigram_and_unigrams',
'keyword_repeat',
'stemmer',
'remove_duplicates',
],
},
autocomplete: {
tokenizer: 'autocomplete_tokenizer',