fix(server): add stemmer filter (#12358)

CLOUD-214

## Summary by CodeRabbit

- **New Features**
  - Improved search functionality to support stemming, allowing searches for variations of words (e.g., "window", "windows", "design") to return relevant results.
- **Tests**
  - Added new tests to verify that search results correctly highlight and match stemmed word variations in document titles.
2026-02-14 13:25:12 +00:00 · 2025-05-20 03:09:22 +00:00
parent 42d527251a
commit cce66f6107
5 changed files with 180 additions and 2 deletions
--- a/packages/backend/server/src/plugins/indexer/tests/providers/snapshots/elasticsearch.spec.ts.md
+++ b/packages/backend/server/src/plugins/indexer/tests/providers/snapshots/elasticsearch.spec.ts.md
@@ -423,6 +423,77 @@ Generated by [AVA](https://avajs.dev).
      },
    ]

+## should search doc title support stemmer filter
+
+> Snapshot 1
+
+    {
+      _id: 'workspace-test-doc-title-stemmer-filter/doc-0',
+      _source: {
+        doc_id: 'doc-0',
+        workspace_id: 'workspace-test-doc-title-stemmer-filter',
+      },
+      fields: {
+        doc_id: [
+          'doc-0',
+        ],
+        title: [
+          'Deploy on Windows by a designer',
+        ],
+      },
+      highlights: {
+        title: [
+          'Deploy on <b>Windows</b> by a designer',
+        ],
+      },
+    }
+
+> Snapshot 2
+
+    {
+      _id: 'workspace-test-doc-title-stemmer-filter/doc-0',
+      _source: {
+        doc_id: 'doc-0',
+        workspace_id: 'workspace-test-doc-title-stemmer-filter',
+      },
+      fields: {
+        doc_id: [
+          'doc-0',
+        ],
+        title: [
+          'Deploy on Windows by a designer',
+        ],
+      },
+      highlights: {
+        title: [
+          'Deploy on <b>Windows</b> by a designer',
+        ],
+      },
+    }
+
+> Snapshot 3
+
+    {
+      _id: 'workspace-test-doc-title-stemmer-filter/doc-0',
+      _source: {
+        doc_id: 'doc-0',
+        workspace_id: 'workspace-test-doc-title-stemmer-filter',
+      },
+      fields: {
+        doc_id: [
+          'doc-0',
+        ],
+        title: [
+          'Deploy on Windows by a designer',
+        ],
+      },
+      highlights: {
+        title: [
+          'Deploy on Windows by a <b>designer</b>',
+        ],
+      },
+    }
+
 ## should aggregate query work

 > Snapshot 1
--- a/packages/backend/server/src/plugins/indexer/tests/providers/snapshots/elasticsearch.spec.ts.snap
+++ b/packages/backend/server/src/plugins/indexer/tests/providers/snapshots/elasticsearch.spec.ts.snap
--- a/packages/backend/server/src/plugins/indexer/tests/providers/elasticsearch.spec.ts
+++ b/packages/backend/server/src/plugins/indexer/tests/providers/elasticsearch.spec.ts
@@ -1210,6 +1210,99 @@ test('should search query match ref_doc_id work', async t => {
  t.snapshot(result.nodes.map(node => pick(node, ['fields'])));
 });

+test('should search doc title support stemmer filter', async t => { // Verifies that title search matches word variants ("window", "windows", "design") of the indexed title.
+  const docId = 'doc-0';
+  const workspaceId = 'workspace-test-doc-title-stemmer-filter'; // workspace id used only by this test, so the queries below see just this doc
+  await searchProvider.write(
+    SearchTable.doc,
+    [
+      {
+        workspace_id: workspaceId,
+        doc_id: docId,
+        title: 'Deploy on Windows by a designer', // contains "Windows" and "designer", the two words the queries target
+      },
+    ],
+    {
+      refresh: true, // presumably makes the write searchable right away — confirm against the provider's write options
+    }
+  );
+
+  let result = await searchProvider.search(SearchTable.doc, { // Query 1: singular "window"
+    _source: ['workspace_id', 'doc_id'],
+    query: {
+      bool: {
+        must: [
+          { match: { workspace_id: workspaceId } }, // restrict the search to this test's workspace
+          { match: { title: 'window' } }, // singular form; the stored title has "Windows"
+        ],
+      },
+    },
+    fields: ['doc_id', 'title'],
+    highlight: {
+      fields: {
+        title: {
+          pre_tags: ['<b>'], // matched title terms are wrapped in <b>…</b>
+          post_tags: ['</b>'],
+        },
+      },
+    },
+    sort: ['_score'],
+  });
+
+  t.is(result.total, 1); // exactly one hit: the doc written above
+  t.snapshot(omit(result.nodes[0], ['_score'])); // _score is dropped before snapshotting (presumably not stable across runs); recorded highlight is "<b>Windows</b>"
+
+  result = await searchProvider.search(SearchTable.doc, { // Query 2: plural "windows"
+    _source: ['workspace_id', 'doc_id'],
+    query: {
+      bool: {
+        must: [
+          { match: { workspace_id: workspaceId } },
+          { match: { title: 'windows' } }, // same word as in the stored title, lower-cased
+        ],
+      },
+    },
+    fields: ['doc_id', 'title'],
+    highlight: {
+      fields: {
+        title: {
+          pre_tags: ['<b>'],
+          post_tags: ['</b>'],
+        },
+      },
+    },
+    sort: ['_score'],
+  });
+
+  t.is(result.total, 1);
+  t.snapshot(omit(result.nodes[0], ['_score'])); // recorded snapshot is identical to the one for "window"
+
+  result = await searchProvider.search(SearchTable.doc, { // Query 3: "design", which should reach "designer"
+    _source: ['workspace_id', 'doc_id'],
+    query: {
+      bool: {
+        must: [
+          { match: { workspace_id: workspaceId } },
+          { match: { title: 'design' } }, // shorter form of "designer" from the stored title
+        ],
+      },
+    },
+    fields: ['doc_id', 'title'],
+    highlight: {
+      fields: {
+        title: {
+          pre_tags: ['<b>'],
+          post_tags: ['</b>'],
+        },
+      },
+    },
+    sort: ['_score'],
+  });
+
+  t.is(result.total, 1);
+  t.snapshot(omit(result.nodes[0], ['_score'])); // recorded highlight is "<b>designer</b>"
+});
+
 // #endregion

 // #region aggregate
--- a/packages/backend/server/src/plugins/indexer/tables/block.ts
+++ b/packages/backend/server/src/plugins/indexer/tables/block.ts
@@ -31,7 +31,15 @@ export const blockMapping = {
      analyzer: {
        standard_with_cjk: {
          tokenizer: 'standard',
-          filter: ['lowercase', 'cjk_bigram_and_unigrams'],
+          filter: [
+            'lowercase',
+            'cjk_bigram_and_unigrams',
+            // support `windows designer` => `windows`, `window`, `designer`, `design`
+            // @see https://www.elastic.co/docs/reference/text-analysis/analysis-remove-duplicates-tokenfilter
+            'keyword_repeat',
+            'stemmer',
+            'remove_duplicates',
+          ],
        },
        autocomplete: {
          tokenizer: 'autocomplete_tokenizer',
--- a/packages/backend/server/src/plugins/indexer/tables/doc.ts
+++ b/packages/backend/server/src/plugins/indexer/tables/doc.ts
@@ -24,7 +24,13 @@ export const docMapping = {
      analyzer: {
        standard_with_cjk: {
          tokenizer: 'standard',
-          filter: ['lowercase', 'cjk_bigram_and_unigrams'],
+          filter: [
+            'lowercase',
+            'cjk_bigram_and_unigrams',
+            'keyword_repeat',
+            'stemmer',
+            'remove_duplicates',
+          ],
        },
        autocomplete: {
          tokenizer: 'autocomplete_tokenizer',