feat(server): improve indexer (#14698)

fix #13862 


#### PR Dependency Tree


* **PR #14698** 👈

This tree was auto-generated by
[Charcoal](https://github.com/danerwilliams/charcoal)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **New Features**
  * Enhanced search support for Chinese, Japanese, and Korean languages
with improved text segmentation and character matching.
  * Added index management capabilities with table recreation
functionality.

* **Bug Fixes**
  * Improved search accuracy for non-Latin scripts through updated
morphology and n-gram configuration.

* **Chores**
  * Added database migration for search index optimization.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2026-03-22 02:50:59 +08:00
committed by GitHub
parent bcf2a51d41
commit f47ee2bc8a
10 changed files with 402 additions and 50 deletions

View File

@@ -0,0 +1,12 @@
import { ModuleRef } from '@nestjs/core';
import { PrismaClient } from '@prisma/client';
import { IndexerService } from '../../plugins/indexer';
/**
 * Data migration: drops and recreates the Manticore search tables with the
 * updated mixed-script (CJK) configuration, then requeues workspaces for
 * reindexing via IndexerService.rebuildManticoreIndexes().
 */
export class RebuildManticoreMixedScriptIndexes1763800000000 {
  static async up(_db: PrismaClient, ref: ModuleRef) {
    // Resolve the indexer across module boundaries — the migration runner
    // only hands us the root ModuleRef, hence `strict: false`.
    const indexer = ref.get(IndexerService, { strict: false });
    await indexer.rebuildManticoreIndexes();
  }

  // Irreversible: the previous index schema is not restored on rollback.
  static async down(_db: PrismaClient) {}
}

View File

@@ -3,3 +3,4 @@ export * from './1703756315970-unamed-account';
export * from './1721299086340-refresh-unnamed-user';
export * from './1745211351719-create-indexer-tables';
export * from './1751966744168-correct-session-update-time';
export * from './1763800000000-rebuild-manticore-mixed-script-indexes';

View File

@@ -4,6 +4,75 @@ The actual snapshot is saved in `manticoresearch.spec.ts.snap`.
Generated by [AVA](https://avajs.dev).
## should search doc title match chinese word segmentation
> Snapshot 1
[
{
_id: '5373363211628325828',
_source: {
doc_id: 'doc-chinese',
workspace_id: 'workspace-test-doc-title-chinese',
},
fields: {
doc_id: [
'doc-chinese',
],
title: [
'AFFiNE 是一个基于云端的笔记应用',
],
},
highlights: undefined,
},
]
## should search block content match korean ngram
> Snapshot 1
[
{
_id: '1227635764506850985',
_source: {
doc_id: 'doc-korean',
workspace_id: 'workspace-test-block-content-korean',
},
fields: {
block_id: [
'block-korean',
],
content: [
'다람쥐 헌 쳇바퀴에 타고파',
],
},
highlights: undefined,
},
]
## should search block content match japanese kana ngram
> Snapshot 1
[
{
_id: '381498385699454292',
_source: {
doc_id: 'doc-japanese',
workspace_id: 'workspace-test-block-content-japanese',
},
fields: {
block_id: [
'block-japanese',
],
content: [
'いろはにほへと ちりぬるを',
],
},
highlights: undefined,
},
]
## should write document work
> Snapshot 1
@@ -889,7 +958,7 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
{
term: {
equals: {
workspace_id: 'workspaceId1',
},
}
@@ -897,7 +966,7 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 2
{
term: {
equals: {
workspace_id: 'workspaceId1',
},
}

View File

@@ -33,8 +33,8 @@ const user = await module.create(Mockers.User);
const workspace = await module.create(Mockers.Workspace);
test.before(async () => {
await searchProvider.createTable(SearchTable.block, blockSQL);
await searchProvider.createTable(SearchTable.doc, docSQL);
await searchProvider.recreateTable(SearchTable.block, blockSQL);
await searchProvider.recreateTable(SearchTable.doc, docSQL);
await searchProvider.write(
SearchTable.block,
@@ -163,6 +163,135 @@ test('should provider is manticoresearch', t => {
t.is(searchProvider.type, SearchProviderType.Manticoresearch);
});
test('should search doc title match chinese word segmentation', async t => {
  const workspaceId = 'workspace-test-doc-title-chinese';
  const docId = 'doc-chinese';
  const title = 'AFFiNE 是一个基于云端的笔记应用';

  // Index a single doc whose title mixes latin text and Chinese prose.
  await searchProvider.write(
    SearchTable.doc,
    [{ workspace_id: workspaceId, doc_id: docId, title }],
    { refresh: true }
  );

  // '笔记' must be found by word segmentation, not whole-field matching.
  const result = await searchProvider.search(SearchTable.doc, {
    _source: ['workspace_id', 'doc_id'],
    query: {
      bool: {
        must: [
          { term: { workspace_id: { value: workspaceId } } },
          { match: { title: '笔记' } },
        ],
      },
    },
    fields: ['doc_id', 'title'],
    sort: ['_score'],
  });

  t.true(result.total >= 1);
  const matched = result.nodes.filter(node => node._source.doc_id === docId);
  t.snapshot(matched.map(node => omit(node, ['_score'])));
});
test('should search block content match korean ngram', async t => {
  const workspaceId = 'workspace-test-block-content-korean';
  const docId = 'doc-korean';
  const blockId = 'block-korean';
  const content = '다람쥐 헌 쳇바퀴에 타고파';

  // Index one paragraph block containing Korean text.
  await searchProvider.write(
    SearchTable.block,
    [
      {
        workspace_id: workspaceId,
        doc_id: docId,
        block_id: blockId,
        content,
        flavour: 'affine:paragraph',
      },
    ],
    { refresh: true }
  );

  // A single Hangul syllable ('쥐') should match via the n-gram config.
  const result = await searchProvider.search(SearchTable.block, {
    _source: ['workspace_id', 'doc_id'],
    query: {
      bool: {
        must: [
          { term: { workspace_id: { value: workspaceId } } },
          { match: { content: '쥐' } },
        ],
      },
    },
    fields: ['block_id', 'content'],
    sort: ['_score'],
  });

  t.true(result.total >= 1);
  const matched = result.nodes.filter(
    node => node.fields.block_id?.[0] === blockId
  );
  t.snapshot(matched.map(node => omit(node, ['_score'])));
});
test('should search block content match japanese kana ngram', async t => {
  const workspaceId = 'workspace-test-block-content-japanese';
  const docId = 'doc-japanese';
  const blockId = 'block-japanese';
  const content = 'いろはにほへと ちりぬるを';

  // Index one paragraph block containing Japanese kana.
  await searchProvider.write(
    SearchTable.block,
    [
      {
        workspace_id: workspaceId,
        doc_id: docId,
        block_id: blockId,
        content,
        flavour: 'affine:paragraph',
      },
    ],
    { refresh: true }
  );

  // A single kana character ('へ') should match via the n-gram config.
  const result = await searchProvider.search(SearchTable.block, {
    _source: ['workspace_id', 'doc_id'],
    query: {
      bool: {
        must: [
          { term: { workspace_id: { value: workspaceId } } },
          { match: { content: 'へ' } },
        ],
      },
    },
    fields: ['block_id', 'content'],
    sort: ['_score'],
  });

  t.true(result.total >= 1);
  const matched = result.nodes.filter(
    node => node.fields.block_id?.[0] === blockId
  );
  t.snapshot(matched.map(node => omit(node, ['_score'])));
});
// #region write
test('should write document work', async t => {
@@ -189,7 +318,7 @@ test('should write document work', async t => {
let result = await searchProvider.search(SearchTable.block, {
_source: ['workspace_id', 'doc_id'],
query: { match: { doc_id: docId } },
query: { term: { doc_id: { value: docId } } },
fields: [
'flavour',
'flavour_indexed',
@@ -232,7 +361,7 @@ test('should write document work', async t => {
result = await searchProvider.search(SearchTable.block, {
_source: ['workspace_id', 'doc_id'],
query: { match: { doc_id: docId } },
query: { term: { doc_id: { value: docId } } },
fields: ['flavour', 'block_id', 'content', 'ref_doc_id'],
sort: ['_score'],
});
@@ -263,7 +392,7 @@ test('should write document work', async t => {
result = await searchProvider.search(SearchTable.block, {
_source: ['workspace_id', 'doc_id'],
query: { match: { doc_id: docId } },
query: { term: { doc_id: { value: docId } } },
fields: ['flavour', 'block_id', 'content', 'ref_doc_id'],
sort: ['_score'],
});
@@ -319,8 +448,8 @@ test('should handle ref_doc_id as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -371,8 +500,8 @@ test('should handle ref_doc_id as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -416,8 +545,8 @@ test('should handle content as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -455,8 +584,8 @@ test('should handle content as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -497,8 +626,8 @@ test('should handle blob as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -534,8 +663,8 @@ test('should handle blob as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -571,8 +700,8 @@ test('should handle blob as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -682,8 +811,10 @@ test('should search query all and get next cursor work', async t => {
'id',
],
query: {
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'],
@@ -708,8 +839,10 @@ test('should search query all and get next cursor work', async t => {
'id',
],
query: {
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'],
@@ -734,8 +867,10 @@ test('should search query all and get next cursor work', async t => {
'id',
],
query: {
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'],
@@ -780,16 +915,20 @@ test('should filter by workspace_id work', async t => {
bool: {
must: [
{
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
{
bool: {
must: [
{
match: {
doc_id: docId,
term: {
doc_id: {
value: docId,
},
},
},
],

View File

@@ -8,11 +8,12 @@ import { createModule } from '../../../__tests__/create-module';
import { Mockers } from '../../../__tests__/mocks';
import { ConfigModule } from '../../../base/config';
import { ServerConfigModule } from '../../../core/config';
import { Models } from '../../../models';
import { SearchProviderFactory } from '../factory';
import { IndexerModule, IndexerService } from '../index';
import { ManticoresearchProvider } from '../providers';
import { UpsertDoc } from '../service';
import { SearchTable } from '../tables';
import { blockSQL, docSQL, SearchTable } from '../tables';
import {
AggregateInput,
SearchInput,
@@ -35,6 +36,7 @@ const module = await createModule({
const indexerService = module.get(IndexerService);
const searchProviderFactory = module.get(SearchProviderFactory);
const manticoresearch = module.get(ManticoresearchProvider);
const models = module.get(Models);
const user = await module.create(Mockers.User);
const workspace = await module.create(Mockers.Workspace, {
snapshot: true,
@@ -50,7 +52,8 @@ test.after.always(async () => {
});
test.before(async () => {
await indexerService.createTables();
await manticoresearch.recreateTable(SearchTable.block, blockSQL);
await manticoresearch.recreateTable(SearchTable.doc, docSQL);
});
test.afterEach.always(async () => {
@@ -2311,3 +2314,29 @@ test('should search docs by keyword work', async t => {
});
// #endregion
test('should rebuild manticore indexes and requeue workspaces', async t => {
  // Two already-indexed workspaces the rebuild must flag for reindexing.
  const workspace1 = await module.create(Mockers.Workspace, { indexed: true });
  const workspace2 = await module.create(Mockers.Workspace, { indexed: true });

  // Count jobs queued before the rebuild so only the new calls are inspected.
  const queueCount = module.queue.count('indexer.indexWorkspace');

  await indexerService.rebuildManticoreIndexes();

  const rebuildCalls = module.queue.add
    .getCalls()
    .filter(call => call.args[0] === 'indexer.indexWorkspace')
    .slice(queueCount);
  const queuedWorkspaceIds = new Set(
    rebuildCalls.map(call => call.args[1].workspaceId)
  );

  t.true(queuedWorkspaceIds.has(workspace1.id));
  t.true(queuedWorkspaceIds.has(workspace2.id));

  // Rebuild resets the indexed flag so workers pick the workspaces up again.
  t.is((await models.workspace.get(workspace1.id))?.indexed, false);
  t.is((await models.workspace.get(workspace2.id))?.indexed, false);
});

View File

@@ -38,6 +38,17 @@ const SupportIndexedAttributes = [
'parent_block_id',
];
// Identifier/keyword attribute fields. `term` queries on these are compiled
// to Manticore's `equals` filter (see the term-mapping logic below) so the
// stored value is compared verbatim, rather than being run through the
// full-text matcher. NOTE(review): list presumably mirrors the indexed
// attribute columns of the block/doc tables — verify when adding fields.
const SupportExactTermFields = new Set([
'workspace_id',
'doc_id',
'block_id',
'flavour',
'parent_flavour',
'parent_block_id',
'created_by_user_id',
'updated_by_user_id',
]);
const ConvertEmptyStringToNullValueFields = new Set([
'ref_doc_id',
'ref',
@@ -55,23 +66,20 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
table: SearchTable,
mapping: string
): Promise<void> {
const url = `${this.config.provider.endpoint}/cli`;
const response = await fetch(url, {
method: 'POST',
body: mapping,
headers: {
'Content-Type': 'text/plain',
},
});
// manticoresearch cli response is not json, so we need to handle it manually
const text = (await response.text()).trim();
if (!response.ok) {
this.logger.error(`failed to create table ${table}, response: ${text}`);
throw new InternalServerError();
}
const text = await this.#executeSQL(mapping);
this.logger.log(`created table ${table}, response: ${text}`);
}
/** Drops `table` if it exists, logging the raw CLI response text. */
async dropTable(table: SearchTable): Promise<void> {
  const response = await this.#executeSQL(`DROP TABLE IF EXISTS ${table}`);
  this.logger.log(`dropped table ${table}, response: ${response}`);
}
// Rebuilds a table from scratch: drop first so schema-level settings in the
// new mapping take effect, then create. Order matters — CREATE alone would
// not alter an existing table. NOTE(review): not atomic; the table is
// briefly absent between the two statements.
async recreateTable(table: SearchTable, mapping: string): Promise<void> {
await this.dropTable(table);
await this.createTable(table, mapping);
}
override async write(
table: SearchTable,
documents: Record<string, unknown>[],
@@ -252,6 +260,12 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
// 1750389254 => new Date(1750389254 * 1000)
return new Date(value * 1000);
}
if (value && typeof value === 'string') {
const timestamp = Date.parse(value);
if (!Number.isNaN(timestamp)) {
return new Date(timestamp);
}
}
return value;
}
@@ -302,8 +316,10 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
// workspace_id: 'workspaceId1'
// }
// }
let termField = options?.termMappingField ?? 'term';
let field = Object.keys(query.term)[0];
let termField =
options?.termMappingField ??
(SupportExactTermFields.has(field) ? 'equals' : 'term');
let value = query.term[field];
if (typeof value === 'object' && 'value' in value) {
if ('boost' in value) {
@@ -432,4 +448,28 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
}
return value;
}
/**
 * Executes a raw SQL statement against the Manticore `/cli` endpoint.
 *
 * Auth precedence: API key (`ApiKey …`) when configured, otherwise HTTP
 * basic auth when a password is present, otherwise no Authorization header.
 * Returns the trimmed plain-text response (the CLI endpoint is not JSON);
 * throws InternalServerError on any non-2xx status after logging the body.
 */
async #executeSQL(sql: string) {
  const { endpoint, apiKey, username, password } = this.config.provider;

  const headers: Record<string, string> = { 'Content-Type': 'text/plain' };
  if (apiKey) {
    headers.Authorization = `ApiKey ${apiKey}`;
  } else if (password) {
    const credentials = Buffer.from(`${username}:${password}`).toString(
      'base64'
    );
    headers.Authorization = `Basic ${credentials}`;
  }

  const response = await fetch(`${endpoint}/cli`, {
    method: 'POST',
    body: sql,
    headers,
  });
  const text = (await response.text()).trim();
  if (!response.ok) {
    this.logger.error(`failed to execute SQL "${sql}", response: ${text}`);
    throw new InternalServerError();
  }
  return text;
}
}

View File

@@ -14,6 +14,7 @@ import {
AggregateQueryDSL,
BaseQueryDSL,
HighlightDSL,
ManticoresearchProvider,
OperationOptions,
SearchNode,
SearchProvider,
@@ -130,6 +131,63 @@ export class IndexerService {
}
}
// Drops and recreates every Manticore search table with the current mapping
// strings, then marks all workspaces as unindexed and queues a reindex job
// for each. No-op (with a debug log) when no search provider is configured
// or when the active provider is not Manticore.
async rebuildManticoreIndexes() {
let searchProvider: SearchProvider | undefined;
try {
searchProvider = this.factory.get();
} catch (err) {
// Missing provider is expected in deployments without search — skip.
if (err instanceof SearchProviderNotFound) {
this.logger.debug('No search provider found, skip rebuilding tables');
return;
}
throw err;
}
// Only Manticore tables are recreated by this routine.
if (!(searchProvider instanceof ManticoresearchProvider)) {
this.logger.debug(
`Search provider ${searchProvider.type} does not need manticore rebuild`
);
return;
}
// Recreate every table for this provider type with its mapping SQL.
const mappings = SearchTableMappingStrings[searchProvider.type];
for (const table of Object.keys(mappings) as SearchTable[]) {
await searchProvider.recreateTable(table, mappings[table]);
}
// Walk all workspaces in sid order, 100 per page, using a keyset cursor
// (sid > lastWorkspaceSid) so each page picks up where the last ended.
let lastWorkspaceSid = 0;
while (true) {
const workspaces = await this.models.workspace.list(
{ sid: { gt: lastWorkspaceSid } },
{ id: true, sid: true },
100
);
if (!workspaces.length) {
break;
}
for (const workspace of workspaces) {
// Clear the indexed flag first so the workspace is seen as stale.
await this.models.workspace.update(
workspace.id,
{ indexed: false },
false
);
// Queue a reindex job; the deterministic jobId dedupes repeat adds.
// NOTE(review): priority 100 presumably ranks below user-triggered
// indexing — confirm against the queue's priority convention.
await this.queue.add(
'indexer.indexWorkspace',
{
workspaceId: workspace.id,
},
{
jobId: `indexWorkspace/${workspace.id}`,
priority: 100,
}
);
}
// Advance the cursor to the last sid of this page.
lastWorkspaceSid = workspaces[workspaces.length - 1].sid;
}
}
async write<T extends SearchTable>(
table: T,
documents: UpsertTypeByTable<T>[],

View File

@@ -150,6 +150,8 @@ CREATE TABLE IF NOT EXISTS block (
updated_at timestamp
)
morphology = 'jieba_chinese, lemmatize_en_all, lemmatize_de_all, lemmatize_ru_all, libstemmer_ar, libstemmer_ca, stem_cz, libstemmer_da, libstemmer_nl, libstemmer_fi, libstemmer_fr, libstemmer_el, libstemmer_hi, libstemmer_hu, libstemmer_id, libstemmer_ga, libstemmer_it, libstemmer_lt, libstemmer_ne, libstemmer_no, libstemmer_pt, libstemmer_ro, libstemmer_es, libstemmer_sv, libstemmer_ta, libstemmer_tr'
charset_table = 'non_cjk, cjk'
charset_table = 'non_cjk, chinese'
ngram_len = '1'
ngram_chars = 'U+1100..U+11FF, U+3130..U+318F, U+A960..U+A97F, U+AC00..U+D7AF, U+D7B0..U+D7FF, U+3040..U+30FF, U+0E00..U+0E7F'
index_field_lengths = '1'
`;

View File

@@ -109,6 +109,8 @@ CREATE TABLE IF NOT EXISTS doc (
updated_at timestamp
)
morphology = 'jieba_chinese, lemmatize_en_all, lemmatize_de_all, lemmatize_ru_all, libstemmer_ar, libstemmer_ca, stem_cz, libstemmer_da, libstemmer_nl, libstemmer_fi, libstemmer_fr, libstemmer_el, libstemmer_hi, libstemmer_hu, libstemmer_id, libstemmer_ga, libstemmer_it, libstemmer_lt, libstemmer_ne, libstemmer_no, libstemmer_pt, libstemmer_ro, libstemmer_es, libstemmer_sv, libstemmer_ta, libstemmer_tr'
charset_table = 'non_cjk, cjk'
charset_table = 'non_cjk, chinese'
ngram_len = '1'
ngram_chars = 'U+1100..U+11FF, U+3130..U+318F, U+A960..U+A97F, U+AC00..U+D7AF, U+D7B0..U+D7FF, U+3040..U+30FF, U+0E00..U+0E7F'
index_field_lengths = '1'
`;