feat(server): improve indexer (#14698)

fix #13862 


#### PR Dependency Tree


* **PR #14698** 👈

This tree was auto-generated by
[Charcoal](https://github.com/danerwilliams/charcoal)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **New Features**
  * Enhanced search support for Chinese, Japanese, and Korean languages
with improved text segmentation and character matching.
  * Added index management capabilities with table recreation
functionality.

* **Bug Fixes**
  * Improved search accuracy for non-Latin scripts through updated
morphology and n-gram configuration.

* **Chores**
  * Added database migration for search index optimization.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2026-03-22 02:50:59 +08:00
committed by GitHub
parent bcf2a51d41
commit f47ee2bc8a
10 changed files with 402 additions and 50 deletions

View File

@@ -0,0 +1,12 @@
import { ModuleRef } from '@nestjs/core';
import { PrismaClient } from '@prisma/client';
import { IndexerService } from '../../plugins/indexer';
/**
 * Data migration: drops and recreates the Manticore search tables with the
 * updated mixed-script (CJK) configuration, then requeues workspaces for
 * reindexing via IndexerService.rebuildManticoreIndexes().
 */
export class RebuildManticoreMixedScriptIndexes1763800000000 {
  static async up(_db: PrismaClient, ref: ModuleRef) {
    // Resolve the indexer across module boundaries — the migration runner
    // only hands us the root ModuleRef, hence `strict: false`.
    const indexer = ref.get(IndexerService, { strict: false });
    await indexer.rebuildManticoreIndexes();
  }

  // Irreversible: the previous index schema is not restored on rollback.
  static async down(_db: PrismaClient) {}
}

View File

@@ -3,3 +3,4 @@ export * from './1703756315970-unamed-account';
export * from './1721299086340-refresh-unnamed-user';
export * from './1745211351719-create-indexer-tables';
export * from './1751966744168-correct-session-update-time';
export * from './1763800000000-rebuild-manticore-mixed-script-indexes';

View File

@@ -4,6 +4,75 @@ The actual snapshot is saved in `manticoresearch.spec.ts.snap`.
Generated by [AVA](https://avajs.dev).
## should search doc title match chinese word segmentation
> Snapshot 1
[
{
_id: '5373363211628325828',
_source: {
doc_id: 'doc-chinese',
workspace_id: 'workspace-test-doc-title-chinese',
},
fields: {
doc_id: [
'doc-chinese',
],
title: [
'AFFiNE 是一个基于云端的笔记应用',
],
},
highlights: undefined,
},
]
## should search block content match korean ngram
> Snapshot 1
[
{
_id: '1227635764506850985',
_source: {
doc_id: 'doc-korean',
workspace_id: 'workspace-test-block-content-korean',
},
fields: {
block_id: [
'block-korean',
],
content: [
'다람쥐 헌 쳇바퀴에 타고파',
],
},
highlights: undefined,
},
]
## should search block content match japanese kana ngram
> Snapshot 1
[
{
_id: '381498385699454292',
_source: {
doc_id: 'doc-japanese',
workspace_id: 'workspace-test-block-content-japanese',
},
fields: {
block_id: [
'block-japanese',
],
content: [
'いろはにほへと ちりぬるを',
],
},
highlights: undefined,
},
]
## should write document work
> Snapshot 1
@@ -889,7 +958,7 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
{
term: {
equals: {
workspace_id: 'workspaceId1',
},
}
@@ -897,7 +966,7 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 2
{
term: {
equals: {
workspace_id: 'workspaceId1',
},
}

View File

@@ -33,8 +33,8 @@ const user = await module.create(Mockers.User);
const workspace = await module.create(Mockers.Workspace);
test.before(async () => {
await searchProvider.createTable(SearchTable.block, blockSQL);
await searchProvider.createTable(SearchTable.doc, docSQL);
await searchProvider.recreateTable(SearchTable.block, blockSQL);
await searchProvider.recreateTable(SearchTable.doc, docSQL);
await searchProvider.write(
SearchTable.block,
@@ -163,6 +163,135 @@ test('should provider is manticoresearch', t => {
t.is(searchProvider.type, SearchProviderType.Manticoresearch);
});
test('should search doc title match chinese word segmentation', async t => {
  const workspaceId = 'workspace-test-doc-title-chinese';
  const docId = 'doc-chinese';
  const title = 'AFFiNE 是一个基于云端的笔记应用';

  // Index a single doc whose title mixes latin text and Chinese prose.
  await searchProvider.write(
    SearchTable.doc,
    [{ workspace_id: workspaceId, doc_id: docId, title }],
    { refresh: true }
  );

  // '笔记' must be found by word segmentation, not whole-field matching.
  const result = await searchProvider.search(SearchTable.doc, {
    _source: ['workspace_id', 'doc_id'],
    query: {
      bool: {
        must: [
          { term: { workspace_id: { value: workspaceId } } },
          { match: { title: '笔记' } },
        ],
      },
    },
    fields: ['doc_id', 'title'],
    sort: ['_score'],
  });

  t.true(result.total >= 1);
  const matched = result.nodes.filter(node => node._source.doc_id === docId);
  t.snapshot(matched.map(node => omit(node, ['_score'])));
});
test('should search block content match korean ngram', async t => {
  const workspaceId = 'workspace-test-block-content-korean';
  const docId = 'doc-korean';
  const blockId = 'block-korean';
  const content = '다람쥐 헌 쳇바퀴에 타고파';

  // Index one paragraph block containing Korean text.
  await searchProvider.write(
    SearchTable.block,
    [
      {
        workspace_id: workspaceId,
        doc_id: docId,
        block_id: blockId,
        content,
        flavour: 'affine:paragraph',
      },
    ],
    { refresh: true }
  );

  // A single Hangul syllable ('쥐') should match via the n-gram config.
  const result = await searchProvider.search(SearchTable.block, {
    _source: ['workspace_id', 'doc_id'],
    query: {
      bool: {
        must: [
          { term: { workspace_id: { value: workspaceId } } },
          { match: { content: '쥐' } },
        ],
      },
    },
    fields: ['block_id', 'content'],
    sort: ['_score'],
  });

  t.true(result.total >= 1);
  const matched = result.nodes.filter(
    node => node.fields.block_id?.[0] === blockId
  );
  t.snapshot(matched.map(node => omit(node, ['_score'])));
});
test('should search block content match japanese kana ngram', async t => {
  const workspaceId = 'workspace-test-block-content-japanese';
  const docId = 'doc-japanese';
  const blockId = 'block-japanese';
  const content = 'いろはにほへと ちりぬるを';

  // Index one paragraph block containing Japanese kana.
  await searchProvider.write(
    SearchTable.block,
    [
      {
        workspace_id: workspaceId,
        doc_id: docId,
        block_id: blockId,
        content,
        flavour: 'affine:paragraph',
      },
    ],
    { refresh: true }
  );

  // A single kana character ('へ') should match via the n-gram config.
  const result = await searchProvider.search(SearchTable.block, {
    _source: ['workspace_id', 'doc_id'],
    query: {
      bool: {
        must: [
          { term: { workspace_id: { value: workspaceId } } },
          { match: { content: 'へ' } },
        ],
      },
    },
    fields: ['block_id', 'content'],
    sort: ['_score'],
  });

  t.true(result.total >= 1);
  const matched = result.nodes.filter(
    node => node.fields.block_id?.[0] === blockId
  );
  t.snapshot(matched.map(node => omit(node, ['_score'])));
});
// #region write
test('should write document work', async t => {
@@ -189,7 +318,7 @@ test('should write document work', async t => {
let result = await searchProvider.search(SearchTable.block, {
_source: ['workspace_id', 'doc_id'],
query: { match: { doc_id: docId } },
query: { term: { doc_id: { value: docId } } },
fields: [
'flavour',
'flavour_indexed',
@@ -232,7 +361,7 @@ test('should write document work', async t => {
result = await searchProvider.search(SearchTable.block, {
_source: ['workspace_id', 'doc_id'],
query: { match: { doc_id: docId } },
query: { term: { doc_id: { value: docId } } },
fields: ['flavour', 'block_id', 'content', 'ref_doc_id'],
sort: ['_score'],
});
@@ -263,7 +392,7 @@ test('should write document work', async t => {
result = await searchProvider.search(SearchTable.block, {
_source: ['workspace_id', 'doc_id'],
query: { match: { doc_id: docId } },
query: { term: { doc_id: { value: docId } } },
fields: ['flavour', 'block_id', 'content', 'ref_doc_id'],
sort: ['_score'],
});
@@ -319,8 +448,8 @@ test('should handle ref_doc_id as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -371,8 +500,8 @@ test('should handle ref_doc_id as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -416,8 +545,8 @@ test('should handle content as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -455,8 +584,8 @@ test('should handle content as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -497,8 +626,8 @@ test('should handle blob as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -534,8 +663,8 @@ test('should handle blob as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -571,8 +700,8 @@ test('should handle blob as string[]', async t => {
query: {
bool: {
must: [
{ match: { workspace_id: workspaceId } },
{ match: { doc_id: docId } },
{ term: { workspace_id: { value: workspaceId } } },
{ term: { doc_id: { value: docId } } },
],
},
},
@@ -682,8 +811,10 @@ test('should search query all and get next cursor work', async t => {
'id',
],
query: {
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'],
@@ -708,8 +839,10 @@ test('should search query all and get next cursor work', async t => {
'id',
],
query: {
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'],
@@ -734,8 +867,10 @@ test('should search query all and get next cursor work', async t => {
'id',
],
query: {
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'],
@@ -780,16 +915,20 @@ test('should filter by workspace_id work', async t => {
bool: {
must: [
{
match: {
workspace_id: workspaceId,
term: {
workspace_id: {
value: workspaceId,
},
},
},
{
bool: {
must: [
{
match: {
doc_id: docId,
term: {
doc_id: {
value: docId,
},
},
},
],

View File

@@ -8,11 +8,12 @@ import { createModule } from '../../../__tests__/create-module';
import { Mockers } from '../../../__tests__/mocks';
import { ConfigModule } from '../../../base/config';
import { ServerConfigModule } from '../../../core/config';
import { Models } from '../../../models';
import { SearchProviderFactory } from '../factory';
import { IndexerModule, IndexerService } from '../index';
import { ManticoresearchProvider } from '../providers';
import { UpsertDoc } from '../service';
import { SearchTable } from '../tables';
import { blockSQL, docSQL, SearchTable } from '../tables';
import {
AggregateInput,
SearchInput,
@@ -35,6 +36,7 @@ const module = await createModule({
const indexerService = module.get(IndexerService);
const searchProviderFactory = module.get(SearchProviderFactory);
const manticoresearch = module.get(ManticoresearchProvider);
const models = module.get(Models);
const user = await module.create(Mockers.User);
const workspace = await module.create(Mockers.Workspace, {
snapshot: true,
@@ -50,7 +52,8 @@ test.after.always(async () => {
});
test.before(async () => {
await indexerService.createTables();
await manticoresearch.recreateTable(SearchTable.block, blockSQL);
await manticoresearch.recreateTable(SearchTable.doc, docSQL);
});
test.afterEach.always(async () => {
@@ -2311,3 +2314,29 @@ test('should search docs by keyword work', async t => {
});
// #endregion
test('should rebuild manticore indexes and requeue workspaces', async t => {
  // Two already-indexed workspaces the rebuild must flag for reindexing.
  const workspace1 = await module.create(Mockers.Workspace, { indexed: true });
  const workspace2 = await module.create(Mockers.Workspace, { indexed: true });

  // Count jobs queued before the rebuild so only the new calls are inspected.
  const queueCount = module.queue.count('indexer.indexWorkspace');

  await indexerService.rebuildManticoreIndexes();

  const rebuildCalls = module.queue.add
    .getCalls()
    .filter(call => call.args[0] === 'indexer.indexWorkspace')
    .slice(queueCount);
  const queuedWorkspaceIds = new Set(
    rebuildCalls.map(call => call.args[1].workspaceId)
  );

  t.true(queuedWorkspaceIds.has(workspace1.id));
  t.true(queuedWorkspaceIds.has(workspace2.id));

  // Rebuild resets the indexed flag so workers pick the workspaces up again.
  t.is((await models.workspace.get(workspace1.id))?.indexed, false);
  t.is((await models.workspace.get(workspace2.id))?.indexed, false);
});

View File

@@ -38,6 +38,17 @@ const SupportIndexedAttributes = [
'parent_block_id',
];
// Identifier/keyword attribute fields. `term` queries on these are compiled
// to Manticore's `equals` filter (see the term-mapping logic below) so the
// stored value is compared verbatim, rather than being run through the
// full-text matcher. NOTE(review): list presumably mirrors the indexed
// attribute columns of the block/doc tables — verify when adding fields.
const SupportExactTermFields = new Set([
'workspace_id',
'doc_id',
'block_id',
'flavour',
'parent_flavour',
'parent_block_id',
'created_by_user_id',
'updated_by_user_id',
]);
const ConvertEmptyStringToNullValueFields = new Set([
'ref_doc_id',
'ref',
@@ -55,23 +66,20 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
table: SearchTable,
mapping: string
): Promise<void> {
const url = `${this.config.provider.endpoint}/cli`;
const response = await fetch(url, {
method: 'POST',
body: mapping,
headers: {
'Content-Type': 'text/plain',
},
});
// manticoresearch cli response is not json, so we need to handle it manually
const text = (await response.text()).trim();
if (!response.ok) {
this.logger.error(`failed to create table ${table}, response: ${text}`);
throw new InternalServerError();
}
const text = await this.#executeSQL(mapping);
this.logger.log(`created table ${table}, response: ${text}`);
}
/** Drops `table` if it exists, logging the raw CLI response text. */
async dropTable(table: SearchTable): Promise<void> {
  const response = await this.#executeSQL(`DROP TABLE IF EXISTS ${table}`);
  this.logger.log(`dropped table ${table}, response: ${response}`);
}
// Rebuilds a table from scratch: drop first so schema-level settings in the
// new mapping take effect, then create. Order matters — CREATE alone would
// not alter an existing table. NOTE(review): not atomic; the table is
// briefly absent between the two statements.
async recreateTable(table: SearchTable, mapping: string): Promise<void> {
await this.dropTable(table);
await this.createTable(table, mapping);
}
override async write(
table: SearchTable,
documents: Record<string, unknown>[],
@@ -252,6 +260,12 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
// 1750389254 => new Date(1750389254 * 1000)
return new Date(value * 1000);
}
if (value && typeof value === 'string') {
const timestamp = Date.parse(value);
if (!Number.isNaN(timestamp)) {
return new Date(timestamp);
}
}
return value;
}
@@ -302,8 +316,10 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
// workspace_id: 'workspaceId1'
// }
// }
let termField = options?.termMappingField ?? 'term';
let field = Object.keys(query.term)[0];
let termField =
options?.termMappingField ??
(SupportExactTermFields.has(field) ? 'equals' : 'term');
let value = query.term[field];
if (typeof value === 'object' && 'value' in value) {
if ('boost' in value) {
@@ -432,4 +448,28 @@ export class ManticoresearchProvider extends ElasticsearchProvider {
}
return value;
}
/**
 * Executes a raw SQL statement against the Manticore `/cli` endpoint.
 *
 * Auth precedence: API key (`ApiKey …`) when configured, otherwise HTTP
 * basic auth when a password is present, otherwise no Authorization header.
 * Returns the trimmed plain-text response (the CLI endpoint is not JSON);
 * throws InternalServerError on any non-2xx status after logging the body.
 */
async #executeSQL(sql: string) {
  const { endpoint, apiKey, username, password } = this.config.provider;

  const headers: Record<string, string> = { 'Content-Type': 'text/plain' };
  if (apiKey) {
    headers.Authorization = `ApiKey ${apiKey}`;
  } else if (password) {
    const credentials = Buffer.from(`${username}:${password}`).toString(
      'base64'
    );
    headers.Authorization = `Basic ${credentials}`;
  }

  const response = await fetch(`${endpoint}/cli`, {
    method: 'POST',
    body: sql,
    headers,
  });
  const text = (await response.text()).trim();
  if (!response.ok) {
    this.logger.error(`failed to execute SQL "${sql}", response: ${text}`);
    throw new InternalServerError();
  }
  return text;
}
}

View File

@@ -14,6 +14,7 @@ import {
AggregateQueryDSL,
BaseQueryDSL,
HighlightDSL,
ManticoresearchProvider,
OperationOptions,
SearchNode,
SearchProvider,
@@ -130,6 +131,63 @@ export class IndexerService {
}
}
// Drops and recreates every Manticore search table with the current mapping
// strings, then marks all workspaces as unindexed and queues a reindex job
// for each. No-op (with a debug log) when no search provider is configured
// or when the active provider is not Manticore.
async rebuildManticoreIndexes() {
let searchProvider: SearchProvider | undefined;
try {
searchProvider = this.factory.get();
} catch (err) {
// Missing provider is expected in deployments without search — skip.
if (err instanceof SearchProviderNotFound) {
this.logger.debug('No search provider found, skip rebuilding tables');
return;
}
throw err;
}
// Only Manticore tables are recreated by this routine.
if (!(searchProvider instanceof ManticoresearchProvider)) {
this.logger.debug(
`Search provider ${searchProvider.type} does not need manticore rebuild`
);
return;
}
// Recreate every table for this provider type with its mapping SQL.
const mappings = SearchTableMappingStrings[searchProvider.type];
for (const table of Object.keys(mappings) as SearchTable[]) {
await searchProvider.recreateTable(table, mappings[table]);
}
// Walk all workspaces in sid order, 100 per page, using a keyset cursor
// (sid > lastWorkspaceSid) so each page picks up where the last ended.
let lastWorkspaceSid = 0;
while (true) {
const workspaces = await this.models.workspace.list(
{ sid: { gt: lastWorkspaceSid } },
{ id: true, sid: true },
100
);
if (!workspaces.length) {
break;
}
for (const workspace of workspaces) {
// Clear the indexed flag first so the workspace is seen as stale.
await this.models.workspace.update(
workspace.id,
{ indexed: false },
false
);
// Queue a reindex job; the deterministic jobId dedupes repeat adds.
// NOTE(review): priority 100 presumably ranks below user-triggered
// indexing — confirm against the queue's priority convention.
await this.queue.add(
'indexer.indexWorkspace',
{
workspaceId: workspace.id,
},
{
jobId: `indexWorkspace/${workspace.id}`,
priority: 100,
}
);
}
// Advance the cursor to the last sid of this page.
lastWorkspaceSid = workspaces[workspaces.length - 1].sid;
}
}
async write<T extends SearchTable>(
table: T,
documents: UpsertTypeByTable<T>[],

View File

@@ -150,6 +150,8 @@ CREATE TABLE IF NOT EXISTS block (
updated_at timestamp
)
morphology = 'jieba_chinese, lemmatize_en_all, lemmatize_de_all, lemmatize_ru_all, libstemmer_ar, libstemmer_ca, stem_cz, libstemmer_da, libstemmer_nl, libstemmer_fi, libstemmer_fr, libstemmer_el, libstemmer_hi, libstemmer_hu, libstemmer_id, libstemmer_ga, libstemmer_it, libstemmer_lt, libstemmer_ne, libstemmer_no, libstemmer_pt, libstemmer_ro, libstemmer_es, libstemmer_sv, libstemmer_ta, libstemmer_tr'
charset_table = 'non_cjk, cjk'
charset_table = 'non_cjk, chinese'
ngram_len = '1'
ngram_chars = 'U+1100..U+11FF, U+3130..U+318F, U+A960..U+A97F, U+AC00..U+D7AF, U+D7B0..U+D7FF, U+3040..U+30FF, U+0E00..U+0E7F'
index_field_lengths = '1'
`;

View File

@@ -109,6 +109,8 @@ CREATE TABLE IF NOT EXISTS doc (
updated_at timestamp
)
morphology = 'jieba_chinese, lemmatize_en_all, lemmatize_de_all, lemmatize_ru_all, libstemmer_ar, libstemmer_ca, stem_cz, libstemmer_da, libstemmer_nl, libstemmer_fi, libstemmer_fr, libstemmer_el, libstemmer_hi, libstemmer_hu, libstemmer_id, libstemmer_ga, libstemmer_it, libstemmer_lt, libstemmer_ne, libstemmer_no, libstemmer_pt, libstemmer_ro, libstemmer_es, libstemmer_sv, libstemmer_ta, libstemmer_tr'
charset_table = 'non_cjk, cjk'
charset_table = 'non_cjk, chinese'
ngram_len = '1'
ngram_chars = 'U+1100..U+11FF, U+3130..U+318F, U+A960..U+A97F, U+AC00..U+D7AF, U+D7B0..U+D7FF, U+3040..U+30FF, U+0E00..U+0E7F'
index_field_lengths = '1'
`;