diff --git a/packages/backend/server/src/data/migrations/1763800000000-rebuild-manticore-mixed-script-indexes.ts b/packages/backend/server/src/data/migrations/1763800000000-rebuild-manticore-mixed-script-indexes.ts new file mode 100644 index 0000000000..627aa18100 --- /dev/null +++ b/packages/backend/server/src/data/migrations/1763800000000-rebuild-manticore-mixed-script-indexes.ts @@ -0,0 +1,12 @@ +import { ModuleRef } from '@nestjs/core'; +import { PrismaClient } from '@prisma/client'; + +import { IndexerService } from '../../plugins/indexer'; + +export class RebuildManticoreMixedScriptIndexes1763800000000 { + static async up(_db: PrismaClient, ref: ModuleRef) { + await ref.get(IndexerService, { strict: false }).rebuildManticoreIndexes(); + } + + static async down(_db: PrismaClient) {} +} diff --git a/packages/backend/server/src/data/migrations/index.ts b/packages/backend/server/src/data/migrations/index.ts index 98e629bf51..d3b11f2175 100644 --- a/packages/backend/server/src/data/migrations/index.ts +++ b/packages/backend/server/src/data/migrations/index.ts @@ -3,3 +3,4 @@ export * from './1703756315970-unamed-account'; export * from './1721299086340-refresh-unnamed-user'; export * from './1745211351719-create-indexer-tables'; export * from './1751966744168-correct-session-update-time'; +export * from './1763800000000-rebuild-manticore-mixed-script-indexes'; diff --git a/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.md b/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.md index 29990dcf01..fe7c7e98ba 100644 --- a/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.md +++ b/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.md @@ -4,6 +4,75 @@ The actual snapshot is saved in `manticoresearch.spec.ts.snap`. Generated by [AVA](https://avajs.dev). +## should search doc title match chinese word segmentation + +> Snapshot 1 + + [ + { + _id: '5373363211628325828', + _source: { + doc_id: 'doc-chinese', + workspace_id: 'workspace-test-doc-title-chinese', + }, + fields: { + doc_id: [ + 'doc-chinese', + ], + title: [ + 'AFFiNE 是一个基于云端的笔记应用', + ], + }, + highlights: undefined, + }, + ] + +## should search block content match korean ngram + +> Snapshot 1 + + [ + { + _id: '1227635764506850985', + _source: { + doc_id: 'doc-korean', + workspace_id: 'workspace-test-block-content-korean', + }, + fields: { + block_id: [ + 'block-korean', + ], + content: [ + '다람쥐 헌 쳇바퀴에 타고파', + ], + }, + highlights: undefined, + }, + ] + +## should search block content match japanese kana ngram + +> Snapshot 1 + + [ + { + _id: '381498385699454292', + _source: { + doc_id: 'doc-japanese', + workspace_id: 'workspace-test-block-content-japanese', + }, + fields: { + block_id: [ + 'block-japanese', + ], + content: [ + 'いろはにほへと ちりぬるを', + ], + }, + highlights: undefined, + }, + ] + ## should write document work > Snapshot 1 @@ -889,7 +958,7 @@ Generated by [AVA](https://avajs.dev). > Snapshot 1 { - term: { + equals: { workspace_id: 'workspaceId1', }, } @@ -897,7 +966,7 @@ Generated by [AVA](https://avajs.dev). > Snapshot 2 { - term: { + equals: { workspace_id: 'workspaceId1', }, } diff --git a/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.snap b/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.snap index d9d8b475ec..5e308ddd22 100644 Binary files a/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.snap and b/packages/backend/server/src/plugins/indexer/__tests__/providers/__snapshots__/manticoresearch.spec.ts.snap differ diff --git a/packages/backend/server/src/plugins/indexer/__tests__/providers/manticoresearch.spec.ts b/packages/backend/server/src/plugins/indexer/__tests__/providers/manticoresearch.spec.ts index b68919554e..395082edb9 100644 --- a/packages/backend/server/src/plugins/indexer/__tests__/providers/manticoresearch.spec.ts +++ b/packages/backend/server/src/plugins/indexer/__tests__/providers/manticoresearch.spec.ts @@ -33,8 +33,8 @@ const user = await module.create(Mockers.User); const workspace = await module.create(Mockers.Workspace); test.before(async () => { - await searchProvider.createTable(SearchTable.block, blockSQL); - await searchProvider.createTable(SearchTable.doc, docSQL); + await searchProvider.recreateTable(SearchTable.block, blockSQL); + await searchProvider.recreateTable(SearchTable.doc, docSQL); await searchProvider.write( SearchTable.block, @@ -163,6 +163,135 @@ test('should provider is manticoresearch', t => { t.is(searchProvider.type, SearchProviderType.Manticoresearch); }); +test('should search doc title match chinese word segmentation', async t => { + const workspaceId = 'workspace-test-doc-title-chinese'; + const docId = 'doc-chinese'; + const title = 'AFFiNE 是一个基于云端的笔记应用'; + + await searchProvider.write( + SearchTable.doc, + [ + { + workspace_id: workspaceId, + doc_id: docId, + title, + }, + ], + { + refresh: true, + } + ); + + const result = await searchProvider.search(SearchTable.doc, { + _source: ['workspace_id', 'doc_id'], + query: { + bool: { + must: [ + { term: { workspace_id: { value: workspaceId } } }, + { match: { title: '笔记' } }, + ], + }, + }, + fields: ['doc_id', 'title'], + sort: ['_score'], + }); + + t.true(result.total >= 1); + t.snapshot( + result.nodes + .filter(node => node._source.doc_id === docId) + .map(node => omit(node, ['_score'])) + ); +}); + +test('should search block content match korean ngram', async t => { + const workspaceId = 'workspace-test-block-content-korean'; + const docId = 'doc-korean'; + const blockId = 'block-korean'; + const content = '다람쥐 헌 쳇바퀴에 타고파'; + + await searchProvider.write( + SearchTable.block, + [ + { + workspace_id: workspaceId, + doc_id: docId, + block_id: blockId, + content, + flavour: 'affine:paragraph', + }, + ], + { + refresh: true, + } + ); + + const result = await searchProvider.search(SearchTable.block, { + _source: ['workspace_id', 'doc_id'], + query: { + bool: { + must: [ + { term: { workspace_id: { value: workspaceId } } }, + { match: { content: '쥐' } }, + ], + }, + }, + fields: ['block_id', 'content'], + sort: ['_score'], + }); + + t.true(result.total >= 1); + t.snapshot( + result.nodes + .filter(node => node.fields.block_id?.[0] === blockId) + .map(node => omit(node, ['_score'])) + ); +}); + +test('should search block content match japanese kana ngram', async t => { + const workspaceId = 'workspace-test-block-content-japanese'; + const docId = 'doc-japanese'; + const blockId = 'block-japanese'; + const content = 'いろはにほへと ちりぬるを'; + + await searchProvider.write( + SearchTable.block, + [ + { + workspace_id: workspaceId, + doc_id: docId, + block_id: blockId, + content, + flavour: 'affine:paragraph', + }, + ], + { + refresh: true, + } + ); + + const result = await searchProvider.search(SearchTable.block, { + _source: ['workspace_id', 'doc_id'], + query: { + bool: { + must: [ + { term: { workspace_id: { value: workspaceId } } }, + { match: { content: 'へ' } }, + ], + }, + }, + fields: ['block_id', 'content'], + sort: ['_score'], + }); + + t.true(result.total >= 1); + t.snapshot( + result.nodes + .filter(node => node.fields.block_id?.[0] === blockId) + .map(node => omit(node, ['_score'])) + ); +}); + // #region write test('should write document work', async t => { @@ -189,7 +318,7 @@ test('should write document work', async t => { let result = await searchProvider.search(SearchTable.block, { _source: ['workspace_id', 'doc_id'], - query: { match: { doc_id: docId } }, + query: { term: { doc_id: { value: docId } } }, fields: [ 'flavour', 'flavour_indexed', @@ -232,7 +361,7 @@ test('should write document work', async t => { result = await searchProvider.search(SearchTable.block, { _source: ['workspace_id', 'doc_id'], - query: { match: { doc_id: docId } }, + query: { term: { doc_id: { value: docId } } }, fields: ['flavour', 'block_id', 'content', 'ref_doc_id'], sort: ['_score'], }); @@ -263,7 +392,7 @@ test('should write document work', async t => { result = await searchProvider.search(SearchTable.block, { _source: ['workspace_id', 'doc_id'], - query: { match: { doc_id: docId } }, + query: { term: { doc_id: { value: docId } } }, fields: ['flavour', 'block_id', 'content', 'ref_doc_id'], sort: ['_score'], }); @@ -319,8 +448,8 @@ test('should handle ref_doc_id as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -371,8 +500,8 @@ test('should handle ref_doc_id as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -416,8 +545,8 @@ test('should handle content as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -455,8 +584,8 @@ test('should handle content as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -497,8 +626,8 @@ test('should handle blob as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -534,8 +663,8 @@ test('should handle blob as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -571,8 +700,8 @@ test('should handle blob as string[]', async t => { query: { bool: { must: [ - { match: { workspace_id: workspaceId } }, - { match: { doc_id: docId } }, + { term: { workspace_id: { value: workspaceId } } }, + { term: { doc_id: { value: docId } } }, ], }, }, @@ -682,8 +811,10 @@ test('should search query all and get next cursor work', async t => { 'id', ], query: { - match: { - workspace_id: workspaceId, + term: { + workspace_id: { + value: workspaceId, + }, }, }, fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'], @@ -708,8 +839,10 @@ test('should search query all and get next cursor work', async t => { 'id', ], query: { - match: { - workspace_id: workspaceId, + term: { + workspace_id: { + value: workspaceId, + }, }, }, fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'], @@ -734,8 +867,10 @@ test('should search query all and get next cursor work', async t => { 'id', ], query: { - match: { - workspace_id: workspaceId, + term: { + workspace_id: { + value: workspaceId, + }, }, }, fields: ['flavour', 'workspace_id', 'doc_id', 'block_id'], @@ -780,16 +915,20 @@ test('should filter by workspace_id work', async t => { bool: { must: [ { - match: { - workspace_id: workspaceId, + term: { + workspace_id: { + value: workspaceId, + }, }, }, { bool: { must: [ { - match: { - doc_id: docId, + term: { + doc_id: { + value: docId, + }, }, }, ], diff --git a/packages/backend/server/src/plugins/indexer/__tests__/service.spec.ts b/packages/backend/server/src/plugins/indexer/__tests__/service.spec.ts index cd8ff47d1f..e90a781e02 100644 --- a/packages/backend/server/src/plugins/indexer/__tests__/service.spec.ts +++ b/packages/backend/server/src/plugins/indexer/__tests__/service.spec.ts @@ -8,11 +8,12 @@ import { createModule } from '../../../__tests__/create-module'; import { Mockers } from '../../../__tests__/mocks'; import { ConfigModule } from '../../../base/config'; import { ServerConfigModule } from '../../../core/config'; +import { Models } from '../../../models'; import { SearchProviderFactory } from '../factory'; import { IndexerModule, IndexerService } from '../index'; import { ManticoresearchProvider } from '../providers'; import { UpsertDoc } from '../service'; -import { SearchTable } from '../tables'; +import { blockSQL, docSQL, SearchTable } from '../tables'; import { AggregateInput, SearchInput, @@ -35,6 +36,7 @@ const module = await createModule({ const indexerService = module.get(IndexerService); const searchProviderFactory = module.get(SearchProviderFactory); const manticoresearch = module.get(ManticoresearchProvider); +const models = module.get(Models); const user = await module.create(Mockers.User); const workspace = await module.create(Mockers.Workspace, { snapshot: true, @@ -50,7 +52,8 @@ test.after.always(async () => { }); test.before(async () => { - await indexerService.createTables(); + await manticoresearch.recreateTable(SearchTable.block, blockSQL); + await manticoresearch.recreateTable(SearchTable.doc, docSQL); }); test.afterEach.always(async () => { @@ -2311,3 +2314,29 @@ test('should search docs by keyword work', async t => { }); // #endregion + +test('should rebuild manticore indexes and requeue workspaces', async t => { + const workspace1 = await module.create(Mockers.Workspace, { + indexed: true, + }); + const workspace2 = await module.create(Mockers.Workspace, { + indexed: true, + }); + const queueCount = module.queue.count('indexer.indexWorkspace'); + + await indexerService.rebuildManticoreIndexes(); + + const queuedWorkspaceIds = new Set( + module.queue.add + .getCalls() + .filter(call => call.args[0] === 'indexer.indexWorkspace') + .slice(queueCount) + .map(call => call.args[1].workspaceId) + ); + + t.true(queuedWorkspaceIds.has(workspace1.id)); + t.true(queuedWorkspaceIds.has(workspace2.id)); + + t.is((await models.workspace.get(workspace1.id))?.indexed, false); + t.is((await models.workspace.get(workspace2.id))?.indexed, false); +}); diff --git a/packages/backend/server/src/plugins/indexer/providers/manticoresearch.ts b/packages/backend/server/src/plugins/indexer/providers/manticoresearch.ts index cdf0ac2de5..d928b76623 100644 --- a/packages/backend/server/src/plugins/indexer/providers/manticoresearch.ts +++ b/packages/backend/server/src/plugins/indexer/providers/manticoresearch.ts @@ -38,6 +38,17 @@ const SupportIndexedAttributes = [ 'parent_block_id', ]; +const SupportExactTermFields = new Set([ + 'workspace_id', + 'doc_id', + 'block_id', + 'flavour', + 'parent_flavour', + 'parent_block_id', + 'created_by_user_id', + 'updated_by_user_id', +]); + const ConvertEmptyStringToNullValueFields = new Set([ 'ref_doc_id', 'ref', @@ -55,23 +66,20 @@ export class ManticoresearchProvider extends ElasticsearchProvider { table: SearchTable, mapping: string ): Promise { - const url = `${this.config.provider.endpoint}/cli`; - const response = await fetch(url, { - method: 'POST', - body: mapping, - headers: { - 'Content-Type': 'text/plain', - }, - }); - // manticoresearch cli response is not json, so we need to handle it manually - const text = (await response.text()).trim(); - if (!response.ok) { - this.logger.error(`failed to create table ${table}, response: ${text}`); - throw new InternalServerError(); - } + const text = await this.#executeSQL(mapping); this.logger.log(`created table ${table}, response: ${text}`); } + async dropTable(table: SearchTable): Promise { + const text = await this.#executeSQL(`DROP TABLE IF EXISTS ${table}`); + this.logger.log(`dropped table ${table}, response: ${text}`); + } + + async recreateTable(table: SearchTable, mapping: string): Promise { + await this.dropTable(table); + await this.createTable(table, mapping); + } + override async write( table: SearchTable, documents: Record[], @@ -252,6 +260,12 @@ export class ManticoresearchProvider extends ElasticsearchProvider { // 1750389254 => new Date(1750389254 * 1000) return new Date(value * 1000); } + if (value && typeof value === 'string') { + const timestamp = Date.parse(value); + if (!Number.isNaN(timestamp)) { + return new Date(timestamp); + } + } return value; } @@ -302,8 +316,10 @@ export class ManticoresearchProvider extends ElasticsearchProvider { // workspace_id: 'workspaceId1' // } // } - let termField = options?.termMappingField ?? 'term'; let field = Object.keys(query.term)[0]; + let termField = + options?.termMappingField ?? + (SupportExactTermFields.has(field) ? 'equals' : 'term'); let value = query.term[field]; if (typeof value === 'object' && 'value' in value) { if ('boost' in value) { @@ -432,4 +448,28 @@ export class ManticoresearchProvider extends ElasticsearchProvider { } return value; } + + async #executeSQL(sql: string) { + const url = `${this.config.provider.endpoint}/cli`; + const headers: Record = { + 'Content-Type': 'text/plain', + }; + if (this.config.provider.apiKey) { + headers.Authorization = `ApiKey ${this.config.provider.apiKey}`; + } else if (this.config.provider.password) { + headers.Authorization = `Basic ${Buffer.from(`${this.config.provider.username}:${this.config.provider.password}`).toString('base64')}`; + } + + const response = await fetch(url, { + method: 'POST', + body: sql, + headers, + }); + const text = (await response.text()).trim(); + if (!response.ok) { + this.logger.error(`failed to execute SQL "${sql}", response: ${text}`); + throw new InternalServerError(); + } + return text; + } } diff --git a/packages/backend/server/src/plugins/indexer/service.ts b/packages/backend/server/src/plugins/indexer/service.ts index 66d1d0eb91..8e75dde571 100644 --- a/packages/backend/server/src/plugins/indexer/service.ts +++ b/packages/backend/server/src/plugins/indexer/service.ts @@ -14,6 +14,7 @@ import { AggregateQueryDSL, BaseQueryDSL, HighlightDSL, + ManticoresearchProvider, OperationOptions, SearchNode, SearchProvider, @@ -130,6 +131,63 @@ export class IndexerService { } } + async rebuildManticoreIndexes() { + let searchProvider: SearchProvider | undefined; + try { + searchProvider = this.factory.get(); + } catch (err) { + if (err instanceof SearchProviderNotFound) { + this.logger.debug('No search provider found, skip rebuilding tables'); + return; + } + throw err; + } + + if (!(searchProvider instanceof ManticoresearchProvider)) { + this.logger.debug( + `Search provider ${searchProvider.type} does not need manticore rebuild` + ); + return; + } + + const mappings = SearchTableMappingStrings[searchProvider.type]; + for (const table of Object.keys(mappings) as SearchTable[]) { + await searchProvider.recreateTable(table, mappings[table]); + } + + let lastWorkspaceSid = 0; + while (true) { + const workspaces = await this.models.workspace.list( + { sid: { gt: lastWorkspaceSid } }, + { id: true, sid: true }, + 100 + ); + if (!workspaces.length) { + break; + } + + for (const workspace of workspaces) { + await this.models.workspace.update( + workspace.id, + { indexed: false }, + false + ); + await this.queue.add( + 'indexer.indexWorkspace', + { + workspaceId: workspace.id, + }, + { + jobId: `indexWorkspace/${workspace.id}`, + priority: 100, + } + ); + } + + lastWorkspaceSid = workspaces[workspaces.length - 1].sid; + } + } + async write( table: T, documents: UpsertTypeByTable[], diff --git a/packages/backend/server/src/plugins/indexer/tables/block.ts b/packages/backend/server/src/plugins/indexer/tables/block.ts index 828d8094ac..99dd29234e 100644 --- a/packages/backend/server/src/plugins/indexer/tables/block.ts +++ b/packages/backend/server/src/plugins/indexer/tables/block.ts @@ -150,6 +150,8 @@ CREATE TABLE IF NOT EXISTS block ( updated_at timestamp ) morphology = 'jieba_chinese, lemmatize_en_all, lemmatize_de_all, lemmatize_ru_all, libstemmer_ar, libstemmer_ca, stem_cz, libstemmer_da, libstemmer_nl, libstemmer_fi, libstemmer_fr, libstemmer_el, libstemmer_hi, libstemmer_hu, libstemmer_id, libstemmer_ga, libstemmer_it, libstemmer_lt, libstemmer_ne, libstemmer_no, libstemmer_pt, libstemmer_ro, libstemmer_es, libstemmer_sv, libstemmer_ta, libstemmer_tr' -charset_table = 'non_cjk, cjk' +charset_table = 'non_cjk, chinese' +ngram_len = '1' +ngram_chars = 'U+1100..U+11FF, U+3130..U+318F, U+A960..U+A97F, U+AC00..U+D7AF, U+D7B0..U+D7FF, U+3040..U+30FF, U+0E00..U+0E7F' index_field_lengths = '1' `; diff --git a/packages/backend/server/src/plugins/indexer/tables/doc.ts b/packages/backend/server/src/plugins/indexer/tables/doc.ts index 1647fad7e0..ff5ce52e58 100644 --- a/packages/backend/server/src/plugins/indexer/tables/doc.ts +++ b/packages/backend/server/src/plugins/indexer/tables/doc.ts @@ -109,6 +109,8 @@ CREATE TABLE IF NOT EXISTS doc ( updated_at timestamp ) morphology = 'jieba_chinese, lemmatize_en_all, lemmatize_de_all, lemmatize_ru_all, libstemmer_ar, libstemmer_ca, stem_cz, libstemmer_da, libstemmer_nl, libstemmer_fi, libstemmer_fr, libstemmer_el, libstemmer_hi, libstemmer_hu, libstemmer_id, libstemmer_ga, libstemmer_it, libstemmer_lt, libstemmer_ne, libstemmer_no, libstemmer_pt, libstemmer_ro, libstemmer_es, libstemmer_sv, libstemmer_ta, libstemmer_tr' -charset_table = 'non_cjk, cjk' +charset_table = 'non_cjk, chinese' +ngram_len = '1' +ngram_chars = 'U+1100..U+11FF, U+3130..U+318F, U+A960..U+A97F, U+AC00..U+D7AF, U+D7B0..U+D7FF, U+3040..U+30FF, U+0E00..U+0E7F' index_field_lengths = '1' `;