feat(infra): collect more data to indexer (#8528)

Author: EYHN
Date: 2024-10-19 20:22:26 +08:00
Committed by: GitHub
Parent: 8f92be926b
Commit: 01c3a3b4c0
11 changed files with 341 additions and 169 deletions

View File

@@ -40,7 +40,10 @@ export class Document<S extends Schema = any> {
       }
     } else {
       for (const key in map) {
-        doc.insert(key, map[key] as string | string[]);
+        if (map[key] === undefined) {
+          continue;
+        }
+        doc.insert(key, map[key]);
       }
     }
     return doc;

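With this guard, `Document.from` skips fields whose value is `undefined` instead of inserting them, so the crawler can simply leave optional schema fields unset. A minimal sketch of the resulting behavior (the field names follow the doc index schema later in this commit):

```ts
// Sketch: assumes the Document.from(id, map) shape shown above.
const doc = Document.from('doc-1', {
  title: 'Hello',
  summary: undefined, // skipped by the new guard
  journal: ['2024-10-19'],
});
// doc.fields now contains 'title' and 'journal', but no 'summary' entry.
```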
View File

@@ -1,3 +1,4 @@
+import { DebugLogger } from '@affine/debug';
 import {
   type DBSchema,
   type IDBPDatabase,
@@ -25,6 +26,8 @@ import {
 } from './inverted-index';
 import { Match } from './match';
+
+const logger = new DebugLogger('indexeddb');
 
 export interface IndexDB extends DBSchema {
   kvMetadata: {
     key: string;
@@ -75,14 +78,19 @@ export class DataStruct {
   constructor(
     readonly databaseName: string,
-    schema: Schema
+    readonly schema: Schema
   ) {
     for (const [key, type] of Object.entries(schema)) {
-      if (type === 'String') {
+      const typeInfo = typeof type === 'string' ? { type } : type;
+      if (typeInfo.index === false) {
+        // If index is false, we don't need to create an inverted index for this field.
+        continue;
+      }
+      if (typeInfo.type === 'String') {
         this.invertedIndex.set(key, new StringInvertedIndex(key));
-      } else if (type === 'Integer') {
+      } else if (typeInfo.type === 'Integer') {
         this.invertedIndex.set(key, new IntegerInvertedIndex(key));
-      } else if (type === 'FullText') {
+      } else if (typeInfo.type === 'FullText') {
         this.invertedIndex.set(key, new FullTextInvertedIndex(key));
-      } else if (type === 'Boolean') {
+      } else if (typeInfo.type === 'Boolean') {
         this.invertedIndex.set(key, new BooleanInvertedIndex(key));
@@ -102,17 +110,29 @@ export class DataStruct {
       throw new Error('Document already exists');
     }
 
+    const dataMap = new Map();
+    for (const [key, values] of document.fields) {
+      const type = this.schema[key as string];
+      if (!type) {
+        continue;
+      }
+      const typeInfo = typeof type === 'string' ? { type } : type;
+      if (typeInfo.store !== false) {
+        // If store is false, the field will not be stored
+        dataMap.set(key, values);
+      }
+    }
+
     const nid = await trx.objectStore('records').add({
       id: document.id,
-      data: new Map(document.fields as Map<string, string[]>),
+      data: dataMap,
     });
 
     for (const [key, values] of document.fields) {
       const iidx = this.invertedIndex.get(key as string);
       if (!iidx) {
-        throw new Error(
-          `Inverted index '${key.toString()}' not found, document not match schema`
-        );
+        continue;
       }
       await iidx.insert(trx, nid, values);
     }
@@ -164,7 +184,7 @@ export class DataStruct {
     if (query.type === 'match') {
       const iidx = this.invertedIndex.get(query.field as string);
       if (!iidx) {
-        throw new Error(`Field '${query.field as string}' not found`);
+        return new Match();
       }
       return await iidx.match(trx, query.match);
     } else if (query.type === 'boolean') {
@@ -187,7 +207,7 @@ export class DataStruct {
     } else if (query.type === 'exists') {
       const iidx = this.invertedIndex.get(query.field as string);
       if (!iidx) {
-        throw new Error(`Field '${query.field as string}' not found`);
+        return new Match();
       }
       return await iidx.all(trx);
     }
@@ -217,31 +237,41 @@ export class DataStruct {
     query: Query<any>,
     options: SearchOptions<any>
   ): Promise<SearchResult<any, any>> {
-    const pagination = {
-      skip: options.pagination?.skip ?? 0,
-      limit: options.pagination?.limit ?? 100,
-    };
+    const startTime = performance.now();
+    try {
+      const pagination = {
+        skip: options.pagination?.skip ?? 0,
+        limit: options.pagination?.limit ?? 100,
+      };
 
-    const match = await this.query(trx, query);
+      const match = await this.query(trx, query);
 
-    const nids = match
-      .toArray()
-      .slice(pagination.skip, pagination.skip + pagination.limit);
+      const nids = match
+        .toArray()
+        .slice(pagination.skip, pagination.skip + pagination.limit);
 
-    const nodes = [];
-    for (const nid of nids) {
-      nodes.push(await this.resultNode(trx, match, nid, options));
-    }
+      const nodes = [];
+      for (const nid of nids) {
+        nodes.push(await this.resultNode(trx, match, nid, options));
+      }
 
-    return {
-      pagination: {
-        count: match.size(),
-        hasMore: match.size() > pagination.limit + pagination.skip,
-        limit: pagination.limit,
-        skip: pagination.skip,
-      },
-      nodes: nodes,
-    };
+      return {
+        pagination: {
+          count: match.size(),
+          hasMore: match.size() > pagination.limit + pagination.skip,
+          limit: pagination.limit,
+          skip: pagination.skip,
+        },
+        nodes: nodes,
+      };
+    } finally {
+      logger.debug(
+        `[indexer ${this.databaseName}] search`,
+        performance.now() - startTime,
+        'ms',
+        query
+      );
+    }
   }
 
   async aggregate(
@@ -250,95 +280,105 @@ export class DataStruct {
     field: string,
     options: AggregateOptions<any>
   ): Promise<AggregateResult<any, any>> {
-    const pagination = {
-      skip: options.pagination?.skip ?? 0,
-      limit: options.pagination?.limit ?? 100,
-    };
+    const startTime = performance.now();
+    try {
+      const pagination = {
+        skip: options.pagination?.skip ?? 0,
+        limit: options.pagination?.limit ?? 100,
+      };
 
-    const hitPagination = options.hits
-      ? {
-          skip: options.hits.pagination?.skip ?? 0,
-          limit: options.hits.pagination?.limit ?? 3,
-        }
-      : {
-          skip: 0,
-          limit: 0,
-        };
+      const hitPagination = options.hits
+        ? {
+            skip: options.hits.pagination?.skip ?? 0,
+            limit: options.hits.pagination?.limit ?? 3,
+          }
+        : {
+            skip: 0,
+            limit: 0,
+          };
 
-    const match = await this.query(trx, query);
+      const match = await this.query(trx, query);
 
-    const nids = match.toArray();
+      const nids = match.toArray();
 
-    const buckets: {
-      key: string;
-      nids: number[];
-      hits: SearchResult<any, any>['nodes'];
-    }[] = [];
+      const buckets: {
+        key: string;
+        nids: number[];
+        hits: SearchResult<any, any>['nodes'];
+      }[] = [];
 
-    for (const nid of nids) {
-      const values = (await trx.objectStore('records').get(nid))?.data.get(
-        field
-      );
-      for (const value of values ?? []) {
-        let bucket;
-        let bucketIndex = buckets.findIndex(b => b.key === value);
-        if (bucketIndex === -1) {
-          bucket = { key: value, nids: [], hits: [] };
-          buckets.push(bucket);
-          bucketIndex = buckets.length - 1;
-        } else {
-          bucket = buckets[bucketIndex];
-        }
+      for (const nid of nids) {
+        const values = (await trx.objectStore('records').get(nid))?.data.get(
+          field
+        );
+        for (const value of values ?? []) {
+          let bucket;
+          let bucketIndex = buckets.findIndex(b => b.key === value);
+          if (bucketIndex === -1) {
+            bucket = { key: value, nids: [], hits: [] };
+            buckets.push(bucket);
+            bucketIndex = buckets.length - 1;
+          } else {
+            bucket = buckets[bucketIndex];
+          }
 
-        if (
-          bucketIndex >= pagination.skip &&
-          bucketIndex < pagination.skip + pagination.limit
-        ) {
-          bucket.nids.push(nid);
-          if (
-            bucket.nids.length - 1 >= hitPagination.skip &&
-            bucket.nids.length - 1 < hitPagination.skip + hitPagination.limit
-          ) {
-            bucket.hits.push(
-              await this.resultNode(trx, match, nid, options.hits ?? {})
-            );
-          }
-        }
-      }
-    }
+          if (
+            bucketIndex >= pagination.skip &&
+            bucketIndex < pagination.skip + pagination.limit
+          ) {
+            bucket.nids.push(nid);
+            if (
+              bucket.nids.length - 1 >= hitPagination.skip &&
+              bucket.nids.length - 1 < hitPagination.skip + hitPagination.limit
+            ) {
+              bucket.hits.push(
+                await this.resultNode(trx, match, nid, options.hits ?? {})
+              );
+            }
+          }
+        }
+      }
 
-    return {
-      buckets: buckets
-        .slice(pagination.skip, pagination.skip + pagination.limit)
-        .map(bucket => {
-          const result = {
-            key: bucket.key,
-            score: match.getScore(bucket.nids[0]),
-            count: bucket.nids.length,
-          } as AggregateResult<any, any>['buckets'][number];
-          if (options.hits) {
-            (result as any).hits = {
-              pagination: {
-                count: bucket.nids.length,
-                hasMore:
-                  bucket.nids.length > hitPagination.limit + hitPagination.skip,
-                limit: hitPagination.limit,
-                skip: hitPagination.skip,
-              },
-              nodes: bucket.hits,
-            } as SearchResult<any, any>;
-          }
-          return result;
-        }),
-      pagination: {
-        count: buckets.length,
-        hasMore: buckets.length > pagination.limit + pagination.skip,
-        limit: pagination.limit,
-        skip: pagination.skip,
-      },
-    };
+      return {
+        buckets: buckets
+          .slice(pagination.skip, pagination.skip + pagination.limit)
+          .map(bucket => {
+            const result = {
+              key: bucket.key,
+              score: match.getScore(bucket.nids[0]),
+              count: bucket.nids.length,
+            } as AggregateResult<any, any>['buckets'][number];
+            if (options.hits) {
+              (result as any).hits = {
+                pagination: {
+                  count: bucket.nids.length,
+                  hasMore:
+                    bucket.nids.length >
+                    hitPagination.limit + hitPagination.skip,
+                  limit: hitPagination.limit,
+                  skip: hitPagination.skip,
+                },
+                nodes: bucket.hits,
+              } as SearchResult<any, any>;
+            }
+            return result;
+          }),
+        pagination: {
+          count: buckets.length,
+          hasMore: buckets.length > pagination.limit + pagination.skip,
+          limit: pagination.limit,
+          skip: pagination.skip,
+        },
+      };
+    } finally {
+      logger.debug(
+        `[indexer ${this.databaseName}] aggregate`,
+        performance.now() - startTime,
+        'ms'
+      );
+    }
   }
async getAll(

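Taken together, the changes in this file make unknown or unindexed fields degrade gracefully: match and exists queries against a field that has no inverted index now resolve to an empty Match instead of throwing, inserts skip such fields, and both search and aggregate log their duration. A rough sketch of the observable difference, where `dataStruct` and `trx` stand for an instance and an open transaction (the query and option shapes follow the code above):

```ts
const schema = defineSchema({
  title: 'FullText',
  summary: { type: 'String', index: false }, // stored, but no inverted index
});

// Before this commit: rejected with `Field 'summary' not found`.
// After: resolves with an empty result page.
const result = await dataStruct.search(
  trx,
  { type: 'match', field: 'summary', match: 'anything' },
  {}
);
// result.nodes.length === 0 and result.pagination.count === 0
```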
View File

@@ -21,7 +21,11 @@ export interface InvertedIndex {
 }
 
 export class StringInvertedIndex implements InvertedIndex {
-  constructor(readonly fieldKey: string) {}
+  constructor(
+    readonly fieldKey: string,
+    readonly index: boolean = true,
+    readonly store: boolean = true
+  ) {}
 
   async match(trx: DataStructROTransaction, term: string): Promise<Match> {
     const objs = await trx
@@ -69,7 +73,11 @@ export class StringInvertedIndex implements InvertedIndex {
 }
 
 export class IntegerInvertedIndex implements InvertedIndex {
-  constructor(readonly fieldKey: string) {}
+  constructor(
+    readonly fieldKey: string,
+    readonly index: boolean = true,
+    readonly store: boolean = true
+  ) {}
 
   async match(trx: DataStructROTransaction, term: string): Promise<Match> {
     const objs = await trx
@@ -118,7 +126,11 @@ export class IntegerInvertedIndex implements InvertedIndex {
 }
 
 export class BooleanInvertedIndex implements InvertedIndex {
-  constructor(readonly fieldKey: string) {}
+  constructor(
+    readonly fieldKey: string,
+    readonly index: boolean = true,
+    readonly store: boolean = true
+  ) {}
 
   // eslint-disable-next-line sonarjs/no-identical-functions
   async all(trx: DataStructROTransaction): Promise<Match> {
@@ -172,7 +184,11 @@ export class BooleanInvertedIndex implements InvertedIndex {
 }
 
 export class FullTextInvertedIndex implements InvertedIndex {
-  constructor(readonly fieldKey: string) {}
+  constructor(
+    readonly fieldKey: string,
+    readonly index: boolean = true,
+    readonly store: boolean = true
+  ) {}
 
   async match(trx: DataStructROTransaction, term: string): Promise<Match> {
     const queryTokens = new GeneralTokenizer().tokenize(term);

View File

@@ -31,13 +31,15 @@ export class DataStruct {
   constructor(schema: Schema) {
     for (const [key, type] of Object.entries(schema)) {
-      if (type === 'String') {
+      const typeInfo = typeof type === 'string' ? { type } : type;
+      if (typeInfo.type === 'String') {
         this.invertedIndex.set(key, new StringInvertedIndex(key));
-      } else if (type === 'Integer') {
+      } else if (typeInfo.type === 'Integer') {
         this.invertedIndex.set(key, new IntegerInvertedIndex(key));
-      } else if (type === 'FullText') {
+      } else if (typeInfo.type === 'FullText') {
         this.invertedIndex.set(key, new FullTextInvertedIndex(key));
-      } else if (type === 'Boolean') {
+      } else if (typeInfo.type === 'Boolean') {
         this.invertedIndex.set(key, new BooleanInvertedIndex(key));
       } else {
         throw new Error(`Field type '${type}' not supported`);

View File

@@ -1,6 +1,24 @@
 import type { FieldType } from './field-type';
 
-export type Schema = Record<string, FieldType>;
+export type Schema = Record<
+  string,
+  | FieldType
+  | {
+      type: FieldType;
+      /**
+       * If false, the field will not be indexed, and thus not searchable.
+       *
+       * default: true
+       */
+      index?: boolean;
+      /**
+       * If false, the field will not be stored and will not be included in
+       * the search result.
+       *
+       * default: true
+       */
+      store?: boolean;
+    }
+>;
 
 export function defineSchema<T extends Schema>(schema: T): T {
   return schema;

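With the widened type, a field can be declared either as a bare FieldType or as an object carrying per-field flags. A small sketch mixing both forms (the field names are illustrative):

```ts
import { defineSchema } from '@toeverything/infra';

const exampleSchema = defineSchema({
  title: 'FullText',                         // shorthand: indexed and stored
  tags: 'String',
  preview: { type: 'String', index: false }, // stored for display, not searchable
  raw: { type: 'String', store: false },     // searchable, not returned in results
});
```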
View File

@@ -1,6 +1,10 @@
-import { DocLinksService } from '@affine/core/modules/doc-link';
+import {
+  type Backlink,
+  DocLinksService,
+  type Link,
+} from '@affine/core/modules/doc-link';
 import { useI18n } from '@affine/i18n';
-import { useLiveData, useServices } from '@toeverything/infra';
+import { LiveData, useLiveData, useServices } from '@toeverything/infra';
 import { useCallback, useState } from 'react';
 
 import { AffinePageReference } from '../../affine/reference-link';
@@ -13,8 +17,12 @@ export const BiDirectionalLinkPanel = () => {
   });
   const t = useI18n();
-  const links = useLiveData(docLinksService.links.links$);
-  const backlinks = useLiveData(docLinksService.backlinks.backlinks$);
+  const links = useLiveData(
+    show ? docLinksService.links.links$ : new LiveData([] as Link[])
+  );
+  const backlinks = useLiveData(
+    show ? docLinksService.backlinks.backlinks$ : new LiveData([] as Backlink[])
+  );
 
   const handleClickShow = useCallback(() => {
     setShow(!show);
   }, [show]);

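The ternary is a lazy-subscription trick: while the panel is hidden, useLiveData observes a constant empty LiveData, so the links/backlinks queries only start once `show` flips to true. A reduced sketch of the pattern (component and prop names are illustrative):

```tsx
import { LiveData, useLiveData } from '@toeverything/infra';

function Panel({ show, links$ }: { show: boolean; links$: LiveData<string[]> }) {
  // While hidden, observe a throwaway constant instead of the real query;
  // the constant never emits again, so it costs nothing.
  const links = useLiveData(show ? links$ : new LiveData([] as string[]));
  return show ? (
    <ul>
      {links.map(link => (
        <li key={link}>{link}</li>
      ))}
    </ul>
  ) : null;
}
```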
View File

@@ -103,7 +103,10 @@ export const DocPropertiesTableHeader = ({
     EditorSettingService,
   });
   const docBacklinks = docLinksService.backlinks;
-  const backlinks = useLiveData(docBacklinks.backlinks$);
+  const backlinks = useMemo(
+    () => docBacklinks.backlinks$.value,
+    [docBacklinks]
+  );
 
   const displayDocInfo = useLiveData(
     editorSettingService.editorSetting.settings$.selector(s => s.displayDocInfo)

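Unlike useLiveData, which subscribes and re-renders on every emission, reading `backlinks$.value` inside useMemo takes a one-shot snapshot, so the header no longer re-renders when backlinks change. The two access patterns side by side (a sketch using the same `docBacklinks` as above, inside a component body):

```ts
import { useLiveData } from '@toeverything/infra';
import { useMemo } from 'react';

// Subscribes: the component re-renders whenever backlinks$ emits.
const live = useLiveData(docBacklinks.backlinks$);

// Snapshot: reads the current value once per dependency change; later
// backlink updates do not trigger a re-render.
const snapshot = useMemo(() => docBacklinks.backlinks$.value, [docBacklinks]);
```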
View File

@@ -36,7 +36,7 @@ export class DocsIndexer extends Entity {
   /**
    * increase this number to re-index all docs
    */
-  static INDEXER_VERSION = 2;
+  static INDEXER_VERSION = 5;
 
   private readonly jobQueue: JobQueue<IndexerJobPayload> =
     new IndexedDBJobQueue<IndexerJobPayload>(

View File

@@ -2,6 +2,9 @@ import { defineSchema } from '@toeverything/infra';
 
 export const docIndexSchema = defineSchema({
   title: 'FullText',
+  // summary of the doc, used for preview
+  summary: { type: 'String', index: false },
+  journal: 'String',
 });
 
 export type DocIndexSchema = typeof docIndexSchema;
@@ -15,9 +18,16 @@ export const blockIndexSchema = defineSchema({
   // reference doc id
   // ['xxx','yyy']
   refDocId: 'String',
-  // reference info
+  // reference info, used for backlink to specific block
   // [{"docId":"xxx","mode":"page","blockIds":["gt5Yfq1maYvgNgpi13rIq"]},{"docId":"yyy","mode":"edgeless","blockIds":["k5prpOlDF-9CzfatmO0W7"]}]
-  ref: 'String',
+  ref: { type: 'String', index: false },
+  // parent block flavour
+  parentFlavour: 'String',
+  // parent block id
+  parentBlockId: 'String',
+  // additional info
+  // { "databaseName": "xxx" }
+  additional: { type: 'String', index: false },
 });
 
 export type BlockIndexSchema = typeof blockIndexSchema;

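Judging by the crawler changes later in this commit, `additional` is written as a JSON string (e.g. `{"databaseName":"xxx"}` for database rows), so a consumer has to parse it back. A hypothetical read of one search hit; the hit shape here is assumed, not taken from this commit:

```ts
// Hypothetical: `hit` is one node of a search result over blockIndexSchema.
const raw: string | undefined = hit.fields.additional?.[0];
const databaseName = raw
  ? (JSON.parse(raw) as { databaseName?: string }).databaseName
  : undefined;
// parentFlavour / parentBlockId similarly locate the hit inside its parent block.
```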
View File

@@ -9,10 +9,10 @@ import {
   Array as YArray,
   Doc as YDoc,
   Map as YMap,
-  type Text as YText,
+  Text as YText,
 } from 'yjs';
 
-import type { BlockIndexSchema, docIndexSchema } from '../schema';
+import type { BlockIndexSchema, DocIndexSchema } from '../schema';
 import type {
   WorkerIngoingMessage,
   WorkerInput,
@@ -68,12 +68,6 @@ async function crawlingDocData({
     return {};
   }
 
-  const ydoc = new YDoc();
-
-  if (!isEmptyUpdate(docBuffer)) {
-    applyUpdate(ydoc, docBuffer);
-  }
-
   let docExists: boolean | null = null;
 
   (
@@ -89,23 +83,68 @@ async function crawlingDocData({
       deletedDoc: [docId],
     };
   } else {
+    const ydoc = new YDoc();
+    let docTitle = '';
+    let summaryLenNeeded = 1000;
+    let summary = '';
+    const blockDocuments: Document<BlockIndexSchema>[] = [];
+
+    if (!isEmptyUpdate(docBuffer)) {
+      applyUpdate(ydoc, docBuffer);
+    }
+
     const blocks = ydoc.getMap<any>('blocks');
 
     if (blocks.size === 0) {
-      return {};
+      return { deletedDoc: [docId] };
     }
 
-    let docTitle = '';
-    const blockDocuments: Document<BlockIndexSchema>[] = [];
+    let rootBlockId: string | null = null;
 
     for (const block of blocks.values()) {
       const flavour = block.get('sys:flavour')?.toString();
       const blockId = block.get('sys:id')?.toString();
-      if (!flavour || !blockId) {
-        continue;
+      if (flavour === 'affine:page' && blockId) {
+        rootBlockId = blockId;
       }
+    }
+
+    if (!rootBlockId) {
+      return { deletedDoc: [docId] };
+    }
+
+    const queue: { parent?: string; id: string }[] = [{ id: rootBlockId }];
+    const visited = new Set<string>(); // avoid loop
+
+    const pushChildren = (id: string, block: YMap<any>) => {
+      const children = block.get('sys:children');
+      if (children instanceof YArray && children.length) {
+        for (let i = children.length - 1; i >= 0; i--) {
+          const childId = children.get(i);
+          if (childId && !visited.has(childId)) {
+            queue.push({ parent: id, id: childId });
+            visited.add(childId);
+          }
+        }
+      }
+    };
+
+    while (queue.length) {
+      const next = queue.pop();
+      if (!next) {
+        break;
+      }
+      const { parent: parentBlockId, id: blockId } = next;
+      const block = blockId ? blocks.get(blockId) : null;
+      const parentBlock = parentBlockId ? blocks.get(parentBlockId) : null;
+      if (!block) {
+        break;
+      }
+
+      const flavour = block.get('sys:flavour')?.toString();
+      const parentFlavour = parentBlock?.get('sys:flavour')?.toString();
+
+      pushChildren(blockId, block);
 
       if (flavour === 'affine:page') {
         docTitle = block.get('prop:title').toString();
@@ -150,6 +189,11 @@ async function crawlingDocData({
             .filter(ref => !!ref)
         );
 
+        const databaseName =
+          flavour === 'affine:paragraph' && parentFlavour === 'affine:database' // if block is a database row
+            ? parentBlock?.get('prop:title')?.toString()
+            : undefined;
+
         blockDocuments.push(
           Document.from<BlockIndexSchema>(`${docId}:${blockId}`, {
             docId,
@@ -164,8 +208,18 @@ async function crawlingDocData({
               },
               { refDocId: [], ref: [] }
             ),
+            parentFlavour,
+            parentBlockId,
+            additional: databaseName
+              ? JSON.stringify({ databaseName })
+              : undefined,
           })
         );
+
+        if (summaryLenNeeded > 0) {
+          summary += text.toString();
+          summaryLenNeeded -= text.length;
+        }
       }
 
       if (
@@ -183,6 +237,8 @@ async function crawlingDocData({
             blockId,
             refDocId: [pageId],
             ref: [JSON.stringify({ docId: pageId, ...params })],
+            parentFlavour,
+            parentBlockId,
           })
         );
       }
@@ -197,6 +253,8 @@ async function crawlingDocData({
             flavour,
             blockId,
             blob: [blobId],
+            parentFlavour,
+            parentBlockId,
           })
         );
       }
@@ -237,6 +295,8 @@ async function crawlingDocData({
             flavour,
             blockId,
             content: texts,
+            parentFlavour,
+            parentBlockId,
           })
         );
       }
@@ -244,32 +304,35 @@ async function crawlingDocData({
       if (flavour === 'affine:database') {
         const texts = [];
         const columnsObj = block.get('prop:columns');
-        if (!(columnsObj instanceof YArray)) {
-          continue;
+        const databaseTitle = block.get('prop:title');
+        if (databaseTitle instanceof YText) {
+          texts.push(databaseTitle.toString());
         }
-        for (const column of columnsObj) {
-          if (!(column instanceof YMap)) {
-            continue;
-          }
-          if (typeof column.get('name') === 'string') {
-            texts.push(column.get('name'));
-          }
-          const data = column.get('data');
-          if (!(data instanceof YMap)) {
-            continue;
-          }
-          const options = data.get('options');
-          if (!(options instanceof YArray)) {
-            continue;
-          }
-          for (const option of options) {
-            if (!(option instanceof YMap)) {
-              continue;
-            }
-            const value = option.get('value');
-            if (typeof value === 'string') {
-              texts.push(value);
-            }
-          }
-        }
+        if (columnsObj instanceof YArray) {
+          for (const column of columnsObj) {
+            if (!(column instanceof YMap)) {
+              continue;
+            }
+            if (typeof column.get('name') === 'string') {
+              texts.push(column.get('name'));
+            }
+            const data = column.get('data');
+            if (!(data instanceof YMap)) {
+              continue;
+            }
+            const options = data.get('options');
+            if (!(options instanceof YArray)) {
+              continue;
+            }
+            for (const option of options) {
+              if (!(option instanceof YMap)) {
+                continue;
+              }
+              const value = option.get('value');
+              if (typeof value === 'string') {
+                texts.push(value);
+              }
+            }
+          }
+        }
@@ -289,8 +352,9 @@ async function crawlingDocData({
       addedDoc: [
         {
           id: docId,
-          doc: Document.from<typeof docIndexSchema>(docId, {
+          doc: Document.from<DocIndexSchema>(docId, {
             title: docTitle,
+            summary,
           }),
           blocks: blockDocuments,
         },

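The queue/pushChildren machinery above is a depth-first walk over the block tree, starting from the affine:page root, carrying each block's parent id (for parentFlavour/parentBlockId) and guarding against cycles with a visited set. Stripped of the Yjs types, the traversal is roughly:

```ts
type Block = { id: string; children: string[] };

// Simplified sketch of the walk; visit(block, parentId) stands in for the
// per-block indexing done in the worker above.
function walk(
  blocks: Map<string, Block>,
  rootId: string,
  visit: (block: Block, parentId?: string) => void
) {
  const queue: { parent?: string; id: string }[] = [{ id: rootId }];
  const visited = new Set<string>(); // avoid loops in malformed documents

  while (queue.length) {
    const { parent, id } = queue.pop()!;
    const block = blocks.get(id);
    if (!block) continue;

    visit(block, parent);

    // Push children in reverse so they are popped in document order.
    for (let i = block.children.length - 1; i >= 0; i--) {
      const childId = block.children[i];
      if (!visited.has(childId)) {
        visited.add(childId);
        queue.push({ parent: id, id: childId });
      }
    }
  }
}
```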
View File

@@ -22,6 +22,7 @@ import {
   useServices,
 } from '@toeverything/infra';
 import { useCallback, useLayoutEffect, useMemo, useState } from 'react';
+import { NEVER } from 'rxjs';
 
 import { ExplorerTreeNode, type ExplorerTreeNodeDropEffect } from '../../tree';
 import type { GenericExplorerNode } from '../types';
@@ -82,10 +83,15 @@ export const ExplorerDocNode = ({
   const children = useLiveData(
     useMemo(
-      () => LiveData.from(docsSearchService.watchRefsFrom(docId), null),
-      [docsSearchService, docId]
+      () =>
+        LiveData.from(
+          !collapsed ? docsSearchService.watchRefsFrom(docId) : NEVER,
+          null
+        ),
+      [docsSearchService, docId, collapsed]
     )
   );
+  const searching = children === null;
 
   const indexerLoading = useLiveData(
     docsSearchService.indexer.status$.map(
@@ -231,7 +237,9 @@ export const ExplorerDocNode = ({
         }
         reorderable={reorderable}
         onRename={handleRename}
-        childrenPlaceholder={<Empty onDrop={handleDropOnPlaceholder} />}
+        childrenPlaceholder={
+          searching ? null : <Empty onDrop={handleDropOnPlaceholder} />
+        }
         operations={finalOperations}
         dropEffect={handleDropEffectOnDoc}
         data-testid={`explorer-doc-${docId}`}
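This is the same lazy-loading idea as in the backlinks panel: a collapsed node subscribes to rxjs NEVER, so watchRefsFrom only runs for expanded nodes, and `children` stays at the initial null (rendered as a searching state rather than an empty placeholder). A reduced sketch; everything except LiveData and NEVER is illustrative:

```ts
import { LiveData } from '@toeverything/infra';
import { NEVER, type Observable } from 'rxjs';

function childrenSource(
  collapsed: boolean,
  watchRefs: () => Observable<string[]>
): LiveData<string[] | null> {
  // NEVER emits nothing and never completes, so a collapsed node keeps
  // the initial `null` and never starts the underlying search query.
  return LiveData.from(!collapsed ? watchRefs() : NEVER, null);
}
```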