feat(nbstore): add indexer storage (#10953)

This commit is contained in:
EYHN
2025-03-31 12:59:51 +00:00
parent c9e14ac0db
commit 8957d0645f
82 changed files with 3393 additions and 4753 deletions

View File

@@ -5,5 +5,4 @@ export * from './livedata';
export * from './media';
export * from './orm';
export * from './storage';
export * from './sync';
export * from './utils';

View File

@@ -1,8 +0,0 @@
export * from './indexer';
export {
IndexedDBIndex,
IndexedDBIndexStorage,
} from './indexer/impl/indexeddb';
export { MemoryIndex, MemoryIndexStorage } from './indexer/impl/memory';
export * from './job';
export { IndexedDBJobQueue } from './job/impl/indexeddb';

View File

@@ -1,147 +0,0 @@
# index
Search engine abstraction layer for AFFiNE.
## Usage
1. Define schema
First, define the shape of the data. The following field types are currently supported:
- `Integer`
- `Boolean`
- `FullText`: for full-text search; the value will be tokenized and stemmed.
- `String`: for exact-match search, e.g. tags, ids.
```typescript
const schema = defineSchema({
title: 'FullText',
tag: 'String',
size: 'Integer',
});
```
> **Array type**
> All types can contain one or more values, so each field can store an array.
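In addition to the plain string form above, the IndexedDB backend also understands an object form per field with optional `index` and `store` flags (see the schema handling in its `DataStruct`). A hedged sketch, assuming `defineSchema` accepts that object form and using a hypothetical `rawPayload` field:
```typescript
const schema = defineSchema({
  title: 'FullText',
  tag: 'String',
  size: 'Integer',
  // Sketch only: `index: false` skips building an inverted index for this field,
  // `store: false` skips persisting the raw value in the record store.
  rawPayload: { type: 'String', index: false, store: false },
});
```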
2. Pick a backend
Two backends are currently available (see the instantiation sketch after the note below):
- `MemoryIndex`: in-memory indexer, useful for testing.
- `IndexedDBIndex`: persistent indexer backed by IndexedDB.
> **Underlying Data Table**
> Some backends need to maintain underlying data tables, including table creation and migration. This happens silently the first time the indexer is invoked;
> callers do not need to worry about these details.
>
> This design follows the usual conventions of search engine APIs, such as Elasticsearch: https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html
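Both backends share the same `Index` interface, so switching between them is a constructor change. A minimal sketch (import paths are illustrative; the database name argument of `IndexedDBIndex` defaults to `'indexer'`, and `'my-workspace-indexer'` is a made-up name):
```typescript
import { IndexedDBIndex } from './indexer/impl/indexeddb';
import { MemoryIndex } from './indexer/impl/memory';

// In-memory backend: nothing is persisted, handy for unit tests.
const memoryIndex = new MemoryIndex(schema);

// IndexedDB backend: persists to the named database ('indexer' by default).
const idbIndex = new IndexedDBIndex(schema, 'my-workspace-indexer');
```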
3. Write data
To write data to the indexer, first open a write transaction with `await index.write()`, then complete the batch write with `await writer.commit()`.
> **Transactional**
> Typically, the indexer does not provide transactional guarantees; reliable locking logic needs to be implemented at a higher level.
```typescript
const index = new IndexedDBIndex(schema);
const writer = await index.write();
writer.insert(
Document.from('id', {
title: 'hello world',
tag: ['doc', 'page'],
size: '100',
})
);
await writer.commit();
```
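The writer also exposes `put` (delete-then-insert, i.e. replace by id) and `delete`, so updates and removals go through the same batch write. A sketch with illustrative field values and a made-up `'obsolete-id'`:
```typescript
const writer = await index.write();
// `put` replaces any existing document with the same id (delete + insert).
writer.put(
  Document.from('id', {
    title: 'hello world (edited)',
    tag: ['doc'],
    size: '120',
  })
);
// `delete` removes a document by id.
writer.delete('obsolete-id');
await writer.commit();
```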
4. Search data
To search for content in the indexer, you use a dedicated **query language**. Here are some examples:
```typescript
// match title == 'hello world'
{
type: 'match',
field: 'title',
match: 'hello world',
}
// match title == 'hello world' && tag == 'doc'
{
type: 'boolean',
occur: 'must',
queries: [
{
type: 'match',
field: 'title',
match: 'hello world',
},
{
type: 'match',
field: 'tag',
match: 'doc',
},
],
}
```
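Besides `match` and `boolean` with `occur: 'must'`, the query handlers in both backends also accept `all`, `exists`, `boost`, and the `must_not`/`should` occurs. A few more hedged examples in the same style:
```typescript
// match every document
{
  type: 'all',
}
// match documents where the `title` field has any value
{
  type: 'exists',
  field: 'title',
}
// boost the score of a sub-query by a factor
{
  type: 'boost',
  query: { type: 'match', field: 'tag', match: 'doc' },
  boost: 2,
}
// match everything except title == 'hello world'
{
  type: 'boolean',
  occur: 'must_not',
  queries: [
    { type: 'match', field: 'title', match: 'hello world' },
  ],
}
```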
There are two ways to query the index: `index.search()` and `index.aggregate()`.
- **search**: returns each matched node along with pagination information.
- **aggregate**: groups all matched results into buckets by a given field and returns the count and score of the items in each bucket.
Examples:
```typescript
const result = await index.search({
type: 'match',
field: 'title',
match: 'hello world',
});
// result = {
// nodes: [
// {
// id: '1',
// score: 1,
// },
// ],
// pagination: {
// count: 1,
// hasMore: false,
// limit: 10,
// skip: 0,
// },
// }
```
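`search()` also takes an options object with `pagination`, `fields` (return stored field values) and `highlights` (wrap matched ranges with the given `before`/`end` markers), as exercised in the tests. A sketch:
```typescript
const result = await index.search(
  { type: 'match', field: 'title', match: 'hello' },
  {
    pagination: { skip: 0, limit: 10 },
    fields: ['title', 'tag'],
    highlights: [{ field: 'title', before: '<b>', end: '</b>' }],
  }
);
// each node then carries `fields` and `highlights`, e.g.
// { id: '1', score: 1,
//   fields: { title: 'hello world', tag: ['doc', 'page'] },
//   highlights: { title: ['<b>hello</b> world'] } }
```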
```typescript
const result = await index.aggregate(
{
type: 'match',
field: 'title',
match: 'affine',
},
'tag'
);
// result = {
// buckets: [
// { key: 'motorcycle', count: 2, score: 1 },
// { key: 'bike', count: 1, score: 1 },
// { key: 'airplane', count: 1, score: 1 },
// ],
// pagination: {
// count: 3,
// hasMore: false,
// limit: 10,
// skip: 0,
// },
// }
```
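Both backends also expose reactive variants, `search$()` and `aggregate$()`, which re-run the query whenever the index changes (the IndexedDB backend listens on a `BroadcastChannel`). A minimal sketch:
```typescript
const subscription = index
  .search$({ type: 'match', field: 'title', match: 'hello world' })
  .subscribe(result => {
    // re-emitted whenever documents are written or deleted
    console.log(result.nodes, result.pagination);
  });

// later, when the results are no longer needed
subscription.unsubscribe();
```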
More usage examples:
[black-box.spec.ts](./__tests__/black-box.spec.ts)

View File

@@ -1,560 +0,0 @@
/**
* @vitest-environment happy-dom
*/
import 'fake-indexeddb/auto';
import { map } from 'rxjs';
import { beforeEach, describe, expect, test, vitest } from 'vitest';
import { defineSchema, Document, type Index } from '..';
import { IndexedDBIndex } from '../impl/indexeddb';
import { MemoryIndex } from '../impl/memory';
const schema = defineSchema({
title: 'FullText',
tag: 'String',
size: 'Integer',
});
let index: Index<typeof schema> = null!;
describe.each([
{ name: 'memory', backend: MemoryIndex },
{ name: 'idb', backend: IndexedDBIndex },
])('index tests($name)', ({ backend }) => {
async function writeData(
data: Record<
string,
Partial<Record<keyof typeof schema, string | string[]>>
>
) {
const writer = await index.write();
for (const [id, item] of Object.entries(data)) {
const doc = new Document(id);
for (const [key, value] of Object.entries(item)) {
if (Array.isArray(value)) {
for (const v of value) {
doc.insert(key, v);
}
} else {
doc.insert(key, value);
}
}
writer.insert(doc);
}
await writer.commit();
}
beforeEach(async () => {
index = new backend(schema);
await index.clear();
});
test('basic', async () => {
await writeData({
'1': {
title: 'hello world',
},
});
const result = await index.search({
type: 'match',
field: 'title',
match: 'hello world',
});
expect(result).toEqual({
nodes: [
{
id: '1',
score: expect.anything(),
},
],
pagination: {
count: 1,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test('basic integer', async () => {
await writeData({
'1': {
title: 'hello world',
size: '100',
},
});
const result = await index.search({
type: 'match',
field: 'size',
match: '100',
});
expect(result).toEqual({
nodes: [
{
id: '1',
score: expect.anything(),
},
],
pagination: {
count: 1,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test('fuzz', async () => {
await writeData({
'1': {
title: 'hello world',
},
});
const result = await index.search({
type: 'match',
field: 'title',
match: 'hell',
});
expect(result).toEqual({
nodes: [
{
id: '1',
score: expect.anything(),
},
],
pagination: {
count: 1,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test('highlight', async () => {
await writeData({
'1': {
title: 'hello world',
size: '100',
},
});
const result = await index.search(
{
type: 'match',
field: 'title',
match: 'hello',
},
{
highlights: [
{
field: 'title',
before: '<b>',
end: '</b>',
},
],
}
);
expect(result).toEqual({
nodes: expect.arrayContaining([
{
id: '1',
score: expect.anything(),
highlights: {
title: [expect.stringContaining('<b>hello</b>')],
},
},
]),
pagination: {
count: 1,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test('fields', async () => {
await writeData({
'1': {
title: 'hello world',
tag: ['car', 'bike'],
},
});
const result = await index.search(
{
type: 'match',
field: 'title',
match: 'hello',
},
{
fields: ['title', 'tag'],
}
);
expect(result.nodes[0].fields).toEqual({
title: 'hello world',
tag: expect.arrayContaining(['bike', 'car']),
});
});
test('pagination', async () => {
await writeData(
Array.from({ length: 100 }).reduce((acc: any, _, i) => {
acc['apple' + i] = {
tag: ['apple'],
};
return acc;
}, {}) as any
);
const result = await index.search(
{
type: 'match',
field: 'tag',
match: 'apple',
},
{
pagination: {
skip: 0,
limit: 10,
},
}
);
expect(result).toEqual({
nodes: expect.arrayContaining(
Array.from({ length: 10 }).fill({
id: expect.stringContaining('apple'),
score: expect.anything(),
})
),
pagination: {
count: 100,
hasMore: true,
limit: 10,
skip: 0,
},
});
const result2 = await index.search(
{
type: 'match',
field: 'tag',
match: 'apple',
},
{
pagination: {
skip: 10,
limit: 10,
},
}
);
expect(result2).toEqual({
nodes: expect.arrayContaining(
Array.from({ length: 10 }).fill({
id: expect.stringContaining('apple'),
score: expect.anything(),
})
),
pagination: {
count: 100,
hasMore: true,
limit: 10,
skip: 10,
},
});
});
test('aggr', async () => {
await writeData({
'1': {
title: 'hello world',
tag: ['car', 'bike'],
},
affine1: {
title: 'affine',
tag: ['motorcycle', 'bike'],
},
affine2: {
title: 'affine',
tag: ['motorcycle', 'airplane'],
},
});
const result = await index.aggregate(
{
type: 'match',
field: 'title',
match: 'affine',
},
'tag'
);
expect(result).toEqual({
buckets: expect.arrayContaining([
{ key: 'motorcycle', count: 2, score: expect.anything() },
{ key: 'bike', count: 1, score: expect.anything() },
{ key: 'airplane', count: 1, score: expect.anything() },
]),
pagination: {
count: 3,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test('hits', async () => {
await writeData(
Array.from({ length: 100 }).reduce((acc: any, _, i) => {
acc['apple' + i] = {
title: 'apple',
tag: ['apple', 'fruit'],
};
return acc;
}, {}) as any
);
const result = await index.aggregate(
{
type: 'match',
field: 'title',
match: 'apple',
},
'tag',
{
hits: {
pagination: {
skip: 0,
limit: 5,
},
highlights: [
{
field: 'title',
before: '<b>',
end: '</b>',
},
],
fields: ['title', 'tag'],
},
}
);
expect(result).toEqual({
buckets: expect.arrayContaining([
{
key: 'apple',
count: 100,
score: expect.anything(),
hits: {
pagination: {
count: 100,
hasMore: true,
limit: 5,
skip: 0,
},
nodes: expect.arrayContaining(
Array.from({ length: 5 }).fill({
id: expect.stringContaining('apple'),
score: expect.anything(),
highlights: {
title: [expect.stringContaining('<b>apple</b>')],
},
fields: {
title: expect.stringContaining('apple'),
tag: expect.arrayContaining(['apple', 'fruit']),
},
})
),
},
},
{
key: 'fruit',
count: 100,
score: expect.anything(),
hits: {
pagination: {
count: 100,
hasMore: true,
limit: 5,
skip: 0,
},
nodes: expect.arrayContaining(
Array.from({ length: 5 }).fill({
id: expect.stringContaining('apple'),
score: expect.anything(),
highlights: {
title: [expect.stringContaining('<b>apple</b>')],
},
fields: {
title: expect.stringContaining('apple'),
tag: expect.arrayContaining(['apple', 'fruit']),
},
})
),
},
},
]),
pagination: {
count: 2,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test('exists', async () => {
await writeData({
'1': {
title: 'hello world',
tag: '111',
},
'2': {
tag: '222',
},
'3': {
title: 'hello world',
tag: '333',
},
});
const result = await index.search({
type: 'exists',
field: 'title',
});
expect(result).toEqual({
nodes: expect.arrayContaining([
{
id: '1',
score: expect.anything(),
},
{
id: '3',
score: expect.anything(),
},
]),
pagination: {
count: 2,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
});
test(
'subscribe',
{
timeout: 30000,
},
async () => {
await writeData({
'1': {
title: 'hello world',
},
});
let value = null as any;
index
.search$({
type: 'match',
field: 'title',
match: 'hello world',
})
.pipe(map(v => (value = v)))
.subscribe();
await vitest.waitFor(
() => {
expect(value).toEqual({
nodes: [
{
id: '1',
score: expect.anything(),
},
],
pagination: {
count: 1,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
},
{
timeout: 10000,
}
);
await writeData({
'2': {
title: 'hello world',
},
});
await vitest.waitFor(
() => {
expect(value).toEqual({
nodes: [
{
id: '1',
score: expect.anything(),
},
{
id: '2',
score: expect.anything(),
},
],
pagination: {
count: 2,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
},
{
timeout: 10000,
}
);
const writer = await index.write();
writer.delete('1');
await writer.commit();
await vitest.waitFor(
() => {
expect(value).toEqual({
nodes: [
{
id: '2',
score: expect.anything(),
},
],
pagination: {
count: 1,
hasMore: false,
limit: expect.anything(),
skip: 0,
},
});
},
{
timeout: 10000,
}
);
}
);
});

View File

@@ -1,51 +0,0 @@
import type { Schema } from './schema';
export class Document<S extends Schema = any> {
constructor(public readonly id: string) {}
fields = new Map<keyof S, string[]>();
public insert<F extends keyof S>(field: F, value: string | string[]) {
const values = this.fields.get(field) ?? [];
if (Array.isArray(value)) {
values.push(...value);
} else {
values.push(value);
}
this.fields.set(field, values);
}
get<F extends keyof S>(field: F): string[] | string | undefined {
const values = this.fields.get(field);
if (values === undefined) {
return undefined;
} else if (values.length === 1) {
return values[0];
} else {
return values;
}
}
static from<S extends Schema>(
id: string,
map:
| Partial<Record<keyof S, string | string[]>>
| Map<keyof S, string | string[]>
): Document<S> {
const doc = new Document(id);
if (map instanceof Map) {
for (const [key, value] of map) {
doc.insert(key, value);
}
} else {
for (const key in map) {
if (map[key] === undefined || map[key] === null) {
continue;
}
doc.insert(key, map[key]);
}
}
return doc;
}
}

View File

@@ -1 +0,0 @@
export type FieldType = 'Integer' | 'FullText' | 'String' | 'Boolean';

View File

@@ -1,10 +0,0 @@
import { expect, test } from 'vitest';
import { bm25 } from '../bm25';
test('bm25', () => {
expect(bm25(1, 1, 10, 10, 15)).toEqual(3.2792079793859643);
expect(bm25(2, 1, 10, 10, 15) > bm25(1, 1, 10, 10, 15)).toBeTruthy();
expect(bm25(1, 1, 10, 10, 15) > bm25(2, 1, 10, 100, 15)).toBeTruthy();
expect(bm25(1, 1, 10, 10, 15) > bm25(1, 1, 10, 100, 15)).toBeTruthy();
});

View File

@@ -1,32 +0,0 @@
import { expect, test } from 'vitest';
import { highlighter } from '../highlighter';
test('highlighter', () => {
expect(highlighter('0123456789', '<b>', '</b>', [[3, 5]])).toEqual(
'012<b>34</b>56789'
);
expect(
highlighter(
'012345678901234567890123456789012345678901234567890123456789',
'<b>',
'</b>',
[[59, 60]]
)
).toEqual('...0123456789012345678901234567890123456789012345678<b>9</b>');
expect(
highlighter(
'012345678901234567890123456789012345678901234567890123456789',
'<b>',
'</b>',
[
[10, 11],
[49, 51],
]
)
).toEqual(
'0123456789<b>0</b>12345678901234567890123456789012345678<b>9</b>...'
);
});

View File

@@ -1,128 +0,0 @@
import { expect, test } from 'vitest';
import { GeneralTokenizer } from '../tokenizer';
test('tokenizer', () => {
{
const tokens = new GeneralTokenizer().tokenize('hello world,\n AFFiNE');
expect(tokens).toEqual([
{ term: 'hello', start: 0, end: 5 },
{ term: 'world', start: 7, end: 12 },
{ term: 'affine', start: 15, end: 21 },
]);
}
{
const tokens = new GeneralTokenizer().tokenize('你好世界,阿芬');
expect(tokens).toEqual([
{
end: 2,
start: 0,
term: '你好',
},
{
end: 3,
start: 1,
term: '好世',
},
{
end: 4,
start: 2,
term: '世界',
},
{
end: 7,
start: 5,
term: '阿芬',
},
]);
}
{
const tokens = new GeneralTokenizer().tokenize('1阿2芬');
expect(tokens).toEqual([
{ term: '1', start: 0, end: 1 },
{ term: '阿', start: 1, end: 2 },
{ term: '2', start: 2, end: 3 },
{ term: '芬', start: 3, end: 4 },
]);
}
{
const tokens = new GeneralTokenizer().tokenize('안녕하세요 세계');
expect(tokens).toEqual([
{
end: 2,
start: 0,
term: '안녕',
},
{
end: 3,
start: 1,
term: '녕하',
},
{
end: 4,
start: 2,
term: '하세',
},
{
end: 5,
start: 3,
term: '세요',
},
{
end: 8,
start: 6,
term: '세계',
},
]);
}
{
const tokens = new GeneralTokenizer().tokenize('ハローワールド');
expect(tokens).toEqual([
{ term: 'ハロ', start: 0, end: 2 },
{ term: 'ロー', start: 1, end: 3 },
{ term: 'ーワ', start: 2, end: 4 },
{ term: 'ワー', start: 3, end: 5 },
{ term: 'ール', start: 4, end: 6 },
{ term: 'ルド', start: 5, end: 7 },
]);
}
{
const tokens = new GeneralTokenizer().tokenize('はろーわーるど');
expect(tokens).toEqual([
{ term: 'はろ', start: 0, end: 2 },
{ term: 'ろー', start: 1, end: 3 },
{ term: 'ーわ', start: 2, end: 4 },
{ term: 'わー', start: 3, end: 5 },
{ term: 'ーる', start: 4, end: 6 },
{ term: 'るど', start: 5, end: 7 },
]);
}
{
const tokens = new GeneralTokenizer().tokenize('👋1⃣🚪👋🏿');
expect(tokens).toEqual([
{ term: '👋', start: 0, end: 2 },
{ term: '1⃣', start: 2, end: 5 },
{ term: '🚪', start: 5, end: 7 },
{ term: '👋🏿', start: 7, end: 11 },
]);
}
{
const tokens = new GeneralTokenizer().tokenize('1');
expect(tokens).toEqual([{ term: '1', start: 0, end: 2 }]);
}
});

View File

@@ -1,62 +0,0 @@
/**
* Parameters of the BM25+ scoring algorithm. Customizing these is almost never
* necessary, and finetuning them requires an understanding of the BM25 scoring
* model.
*
* Some information about BM25 (and BM25+) can be found at these links:
*
* - https://en.wikipedia.org/wiki/Okapi_BM25
* - https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
*/
export type BM25Params = {
/** Term frequency saturation point.
*
* Recommended values are between `1.2` and `2`. Higher values increase the
* difference in score between documents with higher and lower term
* frequencies. Setting this to `0` or a negative value is invalid. Defaults
* to `1.2`.
*/
k: number;
/**
* Length normalization impact.
*
* Recommended values are around `0.75`. Higher values increase the weight
* that field length has on scoring. Setting this to `0` (not recommended)
* means that the field length has no effect on scoring. Negative values are
* invalid. Defaults to `0.7`.
*/
b: number;
/**
* BM25+ frequency normalization lower bound (usually called δ).
*
* Recommended values are between `0.5` and `1`. Increasing this parameter
* increases the minimum relevance of one occurrence of a search term
* regardless of its (possibly very long) field length. Negative values are
* invalid. Defaults to `0.5`.
*/
d: number;
};
const defaultBM25params: BM25Params = { k: 1.2, b: 0.7, d: 0.5 };
export const bm25 = (
termFreq: number,
matchingCount: number,
totalCount: number,
fieldLength: number,
avgFieldLength: number,
bm25params: BM25Params = defaultBM25params
): number => {
const { k, b, d } = bm25params;
const invDocFreq = Math.log(
1 + (totalCount - matchingCount + 0.5) / (matchingCount + 0.5)
);
return (
invDocFreq *
(d +
(termFreq * (k + 1)) /
(termFreq + k * (1 - b + b * (fieldLength / avgFieldLength))))
);
};

View File

@@ -1,551 +0,0 @@
import { DebugLogger } from '@affine/debug';
import {
type DBSchema,
type IDBPDatabase,
type IDBPTransaction,
openDB,
type StoreNames,
} from 'idb';
import {
type AggregateOptions,
type AggregateResult,
Document,
type Query,
type Schema,
type SearchOptions,
type SearchResult,
} from '../../';
import { highlighter } from './highlighter';
import {
BooleanInvertedIndex,
FullTextInvertedIndex,
IntegerInvertedIndex,
type InvertedIndex,
StringInvertedIndex,
} from './inverted-index';
import { Match } from './match';
const logger = new DebugLogger('indexeddb');
export interface IndexDB extends DBSchema {
kvMetadata: {
key: string;
value: {
key: string;
value: any;
};
};
records: {
key: number;
value: {
id: string;
data: Map<string, string[]>;
};
indexes: { id: string };
};
invertedIndex: {
key: number;
value: {
nid: number;
pos?: {
i: number /* index */;
l: number /* length */;
rs: [number, number][] /* ranges: [start, end] */;
};
key: ArrayBuffer;
};
indexes: { key: ArrayBuffer; nid: number };
};
}
export type DataStructRWTransaction = IDBPTransaction<
IndexDB,
ArrayLike<StoreNames<IndexDB>>,
'readwrite'
>;
export type DataStructROTransaction = IDBPTransaction<
IndexDB,
ArrayLike<StoreNames<IndexDB>>,
'readonly' | 'readwrite'
>;
export class DataStruct {
private initializePromise: Promise<void> | null = null;
database: IDBPDatabase<IndexDB> = null as any;
invertedIndex = new Map<string, InvertedIndex>();
constructor(
readonly databaseName: string,
readonly schema: Schema
) {
for (const [key, type] of Object.entries(schema)) {
const typeInfo = typeof type === 'string' ? { type } : type;
if (typeInfo.index === false) {
// If index is false, we don't need to create an inverted index for this field.
continue;
}
if (typeInfo.type === 'String') {
this.invertedIndex.set(key, new StringInvertedIndex(key));
} else if (typeInfo.type === 'Integer') {
this.invertedIndex.set(key, new IntegerInvertedIndex(key));
} else if (typeInfo.type === 'FullText') {
this.invertedIndex.set(key, new FullTextInvertedIndex(key));
} else if (typeInfo.type === 'Boolean') {
this.invertedIndex.set(key, new BooleanInvertedIndex(key));
} else {
throw new Error(`Field type '${type}' not supported`);
}
}
}
private async insert(trx: DataStructRWTransaction, document: Document) {
const exists = await trx
.objectStore('records')
.index('id')
.get(document.id);
if (exists) {
throw new Error('Document already exists');
}
const dataMap = new Map();
for (const [key, values] of document.fields) {
const type = this.schema[key as string];
if (!type) {
continue;
}
const typeInfo = typeof type === 'string' ? { type } : type;
if (typeInfo.store !== false) {
// If store is false, the field will not be stored
dataMap.set(key, values);
}
}
const nid = await trx.objectStore('records').put({
id: document.id,
data: dataMap,
});
for (const [key, values] of document.fields) {
const iidx = this.invertedIndex.get(key as string);
if (!iidx) {
continue;
}
await iidx.insert(trx, nid, values);
}
}
private async delete(trx: DataStructRWTransaction, id: string) {
const nid = await trx.objectStore('records').index('id').getKey(id);
if (nid) {
await trx.objectStore('records').delete(nid);
} else {
return;
}
const indexIds = await trx
.objectStore('invertedIndex')
.index('nid')
.getAllKeys(nid);
for (const indexId of indexIds) {
await trx.objectStore('invertedIndex').delete(indexId);
}
}
async batchWrite(
trx: DataStructRWTransaction,
deletes: string[],
inserts: Document[]
) {
const startTime = performance.now();
try {
for (const del of deletes) {
await this.delete(trx, del);
}
for (const inst of inserts) {
await this.insert(trx, inst);
}
} finally {
const endTime = performance.now();
if (BUILD_CONFIG.debug) {
performance.measure(
`[IndexedDB Indexer] Batch Write (${this.databaseName})`,
{
start: startTime,
end: endTime,
}
);
}
logger.debug(
`[indexer ${this.databaseName}] batchWrite`,
endTime - startTime,
'ms'
);
}
}
async matchAll(trx: DataStructROTransaction): Promise<Match> {
const allNids = await trx.objectStore('records').getAllKeys();
const match = new Match();
for (const nid of allNids) {
match.addScore(nid, 1);
}
return match;
}
private async queryRaw(
trx: DataStructROTransaction,
query: Query<any>
): Promise<Match> {
if (query.type === 'match') {
const iidx = this.invertedIndex.get(query.field as string);
if (!iidx) {
return new Match();
}
return await iidx.match(trx, query.match);
} else if (query.type === 'boolean') {
const weights = [];
for (const q of query.queries) {
weights.push(await this.queryRaw(trx, q));
}
if (query.occur === 'must') {
return weights.reduce((acc, w) => acc.and(w));
} else if (query.occur === 'must_not') {
const total = weights.reduce((acc, w) => acc.and(w));
return (await this.matchAll(trx)).exclude(total);
} else if (query.occur === 'should') {
return weights.reduce((acc, w) => acc.or(w));
}
} else if (query.type === 'all') {
return await this.matchAll(trx);
} else if (query.type === 'boost') {
return (await this.queryRaw(trx, query.query)).boost(query.boost);
} else if (query.type === 'exists') {
const iidx = this.invertedIndex.get(query.field as string);
if (!iidx) {
return new Match();
}
return await iidx.all(trx);
}
throw new Error(`Query type '${query.type}' not supported`);
}
async clear(trx: DataStructRWTransaction) {
await trx.objectStore('records').clear();
await trx.objectStore('invertedIndex').clear();
await trx.objectStore('kvMetadata').clear();
}
async search(
trx: DataStructROTransaction,
query: Query<any>,
options: SearchOptions<any>
): Promise<SearchResult<any, any>> {
const startTime = performance.now();
try {
const pagination = {
skip: options.pagination?.skip ?? 0,
limit: options.pagination?.limit ?? 100,
};
const match = await this.queryRaw(trx, query);
const nids = match
.toArray()
.slice(pagination.skip, pagination.skip + pagination.limit);
const nodes = [];
for (const nid of nids) {
const record = await trx.objectStore('records').get(nid);
if (!record) {
continue;
}
nodes.push(this.resultNode(record, options, match, nid));
}
return {
pagination: {
count: match.size(),
hasMore: match.size() > pagination.limit + pagination.skip,
limit: pagination.limit,
skip: pagination.skip,
},
nodes: nodes,
};
} finally {
const endTime = performance.now();
if (BUILD_CONFIG.debug) {
performance.measure(
`[IndexedDB Indexer] Search (${this.databaseName})`,
{
detail: { query, options },
start: startTime,
end: endTime,
}
);
}
logger.debug(
`[indexer ${this.databaseName}] search`,
endTime - startTime,
'ms',
query
);
}
}
async aggregate(
trx: DataStructROTransaction,
query: Query<any>,
field: string,
options: AggregateOptions<any>
): Promise<AggregateResult<any, any>> {
const startTime = performance.now();
try {
const pagination = {
skip: options.pagination?.skip ?? 0,
limit: options.pagination?.limit ?? 100,
};
const hitPagination = options.hits
? {
skip: options.hits.pagination?.skip ?? 0,
limit: options.hits.pagination?.limit ?? 3,
}
: {
skip: 0,
limit: 0,
};
const match = await this.queryRaw(trx, query);
const nids = match.toArray();
const buckets: {
key: string;
nids: number[];
hits: SearchResult<any, any>['nodes'];
}[] = [];
for (const nid of nids) {
const record = await trx.objectStore('records').get(nid);
if (!record) {
continue;
}
const values = record.data.get(field);
for (const value of values ?? []) {
let bucket;
let bucketIndex = buckets.findIndex(b => b.key === value);
if (bucketIndex === -1) {
bucket = { key: value, nids: [], hits: [] };
buckets.push(bucket);
bucketIndex = buckets.length - 1;
} else {
bucket = buckets[bucketIndex];
}
if (
bucketIndex >= pagination.skip &&
bucketIndex < pagination.skip + pagination.limit
) {
bucket.nids.push(nid);
if (
bucket.nids.length - 1 >= hitPagination.skip &&
bucket.nids.length - 1 < hitPagination.skip + hitPagination.limit
) {
bucket.hits.push(
this.resultNode(record, options.hits ?? {}, match, nid)
);
}
}
}
}
return {
buckets: buckets
.slice(pagination.skip, pagination.skip + pagination.limit)
.map(bucket => {
const result = {
key: bucket.key,
score: match.getScore(bucket.nids[0]),
count: bucket.nids.length,
} as AggregateResult<any, any>['buckets'][number];
if (options.hits) {
(result as any).hits = {
pagination: {
count: bucket.nids.length,
hasMore:
bucket.nids.length >
hitPagination.limit + hitPagination.skip,
limit: hitPagination.limit,
skip: hitPagination.skip,
},
nodes: bucket.hits,
} as SearchResult<any, any>;
}
return result;
}),
pagination: {
count: buckets.length,
hasMore: buckets.length > pagination.limit + pagination.skip,
limit: pagination.limit,
skip: pagination.skip,
},
};
} finally {
const endTime = performance.now();
if (BUILD_CONFIG.debug) {
performance.measure(
`[IndexedDB Indexer] Aggregate (${this.databaseName})`,
{
detail: { query, field, options },
start: startTime,
end: endTime,
}
);
}
logger.debug(
`[indexer ${this.databaseName}] aggregate`,
endTime - startTime,
'ms'
);
}
}
async getAll(
trx: DataStructROTransaction,
ids?: string[]
): Promise<Document[]> {
const docs = [];
if (ids) {
for (const id of ids) {
const record = await trx.objectStore('records').index('id').get(id);
if (record) {
docs.push(Document.from(record.id, record.data));
}
}
} else {
const records = await trx.objectStore('records').getAll();
for (const record of records) {
docs.push(Document.from(record.id, record.data));
}
}
return docs;
}
async has(trx: DataStructROTransaction, id: string): Promise<boolean> {
const nid = await trx.objectStore('records').index('id').getKey(id);
return nid !== undefined;
}
async readonly() {
await this.ensureInitialized();
return this.database.transaction(
['records', 'invertedIndex', 'kvMetadata'],
'readonly',
{
durability: 'relaxed',
}
);
}
async readwrite() {
await this.ensureInitialized();
return this.database.transaction(
['records', 'invertedIndex', 'kvMetadata'],
'readwrite',
{
durability: 'relaxed',
}
);
}
private async ensureInitialized() {
if (this.database) {
return;
}
this.initializePromise ??= this.initialize();
await this.initializePromise;
}
private async initialize() {
this.database = await openDB<IndexDB>(this.databaseName, 1, {
upgrade(database) {
database.createObjectStore('kvMetadata', {
keyPath: 'key',
});
const recordsStore = database.createObjectStore('records', {
autoIncrement: true,
});
recordsStore.createIndex('id', 'id', {
unique: true,
});
const invertedIndexStore = database.createObjectStore('invertedIndex', {
autoIncrement: true,
});
invertedIndexStore.createIndex('key', 'key', { unique: false });
invertedIndexStore.createIndex('nid', 'nid', { unique: false });
},
});
}
private resultNode(
record: { id: string; data: Map<string, string[]> },
options: SearchOptions<any>,
match?: Match,
nid?: number
): SearchResult<any, any>['nodes'][number] {
const node = {
id: record.id,
score: match && nid ? match.getScore(nid) : 1,
} as any;
if (options.fields) {
const fields = {} as Record<string, string | string[]>;
for (const field of options.fields as string[]) {
fields[field] = record.data.get(field) ?? [''];
if (fields[field].length === 1) {
fields[field] = fields[field][0];
}
}
node.fields = fields;
}
if (match && nid && options.highlights) {
const highlights = {} as Record<string, string[]>;
for (const { field, before, end } of options.highlights) {
const highlightValues = match.getHighlighters(nid, field);
if (highlightValues) {
const rawValues = record.data.get(field) ?? [];
highlights[field] = Array.from(highlightValues)
.map(([index, ranges]) => {
const raw = rawValues[index];
if (raw) {
return (
highlighter(raw, before, end, ranges, {
maxPrefix: 20,
maxLength: 50,
}) ?? ''
);
}
return '';
})
.filter(Boolean);
}
}
node.highlights = highlights;
}
return node;
}
}

View File

@@ -1,77 +0,0 @@
export function highlighter(
originText: string,
before: string,
after: string,
matches: [number, number][],
{
maxLength = 50,
maxPrefix = 20,
}: { maxLength?: number; maxPrefix?: number } = {}
) {
const merged = mergeRanges(matches);
if (merged.length === 0) {
return null;
}
const firstMatch = merged[0][0];
const start = Math.max(
0,
Math.min(firstMatch - maxPrefix, originText.length - maxLength)
);
const end = Math.min(start + maxLength, originText.length);
const text = originText.substring(start, end);
let result = '';
let pointer = 0;
for (const match of merged) {
const matchStart = match[0] - start;
const matchEnd = match[1] - start;
if (matchStart >= text.length) {
break;
}
result += text.substring(pointer, matchStart);
pointer = matchStart;
const highlighted = text.substring(matchStart, matchEnd);
if (highlighted.length === 0) {
continue;
}
result += `${before}${highlighted}${after}`;
pointer = matchEnd;
}
result += text.substring(pointer);
if (start > 0) {
result = `...${result}`;
}
if (end < originText.length) {
result = `${result}...`;
}
return result;
}
function mergeRanges(intervals: [number, number][]) {
if (intervals.length === 0) return [];
intervals.sort((a, b) => a[0] - b[0]);
const merged = [intervals[0]];
for (let i = 1; i < intervals.length; i++) {
const last = merged[merged.length - 1];
const current = intervals[i];
if (current[0] <= last[1]) {
last[1] = Math.max(last[1], current[1]);
} else {
merged.push(current);
}
}
return merged;
}

View File

@@ -1,182 +0,0 @@
import { DebugLogger } from '@affine/debug';
import type { Observable } from 'rxjs';
import { merge, of, Subject, throttleTime } from 'rxjs';
import { backoffRetry, fromPromise } from '../../../../livedata';
import { exhaustMapWithTrailing } from '../../../../utils/';
import {
type AggregateOptions,
type AggregateResult,
type Document,
type Index,
type IndexStorage,
type IndexWriter,
type Query,
type Schema,
type SearchOptions,
type SearchResult,
} from '../../';
import { DataStruct, type DataStructRWTransaction } from './data-struct';
const logger = new DebugLogger('IndexedDBIndex');
export class IndexedDBIndex<S extends Schema> implements Index<S> {
data: DataStruct = new DataStruct(this.databaseName, this.schema);
broadcast$ = new Subject();
constructor(
private readonly schema: S,
private readonly databaseName: string = 'indexer'
) {
const channel = new BroadcastChannel(this.databaseName + ':indexer');
channel.onmessage = () => {
this.broadcast$.next(1);
};
}
async get(id: string): Promise<Document<S> | null> {
return (await this.getAll([id]))[0] ?? null;
}
async getAll(ids: string[]): Promise<Document<S>[]> {
const trx = await this.data.readonly();
return this.data.getAll(trx, ids);
}
async write(): Promise<IndexWriter<S>> {
return new IndexedDBIndexWriter(this.data, await this.data.readwrite());
}
async has(id: string): Promise<boolean> {
const trx = await this.data.readonly();
return this.data.has(trx, id);
}
async search(
query: Query<any>,
options: SearchOptions<any> = {}
): Promise<SearchResult<any, SearchOptions<any>>> {
const trx = await this.data.readonly();
return this.data.search(trx, query, options);
}
search$(
query: Query<any>,
options: SearchOptions<any> = {}
): Observable<SearchResult<any, SearchOptions<any>>> {
return merge(of(1), this.broadcast$).pipe(
throttleTime(3000, undefined, { leading: true, trailing: true }),
exhaustMapWithTrailing(() => {
return fromPromise(async () => {
try {
const trx = await this.data.readonly();
return await this.data.search(trx, query, options);
} catch (error) {
logger.error('search error', error);
throw error;
}
}).pipe(backoffRetry());
})
);
}
async aggregate(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): Promise<AggregateResult<any, AggregateOptions<any>>> {
const trx = await this.data.readonly();
return this.data.aggregate(trx, query, field, options);
}
aggregate$(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): Observable<AggregateResult<S, AggregateOptions<any>>> {
return merge(of(1), this.broadcast$).pipe(
throttleTime(3000, undefined, { leading: true, trailing: true }),
exhaustMapWithTrailing(() => {
return fromPromise(async () => {
try {
const trx = await this.data.readonly();
return await this.data.aggregate(trx, query, field, options);
} catch (error) {
logger.error('aggregate error', error);
throw error;
}
}).pipe(backoffRetry());
})
);
}
async clear(): Promise<void> {
const trx = await this.data.readwrite();
return this.data.clear(trx);
}
}
export class IndexedDBIndexWriter<S extends Schema> implements IndexWriter<S> {
inserts: Document[] = [];
deletes: string[] = [];
channel = new BroadcastChannel(this.data.databaseName + ':indexer');
constructor(
private readonly data: DataStruct,
private readonly trx: DataStructRWTransaction
) {}
async get(id: string): Promise<Document<S> | null> {
return (await this.getAll([id]))[0] ?? null;
}
async getAll(ids?: string[]): Promise<Document<S>[]> {
const trx = await this.data.readonly();
return this.data.getAll(trx, ids);
}
insert(document: Document): void {
this.inserts.push(document);
}
delete(id: string): void {
this.deletes.push(id);
}
put(document: Document): void {
this.delete(document.id);
this.insert(document);
}
async commit(): Promise<void> {
await this.data.batchWrite(this.trx, this.deletes, this.inserts);
this.trx.commit();
this.channel.postMessage(1);
}
rollback(): void {}
has(id: string): Promise<boolean> {
return this.data.has(this.trx, id);
}
async search(
query: Query<any>,
options: SearchOptions<any> = {}
): Promise<SearchResult<any, SearchOptions<any>>> {
return this.data.search(this.trx, query, options);
}
async aggregate(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): Promise<AggregateResult<any, AggregateOptions<any>>> {
return this.data.aggregate(this.trx, query, field, options);
}
}
export class IndexedDBIndexStorage implements IndexStorage {
constructor(private readonly databaseName: string) {}
getIndex<S extends Schema>(name: string, s: S): Index<S> {
return new IndexedDBIndex(s, this.databaseName + ':' + name);
}
}

View File

@@ -1,469 +0,0 @@
import { bm25 } from './bm25';
import type {
DataStructROTransaction,
DataStructRWTransaction,
} from './data-struct';
import { Match } from './match';
import { GeneralTokenizer, type Token } from './tokenizer';
export interface InvertedIndex {
fieldKey: string;
match(trx: DataStructROTransaction, term: string): Promise<Match>;
all(trx: DataStructROTransaction): Promise<Match>;
insert(
trx: DataStructRWTransaction,
id: number,
terms: string[]
): Promise<void>;
}
export class StringInvertedIndex implements InvertedIndex {
constructor(
readonly fieldKey: string,
readonly index: boolean = true,
readonly store: boolean = true
) {}
async match(trx: DataStructROTransaction, term: string): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(InvertedIndexKey.forString(this.fieldKey, term).buffer());
const match = new Match();
for (const obj of objs) {
match.addScore(obj.nid, 1);
}
return match;
}
async all(trx: DataStructROTransaction): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(
IDBKeyRange.bound(
InvertedIndexKey.forPrefix(this.fieldKey).buffer(),
InvertedIndexKey.forPrefix(this.fieldKey).add1().buffer()
)
);
const set = new Set<number>();
for (const obj of objs) {
set.add(obj.nid);
}
const match = new Match();
for (const nid of set) {
match.addScore(nid, 1);
}
return match;
}
async insert(trx: DataStructRWTransaction, id: number, terms: string[]) {
for (const term of terms) {
await trx.objectStore('invertedIndex').put({
key: InvertedIndexKey.forString(this.fieldKey, term).buffer(),
nid: id,
});
}
}
}
export class IntegerInvertedIndex implements InvertedIndex {
constructor(
readonly fieldKey: string,
readonly index: boolean = true,
readonly store: boolean = true
) {}
async match(trx: DataStructROTransaction, term: string): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(InvertedIndexKey.forInt64(this.fieldKey, BigInt(term)).buffer());
const match = new Match();
for (const obj of objs) {
match.addScore(obj.nid, 1);
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
async all(trx: DataStructROTransaction): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(
IDBKeyRange.bound(
InvertedIndexKey.forPrefix(this.fieldKey).buffer(),
InvertedIndexKey.forPrefix(this.fieldKey).add1().buffer()
)
);
const set = new Set<number>();
for (const obj of objs) {
set.add(obj.nid);
}
const match = new Match();
for (const nid of set) {
match.addScore(nid, 1);
}
return match;
}
async insert(trx: DataStructRWTransaction, id: number, terms: string[]) {
for (const term of terms) {
await trx.objectStore('invertedIndex').put({
key: InvertedIndexKey.forInt64(this.fieldKey, BigInt(term)).buffer(),
nid: id,
});
}
}
}
export class BooleanInvertedIndex implements InvertedIndex {
constructor(
readonly fieldKey: string,
readonly index: boolean = true,
readonly store: boolean = true
) {}
// eslint-disable-next-line sonarjs/no-identical-functions
async all(trx: DataStructROTransaction): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(
IDBKeyRange.bound(
InvertedIndexKey.forPrefix(this.fieldKey).buffer(),
InvertedIndexKey.forPrefix(this.fieldKey).add1().buffer()
)
);
const set = new Set<number>();
for (const obj of objs) {
set.add(obj.nid);
}
const match = new Match();
for (const nid of set) {
match.addScore(nid, 1);
}
return match;
}
async match(trx: DataStructROTransaction, term: string): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(
InvertedIndexKey.forBoolean(this.fieldKey, term === 'true').buffer()
);
const match = new Match();
for (const obj of objs) {
match.addScore(obj.nid, 1);
}
return match;
}
async insert(trx: DataStructRWTransaction, id: number, terms: string[]) {
for (const term of terms) {
await trx.objectStore('invertedIndex').put({
key: InvertedIndexKey.forBoolean(
this.fieldKey,
term === 'true'
).buffer(),
nid: id,
});
}
}
}
export class FullTextInvertedIndex implements InvertedIndex {
constructor(
readonly fieldKey: string,
readonly index: boolean = true,
readonly store: boolean = true
) {}
async match(trx: DataStructROTransaction, term: string): Promise<Match> {
const queryTokens = new GeneralTokenizer().tokenize(term);
const matched = new Map<
number,
Map<
number, // index
{
score: number;
ranges: [number, number][];
}
>
>();
const avgFieldLength =
(
await trx
.objectStore('kvMetadata')
.get(`full-text:avg-field-length:${this.fieldKey}`)
)?.value ?? 0;
for (const token of queryTokens) {
const key = InvertedIndexKey.forString(this.fieldKey, token.term);
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(
IDBKeyRange.bound(key.buffer(), key.add1().buffer(), false, true)
);
const submatched: {
nid: number;
score: number;
position: {
index: number;
ranges: [number, number][];
};
}[] = [];
for (const obj of objs) {
const key = InvertedIndexKey.fromBuffer(obj.key);
const originTokenTerm = key.asString();
const matchLength = token.term.length;
const position = obj.pos ?? {
i: 0,
l: 0,
rs: [],
};
const termFreq = position.rs.length;
const totalCount = objs.length;
const fieldLength = position.l;
const score =
bm25(termFreq, 1, totalCount, fieldLength, avgFieldLength) *
(matchLength / originTokenTerm.length);
const match = {
score,
positions: new Map(),
};
const ranges = match.positions.get(position.i) || [];
ranges.push(
...position.rs.map(([start, _end]) => [start, start + matchLength])
);
match.positions.set(position.i, ranges);
submatched.push({
nid: obj.nid,
score,
position: {
index: position.i,
ranges: position.rs.map(([start, _end]) => [
start,
start + matchLength,
]),
},
});
}
// normalize score
const maxScore = submatched.reduce((acc, s) => Math.max(acc, s.score), 0);
const minScore = submatched.reduce((acc, s) => Math.min(acc, s.score), 0);
for (const { nid, score, position } of submatched) {
const normalizedScore =
maxScore === minScore
? score
: (score - minScore) / (maxScore - minScore);
const match =
matched.get(nid) ??
new Map<
number, // index
{
score: number;
ranges: [number, number][];
}
>();
const item = match.get(position.index) || {
score: 0,
ranges: [],
};
item.score += normalizedScore;
item.ranges.push(...position.ranges);
match.set(position.index, item);
matched.set(nid, match);
}
}
const match = new Match();
for (const [nid, items] of matched) {
if (items.size === 0) {
break;
}
let highestScore = -1;
let highestIndex = -1;
let highestRanges: [number, number][] = [];
for (const [index, { score, ranges }] of items) {
if (score > highestScore) {
highestScore = score;
highestIndex = index;
highestRanges = ranges;
}
}
match.addScore(nid, highestScore);
match.addHighlighter(nid, this.fieldKey, highestIndex, highestRanges);
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
async all(trx: DataStructROTransaction): Promise<Match> {
const objs = await trx
.objectStore('invertedIndex')
.index('key')
.getAll(
IDBKeyRange.bound(
InvertedIndexKey.forPrefix(this.fieldKey).buffer(),
InvertedIndexKey.forPrefix(this.fieldKey).add1().buffer()
)
);
const set = new Set<number>();
for (const obj of objs) {
set.add(obj.nid);
}
const match = new Match();
for (const nid of set) {
match.addScore(nid, 1);
}
return match;
}
async insert(trx: DataStructRWTransaction, id: number, terms: string[]) {
for (let i = 0; i < terms.length; i++) {
const tokenMap = new Map<string, Token[]>();
const originString = terms[i];
const tokens = new GeneralTokenizer().tokenize(originString);
for (const token of tokens) {
const tokens = tokenMap.get(token.term) || [];
tokens.push(token);
tokenMap.set(token.term, tokens);
}
for (const [term, tokens] of tokenMap) {
await trx.objectStore('invertedIndex').put({
key: InvertedIndexKey.forString(this.fieldKey, term).buffer(),
nid: id,
pos: {
l: originString.length,
i: i,
rs: tokens.map(token => [token.start, token.end]),
},
});
}
const kvMetadataStore = trx.objectStore('kvMetadata');
// update avg-field-length
const totalCount =
(await kvMetadataStore.get(`full-text:field-count:${this.fieldKey}`))
?.value ?? 0;
const avgFieldLength =
(
await kvMetadataStore.get(
`full-text:avg-field-length:${this.fieldKey}`
)
)?.value ?? 0;
await kvMetadataStore.put({
key: `full-text:field-count:${this.fieldKey}`,
value: totalCount + 1,
});
await kvMetadataStore.put({
key: `full-text:avg-field-length:${this.fieldKey}`,
value:
avgFieldLength +
(terms.reduce((acc, term) => acc + term.length, 0) - avgFieldLength) /
(totalCount + 1),
});
}
}
}
export class InvertedIndexKey {
constructor(
readonly field: Uint8Array,
readonly value: Uint8Array,
readonly gap: Uint8Array = new Uint8Array([58])
) {}
asString() {
return new TextDecoder().decode(this.value);
}
asInt64() {
return new DataView(this.value.buffer).getBigInt64(
0,
false
); /* big-endian */
}
add1() {
if (this.value.byteLength > 0) {
const bytes = new Uint8Array(this.value.slice(0));
let carry = 1;
for (let i = bytes.length - 1; i >= 0 && carry > 0; i--) {
const sum = bytes[i] + carry;
bytes[i] = sum % 256;
carry = sum >> 8;
}
return new InvertedIndexKey(this.field, bytes);
} else {
return new InvertedIndexKey(
this.field,
new Uint8Array(0),
new Uint8Array([59])
);
}
}
static forPrefix(field: string) {
return new InvertedIndexKey(
new TextEncoder().encode(field),
new Uint8Array(0)
);
}
static forString(field: string, value: string) {
return new InvertedIndexKey(
new TextEncoder().encode(field),
new TextEncoder().encode(value)
);
}
static forBoolean(field: string, value: boolean) {
const bytes = new Uint8Array(1);
bytes.set([value ? 1 : 0]);
return new InvertedIndexKey(new TextEncoder().encode(field), bytes);
}
static forInt64(field: string, value: bigint) {
const bytes = new Uint8Array(8);
new DataView(bytes.buffer).setBigInt64(0, value, false); /* big-endian */
return new InvertedIndexKey(new TextEncoder().encode(field), bytes);
}
buffer() {
const tmp = new Uint8Array(
this.field.byteLength + (this.value?.byteLength ?? 0) + 1
);
tmp.set(new Uint8Array(this.field), 0);
tmp.set(new Uint8Array(this.gap), this.field.byteLength);
if (this.value.byteLength > 0) {
tmp.set(new Uint8Array(this.value), this.field.byteLength + 1);
}
return tmp.buffer;
}
static fromBuffer(buffer: ArrayBuffer) {
const array = new Uint8Array(buffer);
const fieldLength = array.indexOf(58);
const field = array.slice(0, fieldLength);
const value = array.slice(fieldLength + 1);
return new InvertedIndexKey(field, value);
}
}

View File

@@ -1,105 +0,0 @@
export class Match {
scores = new Map<number, number>();
/**
* nid -> field -> index(multi value field) -> [start, end][]
*/
highlighters = new Map<
number,
Map<string, Map<number, [number, number][]>>
>();
constructor() {}
size() {
return this.scores.size;
}
getScore(id: number) {
return this.scores.get(id) ?? 0;
}
addScore(id: number, score: number) {
const currentScore = this.scores.get(id) || 0;
this.scores.set(id, currentScore + score);
}
getHighlighters(id: number, field: string) {
return this.highlighters.get(id)?.get(field);
}
addHighlighter(
id: number,
field: string,
index: number,
newRanges: [number, number][]
) {
const fields =
this.highlighters.get(id) ||
new Map<string, Map<number, [number, number][]>>();
const values = fields.get(field) || new Map<number, [number, number][]>();
const ranges = values.get(index) || [];
ranges.push(...newRanges);
values.set(index, ranges);
fields.set(field, values);
this.highlighters.set(id, fields);
}
and(other: Match) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
if (other.scores.has(id)) {
newWeight.addScore(id, score + (other.scores.get(id) ?? 0));
newWeight.copyExtData(this, id);
newWeight.copyExtData(other, id);
}
}
return newWeight;
}
or(other: Match) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
newWeight.addScore(id, score);
newWeight.copyExtData(this, id);
}
for (const [id, score] of other.scores) {
newWeight.addScore(id, score);
newWeight.copyExtData(other, id);
}
return newWeight;
}
exclude(other: Match) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
if (!other.scores.has(id)) {
newWeight.addScore(id, score);
newWeight.copyExtData(this, id);
}
}
return newWeight;
}
boost(boost: number) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
newWeight.addScore(id, score * boost);
newWeight.copyExtData(this, id);
}
return newWeight;
}
toArray() {
return Array.from(this.scores.entries())
.sort((a, b) => b[1] - a[1])
.map(e => e[0]);
}
private copyExtData(from: Match, id: number) {
for (const [field, values] of from.highlighters.get(id) ?? []) {
for (const [index, ranges] of values) {
this.addHighlighter(id, field, index, ranges);
}
}
}
}

View File

@@ -1,162 +0,0 @@
import Graphemer from 'graphemer';
export interface Tokenizer {
tokenize(text: string): Token[];
}
export interface Token {
term: string;
start: number;
end: number;
}
export class SimpleTokenizer implements Tokenizer {
tokenize(text: string): Token[] {
const tokens: Token[] = [];
let start = 0;
let end = 0;
let inWord = false;
for (let i = 0; i < text.length; i++) {
const c = text[i];
if (c.match(/[\n\r\p{Z}\p{P}]/u)) {
if (inWord) {
end = i;
tokens.push({
term: text.substring(start, end).toLowerCase(),
start,
end,
});
inWord = false;
}
} else {
if (!inWord) {
start = i;
end = i;
inWord = true;
}
}
}
if (inWord) {
tokens.push({
term: text.substring(start).toLowerCase(),
start,
end: text.length,
});
}
return tokens;
}
}
export class NGramTokenizer implements Tokenizer {
constructor(private readonly n: number) {}
tokenize(text: string): Token[] {
const splitted: Token[] = [];
for (let i = 0; i < text.length; ) {
const nextBreak = Graphemer.nextBreak(text, i);
const c = text.substring(i, nextBreak);
splitted.push({
term: c,
start: i,
end: nextBreak,
});
i = nextBreak;
}
const tokens: Token[] = [];
for (let i = 0; i < splitted.length - this.n + 1; i++) {
tokens.push(
splitted.slice(i, i + this.n).reduce(
(acc, t) => ({
term: acc.term + t.term,
start: Math.min(acc.start, t.start),
end: Math.max(acc.end, t.end),
}),
{ term: '', start: Infinity, end: -Infinity }
)
);
}
return tokens;
}
}
export class GeneralTokenizer implements Tokenizer {
constructor() {}
tokenizeWord(word: string, lang: string): Token[] {
if (lang === 'en') {
return [{ term: word.toLowerCase(), start: 0, end: word.length }];
} else if (lang === 'cjk') {
if (word.length < 3) {
return [{ term: word, start: 0, end: word.length }];
}
return new NGramTokenizer(2).tokenize(word);
} else if (lang === 'emoji') {
return new NGramTokenizer(1).tokenize(word);
} else if (lang === '-') {
return [];
}
throw new Error('Not implemented');
}
testLang(c: string): string {
if (c.match(/[\p{Emoji}]/u)) {
return 'emoji';
} else if (c.match(/[\p{sc=Han}\p{scx=Hira}\p{scx=Kana}\p{sc=Hang}]/u)) {
return 'cjk';
} else if (c.match(/[\n\r\p{Z}\p{P}]/u)) {
return '-';
} else {
return 'en';
}
}
tokenize(text: string): Token[] {
const tokens: Token[] = [];
let start = 0;
let end = 0;
let lang: string | null = null;
for (let i = 0; i < text.length; ) {
const nextBreak = Graphemer.nextBreak(text, i);
const c = text.substring(i, nextBreak);
const l = this.testLang(c);
if (lang !== l) {
if (lang !== null) {
end = i;
tokens.push(
...this.tokenizeWord(text.substring(start, end), lang).map(
token => ({
...token,
start: token.start + start,
end: token.end + start,
})
)
);
}
start = i;
end = i;
lang = l;
}
i = nextBreak;
}
if (lang !== null) {
tokens.push(
...this.tokenizeWord(text.substring(start, text.length), lang).map(
token => ({
...token,
start: token.start + start,
end: token.end + start,
})
)
);
}
return tokens;
}
}

View File

@@ -1,290 +0,0 @@
import {
type AggregateOptions,
type AggregateResult,
Document,
type Query,
type Schema,
type SearchOptions,
type SearchResult,
} from '../../';
import {
BooleanInvertedIndex,
FullTextInvertedIndex,
IntegerInvertedIndex,
type InvertedIndex,
StringInvertedIndex,
} from './inverted-index';
import { Match } from './match';
type DataRecord = {
id: string;
data: Map<string, string[]>;
deleted: boolean;
};
export class DataStruct {
records: DataRecord[] = [];
idMap = new Map<string, number>();
invertedIndex = new Map<string, InvertedIndex>();
constructor(schema: Schema) {
for (const [key, type] of Object.entries(schema)) {
const typeInfo = typeof type === 'string' ? { type } : type;
if (typeInfo.type === 'String') {
this.invertedIndex.set(key, new StringInvertedIndex(key));
} else if (typeInfo.type === 'Integer') {
this.invertedIndex.set(key, new IntegerInvertedIndex(key));
} else if (typeInfo.type === 'FullText') {
this.invertedIndex.set(key, new FullTextInvertedIndex(key));
} else if (typeInfo.type === 'Boolean') {
this.invertedIndex.set(key, new BooleanInvertedIndex(key));
} else {
throw new Error(`Field type '${type}' not supported`);
}
}
}
getAll(ids?: string[]): Document[] {
if (ids) {
return ids
.map(id => {
const nid = this.idMap.get(id);
if (nid === undefined) {
return undefined;
}
return Document.from(id, this.records[nid].data);
})
.filter((v): v is Document => v !== undefined);
} else {
return this.records
.filter(record => !record.deleted)
.map(record => Document.from(record.id, record.data));
}
}
insert(document: Document) {
if (this.idMap.has(document.id)) {
throw new Error('Document already exists');
}
this.records.push({
id: document.id,
data: document.fields as Map<string, string[]>,
deleted: false,
});
const nid = this.records.length - 1;
this.idMap.set(document.id, nid);
for (const [key, values] of document.fields) {
for (const value of values) {
const iidx = this.invertedIndex.get(key as string);
if (!iidx) {
throw new Error(
`Inverted index '${key.toString()}' not found, document not match schema`
);
}
iidx.insert(nid, value);
}
}
}
delete(id: string) {
const nid = this.idMap.get(id);
if (nid === undefined) {
throw new Error('Document not found');
}
this.records[nid].deleted = true;
this.records[nid].data = new Map();
}
matchAll(): Match {
const weight = new Match();
for (let i = 0; i < this.records.length; i++) {
weight.addScore(i, 1);
}
return weight;
}
clear() {
this.records = [];
this.idMap.clear();
this.invertedIndex.forEach(v => v.clear());
}
private queryRaw(query: Query<any>): Match {
if (query.type === 'match') {
const iidx = this.invertedIndex.get(query.field as string);
if (!iidx) {
throw new Error(`Field '${query.field as string}' not found`);
}
return iidx.match(query.match);
} else if (query.type === 'boolean') {
const weights = query.queries.map(q => this.queryRaw(q));
if (query.occur === 'must') {
return weights.reduce((acc, w) => acc.and(w));
} else if (query.occur === 'must_not') {
const total = weights.reduce((acc, w) => acc.and(w));
return this.matchAll().exclude(total);
} else if (query.occur === 'should') {
return weights.reduce((acc, w) => acc.or(w));
}
} else if (query.type === 'all') {
return this.matchAll();
} else if (query.type === 'boost') {
return this.queryRaw(query.query).boost(query.boost);
} else if (query.type === 'exists') {
const iidx = this.invertedIndex.get(query.field as string);
if (!iidx) {
throw new Error(`Field '${query.field as string}' not found`);
}
return iidx.all();
}
throw new Error(`Query type '${query.type}' not supported`);
}
query(query: Query<any>): Match {
return this.queryRaw(query).filter(id => !this.records[id].deleted);
}
search(
query: Query<any>,
options: SearchOptions<any> = {}
): SearchResult<any, any> {
const pagination = {
skip: options.pagination?.skip ?? 0,
limit: options.pagination?.limit ?? 100,
};
const match = this.query(query);
const nids = match
.toArray()
.slice(pagination.skip, pagination.skip + pagination.limit);
return {
pagination: {
count: match.size(),
hasMore: match.size() > pagination.limit + pagination.skip,
limit: pagination.limit,
skip: pagination.skip,
},
nodes: nids.map(nid => this.resultNode(match, nid, options)),
};
}
aggregate(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): AggregateResult<any, any> {
const pagination = {
skip: options.pagination?.skip ?? 0,
limit: options.pagination?.limit ?? 100,
};
const match = this.query(query);
const nids = match.toArray();
const buckets: { key: string; nids: number[] }[] = [];
for (const nid of nids) {
for (const value of this.records[nid].data.get(field) ?? []) {
let bucket = buckets.find(b => b.key === value);
if (!bucket) {
bucket = { key: value, nids: [] };
buckets.push(bucket);
}
bucket.nids.push(nid);
}
}
return {
buckets: buckets
.slice(pagination.skip, pagination.skip + pagination.limit)
.map(bucket => {
const result = {
key: bucket.key,
score: match.getScore(bucket.nids[0]),
count: bucket.nids.length,
} as AggregateResult<any, any>['buckets'][number];
if (options.hits) {
const hitsOptions = options.hits;
const pagination = {
skip: options.hits.pagination?.skip ?? 0,
limit: options.hits.pagination?.limit ?? 3,
};
const hits = bucket.nids.slice(
pagination.skip,
pagination.skip + pagination.limit
);
(result as any).hits = {
pagination: {
count: bucket.nids.length,
hasMore:
bucket.nids.length > pagination.limit + pagination.skip,
limit: pagination.limit,
skip: pagination.skip,
},
nodes: hits.map(nid => this.resultNode(match, nid, hitsOptions)),
} as SearchResult<any, any>;
}
return result;
}),
pagination: {
count: buckets.length,
hasMore: buckets.length > pagination.limit + pagination.skip,
limit: pagination.limit,
skip: pagination.skip,
},
};
}
has(id: string): boolean {
return this.idMap.has(id);
}
private resultNode(
match: Match,
nid: number,
options: SearchOptions<any>
): SearchResult<any, any>['nodes'][number] {
const node = {
id: this.records[nid].id,
score: match.getScore(nid),
} as any;
if (options.fields) {
const fields = {} as Record<string, string | string[]>;
for (const field of options.fields as string[]) {
fields[field] = this.records[nid].data.get(field) ?? [''];
if (fields[field].length === 1) {
fields[field] = fields[field][0];
}
}
node.fields = fields;
}
if (options.highlights) {
const highlights = {} as Record<string, string[]>;
for (const { field, before, end } of options.highlights) {
highlights[field] = match
.getHighlighters(nid, field)
.flatMap(highlighter => {
return highlighter(before, end);
});
}
node.highlights = highlights;
}
return node;
}
}

View File

@@ -1,141 +0,0 @@
import { map, merge, type Observable, of, Subject, throttleTime } from 'rxjs';
import type {
AggregateOptions,
AggregateResult,
Document,
Index,
IndexStorage,
IndexWriter,
Query,
Schema,
SearchOptions,
SearchResult,
} from '../../';
import { DataStruct } from './data-struct';
export class MemoryIndex<S extends Schema> implements Index<S> {
private readonly data: DataStruct = new DataStruct(this.schema);
broadcast$ = new Subject<number>();
constructor(private readonly schema: Schema) {}
write(): Promise<IndexWriter<S>> {
return Promise.resolve(new MemoryIndexWriter(this.data, this.broadcast$));
}
async get(id: string): Promise<Document<S> | null> {
return (await this.getAll([id]))[0] ?? null;
}
getAll(ids?: string[]): Promise<Document<S>[]> {
return Promise.resolve(this.data.getAll(ids));
}
has(id: string): Promise<boolean> {
return Promise.resolve(this.data.has(id));
}
async search(
query: Query<any>,
options: SearchOptions<any> = {}
): Promise<SearchResult<any, any>> {
return this.data.search(query, options);
}
search$(
query: Query<any>,
options: SearchOptions<any> = {}
): Observable<SearchResult<any, any>> {
return merge(of(1), this.broadcast$).pipe(
throttleTime(500, undefined, { leading: false, trailing: true }),
map(() => this.data.search(query, options))
);
}
async aggregate(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): Promise<AggregateResult<any, any>> {
return this.data.aggregate(query, field, options);
}
aggregate$(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): Observable<AggregateResult<S, AggregateOptions<any>>> {
return merge(of(1), this.broadcast$).pipe(
throttleTime(500, undefined, { leading: false, trailing: true }),
map(() => this.data.aggregate(query, field, options))
);
}
clear(): Promise<void> {
this.data.clear();
return Promise.resolve();
}
}
export class MemoryIndexWriter<S extends Schema> implements IndexWriter<S> {
inserts: Document[] = [];
deletes: string[] = [];
constructor(
private readonly data: DataStruct,
private readonly broadcast$: Subject<number>
) {}
async get(id: string): Promise<Document<S> | null> {
return (await this.getAll([id]))[0] ?? null;
}
getAll(ids: string[]): Promise<Document<S>[]> {
return Promise.resolve(this.data.getAll(ids));
}
insert(document: Document): void {
this.inserts.push(document);
}
delete(id: string): void {
this.deletes.push(id);
}
put(document: Document): void {
this.delete(document.id);
this.insert(document);
}
async search(
query: Query<any>,
options: SearchOptions<any> = {}
): Promise<SearchResult<any, any>> {
return this.data.search(query, options);
}
async aggregate(
query: Query<any>,
field: string,
options: AggregateOptions<any> = {}
): Promise<AggregateResult<any, any>> {
return this.data.aggregate(query, field, options);
}
commit(): Promise<void> {
for (const del of this.deletes) {
this.data.delete(del);
}
for (const inst of this.inserts) {
this.data.insert(inst);
}
this.broadcast$.next(1);
return Promise.resolve();
}
rollback(): void {}
has(id: string): Promise<boolean> {
return Promise.resolve(this.data.has(id));
}
}
export class MemoryIndexStorage implements IndexStorage {
getIndex<S extends Schema>(_: string, schema: S): Index<S> {
return new MemoryIndex(schema);
}
}

View File

@@ -1,220 +0,0 @@
import Fuse from 'fuse.js';
import { Match } from './match';
export interface InvertedIndex {
fieldKey: string;
match(term: string): Match;
all(): Match;
insert(id: number, term: string): void;
clear(): void;
}
export class StringInvertedIndex implements InvertedIndex {
index: Map<string, number[]> = new Map();
constructor(readonly fieldKey: string) {}
match(term: string): Match {
const match = new Match();
for (const id of this.index.get(term) ?? []) {
match.addScore(id, 1);
}
return match;
}
all(): Match {
const match = new Match();
for (const [_term, ids] of this.index) {
for (const id of ids) {
if (match.getScore(id) === 0) {
match.addScore(id, 1);
}
}
}
return match;
}
insert(id: number, term: string): void {
const ids = this.index.get(term) ?? [];
ids.push(id);
this.index.set(term, ids);
}
clear(): void {
this.index.clear();
}
}
export class IntegerInvertedIndex implements InvertedIndex {
index: Map<string, number[]> = new Map();
constructor(readonly fieldKey: string) {}
// eslint-disable-next-line sonarjs/no-identical-functions
match(term: string): Match {
const match = new Match();
for (const id of this.index.get(term) ?? []) {
match.addScore(id, 1);
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
all(): Match {
const match = new Match();
for (const [_term, ids] of this.index) {
for (const id of ids) {
if (match.getScore(id) === 0) {
match.addScore(id, 1);
}
}
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
insert(id: number, term: string): void {
const ids = this.index.get(term) ?? [];
ids.push(id);
this.index.set(term, ids);
}
clear(): void {
this.index.clear();
}
}
export class BooleanInvertedIndex implements InvertedIndex {
index: Map<boolean, number[]> = new Map();
constructor(readonly fieldKey: string) {}
// eslint-disable-next-line sonarjs/no-identical-functions
match(term: string): Match {
const match = new Match();
for (const id of this.index.get(term === 'true') ?? []) {
match.addScore(id, 1);
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
all(): Match {
const match = new Match();
for (const [_term, ids] of this.index) {
for (const id of ids) {
if (match.getScore(id) === 0) {
match.addScore(id, 1);
}
}
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
insert(id: number, term: string): void {
const ids = this.index.get(term === 'true') ?? [];
ids.push(id);
this.index.set(term === 'true', ids);
}
clear(): void {
this.index.clear();
}
}
export class FullTextInvertedIndex implements InvertedIndex {
records = [] as { id: number; v: string }[];
index = Fuse.createIndex(['v'], [] as { id: number; v: string }[]);
constructor(readonly fieldKey: string) {}
match(term: string): Match {
const searcher = new Fuse(
this.records,
{
includeScore: true,
includeMatches: true,
shouldSort: true,
keys: ['v'],
},
this.index
);
const result = searcher.search(term);
const match = new Match();
for (const value of result) {
match.addScore(value.item.id, 1 - (value.score ?? 1));
match.addHighlighter(value.item.id, this.fieldKey, (before, after) => {
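// Highlighter: wraps every matched character range of the matched value with
// the caller-supplied markers and returns the decorated string.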
const matches = value.matches;
if (!matches || matches.length === 0) {
return [''];
}
const firstMatch = matches[0];
const text = firstMatch.value;
if (!text) {
return [''];
}
let result = '';
let pointer = 0;
for (const match of matches) {
for (const [start, end] of match.indices) {
result += text.substring(pointer, start);
result += `${before}${text.substring(start, end + 1)}${after}`;
pointer = end + 1;
}
}
result += text.substring(pointer);
return [result];
});
}
return match;
}
// eslint-disable-next-line sonarjs/no-identical-functions
all(): Match {
const match = new Match();
for (const { id } of this.records) {
if (match.getScore(id) === 0) {
match.addScore(id, 1);
}
}
return match;
}
insert(id: number, term: string): void {
this.index.add({ id, v: term });
this.records.push({ id, v: term });
}
clear(): void {
this.records = [];
this.index = Fuse.createIndex(['v'], [] as { id: number; v: string }[]);
}
}

View File

@@ -1,108 +0,0 @@
export class Match {
scores = new Map<number, number>();
highlighters = new Map<
number,
Map<string, ((before: string, after: string) => string[])[]>
>();
constructor() {}
size() {
return this.scores.size;
}
getScore(id: number) {
return this.scores.get(id) ?? 0;
}
addScore(id: number, score: number) {
const currentScore = this.scores.get(id) || 0;
this.scores.set(id, currentScore + score);
}
getHighlighters(id: number, field: string) {
return this.highlighters.get(id)?.get(field) ?? [];
}
addHighlighter(
id: number,
field: string,
highlighter: (before: string, after: string) => string[]
) {
const fields = this.highlighters.get(id) || new Map();
const highlighters = fields.get(field) || [];
highlighters.push(highlighter);
fields.set(field, highlighters);
this.highlighters.set(id, fields);
}
and(other: Match) {
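// Intersection: keep only ids present in both matches, summing their scores
// and carrying over highlighters from both sides.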
const newWeight = new Match();
for (const [id, score] of this.scores) {
if (other.scores.has(id)) {
newWeight.addScore(id, score + (other.scores.get(id) ?? 0));
newWeight.copyExtData(this, id);
newWeight.copyExtData(other, id);
}
}
return newWeight;
}
or(other: Match) {
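// Union: keep ids from either match; ids present in both have their scores summed.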
const newWeight = new Match();
for (const [id, score] of this.scores) {
newWeight.addScore(id, score);
newWeight.copyExtData(this, id);
}
for (const [id, score] of other.scores) {
newWeight.addScore(id, score);
newWeight.copyExtData(other, id);
}
return newWeight;
}
exclude(other: Match) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
if (!other.scores.has(id)) {
newWeight.addScore(id, score);
newWeight.copyExtData(this, id);
}
}
return newWeight;
}
boost(boost: number) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
newWeight.addScore(id, score * boost);
newWeight.copyExtData(this, id);
}
return newWeight;
}
toArray() {
return Array.from(this.scores.entries())
.sort((a, b) => b[1] - a[1])
.map(e => e[0]);
}
filter(predicate: (id: number) => boolean) {
const newWeight = new Match();
for (const [id, score] of this.scores) {
if (predicate(id)) {
newWeight.addScore(id, score);
newWeight.copyExtData(this, id);
}
}
return newWeight;
}
private copyExtData(from: Match, id: number) {
for (const [field, highlighters] of from.highlighters.get(id) ?? []) {
for (const highlighter of highlighters) {
this.addHighlighter(id, field, highlighter);
}
}
}
}

View File

@@ -1,6 +0,0 @@
export * from './document';
export * from './field-type';
export * from './indexer';
export * from './query';
export * from './schema';
export * from './searcher';

View File

@@ -1,41 +0,0 @@
import type { Document } from './document';
import type { Schema } from './schema';
import type { Searcher, Subscriber } from './searcher';
export interface Index<S extends Schema>
extends IndexReader<S>,
Searcher<S>,
Subscriber<S> {
write(): Promise<IndexWriter<S>>;
clear(): Promise<void>;
}
export interface IndexWriter<S extends Schema>
extends IndexReader<S>,
Searcher<S> {
insert(document: Document<S>): void;
put(document: Document<S>): void;
delete(id: string): void;
// TODO(@eyhn)
// deleteByQuery(query: Query<S>): void;
commit(): Promise<void>;
rollback(): void;
}
export interface IndexReader<S extends Schema> {
get(id: string): Promise<Document<S> | null>;
getAll(ids?: string[]): Promise<Document<S>[]>;
has(id: string): Promise<boolean>;
}
export interface IndexStorage {
getIndex<S extends Schema>(name: string, schema: S): Index<S>;
}

View File

@@ -1,35 +0,0 @@
import type { Schema } from './schema';
export type MatchQuery<S extends Schema> = {
type: 'match';
field: keyof S;
match: string;
};
export type BoostQuery = {
type: 'boost';
query: Query<any>;
boost: number;
};
export type BooleanQuery<S extends Schema> = {
type: 'boolean';
occur: 'should' | 'must' | 'must_not';
queries: Query<S>[];
};
export type ExistsQuery<S extends Schema> = {
type: 'exists';
field: keyof S;
};
export type AllQuery = {
type: 'all';
};
export type Query<S extends Schema> =
| BooleanQuery<S>
| MatchQuery<S>
| AllQuery
| ExistsQuery<S>
| BoostQuery;
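// Illustrative usage (the field names 'tag' and 'title' are placeholders, not
// part of this module):
// const q: Query<any> = {
//   type: 'boolean',
//   occur: 'must',
//   queries: [
//     { type: 'exists', field: 'tag' },
//     { type: 'boost', boost: 2, query: { type: 'match', field: 'title', match: 'affine' } },
//   ],
// };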

View File

@@ -1,25 +0,0 @@
import type { FieldType } from './field-type';
export type Schema = Record<
string,
| FieldType
| {
type: FieldType;
/**
* If false, the field will not be indexed, and thus not searchable.
*
* default: true
*/
index?: boolean;
/**
* If false, the field will not be stored, and not included in the search result.
*
* default: true
*/
store?: boolean;
}
>;
export function defineSchema<T extends Schema>(schema: T): T {
return schema;
}
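// Illustrative usage (field names are placeholders):
// const schema = defineSchema({
//   title: 'FullText',
//   // searchable, but not stored, so it will not appear in search results
//   summary: { type: 'FullText', store: false },
// });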

View File

@@ -1,83 +0,0 @@
import type { Observable } from 'rxjs';
import type { Query } from './query';
import type { Schema } from './schema';
type HighlightAbleField<S extends Schema> = {
[K in keyof S]: S[K] extends 'FullText' ? K : never;
}[keyof S];
export interface Searcher<S extends Schema = any> {
search<const O extends SearchOptions<S>>(
query: Query<S>,
options?: O
): Promise<SearchResult<S, O>>;
aggregate<const O extends AggregateOptions<S>>(
query: Query<S>,
field: keyof S,
options?: O
): Promise<AggregateResult<S, O>>;
}
export interface Subscriber<S extends Schema = any> {
search$<const O extends SearchOptions<S>>(
query: Query<S>,
options?: O
): Observable<SearchResult<S, O>>;
aggregate$<const O extends AggregateOptions<S>>(
query: Query<S>,
field: keyof S,
options?: O
): Observable<AggregateResult<S, O>>;
}
type ResultPagination = {
count: number;
limit: number;
skip: number;
hasMore: boolean;
};
type PaginationOption = {
limit?: number;
skip?: number;
};
export type SearchOptions<S extends Schema> = {
pagination?: PaginationOption;
highlights?: {
field: HighlightAbleField<S>;
before: string;
end: string;
}[];
fields?: (keyof S)[];
};
export type SearchResult<S extends Schema, O extends SearchOptions<S>> = {
pagination: ResultPagination;
nodes: ({
id: string;
score: number;
} & (O['fields'] extends any[]
? { fields: { [key in O['fields'][number]]: string | string[] } }
: unknown) &
(O['highlights'] extends any[]
? { highlights: { [key in O['highlights'][number]['field']]: string[] } }
: unknown))[];
};
export interface AggregateOptions<S extends Schema> {
pagination?: PaginationOption;
hits?: SearchOptions<S>;
}
export type AggregateResult<S extends Schema, O extends AggregateOptions<S>> = {
pagination: ResultPagination;
buckets: ({
key: string;
score: number;
count: number;
} & (O['hits'] extends object
? { hits: SearchResult<S, O['hits']> }
: unknown))[];
};
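// Illustrative usage (the field name 'title' is a placeholder):
// const options: SearchOptions<any> = {
//   pagination: { limit: 10, skip: 0 },
//   fields: ['title'],
//   highlights: [{ field: 'title', before: '<b>', end: '</b>' }],
// };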

View File

@@ -1,47 +0,0 @@
# job
Job system abstraction for AFFiNE. Currently, only `IndexedDBJobQueue` is implemented; more backends will be implemented in the future.
Jobs run in the background, in the browser and in distributed environments; multiple `runners` can consume tasks simultaneously without any extra coordination.
## Basic Usage
```ts
const queue = new IndexedDBJobQueue('my-queue');
await queue.enqueue([
{
batchKey: '1',
payload: { a: 'hello' },
},
{
batchKey: '2',
payload: { a: 'world' },
},
]);
const runner = new JobRunner(queue, async jobs => {
  console.log(jobs);
});
runner.start();
// Output (each batch is delivered as an array of jobs):
// [ { id: '…', batchKey: '1', payload: { a: 'hello' } } ]
// [ { id: '…', batchKey: '2', payload: { a: 'world' } } ]
```
## `batchKey`
Each job carries a `batchKey`; jobs sharing the same `batchKey` are handed to a single `runner` and executed together as one batch.
Additionally, while jobs with a given `batchKey` are in progress, other `runners` will not accept jobs with that `batchKey`, which gives exclusive, lock-like access to the underlying resource (see the sketch below).
> In the future, `batchKey` will be used to implement priority.
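A minimal sketch of this behavior, using the queue API shown above (job `id`s elided):

```ts
await queue.enqueue([
  { batchKey: 'doc:1', payload: { a: 'first' } },
  { batchKey: 'doc:1', payload: { a: 'second' } },
  { batchKey: 'doc:2', payload: { a: 'third' } },
]);

// One consumer receives both 'doc:1' jobs as a single batch...
const batch = await queue.accept(); // [{ batchKey: 'doc:1', ... }, { batchKey: 'doc:1', ... }]
// ...and while that batch is in flight, other consumers only get the 'doc:2' batch.
const next = await queue.accept(); // [{ batchKey: 'doc:2', ... }]
```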
## `timeout`
If a job runs for longer than 30 seconds, it is considered timed out and becomes eligible to be picked up again by another `runner`.
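A runner does not have to wait for the timeout: a batch can also be handed back explicitly with `return(jobs, true)`, which makes it immediately acceptable again. A sketch of that API (not how `JobRunner` itself handles errors):

```ts
const jobs = await queue.accept();
if (jobs) {
  try {
    // ... process the batch ...
    await queue.return(jobs); // completed: the jobs are removed from the queue
  } catch (err) {
    await queue.return(jobs, true); // hand the batch back for another attempt
    throw err;
  }
}
```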
## Error Handling
If an error is thrown during job execution, the error is logged, but the job is still treated as complete and is not retried.

View File

@@ -1,231 +0,0 @@
/**
* @vitest-environment happy-dom
*/
import 'fake-indexeddb/auto';
import { afterEach, beforeEach, describe, expect, test, vitest } from 'vitest';
import { IndexedDBJobQueue } from '../impl/indexeddb';
import type { JobQueue } from '../queue';
let queue: JobQueue<{
a: string;
}> = null!;
describe.each([{ name: 'idb', backend: IndexedDBJobQueue }])(
'impl tests($name)',
({ backend }) => {
beforeEach(async () => {
queue = new backend();
await queue.clear();
vitest.useFakeTimers({
toFake: ['Date'],
});
});
afterEach(() => {
vitest.useRealTimers();
});
test('basic', async () => {
await queue.enqueue([
{
batchKey: '1',
payload: { a: 'hello' },
},
{
batchKey: '2',
payload: { a: 'world' },
},
]);
const job1 = await queue.accept();
const job2 = await queue.accept();
expect([job1!, job2!]).toEqual([
[
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
],
[
{
id: expect.any(String),
batchKey: '2',
payload: { a: 'world' },
},
],
]);
const job3 = await queue.accept();
expect(job3).toBeNull();
await queue.return(job1!);
await queue.return(job2!);
});
test('batch', async () => {
await queue.enqueue([
{
batchKey: '1',
payload: { a: 'hello' },
},
{
batchKey: '1',
payload: { a: 'world' },
},
]);
const job1 = await queue.accept();
expect(job1).toEqual(
expect.arrayContaining([
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'world' },
},
])
);
});
test('timeout', async () => {
await queue.enqueue([
{
batchKey: '1',
payload: { a: 'hello' },
},
]);
{
const job = await queue.accept();
expect(job).toEqual([
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
]);
}
{
const job = await queue.accept();
expect(job).toBeNull();
}
vitest.advanceTimersByTime(1000 * 60 * 60);
{
const job = await queue.accept();
expect(job).toEqual([
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
]);
}
});
test('waitForAccept', async () => {
const abort = new AbortController();
let result = null as any;
queue.waitForAccept(abort.signal).then(jobs => (result = jobs));
await new Promise(resolve => setTimeout(resolve, 100));
expect(result).toBeNull();
await queue.enqueue([
{
batchKey: '1',
payload: { a: 'hello' },
},
]);
await vitest.waitFor(() => {
expect(result).toEqual([
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
]);
});
});
test('waitForAccept race', async () => {
const abort = new AbortController();
let result1 = null as any;
let result2 = null as any;
queue.waitForAccept(abort.signal).then(jobs => (result1 = jobs));
queue.waitForAccept(abort.signal).then(jobs => (result2 = jobs));
await new Promise(resolve => setTimeout(resolve, 100));
expect(result1).toBeNull();
expect(result2).toBeNull();
await queue.enqueue([
{
batchKey: '1',
payload: { a: 'hello' },
},
]);
await new Promise(resolve => setTimeout(resolve, 100));
expect([result1, result2]).toEqual(
expect.arrayContaining([
[
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
],
null,
])
);
await queue.enqueue([
{
batchKey: '2',
payload: { a: 'world' },
},
]);
await vitest.waitFor(() => {
expect([result1, result2]).toEqual(
expect.arrayContaining([
[
{
id: expect.any(String),
batchKey: '1',
payload: { a: 'hello' },
},
],
[
{
id: expect.any(String),
batchKey: '2',
payload: { a: 'world' },
},
],
])
);
});
});
}
);

View File

@@ -1,257 +0,0 @@
import type { DBSchema, IDBPDatabase } from 'idb';
import { openDB } from 'idb';
import { merge, Observable, of, throttleTime } from 'rxjs';
import { fromPromise } from '../../../../livedata';
import { throwIfAborted } from '../../../../utils';
import { exhaustMapWithTrailing } from '../../../../utils/';
import type { Job, JobParams, JobQueue } from '../../';
interface IndexDB extends DBSchema {
jobs: {
key: number;
value: JobRecord;
indexes: {
batchKey: string;
};
};
}
interface JobRecord {
batchKey: string;
startTime: number | null;
payload: any;
}
export class IndexedDBJobQueue<J> implements JobQueue<J> {
database: IDBPDatabase<IndexDB> = null as any;
broadcast = new BroadcastChannel('idb-job-queue:' + this.databaseName);
constructor(private readonly databaseName: string = 'jobs') {}
async enqueue(jobs: JobParams[]): Promise<void> {
await this.ensureInitialized();
const trx = this.database.transaction(['jobs'], 'readwrite');
for (const job of jobs) {
await trx.objectStore('jobs').add({
batchKey: job.batchKey,
payload: job.payload,
startTime: null,
});
}
trx.commit();
// send broadcast to notify new jobs
this.broadcast.postMessage('new-jobs');
}
async accept(): Promise<Job[] | null> {
await this.ensureInitialized();
const jobs = [];
const trx = this.database.transaction(['jobs'], 'readwrite', {
durability: 'relaxed',
});
// if no priority jobs
if (jobs.length === 0) {
const batchKeys = trx.objectStore('jobs').index('batchKey').iterate();
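// Scan the batchKey index in key order and collect the first batch whose jobs
// are all acceptable (never started, or started but timed out).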
let currentBatchKey: string = null as any;
let currentBatchJobs = [];
let skipCurrentBatch = false;
for await (const item of batchKeys) {
if (item.value.batchKey !== currentBatchKey) {
if (!skipCurrentBatch && currentBatchJobs.length > 0) {
break;
}
currentBatchKey = item.value.batchKey;
currentBatchJobs = [];
skipCurrentBatch = false;
}
if (skipCurrentBatch) {
continue;
}
if (this.isAcceptable(item.value)) {
currentBatchJobs.push({
id: item.primaryKey,
job: item.value,
});
} else {
skipCurrentBatch = true;
}
}
if (skipCurrentBatch === false && currentBatchJobs.length > 0) {
jobs.push(...currentBatchJobs);
}
}
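// Mark the accepted jobs as started so other runners skip this batch until it
// completes or times out.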
for (const { id, job } of jobs) {
const startTime = Date.now();
await trx.objectStore('jobs').put({ ...job, startTime }, id);
}
if (jobs.length === 0) {
return null;
}
return jobs.map(({ id, job }) => ({
id: id.toString(),
batchKey: job.batchKey,
payload: job.payload,
}));
}
async waitForAccept(signal: AbortSignal): Promise<Job<J>[]> {
const broadcast = new BroadcastChannel(
'idb-job-queue:' + this.databaseName
);
try {
let deferred = Promise.withResolvers<void>();
broadcast.onmessage = () => {
deferred.resolve();
};
while (throwIfAborted(signal)) {
const jobs = await this.accept();
if (jobs !== null) {
return jobs;
}
await Promise.race([
deferred.promise,
new Promise(resolve => {
setTimeout(resolve, 5000);
}),
new Promise((_, reject) => {
// exit if manually stopped
if (signal?.aborted) {
reject(signal.reason);
}
signal?.addEventListener('abort', () => {
reject(signal.reason);
});
}),
]);
deferred = Promise.withResolvers<void>();
}
return [];
} finally {
broadcast.close();
}
}
async complete(jobs: Job[]): Promise<void> {
await this.ensureInitialized();
const trx = this.database.transaction(['jobs'], 'readwrite', {
durability: 'relaxed',
});
for (const { id } of jobs) {
await trx
.objectStore('jobs')
.delete(typeof id === 'string' ? parseInt(id) : id);
}
trx.commit();
this.broadcast.postMessage('job-completed');
}
async return(jobs: Job[], retry: boolean = false): Promise<void> {
await this.ensureInitialized();
const trx = this.database.transaction(['jobs'], 'readwrite', {
durability: 'relaxed',
});
for (const { id } of jobs) {
if (retry) {
const nid = typeof id === 'string' ? parseInt(id) : id;
const job = await trx.objectStore('jobs').get(nid);
if (job) {
await trx.objectStore('jobs').put({ ...job, startTime: null }, nid);
}
} else {
await trx
.objectStore('jobs')
.delete(typeof id === 'string' ? parseInt(id) : id);
}
}
trx.commit();
this.broadcast.postMessage('job-completed');
}
async clear(): Promise<void> {
await this.ensureInitialized();
const trx = this.database.transaction(['jobs'], 'readwrite', {
durability: 'relaxed',
});
await trx.objectStore('jobs').clear();
}
private async ensureInitialized(): Promise<void> {
if (!this.database) {
await this.initialize();
}
}
private async initialize(): Promise<void> {
if (this.database) {
return;
}
this.database = await openDB(this.databaseName, 1, {
upgrade(database) {
const jobs = database.createObjectStore('jobs', {
autoIncrement: true,
});
jobs.createIndex('batchKey', 'batchKey');
},
});
}
TIMEOUT = 1000 * 30 /* 30 seconds */;
private isTimeout(job: JobRecord) {
return job.startTime !== null && job.startTime + this.TIMEOUT < Date.now();
}
private isAcceptable(job: JobRecord) {
return job.startTime === null || this.isTimeout(job);
}
get status$() {
return merge(
of(1),
new Observable(subscriber => {
const broadcast = new BroadcastChannel(
'idb-job-queue:' + this.databaseName
);
broadcast.onmessage = () => {
subscriber.next(1);
};
return () => {
broadcast.close();
};
})
).pipe(
throttleTime(300, undefined, { leading: true, trailing: true }),
exhaustMapWithTrailing(() =>
fromPromise(async () => {
await this.ensureInitialized();
const trx = this.database.transaction(['jobs'], 'readonly');
const remaining = await trx.objectStore('jobs').count();
return { remaining };
})
)
);
}
}

View File

@@ -1,2 +0,0 @@
export * from './queue';
export * from './runner';

View File

@@ -1,28 +0,0 @@
import type { Observable } from 'rxjs';
export interface JobParams<Payload = any> {
batchKey: string;
payload: Payload;
}
export interface Job<Payload = any> extends JobParams<Payload> {
id: string;
}
export interface JobQueueStatus {
remaining: number;
}
export interface JobQueue<Payload> {
enqueue(jobs: JobParams<Payload>[]): Promise<void>;
accept(): Promise<Job<Payload>[] | null>;
waitForAccept(signal: AbortSignal): Promise<Job<Payload>[]>;
return(jobs: Job<Payload>[], retry?: boolean): Promise<void>;
clear(): Promise<void>;
status$: Observable<JobQueueStatus>;
}
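// Typical lifecycle: enqueue() adds jobs; accept()/waitForAccept() hands one
// batch to a single consumer; return(jobs) marks the batch complete, while
// return(jobs, true) puts it back for another attempt.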

View File

@@ -1,63 +0,0 @@
import { DebugLogger } from '@affine/debug';
import { MANUALLY_STOP, throwIfAborted } from '../../utils';
import type { Job, JobQueue } from './queue';
const logger = new DebugLogger('job-runner');
export class JobRunner<J> {
abort: AbortController | null = null;
constructor(
private readonly queue: JobQueue<J>,
private readonly worker: (
jobs: Job<J>[],
signal: AbortSignal
) => Promise<void>,
private readonly interval: () => Promise<void> = async () => {}
) {}
start() {
this.stop();
this.abort = new AbortController();
this.loop(this.abort.signal).catch(err => {
if (err === MANUALLY_STOP) {
return;
}
logger.error(err);
});
}
stop() {
this.abort?.abort(MANUALLY_STOP);
this.abort = null;
}
async loop(signal: AbortSignal) {
while (throwIfAborted(signal)) {
const jobs = await this.queue.waitForAccept(signal);
if (jobs !== null) {
try {
await this.worker(jobs, signal);
await this.queue.return(jobs);
} catch (err) {
if (err === MANUALLY_STOP) {
await this.queue.return(jobs, true);
} else {
// TODO: retry logic
await this.queue.return(jobs);
}
logger.error(
'Error processing jobs',
err instanceof Error ? (err.stack ?? err.message) : err
);
}
} else {
await new Promise(resolve => setTimeout(resolve, 1000));
}
await this.interval();
}
}
}