feat: database indexing support (#14181)

Author: DarkSky
Date: 2025-12-30 05:23:09 +08:00
Committed by: GitHub
Parent: 95a5e941e7
Commit: ff2e96d847
7 changed files with 339 additions and 194 deletions

View File

@@ -50,6 +50,16 @@ export interface NativeMarkdownResult {
  markdown: string
}

export interface NativePageDocContent {
  title: string
  summary: string
}

export interface NativeWorkspaceDocContent {
  name: string
  avatarKey: string
}

export interface ParsedDoc {
  name: string
  chunks: Array<Chunk>
@@ -61,6 +71,10 @@ export declare function parseDocFromBinary(docBin: Buffer, docId: string): Nativ
export declare function parseDocToMarkdown(docBin: Buffer, docId: string, aiEditable?: boolean | undefined | null, docUrlPrefix?: string | undefined | null): NativeMarkdownResult
export declare function parsePageDoc(docBin: Buffer, maxSummaryLength?: number | undefined | null): NativePageDocContent | null
export declare function parseWorkspaceDoc(docBin: Buffer): NativeWorkspaceDocContent | null
export declare function readAllDocIdsFromRootDoc(docBin: Buffer, includeTrash?: boolean | undefined | null): Array<string>
export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise<boolean>
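The two new bindings mirror the Rust parser further down in this commit: each returns null when the binary is not the kind of doc it expects (no blocks map for a page doc, no meta map for a workspace doc). A minimal usage sketch, assuming the generated module is imported as @affine/server-native (the actual import path in the repo may differ):

```ts
import { parsePageDoc, parseWorkspaceDoc } from '@affine/server-native';

declare const docBin: Buffer; // a yjs v1 update, e.g. loaded from doc storage

// A positive length caps the summary; the server-side default is 150 characters.
const page = parsePageDoc(docBin, 150);
if (page) {
  console.log(page.title, page.summary);
}

// Returns null when the binary is not a workspace root doc.
const workspace = parseWorkspaceDoc(docBin);
console.log(workspace?.name, workspace?.avatarKey);

// Note: an empty update (or the two-byte "empty doc" update) is rejected by the
// native parser and surfaces as a thrown error, so wrap the calls in try/catch
// when the input may be empty.
```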

View File

@@ -1,4 +1,6 @@
use affine_common::doc_parser::{self, BlockInfo, CrawlResult, MarkdownResult};
use affine_common::doc_parser::{
  self, BlockInfo, CrawlResult, MarkdownResult, PageDocContent, WorkspaceDocContent,
};
use napi::bindgen_prelude::*;
use napi_derive::napi;
@@ -17,6 +19,36 @@ impl From<MarkdownResult> for NativeMarkdownResult {
}
}
#[napi(object)]
pub struct NativePageDocContent {
  pub title: String,
  pub summary: String,
}

impl From<PageDocContent> for NativePageDocContent {
  fn from(result: PageDocContent) -> Self {
    Self {
      title: result.title,
      summary: result.summary,
    }
  }
}

#[napi(object)]
pub struct NativeWorkspaceDocContent {
  pub name: String,
  pub avatar_key: String,
}

impl From<WorkspaceDocContent> for NativeWorkspaceDocContent {
  fn from(result: WorkspaceDocContent) -> Self {
    Self {
      name: result.name,
      avatar_key: result.avatar_key,
    }
  }
}

#[napi(object)]
pub struct NativeBlockInfo {
  pub block_id: String,
@@ -70,6 +102,23 @@ pub fn parse_doc_from_binary(doc_bin: Buffer, doc_id: String) -> Result<NativeCr
  Ok(result.into())
}

#[napi]
pub fn parse_page_doc(
  doc_bin: Buffer,
  max_summary_length: Option<i32>,
) -> Result<Option<NativePageDocContent>> {
  let result = doc_parser::parse_page_doc(doc_bin.into(), max_summary_length.map(|v| v as isize))
    .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
  Ok(result.map(Into::into))
}

#[napi]
pub fn parse_workspace_doc(doc_bin: Buffer) -> Result<Option<NativeWorkspaceDocContent>> {
  let result = doc_parser::parse_workspace_doc(doc_bin.into())
    .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
  Ok(result.map(Into::into))
}

#[napi]
pub fn parse_doc_to_markdown(
  doc_bin: Buffer,

View File

@@ -1,11 +1,6 @@
import { FactoryProvider, Injectable, Logger } from '@nestjs/common';
import { ModuleRef } from '@nestjs/core';
import {
  applyUpdate,
  diffUpdate,
  Doc as YDoc,
  encodeStateVectorFromUpdate,
} from 'yjs';
import { diffUpdate, encodeStateVectorFromUpdate } from 'yjs';

import {
  Cache,
@@ -48,16 +43,14 @@ export abstract class DocReader {
    protected readonly blobStorage: WorkspaceBlobStorage
  ) {}

  // keep methods to allow test mocking
  parseDocContent(bin: Uint8Array, maxSummaryLength = 150) {
    const doc = new YDoc();
    applyUpdate(doc, bin);
    return parsePageDoc(doc, { maxSummaryLength });
    return parsePageDoc(bin, { maxSummaryLength });
  }

  // keep methods to allow test mocking
  parseWorkspaceContent(bin: Uint8Array) {
    const doc = new YDoc();
    applyUpdate(doc, bin);
    return parseWorkspaceDoc(doc);
    return parseWorkspaceDoc(bin);
  }

  abstract getDoc(
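With the native binding in place, DocReader no longer hydrates a YDoc in JavaScript (applyUpdate) before extracting content; the raw update bytes go straight to the Rust parser. A rough sketch of how a caller or test might exercise the kept wrapper methods (the reader instance and binary are assumed):

```ts
declare const reader: DocReader; // any concrete DocReader implementation
declare const bin: Uint8Array;   // merged doc update from storage

// Default behaviour: summary capped around 150 characters.
const preview = reader.parseDocContent(bin);

// -1 removes the cap and switches to full-content extraction (the mode in which
// attachment, table and now database blocks also contribute text).
const full = reader.parseDocContent(bin, -1);

// Workspace metadata ({ name, avatarKey }) or null for non-workspace docs.
const meta = reader.parseWorkspaceContent(bin);
```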

View File

@@ -1,6 +1,6 @@
import { Array as YArray, Doc as YDoc, Map as YMap } from 'yjs';
import {
  parsePageDocFromBinary,
  parseWorkspaceDocFromBinary,
  parseYDocFromBinary,
  parseYDocToMarkdown,
  readAllDocIdsFromRootDoc,
@@ -16,144 +16,26 @@ export interface WorkspaceDocContent {
avatarKey: string;
}
type KnownFlavour =
| 'affine:page'
| 'affine:note'
| 'affine:surface'
| 'affine:paragraph'
| 'affine:list'
| 'affine:code'
| 'affine:image'
| 'affine:attachment'
| 'affine:transcription'
| 'affine:callout'
| 'affine:table';
export function parseWorkspaceDoc(doc: YDoc): WorkspaceDocContent | null {
  // not a workspace doc
  if (!doc.share.has('meta')) {
    return null;
  }
  const meta = doc.getMap('meta');
  return {
    name: meta.get('name') as string,
    avatarKey: meta.get('avatar') as string,
  };
}

export interface ParsePageOptions {
  maxSummaryLength: number;
}

export interface ParsePageOptions {
  maxSummaryLength?: number;
}

export function parseWorkspaceDoc(
  snapshot: Uint8Array
): WorkspaceDocContent | null {
  return parseWorkspaceDocFromBinary(Buffer.from(snapshot)) ?? null;
}
export function parsePageDoc(
doc: YDoc,
docSnapshot: Uint8Array,
opts: ParsePageOptions = { maxSummaryLength: 150 }
): PageDocContent | null {
// not a page doc
if (!doc.share.has('blocks')) {
return null;
}
const blocks = doc.getMap<YMap<any>>('blocks');
if (!blocks.size) {
return null;
}
const content: PageDocContent = {
title: '',
summary: '',
};
let summaryLenNeeded = opts.maxSummaryLength;
let root: YMap<any> | null = null;
for (const block of blocks.values()) {
const flavour = block.get('sys:flavour') as KnownFlavour;
if (flavour === 'affine:page') {
content.title = block.get('prop:title') as string;
root = block;
}
}
if (!root) {
return null;
}
const queue: string[] = [root.get('sys:id')];
function pushChildren(block: YMap<any>) {
const children = block.get('sys:children') as YArray<string> | undefined;
if (children?.length) {
for (let i = children.length - 1; i >= 0; i--) {
queue.push(children.get(i));
}
}
}
while (queue.length) {
const blockId = queue.pop();
const block = blockId ? blocks.get(blockId) : null;
if (!block) {
break;
}
const flavour = block.get('sys:flavour') as KnownFlavour;
switch (flavour) {
case 'affine:page':
case 'affine:note': {
pushChildren(block);
break;
}
case 'affine:attachment':
case 'affine:transcription':
case 'affine:callout': {
// only extract text in full content mode
if (summaryLenNeeded === -1) {
pushChildren(block);
}
break;
}
case 'affine:table': {
// only extract text in full content mode
if (summaryLenNeeded === -1) {
const contents: string[] = [...block.keys()]
.map(key => {
if (key.startsWith('prop:cells.') && key.endsWith('.text')) {
return block.get(key)?.toString() ?? '';
}
return '';
})
.filter(Boolean);
content.summary += contents.join('|');
}
break;
}
case 'affine:paragraph':
case 'affine:list':
case 'affine:code': {
pushChildren(block);
const text = block.get('prop:text');
if (!text) {
continue;
}
if (summaryLenNeeded === -1) {
content.summary += text.toString();
} else if (summaryLenNeeded > 0) {
content.summary += text.toString();
summaryLenNeeded -= text.length;
} else {
break;
}
}
}
}
return content;
return (
parsePageDocFromBinary(
Buffer.from(docSnapshot),
opts?.maxSummaryLength ?? 150
) ?? null
);
}
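The block walk that used to live here in TypeScript (queue of block ids, per-flavour handling, summary budget) now runs in the native parser; the wrapper only wraps the snapshot in a Buffer and forwards the option. Behaviour is intended to stay the same: the summary is truncated around maxSummaryLength characters (default 150), and -1 means no cap, in which case attachment, table and now database blocks contribute text as well. A hedged example (import path assumed):

```ts
import { parsePageDoc, parseWorkspaceDoc } from './doc';

declare const snapshot: Uint8Array; // encoded doc update

// Short preview for listings: { title, summary } | null, summary around 150 chars.
const preview = parsePageDoc(snapshot);

// Full text for search indexing, including database/table cell content.
const full = parsePageDoc(snapshot, { maxSummaryLength: -1 });

// Workspace name and avatar key, or null if the snapshot is not a root doc.
const meta = parseWorkspaceDoc(snapshot);
```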
export function readAllDocIdsFromWorkspaceSnapshot(snapshot: Uint8Array) {

View File

@@ -42,6 +42,8 @@ export const parseDoc = serverNativeModule.parseDoc;
export const htmlSanitize = serverNativeModule.htmlSanitize;
export const parseYDocFromBinary = serverNativeModule.parseDocFromBinary;
export const parseYDocToMarkdown = serverNativeModule.parseDocToMarkdown;
export const parsePageDocFromBinary = serverNativeModule.parsePageDoc;
export const parseWorkspaceDocFromBinary = serverNativeModule.parseWorkspaceDoc;
export const readAllDocIdsFromRootDoc =
serverNativeModule.readAllDocIdsFromRootDoc;
export const AFFINE_PRO_PUBLIC_KEY = serverNativeModule.AFFINE_PRO_PUBLIC_KEY;

View File

@@ -71,6 +71,19 @@ pub struct CrawlResult {
pub summary: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageDocContent {
  pub title: String,
  pub summary: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WorkspaceDocContent {
  pub name: String,
  #[serde(rename = "avatarKey")]
  pub avatar_key: String,
}
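The serde rename keeps the serialized field camelCase, so the Rust structs line up with the TypeScript-side interfaces. The shapes as seen from JS, with illustrative values only (import path assumed):

```ts
import type { PageDocContent, WorkspaceDocContent } from './doc';

// Example values only; note the camelCase `avatarKey` produced by the rename above.
const page: PageDocContent = { title: 'Meeting notes', summary: 'Agenda: review indexing' };
const workspace: WorkspaceDocContent = { name: 'My Workspace', avatarKey: 'avatar-blob-key' };
```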
#[derive(Error, Debug, Serialize, Deserialize)]
pub enum ParseError {
#[error("doc_not_found")]
@@ -97,6 +110,114 @@ pub struct MarkdownResult {
pub markdown: String,
}
pub fn parse_workspace_doc(doc_bin: Vec<u8>) -> Result<Option<WorkspaceDocContent>, ParseError> {
  if doc_bin.is_empty() || doc_bin == [0, 0] {
    return Err(ParseError::InvalidBinary);
  }
  let mut doc = DocOptions::new().build();
  doc
    .apply_update_from_binary_v1(&doc_bin)
    .map_err(|_| ParseError::InvalidBinary)?;
  let meta = match doc.get_map("meta") {
    Ok(meta) => meta,
    Err(_) => return Ok(None),
  };
  let name = get_string(&meta, "name").unwrap_or_default();
  let avatar_key = get_string(&meta, "avatar").unwrap_or_default();
  Ok(Some(WorkspaceDocContent { name, avatar_key }))
}

pub fn parse_page_doc(
  doc_bin: Vec<u8>,
  max_summary_length: Option<isize>,
) -> Result<Option<PageDocContent>, ParseError> {
  if doc_bin.is_empty() || doc_bin == [0, 0] {
    return Err(ParseError::InvalidBinary);
  }
  let mut doc = DocOptions::new().build();
  doc
    .apply_update_from_binary_v1(&doc_bin)
    .map_err(|_| ParseError::InvalidBinary)?;
  let blocks_map = match doc.get_map("blocks") {
    Ok(map) => map,
    Err(_) => return Ok(None),
  };
  if blocks_map.is_empty() {
    return Ok(None);
  }
  let Some(context) = DocContext::from_blocks_map(&blocks_map, PAGE_FLAVOUR) else {
    return Ok(None);
  };
  let mut stack = vec![context.root_block_id.clone()];
  let mut content = PageDocContent {
    title: context
      .block_pool
      .get(&context.root_block_id)
      .and_then(|block| get_string(block, "prop:title"))
      .unwrap_or_default(),
    summary: String::new(),
  };
  let mut summary_remaining = max_summary_length.unwrap_or(150);
  while let Some(block_id) = stack.pop() {
    let Some(block) = context.block_pool.get(&block_id) else {
      break;
    };
    let Some(flavour) = get_flavour(block) else {
      continue;
    };
    match flavour.as_str() {
      "affine:page" | "affine:note" => {
        push_children(&mut stack, block);
      }
      "affine:attachment" | "affine:transcription" | "affine:callout" => {
        if summary_remaining == -1 {
          push_children(&mut stack, block);
        }
      }
      "affine:database" => {
        if summary_remaining == -1 {
          append_database_summary(&mut content.summary, block, &context);
        }
      }
      "affine:table" => {
        if summary_remaining == -1 {
          let contents = gather_table_contents(block);
          if !contents.is_empty() {
            content.summary.push_str(&contents.join("|"));
          }
        }
      }
      "affine:paragraph" | "affine:list" | "affine:code" => {
        push_children(&mut stack, block);
        if let Some((text, len)) = text_content_for_summary(block, "prop:text") {
          if summary_remaining == -1 {
            content.summary.push_str(&text);
          } else if summary_remaining > 0 {
            content.summary.push_str(&text);
            summary_remaining -= len as isize;
          }
        }
      }
      _ => {}
    }
  }
  Ok(Some(content))
}
pub fn parse_doc_to_markdown(
doc_bin: Vec<u8>,
doc_id: String,
@@ -161,60 +282,32 @@ pub fn parse_doc_to_markdown(
  let title = get_string(block, "prop:title").unwrap_or_default();
  markdown.push_str(&format!("\n### {title}\n"));
  let columns = parse_database_columns(block);
  let cells_map = block.get("prop:cells").and_then(|v| v.to_map());
  if let (Some(columns), Some(cells_map)) = (columns, cells_map) {
    let escape_table = |s: &str| s.replace('|', "\\|").replace('\n', "<br>");
    let mut table = String::new();
    table.push('|');
    for column in &columns {
      table.push_str(&escape_table(column.name.as_deref().unwrap_or_default()));
      table.push('|');
    }
    table.push('\n');
    table.push('|');
    for _ in &columns {
      table.push_str("---|");
    }
    table.push('\n');
    let child_ids = collect_child_ids(block);
    for child_id in child_ids {
      table.push('|');
      let row_cells = cells_map.get(&child_id).and_then(|v| v.to_map());
      for column in &columns {
        let mut cell_text = String::new();
        if column.col_type == "title" {
          if let Some(child_block) = context.block_pool.get(&child_id) {
            if let Some(text_md) =
              text_to_inline_markdown(child_block, "prop:text", &md_options)
            {
              cell_text = text_md;
            } else if let Some((text, _)) = text_content(child_block, "prop:text") {
              cell_text = text;
            }
          }
        } else if let Some(row_cells) = &row_cells {
          if let Some(cell_val) = row_cells.get(&column.id).and_then(|v| v.to_map()) {
            if let Some(value) = cell_val.get("value") {
              if let Some(text_md) = delta_value_to_inline_markdown(&value, &md_options) {
                cell_text = text_md;
              } else {
                cell_text = format_cell_value(&value, column);
              }
            }
          }
        }
        table.push_str(&escape_table(&cell_text));
        table.push('|');
      }
      table.push('\n');
    }
    append_table_block(&mut markdown, &table);
  }
  if let Some(table) = build_database_table(block, &context, &md_options) {
    let escape_table = |s: &str| s.replace('|', "\\|").replace('\n', "<br>");
    let mut table_md = String::new();
    table_md.push('|');
    for column in &table.columns {
      table_md.push_str(&escape_table(column.name.as_deref().unwrap_or_default()));
      table_md.push('|');
    }
    table_md.push('\n');
    table_md.push('|');
    for _ in &table.columns {
      table_md.push_str("---|");
    }
    table_md.push('\n');
    for row in table.rows.into_iter() {
      table_md.push('|');
      for cell_text in row.into_iter() {
        table_md.push_str(&escape_table(&cell_text));
        table_md.push('|');
      }
      table_md.push('\n');
    }
    append_table_block(&mut markdown, &table_md);
  }
  continue;
}
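With the table construction factored into build_database_table, the markdown emitted for a database block keeps essentially the same shape as before: a "### title" heading followed by a pipe table whose cells escape "|" as "\|" and newlines as "<br>". A hypothetical database "Tasks" with Name/Status columns and one row would come out roughly as sketched here (example data only):

```ts
// Rough shape of the markdown produced for a hypothetical "Tasks" database block;
// exact surrounding whitespace depends on append_table_block.
const expectedMarkdown =
  '\n### Tasks\n' +
  '|Name|Status|\n' +
  '|---|---|\n' +
  '|Fix parser|In progress|\n';
```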
@@ -702,6 +795,117 @@ fn gather_table_contents(block: &Map) -> Vec<String> {
  contents
}

struct DatabaseTable {
  columns: Vec<DatabaseColumn>,
  rows: Vec<Vec<String>>,
}

fn build_database_table(
  block: &Map,
  context: &DocContext,
  md_options: &DeltaToMdOptions,
) -> Option<DatabaseTable> {
  let columns = parse_database_columns(block)?;
  let cells_map = block.get("prop:cells").and_then(|v| v.to_map())?;
  let child_ids = collect_child_ids(block);
  let mut rows = Vec::new();
  for child_id in child_ids {
    let row_cells = cells_map.get(&child_id).and_then(|v| v.to_map());
    let mut row = Vec::new();
    for column in columns.iter() {
      let mut cell_text = String::new();
      if column.col_type == "title" {
        if let Some(child_block) = context.block_pool.get(&child_id) {
          if let Some(text_md) = text_to_inline_markdown(child_block, "prop:text", md_options) {
            cell_text = text_md;
          } else if let Some((text, _)) = text_content(child_block, "prop:text") {
            cell_text = text;
          } else if let Some((text, _)) = text_content_for_summary(child_block, "prop:text") {
            cell_text = text;
          }
        }
      } else if let Some(row_cells) = &row_cells {
        if let Some(cell_val) = row_cells.get(&column.id).and_then(|v| v.to_map()) {
          if let Some(value) = cell_val.get("value") {
            if let Some(text_md) = delta_value_to_inline_markdown(&value, md_options) {
              cell_text = text_md;
            } else {
              cell_text = format_cell_value(&value, column);
            }
          }
        }
      }
      row.push(cell_text);
    }
    rows.push(row);
  }
  Some(DatabaseTable { columns, rows })
}

fn append_database_summary(summary: &mut String, block: &Map, context: &DocContext) {
  let md_options = DeltaToMdOptions::new(None);
  let Some(table) = build_database_table(block, context, &md_options) else {
    return;
  };
  if let Some(title) = get_string(block, "prop:title") {
    if !title.is_empty() {
      summary.push_str(&title);
      summary.push('|');
    }
  }
  for column in table.columns.iter() {
    if let Some(name) = column.name.as_ref() {
      if !name.is_empty() {
        summary.push_str(name);
        summary.push('|');
      }
    }
    for option in column.options.iter() {
      if let Some(value) = option.value.as_ref() {
        if !value.is_empty() {
          summary.push_str(value);
          summary.push('|');
        }
      }
    }
  }
  for row in table.rows.iter() {
    for cell_text in row.iter() {
      if !cell_text.is_empty() {
        summary.push_str(cell_text);
        summary.push('|');
      }
    }
  }
}
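append_database_summary is the indexing-side counterpart: instead of markdown it flattens the database title, column names, select-option values, and cell texts into one pipe-delimited string appended to the page summary, and it only runs in full-content mode (maxSummaryLength = -1). Continuing the hypothetical "Tasks" example, the contribution to the summary would look roughly like this:

```ts
// Hypothetical summary fragment for the "Tasks" database: title, column names,
// each column's select options, then cell values, each followed by '|'.
const summaryFragment =
  'Tasks|Name|Status|Todo|In progress|Done|Fix parser|In progress|';
```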
fn push_children(queue: &mut Vec<String>, block: &Map) {
  let mut child_ids = collect_child_ids(block);
  for child_id in child_ids.drain(..).rev() {
    queue.push(child_id);
  }
}

fn text_content_for_summary(block: &Map, key: &str) -> Option<(String, usize)> {
  if let Some((text, len)) = text_content(block, key) {
    return Some((text, len));
  }
  block.get(key).and_then(|value| {
    value_to_string(&value).map(|text| {
      let len = text.chars().count();
      (text, len)
    })
  })
}

struct DatabaseOption {
  id: Option<String>,
  value: Option<String>,

View File

@@ -4,6 +4,7 @@ mod delta_markdown;
mod value;
pub use affine::{
  get_doc_ids_from_binary, parse_doc_from_binary, parse_doc_to_markdown, BlockInfo, CrawlResult,
  MarkdownResult, ParseError,
};
pub use affine::{
  get_doc_ids_from_binary, parse_doc_from_binary, parse_doc_to_markdown, parse_page_doc,
  parse_workspace_doc, BlockInfo, CrawlResult, MarkdownResult, PageDocContent, ParseError,
  WorkspaceDocContent,
};