feat: native doc reader (#13881)

This commit is contained in:
DarkSky
2025-11-08 23:07:16 +08:00
committed by GitHub
parent 02dcfdcc40
commit 7e6ead4232
15 changed files with 1460 additions and 269 deletions

View File

@@ -0,0 +1,528 @@
use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
use serde_json::{Map as JsonMap, Value as JsonValue};
use thiserror::Error;
use y_octo::{Any, DocOptions, JwstCodecError, Map, Value};
const SUMMARY_LIMIT: usize = 1000;
const PAGE_FLAVOUR: &str = "affine:page";
const NOTE_FLAVOUR: &str = "affine:note";
const BOOKMARK_FLAVOURS: [&str; 5] = [
"affine:bookmark",
"affine:embed-youtube",
"affine:embed-figma",
"affine:embed-github",
"affine:embed-loom",
];
#[derive(Debug, Clone)]
pub struct CrawlDocInput {
pub doc_bin: Vec<u8>,
pub root_doc_bin: Option<Vec<u8>>,
pub space_id: String,
pub doc_id: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockInfo {
pub block_id: String,
pub flavour: String,
pub content: Option<Vec<String>>,
pub blob: Option<Vec<String>>,
pub ref_doc_id: Option<Vec<String>>,
pub ref_info: Option<Vec<String>>,
pub parent_flavour: Option<String>,
pub parent_block_id: Option<String>,
pub additional: Option<String>,
}
impl BlockInfo {
fn base(
block_id: &str,
flavour: &str,
parent_flavour: Option<&String>,
parent_block_id: Option<&String>,
additional: Option<String>,
) -> Self {
Self {
block_id: block_id.to_string(),
flavour: flavour.to_string(),
content: None,
blob: None,
ref_doc_id: None,
ref_info: None,
parent_flavour: parent_flavour.cloned(),
parent_block_id: parent_block_id.cloned(),
additional,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlResult {
pub blocks: Vec<BlockInfo>,
pub title: String,
pub summary: String,
}
#[derive(Error, Debug, Serialize, Deserialize)]
pub enum ParseError {
#[error("doc_not_found")]
DocNotFound,
#[error("invalid_binary")]
InvalidBinary,
#[error("sqlite_error: {0}")]
SqliteError(String),
#[error("parser_error: {0}")]
ParserError(String),
#[error("unknown: {0}")]
Unknown(String),
}
impl From<JwstCodecError> for ParseError {
fn from(value: JwstCodecError) -> Self {
Self::ParserError(value.to_string())
}
}
pub fn parse_doc_from_binary(input: CrawlDocInput) -> Result<CrawlResult, ParseError> {
let CrawlDocInput {
doc_bin,
root_doc_bin: _,
space_id: _,
doc_id,
} = input;
if doc_bin.is_empty() {
return Err(ParseError::InvalidBinary);
}
let mut doc = DocOptions::new().with_guid(doc_id.clone()).build();
doc
.apply_update_from_binary_v1(&doc_bin)
.map_err(|_| ParseError::InvalidBinary)?;
let blocks_map = doc.get_map("blocks")?;
if blocks_map.is_empty() {
return Err(ParseError::ParserError("blocks map is empty".into()));
}
let mut block_pool: HashMap<String, Map> = HashMap::new();
let mut parent_lookup: HashMap<String, String> = HashMap::new();
for (_, value) in blocks_map.iter() {
if let Some(block_map) = value.to_map() {
if let Some(block_id) = get_block_id(&block_map) {
for child_id in collect_child_ids(&block_map) {
parent_lookup.insert(child_id, block_id.clone());
}
block_pool.insert(block_id, block_map);
}
}
}
let root_block_id = block_pool
.iter()
.find_map(|(id, block)| {
get_flavour(block)
.filter(|flavour| flavour == PAGE_FLAVOUR)
.map(|_| id.clone())
})
.ok_or_else(|| ParseError::ParserError("root block not found".into()))?;
let mut queue: Vec<(Option<String>, String)> = vec![(None, root_block_id.clone())];
let mut visited: HashSet<String> = HashSet::from([root_block_id.clone()]);
let mut blocks: Vec<BlockInfo> = Vec::with_capacity(block_pool.len());
let mut doc_title = String::new();
let mut summary = String::new();
let mut summary_remaining = SUMMARY_LIMIT as isize;
while let Some((parent_block_id, block_id)) = queue.pop() {
let block = match block_pool.get(&block_id) {
Some(block) => block,
None => continue,
};
let flavour = match get_flavour(block) {
Some(flavour) => flavour,
None => continue,
};
let parent_block = parent_block_id.as_ref().and_then(|id| block_pool.get(id));
let parent_flavour = parent_block.and_then(get_flavour);
let note_block = nearest_by_flavour(&block_id, NOTE_FLAVOUR, &parent_lookup, &block_pool);
let note_block_id = note_block.as_ref().and_then(get_block_id);
let display_mode = determine_display_mode(note_block.as_ref());
// enqueue children first to keep traversal order similar to JS implementation
let mut child_ids = collect_child_ids(block);
for child_id in child_ids.drain(..).rev() {
if visited.insert(child_id.clone()) {
queue.push((Some(block_id.clone()), child_id));
}
}
let build_block = |database_name: Option<&String>| {
BlockInfo::base(
&block_id,
&flavour,
parent_flavour.as_ref(),
parent_block_id.as_ref(),
compose_additional(&display_mode, note_block_id.as_ref(), database_name),
)
};
if flavour == PAGE_FLAVOUR {
let title = get_string(block, "prop:title").unwrap_or_default();
doc_title = title.clone();
let mut info = build_block(None);
info.content = Some(vec![title]);
blocks.push(info);
continue;
}
if matches!(
flavour.as_str(),
"affine:paragraph" | "affine:list" | "affine:code"
) {
if let Some((text, text_len)) = text_content(block, "prop:text") {
let database_name = if flavour == "affine:paragraph"
&& parent_flavour.as_deref() == Some("affine:database")
{
parent_block.and_then(|map| get_string(map, "prop:title"))
} else {
None
};
let mut info = build_block(database_name.as_ref());
info.content = Some(vec![text.clone()]);
blocks.push(info);
append_summary(&mut summary, &mut summary_remaining, text_len, &text);
}
continue;
}
if matches!(
flavour.as_str(),
"affine:embed-linked-doc" | "affine:embed-synced-doc"
) {
if let Some(page_id) = get_string(block, "prop:pageId") {
let mut info = build_block(None);
info.ref_doc_id = Some(vec![page_id.clone()]);
if let Some(payload) = embed_ref_payload(block, &page_id) {
info.ref_info = Some(vec![payload]);
}
blocks.push(info);
}
continue;
}
if flavour == "affine:attachment" {
if let Some(blob_id) = get_string(block, "prop:sourceId") {
let mut info = build_block(None);
info.blob = Some(vec![blob_id]);
info.content = Some(vec![get_string(block, "prop:name").unwrap_or_default()]);
blocks.push(info);
}
continue;
}
if flavour == "affine:image" {
if let Some(blob_id) = get_string(block, "prop:sourceId") {
let mut info = build_block(None);
info.blob = Some(vec![blob_id]);
info.content = Some(vec![get_string(block, "prop:caption").unwrap_or_default()]);
blocks.push(info);
}
continue;
}
if flavour == "affine:surface" {
let texts = gather_surface_texts(block);
let mut info = build_block(None);
info.content = Some(texts);
blocks.push(info);
continue;
}
if flavour == "affine:database" {
let (texts, database_name) = gather_database_texts(block);
let mut info = BlockInfo::base(
&block_id,
&flavour,
parent_flavour.as_ref(),
parent_block_id.as_ref(),
compose_additional(
&display_mode,
note_block_id.as_ref(),
database_name.as_ref(),
),
);
info.content = Some(texts);
blocks.push(info);
continue;
}
if flavour == "affine:latex" {
if let Some(content) = get_string(block, "prop:latex") {
let mut info = build_block(None);
info.content = Some(vec![content]);
blocks.push(info);
}
continue;
}
if flavour == "affine:table" {
let contents = gather_table_contents(block);
let mut info = build_block(None);
info.content = Some(contents);
blocks.push(info);
continue;
}
if BOOKMARK_FLAVOURS.contains(&flavour.as_str()) {
blocks.push(build_block(None));
}
}
if doc_title.is_empty() {
doc_title = "Untitled".into();
}
Ok(CrawlResult {
blocks,
title: doc_title,
summary,
})
}
fn collect_child_ids(block: &Map) -> Vec<String> {
block
.get("sys:children")
.and_then(|value| value.to_array())
.map(|array| {
array
.iter()
.filter_map(|value| value_to_string(&value))
.collect::<Vec<_>>()
})
.unwrap_or_default()
}
fn get_block_id(block: &Map) -> Option<String> {
get_string(block, "sys:id")
}
fn get_flavour(block: &Map) -> Option<String> {
get_string(block, "sys:flavour")
}
fn get_string(block: &Map, key: &str) -> Option<String> {
block.get(key).and_then(|value| value_to_string(&value))
}
fn text_content(block: &Map, key: &str) -> Option<(String, usize)> {
block.get(key).and_then(|value| {
value.to_text().map(|text| {
let content = text.to_string();
let len = text.len() as usize;
(content, len)
})
})
}
fn nearest_by_flavour(
start: &str,
flavour: &str,
parent_lookup: &HashMap<String, String>,
blocks: &HashMap<String, Map>,
) -> Option<Map> {
let mut cursor = Some(start.to_string());
while let Some(node) = cursor {
if let Some(block) = blocks.get(&node) {
if get_flavour(block).as_deref() == Some(flavour) {
return Some(block.clone());
}
}
cursor = parent_lookup.get(&node).cloned();
}
None
}
fn determine_display_mode(note_block: Option<&Map>) -> String {
match note_block.and_then(|block| get_string(block, "prop:displayMode")) {
Some(mode) if mode == "both" => "page".into(),
Some(mode) => mode,
None => "edgeless".into(),
}
}
fn compose_additional(
display_mode: &str,
note_block_id: Option<&String>,
database_name: Option<&String>,
) -> Option<String> {
let mut payload = JsonMap::new();
payload.insert(
"displayMode".into(),
JsonValue::String(display_mode.to_string()),
);
if let Some(note_id) = note_block_id {
payload.insert("noteBlockId".into(), JsonValue::String(note_id.clone()));
}
if let Some(name) = database_name {
payload.insert("databaseName".into(), JsonValue::String(name.clone()));
}
Some(JsonValue::Object(payload).to_string())
}
fn embed_ref_payload(block: &Map, page_id: &str) -> Option<String> {
let mut payload = JsonMap::new();
payload.insert("docId".into(), JsonValue::String(page_id.to_string()));
if let Some(params_value) = block.get("prop:params") {
if let Ok(JsonValue::Object(params)) = serde_json::to_value(&params_value) {
for (key, value) in params.into_iter() {
payload.insert(key, value);
}
}
}
Some(JsonValue::Object(payload).to_string())
}
fn gather_surface_texts(block: &Map) -> Vec<String> {
let mut texts = Vec::new();
let elements = match block.get("prop:elements").and_then(|value| value.to_map()) {
Some(map) => map,
None => return texts,
};
if elements
.get("type")
.and_then(|value| value_to_string(&value))
.as_deref()
!= Some("$blocksuite:internal:native$")
{
return texts;
}
if let Some(value_map) = elements.get("value").and_then(|value| value.to_map()) {
for value in value_map.values() {
if let Some(element) = value.to_map() {
if let Some(text) = element.get("text").and_then(|value| value.to_text()) {
texts.push(text.to_string());
}
}
}
}
texts.sort();
texts
}
fn gather_database_texts(block: &Map) -> (Vec<String>, Option<String>) {
let mut texts = Vec::new();
let database_title = get_string(block, "prop:title");
if let Some(title) = &database_title {
texts.push(title.clone());
}
if let Some(columns) = block.get("prop:columns").and_then(|value| value.to_array()) {
for column_value in columns.iter() {
if let Some(column) = column_value.to_map() {
if let Some(name) = get_string(&column, "name") {
texts.push(name);
}
if let Some(data) = column.get("data").and_then(|value| value.to_map()) {
if let Some(options) = data.get("options").and_then(|value| value.to_array()) {
for option_value in options.iter() {
if let Some(option) = option_value.to_map() {
if let Some(value) = get_string(&option, "value") {
texts.push(value);
}
}
}
}
}
}
}
}
(texts, database_title)
}
fn gather_table_contents(block: &Map) -> Vec<String> {
let mut contents = Vec::new();
for key in block.keys() {
if key.starts_with("prop:cells.") && key.ends_with(".text") {
if let Some(value) = block.get(key).and_then(|value| value_to_string(&value)) {
if !value.is_empty() {
contents.push(value);
}
}
}
}
contents
}
fn value_to_string(value: &Value) -> Option<String> {
if let Some(text) = value.to_text() {
return Some(text.to_string());
}
if let Some(any) = value.to_any() {
return any_to_string(&any);
}
None
}
fn any_to_string(any: &Any) -> Option<String> {
match any {
Any::String(value) => Some(value.to_string()),
Any::Integer(value) => Some(value.to_string()),
Any::Float32(value) => Some(value.0.to_string()),
Any::Float64(value) => Some(value.0.to_string()),
Any::BigInt64(value) => Some(value.to_string()),
Any::True => Some("true".into()),
Any::False => Some("false".into()),
Any::Null | Any::Undefined => None,
Any::Array(_) | Any::Object(_) | Any::Binary(_) => serde_json::to_string(any).ok(),
}
}
fn append_summary(summary: &mut String, remaining: &mut isize, text_len: usize, text: &str) {
if *remaining > 0 {
summary.push_str(text);
*remaining -= text_len as isize;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_doc_from_binary() {
let json = include_bytes!("../fixtures/demo.ydoc.json");
let input = CrawlDocInput {
doc_bin: include_bytes!("../fixtures/demo.ydoc").to_vec(),
root_doc_bin: None,
space_id: "o9WCLGyxkLxdULZ-f2B9V".to_string(),
doc_id: "dYpV7PPhk8amRkY5IAcVO".to_string(),
};
let result = parse_doc_from_binary(input).unwrap();
let config = assert_json_diff::Config::new(assert_json_diff::CompareMode::Strict)
.numeric_mode(assert_json_diff::NumericMode::AssumeFloat);
assert_json_diff::assert_json_matches!(
serde_json::from_slice::<serde_json::Value>(json).unwrap(),
serde_json::json!(result),
config
);
}
}

View File

@@ -1,3 +1,6 @@
#[cfg(feature = "doc-loader")]
pub mod doc_loader;
#[cfg(feature = "ydoc-loader")]
pub mod doc_parser;
#[cfg(feature = "hashcash")]
pub mod hashcash;