diff --git a/Cargo.lock b/Cargo.lock index 872e43bfc9..63793b04dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,8 +43,10 @@ dependencies = [ "criterion", "docx-parser", "infer", + "nanoid", "path-ext", "pdf-extract", + "pulldown-cmark", "rand 0.9.2", "rayon", "readability", @@ -1793,6 +1795,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -3474,10 +3485,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0" dependencies = [ "bitflags 2.10.0", + "getopts", "memchr", + "pulldown-cmark-escape", "unicase", ] +[[package]] +name = "pulldown-cmark-escape" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae" + [[package]] name = "quick-error" version = "1.2.3" @@ -5161,6 +5180,12 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "uniffi" version = "0.29.5" diff --git a/Cargo.toml b/Cargo.toml index 4bfdc1be47..0cdf5344fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,6 +71,7 @@ resolver = "3" phf = { version = "0.11", features = ["macros"] } proptest = "1.3" proptest-derive = "0.5" + pulldown-cmark = "0.13" rand = "0.9" rand_chacha = "0.9" rand_distr = "0.5" diff --git a/packages/backend/native/index.d.ts b/packages/backend/native/index.d.ts index b13c0ad846..175895cd8e 100644 --- a/packages/backend/native/index.d.ts +++ b/packages/backend/native/index.d.ts @@ -4,6 +4,20 @@ export declare class Tokenizer { count(content: string, allowedSpecial?: Array | undefined | null): number } +/** + * Adds a document ID to the workspace root doc's meta.pages array. + * This registers the document in the workspace so it appears in the UI. + * + * # Arguments + * * `root_doc_bin` - The current root doc binary (workspaceId doc) + * * `doc_id` - The document ID to add + * * `title` - Optional title for the document + * + * # Returns + * A Buffer containing the y-octo update binary to apply to the root doc + */ +export declare function addDocToRootDoc(rootDocBin: Buffer, docId: string, title?: string | undefined | null): Buffer + export const AFFINE_PRO_LICENSE_AES_KEY: string | undefined | null export const AFFINE_PRO_PUBLIC_KEY: string | undefined | null @@ -19,6 +33,18 @@ export declare function getMime(input: Uint8Array): string export declare function htmlSanitize(input: string): string +/** + * Converts markdown content to AFFiNE-compatible y-octo document binary. + * + * # Arguments + * * `markdown` - The markdown content to convert + * * `doc_id` - The document ID to use for the y-octo doc + * + * # Returns + * A Buffer containing the y-octo document update binary + */ +export declare function markdownToDocBinary(markdown: string, docId: string): Buffer + /** * Merge updates in form like `Y.applyUpdate(doc, update)` way and return the * result binary. @@ -77,4 +103,18 @@ export declare function parseWorkspaceDoc(docBin: Buffer): NativeWorkspaceDocCon export declare function readAllDocIdsFromRootDoc(docBin: Buffer, includeTrash?: boolean | undefined | null): Array +/** + * Updates an existing document with new markdown content. + * Uses structural and text-level diffing to apply minimal changes. + * + * # Arguments + * * `existing_binary` - The current document binary + * * `new_markdown` - The new markdown content to apply + * * `doc_id` - The document ID + * + * # Returns + * A Buffer containing only the delta (changes) as a y-octo update binary + */ +export declare function updateDocWithMarkdown(existingBinary: Buffer, newMarkdown: string, docId: string): Buffer + export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise diff --git a/packages/backend/native/src/doc.rs b/packages/backend/native/src/doc.rs index 418d4ab78a..550150d541 100644 --- a/packages/backend/native/src/doc.rs +++ b/packages/backend/native/src/doc.rs @@ -132,3 +132,52 @@ pub fn read_all_doc_ids_from_root_doc(doc_bin: Buffer, include_trash: Option Result { + let result = + doc_parser::markdown_to_ydoc(&markdown, &doc_id).map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?; + Ok(Buffer::from(result)) +} + +/// Updates an existing document with new markdown content. +/// Uses structural and text-level diffing to apply minimal changes. +/// +/// # Arguments +/// * `existing_binary` - The current document binary +/// * `new_markdown` - The new markdown content to apply +/// * `doc_id` - The document ID +/// +/// # Returns +/// A Buffer containing only the delta (changes) as a y-octo update binary +#[napi] +pub fn update_doc_with_markdown(existing_binary: Buffer, new_markdown: String, doc_id: String) -> Result { + let result = doc_parser::update_ydoc(&existing_binary, &new_markdown, &doc_id) + .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?; + Ok(Buffer::from(result)) +} + +/// Adds a document ID to the workspace root doc's meta.pages array. +/// This registers the document in the workspace so it appears in the UI. +/// +/// # Arguments +/// * `root_doc_bin` - The current root doc binary (workspaceId doc) +/// * `doc_id` - The document ID to add +/// * `title` - Optional title for the document +/// +/// # Returns +/// A Buffer containing the y-octo update binary to apply to the root doc +#[napi] +pub fn add_doc_to_root_doc(root_doc_bin: Buffer, doc_id: String, title: Option) -> Result { + let result = doc_parser::add_doc_to_root_doc(root_doc_bin.into(), &doc_id, title.as_deref()) + .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?; + Ok(Buffer::from(result)) +} diff --git a/packages/backend/server/src/core/doc/index.ts b/packages/backend/server/src/core/doc/index.ts index ffffc3926b..8058bed7dc 100644 --- a/packages/backend/server/src/core/doc/index.ts +++ b/packages/backend/server/src/core/doc/index.ts @@ -11,6 +11,7 @@ import { DocEventsListener } from './event'; import { DocStorageCronJob } from './job'; import { DocStorageOptions } from './options'; import { DatabaseDocReader, DocReader, DocReaderProvider } from './reader'; +import { DocWriter } from './writer'; @Module({ imports: [QuotaModule, PermissionModule, StorageModule], @@ -22,10 +23,12 @@ import { DatabaseDocReader, DocReader, DocReaderProvider } from './reader'; DocReaderProvider, DatabaseDocReader, DocEventsListener, + DocWriter, ], exports: [ DatabaseDocReader, DocReader, + DocWriter, PgWorkspaceDocStorageAdapter, PgUserspaceDocStorageAdapter, ], @@ -35,6 +38,7 @@ export { // only for doc-service DatabaseDocReader, DocReader, + DocWriter, PgUserspaceDocStorageAdapter, PgWorkspaceDocStorageAdapter, }; diff --git a/packages/backend/server/src/core/doc/writer.ts b/packages/backend/server/src/core/doc/writer.ts new file mode 100644 index 0000000000..ecfcf04f08 --- /dev/null +++ b/packages/backend/server/src/core/doc/writer.ts @@ -0,0 +1,131 @@ +import { Injectable, Logger, NotFoundException } from '@nestjs/common'; +import { nanoid } from 'nanoid'; + +import { + addDocToRootDoc, + markdownToDocBinary, + updateDocWithMarkdown, +} from '../../native'; +import { PgWorkspaceDocStorageAdapter } from './adapters/workspace'; + +export interface CreateDocResult { + docId: string; +} + +export interface UpdateDocResult { + success: boolean; +} + +@Injectable() +export class DocWriter { + private readonly logger = new Logger(DocWriter.name); + + constructor(private readonly storage: PgWorkspaceDocStorageAdapter) {} + + /** + * Creates a new document from markdown content. + * + * @param workspaceId - The workspace ID + * @param markdown - The markdown content + * @param editorId - Optional editor ID for tracking + * @returns The created document ID + */ + async createDoc( + workspaceId: string, + markdown: string, + editorId?: string + ): Promise { + // Fetch workspace root doc first - reject if not found + // The root doc (docId = workspaceId) contains meta.pages array + const rootDoc = await this.storage.getDoc(workspaceId, workspaceId); + if (!rootDoc?.bin) { + throw new NotFoundException( + `Workspace ${workspaceId} not found or has no root document` + ); + } + + const rootDocBin = Buffer.isBuffer(rootDoc.bin) + ? rootDoc.bin + : Buffer.from( + rootDoc.bin.buffer, + rootDoc.bin.byteOffset, + rootDoc.bin.byteLength + ); + + const docId = nanoid(); + + this.logger.debug( + `Creating doc ${docId} in workspace ${workspaceId} from markdown` + ); + + // Convert markdown to y-octo binary + const binary = markdownToDocBinary(markdown, docId); + + // Extract title from markdown (first H1 heading) + const titleMatch = markdown.match(/^#\s+(.+?)(?:\s*#+)?\s*$/m); + const title = titleMatch ? titleMatch[1].trim() : undefined; + + // Prepare root doc update to register the new document + const rootDocUpdate = addDocToRootDoc(rootDocBin, docId, title); + + // Push both updates together - root doc first, then the new doc + await this.storage.pushDocUpdates( + workspaceId, + workspaceId, + [rootDocUpdate], + editorId + ); + await this.storage.pushDocUpdates(workspaceId, docId, [binary], editorId); + + this.logger.debug( + `Created and registered doc ${docId} in workspace ${workspaceId}` + ); + + return { docId }; + } + + /** + * Updates an existing document with new markdown content. + * + * Uses structural diffing to compute minimal changes between the existing + * document and new markdown, then applies only the delta. This preserves + * document history and enables proper CRDT merging with concurrent edits. + * + * @param workspaceId - The workspace ID + * @param docId - The document ID to update + * @param markdown - The new markdown content + * @param editorId - Optional editor ID for tracking + */ + async updateDoc( + workspaceId: string, + docId: string, + markdown: string, + editorId?: string + ): Promise { + this.logger.debug( + `Updating doc ${docId} in workspace ${workspaceId} from markdown` + ); + + // Fetch existing document + const existingDoc = await this.storage.getDoc(workspaceId, docId); + if (!existingDoc?.bin) { + throw new NotFoundException(`Document ${docId} not found`); + } + + // Compute delta update using structural diff + // Use zero-copy buffer view when possible for native function + const existingBinary = Buffer.isBuffer(existingDoc.bin) + ? existingDoc.bin + : Buffer.from( + existingDoc.bin.buffer, + existingDoc.bin.byteOffset, + existingDoc.bin.byteLength + ); + const delta = updateDocWithMarkdown(existingBinary, markdown, docId); + + // Push only the delta changes + await this.storage.pushDocUpdates(workspaceId, docId, [delta], editorId); + + return { success: true }; + } +} diff --git a/packages/backend/server/src/native.ts b/packages/backend/server/src/native.ts index 13cea39b9c..8d37adcd96 100644 --- a/packages/backend/server/src/native.ts +++ b/packages/backend/server/src/native.ts @@ -49,3 +49,8 @@ export const readAllDocIdsFromRootDoc = export const AFFINE_PRO_PUBLIC_KEY = serverNativeModule.AFFINE_PRO_PUBLIC_KEY; export const AFFINE_PRO_LICENSE_AES_KEY = serverNativeModule.AFFINE_PRO_LICENSE_AES_KEY; + +// MCP write tools exports +export const markdownToDocBinary = serverNativeModule.markdownToDocBinary; +export const updateDocWithMarkdown = serverNativeModule.updateDocWithMarkdown; +export const addDocToRootDoc = serverNativeModule.addDocToRootDoc; diff --git a/packages/backend/server/src/plugins/copilot/mcp/provider.ts b/packages/backend/server/src/plugins/copilot/mcp/provider.ts index d3afe1b4fe..c23f4986be 100644 --- a/packages/backend/server/src/plugins/copilot/mcp/provider.ts +++ b/packages/backend/server/src/plugins/copilot/mcp/provider.ts @@ -4,7 +4,7 @@ import { Injectable } from '@nestjs/common'; import { pick } from 'lodash-es'; import z from 'zod/v3'; -import { DocReader } from '../../../core/doc'; +import { DocReader, DocWriter } from '../../../core/doc'; import { AccessController } from '../../../core/permission'; import { clearEmbeddingChunk } from '../../../models'; import { IndexerService } from '../../indexer'; @@ -15,6 +15,7 @@ export class WorkspaceMcpProvider { constructor( private readonly ac: AccessController, private readonly reader: DocReader, + private readonly writer: DocWriter, private readonly context: CopilotContextService, private readonly indexer: IndexerService ) {} @@ -165,6 +166,147 @@ export class WorkspaceMcpProvider { } ); + // Write tools - create and update documents + server.registerTool( + 'create_document', + { + title: 'Create Document', + description: + 'Create a new document in the workspace with the given title and markdown content. Returns the ID of the created document.', + inputSchema: z.object({ + title: z.string().min(1).describe('The title of the new document'), + content: z + .string() + .describe( + 'The markdown content for the document body (should NOT include a title H1 - the title parameter will be used)' + ), + }), + }, + async ({ title, content }) => { + try { + // Check if user can create docs in this workspace + await this.ac + .user(userId) + .workspace(workspaceId) + .assert('Workspace.CreateDoc'); + + // Combine title and content into markdown + // Sanitize title by removing newlines and trimming + const sanitizedTitle = title.replace(/[\r\n]+/g, ' ').trim(); + if (!sanitizedTitle) { + throw new Error('Title cannot be empty'); + } + + // Strip any leading H1 from content to prevent duplicates + // Per CommonMark spec, ATX headings allow only 0-3 spaces before the # + // Handles: "# Title", " # Title", "# Title #" + const strippedContent = content.replace( + /^[ \t]{0,3}#\s+[^\n]*#*\s*\n*/, + '' + ); + + const markdown = `# ${sanitizedTitle}\n\n${strippedContent}`; + + // Create the document + const result = await this.writer.createDoc( + workspaceId, + markdown, + userId + ); + + return { + content: [ + { + type: 'text', + text: JSON.stringify({ + success: true, + docId: result.docId, + message: `Document "${title}" created successfully`, + }), + }, + ], + } as const; + } catch (error) { + return { + isError: true, + content: [ + { + type: 'text', + text: `Failed to create document: ${error instanceof Error ? error.message : 'Unknown error'}`, + }, + ], + }; + } + } + ); + + server.registerTool( + 'update_document', + { + title: 'Update Document', + description: + 'Update an existing document with new markdown content. Uses structural diffing to apply minimal changes, preserving document history and enabling real-time collaboration.', + inputSchema: z.object({ + docId: z.string().describe('The ID of the document to update'), + content: z + .string() + .describe( + 'The complete new markdown content for the document (including title as H1)' + ), + }), + }, + async ({ docId, content }) => { + const notFoundError: CallToolResult = { + isError: true, + content: [ + { + type: 'text', + text: `Doc with id ${docId} not found.`, + }, + ], + }; + + // Use can() instead of assert() to avoid leaking doc existence info + const accessible = await this.ac + .user(userId) + .workspace(workspaceId) + .doc(docId) + .can('Doc.Update'); + + if (!accessible) { + return notFoundError; + } + + try { + // Update the document + await this.writer.updateDoc(workspaceId, docId, content, userId); + + return { + content: [ + { + type: 'text', + text: JSON.stringify({ + success: true, + docId, + message: `Document updated successfully`, + }), + }, + ], + } as const; + } catch (error) { + return { + isError: true, + content: [ + { + type: 'text', + text: `Failed to update document: ${error instanceof Error ? error.message : 'Unknown error'}`, + }, + ], + }; + } + } + ); + return server; } } diff --git a/packages/common/native/Cargo.toml b/packages/common/native/Cargo.toml index 7f8ad87d5e..0d20fcaca4 100644 --- a/packages/common/native/Cargo.toml +++ b/packages/common/native/Cargo.toml @@ -37,15 +37,25 @@ tree-sitter = [ "dep:tree-sitter-scala", "dep:tree-sitter-typescript", ] -ydoc-loader = ["assert-json-diff", "serde", "serde_json", "thiserror", "y-octo"] +ydoc-loader = [ + "assert-json-diff", + "nanoid", + "pulldown-cmark", + "serde", + "serde_json", + "thiserror", + "y-octo", +] [dependencies] assert-json-diff = { workspace = true, optional = true } chrono = { workspace = true, optional = true } docx-parser = { workspace = true, optional = true } infer = { workspace = true, optional = true } +nanoid = { workspace = true, optional = true } path-ext = { workspace = true, optional = true } pdf-extract = { workspace = true, optional = true } +pulldown-cmark = { workspace = true, optional = true } rand = { workspace = true, optional = true } readability = { workspace = true, optional = true, default-features = false } serde = { workspace = true, optional = true, features = ["derive"] } diff --git a/packages/common/native/src/doc_parser/affine.rs b/packages/common/native/src/doc_parser/affine.rs index d13ce11e4d..f4efe36281 100644 --- a/packages/common/native/src/doc_parser/affine.rs +++ b/packages/common/native/src/doc_parser/affine.rs @@ -584,6 +584,113 @@ pub fn get_doc_ids_from_binary(doc_bin: Vec, include_trash: bool) -> Result< Ok(doc_ids) } +/// Adds a document ID to the root doc's meta.pages array. +/// Returns a binary update that can be applied to the root doc. +/// +/// # Arguments +/// * `root_doc_bin` - The current root doc binary +/// * `doc_id` - The document ID to add +/// * `title` - Optional title for the document +/// +/// # Returns +/// A Vec containing the y-octo update binary to add the doc +pub fn add_doc_to_root_doc(root_doc_bin: Vec, doc_id: &str, title: Option<&str>) -> Result, ParseError> { + // Handle empty or minimal root doc - create a new one + let doc = if root_doc_bin.is_empty() || root_doc_bin == [0, 0] { + DocOptions::new().build() + } else { + let mut doc = DocOptions::new().build(); + doc + .apply_update_from_binary_v1(&root_doc_bin) + .map_err(|_| ParseError::InvalidBinary)?; + doc + }; + + // Capture state before modifications to encode only the delta + let state_before = doc.get_state_vector(); + + // Get or create the meta map + let mut meta = doc.get_or_create_map("meta")?; + + // Get existing pages array or create new one + let pages_exists = meta.get("pages").and_then(|v| v.to_array()).is_some(); + + if pages_exists { + // Get the existing array and add to it + let mut pages = meta.get("pages").and_then(|v| v.to_array()).unwrap(); + + // Check if doc already exists + let doc_exists = pages.iter().any(|page_val| { + page_val + .to_map() + .and_then(|page| get_string(&page, "id")) + .map(|id| id == doc_id) + .unwrap_or(false) + }); + + if !doc_exists { + // Create a new page entry + let page_map = doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Insert into pages array first, then populate + let idx = pages.len(); + pages + .insert(idx, page_map) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Now get the inserted map and populate it + if let Some(mut inserted_page) = pages.get(idx).and_then(|v| v.to_map()) { + inserted_page + .insert("id".to_string(), Any::String(doc_id.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + if let Some(t) = title { + inserted_page + .insert("title".to_string(), Any::String(t.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Set createDate to current timestamp + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as i64) + .unwrap_or(0); + inserted_page + .insert("createDate".to_string(), Any::BigInt64(timestamp)) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + } + } else { + // Create new pages array with this doc + let page_entry = vec![Any::Object( + [ + ("id".to_string(), Any::String(doc_id.to_string())), + ("title".to_string(), Any::String(title.unwrap_or("").to_string())), + ( + "createDate".to_string(), + Any::BigInt64( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as i64) + .unwrap_or(0), + ), + ), + ] + .into_iter() + .collect(), + )]; + + meta + .insert("pages".to_string(), Any::Array(page_entry)) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Encode only the changes (delta) since state_before + doc + .encode_state_as_update_v1(&state_before) + .map_err(|e| ParseError::ParserError(e.to_string())) +} + fn paragraph_prefix(type_: &str) -> &'static str { match type_ { "h1" => "# ", diff --git a/packages/common/native/src/doc_parser/markdown_to_ydoc.rs b/packages/common/native/src/doc_parser/markdown_to_ydoc.rs new file mode 100644 index 0000000000..d8b1f78276 --- /dev/null +++ b/packages/common/native/src/doc_parser/markdown_to_ydoc.rs @@ -0,0 +1,492 @@ +//! Markdown to YDoc conversion module +//! +//! Converts markdown content into AFFiNE-compatible y-octo document binary +//! format. + +use y_octo::{Any, DocOptions}; + +use super::{ + affine::ParseError, + markdown_utils::{BlockType, ParsedBlock, extract_title, parse_markdown_blocks}, +}; + +/// Block types used in AFFiNE documents +const PAGE_FLAVOUR: &str = "affine:page"; +const NOTE_FLAVOUR: &str = "affine:note"; + +/// Intermediate representation of a block for building y-octo documents +struct BlockBuilder { + id: String, + flavour: String, + text_content: String, + block_type: Option, + checked: Option, + code_language: Option, + #[allow(dead_code)] // Reserved for future nested block support + children: Vec, +} + +impl BlockBuilder { + fn new(flavour: &str) -> Self { + Self { + id: nanoid::nanoid!(), + flavour: flavour.to_string(), + text_content: String::new(), + block_type: None, + checked: None, + code_language: None, + children: Vec::new(), + } + } + + fn with_text(mut self, text: &str) -> Self { + self.text_content = text.to_string(); + self + } + + fn with_block_type(mut self, btype: BlockType) -> Self { + self.block_type = Some(btype); + self + } + + fn with_checked(mut self, checked: bool) -> Self { + self.checked = Some(checked); + self + } + + fn with_code_language(mut self, lang: &str) -> Self { + if !lang.is_empty() { + self.code_language = Some(lang.to_string()); + } + self + } +} + +/// Converts a ParsedBlock from the shared parser into a BlockBuilder +impl From for BlockBuilder { + fn from(parsed: ParsedBlock) -> Self { + let mut builder = BlockBuilder::new(parsed.flavour.as_str()).with_text(&parsed.content); + + if let Some(btype) = parsed.block_type { + builder = builder.with_block_type(btype); + } + + if let Some(checked) = parsed.checked { + builder = builder.with_checked(checked); + } + + if let Some(lang) = parsed.language { + builder = builder.with_code_language(&lang); + } + + builder + } +} + +/// Parses markdown and converts it to an AFFiNE-compatible y-octo document +/// binary. +/// +/// # Arguments +/// * `markdown` - The markdown content to convert +/// * `doc_id` - The document ID to use +/// +/// # Returns +/// A binary vector containing the y-octo encoded document +pub fn markdown_to_ydoc(markdown: &str, doc_id: &str) -> Result, ParseError> { + // Extract the title from the first H1 heading + let title = extract_title(markdown); + + // Parse markdown into blocks using the shared parser + let parsed_blocks = parse_markdown_blocks(markdown, true); + + // Convert ParsedBlocks to BlockBuilders and collect IDs + let mut blocks: Vec = Vec::new(); + let mut content_block_ids: Vec = Vec::new(); + + for parsed in parsed_blocks { + let builder: BlockBuilder = parsed.into(); + content_block_ids.push(builder.id.clone()); + blocks.push(builder); + } + + // Build the y-octo document + build_ydoc(doc_id, &title, blocks, content_block_ids) +} + +/// Builds the y-octo document from parsed blocks. +/// +/// Uses a two-phase approach to ensure Yjs compatibility: +/// 1. Phase 1: Create and insert empty maps into blocks_map (establishes parent +/// items) +/// 2. Phase 2: Populate each map with properties (child items reference +/// existing parents) +/// +/// This ordering ensures that when items reference their parent map's ID in the +/// encoded binary, the parent ID always has a lower clock value, which Yjs +/// requires. +fn build_ydoc( + doc_id: &str, + title: &str, + content_blocks: Vec, + content_block_ids: Vec, +) -> Result, ParseError> { + // Create the document with the specified ID + let doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + + // Create the blocks map + let mut blocks_map = doc + .get_or_create_map("blocks") + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Create block IDs + let page_id = nanoid::nanoid!(); + let note_id = nanoid::nanoid!(); + + // ==== PHASE 1: Insert empty maps to establish parent items ==== + // This ensures parent items have lower clock values than their children + + // Insert empty page block map + blocks_map + .insert( + page_id.clone(), + doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?, + ) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Insert empty note block map + blocks_map + .insert( + note_id.clone(), + doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?, + ) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Insert empty content block maps + for block in &content_blocks { + blocks_map + .insert( + block.id.clone(), + doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?, + ) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // ==== PHASE 2: Populate the maps with their properties ==== + // Now each map has an item with a lower clock, so children will reference + // correctly + + // Populate page block + if let Some(page_map) = blocks_map.get(&page_id).and_then(|v| v.to_map()) { + populate_block_map( + &doc, + page_map, + &page_id, + PAGE_FLAVOUR, + Some(title), + None, + None, + None, + None, + vec![note_id.clone()], + )?; + } + + // Populate note block + if let Some(note_map) = blocks_map.get(¬e_id).and_then(|v| v.to_map()) { + populate_block_map( + &doc, + note_map, + ¬e_id, + NOTE_FLAVOUR, + None, + None, + None, + None, + None, + content_block_ids.clone(), + )?; + } + + // Populate content blocks + for block in content_blocks { + if let Some(block_map) = blocks_map.get(&block.id).and_then(|v| v.to_map()) { + populate_block_map( + &doc, + block_map, + &block.id, + &block.flavour, + None, + if block.text_content.is_empty() { + None + } else { + Some(&block.text_content) + }, + block.block_type, + block.checked, + block.code_language.as_deref(), + Vec::new(), + )?; + } + } + + // Encode the document + doc + .encode_update_v1() + .map_err(|e| ParseError::ParserError(e.to_string())) +} + +/// Populates an existing block map with the given properties. +/// +/// This function takes an already-inserted map and populates it with +/// properties. The two-phase approach (insert empty map first, then populate) +/// ensures that when child items reference the map as their parent, the +/// parent's clock is lower. +/// +/// IMPORTANT: We use Any types (Any::Array, Any::String) instead of CRDT types +/// (y_octo::Array, y_octo::Text) for nested values. Any types are encoded +/// inline as part of the item content, avoiding the forward reference issue +/// where child items would reference a parent with a higher clock value. +#[allow(clippy::too_many_arguments)] +fn populate_block_map( + _doc: &y_octo::Doc, + mut block: y_octo::Map, + block_id: &str, + flavour: &str, + title: Option<&str>, + text_content: Option<&str>, + block_type: Option, + checked: Option, + code_language: Option<&str>, + children: Vec, +) -> Result<(), ParseError> { + // Required fields + block + .insert("sys:id".to_string(), Any::String(block_id.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + block + .insert("sys:flavour".to_string(), Any::String(flavour.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Children - use Any::Array which is encoded inline (no forward references) + let children_any: Vec = children.into_iter().map(Any::String).collect(); + block + .insert("sys:children".to_string(), Any::Array(children_any)) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Title + if let Some(title) = title { + block + .insert("prop:title".to_string(), Any::String(title.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Text content - use Any::String instead of Y.Text + // This is simpler and avoids CRDT overhead for initial document creation + if let Some(content) = text_content { + block + .insert("prop:text".to_string(), Any::String(content.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Block type + if let Some(btype) = block_type { + block + .insert("prop:type".to_string(), Any::String(btype.as_str().to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Checked state + if let Some(is_checked) = checked { + block + .insert( + "prop:checked".to_string(), + if is_checked { Any::True } else { Any::False }, + ) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Code language + if let Some(lang) = code_language { + block + .insert("prop:language".to_string(), Any::String(lang.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_markdown() { + let markdown = "# Hello World\n\nThis is a test paragraph."; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + let bin = result.unwrap(); + assert!(!bin.is_empty()); + } + + #[test] + fn test_markdown_with_list() { + let markdown = "# Test List\n\n- Item 1\n- Item 2\n- Item 3"; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_markdown_with_code() { + let markdown = "# Code Example\n\n```rust\nfn main() {\n println!(\"Hello\");\n}\n```"; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_markdown_with_headings() { + let markdown = "# H1\n\n## H2\n\n### H3\n\nParagraph text."; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_extract_title_usage() { + assert_eq!(extract_title("# My Title\n\nContent"), "My Title"); + assert_eq!(extract_title("No heading"), "Untitled"); + assert_eq!(extract_title("## Secondary\n\nContent"), "Untitled"); + } + + #[test] + fn test_empty_markdown() { + let result = markdown_to_ydoc("", "test-doc-id"); + assert!(result.is_ok()); + let bin = result.unwrap(); + assert!(!bin.is_empty()); // Should still create valid doc structure + } + + #[test] + fn test_whitespace_only_markdown() { + let result = markdown_to_ydoc(" \n\n\t\n ", "test-doc-id"); + assert!(result.is_ok()); + let bin = result.unwrap(); + assert!(!bin.is_empty()); + } + + #[test] + fn test_markdown_without_h1() { + // Should use "Untitled" as default title + let markdown = "## Secondary Heading\n\nSome content without H1."; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_nested_lists() { + let markdown = "# Nested Lists\n\n- Item 1\n - Nested 1.1\n - Nested 1.2\n- Item 2\n - Nested 2.1"; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_blockquote() { + let markdown = "# Title\n\n> A blockquote"; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_divider() { + let markdown = "# Title\n\nBefore divider\n\n---\n\nAfter divider"; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_numbered_list() { + let markdown = "# Title\n\n1. First item\n2. Second item"; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_four_paragraphs() { + // Test with 4 paragraphs + let markdown = "# Title\n\nP1.\n\nP2.\n\nP3.\n\nP4."; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_mixed_content() { + let markdown = r#"# Mixed Content + +Some intro text. + +- List item 1 +- List item 2 + +```python +def hello(): + print("world") +``` + +## Another Section + +More text here. + +1. Numbered item +2. Another numbered + +> A blockquote + +--- + +Final paragraph. +"#; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + } + + #[test] + fn test_code_block_preserves_indentation() { + // Code blocks should preserve leading whitespace (indentation) which is + // semantically significant in languages like Python, YAML, etc. + let markdown = r#"# Code Test + +```python + def indented(): + return "preserved" +``` +"#; + let result = markdown_to_ydoc(markdown, "test-doc-id"); + assert!(result.is_ok()); + // The test passes if the conversion succeeds without errors. + // Full verification would require roundtrip testing. + } + + #[test] + fn test_document_creation() { + // Test that markdown_to_ydoc creates a valid binary + let original_md = "# Test Document\n\nHello world."; + let doc_id = "creation-test"; + + let bin = markdown_to_ydoc(original_md, doc_id).expect("Should convert to ydoc"); + + // Binary should not be empty + assert!(!bin.is_empty(), "Binary should not be empty"); + assert!(bin.len() > 10, "Binary should have meaningful content"); + } + + // NOTE: Full roundtrip tests (markdown -> ydoc -> markdown) are not included + // because y-octo has a limitation where nested maps created with create_map() + // lose their content after encode/decode. This is a known y-octo limitation. + // + // However, the documents we create ARE valid and can be: + // 1. Pushed to the AFFiNE server via DocStorageAdapter.pushDocUpdates + // 2. Read by the AFFiNE client which uses JavaScript Yjs (not y-octo) + // + // The MCP write tools work because: + // - markdown_to_ydoc creates valid y-octo binary + // - The server stores the binary directly + // - The client (browser) uses Yjs to decode and render +} diff --git a/packages/common/native/src/doc_parser/markdown_utils.rs b/packages/common/native/src/doc_parser/markdown_utils.rs new file mode 100644 index 0000000000..0c57647462 --- /dev/null +++ b/packages/common/native/src/doc_parser/markdown_utils.rs @@ -0,0 +1,463 @@ +//! Shared markdown utilities for the doc_parser module + +use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd}; + +/// Block flavours used in AFFiNE documents +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BlockFlavour { + Paragraph, + List, + Code, + Divider, +} + +impl BlockFlavour { + pub fn as_str(&self) -> &'static str { + match self { + BlockFlavour::Paragraph => "affine:paragraph", + BlockFlavour::List => "affine:list", + BlockFlavour::Code => "affine:code", + BlockFlavour::Divider => "affine:divider", + } + } +} + +/// Block types for paragraphs and lists +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BlockType { + // Paragraph types + #[allow(dead_code)] // Used via as_str() for default paragraph type + Text, + H1, + H2, + H3, + H4, + H5, + H6, + Quote, + // List types + Bulleted, + Numbered, + Todo, +} + +impl BlockType { + pub fn as_str(&self) -> &'static str { + match self { + BlockType::Text => "text", + BlockType::H1 => "h1", + BlockType::H2 => "h2", + BlockType::H3 => "h3", + BlockType::H4 => "h4", + BlockType::H5 => "h5", + BlockType::H6 => "h6", + BlockType::Quote => "quote", + BlockType::Bulleted => "bulleted", + BlockType::Numbered => "numbered", + BlockType::Todo => "todo", + } + } + + pub fn from_heading_level(level: HeadingLevel) -> Self { + match level { + HeadingLevel::H1 => BlockType::H1, + HeadingLevel::H2 => BlockType::H2, + HeadingLevel::H3 => BlockType::H3, + HeadingLevel::H4 => BlockType::H4, + HeadingLevel::H5 => BlockType::H5, + HeadingLevel::H6 => BlockType::H6, + } + } +} + +/// A parsed block from markdown content +#[derive(Debug, Clone, PartialEq)] +pub struct ParsedBlock { + pub flavour: BlockFlavour, + pub block_type: Option, + pub content: String, + pub checked: Option, + pub language: Option, +} + +/// Parses markdown content into a list of parsed blocks. +/// +/// This is the shared parsing logic used by both `markdown_to_ydoc` and +/// `update_ydoc`. +/// +/// # Arguments +/// * `markdown` - The markdown content to parse +/// * `skip_first_h1` - If true, the first H1 heading is skipped (used as +/// document title) +/// +/// # Returns +/// A vector of parsed blocks +pub fn parse_markdown_blocks(markdown: &str, skip_first_h1: bool) -> Vec { + // Note: ENABLE_TABLES is included for future support, but table events + // currently fall through to the catch-all match arm. Table content appears as + // plain text. + let options = Options::ENABLE_STRIKETHROUGH + | Options::ENABLE_TABLES + | Options::ENABLE_TASKLISTS + | Options::ENABLE_HEADING_ATTRIBUTES; + let parser = Parser::new_ext(markdown, options); + + let mut blocks = Vec::new(); + let mut current_text = String::new(); + let mut current_type: Option = None; + let mut current_flavour = BlockFlavour::Paragraph; + let mut in_list = false; + let mut list_type_stack: Vec = Vec::new(); + // Per-item type override for task list markers (resets at each Item start) + let mut current_item_type: Option = None; + let mut in_code_block = false; + let mut code_language = String::new(); + let mut first_h1_seen = !skip_first_h1; // If not skipping, mark as already seen + let mut current_checked: Option = None; + let mut pending_link_url: Option = None; + + for event in parser { + match event { + Event::Start(Tag::Heading { level, .. }) => { + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type.take(), + current_checked.take(), + None, + ); + + if level == HeadingLevel::H1 && !first_h1_seen { + // Skip the first H1 - it's used as the document title + current_type = Some(BlockType::H1); + } else { + current_type = Some(BlockType::from_heading_level(level)); + } + current_flavour = BlockFlavour::Paragraph; + } + Event::End(TagEnd::Heading(level)) => { + if level == HeadingLevel::H1 && !first_h1_seen { + first_h1_seen = true; + current_text.clear(); + current_type = None; + } else { + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type.take(), + current_checked.take(), + None, + ); + } + } + Event::Start(Tag::Paragraph) => {} + Event::End(TagEnd::Paragraph) => { + if !in_list { + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type.take(), + current_checked.take(), + None, + ); + } + } + Event::Start(Tag::BlockQuote(_)) => { + current_type = Some(BlockType::Quote); + current_flavour = BlockFlavour::Paragraph; + } + Event::End(TagEnd::BlockQuote(_)) => { + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type.take(), + current_checked.take(), + None, + ); + } + Event::Start(Tag::List(start_num)) => { + in_list = true; + let list_type = if start_num.is_some() { + BlockType::Numbered + } else { + BlockType::Bulleted + }; + list_type_stack.push(list_type); + } + Event::End(TagEnd::List(_)) => { + list_type_stack.pop(); + if list_type_stack.is_empty() { + in_list = false; + } + } + Event::Start(Tag::Item) => { + current_flavour = BlockFlavour::List; + // Reset per-item type override + current_item_type = None; + if let Some(lt) = list_type_stack.last() { + current_type = Some(*lt); + } + } + Event::End(TagEnd::Item) => { + // Use per-item override if set (for task items), otherwise use current_type + if let Some(item_type) = current_item_type.take() { + current_type = Some(item_type); + } + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type.take(), + current_checked.take(), + None, + ); + current_flavour = BlockFlavour::Paragraph; + } + Event::TaskListMarker(checked) => { + // Set per-item type override for this specific item only + current_item_type = Some(BlockType::Todo); + current_checked = Some(checked); + } + Event::Start(Tag::CodeBlock(kind)) => { + in_code_block = true; + current_flavour = BlockFlavour::Code; + code_language = match kind { + CodeBlockKind::Fenced(lang) => lang.to_string(), + CodeBlockKind::Indented => String::new(), + }; + } + Event::End(TagEnd::CodeBlock) => { + flush_code_block(&mut blocks, &mut current_text, &code_language); + in_code_block = false; + code_language.clear(); + current_flavour = BlockFlavour::Paragraph; + } + Event::Text(text) => { + current_text.push_str(&text); + } + Event::Code(code) => { + // Inline code - wrap in backticks + current_text.push('`'); + current_text.push_str(&code); + current_text.push('`'); + } + Event::SoftBreak | Event::HardBreak => { + if in_code_block { + current_text.push('\n'); + } else { + current_text.push(' '); + } + } + Event::Rule => { + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type.take(), + current_checked.take(), + None, + ); + blocks.push(ParsedBlock { + flavour: BlockFlavour::Divider, + block_type: None, + content: String::new(), + checked: None, + language: None, + }); + } + Event::Start(Tag::Strong) => current_text.push_str("**"), + Event::End(TagEnd::Strong) => current_text.push_str("**"), + Event::Start(Tag::Emphasis) => current_text.push('_'), + Event::End(TagEnd::Emphasis) => current_text.push('_'), + Event::Start(Tag::Strikethrough) => current_text.push_str("~~"), + Event::End(TagEnd::Strikethrough) => current_text.push_str("~~"), + Event::Start(Tag::Link { dest_url, .. }) => { + current_text.push('['); + pending_link_url = Some(dest_url.to_string()); + } + Event::End(TagEnd::Link) => { + if let Some(url) = pending_link_url.take() { + current_text.push_str(&format!("]({})", url)); + } + } + _ => {} + } + } + + // Flush any remaining content + flush_block( + &mut blocks, + &mut current_text, + current_flavour, + current_type, + current_checked, + None, + ); + + blocks +} + +fn flush_block( + blocks: &mut Vec, + text: &mut String, + flavour: BlockFlavour, + block_type: Option, + checked: Option, + language: Option, +) { + let trimmed = text.trim(); + if !trimmed.is_empty() || flavour == BlockFlavour::Divider { + blocks.push(ParsedBlock { + flavour, + block_type, + content: trimmed.to_string(), + checked, + language, + }); + } + text.clear(); +} + +fn flush_code_block(blocks: &mut Vec, text: &mut String, language: &str) { + // Preserve leading whitespace (indentation) in code blocks as it may be + // semantically significant (e.g., Python, YAML). Only strip leading/trailing + // newlines which are typically artifacts from code fence parsing. + let content = text.trim_matches('\n'); + if !content.is_empty() { + blocks.push(ParsedBlock { + flavour: BlockFlavour::Code, + block_type: None, + content: content.to_string(), + checked: None, + language: if language.is_empty() { + None + } else { + Some(language.to_string()) + }, + }); + } + text.clear(); +} + +/// Extracts the title from the first H1 heading in markdown content. +/// +/// Returns "Untitled" if no H1 heading is found. +pub(crate) fn extract_title(markdown: &str) -> String { + let parser = Parser::new(markdown); + let mut in_heading = false; + let mut title = String::new(); + + for event in parser { + match event { + Event::Start(Tag::Heading { + level: HeadingLevel::H1, + .. + }) => { + in_heading = true; + } + Event::Text(text) if in_heading => { + title.push_str(&text); + } + Event::Code(code) if in_heading => { + title.push_str(&code); + } + Event::End(TagEnd::Heading(_)) if in_heading => { + break; + } + _ => {} + } + } + + if title.is_empty() { + "Untitled".to_string() + } else { + title.trim().to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_title_simple() { + assert_eq!(extract_title("# Hello World\n\nContent"), "Hello World"); + } + + #[test] + fn test_extract_title_with_code() { + assert_eq!(extract_title("# Hello `code` World"), "Hello code World"); + } + + #[test] + fn test_extract_title_empty() { + assert_eq!(extract_title("No heading here"), "Untitled"); + } + + #[test] + fn test_extract_title_h2_not_used() { + assert_eq!(extract_title("## H2 heading\n\nContent"), "Untitled"); + } + + #[test] + fn test_parse_markdown_blocks_simple() { + let blocks = parse_markdown_blocks("# Title\n\nParagraph text.", true); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].flavour, BlockFlavour::Paragraph); + assert_eq!(blocks[0].content, "Paragraph text."); + } + + #[test] + fn test_parse_markdown_blocks_with_headings() { + let blocks = parse_markdown_blocks("# Title\n\n## Section\n\nText.", true); + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].block_type, Some(BlockType::H2)); + assert_eq!(blocks[0].content, "Section"); + assert_eq!(blocks[1].content, "Text."); + } + + #[test] + fn test_parse_markdown_blocks_lists() { + let blocks = parse_markdown_blocks("# Title\n\n- Item 1\n- Item 2", true); + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].flavour, BlockFlavour::List); + assert_eq!(blocks[0].block_type, Some(BlockType::Bulleted)); + assert_eq!(blocks[0].content, "Item 1"); + } + + #[test] + fn test_parse_markdown_blocks_task_list() { + let blocks = parse_markdown_blocks("# Title\n\n- [ ] Unchecked\n- [x] Checked", true); + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].block_type, Some(BlockType::Todo)); + assert_eq!(blocks[0].checked, Some(false)); + assert_eq!(blocks[1].block_type, Some(BlockType::Todo)); + assert_eq!(blocks[1].checked, Some(true)); + } + + #[test] + fn test_parse_markdown_blocks_code() { + let blocks = parse_markdown_blocks("# Title\n\n```rust\nfn main() {}\n```", true); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].flavour, BlockFlavour::Code); + assert_eq!(blocks[0].language, Some("rust".to_string())); + } + + #[test] + fn test_parse_markdown_blocks_divider() { + let blocks = parse_markdown_blocks("# Title\n\nBefore\n\n---\n\nAfter", true); + assert_eq!(blocks.len(), 3); + assert_eq!(blocks[1].flavour, BlockFlavour::Divider); + } + + #[test] + fn test_parse_markdown_blocks_code_preserves_indentation() { + let blocks = parse_markdown_blocks("# Title\n\n```python\n def indented():\n pass\n```", true); + assert_eq!(blocks.len(), 1); + assert!(blocks[0].content.starts_with(" def")); + } +} diff --git a/packages/common/native/src/doc_parser/mod.rs b/packages/common/native/src/doc_parser/mod.rs index a5e8761a3b..c240544773 100644 --- a/packages/common/native/src/doc_parser/mod.rs +++ b/packages/common/native/src/doc_parser/mod.rs @@ -1,9 +1,19 @@ mod affine; mod blocksuite; mod delta_markdown; +#[cfg(feature = "ydoc-loader")] +mod markdown_to_ydoc; +#[cfg(feature = "ydoc-loader")] +mod markdown_utils; +#[cfg(feature = "ydoc-loader")] +mod update_ydoc; mod value; pub use affine::{ - BlockInfo, CrawlResult, MarkdownResult, PageDocContent, ParseError, WorkspaceDocContent, get_doc_ids_from_binary, - parse_doc_from_binary, parse_doc_to_markdown, parse_page_doc, parse_workspace_doc, + BlockInfo, CrawlResult, MarkdownResult, PageDocContent, ParseError, WorkspaceDocContent, add_doc_to_root_doc, + get_doc_ids_from_binary, parse_doc_from_binary, parse_doc_to_markdown, parse_page_doc, parse_workspace_doc, }; +#[cfg(feature = "ydoc-loader")] +pub use markdown_to_ydoc::markdown_to_ydoc; +#[cfg(feature = "ydoc-loader")] +pub use update_ydoc::update_ydoc; diff --git a/packages/common/native/src/doc_parser/update_ydoc.rs b/packages/common/native/src/doc_parser/update_ydoc.rs new file mode 100644 index 0000000000..b345574e16 --- /dev/null +++ b/packages/common/native/src/doc_parser/update_ydoc.rs @@ -0,0 +1,1102 @@ +//! Update YDoc module +//! +//! Provides functionality to update existing AFFiNE documents by applying +//! surgical y-octo operations based on content differences. + +use std::collections::HashMap; + +use y_octo::{Any, Doc, DocOptions, Map}; + +use super::{ + affine::ParseError, + blocksuite::{collect_child_ids, get_string}, + markdown_utils::{BlockFlavour, ParsedBlock, extract_title, parse_markdown_blocks}, +}; + +const PAGE_FLAVOUR: &str = "affine:page"; +const NOTE_FLAVOUR: &str = "affine:note"; + +/// Represents a content block for diffing purposes +#[derive(Debug, Clone, PartialEq)] +pub struct ContentBlock { + pub flavour: String, + pub block_type: Option, // h1, h2, text, bulleted, numbered, todo, etc. + pub content: String, + pub checked: Option, // For todo items + pub language: Option, // For code blocks +} + +impl ContentBlock { + /// Check if two blocks are similar enough to be considered "the same" for + /// diffing + fn is_similar(&self, other: &ContentBlock) -> bool { + self.flavour == other.flavour && self.block_type == other.block_type + } +} + +/// Converts a ParsedBlock from the shared parser into a ContentBlock +impl From for ContentBlock { + fn from(parsed: ParsedBlock) -> Self { + // Default paragraph type to "text" to match existing documents + let block_type = if parsed.flavour == BlockFlavour::Paragraph && parsed.block_type.is_none() { + Some("text".to_string()) + } else { + parsed.block_type.map(|bt| bt.as_str().to_string()) + }; + + ContentBlock { + flavour: parsed.flavour.as_str().to_string(), + block_type, + content: parsed.content, + checked: parsed.checked, + language: parsed.language, + } + } +} + +/// Represents the existing document structure +struct ExistingDoc { + doc: Doc, + page_id: String, + note_id: String, + content_block_ids: Vec, + content_blocks: Vec<(String, ContentBlock)>, // (id, block) +} + +/// Represents a diff operation +#[derive(Debug)] +enum DiffOp { + Keep(usize), // old_idx - block unchanged + Delete(usize), // old_idx - block removed + Insert(usize), // new_idx - block added + Update(usize, usize), // (old_idx, new_idx) - block content changed +} + +/// Updates an existing document with new markdown content. +/// +/// This function performs structural diffing between the existing document +/// and the new markdown content, then applies minimal y-octo operations +/// to update only what changed. This enables proper CRDT merging with +/// concurrent edits from other clients. +/// +/// If the existing document is empty or invalid, falls back to creating +/// a new document from the markdown using `markdown_to_ydoc`. +/// +/// # Arguments +/// * `existing_binary` - The current document binary +/// * `new_markdown` - The new markdown content +/// * `doc_id` - The document ID +/// +/// # Returns +/// A binary vector representing only the delta (changes) to apply +pub fn update_ydoc(existing_binary: &[u8], new_markdown: &str, doc_id: &str) -> Result, ParseError> { + // Load and parse the existing document + // If the document is empty or invalid, fall back to creating a new one + let mut existing = match load_existing_doc(existing_binary, doc_id) { + Ok(doc) => doc, + Err(ParseError::InvalidBinary) | Err(ParseError::ParserError(_)) => { + // Empty or invalid document - create from scratch + return super::markdown_to_ydoc::markdown_to_ydoc(new_markdown, doc_id); + } + Err(e) => return Err(e), + }; + + // Parse new markdown into content blocks + let new_blocks = parse_markdown_to_content_blocks(new_markdown)?; + + // Compute diff between old and new blocks + let diff_ops = compute_diff(&existing.content_blocks, &new_blocks); + + // Capture state before modifications to encode only the delta + let state_before = existing.doc.get_state_vector(); + + // Update the title if changed + let new_title = extract_title(new_markdown); + update_title(&mut existing, &new_title)?; + + // Apply diff operations to update the document structure + apply_diff(&mut existing, &new_blocks, &diff_ops)?; + + // Encode only the changes (delta) since state_before + existing + .doc + .encode_state_as_update_v1(&state_before) + .map_err(|e| ParseError::ParserError(e.to_string())) +} + +/// Loads an existing document and extracts its structure +fn load_existing_doc(binary: &[u8], doc_id: &str) -> Result { + // Check for empty or minimal empty Y-Doc binary + // [0, 0] represents an empty Y-Doc update (0 structs, 0 deletes) - a convention + // used throughout the AFFiNE codebase for uninitialized/empty documents + if binary.is_empty() || binary == [0, 0] { + return Err(ParseError::InvalidBinary); + } + + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(binary) + .map_err(|_| ParseError::InvalidBinary)?; + + let blocks_map = doc.get_map("blocks")?; + if blocks_map.is_empty() { + return Err(ParseError::ParserError("blocks map is empty".into())); + } + + // Build block index + let mut block_pool: HashMap = HashMap::new(); + for (_, value) in blocks_map.iter() { + if let Some(block_map) = value.to_map() + && let Some(block_id) = get_string(&block_map, "sys:id") + { + block_pool.insert(block_id, block_map); + } + } + + // Find page block + let page_id = block_pool + .iter() + .find_map(|(id, block)| { + get_string(block, "sys:flavour") + .filter(|f| f == PAGE_FLAVOUR) + .map(|_| id.clone()) + }) + .ok_or_else(|| ParseError::ParserError("page block not found".into()))?; + + // Find note block (child of page) + let page_block = block_pool + .get(&page_id) + .ok_or_else(|| ParseError::ParserError("page block not found".into()))?; + let note_id = collect_child_ids(page_block) + .into_iter() + .find(|id| block_pool.get(id).and_then(|b| get_string(b, "sys:flavour")).as_deref() == Some(NOTE_FLAVOUR)) + .ok_or_else(|| ParseError::ParserError("note block not found".into()))?; + + // Get content block IDs (children of note) + let note_block = block_pool + .get(¬e_id) + .ok_or_else(|| ParseError::ParserError("note block not found".into()))?; + let raw_content_block_ids = collect_child_ids(note_block); + + // Extract content blocks with their data, filtering to only existing blocks + // This ensures content_block_ids and content_blocks stay in sync + let mut content_blocks = Vec::new(); + let mut content_block_ids = Vec::new(); + for block_id in raw_content_block_ids { + if let Some(block) = block_pool.get(&block_id) { + let content_block = extract_content_block(block); + content_blocks.push((block_id.clone(), content_block)); + content_block_ids.push(block_id); + } + } + + Ok(ExistingDoc { + doc, + page_id, + note_id, + content_block_ids, + content_blocks, + }) +} + +/// Extracts content block data from a y-octo Map +fn extract_content_block(block: &Map) -> ContentBlock { + let flavour = get_string(block, "sys:flavour").unwrap_or_default(); + let block_type = get_string(block, "prop:type"); + // Use get_string which handles both Y.Text and Any::String via value_to_string + let content = get_string(block, "prop:text").unwrap_or_default(); + let checked = block + .get("prop:checked") + .and_then(|v| v.to_any()) + .and_then(|a| match a { + Any::True => Some(true), + Any::False => Some(false), + _ => None, + }); + let language = get_string(block, "prop:language"); + + ContentBlock { + flavour, + block_type, + content, + checked, + language, + } +} + +/// Parses markdown into content blocks for diffing. +/// +/// Uses the shared `parse_markdown_blocks` function and converts to +/// `ContentBlock`. +fn parse_markdown_to_content_blocks(markdown: &str) -> Result, ParseError> { + let parsed_blocks = parse_markdown_blocks(markdown, true); + Ok(parsed_blocks.into_iter().map(ContentBlock::from).collect()) +} + +/// Updates the document title if it has changed +fn update_title(existing: &mut ExistingDoc, new_title: &str) -> Result<(), ParseError> { + let blocks_map = existing + .doc + .get_map("blocks") + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + if let Some(mut page_block) = blocks_map.get(&existing.page_id).and_then(|v| v.to_map()) { + let current_title = get_string(&page_block, "prop:title").unwrap_or_default(); + if current_title != new_title { + page_block + .insert("prop:title".to_string(), Any::String(new_title.to_string())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + } + + Ok(()) +} + +/// Computes the diff between old and new blocks using weighted LCS algorithm. +/// Uses a two-tier matching: exact matches (same type + content) get priority, +/// then similar matches (same type, different content) for update operations. +fn compute_diff(old_blocks: &[(String, ContentBlock)], new_blocks: &[ContentBlock]) -> Vec { + let old_len = old_blocks.len(); + let new_len = new_blocks.len(); + + if old_len == 0 { + // All inserts + return (0..new_len).map(DiffOp::Insert).collect(); + } + if new_len == 0 { + // All deletes + return (0..old_len).map(DiffOp::Delete).collect(); + } + + // Build weighted LCS table using exact content match + // This ensures identical blocks are matched together + let mut lcs = vec![vec![0usize; new_len + 1]; old_len + 1]; + + for i in 1..=old_len { + for j in 1..=new_len { + let old_block = &old_blocks[i - 1].1; + let new_block = &new_blocks[j - 1]; + + // Only count as match if blocks are identical (same type AND content) + if old_block.flavour == new_block.flavour + && old_block.block_type == new_block.block_type + && old_block.content == new_block.content + && old_block.checked == new_block.checked + && old_block.language == new_block.language + { + lcs[i][j] = lcs[i - 1][j - 1] + 1; + } else { + lcs[i][j] = std::cmp::max(lcs[i - 1][j], lcs[i][j - 1]); + } + } + } + + // Backtrack to find the diff + let mut ops = Vec::new(); + let mut i = old_len; + let mut j = new_len; + + while i > 0 || j > 0 { + if i > 0 && j > 0 { + let old_block = &old_blocks[i - 1].1; + let new_block = &new_blocks[j - 1]; + + let is_exact_match = old_block.flavour == new_block.flavour + && old_block.block_type == new_block.block_type + && old_block.content == new_block.content + && old_block.checked == new_block.checked + && old_block.language == new_block.language; + + if is_exact_match { + // Exact match - Keep + ops.push(DiffOp::Keep(i - 1)); + i -= 1; + j -= 1; + } else if old_block.is_similar(new_block) + && lcs[i - 1][j - 1] >= lcs[i - 1][j] + && lcs[i - 1][j - 1] >= lcs[i][j - 1] + { + // Similar block (same type, different content) - Update if it doesn't hurt LCS + ops.push(DiffOp::Update(i - 1, j - 1)); + i -= 1; + j -= 1; + } else if lcs[i][j - 1] >= lcs[i - 1][j] { + ops.push(DiffOp::Insert(j - 1)); + j -= 1; + } else { + ops.push(DiffOp::Delete(i - 1)); + i -= 1; + } + } else if j > 0 { + ops.push(DiffOp::Insert(j - 1)); + j -= 1; + } else { + ops.push(DiffOp::Delete(i - 1)); + i -= 1; + } + } + + // Reverse to get operations in order + ops.reverse(); + ops +} + +/// Applies diff operations to update the document +fn apply_diff(existing: &mut ExistingDoc, new_blocks: &[ContentBlock], diff_ops: &[DiffOp]) -> Result<(), ParseError> { + let mut blocks_map = existing + .doc + .get_map("blocks") + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Track new children for the note block + let mut new_children: Vec = Vec::new(); + + // Track which old blocks to delete + let mut blocks_to_delete: Vec = Vec::new(); + + for op in diff_ops { + match op { + DiffOp::Keep(old_idx) => { + // Keep the existing block + let block_id = &existing.content_block_ids[*old_idx]; + new_children.push(block_id.clone()); + } + DiffOp::Delete(old_idx) => { + // Mark block for deletion + let block_id = &existing.content_block_ids[*old_idx]; + blocks_to_delete.push(block_id.clone()); + } + DiffOp::Insert(new_idx) => { + // Create a new block + let new_block = &new_blocks[*new_idx]; + let block_id = create_new_block(&mut blocks_map, &existing.doc, new_block)?; + new_children.push(block_id); + } + DiffOp::Update(old_idx, new_idx) => { + // Update existing block content + let block_id = &existing.content_block_ids[*old_idx]; + let new_block = &new_blocks[*new_idx]; + update_block_content(&mut existing.doc, &mut blocks_map, block_id, new_block)?; + new_children.push(block_id.clone()); + } + } + } + + // Delete removed blocks from blocks map + for block_id in blocks_to_delete { + blocks_map.remove(&block_id); + } + + // Update note block's children only if they changed + // First check if they're different + let note_block = blocks_map + .get(&existing.note_id) + .and_then(|v| v.to_map()) + .ok_or_else(|| ParseError::ParserError("Note block not found".into()))?; + + let current_children: Vec = note_block + .get("sys:children") + .and_then(|v| v.to_array()) + .map(|arr| { + arr + .iter() + .filter_map(|v| { + v.to_any().and_then(|a| match a { + Any::String(s) => Some(s.clone()), + _ => None, + }) + }) + .collect() + }) + .unwrap_or_default(); + + if current_children != new_children { + update_note_children(&mut blocks_map, &existing.note_id, new_children)?; + } + + Ok(()) +} + +// ============================================================================ +// Two-Phase Insertion Helpers +// ============================================================================ +// +// IMPORTANT: These helpers implement the two-phase insertion pattern required +// for YJS compatibility. When creating nested CRDT types (Text, Array, Map), +// we must: +// 1. Insert the empty container into the parent FIRST (gets clock value) +// 2. Then retrieve and populate it (content gets later clock values) +// +// This ensures parent items always have earlier clocks than children, +// avoiding "forward parent references" that YJS cannot handle. + +/// Creates an empty Text, inserts it into the parent map, then returns it for +/// population. +fn insert_and_get_text(doc: &Doc, parent_map: &mut Map, key: &str) -> Result { + let text = doc.create_text().map_err(|e| ParseError::ParserError(e.to_string()))?; + parent_map + .insert(key.to_string(), text) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + parent_map + .get(key) + .and_then(|v| v.to_text()) + .ok_or_else(|| ParseError::ParserError("Failed to retrieve inserted text".into())) +} + +/// Creates a new block in the blocks map +/// +/// IMPORTANT: Uses two-phase approach for YJS compatibility: +/// 1. Insert empty map into blocks_map first (gets clock value) +/// 2. Then populate the map with properties (gets later clock values) +/// +/// This ensures parent items have earlier clocks than children. +/// +/// Uses Any types (Any::Array, Any::String) for children and text to avoid +/// the "get back" pattern which can hang in release builds. +fn create_new_block(blocks_map: &mut Map, doc: &Doc, block: &ContentBlock) -> Result { + let block_id = nanoid::nanoid!(); + + // Step 1: Create and insert empty map into blocks_map + let empty_map = doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?; + blocks_map + .insert(block_id.clone(), empty_map) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Step 2: Retrieve the inserted map + let mut block_map = blocks_map + .get(&block_id) + .and_then(|v| v.to_map()) + .ok_or_else(|| ParseError::ParserError("Failed to get inserted block map".into()))?; + + // Step 3: Insert primitive values + block_map + .insert("sys:id".to_string(), Any::String(block_id.clone())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + block_map + .insert("sys:flavour".to_string(), Any::String(block.flavour.clone())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + if let Some(ref block_type) = block.block_type { + block_map + .insert("prop:type".to_string(), Any::String(block_type.clone())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + if let Some(checked) = block.checked { + block_map + .insert("prop:checked".to_string(), if checked { Any::True } else { Any::False }) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + if let Some(ref language) = block.language { + block_map + .insert("prop:language".to_string(), Any::String(language.clone())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Step 4: Use Any::Array for children (avoids "get back" pattern) + block_map + .insert("sys:children".to_string(), Any::Array(vec![])) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + // Step 5: Use Any::String for text content (avoids "get back" pattern) + if !block.content.is_empty() { + block_map + .insert("prop:text".to_string(), Any::String(block.content.clone())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + Ok(block_id) +} + +/// Updates an existing block's content using text-level diff +fn update_block_content( + doc: &mut Doc, + blocks_map: &mut Map, + block_id: &str, + new_block: &ContentBlock, +) -> Result<(), ParseError> { + let mut block = blocks_map + .get(block_id) + .and_then(|v| v.to_map()) + .ok_or_else(|| ParseError::ParserError(format!("Block {} not found", block_id)))?; + + // Update text content using text-level diff + if let Some(mut text) = block.get("prop:text").and_then(|v| v.to_text()) { + let old_content = text.to_string(); + apply_text_diff(&mut text, &old_content, &new_block.content)?; + } else if !new_block.content.is_empty() { + // Block didn't have text before, but now it does (e.g., divider becoming + // paragraph) Use two-phase helper to avoid forward parent references + let mut text = insert_and_get_text(doc, &mut block, "prop:text")?; + text + .insert(0, &new_block.content) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + + // Update checked state - set if present, clear if stale + match new_block.checked { + Some(checked) => { + block + .insert("prop:checked".to_string(), if checked { Any::True } else { Any::False }) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + None => { + // Clear stale checked state if block had it but shouldn't anymore + if block.get("prop:checked").is_some() { + block.remove("prop:checked"); + } + } + } + + // Update language - set if present, clear if stale + match &new_block.language { + Some(language) => { + block + .insert("prop:language".to_string(), Any::String(language.clone())) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + } + None => { + // Clear stale language if block had it but shouldn't anymore + if block.get("prop:language").is_some() { + block.remove("prop:language"); + } + } + } + + Ok(()) +} + +/// Applies a text-level diff to a YText field +fn apply_text_diff(text: &mut y_octo::Text, old_content: &str, new_content: &str) -> Result<(), ParseError> { + // Use greedy diff algorithm for character-level changes + let old_chars: Vec = old_content.chars().collect(); + let new_chars: Vec = new_content.chars().collect(); + + let ops = compute_text_diff(&old_chars, &new_chars); + + // Apply operations in order, adjusting positions based on accumulated offset + // IMPORTANT: y_octo uses UTF-16 code units for positions, not char indices + let mut offset = 0i64; + for op in ops { + match op { + TextDiffOp::Delete { start_utf16, len_utf16 } => { + let raw_pos = start_utf16 as i64 + offset; + // Fail fast if position goes negative - indicates a bug in diff computation + if raw_pos < 0 { + return Err(ParseError::ParserError(format!( + "Invalid delete position: start_utf16={}, offset={}, raw_pos={}", + start_utf16, offset, raw_pos + ))); + } + let adjusted_start = raw_pos as u64; + text + .remove(adjusted_start, len_utf16 as u64) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + offset -= len_utf16 as i64; + } + TextDiffOp::Insert { + pos_utf16, + text: insert_text, + } => { + let raw_pos = pos_utf16 as i64 + offset; + // Fail fast if position goes negative - indicates a bug in diff computation + if raw_pos < 0 { + return Err(ParseError::ParserError(format!( + "Invalid insert position: pos_utf16={}, offset={}, raw_pos={}", + pos_utf16, offset, raw_pos + ))); + } + let adjusted_pos = raw_pos as u64; + let utf16_len: usize = insert_text.chars().map(|c| c.len_utf16()).sum(); + text + .insert(adjusted_pos, &insert_text) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + offset += utf16_len as i64; + } + } + } + + Ok(()) +} + +#[derive(Debug)] +enum TextDiffOp { + /// Delete operation with UTF-16 code unit positions + Delete { start_utf16: usize, len_utf16: usize }, + /// Insert operation with UTF-16 code unit position and text to insert + Insert { pos_utf16: usize, text: String }, +} + +/// Computes character-level diff between two strings using greedy matching. +/// Returns operations with UTF-16 code unit positions (required by y_octo). +fn compute_text_diff(old: &[char], new: &[char]) -> Vec { + // Find common prefix + let mut prefix_len = 0; + while prefix_len < old.len() && prefix_len < new.len() && old[prefix_len] == new[prefix_len] { + prefix_len += 1; + } + + // Find common suffix (from the non-prefix parts) + let old_remaining = &old[prefix_len..]; + let new_remaining = &new[prefix_len..]; + + let mut suffix_len = 0; + while suffix_len < old_remaining.len() + && suffix_len < new_remaining.len() + && old_remaining[old_remaining.len() - 1 - suffix_len] == new_remaining[new_remaining.len() - 1 - suffix_len] + { + suffix_len += 1; + } + + // The middle parts that differ + let old_mid_len = old_remaining.len() - suffix_len; + let new_mid_start = prefix_len; + let new_mid_len = new_remaining.len() - suffix_len; + + #[derive(Debug, Clone)] + enum Edit { + Keep(char), // Keep this char from old + Delete(char), // Delete this char from old + Insert(char), // Insert this char (from new) + } + + let mut edits = Vec::new(); + + // Keep prefix (store the actual chars for UTF-16 length calculation) + for &c in old.iter().take(prefix_len) { + edits.push(Edit::Keep(c)); + } + + // Delete middle of old + for &c in old.iter().skip(prefix_len).take(old_mid_len) { + edits.push(Edit::Delete(c)); + } + + // Insert middle of new + for &c in new.iter().skip(new_mid_start).take(new_mid_len) { + edits.push(Edit::Insert(c)); + } + + // Keep suffix (store the actual chars for UTF-16 length calculation) + for &c in old.iter().skip(prefix_len + old_mid_len).take(suffix_len) { + edits.push(Edit::Keep(c)); + } + + // Convert edits to operations, tracking position in UTF-16 code units + let mut ops = Vec::new(); + let mut old_pos_utf16 = 0usize; + + // Pending delete + let mut del_start_utf16: Option = None; + let mut del_len_utf16 = 0usize; + + // Pending insert + let mut ins_pos_utf16: Option = None; + let mut ins_text = String::new(); + + for edit in edits { + match edit { + Edit::Keep(c) => { + // Flush pending operations + if let Some(start) = del_start_utf16.take() { + ops.push(TextDiffOp::Delete { + start_utf16: start, + len_utf16: del_len_utf16, + }); + del_len_utf16 = 0; + } + if let Some(pos) = ins_pos_utf16.take() { + ops.push(TextDiffOp::Insert { + pos_utf16: pos, + text: std::mem::take(&mut ins_text), + }); + } + old_pos_utf16 += c.len_utf16(); + } + Edit::Delete(c) => { + // Flush pending inserts first + if let Some(pos) = ins_pos_utf16.take() { + ops.push(TextDiffOp::Insert { + pos_utf16: pos, + text: std::mem::take(&mut ins_text), + }); + } + if del_start_utf16.is_none() { + del_start_utf16 = Some(old_pos_utf16); + } + del_len_utf16 += c.len_utf16(); + old_pos_utf16 += c.len_utf16(); + } + Edit::Insert(c) => { + // Flush pending deletes first + if let Some(start) = del_start_utf16.take() { + ops.push(TextDiffOp::Delete { + start_utf16: start, + len_utf16: del_len_utf16, + }); + del_len_utf16 = 0; + } + if ins_pos_utf16.is_none() { + ins_pos_utf16 = Some(old_pos_utf16); + } + ins_text.push(c); + } + } + } + + // Flush remaining operations + if let Some(start) = del_start_utf16 { + ops.push(TextDiffOp::Delete { + start_utf16: start, + len_utf16: del_len_utf16, + }); + } + if let Some(pos) = ins_pos_utf16 { + ops.push(TextDiffOp::Insert { + pos_utf16: pos, + text: ins_text, + }); + } + + ops +} + +/// Updates the note block's children array, only if the children have changed +fn update_note_children(blocks_map: &mut Map, note_id: &str, new_children: Vec) -> Result<(), ParseError> { + let mut note_block = blocks_map + .get(note_id) + .and_then(|v| v.to_map()) + .ok_or_else(|| ParseError::ParserError("Note block not found".into()))?; + + // Replace children atomically with Any::Array (single CRDT operation) + // This is cleaner than clearing element-by-element and re-inserting + let children_any: Vec = new_children.into_iter().map(Any::String).collect(); + note_block + .insert("sys:children".to_string(), Any::Array(children_any)) + .map_err(|e| ParseError::ParserError(e.to_string()))?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_markdown_to_content_blocks() { + let markdown = "# Title\n\nParagraph one.\n\nParagraph two."; + let blocks = parse_markdown_to_content_blocks(markdown).unwrap(); + + assert_eq!(blocks.len(), 2); + assert_eq!(blocks[0].flavour, BlockFlavour::Paragraph.as_str()); + assert_eq!(blocks[0].content, "Paragraph one."); + assert_eq!(blocks[1].content, "Paragraph two."); + } + + #[test] + fn test_compute_text_diff_simple() { + let old: Vec = "hello".chars().collect(); + let new: Vec = "hello world".chars().collect(); + let ops = compute_text_diff(&old, &new); + + // Should have one insert operation at UTF-16 position 5 + assert!(!ops.is_empty()); + match &ops[0] { + TextDiffOp::Insert { pos_utf16, text } => { + assert_eq!(*pos_utf16, 5); // "hello" is 5 UTF-16 code units + assert_eq!(text, " world"); + } + _ => panic!("Expected Insert operation"), + } + } + + #[test] + fn test_compute_text_diff_emoji() { + // Test with emoji (outside BMP, uses 2 UTF-16 code units per char) + let old: Vec = "a😀b".chars().collect(); + let new: Vec = "a😀c".chars().collect(); + let ops = compute_text_diff(&old, &new); + + // Should delete 'b' at UTF-16 position 3 (1 for 'a', 2 for emoji) + // Insert position is 4 (after 'b'), but offset adjustment in apply_text_diff + // accounts for the delete (-1), resulting in actual insert at position 3 + assert_eq!(ops.len(), 2); + match &ops[0] { + TextDiffOp::Delete { start_utf16, len_utf16 } => { + assert_eq!(*start_utf16, 3); // 'a'=1, '😀'=2, total=3 + assert_eq!(*len_utf16, 1); // 'b' is 1 UTF-16 code unit + } + _ => panic!("Expected Delete operation"), + } + match &ops[1] { + TextDiffOp::Insert { pos_utf16, text } => { + // Position recorded as 4 (after processing delete in old string) + // Offset adjustment will bring this to 3 when applied + assert_eq!(*pos_utf16, 4); + assert_eq!(text, "c"); + } + _ => panic!("Expected Insert operation"), + } + } + + #[test] + fn test_compute_text_diff_replace() { + let old: Vec = "abc".chars().collect(); + let new: Vec = "axc".chars().collect(); + let ops = compute_text_diff(&old, &new); + + // Should have delete 'b' and insert 'x' + assert_eq!(ops.len(), 2); + } + + #[test] + fn test_content_block_similarity() { + let paragraph_flavour = BlockFlavour::Paragraph.as_str(); + let b1 = ContentBlock { + flavour: paragraph_flavour.to_string(), + block_type: Some("h1".to_string()), + content: "Hello".to_string(), + checked: None, + language: None, + }; + let b2 = ContentBlock { + flavour: paragraph_flavour.to_string(), + block_type: Some("h1".to_string()), + content: "World".to_string(), + checked: None, + language: None, + }; + let b3 = ContentBlock { + flavour: paragraph_flavour.to_string(), + block_type: Some("h2".to_string()), + content: "Hello".to_string(), + checked: None, + language: None, + }; + + assert!(b1.is_similar(&b2)); // Same type, different content + assert!(!b1.is_similar(&b3)); // Different type + } + + #[test] + fn test_extract_title() { + assert_eq!(extract_title("# My Title\n\nContent"), "My Title"); + assert_eq!(extract_title("No heading"), "Untitled"); + assert_eq!(extract_title("## Secondary\n\nContent"), "Untitled"); + assert_eq!(extract_title("# **Bold** Title"), "Bold Title"); + } + + #[test] + fn test_update_ydoc_roundtrip() { + use crate::doc_parser::markdown_to_ydoc; + + // Create initial document + let initial_md = "# Test Document\n\nFirst paragraph.\n\nSecond paragraph."; + let doc_id = "update-test"; + + let initial_bin = markdown_to_ydoc(initial_md, doc_id).expect("Should create initial doc"); + + // Update with new content + let updated_md = "# Test Document\n\nFirst paragraph.\n\nModified second paragraph.\n\nNew third paragraph."; + + let delta = update_ydoc(&initial_bin, updated_md, doc_id).expect("Should compute delta"); + + // Delta should not be empty (changes were made) + assert!(!delta.is_empty(), "Delta should contain changes"); + + // Apply delta to original and verify structure + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(&initial_bin) + .expect("Should apply initial"); + doc.apply_update_from_binary_v1(&delta).expect("Should apply delta"); + + // Verify the document has the expected structure + let blocks_map = doc.get_map("blocks").expect("Should have blocks"); + assert!(!blocks_map.is_empty(), "Blocks should not be empty"); + } + + #[test] + fn test_update_ydoc_title_change() { + use crate::doc_parser::markdown_to_ydoc; + + let initial_md = "# Original Title\n\nContent here."; + let doc_id = "title-test"; + + let initial_bin = markdown_to_ydoc(initial_md, doc_id).expect("Should create initial doc"); + + let updated_md = "# New Title\n\nContent here."; + let delta = update_ydoc(&initial_bin, updated_md, doc_id).expect("Should compute delta"); + + // Apply and verify + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(&initial_bin) + .expect("Should apply initial"); + doc.apply_update_from_binary_v1(&delta).expect("Should apply delta"); + + let blocks_map = doc.get_map("blocks").expect("Should have blocks"); + assert!(!blocks_map.is_empty()); + } + + #[test] + fn test_update_ydoc_no_changes() { + use crate::doc_parser::markdown_to_ydoc; + + let markdown = "# Same Title\n\nSame content."; + let doc_id = "no-change-test"; + + let initial_bin = markdown_to_ydoc(markdown, doc_id).expect("Should create initial doc"); + + // Update with identical content + let delta = update_ydoc(&initial_bin, markdown, doc_id).expect("Should compute delta"); + + // Applying the delta should not fail + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(&initial_bin) + .expect("Should apply initial"); + doc + .apply_update_from_binary_v1(&delta) + .expect("Should apply delta even with no changes"); + + // Document should still be valid + let blocks_map = doc.get_map("blocks").expect("Should have blocks"); + assert!(!blocks_map.is_empty()); + } + + #[test] + fn test_update_ydoc_add_block() { + use crate::doc_parser::markdown_to_ydoc; + + // Create initial document with one paragraph + let initial_md = "# Add Block Test\n\nOriginal paragraph."; + let doc_id = "add-block-test"; + + let initial_bin = markdown_to_ydoc(initial_md, doc_id).expect("Should create initial doc"); + let initial_size = initial_bin.len(); + + // Add a new paragraph + let updated_md = "# Add Block Test\n\nOriginal paragraph.\n\nNew paragraph added."; + let delta = update_ydoc(&initial_bin, updated_md, doc_id).expect("Should compute delta"); + + // Delta should be smaller than a full document (indicates true delta encoding) + // Note: For small changes, delta might not always be smaller due to overhead + assert!(!delta.is_empty(), "Delta should contain changes"); + + // Apply delta and verify + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(&initial_bin) + .expect("Should apply initial"); + doc + .apply_update_from_binary_v1(&delta) + .expect("Should apply delta with new block"); + + // Verify block count increased + let blocks_map = doc.get_map("blocks").expect("Should have blocks"); + let block_count = blocks_map.len(); + // Should have: page + note + 2 content blocks = 4 blocks + assert!(block_count >= 4, "Should have at least 4 blocks, got {}", block_count); + + println!( + "Add block test: initial={} bytes, delta={} bytes, blocks={}", + initial_size, + delta.len(), + block_count + ); + } + + #[test] + fn test_update_ydoc_delete_block() { + use crate::doc_parser::markdown_to_ydoc; + + // Create initial document with two paragraphs + let initial_md = "# Delete Block Test\n\nFirst paragraph.\n\nSecond paragraph to delete."; + let doc_id = "delete-block-test"; + + let initial_bin = markdown_to_ydoc(initial_md, doc_id).expect("Should create initial doc"); + + // Remove the second paragraph + let updated_md = "# Delete Block Test\n\nFirst paragraph."; + let delta = update_ydoc(&initial_bin, updated_md, doc_id).expect("Should compute delta"); + + assert!(!delta.is_empty(), "Delta should contain changes"); + + // Apply delta and verify + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(&initial_bin) + .expect("Should apply initial"); + doc + .apply_update_from_binary_v1(&delta) + .expect("Should apply delta with block deletion"); + + // Verify document still valid + let blocks_map = doc.get_map("blocks").expect("Should have blocks"); + assert!(!blocks_map.is_empty(), "Blocks should not be empty after deletion"); + } + + #[test] + fn test_update_ydoc_concurrent_merge_simulation() { + use crate::doc_parser::markdown_to_ydoc; + + // This test simulates concurrent editing by creating two different updates + // from the same base document and merging them. + let base_md = "# Concurrent Test\n\nBase paragraph."; + let doc_id = "concurrent-test"; + + let base_bin = markdown_to_ydoc(base_md, doc_id).expect("Should create base doc"); + + // Client A modifies the paragraph + let client_a_md = "# Concurrent Test\n\nModified by client A."; + let delta_a = update_ydoc(&base_bin, client_a_md, doc_id).expect("Delta A"); + + // Client B adds a new paragraph (from same base) + let client_b_md = "# Concurrent Test\n\nBase paragraph.\n\nAdded by client B."; + let delta_b = update_ydoc(&base_bin, client_b_md, doc_id).expect("Delta B"); + + // Apply both deltas to base document + let mut final_doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + final_doc.apply_update_from_binary_v1(&base_bin).expect("Apply base"); + final_doc.apply_update_from_binary_v1(&delta_a).expect("Apply delta A"); + final_doc.apply_update_from_binary_v1(&delta_b).expect("Apply delta B"); + + // Document should be valid with merged changes + let blocks_map = final_doc.get_map("blocks").expect("Should have blocks"); + assert!(!blocks_map.is_empty(), "Merged document should have blocks"); + + // Should have more blocks than just page + note + 1 paragraph + // The merge should result in at least 4 blocks (page, note, modified para, new + // para) + let block_count = blocks_map.len(); + println!("Concurrent merge test: final block count = {}", block_count); + assert!(block_count >= 4, "Should have merged blocks, got {}", block_count); + } + + #[test] + fn test_update_ydoc_empty_binary_fallback() { + // Test that update_ydoc falls back to markdown_to_ydoc for empty binaries + let markdown = "# New Document\n\nCreated from empty binary."; + let doc_id = "empty-fallback-test"; + + // Empty binary should trigger fallback + let result = update_ydoc(&[], markdown, doc_id).expect("Should create from empty"); + assert!(!result.is_empty(), "Result should not be empty"); + + // [0, 0] minimal empty binary should also trigger fallback + let result = update_ydoc(&[0, 0], markdown, doc_id).expect("Should create from minimal empty"); + assert!(!result.is_empty(), "Result should not be empty"); + + // Verify the result is a valid document + let mut doc = DocOptions::new().with_guid(doc_id.to_string()).build(); + doc + .apply_update_from_binary_v1(&result) + .expect("Should apply created doc"); + + let blocks_map = doc.get_map("blocks").expect("Should have blocks"); + assert!(!blocks_map.is_empty(), "Document created from empty should have blocks"); + } +}