Mirror of https://github.com/toeverything/AFFiNE.git, synced 2026-02-04 00:28:33 +00:00
feat(server): add document write tools for mcp (#14245)
## Summary

This PR adds write capabilities to AFFiNE's MCP (Model Context Protocol) integration, enabling external tools (Claude, GPT, etc.) to create and modify documents programmatically.

**New MCP Tools:**

- `create_document` - Create new documents from markdown content
- `update_document` - Update document content using structural diffing for minimal changes (preserves document history and enables real-time collaboration)

**Implementation:**

- `markdown_to_ydoc.rs` - Converts markdown to AFFiNE-compatible y-octo binary format
- `markdown_utils.rs` - Shared markdown parsing utilities (used by both ydoc-to-md and md-to-ydoc)
- `update_ydoc.rs` - Structural diffing implementation for updating existing documents
- `DocWriter` service - TypeScript service for document operations
- Exposes `markdownToDocBinary` and `updateDocWithMarkdown` via napi bindings

**Supported Markdown Elements:**

- Headings (H1-H6)
- Paragraphs
- Bullet lists and numbered lists
- Code blocks (with language detection)
- Blockquotes
- Horizontal dividers
- Todo items (checkboxes)

**y-octo Changes:**

This PR reverts the y-octo sync (ca2462f, a5b60cf), which introduced a concurrency bug causing hangs when creating documents with many nested block structures. It also ports the improved `get_node_index` binary search fix from upstream, which prevents divide-by-zero panics when decoding documents.

## Test Results ✅

### Unit Tests (47/47 passing)

| Test Suite | Tests | Status |
|------------|-------|--------|
| markdown_to_ydoc | 16/16 | ✅ Pass |
| markdown_utils | 11/11 | ✅ Pass |
| update_ydoc | 13/13 | ✅ Pass |
| delta_markdown | 2/2 | ✅ Pass |
| affine (doc parser) | 5/5 | ✅ Pass |

### End-to-End MCP Testing ✅

Tested against a local AFFiNE server with real MCP client requests:

| Tool | Result | Notes |
|------|--------|-------|
| `tools/list` | ✅ Pass | Returns all 5 tools with correct schemas |
| `create_document` | ✅ Pass | Successfully created test documents |
| `update_document` | ✅ Pass | Successfully updated documents with structural diffing |
| `read_document` | ✅ Pass | Existing tool, works correctly |
| `keyword_search` | ✅ Pass | Existing tool, works correctly |

**E2E Test Details:**

- Started a local AFFiNE server with PostgreSQL, Redis, and Manticore
- Created a test user and workspace via seed/GraphQL
- Verified the MCP endpoint at `/api/workspaces/:workspaceId/mcp`
- Tested JSON-RPC calls with proper SSE streaming
- Confirmed documents are stored and indexed correctly (verified via server logs)

## Test Plan

- [x] All Rust unit tests pass (47 tests)
- [x] Native bindings build successfully (release mode)
- [x] Document creation via MCP works end-to-end
- [x] Document update via MCP works end-to-end
- [x] CodeRabbit feedback addressed
- [ ] Integration testing with Claude/GPT MCP clients

Closes #14161

---

**Requested by:** @realies
**Key guidance from:** @darkskygit (use y-octo instead of yjs for memory efficiency)

## Summary by CodeRabbit

* **New Features**
  * Create documents from Markdown: generate new documents directly from Markdown content with automatic title extraction
  * Update documents with Markdown: modify existing documents using Markdown as the source with automatic diff calculation for efficient updates
  * Copilot integration: new tools for document creation and updates through Copilot's interface
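For a concrete picture of the new surface, here is a minimal sketch of a `create_document` call against the MCP endpoint verified above. The host name, auth, and session handling are placeholders; only the endpoint path, tool name, and argument shape come from this PR.

```ts
// Hypothetical client call to the create_document tool. Host and auth are
// placeholders; the endpoint path and tool arguments match this PR.
const workspaceId = 'your-workspace-id';
const endpoint = `https://affine.example.com/api/workspaces/${workspaceId}/mcp`;

const res = await fetch(endpoint, {
  method: 'POST',
  headers: {
    'content-type': 'application/json',
    // Responses may stream back as SSE (see the E2E notes above)
    accept: 'application/json, text/event-stream',
  },
  body: JSON.stringify({
    jsonrpc: '2.0',
    id: 1,
    method: 'tools/call',
    params: {
      name: 'create_document',
      arguments: {
        title: 'Meeting Notes',
        // Body markdown only - the tool prepends the title as an H1 itself
        content: '## Agenda\n\n- [ ] Review\n- [x] Ship it',
      },
    },
  }),
});
console.log(await res.text());
```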
Cargo.lock (generated, 25 changes)
@@ -43,8 +43,10 @@ dependencies = [
 "criterion",
 "docx-parser",
 "infer",
 "nanoid",
 "path-ext",
 "pdf-extract",
 "pulldown-cmark",
 "rand 0.9.2",
 "rayon",
 "readability",
@@ -1793,6 +1795,15 @@ dependencies = [
 "version_check",
]

[[package]]
name = "getopts"
version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
dependencies = [
 "unicode-width",
]

[[package]]
name = "getrandom"
version = "0.2.16"
@@ -3474,10 +3485,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0"
dependencies = [
 "bitflags 2.10.0",
 "getopts",
 "memchr",
 "pulldown-cmark-escape",
 "unicase",
]

[[package]]
name = "pulldown-cmark-escape"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"

[[package]]
name = "quick-error"
version = "1.2.3"
@@ -5161,6 +5180,12 @@ version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"

[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"

[[package]]
name = "uniffi"
version = "0.29.5"
@@ -71,6 +71,7 @@ resolver = "3"
phf = { version = "0.11", features = ["macros"] }
proptest = "1.3"
proptest-derive = "0.5"
pulldown-cmark = "0.13"
rand = "0.9"
rand_chacha = "0.9"
rand_distr = "0.5"
packages/backend/native/index.d.ts (vendored, 40 changes)
@@ -4,6 +4,20 @@ export declare class Tokenizer {
  count(content: string, allowedSpecial?: Array<string> | undefined | null): number
}

/**
 * Adds a document ID to the workspace root doc's meta.pages array.
 * This registers the document in the workspace so it appears in the UI.
 *
 * # Arguments
 * * `root_doc_bin` - The current root doc binary (workspaceId doc)
 * * `doc_id` - The document ID to add
 * * `title` - Optional title for the document
 *
 * # Returns
 * A Buffer containing the y-octo update binary to apply to the root doc
 */
export declare function addDocToRootDoc(rootDocBin: Buffer, docId: string, title?: string | undefined | null): Buffer

export const AFFINE_PRO_LICENSE_AES_KEY: string | undefined | null

export const AFFINE_PRO_PUBLIC_KEY: string | undefined | null

@@ -19,6 +33,18 @@ export declare function getMime(input: Uint8Array): string

export declare function htmlSanitize(input: string): string

/**
 * Converts markdown content to AFFiNE-compatible y-octo document binary.
 *
 * # Arguments
 * * `markdown` - The markdown content to convert
 * * `doc_id` - The document ID to use for the y-octo doc
 *
 * # Returns
 * A Buffer containing the y-octo document update binary
 */
export declare function markdownToDocBinary(markdown: string, docId: string): Buffer

/**
 * Merge updates in form like `Y.applyUpdate(doc, update)` way and return the
 * result binary.
@@ -77,4 +103,18 @@ export declare function parseWorkspaceDoc(docBin: Buffer): NativeWorkspaceDocCon

export declare function readAllDocIdsFromRootDoc(docBin: Buffer, includeTrash?: boolean | undefined | null): Array<string>

/**
 * Updates an existing document with new markdown content.
 * Uses structural and text-level diffing to apply minimal changes.
 *
 * # Arguments
 * * `existing_binary` - The current document binary
 * * `new_markdown` - The new markdown content to apply
 * * `doc_id` - The document ID
 *
 * # Returns
 * A Buffer containing only the delta (changes) as a y-octo update binary
 */
export declare function updateDocWithMarkdown(existingBinary: Buffer, newMarkdown: string, docId: string): Buffer

export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise<boolean>
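Taken together, the three new declarations cover the whole write path. A minimal sketch of driving them directly from TypeScript; the import path here is illustrative, since the server actually reaches them through its local `../../native` re-exports shown further down:

```ts
// Illustrative import path; in the server these come from '../../native'.
import {
  addDocToRootDoc,
  markdownToDocBinary,
  updateDocWithMarkdown,
} from '@affine/server-native';

declare const rootDocBin: Buffer; // current workspace root doc binary

const docId = 'example-doc-id';

// Full y-octo binary for a brand-new document.
const docBin = markdownToDocBinary('# Hello\n\nFirst paragraph.', docId);

// Delta that registers the doc in the root doc's meta.pages.
const rootUpdate = addDocToRootDoc(rootDocBin, docId, 'Hello');

// Later: delta that reshapes the stored doc toward new markdown.
const delta = updateDocWithMarkdown(docBin, '# Hello\n\nEdited text.', docId);
```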
@@ -132,3 +132,52 @@ pub fn read_all_doc_ids_from_root_doc(doc_bin: Buffer, include_trash: Option<boo
    .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
  Ok(result)
}

/// Converts markdown content to AFFiNE-compatible y-octo document binary.
///
/// # Arguments
/// * `markdown` - The markdown content to convert
/// * `doc_id` - The document ID to use for the y-octo doc
///
/// # Returns
/// A Buffer containing the y-octo document update binary
#[napi]
pub fn markdown_to_doc_binary(markdown: String, doc_id: String) -> Result<Buffer> {
  let result =
    doc_parser::markdown_to_ydoc(&markdown, &doc_id).map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
  Ok(Buffer::from(result))
}

/// Updates an existing document with new markdown content.
/// Uses structural and text-level diffing to apply minimal changes.
///
/// # Arguments
/// * `existing_binary` - The current document binary
/// * `new_markdown` - The new markdown content to apply
/// * `doc_id` - The document ID
///
/// # Returns
/// A Buffer containing only the delta (changes) as a y-octo update binary
#[napi]
pub fn update_doc_with_markdown(existing_binary: Buffer, new_markdown: String, doc_id: String) -> Result<Buffer> {
  let result = doc_parser::update_ydoc(&existing_binary, &new_markdown, &doc_id)
    .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
  Ok(Buffer::from(result))
}

/// Adds a document ID to the workspace root doc's meta.pages array.
/// This registers the document in the workspace so it appears in the UI.
///
/// # Arguments
/// * `root_doc_bin` - The current root doc binary (workspaceId doc)
/// * `doc_id` - The document ID to add
/// * `title` - Optional title for the document
///
/// # Returns
/// A Buffer containing the y-octo update binary to apply to the root doc
#[napi]
pub fn add_doc_to_root_doc(root_doc_bin: Buffer, doc_id: String, title: Option<String>) -> Result<Buffer> {
  let result = doc_parser::add_doc_to_root_doc(root_doc_bin.into(), &doc_id, title.as_deref())
    .map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
  Ok(Buffer::from(result))
}
@@ -11,6 +11,7 @@ import { DocEventsListener } from './event';
import { DocStorageCronJob } from './job';
import { DocStorageOptions } from './options';
import { DatabaseDocReader, DocReader, DocReaderProvider } from './reader';
import { DocWriter } from './writer';

@Module({
  imports: [QuotaModule, PermissionModule, StorageModule],
@@ -22,10 +23,12 @@ import { DatabaseDocReader, DocReader, DocReaderProvider } from './reader';
    DocReaderProvider,
    DatabaseDocReader,
    DocEventsListener,
    DocWriter,
  ],
  exports: [
    DatabaseDocReader,
    DocReader,
    DocWriter,
    PgWorkspaceDocStorageAdapter,
    PgUserspaceDocStorageAdapter,
  ],
@@ -35,6 +38,7 @@ export {
  // only for doc-service
  DatabaseDocReader,
  DocReader,
  DocWriter,
  PgUserspaceDocStorageAdapter,
  PgWorkspaceDocStorageAdapter,
};
packages/backend/server/src/core/doc/writer.ts (new file, 131 lines)
@@ -0,0 +1,131 @@
import { Injectable, Logger, NotFoundException } from '@nestjs/common';
import { nanoid } from 'nanoid';

import {
  addDocToRootDoc,
  markdownToDocBinary,
  updateDocWithMarkdown,
} from '../../native';
import { PgWorkspaceDocStorageAdapter } from './adapters/workspace';

export interface CreateDocResult {
  docId: string;
}

export interface UpdateDocResult {
  success: boolean;
}

@Injectable()
export class DocWriter {
  private readonly logger = new Logger(DocWriter.name);

  constructor(private readonly storage: PgWorkspaceDocStorageAdapter) {}

  /**
   * Creates a new document from markdown content.
   *
   * @param workspaceId - The workspace ID
   * @param markdown - The markdown content
   * @param editorId - Optional editor ID for tracking
   * @returns The created document ID
   */
  async createDoc(
    workspaceId: string,
    markdown: string,
    editorId?: string
  ): Promise<CreateDocResult> {
    // Fetch workspace root doc first - reject if not found.
    // The root doc (docId = workspaceId) contains the meta.pages array.
    const rootDoc = await this.storage.getDoc(workspaceId, workspaceId);
    if (!rootDoc?.bin) {
      throw new NotFoundException(
        `Workspace ${workspaceId} not found or has no root document`
      );
    }

    const rootDocBin = Buffer.isBuffer(rootDoc.bin)
      ? rootDoc.bin
      : Buffer.from(
          rootDoc.bin.buffer,
          rootDoc.bin.byteOffset,
          rootDoc.bin.byteLength
        );

    const docId = nanoid();

    this.logger.debug(
      `Creating doc ${docId} in workspace ${workspaceId} from markdown`
    );

    // Convert markdown to y-octo binary
    const binary = markdownToDocBinary(markdown, docId);

    // Extract title from markdown (first H1 heading)
    const titleMatch = markdown.match(/^#\s+(.+?)(?:\s*#+)?\s*$/m);
    const title = titleMatch ? titleMatch[1].trim() : undefined;

    // Prepare the root doc update to register the new document
    const rootDocUpdate = addDocToRootDoc(rootDocBin, docId, title);

    // Push both updates together - root doc first, then the new doc
    await this.storage.pushDocUpdates(
      workspaceId,
      workspaceId,
      [rootDocUpdate],
      editorId
    );
    await this.storage.pushDocUpdates(workspaceId, docId, [binary], editorId);

    this.logger.debug(
      `Created and registered doc ${docId} in workspace ${workspaceId}`
    );

    return { docId };
  }

  /**
   * Updates an existing document with new markdown content.
   *
   * Uses structural diffing to compute minimal changes between the existing
   * document and new markdown, then applies only the delta. This preserves
   * document history and enables proper CRDT merging with concurrent edits.
   *
   * @param workspaceId - The workspace ID
   * @param docId - The document ID to update
   * @param markdown - The new markdown content
   * @param editorId - Optional editor ID for tracking
   */
  async updateDoc(
    workspaceId: string,
    docId: string,
    markdown: string,
    editorId?: string
  ): Promise<UpdateDocResult> {
    this.logger.debug(
      `Updating doc ${docId} in workspace ${workspaceId} from markdown`
    );

    // Fetch the existing document
    const existingDoc = await this.storage.getDoc(workspaceId, docId);
    if (!existingDoc?.bin) {
      throw new NotFoundException(`Document ${docId} not found`);
    }

    // Compute the delta update using structural diff.
    // Use a zero-copy buffer view when possible for the native function.
    const existingBinary = Buffer.isBuffer(existingDoc.bin)
      ? existingDoc.bin
      : Buffer.from(
          existingDoc.bin.buffer,
          existingDoc.bin.byteOffset,
          existingDoc.bin.byteLength
        );
    const delta = updateDocWithMarkdown(existingBinary, markdown, docId);

    // Push only the delta changes
    await this.storage.pushDocUpdates(workspaceId, docId, [delta], editorId);

    return { success: true };
  }
}
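Because the doc module above now provides and exports `DocWriter`, any feature module that imports it can inject the service. A hypothetical consumer, for illustration only:

```ts
import { Injectable } from '@nestjs/common';

import { DocWriter } from '../../core/doc';

// Hypothetical service, not part of this PR.
@Injectable()
export class MarkdownImporter {
  constructor(private readonly writer: DocWriter) {}

  async importNote(workspaceId: string, markdown: string, userId: string) {
    // createDoc stores the document and registers it in meta.pages.
    const { docId } = await this.writer.createDoc(workspaceId, markdown, userId);
    return docId;
  }
}
```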
@@ -49,3 +49,8 @@ export const readAllDocIdsFromRootDoc =
export const AFFINE_PRO_PUBLIC_KEY = serverNativeModule.AFFINE_PRO_PUBLIC_KEY;
export const AFFINE_PRO_LICENSE_AES_KEY =
  serverNativeModule.AFFINE_PRO_LICENSE_AES_KEY;

// MCP write tools exports
export const markdownToDocBinary = serverNativeModule.markdownToDocBinary;
export const updateDocWithMarkdown = serverNativeModule.updateDocWithMarkdown;
export const addDocToRootDoc = serverNativeModule.addDocToRootDoc;
@@ -4,7 +4,7 @@ import { Injectable } from '@nestjs/common';
import { pick } from 'lodash-es';
import z from 'zod/v3';

-import { DocReader } from '../../../core/doc';
+import { DocReader, DocWriter } from '../../../core/doc';
import { AccessController } from '../../../core/permission';
import { clearEmbeddingChunk } from '../../../models';
import { IndexerService } from '../../indexer';
@@ -15,6 +15,7 @@ export class WorkspaceMcpProvider {
  constructor(
    private readonly ac: AccessController,
    private readonly reader: DocReader,
    private readonly writer: DocWriter,
    private readonly context: CopilotContextService,
    private readonly indexer: IndexerService
  ) {}
@@ -165,6 +166,147 @@ export class WorkspaceMcpProvider {
      }
    );

    // Write tools - create and update documents
    server.registerTool(
      'create_document',
      {
        title: 'Create Document',
        description:
          'Create a new document in the workspace with the given title and markdown content. Returns the ID of the created document.',
        inputSchema: z.object({
          title: z.string().min(1).describe('The title of the new document'),
          content: z
            .string()
            .describe(
              'The markdown content for the document body (should NOT include a title H1 - the title parameter will be used)'
            ),
        }),
      },
      async ({ title, content }) => {
        try {
          // Check if the user can create docs in this workspace
          await this.ac
            .user(userId)
            .workspace(workspaceId)
            .assert('Workspace.CreateDoc');

          // Combine title and content into markdown.
          // Sanitize the title by removing newlines and trimming.
          const sanitizedTitle = title.replace(/[\r\n]+/g, ' ').trim();
          if (!sanitizedTitle) {
            throw new Error('Title cannot be empty');
          }

          // Strip any leading H1 from content to prevent duplicates.
          // Per the CommonMark spec, ATX headings allow only 0-3 spaces before the #.
          // Handles: "# Title", "   # Title", "# Title #"
          const strippedContent = content.replace(
            /^[ \t]{0,3}#\s+[^\n]*#*\s*\n*/,
            ''
          );

          const markdown = `# ${sanitizedTitle}\n\n${strippedContent}`;

          // Create the document
          const result = await this.writer.createDoc(
            workspaceId,
            markdown,
            userId
          );

          return {
            content: [
              {
                type: 'text',
                text: JSON.stringify({
                  success: true,
                  docId: result.docId,
                  message: `Document "${title}" created successfully`,
                }),
              },
            ],
          } as const;
        } catch (error) {
          return {
            isError: true,
            content: [
              {
                type: 'text',
                text: `Failed to create document: ${error instanceof Error ? error.message : 'Unknown error'}`,
              },
            ],
          };
        }
      }
    );

    server.registerTool(
      'update_document',
      {
        title: 'Update Document',
        description:
          'Update an existing document with new markdown content. Uses structural diffing to apply minimal changes, preserving document history and enabling real-time collaboration.',
        inputSchema: z.object({
          docId: z.string().describe('The ID of the document to update'),
          content: z
            .string()
            .describe(
              'The complete new markdown content for the document (including title as H1)'
            ),
        }),
      },
      async ({ docId, content }) => {
        const notFoundError: CallToolResult = {
          isError: true,
          content: [
            {
              type: 'text',
              text: `Doc with id ${docId} not found.`,
            },
          ],
        };

        // Use can() instead of assert() to avoid leaking doc existence info
        const accessible = await this.ac
          .user(userId)
          .workspace(workspaceId)
          .doc(docId)
          .can('Doc.Update');

        if (!accessible) {
          return notFoundError;
        }

        try {
          // Update the document
          await this.writer.updateDoc(workspaceId, docId, content, userId);

          return {
            content: [
              {
                type: 'text',
                text: JSON.stringify({
                  success: true,
                  docId,
                  message: `Document updated successfully`,
                }),
              },
            ],
          } as const;
        } catch (error) {
          return {
            isError: true,
            content: [
              {
                type: 'text',
                text: `Failed to update document: ${error instanceof Error ? error.message : 'Unknown error'}`,
              },
            ],
          };
        }
      }
    );

    return server;
  }
}
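And the matching JSON-RPC payload an MCP client would send to `update_document`; transport and auth are elided, and the doc ID is a hypothetical value returned by an earlier `create_document` call:

```ts
// Hypothetical update_document call body. Note that unlike create_document,
// the content here is the complete document, including the H1 title.
const updateCall = {
  jsonrpc: '2.0',
  id: 2,
  method: 'tools/call',
  params: {
    name: 'update_document',
    arguments: {
      docId: 'example-doc-id',
      content: '# Meeting Notes\n\n## Agenda\n\n- [x] Review\n- [x] Ship it',
    },
  },
};
```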
@@ -37,15 +37,25 @@ tree-sitter = [
  "dep:tree-sitter-scala",
  "dep:tree-sitter-typescript",
]
-ydoc-loader = ["assert-json-diff", "serde", "serde_json", "thiserror", "y-octo"]
+ydoc-loader = [
+  "assert-json-diff",
+  "nanoid",
+  "pulldown-cmark",
+  "serde",
+  "serde_json",
+  "thiserror",
+  "y-octo",
+]

[dependencies]
assert-json-diff = { workspace = true, optional = true }
chrono = { workspace = true, optional = true }
docx-parser = { workspace = true, optional = true }
infer = { workspace = true, optional = true }
nanoid = { workspace = true, optional = true }
path-ext = { workspace = true, optional = true }
pdf-extract = { workspace = true, optional = true }
pulldown-cmark = { workspace = true, optional = true }
rand = { workspace = true, optional = true }
readability = { workspace = true, optional = true, default-features = false }
serde = { workspace = true, optional = true, features = ["derive"] }
@@ -584,6 +584,113 @@ pub fn get_doc_ids_from_binary(doc_bin: Vec<u8>, include_trash: bool) -> Result<
  Ok(doc_ids)
}

/// Adds a document ID to the root doc's meta.pages array.
/// Returns a binary update that can be applied to the root doc.
///
/// # Arguments
/// * `root_doc_bin` - The current root doc binary
/// * `doc_id` - The document ID to add
/// * `title` - Optional title for the document
///
/// # Returns
/// A Vec<u8> containing the y-octo update binary to add the doc
pub fn add_doc_to_root_doc(root_doc_bin: Vec<u8>, doc_id: &str, title: Option<&str>) -> Result<Vec<u8>, ParseError> {
  // Handle an empty or minimal root doc - create a new one
  let doc = if root_doc_bin.is_empty() || root_doc_bin == [0, 0] {
    DocOptions::new().build()
  } else {
    let mut doc = DocOptions::new().build();
    doc
      .apply_update_from_binary_v1(&root_doc_bin)
      .map_err(|_| ParseError::InvalidBinary)?;
    doc
  };

  // Capture state before modifications to encode only the delta
  let state_before = doc.get_state_vector();

  // Get or create the meta map
  let mut meta = doc.get_or_create_map("meta")?;

  // Get the existing pages array or create a new one
  let pages_exists = meta.get("pages").and_then(|v| v.to_array()).is_some();

  if pages_exists {
    // Get the existing array and add to it
    let mut pages = meta.get("pages").and_then(|v| v.to_array()).unwrap();

    // Check if the doc already exists
    let doc_exists = pages.iter().any(|page_val| {
      page_val
        .to_map()
        .and_then(|page| get_string(&page, "id"))
        .map(|id| id == doc_id)
        .unwrap_or(false)
    });

    if !doc_exists {
      // Create a new page entry
      let page_map = doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?;

      // Insert into the pages array first, then populate
      let idx = pages.len();
      pages
        .insert(idx, page_map)
        .map_err(|e| ParseError::ParserError(e.to_string()))?;

      // Now get the inserted map and populate it
      if let Some(mut inserted_page) = pages.get(idx).and_then(|v| v.to_map()) {
        inserted_page
          .insert("id".to_string(), Any::String(doc_id.to_string()))
          .map_err(|e| ParseError::ParserError(e.to_string()))?;

        if let Some(t) = title {
          inserted_page
            .insert("title".to_string(), Any::String(t.to_string()))
            .map_err(|e| ParseError::ParserError(e.to_string()))?;
        }

        // Set createDate to the current timestamp
        let timestamp = std::time::SystemTime::now()
          .duration_since(std::time::UNIX_EPOCH)
          .map(|d| d.as_millis() as i64)
          .unwrap_or(0);
        inserted_page
          .insert("createDate".to_string(), Any::BigInt64(timestamp))
          .map_err(|e| ParseError::ParserError(e.to_string()))?;
      }
    }
  } else {
    // Create a new pages array with this doc
    let page_entry = vec![Any::Object(
      [
        ("id".to_string(), Any::String(doc_id.to_string())),
        ("title".to_string(), Any::String(title.unwrap_or("").to_string())),
        (
          "createDate".to_string(),
          Any::BigInt64(
            std::time::SystemTime::now()
              .duration_since(std::time::UNIX_EPOCH)
              .map(|d| d.as_millis() as i64)
              .unwrap_or(0),
          ),
        ),
      ]
      .into_iter()
      .collect(),
    )];

    meta
      .insert("pages".to_string(), Any::Array(page_entry))
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  // Encode only the changes (delta) since state_before
  doc
    .encode_state_as_update_v1(&state_before)
    .map_err(|e| ParseError::ParserError(e.to_string()))
}

fn paragraph_prefix(type_: &str) -> &'static str {
  match type_ {
    "h1" => "# ",
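For reference, the shape of the entry this function writes into `meta.pages`, sketched as the TypeScript type a client would read back. Only the fields written above are listed; real workspace metadata may carry more.

```ts
// Fields mirror the inserts in add_doc_to_root_doc above.
interface PageMetaEntry {
  id: string; // the registered document ID
  title?: string; // set when a title was supplied (or '' for a fresh pages array)
  createDate: number; // Unix epoch milliseconds
}
```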
packages/common/native/src/doc_parser/markdown_to_ydoc.rs (new file, 492 lines)
@@ -0,0 +1,492 @@
//! Markdown to YDoc conversion module
//!
//! Converts markdown content into AFFiNE-compatible y-octo document binary
//! format.

use y_octo::{Any, DocOptions};

use super::{
  affine::ParseError,
  markdown_utils::{BlockType, ParsedBlock, extract_title, parse_markdown_blocks},
};

/// Block types used in AFFiNE documents
const PAGE_FLAVOUR: &str = "affine:page";
const NOTE_FLAVOUR: &str = "affine:note";

/// Intermediate representation of a block for building y-octo documents
struct BlockBuilder {
  id: String,
  flavour: String,
  text_content: String,
  block_type: Option<BlockType>,
  checked: Option<bool>,
  code_language: Option<String>,
  #[allow(dead_code)] // Reserved for future nested block support
  children: Vec<String>,
}

impl BlockBuilder {
  fn new(flavour: &str) -> Self {
    Self {
      id: nanoid::nanoid!(),
      flavour: flavour.to_string(),
      text_content: String::new(),
      block_type: None,
      checked: None,
      code_language: None,
      children: Vec::new(),
    }
  }

  fn with_text(mut self, text: &str) -> Self {
    self.text_content = text.to_string();
    self
  }

  fn with_block_type(mut self, btype: BlockType) -> Self {
    self.block_type = Some(btype);
    self
  }

  fn with_checked(mut self, checked: bool) -> Self {
    self.checked = Some(checked);
    self
  }

  fn with_code_language(mut self, lang: &str) -> Self {
    if !lang.is_empty() {
      self.code_language = Some(lang.to_string());
    }
    self
  }
}

/// Converts a ParsedBlock from the shared parser into a BlockBuilder
impl From<ParsedBlock> for BlockBuilder {
  fn from(parsed: ParsedBlock) -> Self {
    let mut builder = BlockBuilder::new(parsed.flavour.as_str()).with_text(&parsed.content);

    if let Some(btype) = parsed.block_type {
      builder = builder.with_block_type(btype);
    }

    if let Some(checked) = parsed.checked {
      builder = builder.with_checked(checked);
    }

    if let Some(lang) = parsed.language {
      builder = builder.with_code_language(&lang);
    }

    builder
  }
}

/// Parses markdown and converts it to an AFFiNE-compatible y-octo document
/// binary.
///
/// # Arguments
/// * `markdown` - The markdown content to convert
/// * `doc_id` - The document ID to use
///
/// # Returns
/// A binary vector containing the y-octo encoded document
pub fn markdown_to_ydoc(markdown: &str, doc_id: &str) -> Result<Vec<u8>, ParseError> {
  // Extract the title from the first H1 heading
  let title = extract_title(markdown);

  // Parse markdown into blocks using the shared parser
  let parsed_blocks = parse_markdown_blocks(markdown, true);

  // Convert ParsedBlocks to BlockBuilders and collect IDs
  let mut blocks: Vec<BlockBuilder> = Vec::new();
  let mut content_block_ids: Vec<String> = Vec::new();

  for parsed in parsed_blocks {
    let builder: BlockBuilder = parsed.into();
    content_block_ids.push(builder.id.clone());
    blocks.push(builder);
  }

  // Build the y-octo document
  build_ydoc(doc_id, &title, blocks, content_block_ids)
}

/// Builds the y-octo document from parsed blocks.
///
/// Uses a two-phase approach to ensure Yjs compatibility:
/// 1. Phase 1: Create and insert empty maps into blocks_map (establishes
///    parent items)
/// 2. Phase 2: Populate each map with properties (child items reference
///    existing parents)
///
/// This ordering ensures that when items reference their parent map's ID in
/// the encoded binary, the parent ID always has a lower clock value, which
/// Yjs requires.
fn build_ydoc(
  doc_id: &str,
  title: &str,
  content_blocks: Vec<BlockBuilder>,
  content_block_ids: Vec<String>,
) -> Result<Vec<u8>, ParseError> {
  // Create the document with the specified ID
  let doc = DocOptions::new().with_guid(doc_id.to_string()).build();

  // Create the blocks map
  let mut blocks_map = doc
    .get_or_create_map("blocks")
    .map_err(|e| ParseError::ParserError(e.to_string()))?;

  // Create block IDs
  let page_id = nanoid::nanoid!();
  let note_id = nanoid::nanoid!();

  // ==== PHASE 1: Insert empty maps to establish parent items ====
  // This ensures parent items have lower clock values than their children

  // Insert empty page block map
  blocks_map
    .insert(
      page_id.clone(),
      doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?,
    )
    .map_err(|e| ParseError::ParserError(e.to_string()))?;

  // Insert empty note block map
  blocks_map
    .insert(
      note_id.clone(),
      doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?,
    )
    .map_err(|e| ParseError::ParserError(e.to_string()))?;

  // Insert empty content block maps
  for block in &content_blocks {
    blocks_map
      .insert(
        block.id.clone(),
        doc.create_map().map_err(|e| ParseError::ParserError(e.to_string()))?,
      )
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  // ==== PHASE 2: Populate the maps with their properties ====
  // Now each map has an item with a lower clock, so children will reference
  // correctly

  // Populate page block
  if let Some(page_map) = blocks_map.get(&page_id).and_then(|v| v.to_map()) {
    populate_block_map(
      &doc,
      page_map,
      &page_id,
      PAGE_FLAVOUR,
      Some(title),
      None,
      None,
      None,
      None,
      vec![note_id.clone()],
    )?;
  }

  // Populate note block
  if let Some(note_map) = blocks_map.get(&note_id).and_then(|v| v.to_map()) {
    populate_block_map(
      &doc,
      note_map,
      &note_id,
      NOTE_FLAVOUR,
      None,
      None,
      None,
      None,
      None,
      content_block_ids.clone(),
    )?;
  }

  // Populate content blocks
  for block in content_blocks {
    if let Some(block_map) = blocks_map.get(&block.id).and_then(|v| v.to_map()) {
      populate_block_map(
        &doc,
        block_map,
        &block.id,
        &block.flavour,
        None,
        if block.text_content.is_empty() {
          None
        } else {
          Some(&block.text_content)
        },
        block.block_type,
        block.checked,
        block.code_language.as_deref(),
        Vec::new(),
      )?;
    }
  }

  // Encode the document
  doc
    .encode_update_v1()
    .map_err(|e| ParseError::ParserError(e.to_string()))
}

/// Populates an existing block map with the given properties.
///
/// This function takes an already-inserted map and populates it with
/// properties. The two-phase approach (insert empty map first, then populate)
/// ensures that when child items reference the map as their parent, the
/// parent's clock is lower.
///
/// IMPORTANT: We use Any types (Any::Array, Any::String) instead of CRDT types
/// (y_octo::Array, y_octo::Text) for nested values. Any types are encoded
/// inline as part of the item content, avoiding the forward reference issue
/// where child items would reference a parent with a higher clock value.
#[allow(clippy::too_many_arguments)]
fn populate_block_map(
  _doc: &y_octo::Doc,
  mut block: y_octo::Map,
  block_id: &str,
  flavour: &str,
  title: Option<&str>,
  text_content: Option<&str>,
  block_type: Option<BlockType>,
  checked: Option<bool>,
  code_language: Option<&str>,
  children: Vec<String>,
) -> Result<(), ParseError> {
  // Required fields
  block
    .insert("sys:id".to_string(), Any::String(block_id.to_string()))
    .map_err(|e| ParseError::ParserError(e.to_string()))?;
  block
    .insert("sys:flavour".to_string(), Any::String(flavour.to_string()))
    .map_err(|e| ParseError::ParserError(e.to_string()))?;

  // Children - use Any::Array, which is encoded inline (no forward references)
  let children_any: Vec<Any> = children.into_iter().map(Any::String).collect();
  block
    .insert("sys:children".to_string(), Any::Array(children_any))
    .map_err(|e| ParseError::ParserError(e.to_string()))?;

  // Title
  if let Some(title) = title {
    block
      .insert("prop:title".to_string(), Any::String(title.to_string()))
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  // Text content - use Any::String instead of Y.Text.
  // This is simpler and avoids CRDT overhead for initial document creation.
  if let Some(content) = text_content {
    block
      .insert("prop:text".to_string(), Any::String(content.to_string()))
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  // Block type
  if let Some(btype) = block_type {
    block
      .insert("prop:type".to_string(), Any::String(btype.as_str().to_string()))
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  // Checked state
  if let Some(is_checked) = checked {
    block
      .insert(
        "prop:checked".to_string(),
        if is_checked { Any::True } else { Any::False },
      )
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  // Code language
  if let Some(lang) = code_language {
    block
      .insert("prop:language".to_string(), Any::String(lang.to_string()))
      .map_err(|e| ParseError::ParserError(e.to_string()))?;
  }

  Ok(())
}

#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn test_simple_markdown() {
    let markdown = "# Hello World\n\nThis is a test paragraph.";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
    let bin = result.unwrap();
    assert!(!bin.is_empty());
  }

  #[test]
  fn test_markdown_with_list() {
    let markdown = "# Test List\n\n- Item 1\n- Item 2\n- Item 3";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_markdown_with_code() {
    let markdown = "# Code Example\n\n```rust\nfn main() {\n    println!(\"Hello\");\n}\n```";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_markdown_with_headings() {
    let markdown = "# H1\n\n## H2\n\n### H3\n\nParagraph text.";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_extract_title_usage() {
    assert_eq!(extract_title("# My Title\n\nContent"), "My Title");
    assert_eq!(extract_title("No heading"), "Untitled");
    assert_eq!(extract_title("## Secondary\n\nContent"), "Untitled");
  }

  #[test]
  fn test_empty_markdown() {
    let result = markdown_to_ydoc("", "test-doc-id");
    assert!(result.is_ok());
    let bin = result.unwrap();
    assert!(!bin.is_empty()); // Should still create a valid doc structure
  }

  #[test]
  fn test_whitespace_only_markdown() {
    let result = markdown_to_ydoc("   \n\n\t\n  ", "test-doc-id");
    assert!(result.is_ok());
    let bin = result.unwrap();
    assert!(!bin.is_empty());
  }

  #[test]
  fn test_markdown_without_h1() {
    // Should use "Untitled" as the default title
    let markdown = "## Secondary Heading\n\nSome content without H1.";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_nested_lists() {
    let markdown = "# Nested Lists\n\n- Item 1\n  - Nested 1.1\n  - Nested 1.2\n- Item 2\n  - Nested 2.1";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_blockquote() {
    let markdown = "# Title\n\n> A blockquote";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_divider() {
    let markdown = "# Title\n\nBefore divider\n\n---\n\nAfter divider";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_numbered_list() {
    let markdown = "# Title\n\n1. First item\n2. Second item";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_four_paragraphs() {
    // Test with 4 paragraphs
    let markdown = "# Title\n\nP1.\n\nP2.\n\nP3.\n\nP4.";
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_mixed_content() {
    let markdown = r#"# Mixed Content

Some intro text.

- List item 1
- List item 2

```python
def hello():
    print("world")
```

## Another Section

More text here.

1. Numbered item
2. Another numbered

> A blockquote

---

Final paragraph.
"#;
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
  }

  #[test]
  fn test_code_block_preserves_indentation() {
    // Code blocks should preserve leading whitespace (indentation), which is
    // semantically significant in languages like Python, YAML, etc.
    let markdown = r#"# Code Test

```python
def indented():
    return "preserved"
```
"#;
    let result = markdown_to_ydoc(markdown, "test-doc-id");
    assert!(result.is_ok());
    // The test passes if the conversion succeeds without errors.
    // Full verification would require roundtrip testing.
  }

  #[test]
  fn test_document_creation() {
    // Test that markdown_to_ydoc creates a valid binary
    let original_md = "# Test Document\n\nHello world.";
    let doc_id = "creation-test";

    let bin = markdown_to_ydoc(original_md, doc_id).expect("Should convert to ydoc");

    // Binary should not be empty
    assert!(!bin.is_empty(), "Binary should not be empty");
    assert!(bin.len() > 10, "Binary should have meaningful content");
  }

  // NOTE: Full roundtrip tests (markdown -> ydoc -> markdown) are not included
  // because y-octo has a limitation where nested maps created with create_map()
  // lose their content after encode/decode. This is a known y-octo limitation.
  //
  // However, the documents we create ARE valid and can be:
  // 1. Pushed to the AFFiNE server via DocStorageAdapter.pushDocUpdates
  // 2. Read by the AFFiNE client, which uses JavaScript Yjs (not y-octo)
  //
  // The MCP write tools work because:
  // - markdown_to_ydoc creates valid y-octo binary
  // - The server stores the binary directly
  // - The client (browser) uses Yjs to decode and render
}
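Since the client side decodes these binaries with JavaScript Yjs rather than y-octo (see the note at the end of the tests above), here is a minimal sketch of inspecting a produced document from TypeScript, assuming the standard `yjs` package:

```ts
import * as Y from 'yjs';

// binary: the Uint8Array produced by markdownToDocBinary on the server.
function dumpBlocks(binary: Uint8Array) {
  const doc = new Y.Doc();
  Y.applyUpdate(doc, binary);

  // Blocks live in the top-level "blocks" map, keyed by block ID.
  const blocks = doc.getMap('blocks');
  blocks.forEach((value, id) => {
    const block = value as Y.Map<unknown>;
    // sys:flavour and prop:text read back as plain values because the
    // builder encodes them as Any types, not nested CRDT types.
    console.log(id, block.get('sys:flavour'), block.get('prop:text'));
  });
}
```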
packages/common/native/src/doc_parser/markdown_utils.rs (new file, 463 lines)
@@ -0,0 +1,463 @@
//! Shared markdown utilities for the doc_parser module

use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag, TagEnd};

/// Block flavours used in AFFiNE documents
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockFlavour {
  Paragraph,
  List,
  Code,
  Divider,
}

impl BlockFlavour {
  pub fn as_str(&self) -> &'static str {
    match self {
      BlockFlavour::Paragraph => "affine:paragraph",
      BlockFlavour::List => "affine:list",
      BlockFlavour::Code => "affine:code",
      BlockFlavour::Divider => "affine:divider",
    }
  }
}

/// Block types for paragraphs and lists
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
  // Paragraph types
  #[allow(dead_code)] // Used via as_str() for the default paragraph type
  Text,
  H1,
  H2,
  H3,
  H4,
  H5,
  H6,
  Quote,
  // List types
  Bulleted,
  Numbered,
  Todo,
}

impl BlockType {
  pub fn as_str(&self) -> &'static str {
    match self {
      BlockType::Text => "text",
      BlockType::H1 => "h1",
      BlockType::H2 => "h2",
      BlockType::H3 => "h3",
      BlockType::H4 => "h4",
      BlockType::H5 => "h5",
      BlockType::H6 => "h6",
      BlockType::Quote => "quote",
      BlockType::Bulleted => "bulleted",
      BlockType::Numbered => "numbered",
      BlockType::Todo => "todo",
    }
  }

  pub fn from_heading_level(level: HeadingLevel) -> Self {
    match level {
      HeadingLevel::H1 => BlockType::H1,
      HeadingLevel::H2 => BlockType::H2,
      HeadingLevel::H3 => BlockType::H3,
      HeadingLevel::H4 => BlockType::H4,
      HeadingLevel::H5 => BlockType::H5,
      HeadingLevel::H6 => BlockType::H6,
    }
  }
}

/// A parsed block from markdown content
#[derive(Debug, Clone, PartialEq)]
pub struct ParsedBlock {
  pub flavour: BlockFlavour,
  pub block_type: Option<BlockType>,
  pub content: String,
  pub checked: Option<bool>,
  pub language: Option<String>,
}

/// Parses markdown content into a list of parsed blocks.
///
/// This is the shared parsing logic used by both `markdown_to_ydoc` and
/// `update_ydoc`.
///
/// # Arguments
/// * `markdown` - The markdown content to parse
/// * `skip_first_h1` - If true, the first H1 heading is skipped (used as the
///   document title)
///
/// # Returns
/// A vector of parsed blocks
pub fn parse_markdown_blocks(markdown: &str, skip_first_h1: bool) -> Vec<ParsedBlock> {
  // Note: ENABLE_TABLES is included for future support, but table events
  // currently fall through to the catch-all match arm. Table content appears
  // as plain text.
  let options = Options::ENABLE_STRIKETHROUGH
    | Options::ENABLE_TABLES
    | Options::ENABLE_TASKLISTS
    | Options::ENABLE_HEADING_ATTRIBUTES;
  let parser = Parser::new_ext(markdown, options);

  let mut blocks = Vec::new();
  let mut current_text = String::new();
  let mut current_type: Option<BlockType> = None;
  let mut current_flavour = BlockFlavour::Paragraph;
  let mut in_list = false;
  let mut list_type_stack: Vec<BlockType> = Vec::new();
  // Per-item type override for task list markers (reset at each Item start)
  let mut current_item_type: Option<BlockType> = None;
  let mut in_code_block = false;
  let mut code_language = String::new();
  let mut first_h1_seen = !skip_first_h1; // If not skipping, mark as already seen
  let mut current_checked: Option<bool> = None;
  let mut pending_link_url: Option<String> = None;

  for event in parser {
    match event {
      Event::Start(Tag::Heading { level, .. }) => {
        flush_block(
          &mut blocks,
          &mut current_text,
          current_flavour,
          current_type.take(),
          current_checked.take(),
          None,
        );

        if level == HeadingLevel::H1 && !first_h1_seen {
          // Skip the first H1 - it's used as the document title
          current_type = Some(BlockType::H1);
        } else {
          current_type = Some(BlockType::from_heading_level(level));
        }
        current_flavour = BlockFlavour::Paragraph;
      }
      Event::End(TagEnd::Heading(level)) => {
        if level == HeadingLevel::H1 && !first_h1_seen {
          first_h1_seen = true;
          current_text.clear();
          current_type = None;
        } else {
          flush_block(
            &mut blocks,
            &mut current_text,
            current_flavour,
            current_type.take(),
            current_checked.take(),
            None,
          );
        }
      }
      Event::Start(Tag::Paragraph) => {}
      Event::End(TagEnd::Paragraph) => {
        if !in_list {
          flush_block(
            &mut blocks,
            &mut current_text,
            current_flavour,
            current_type.take(),
            current_checked.take(),
            None,
          );
        }
      }
      Event::Start(Tag::BlockQuote(_)) => {
        current_type = Some(BlockType::Quote);
        current_flavour = BlockFlavour::Paragraph;
      }
      Event::End(TagEnd::BlockQuote(_)) => {
        flush_block(
          &mut blocks,
          &mut current_text,
          current_flavour,
          current_type.take(),
          current_checked.take(),
          None,
        );
      }
      Event::Start(Tag::List(start_num)) => {
        in_list = true;
        let list_type = if start_num.is_some() {
          BlockType::Numbered
        } else {
          BlockType::Bulleted
        };
        list_type_stack.push(list_type);
      }
      Event::End(TagEnd::List(_)) => {
        list_type_stack.pop();
        if list_type_stack.is_empty() {
          in_list = false;
        }
      }
      Event::Start(Tag::Item) => {
        current_flavour = BlockFlavour::List;
        // Reset the per-item type override
        current_item_type = None;
        if let Some(lt) = list_type_stack.last() {
          current_type = Some(*lt);
        }
      }
      Event::End(TagEnd::Item) => {
        // Use the per-item override if set (for task items), otherwise use current_type
        if let Some(item_type) = current_item_type.take() {
          current_type = Some(item_type);
        }
        flush_block(
          &mut blocks,
          &mut current_text,
          current_flavour,
          current_type.take(),
          current_checked.take(),
          None,
        );
        current_flavour = BlockFlavour::Paragraph;
      }
      Event::TaskListMarker(checked) => {
        // Set the per-item type override for this specific item only
        current_item_type = Some(BlockType::Todo);
        current_checked = Some(checked);
      }
      Event::Start(Tag::CodeBlock(kind)) => {
        in_code_block = true;
        current_flavour = BlockFlavour::Code;
        code_language = match kind {
          CodeBlockKind::Fenced(lang) => lang.to_string(),
          CodeBlockKind::Indented => String::new(),
        };
      }
      Event::End(TagEnd::CodeBlock) => {
        flush_code_block(&mut blocks, &mut current_text, &code_language);
        in_code_block = false;
        code_language.clear();
        current_flavour = BlockFlavour::Paragraph;
      }
      Event::Text(text) => {
        current_text.push_str(&text);
      }
      Event::Code(code) => {
        // Inline code - wrap in backticks
        current_text.push('`');
        current_text.push_str(&code);
        current_text.push('`');
      }
      Event::SoftBreak | Event::HardBreak => {
        if in_code_block {
          current_text.push('\n');
        } else {
          current_text.push(' ');
        }
      }
      Event::Rule => {
        flush_block(
          &mut blocks,
          &mut current_text,
          current_flavour,
          current_type.take(),
          current_checked.take(),
          None,
        );
        blocks.push(ParsedBlock {
          flavour: BlockFlavour::Divider,
          block_type: None,
          content: String::new(),
          checked: None,
          language: None,
        });
      }
      Event::Start(Tag::Strong) => current_text.push_str("**"),
      Event::End(TagEnd::Strong) => current_text.push_str("**"),
      Event::Start(Tag::Emphasis) => current_text.push('_'),
      Event::End(TagEnd::Emphasis) => current_text.push('_'),
      Event::Start(Tag::Strikethrough) => current_text.push_str("~~"),
      Event::End(TagEnd::Strikethrough) => current_text.push_str("~~"),
      Event::Start(Tag::Link { dest_url, .. }) => {
        current_text.push('[');
        pending_link_url = Some(dest_url.to_string());
      }
      Event::End(TagEnd::Link) => {
        if let Some(url) = pending_link_url.take() {
          current_text.push_str(&format!("]({})", url));
        }
      }
      _ => {}
    }
  }

  // Flush any remaining content
  flush_block(
    &mut blocks,
    &mut current_text,
    current_flavour,
    current_type,
    current_checked,
    None,
  );

  blocks
}

fn flush_block(
  blocks: &mut Vec<ParsedBlock>,
  text: &mut String,
  flavour: BlockFlavour,
  block_type: Option<BlockType>,
  checked: Option<bool>,
  language: Option<String>,
) {
  let trimmed = text.trim();
  if !trimmed.is_empty() || flavour == BlockFlavour::Divider {
    blocks.push(ParsedBlock {
      flavour,
      block_type,
      content: trimmed.to_string(),
      checked,
      language,
    });
  }
  text.clear();
}

fn flush_code_block(blocks: &mut Vec<ParsedBlock>, text: &mut String, language: &str) {
  // Preserve leading whitespace (indentation) in code blocks as it may be
  // semantically significant (e.g., Python, YAML). Only strip leading/trailing
  // newlines, which are typically artifacts of code fence parsing.
  let content = text.trim_matches('\n');
  if !content.is_empty() {
    blocks.push(ParsedBlock {
      flavour: BlockFlavour::Code,
      block_type: None,
      content: content.to_string(),
      checked: None,
      language: if language.is_empty() {
        None
      } else {
        Some(language.to_string())
      },
    });
  }
  text.clear();
}

/// Extracts the title from the first H1 heading in markdown content.
///
/// Returns "Untitled" if no H1 heading is found.
pub(crate) fn extract_title(markdown: &str) -> String {
  let parser = Parser::new(markdown);
  let mut in_heading = false;
  let mut title = String::new();

  for event in parser {
    match event {
      Event::Start(Tag::Heading {
        level: HeadingLevel::H1,
        ..
      }) => {
        in_heading = true;
      }
      Event::Text(text) if in_heading => {
        title.push_str(&text);
      }
      Event::Code(code) if in_heading => {
        title.push_str(&code);
      }
      Event::End(TagEnd::Heading(_)) if in_heading => {
        break;
      }
      _ => {}
    }
  }

  if title.is_empty() {
    "Untitled".to_string()
  } else {
    title.trim().to_string()
  }
}

#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn test_extract_title_simple() {
    assert_eq!(extract_title("# Hello World\n\nContent"), "Hello World");
  }

  #[test]
  fn test_extract_title_with_code() {
    assert_eq!(extract_title("# Hello `code` World"), "Hello code World");
  }

  #[test]
  fn test_extract_title_empty() {
    assert_eq!(extract_title("No heading here"), "Untitled");
  }

  #[test]
  fn test_extract_title_h2_not_used() {
    assert_eq!(extract_title("## H2 heading\n\nContent"), "Untitled");
  }

  #[test]
  fn test_parse_markdown_blocks_simple() {
    let blocks = parse_markdown_blocks("# Title\n\nParagraph text.", true);
    assert_eq!(blocks.len(), 1);
    assert_eq!(blocks[0].flavour, BlockFlavour::Paragraph);
    assert_eq!(blocks[0].content, "Paragraph text.");
  }

  #[test]
  fn test_parse_markdown_blocks_with_headings() {
    let blocks = parse_markdown_blocks("# Title\n\n## Section\n\nText.", true);
    assert_eq!(blocks.len(), 2);
    assert_eq!(blocks[0].block_type, Some(BlockType::H2));
    assert_eq!(blocks[0].content, "Section");
    assert_eq!(blocks[1].content, "Text.");
  }

  #[test]
  fn test_parse_markdown_blocks_lists() {
    let blocks = parse_markdown_blocks("# Title\n\n- Item 1\n- Item 2", true);
    assert_eq!(blocks.len(), 2);
    assert_eq!(blocks[0].flavour, BlockFlavour::List);
    assert_eq!(blocks[0].block_type, Some(BlockType::Bulleted));
    assert_eq!(blocks[0].content, "Item 1");
  }

  #[test]
  fn test_parse_markdown_blocks_task_list() {
    let blocks = parse_markdown_blocks("# Title\n\n- [ ] Unchecked\n- [x] Checked", true);
    assert_eq!(blocks.len(), 2);
    assert_eq!(blocks[0].block_type, Some(BlockType::Todo));
    assert_eq!(blocks[0].checked, Some(false));
    assert_eq!(blocks[1].block_type, Some(BlockType::Todo));
    assert_eq!(blocks[1].checked, Some(true));
  }

  #[test]
  fn test_parse_markdown_blocks_code() {
    let blocks = parse_markdown_blocks("# Title\n\n```rust\nfn main() {}\n```", true);
    assert_eq!(blocks.len(), 1);
    assert_eq!(blocks[0].flavour, BlockFlavour::Code);
    assert_eq!(blocks[0].language, Some("rust".to_string()));
  }

  #[test]
  fn test_parse_markdown_blocks_divider() {
    let blocks = parse_markdown_blocks("# Title\n\nBefore\n\n---\n\nAfter", true);
    assert_eq!(blocks.len(), 3);
    assert_eq!(blocks[1].flavour, BlockFlavour::Divider);
  }

  #[test]
  fn test_parse_markdown_blocks_code_preserves_indentation() {
    let blocks = parse_markdown_blocks("# Title\n\n```python\n    def indented():\n        pass\n```", true);
    assert_eq!(blocks.len(), 1);
    assert!(blocks[0].content.starts_with("    def"));
  }
}
@@ -1,9 +1,19 @@
mod affine;
mod blocksuite;
mod delta_markdown;
#[cfg(feature = "ydoc-loader")]
mod markdown_to_ydoc;
#[cfg(feature = "ydoc-loader")]
mod markdown_utils;
#[cfg(feature = "ydoc-loader")]
mod update_ydoc;
mod value;

pub use affine::{
-  BlockInfo, CrawlResult, MarkdownResult, PageDocContent, ParseError, WorkspaceDocContent, get_doc_ids_from_binary,
-  parse_doc_from_binary, parse_doc_to_markdown, parse_page_doc, parse_workspace_doc,
+  BlockInfo, CrawlResult, MarkdownResult, PageDocContent, ParseError, WorkspaceDocContent, add_doc_to_root_doc,
+  get_doc_ids_from_binary, parse_doc_from_binary, parse_doc_to_markdown, parse_page_doc, parse_workspace_doc,
};
#[cfg(feature = "ydoc-loader")]
pub use markdown_to_ydoc::markdown_to_ydoc;
#[cfg(feature = "ydoc-loader")]
pub use update_ydoc::update_ydoc;
packages/common/native/src/doc_parser/update_ydoc.rs (new file, 1102 lines)
File diff suppressed because it is too large.