feat(server): impl native reader for server (#14100)

This commit is contained in:
DarkSky
2025-12-14 00:28:43 +08:00
committed by GitHub
parent a0eeed0cdb
commit 844b9d9592
28 changed files with 1333 additions and 1153 deletions

View File

@@ -6,7 +6,6 @@ yarn install
# Build Server Dependencies
yarn affine @affine/server-native build
yarn affine @affine/reader build
# Create database
yarn affine @affine/server prisma migrate reset -f

View File

@@ -4,11 +4,6 @@ description: 'Prepare Server Test Environment'
runs:
using: 'composite'
steps:
- name: Bundle @affine/reader
shell: bash
run: |
yarn affine @affine/reader build
- name: Initialize database
shell: bash
run: |

View File

@@ -187,8 +187,6 @@ jobs:
path: ./packages/backend/native
- name: List server-native files
run: ls -alh ./packages/backend/native
- name: Build @affine/reader
run: yarn workspace @affine/reader build
- name: Build Server
run: yarn workspace @affine/server build
- name: Upload server dist

View File

@@ -152,11 +152,6 @@ jobs:
name: server-native.node
path: ./packages/backend/native
- name: Bundle @affine/reader
shell: bash
run: |
yarn workspace @affine/reader build
- name: Run Check
run: |
yarn affine init

View File

@@ -438,7 +438,7 @@ describe('snapshot to pdf', () => {
expect(definition.styles?.code).toBeDefined();
expect(definition.defaultStyle).toBeDefined();
expect(definition.defaultStyle?.font).toBe('Roboto');
expect(definition.defaultStyle?.font).toBe('SarasaGothicCL');
});
describe('inline text styling', () => {
@@ -650,7 +650,7 @@ describe('snapshot to pdf', () => {
const codeText = textContent.text.find(
(t: any) =>
typeof t === 'object' &&
t.font === 'Roboto' &&
t.font === 'Inter' &&
t.background === '#f5f5f5'
);
expect(codeText).toBeDefined();
@@ -837,11 +837,7 @@ describe('snapshot to pdf', () => {
if (Array.isArray(textContent.text)) {
const refText = textContent.text.find(
(t: any) =>
typeof t === 'object' &&
t.text === 'Page not found' &&
Array.isArray(t.decoration) &&
t.decoration.includes('lineThrough')
(t: any) => typeof t === 'object' && t.text === 'Page not found'
);
expect(refText).toBeDefined();
}

View File

@@ -35,12 +35,6 @@ Server also requires native packages to be built, you can build them by running
yarn affine @affine/server-native build
```
## Build @affine/reader package
```sh
yarn affine @affine/reader build
```
## Prepare dev environment
```sh

View File

@@ -8,18 +8,22 @@ version = "1.0.0"
crate-type = ["cdylib"]
[dependencies]
affine_common = { workspace = true, features = ["doc-loader", "hashcash"] }
chrono = { workspace = true }
file-format = { workspace = true }
infer = { workspace = true }
mp4parse = { workspace = true }
napi = { workspace = true, features = ["async"] }
napi-derive = { workspace = true }
rand = { workspace = true }
sha3 = { workspace = true }
tiktoken-rs = { workspace = true }
v_htmlescape = { workspace = true }
y-octo = { workspace = true, features = ["large_refs"] }
affine_common = { workspace = true, features = [
"doc-loader",
"hashcash",
"ydoc-loader",
] }
chrono = { workspace = true }
file-format = { workspace = true }
infer = { workspace = true }
mp4parse = { workspace = true }
napi = { workspace = true, features = ["async"] }
napi-derive = { workspace = true }
rand = { workspace = true }
sha3 = { workspace = true }
tiktoken-rs = { workspace = true }
v_htmlescape = { workspace = true }
y-octo = { workspace = true, features = ["large_refs"] }
[target.'cfg(not(target_os = "linux"))'.dependencies]
mimalloc = { workspace = true }

View File

@@ -27,6 +27,29 @@ export declare function mergeUpdatesInApplyWay(updates: Array<Buffer>): Buffer
export declare function mintChallengeResponse(resource: string, bits?: number | undefined | null): Promise<string>
export interface NativeBlockInfo {
blockId: string
flavour: string
content?: Array<string>
blob?: Array<string>
refDocId?: Array<string>
refInfo?: Array<string>
parentFlavour?: string
parentBlockId?: string
additional?: string
}
export interface NativeCrawlResult {
blocks: Array<NativeBlockInfo>
title: string
summary: string
}
export interface NativeMarkdownResult {
title: string
markdown: string
}
export interface ParsedDoc {
name: string
chunks: Array<Chunk>
@@ -34,4 +57,10 @@ export interface ParsedDoc {
export declare function parseDoc(filePath: string, doc: Buffer): Promise<ParsedDoc>
export declare function parseDocFromBinary(docBin: Buffer, docId: string): NativeCrawlResult
export declare function parseDocToMarkdown(docBin: Buffer, docId: string, aiEditable?: boolean | undefined | null): NativeMarkdownResult
export declare function readAllDocIdsFromRootDoc(docBin: Buffer, includeTrash?: boolean | undefined | null): Array<string>
export declare function verifyChallengeResponse(response: string, bits: number, resource: string): Promise<boolean>

View File

@@ -0,0 +1,93 @@
use affine_common::doc_parser::{self, BlockInfo, CrawlResult, MarkdownResult};
use napi::bindgen_prelude::*;
use napi_derive::napi;
/// Title and Markdown body of a parsed doc, exposed to JavaScript as a plain
/// object through `#[napi(object)]`.
///
/// Built from [`MarkdownResult`] via its `From` impl.
#[napi(object)]
pub struct NativeMarkdownResult {
/// Doc title, copied from `MarkdownResult::title`.
pub title: String,
/// Markdown text, copied from `MarkdownResult::markdown`.
pub markdown: String,
}
impl From<MarkdownResult> for NativeMarkdownResult {
fn from(result: MarkdownResult) -> Self {
Self {
title: result.title,
markdown: result.markdown,
}
}
}
/// Per-block information of a parsed doc, exposed to JavaScript as a plain
/// object through `#[napi(object)]`.
///
/// Every field is copied one-to-one from the same-named field of
/// [`BlockInfo`]; optional fields stay `None` when the parser did not set them.
#[napi(object)]
pub struct NativeBlockInfo {
/// Identifier of the block.
pub block_id: String,
/// Block flavour string (e.g. the parser compares against values such as
/// `"affine:database"`).
pub flavour: String,
/// Text content pieces of the block, if any.
pub content: Option<Vec<String>>,
/// Blob identifiers referenced by the block, if any.
/// NOTE(review): presumably blob ids — confirm against `BlockInfo`.
pub blob: Option<Vec<String>>,
/// Ids of docs referenced by the block, if any.
pub ref_doc_id: Option<Vec<String>>,
/// Reference info entries, if any.
/// NOTE(review): exact string format is defined by the parser — confirm there.
pub ref_info: Option<Vec<String>>,
/// Flavour of the parent block, if known.
pub parent_flavour: Option<String>,
/// Id of the parent block, if known.
pub parent_block_id: Option<String>,
/// Extra data as a string.
/// NOTE(review): looks like JSON text (the server side tries `JSON.parse`
/// on it) — confirm against the parser.
pub additional: Option<String>,
}
impl From<BlockInfo> for NativeBlockInfo {
fn from(info: BlockInfo) -> Self {
Self {
block_id: info.block_id,
flavour: info.flavour,
content: info.content,
blob: info.blob,
ref_doc_id: info.ref_doc_id,
ref_info: info.ref_info,
parent_flavour: info.parent_flavour,
parent_block_id: info.parent_block_id,
additional: info.additional,
}
}
}
/// Result of crawling a doc: its blocks plus title and summary, exposed to
/// JavaScript as a plain object through `#[napi(object)]`.
///
/// Built from [`CrawlResult`] via its `From` impl.
#[napi(object)]
pub struct NativeCrawlResult {
/// Blocks of the doc, each converted from the parser's `BlockInfo`.
pub blocks: Vec<NativeBlockInfo>,
/// Doc title, copied from `CrawlResult::title`.
pub title: String,
/// Doc summary, copied from `CrawlResult::summary`.
pub summary: String,
}
impl From<CrawlResult> for NativeCrawlResult {
fn from(result: CrawlResult) -> Self {
Self {
blocks: result.blocks.into_iter().map(Into::into).collect(),
title: result.title,
summary: result.summary,
}
}
}
#[napi]
pub fn parse_doc_from_binary(doc_bin: Buffer, doc_id: String) -> Result<NativeCrawlResult> {
let result = doc_parser::parse_doc_from_binary(doc_bin.into(), doc_id)
.map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
Ok(result.into())
}
#[napi]
pub fn parse_doc_to_markdown(
doc_bin: Buffer,
doc_id: String,
ai_editable: Option<bool>,
) -> Result<NativeMarkdownResult> {
let result =
doc_parser::parse_doc_to_markdown(doc_bin.into(), doc_id, ai_editable.unwrap_or(false))
.map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
Ok(result.into())
}
#[napi]
pub fn read_all_doc_ids_from_root_doc(
doc_bin: Buffer,
include_trash: Option<bool>,
) -> Result<Vec<String>> {
let result = doc_parser::get_doc_ids_from_binary(doc_bin.into(), include_trash.unwrap_or(false))
.map_err(|e| Error::new(Status::GenericFailure, e.to_string()))?;
Ok(result)
}

View File

@@ -2,6 +2,7 @@
mod utils;
pub mod doc;
pub mod doc_loader;
pub mod file_type;
pub mod hashcash;

View File

@@ -26,7 +26,6 @@
"postinstall": "prisma generate"
},
"dependencies": {
"@affine/reader": "workspace:*",
"@affine/server-native": "workspace:*",
"@ai-sdk/anthropic": "^2.0.54",
"@ai-sdk/google": "^2.0.45",

View File

@@ -9,64 +9,43 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
{
markdown: `AFFiNE is an open source all in one workspace, an operating system for all the building blocks of your team wiki, knowledge management and digital assets and a better alternative to Notion and Miro.␊
markdown: `AFFiNE is an open source all in one workspace, an operating system for all the building blocks of your team wiki, knowledge management and digital assets and a better alternative to Notion and Miro.
# You own your data, with no compromises␊
## Local-first & Real-time collaborative␊
We love the idea proposed by Ink & Switch in the famous article about you owning your data, despite the cloud. Furthermore, AFFiNE is the first all-in-one workspace that keeps your data ownership with no compromises on real-time collaboration and editing experience.␊
AFFiNE is a local-first application upon CRDTs with real-time collaboration support. Your data is always stored locally while multiple nodes remain synced in real-time.␊
### Blocks that assemble your next docs, tasks kanban or whiteboard␊
There is a large overlap of their atomic "building blocks" between these apps. They are neither open source nor have a plugin system like VS Code for contributors to customize. We want to have something that contains all the features we love and goes one step further.␊
There is a large overlap of their atomic "building blocks" between these apps. They are neither open source nor have a plugin system like VS Code for contributors to customize. We want to have something that contains all the features we love and goes one step further.
We are building AFFiNE to be a fundamental open source platform that contains all the building blocks for docs, task management and visual collaboration, hoping you can shape your next workflow with us that can make your life better and also connect others, too.␊
If you want to learn more about the product design of AFFiNE, here goes the concepts:␊
To Shape, not to adapt. AFFiNE is built for individuals & teams who care about their data, who refuse vendor lock-in, and who want to have control over their essential tools.␊
## A true canvas for blocks in any form␊
[Many editor apps](http://notion.so) claimed to be a canvas for productivity. Since _the Mother of All Demos,_ Douglas Engelbart, a creative and programable digital workspace has been a pursuit and an ultimate mission for generations of tool makers.␊
Many editor apps claimed to be a canvas for productivity. Since the Mother of All Demos, Douglas Engelbart, a creative and programable digital workspace has been a pursuit and an ultimate mission for generations of tool makers.
"We shape our tools and thereafter our tools shape us”. A lot of pioneers have inspired us a long the way, e.g.:␊
* Quip & Notion with their great concept of "everything is a block"
* Trello with their Kanban
* Airtable & Miro with their no-code programable datasheets
* Miro & Whimiscal with their edgeless visual whiteboard
* Remnote & Capacities with their object-based tag system
For more details, please refer to our [RoadMap](https://docs.affine.pro/docs/core-concepts/roadmap)␊
- Quip & Notion with their great concept of "everything is a block"
- Trello with their Kanban
- Airtable & Miro with their no-code programable datasheets
- Miro & Whimiscal with their edgeless visual whiteboard
- Remnote & Capacities with their object-based tag system
For more details, please refer to our RoadMap
## Self Host␊
Self host AFFiNE␊
### Learning From␊
||Title|Tag|␊
|---|---|---|␊
|Affine Development|Affine Development|<span data-affine-option data-value="AxSe-53xjX" data-option-color="var(--affine-tag-pink)">AFFiNE</span>|␊
|For developers or installations guides, please go to AFFiNE Doc|For developers or installations guides, please go to AFFiNE Doc|<span data-affine-option data-value="0jh9gNw4Yl" data-option-color="var(--affine-tag-orange)">Developers</span>|␊
|Quip & Notion with their great concept of "everything is a block"|Quip & Notion with their great concept of "everything is a block"|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Trello with their Kanban|Trello with their Kanban|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Airtable & Miro with their no-code programable datasheets|Airtable & Miro with their no-code programable datasheets|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Miro & Whimiscal with their edgeless visual whiteboard|Miro & Whimiscal with their edgeless visual whiteboard|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Affine Development|Affine Development||␊
|For developers or installations guides, please go to AFFiNE Doc|For developers or installations guides, please go to AFFiNE Doc||␊
|Quip & Notion with their great concept of "everything is a block"|Quip & Notion with their great concept of "everything is a block"||␊
|Trello with their Kanban|Trello with their Kanban||␊
|Airtable & Miro with their no-code programable datasheets|Airtable & Miro with their no-code programable datasheets||␊
|Miro & Whimiscal with their edgeless visual whiteboard|Miro & Whimiscal with their edgeless visual whiteboard||␊
|Remnote & Capacities with their object-based tag system|Remnote & Capacities with their object-based tag system||␊
## Affine Development␊
For developer or installation guides, please go to [AFFiNE Development](https://docs.affine.pro/docs/development/quick-start)␊
For developer or installation guides, please go to AFFiNE Development
`,
title: 'Write, Draw, Plan all at Once.',

View File

@@ -9,64 +9,43 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
{
markdown: `AFFiNE is an open source all in one workspace, an operating system for all the building blocks of your team wiki, knowledge management and digital assets and a better alternative to Notion and Miro.␊
markdown: `AFFiNE is an open source all in one workspace, an operating system for all the building blocks of your team wiki, knowledge management and digital assets and a better alternative to Notion and Miro.
# You own your data, with no compromises␊
## Local-first & Real-time collaborative␊
We love the idea proposed by Ink & Switch in the famous article about you owning your data, despite the cloud. Furthermore, AFFiNE is the first all-in-one workspace that keeps your data ownership with no compromises on real-time collaboration and editing experience.␊
AFFiNE is a local-first application upon CRDTs with real-time collaboration support. Your data is always stored locally while multiple nodes remain synced in real-time.␊
### Blocks that assemble your next docs, tasks kanban or whiteboard␊
There is a large overlap of their atomic "building blocks" between these apps. They are neither open source nor have a plugin system like VS Code for contributors to customize. We want to have something that contains all the features we love and goes one step further.␊
There is a large overlap of their atomic "building blocks" between these apps. They are neither open source nor have a plugin system like VS Code for contributors to customize. We want to have something that contains all the features we love and goes one step further.
We are building AFFiNE to be a fundamental open source platform that contains all the building blocks for docs, task management and visual collaboration, hoping you can shape your next workflow with us that can make your life better and also connect others, too.␊
If you want to learn more about the product design of AFFiNE, here goes the concepts:␊
To Shape, not to adapt. AFFiNE is built for individuals & teams who care about their data, who refuse vendor lock-in, and who want to have control over their essential tools.␊
## A true canvas for blocks in any form␊
[Many editor apps](http://notion.so) claimed to be a canvas for productivity. Since _the Mother of All Demos,_ Douglas Engelbart, a creative and programable digital workspace has been a pursuit and an ultimate mission for generations of tool makers.␊
Many editor apps claimed to be a canvas for productivity. Since the Mother of All Demos, Douglas Engelbart, a creative and programable digital workspace has been a pursuit and an ultimate mission for generations of tool makers.
"We shape our tools and thereafter our tools shape us”. A lot of pioneers have inspired us a long the way, e.g.:␊
* Quip & Notion with their great concept of "everything is a block"
* Trello with their Kanban
* Airtable & Miro with their no-code programable datasheets
* Miro & Whimiscal with their edgeless visual whiteboard
* Remnote & Capacities with their object-based tag system
For more details, please refer to our [RoadMap](https://docs.affine.pro/docs/core-concepts/roadmap)␊
- Quip & Notion with their great concept of "everything is a block"
- Trello with their Kanban
- Airtable & Miro with their no-code programable datasheets
- Miro & Whimiscal with their edgeless visual whiteboard
- Remnote & Capacities with their object-based tag system
For more details, please refer to our RoadMap
## Self Host␊
Self host AFFiNE␊
### Learning From␊
||Title|Tag|␊
|---|---|---|␊
|Affine Development|Affine Development|<span data-affine-option data-value="AxSe-53xjX" data-option-color="var(--affine-tag-pink)">AFFiNE</span>|␊
|For developers or installations guides, please go to AFFiNE Doc|For developers or installations guides, please go to AFFiNE Doc|<span data-affine-option data-value="0jh9gNw4Yl" data-option-color="var(--affine-tag-orange)">Developers</span>|␊
|Quip & Notion with their great concept of "everything is a block"|Quip & Notion with their great concept of "everything is a block"|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Trello with their Kanban|Trello with their Kanban|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Airtable & Miro with their no-code programable datasheets|Airtable & Miro with their no-code programable datasheets|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Miro & Whimiscal with their edgeless visual whiteboard|Miro & Whimiscal with their edgeless visual whiteboard|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Affine Development|Affine Development||␊
|For developers or installations guides, please go to AFFiNE Doc|For developers or installations guides, please go to AFFiNE Doc||␊
|Quip & Notion with their great concept of "everything is a block"|Quip & Notion with their great concept of "everything is a block"||␊
|Trello with their Kanban|Trello with their Kanban||␊
|Airtable & Miro with their no-code programable datasheets|Airtable & Miro with their no-code programable datasheets||␊
|Miro & Whimiscal with their edgeless visual whiteboard|Miro & Whimiscal with their edgeless visual whiteboard||␊
|Remnote & Capacities with their object-based tag system|Remnote & Capacities with their object-based tag system||␊
## Affine Development␊
For developer or installation guides, please go to [AFFiNE Development](https://docs.affine.pro/docs/development/quick-start)␊
For developer or installation guides, please go to AFFiNE Development
`,
title: 'Write, Draw, Plan all at Once.',

View File

@@ -9,64 +9,43 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
{
markdown: `AFFiNE is an open source all in one workspace, an operating system for all the building blocks of your team wiki, knowledge management and digital assets and a better alternative to Notion and Miro.␊
markdown: `AFFiNE is an open source all in one workspace, an operating system for all the building blocks of your team wiki, knowledge management and digital assets and a better alternative to Notion and Miro.
# You own your data, with no compromises␊
## Local-first & Real-time collaborative␊
We love the idea proposed by Ink & Switch in the famous article about you owning your data, despite the cloud. Furthermore, AFFiNE is the first all-in-one workspace that keeps your data ownership with no compromises on real-time collaboration and editing experience.␊
AFFiNE is a local-first application upon CRDTs with real-time collaboration support. Your data is always stored locally while multiple nodes remain synced in real-time.␊
### Blocks that assemble your next docs, tasks kanban or whiteboard␊
There is a large overlap of their atomic "building blocks" between these apps. They are neither open source nor have a plugin system like VS Code for contributors to customize. We want to have something that contains all the features we love and goes one step further.␊
There is a large overlap of their atomic "building blocks" between these apps. They are neither open source nor have a plugin system like VS Code for contributors to customize. We want to have something that contains all the features we love and goes one step further.
We are building AFFiNE to be a fundamental open source platform that contains all the building blocks for docs, task management and visual collaboration, hoping you can shape your next workflow with us that can make your life better and also connect others, too.␊
If you want to learn more about the product design of AFFiNE, here goes the concepts:␊
To Shape, not to adapt. AFFiNE is built for individuals & teams who care about their data, who refuse vendor lock-in, and who want to have control over their essential tools.␊
## A true canvas for blocks in any form␊
[Many editor apps](http://notion.so) claimed to be a canvas for productivity. Since _the Mother of All Demos,_ Douglas Engelbart, a creative and programable digital workspace has been a pursuit and an ultimate mission for generations of tool makers.␊
Many editor apps claimed to be a canvas for productivity. Since the Mother of All Demos, Douglas Engelbart, a creative and programable digital workspace has been a pursuit and an ultimate mission for generations of tool makers.
"We shape our tools and thereafter our tools shape us”. A lot of pioneers have inspired us a long the way, e.g.:␊
* Quip & Notion with their great concept of "everything is a block"
* Trello with their Kanban
* Airtable & Miro with their no-code programable datasheets
* Miro & Whimiscal with their edgeless visual whiteboard
* Remnote & Capacities with their object-based tag system
For more details, please refer to our [RoadMap](https://docs.affine.pro/docs/core-concepts/roadmap)␊
- Quip & Notion with their great concept of "everything is a block"
- Trello with their Kanban
- Airtable & Miro with their no-code programable datasheets
- Miro & Whimiscal with their edgeless visual whiteboard
- Remnote & Capacities with their object-based tag system
For more details, please refer to our RoadMap
## Self Host␊
Self host AFFiNE␊
### Learning From␊
||Title|Tag|␊
|---|---|---|␊
|Affine Development|Affine Development|<span data-affine-option data-value="AxSe-53xjX" data-option-color="var(--affine-tag-pink)">AFFiNE</span>|␊
|For developers or installations guides, please go to AFFiNE Doc|For developers or installations guides, please go to AFFiNE Doc|<span data-affine-option data-value="0jh9gNw4Yl" data-option-color="var(--affine-tag-orange)">Developers</span>|␊
|Quip & Notion with their great concept of "everything is a block"|Quip & Notion with their great concept of "everything is a block"|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Trello with their Kanban|Trello with their Kanban|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Airtable & Miro with their no-code programable datasheets|Airtable & Miro with their no-code programable datasheets|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Miro & Whimiscal with their edgeless visual whiteboard|Miro & Whimiscal with their edgeless visual whiteboard|<span data-affine-option data-value="HgHsKOUINZ" data-option-color="var(--affine-tag-blue)">Reference</span>|␊
|Affine Development|Affine Development||␊
|For developers or installations guides, please go to AFFiNE Doc|For developers or installations guides, please go to AFFiNE Doc||␊
|Quip & Notion with their great concept of "everything is a block"|Quip & Notion with their great concept of "everything is a block"||␊
|Trello with their Kanban|Trello with their Kanban||␊
|Airtable & Miro with their no-code programable datasheets|Airtable & Miro with their no-code programable datasheets||␊
|Miro & Whimiscal with their edgeless visual whiteboard|Miro & Whimiscal with their edgeless visual whiteboard||␊
|Remnote & Capacities with their object-based tag system|Remnote & Capacities with their object-based tag system||␊
## Affine Development␊
For developer or installation guides, please go to [AFFiNE Development](https://docs.affine.pro/docs/development/quick-start)␊
For developer or installation guides, please go to AFFiNE Development
`,
title: 'Write, Draw, Plan all at Once.',

View File

@@ -192,12 +192,7 @@ export class DatabaseDocReader extends DocReader {
if (!doc) {
return null;
}
return parseDocToMarkdownFromDocSnapshot(
workspaceId,
docId,
doc.bin,
aiEditable
);
return parseDocToMarkdownFromDocSnapshot(docId, doc.bin, aiEditable);
}
async getDocDiff(

View File

@@ -44,12 +44,7 @@ test('can read all blocks from doc snapshot', async t => {
const doc = await models.doc.get(workspace.id, docSnapshot.id);
t.truthy(doc);
const result = await readAllBlocksFromDocSnapshot(
workspace.id,
'doc-0',
docSnapshot.blob,
rootDoc!.blob
);
const result = await readAllBlocksFromDocSnapshot('doc-0', docSnapshot.blob);
t.snapshot({
...result,
@@ -64,11 +59,7 @@ test('can read blob filename from doc snapshot', async t => {
snapshotFile: 'test-doc-with-blob.snapshot.bin',
});
const result = await readAllBlocksFromDocSnapshot(
workspace.id,
'doc-0',
docSnapshot.blob
);
const result = await readAllBlocksFromDocSnapshot('doc-0', docSnapshot.blob);
// NOTE: avoid snapshot result directly, because it will cause hanging
t.snapshot(JSON.parse(JSON.stringify(result)));
@@ -78,11 +69,7 @@ test('can read all blocks from doc snapshot without workspace snapshot', async t
const doc = await models.doc.get(workspace.id, docSnapshot.id);
t.truthy(doc);
const result = await readAllBlocksFromDocSnapshot(
workspace.id,
'doc-0',
docSnapshot.blob
);
const result = await readAllBlocksFromDocSnapshot('doc-0', docSnapshot.blob);
t.snapshot({
...result,
@@ -92,7 +79,6 @@ test('can read all blocks from doc snapshot without workspace snapshot', async t
test('can parse doc to markdown from doc snapshot', async t => {
const result = parseDocToMarkdownFromDocSnapshot(
workspace.id,
docSnapshot.id,
docSnapshot.blob
);
@@ -102,7 +88,6 @@ test('can parse doc to markdown from doc snapshot', async t => {
test('can parse doc to markdown from doc snapshot with ai editable', async t => {
const result = parseDocToMarkdownFromDocSnapshot(
workspace.id,
docSnapshot.id,
docSnapshot.blob,
true

View File

@@ -1,18 +1,10 @@
// TODO(@forehalo):
// Because of the `@affine/server` package can't import directly from workspace packages,
// this is a temporary solution to get the block suite data(title, description) from given yjs binary or yjs doc.
// The logic is mainly copied from
// - packages/frontend/core/src/modules/docs-search/worker/in-worker.ts
// - packages/frontend/core/src/components/page-list/use-block-suite-page-preview.ts
// and it's better to be provided by blocksuite
import { Array as YArray, Doc as YDoc, Map as YMap } from 'yjs';
// eslint-disable-next-line @typescript-eslint/no-restricted-imports -- import from bundle
import {
parsePageDoc as parseDocToMarkdown,
readAllBlocksFromDoc,
parseYDocFromBinary,
parseYDocToMarkdown,
readAllDocIdsFromRootDoc,
} from '@affine/reader/dist';
import { applyUpdate, Array as YArray, Doc as YDoc, Map as YMap } from 'yjs';
} from '../../native';
export interface PageDocContent {
title: string;
@@ -165,64 +157,49 @@ export function parsePageDoc(
}
export function readAllDocIdsFromWorkspaceSnapshot(snapshot: Uint8Array) {
const rootDoc = new YDoc();
applyUpdate(rootDoc, snapshot);
return readAllDocIdsFromRootDoc(rootDoc, {
includeTrash: false,
});
return readAllDocIdsFromRootDoc(Buffer.from(snapshot), false);
}
/**
 * Parses `str` as JSON without throwing.
 *
 * @returns the parsed value cast to `T`, or `undefined` when `str` is not
 * valid JSON. The cast is unchecked: the parsed shape is not validated.
 */
function safeParseJson<T>(str: string): T | undefined {
  let parsed: T | undefined;
  try {
    parsed = JSON.parse(str) as T;
  } catch {
    // Malformed input is expected here; report it as "no value".
    parsed = undefined;
  }
  return parsed;
}
export async function readAllBlocksFromDocSnapshot(
workspaceId: string,
docId: string,
docSnapshot: Uint8Array,
workspaceSnapshot?: Uint8Array,
maxSummaryLength?: number
docSnapshot: Uint8Array
) {
let rootYDoc: YDoc | undefined;
if (workspaceSnapshot) {
rootYDoc = new YDoc({
guid: workspaceId,
});
applyUpdate(rootYDoc, workspaceSnapshot);
}
const ydoc = new YDoc({
guid: docId,
});
applyUpdate(ydoc, docSnapshot);
return await readAllBlocksFromDoc({
ydoc,
rootYDoc,
spaceId: workspaceId,
maxSummaryLength,
});
const result = parseYDocFromBinary(Buffer.from(docSnapshot), docId);
return {
...result,
blocks: result.blocks.map(block => ({
...block,
docId,
ref: block.refInfo,
additional: block.additional
? safeParseJson(block.additional)
: undefined,
})),
};
}
export function parseDocToMarkdownFromDocSnapshot(
workspaceId: string,
docId: string,
docSnapshot: Uint8Array,
aiEditable = false
) {
const ydoc = new YDoc({
guid: docId,
});
applyUpdate(ydoc, docSnapshot);
const parsed = parseDocToMarkdown({
workspaceId,
doc: ydoc,
buildBlobUrl: (blobId: string) => {
return `/${workspaceId}/blobs/${blobId}`;
},
buildDocUrl: (docId: string) => {
return `/workspace/${workspaceId}/${docId}`;
},
aiEditable,
});
const parsed = parseYDocToMarkdown(
Buffer.from(docSnapshot),
docId,
aiEditable
);
return {
title: parsed.title,
markdown: parsed.md,
markdown: parsed.markdown,
};
}

View File

@@ -40,6 +40,10 @@ export function getTokenEncoder(model?: string | null): Tokenizer | null {
export const getMime = serverNativeModule.getMime;
export const parseDoc = serverNativeModule.parseDoc;
export const htmlSanitize = serverNativeModule.htmlSanitize;
export const parseYDocFromBinary = serverNativeModule.parseDocFromBinary;
export const parseYDocToMarkdown = serverNativeModule.parseDocToMarkdown;
export const readAllDocIdsFromRootDoc =
serverNativeModule.readAllDocIdsFromRootDoc;
export const AFFINE_PRO_PUBLIC_KEY = serverNativeModule.AFFINE_PRO_PUBLIC_KEY;
export const AFFINE_PRO_LICENSE_AES_KEY =
serverNativeModule.AFFINE_PRO_LICENSE_AES_KEY;

View File

@@ -227,15 +227,7 @@ export class IndexerService {
this.logger.debug(`doc ${workspaceId}/${docId} is empty, skip indexing`);
return;
}
const MAX_WORKSPACE_SNAPSHOT_SIZE = 1024 * 1024 * 10; // 10MB
const result = await readAllBlocksFromDocSnapshot(
workspaceId,
docId,
docSnapshot.blob,
workspaceSnapshot.blob.length < MAX_WORKSPACE_SNAPSHOT_SIZE
? workspaceSnapshot.blob
: undefined
);
const result = await readAllBlocksFromDocSnapshot(docId, docSnapshot.blob);
if (!result) {
this.logger.warn(
`parse doc ${workspaceId}/${docId} failed, workspaceSnapshot size: ${workspaceSnapshot.blob.length}, docSnapshot size: ${docSnapshot.blob.length}`
@@ -277,7 +269,7 @@ export class IndexerService {
additional: block.additional
? JSON.stringify(block.additional)
: undefined,
markdownPreview: block.markdownPreview,
markdownPreview: undefined,
createdByUserId: docSnapshot.createdBy ?? '',
updatedByUserId: docSnapshot.updatedBy ?? '',
createdAt: docSnapshot.createdAt,

View File

@@ -12,7 +12,6 @@
},
"include": ["./src"],
"references": [
{ "path": "../../common/reader" },
{ "path": "../native" },
{ "path": "../../../tools/cli" },
{ "path": "../../../tools/utils" },

View File

@@ -79,6 +79,245 @@ impl From<JwstCodecError> for ParseError {
}
}
/// Output of [`parse_doc_to_markdown`]: the doc's title and its Markdown text.
///
/// Both fields are empty strings when the doc's `blocks` map is empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownResult {
/// Doc title.
pub title: String,
/// Markdown text produced for the doc.
pub markdown: String,
}
/// Decodes a doc update binary and renders its block tree as markdown.
///
/// The blocks are read from the doc's `blocks` map, the page block is located, and the tree
/// is walked depth-first from it. Page, database, table, paragraph, list and code blocks
/// produce output; every other flavour is only traversed.
///
/// When `ai_editable` is true, each block enqueued directly by the root page block (and not
/// handled by the page/database/table branches) is preceded by a
/// `<!-- block_id=… flavour=… -->` marker line.
///
/// # Errors
///
/// * [`ParseError::InvalidBinary`] when `doc_bin` is empty, equals `[0, 0]`, or cannot be
///   applied as a v1 update.
/// * Whatever error `doc.get_map("blocks")` converts into via `?`.
/// * [`ParseError::ParserError`] when no block with `PAGE_FLAVOUR` exists.
pub fn parse_doc_to_markdown(
doc_bin: Vec<u8>,
doc_id: String,
ai_editable: bool,
) -> Result<MarkdownResult, ParseError> {
// Reject empty input and the two-zero-byte payload up front.
if doc_bin.is_empty() || doc_bin == [0, 0] {
return Err(ParseError::InvalidBinary);
}
let mut doc = DocOptions::new().with_guid(doc_id.clone()).build();
doc
.apply_update_from_binary_v1(&doc_bin)
.map_err(|_| ParseError::InvalidBinary)?;
let blocks_map = doc.get_map("blocks")?;
// A doc without any blocks yields an empty title and empty markdown rather than an error.
if blocks_map.is_empty() {
return Ok(MarkdownResult {
title: "".into(),
markdown: "".into(),
});
}
// block_pool: block id -> block map; parent_lookup: child id -> id of the block listing it.
let mut block_pool: HashMap<String, Map> = HashMap::new();
let mut parent_lookup: HashMap<String, String> = HashMap::new();
for (_, value) in blocks_map.iter() {
if let Some(block_map) = value.to_map() {
if let Some(block_id) = get_block_id(&block_map) {
for child_id in collect_child_ids(&block_map) {
parent_lookup.insert(child_id, block_id.clone());
}
block_pool.insert(block_id, block_map);
}
}
}
// The traversal root is a block whose flavour equals PAGE_FLAVOUR.
let root_block_id = block_pool
.iter()
.find_map(|(id, block)| {
get_flavour(block)
.filter(|flavour| flavour == PAGE_FLAVOUR)
.map(|_| id.clone())
})
.ok_or_else(|| ParseError::ParserError("root block not found".into()))?;
// `queue` is used as a stack (push/pop); entries are (enqueuing block id, block id).
let mut queue: Vec<(Option<String>, String)> = vec![(None, root_block_id.clone())];
// `visited` ensures each block id is enqueued at most once.
let mut visited: HashSet<String> = HashSet::from([root_block_id.clone()]);
let mut doc_title = String::from("Untitled");
let mut markdown = String::new();
while let Some((parent_block_id, block_id)) = queue.pop() {
// Ids that are referenced but missing from the pool, or blocks without a flavour, are skipped.
let block = match block_pool.get(&block_id) {
Some(block) => block,
None => continue,
};
let flavour = match get_flavour(block) {
Some(flavour) => flavour,
None => continue,
};
let parent_id = parent_lookup.get(&block_id);
let parent_flavour = parent_id
.and_then(|id| block_pool.get(id))
.and_then(get_flavour);
// Children of a database block are not rendered on their own (the database branch below
// reads them as rows); skipping here also means their own children are never enqueued.
if parent_flavour.as_deref() == Some("affine:database") {
continue;
}
// enqueue children first to keep traversal order similar to JS implementation
// Children are pushed in reverse so that popping the stack visits them in listed order.
let mut child_ids = collect_child_ids(block);
for child_id in child_ids.drain(..).rev() {
if visited.insert(child_id.clone()) {
queue.push((Some(block_id.clone()), child_id));
}
}
// Page block: contributes only the title (empty string when `prop:title` is unavailable).
if flavour == PAGE_FLAVOUR {
let title = get_string(block, "prop:title").unwrap_or_default();
doc_title = title.clone();
continue;
}
// Database block: a `###` heading with the database title followed by a pipe table.
if flavour == "affine:database" {
let title = get_string(block, "prop:title").unwrap_or_default();
markdown.push_str(&format!("\n### {}\n", title));
let columns_array = block.get("prop:columns").and_then(|v| v.to_array());
let cells_map = block.get("prop:cells").and_then(|v| v.to_map());
// The table is emitted only when both the column list and the cell map are present.
if let (Some(columns_array), Some(cells_map)) = (columns_array, cells_map) {
// Each column is collected as (id, name, type, optional data map).
let mut columns = Vec::new();
for col_val in columns_array.iter() {
if let Some(col_map) = col_val.to_map() {
let id = get_string(&col_map, "id").unwrap_or_default();
let name = get_string(&col_map, "name").unwrap_or_default();
let type_ = get_string(&col_map, "type").unwrap_or_default();
let data = col_map.get("data").and_then(|v| v.to_map());
columns.push((id, name, type_, data));
}
}
// Pipes and newlines inside a cell would break the table layout, so they are escaped.
let escape_table = |s: &str| s.replace('|', "\\|").replace('\n', "<br>");
// Header row.
markdown.push('|');
for (_, name, _, _) in &columns {
markdown.push_str(&escape_table(name));
markdown.push('|');
}
markdown.push('\n');
// Separator row.
markdown.push('|');
for _ in &columns {
markdown.push_str("---|");
}
markdown.push('\n');
// One table row per child block of the database.
let child_ids = collect_child_ids(block);
for child_id in child_ids {
markdown.push('|');
let row_cells = cells_map.get(&child_id).and_then(|v| v.to_map());
for (col_id, _, col_type, col_data) in &columns {
let mut cell_text = String::new();
// The "title" column shows the row block's own text; other columns read the
// `value` stored for this row and column in the cells map.
if col_type == "title" {
if let Some(child_block) = block_pool.get(&child_id) {
if let Some((text, _)) = text_content(child_block, "prop:text") {
cell_text = text;
}
}
} else if let Some(row_cells) = &row_cells {
if let Some(cell_val) = row_cells.get(col_id).and_then(|v| v.to_map()) {
if let Some(value) = cell_val.get("value").and_then(|v| v.to_any()) {
cell_text = format_cell_value(&value, col_type, col_data.as_ref());
}
}
}
markdown.push_str(&escape_table(&cell_text));
markdown.push('|');
}
markdown.push('\n');
}
}
continue;
}
// Table block: the strings returned by `gather_table_contents` joined with `|` on one line.
if flavour == "affine:table" {
let contents = gather_table_contents(block);
markdown.push_str(&contents.join("|"));
markdown.push('\n');
continue;
}
// The marker is written before the flavour checks below, so it is emitted for every
// remaining flavour enqueued by the root block, including ones that render no text.
if ai_editable && parent_block_id.as_ref() == Some(&root_block_id) {
markdown.push_str(&format!(
"<!-- block_id={} flavour={} -->\n",
block_id, flavour
));
}
// Paragraph: `prop:type` selects a heading or quote prefix; unknown types get no prefix.
if flavour == "affine:paragraph" {
if let Some((text, _)) = text_content(block, "prop:text") {
let type_ = get_string(block, "prop:type").unwrap_or_default();
let prefix = match type_.as_str() {
"h1" => "# ",
"h2" => "## ",
"h3" => "### ",
"h4" => "#### ",
"h5" => "##### ",
"h6" => "###### ",
"quote" => "> ",
_ => "",
};
markdown.push_str(prefix);
markdown.push_str(&text);
markdown.push('\n');
}
continue;
}
// List item: always rendered with a `- ` bullet, indented once per enclosing list ancestor.
if flavour == "affine:list" {
if let Some((text, _)) = text_content(block, "prop:text") {
let depth = get_list_depth(&block_id, &parent_lookup, &block_pool);
let indent = " ".repeat(depth);
markdown.push_str(&indent);
markdown.push_str("- ");
markdown.push_str(&text);
markdown.push('\n');
}
continue;
}
// Code block: fenced with triple backticks, with `prop:language` as the info string.
if flavour == "affine:code" {
if let Some((text, _)) = text_content(block, "prop:text") {
let lang = get_string(block, "prop:language").unwrap_or_default();
markdown.push_str("```");
markdown.push_str(&lang);
markdown.push('\n');
markdown.push_str(&text);
markdown.push_str("\n```\n");
}
continue;
}
// Any other flavour produces no output of its own; its children were already enqueued above.
}
Ok(MarkdownResult {
title: doc_title,
markdown,
})
}
/// Counts the consecutive `affine:list` ancestors directly above `block_id`.
///
/// Starting at `block_id`, follows `parent_lookup` (child id -> parent id) upwards and adds
/// one level for every parent whose flavour is `affine:list`. The walk stops at the first
/// block that has no recorded parent, whose parent is missing from `blocks`, or whose parent
/// is not a list block.
///
/// `parent_lookup` is built from untrusted document data, so it may contain a cycle of list
/// blocks that reference each other as children. To guarantee termination the walk is capped
/// at `parent_lookup.len()` steps. An acyclic chain consumes one distinct key per step and
/// therefore can never exceed that cap, so results for well-formed documents are unchanged.
fn get_list_depth(
  block_id: &str,
  parent_lookup: &HashMap<String, String>,
  blocks: &HashMap<String, Map>,
) -> usize {
  let max_depth = parent_lookup.len();
  let mut depth = 0;
  // Borrow ids from the lookup instead of cloning a `String` on every step.
  let mut current_id = block_id;

  while depth < max_depth {
    let parent_id = match parent_lookup.get(current_id) {
      Some(parent_id) => parent_id,
      None => break,
    };

    let parent_is_list = blocks
      .get(parent_id)
      .map_or(false, |parent| get_flavour(parent).as_deref() == Some("affine:list"));
    if !parent_is_list {
      break;
    }

    depth += 1;
    current_id = parent_id.as_str();
  }

  depth
}
pub fn parse_doc_from_binary(doc_bin: Vec<u8>, doc_id: String) -> Result<CrawlResult, ParseError> {
if doc_bin.is_empty() || doc_bin == [0, 0] {
return Err(ParseError::InvalidBinary);
@@ -284,6 +523,49 @@ pub fn parse_doc_from_binary(doc_bin: Vec<u8>, doc_id: String) -> Result<CrawlRe
})
}
/// Extracts the page ids listed under `meta.pages` of a doc update binary.
///
/// Entries that are not maps, or that have no `id`, are ignored. A page counts as trashed
/// only when its `trash` value is `Any::True`; trashed pages are returned only when
/// `include_trash` is set. A doc without a `pages` array yields an empty list.
///
/// # Errors
///
/// * [`ParseError::InvalidBinary`] when `doc_bin` is empty, equals `[0, 0]`, or cannot be
///   applied as a v1 update.
/// * Whatever error `get_map("meta")` converts into via `?`.
pub fn get_doc_ids_from_binary(
  doc_bin: Vec<u8>,
  include_trash: bool,
) -> Result<Vec<String>, ParseError> {
  if doc_bin.is_empty() || doc_bin == [0, 0] {
    return Err(ParseError::InvalidBinary);
  }

  let mut root_doc = DocOptions::new().build();
  if root_doc.apply_update_from_binary_v1(&doc_bin).is_err() {
    return Err(ParseError::InvalidBinary);
  }

  let meta = root_doc.get_map("meta")?;
  let mut doc_ids = Vec::new();

  if let Some(page_entries) = meta.get("pages").and_then(|pages| pages.to_array()) {
    for entry in page_entries.iter() {
      let page = match entry.to_map() {
        Some(page) => page,
        None => continue,
      };
      let page_id = match get_string(&page, "id") {
        Some(page_id) => page_id,
        None => continue,
      };

      // Anything other than an explicit `true` (missing, `false`, other types) is not trashed.
      let trashed = matches!(
        page.get("trash").and_then(|flag| flag.to_any()),
        Some(Any::True)
      );
      if include_trash || !trashed {
        doc_ids.push(page_id);
      }
    }
  }

  Ok(doc_ids)
}
fn collect_child_ids(block: &Map) -> Vec<String> {
block
.get("sys:children")
@@ -454,6 +736,56 @@ fn gather_table_contents(block: &Map) -> Vec<String> {
contents
}
/// Renders a database cell value as plain text according to its column type.
///
/// * `"select"`: `value` is an option id; yields the `value` of the first option in the
///   column's `options` array with that id, or an empty string when nothing matches.
/// * `"multi-select"`: `value` is an array of option ids; yields the `value` of every
///   matching option, joined with `", "`.
/// * any other type: delegates to `any_to_string`, falling back to an empty string.
///
/// A select-like column whose value has an unexpected shape renders as an empty string.
fn format_cell_value(value: &Any, col_type: &str, col_data: Option<&Map>) -> String {
  // Looked up lazily: only select-like columns with a well-shaped value need the options.
  let column_options = || {
    col_data
      .and_then(|data| data.get("options"))
      .and_then(|options| options.to_array())
  };

  match (col_type, value) {
    ("select", Any::String(selected_id)) => column_options()
      .and_then(|options| {
        options.iter().find_map(|option| {
          let option = option.to_map()?;
          let option_id = get_string(&option, "id")?;
          (option_id == *selected_id).then(|| get_string(&option, "value").unwrap_or_default())
        })
      })
      .unwrap_or_default(),
    ("multi-select", Any::Array(selected_ids)) => {
      let mut labels: Vec<String> = Vec::new();
      if let Some(options) = column_options() {
        for selected in selected_ids.iter() {
          if let Any::String(selected_id) = selected {
            // Every option carrying this id contributes a label, in option order.
            labels.extend(options.iter().filter_map(|option| {
              let option = option.to_map()?;
              let option_id = get_string(&option, "id")?;
              (option_id == *selected_id).then(|| get_string(&option, "value").unwrap_or_default())
            }));
          }
        }
      }
      labels.join(", ")
    }
    ("select", _) | ("multi-select", _) => String::new(),
    _ => any_to_string(value).unwrap_or_default(),
  }
}
fn value_to_string(value: &Value) -> Option<String> {
if let Some(text) = value.to_text() {
return Some(text.to_string());

View File

@@ -1166,7 +1166,6 @@ export const PackageList = [
location: 'packages/backend/server',
name: '@affine/server',
workspaceDependencies: [
'packages/common/reader',
'packages/backend/native',
'tools/cli',
'tools/utils',

854
yarn.lock

File diff suppressed because it is too large Load Diff