feat(core): moving in affine-reader doc parsers (#12840)

fix AI-191

#### PR Dependency Tree


* **PR #12840** 👈

This tree was auto-generated by
[Charcoal](https://github.com/danerwilliams/charcoal)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Introduced the ability to convert rich text documents into Markdown,
    supporting a wide range of content types such as headings, lists,
    tables, images, code blocks, attachments, and embedded documents.
  - Added support for parsing collaborative document structures and
    rendering them as structured Markdown or parsed representations.
  - Enhanced handling of database and table blocks, including conversion
    to Markdown tables with headers and cell content.
- **Documentation**
  - Added a README noting the use of a forked Markdown converter.

- **Tests**
  - Added new test coverage for document parsing features.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->


#### PR Dependency Tree


* **PR #12840** 👈

This tree was auto-generated by
[Charcoal](https://github.com/danerwilliams/charcoal)
This commit is contained in:
Peng Xiao
2025-06-17 16:32:11 +08:00
committed by GitHub
parent dfe4c22a75
commit f4c20056a0
11 changed files with 1428 additions and 0 deletions

View File

@@ -0,0 +1 @@
A fork of https://github.com/frysztak/quill-delta-to-markdown

View File

@@ -0,0 +1,95 @@
// eslint-disable
// @ts-nocheck
import { Node } from './utils/node';
import { encodeLink } from './utils/url';
/**
 * Value carried by an inline `reference` attribute; it is the argument handed
 * to the `reference` inline converter and to `convertInlineReferenceLink`.
 */
export interface InlineReference {
// Discriminator; 'LinkedPage' is the only variant declared here.
type: 'LinkedPage';
// Id of the page being referenced.
pageId: string;
// Optional display title; the default link converter falls back to '' when absent.
title?: string;
// Optional extra parameters; `mode` is appended to the default link when present.
params?: { mode: 'doc' | 'edgeless' };
}
/** Options accepted by `getConverters`. */
export interface ConverterOptions {
// Optional hook that maps an inline reference to the link text (`title`) and
// link target (`link`) written into the Markdown output. When omitted,
// `getConverters` uses its built-in default.
convertInlineReferenceLink?: (reference: InlineReference) => {
title: string;
link: string;
};
}
/**
 * Fallback reference-to-link mapping used when the caller supplies none.
 *
 * The link is the reference type, page id and (when present) display mode,
 * joined with ':'; empty or missing segments are skipped. The title falls back
 * to an empty string.
 */
const defaultConvertInlineReferenceLink = (reference: InlineReference) => {
  const segments = [reference.type, reference.pageId, reference.params?.mode];
  const link = segments.filter(segment => Boolean(segment)).join(':');
  const title = reference.title || '';
  return { title, link };
};
/**
 * Builds the converter table consumed by `deltaToMd`.
 *
 * The table has three groups:
 * - `embed`: handlers for object inserts, invoked with `this` bound to the
 *   node currently being written;
 * - `inline`: handlers returning the `[open, close]` markers of an inline style;
 * - `block`: handlers for line-level attributes, invoked with `this` bound to
 *   the line node. `list` is a grouped block exposing `group()` and `line()`.
 *
 * @param opts Optional overrides; `convertInlineReferenceLink` replaces the
 *   default reference-to-link mapping.
 */
export function getConverters(opts: ConverterOptions = {}) {
  const customReferenceLink = opts.convertInlineReferenceLink;
  const resolveReferenceLink =
    customReferenceLink === undefined
      ? defaultConvertInlineReferenceLink
      : customReferenceLink;

  const embed = {
    image(src) {
      this.append(`![](${encodeLink(src)})`);
    },
    // Not a default Quill feature, converts custom divider embed blot added when
    // creating quill editor instance.
    // See https://quilljs.com/guides/cloning-medium-with-parchment/#dividers
    thematic_break() {
      this.open = `\n---\n${this.open}`;
    },
  };

  const inline = {
    italic: () => ['_', '_'],
    bold: () => ['**', '**'],
    link: url => ['[', `](${url})`],
    reference: (reference: InlineReference) => {
      const resolved = resolveReferenceLink(reference);
      return ['[', `${resolved.title}](${resolved.link})`];
    },
    strike: () => ['~~', '~~'],
    code: () => ['`', '`'],
  };

  const block = {
    header(attrs) {
      this.open = `${'#'.repeat(attrs.header)} ${this.open}`;
    },
    blockquote() {
      this.open = `> ${this.open}`;
    },
    list: {
      // Container node for a run of consecutive list lines.
      group() {
        return new Node(['', '\n']);
      },
      // Prefixes the line with the marker matching its list kind; ordered
      // items are numbered with a counter stored on the shared group object.
      line(attrs, group) {
        switch (attrs.list) {
          case 'bullet':
            this.open = `- ${this.open}`;
            break;
          case 'checked':
            this.open = `- [x] ${this.open}`;
            break;
          case 'unchecked':
            this.open = `- [ ] ${this.open}`;
            break;
          case 'ordered': {
            group.count = group.count || 0;
            group.count += 1;
            const position = group.count;
            this.open = `${position}. ${this.open}`;
            break;
          }
        }
      },
    },
  };

  return { embed, inline, block };
}

View File

@@ -0,0 +1,147 @@
// eslint-disable
// @ts-nocheck
import { Node } from './utils/node';
/**
 * Converts a list of delta operations into a Markdown string.
 *
 * The rendered output has its trailing whitespace removed and always ends
 * with exactly one newline.
 *
 * @param delta Operations to convert.
 * @param converters Converter table with `embed`, `inline` and `block` groups.
 */
export const deltaToMd = (delta, converters) => {
  const tree = convert(delta, converters);
  const markdown = tree.render();
  return `${markdown.trimEnd()}\n`;
};
/**
 * Builds a tree of `Node`s from a list of delta operations and returns its root.
 *
 * State shared across operations:
 * - `line`: the node of the line currently being filled;
 * - `el`: the innermost node new content is appended to (the line itself or
 *   an inline wrapper nested inside it);
 * - `activeInline`: inline attributes currently open on this line;
 * - `group`: the open grouped block (one whose converter is an object with
 *   `group`/`line`), if any;
 * - `beginningOfLine`: true right after a block-styled insert was processed.
 *
 * @param ops Operations; each is read through `insert` and `attributes`.
 * @param converters Converter table with `embed`, `inline` and `block` groups.
 */
function convert(ops, converters) {
let group, line, el, activeInline, beginningOfLine;
let root = new Node();
// Starts a fresh line node under the root and resets inline tracking.
function newLine() {
el = line = new Node(['', '\n']);
root.append(line);
activeInline = {};
}
newLine();
for (let i = 0; i < ops.length; i++) {
let op = ops[i];
if (typeof op.insert === 'object') {
// Embed insert: every key that has an embed converter is rendered into the
// current element; keys without a converter are ignored.
for (let k in op.insert) {
if (converters.embed[k]) {
applyInlineAttributes(op.attributes);
converters.embed[k].call(el, op.insert[k], op.attributes);
}
}
} else {
let lines = op.insert.split('\n');
if (hasBlockLevelAttribute(op.attributes, converters)) {
// Some line-level styling (ie headings) is applied by inserting a \n
// with the style; the style applies back to the previous \n.
// There *should* only be one style in an insert operation.
// j starts at 1, so the body runs once per '\n' contained in the insert.
for (let j = 1; j < lines.length; j++) {
for (let attr in op.attributes) {
if (converters.block[attr]) {
let fn = converters.block[attr];
if (typeof fn === 'object') {
// Grouped block: a group of a different type ends the current one.
if (group && group.type !== attr) {
group = null;
}
if (!group && fn.group) {
group = {
el: fn.group(),
type: attr,
value: op.attributes[attr],
distance: 0,
};
root.append(group.el);
}
if (group) {
// Node.append re-parents, so this moves the line from the root into the group.
group.el.append(line);
group.distance = 0;
}
fn = fn.line;
}
fn.call(line, op.attributes, group);
newLine();
// Only the first attribute with a block converter is applied per line.
break;
}
}
}
beginningOfLine = true;
} else {
for (let l = 0; l < lines.length; l++) {
// `distance` counts line starts seen since the group's last styled line;
// the group is dropped once it reaches 2.
if ((l > 0 || beginningOfLine) && group && ++group.distance >= 2) {
group = null;
}
applyInlineAttributes(
op.attributes,
ops[i + 1] && ops[i + 1].attributes
);
el.append(lines[l]);
if (l < lines.length - 1) {
newLine();
}
}
beginningOfLine = false;
}
}
}
return root;
// Re-targets `el` so that it sits inside exactly the inline wrappers requested
// by `attrs`. `next` holds the following operation's attributes and is used
// only to decide nesting order. (Function declaration, so it is hoisted above
// the `return`.)
function applyInlineAttributes(attrs, next?: any) {
let first: any[] = [];
let then: any[] = [];
attrs = attrs || {};
let tag = el,
seen = {};
// Walk up through the open inline wrappers. When a wrapper's format is not
// requested any more, or its open/close markers differ, step out of it and
// forget every format seen so far on the way up.
while (tag._format) {
seen[tag._format] = true;
if (!attrs[tag._format] || tag.open !== tag.close) {
for (let k in seen) {
delete activeInline[k];
}
el = tag.parent();
}
tag = tag.parent();
}
for (let attr in attrs) {
if (converters.inline[attr] && attrs[attr]) {
if (activeInline[attr] && activeInline[attr] === attrs[attr]) {
continue; // do nothing -- we should already be inside this style's tag
}
if (next && attrs[attr] === next[attr]) {
first.push(attr); // if the next operation has the same style, this should be the outermost tag
} else {
then.push(attr);
}
activeInline[attr] = attrs[attr];
}
}
first.forEach(apply);
then.forEach(apply);
// Opens a wrapper node for `fmt` inside `el` and makes it the new `el`.
function apply(fmt) {
let newEl = converters.inline[fmt].call(null, attrs[fmt]);
if (Array.isArray(newEl)) {
newEl = new Node(newEl);
}
newEl._format = fmt;
el.append(newEl);
el = newEl;
}
}
}
/**
 * Tells whether any attribute name in `attrs` has a block-level converter.
 *
 * @param attrs Attribute bag of an operation; may be undefined.
 * @param converters Converter table; only the keys of `converters.block` are read.
 * @returns true when at least one attribute name is a key of `converters.block`.
 */
function hasBlockLevelAttribute(attrs, converters) {
  let blockNames;
  for (const name in attrs) {
    // Resolved on first use only, so an empty or missing `attrs` never reads
    // `converters.block`.
    blockNames = blockNames ?? Object.keys(converters.block);
    if (blockNames.indexOf(name) !== -1) {
      return true;
    }
  }
  return false;
}

View File

@@ -0,0 +1,2 @@
export { getConverters } from './delta-converters';
export { deltaToMd } from './delta-to-md';

View File

@@ -0,0 +1,66 @@
// eslint-disable
// @ts-nocheck
// Counter backing `Node#id`; incremented for every node created.
let id = 0;

/**
 * Tree node used to assemble the Markdown output.
 *
 * A node is either a text leaf (`text`) or a wrapper with `open`/`close`
 * markers around its children. `render()` flattens the subtree to a string.
 */
export class Node {
  id = ++id;
  children: Node[];
  open: string;
  close: string;
  text: string;
  _format: string;
  _parent: Node;

  /**
   * @param data `[open, close]` markers for a wrapper node, or a string for a
   *   text leaf. Anything else yields an empty container.
   */
  constructor(data?: string[] | string) {
    if (Array.isArray(data)) {
      const [open, close] = data;
      this.open = open;
      this.close = close;
    } else if (typeof data === 'string') {
      this.text = data;
    }
    this.children = [];
  }

  /**
   * Adds `e` as the last child. Values that are not nodes are wrapped in a new
   * node first; a node that already has a parent is detached from it.
   */
  append(e: Node) {
    const child = e instanceof Node ? e : new Node(e);
    const previousParent = child._parent;
    if (previousParent) {
      const position = previousParent.children.indexOf(child);
      previousParent.children.splice(position, 1);
    }
    child._parent = this;
    this.children = [...this.children, child];
  }

  /**
   * Renders the subtree. A wrapper with identical, non-empty markers and blank
   * content renders as nothing. For wrappers with both markers set, blank
   * space at the edges of the content is moved outside the markers.
   */
  render() {
    const renderedChildren = this.children.map(child => child.render());
    const inner = (this.text || '') + renderedChildren.join('');
    const emptyInner = inner.trim() === '';
    if (emptyInner && this.open === this.close && this.open && this.close) {
      return '';
    }
    const wrapped = this.open && this.close;
    const edgeSpace = (touchesEdge: boolean) =>
      touchesEdge && !emptyInner && wrapped ? ' ' : '';
    const body = wrapped ? inner.trim() : inner;
    const pieces = [
      edgeSpace(inner.startsWith(' ')),
      this.open,
      body,
      this.close,
      edgeSpace(inner.endsWith(' ')),
    ];
    return pieces.filter(piece => piece).join('');
  }

  /** Returns the node this one was last appended to, if any. */
  parent() {
    return this._parent;
  }
}

View File

@@ -0,0 +1,5 @@
/**
 * Prepares a URL for use as a Markdown link target.
 *
 * The URL is passed through `encodeURI`, parentheses are percent-encoded, and
 * a `response-content-disposition=attachment` query parameter is removed
 * together with everything that follows it.
 */
export const encodeLink = (link: string) => {
  const encoded = encodeURI(link);
  const withoutParens = encoded
    .split('(')
    .join('%28')
    .split(')')
    .join('%29');
  return withoutParens.replace(
    /(\?|&)response-content-disposition=attachment.*$/,
    ''
  );
};

View File

@@ -0,0 +1,437 @@
import type { ColumnDataType } from '@blocksuite/affine/model';
import { Array as YArray, type Map as YMap, type Text as YText } from 'yjs';
import { deltaToMd, getConverters } from './delta-to-md';
import type {
BaseParsedBlock,
Flavour,
ParsedBlock,
ParsedDoc,
ParserContext,
SerializedCells,
YBlock,
YBlocks,
} from './types';
/**
 * Flattens a parsed block and its descendants into Markdown text.
 *
 * A block with content emits that content with `padding` in front of every
 * line, followed by a newline and then its children padded one level deeper.
 * A block without content emits only its children at the current padding.
 *
 * @param block Parsed block to render.
 * @param padding Prefix placed before each line of this block's content.
 */
export const parseBlockToMd = (
  block: BaseParsedBlock,
  padding = ''
): string => {
  const renderChildren = (childPadding: string): string =>
    block.children.map(child => parseBlockToMd(child, childPadding)).join('');

  if (!block.content) {
    return renderChildren(padding);
  }

  const paddedContent = block.content
    .split('\n')
    .map(line => padding + line)
    .join('\n');
  return paddedContent + '\n' + renderChildren(padding + ' ');
};
/**
 * Renders one table row as a Markdown table line: cells are stringified and
 * trimmed, joined with '|', and runs of newlines become `<br />`.
 */
function toMarkdownTableRow(row: readonly unknown[]): string {
  return (
    '|' +
    row
      .map(cell => String(cell || '').trim())
      .join('|')
      .replace(/\n+/g, '<br />') +
    '|'
  );
}

/**
 * Converts a single block (and, recursively, its children) into a parsed
 * block whose `content` holds the block's own Markdown.
 *
 * Conversion is best effort: an error raised while converting is logged with
 * `console.warn` and whatever was built so far is returned.
 *
 * @param context Supplies URL builders and the optional doc-title lookup.
 * @param yBlock The block to convert; `undefined` yields `null`.
 * @param yBlocks Map of all blocks, used to resolve children by id.
 * @returns The parsed block, or `null` when `yBlock` is missing. Blocks that
 *   are hidden or whose display mode is 'edgeless' are returned with empty
 *   content and no children.
 */
export function parseBlock(
  context: ParserContext,
  yBlock: YBlock | undefined,
  yBlocks: YBlocks // all blocks
): ParsedBlock | null {
  if (!yBlock) {
    return null;
  }

  const deltaConverters = getConverters({
    convertInlineReferenceLink: ref => {
      return {
        title: ref.title || context.renderDocTitle?.(ref.pageId) || '',
        link: context.buildDocUrl(ref.pageId),
      };
    },
  });

  const id = yBlock.get('sys:id') as string;
  const flavour = yBlock.get('sys:flavour') as Flavour;
  const type = yBlock.get('prop:type') as string;
  const toMd = () =>
    deltaToMd((yBlock.get('prop:text') as YText).toDelta(), deltaConverters);
  const hidden = yBlock.get('prop:hidden') as boolean;
  const displayMode = yBlock.get('prop:displayMode') as string;
  const childrenIds =
    yBlock.get('sys:children') instanceof YArray
      ? (yBlock.get('sys:children') as YArray<string>).toJSON()
      : [];

  const result: ParsedBlock = {
    id,
    flavour,
    content: '',
    children: [],
    type,
  };

  if (hidden || displayMode === 'edgeless') {
    return result;
  }

  try {
    switch (flavour) {
      case 'affine:paragraph': {
        let initial = '';
        if (type === 'h1') {
          initial = '# ';
        } else if (type === 'h2') {
          initial = '## ';
        } else if (type === 'h3') {
          initial = '### ';
        } else if (type === 'h4') {
          initial = '#### ';
        } else if (type === 'h5') {
          initial = '##### ';
        } else if (type === 'h6') {
          initial = '###### ';
        } else if (type === 'quote') {
          initial = '> ';
        }
        result.content = initial + toMd() + '\n';
        break;
      }
      case 'affine:divider': {
        result.content = '\n---\n\n';
        break;
      }
      case 'affine:list': {
        result.content = (type === 'bulleted' ? '* ' : '1. ') + toMd() + '\n';
        break;
      }
      case 'affine:code': {
        const lang =
          (yBlock.get('prop:language') as string)?.toLowerCase() || 'txt';
        // do not transform to delta for code block
        const caption = yBlock.get('prop:caption') as string;
        result.content =
          '```' +
          lang +
          (caption ? ` ${caption}` : '') +
          '\n' +
          (yBlock.get('prop:text') as YText).toJSON() +
          '\n```\n\n';
        break;
      }
      case 'affine:image': {
        const sourceId = yBlock.get('prop:sourceId') as string;
        const width = yBlock.get('prop:width');
        const height = yBlock.get('prop:height');
        // fixme: this may not work if workspace is not public
        const blobUrl = context.buildBlobUrl(sourceId);
        const caption = yBlock.get('prop:caption') as string;
        if (width || height || caption) {
          // `caption ?? ''` keeps a missing caption from being rendered as the
          // literal text "undefined" when only a dimension is set.
          // The template lines stay at column 0 so the emitted HTML is unchanged.
          result.content =
`<img
src="${blobUrl}"
alt="${caption ?? ''}"
width="${width || 'auto'}"
height="${height || 'auto'}"
/>
` + '\n\n';
        } else {
          result.content = `\n![${caption || sourceId}](${blobUrl})\n\n`;
        }
        Object.assign(result, {
          sourceId,
          width,
          height,
          caption,
          blobUrl,
        });
        break;
      }
      case 'affine:attachment': {
        const sourceId = yBlock.get('prop:sourceId') as string;
        const blobUrl = context.buildBlobUrl(sourceId);
        const caption = yBlock.get('prop:caption') as string;
        // `type` may be absent; optional chaining sends such attachments to
        // the image fallback instead of throwing.
        if (type?.startsWith('video')) {
          result.content =
`<video muted autoplay loop preload="auto" playsinline>
<source src="${blobUrl}" type="${type}" />
</video>
` + '\n\n';
        } else {
          // assume it is an image
          result.content = `\n![${caption || sourceId}](${blobUrl})\n\n`;
        }
        Object.assign(result, {
          sourceId,
          blobUrl,
          caption,
        });
        break;
      }
      case 'affine:embed-youtube': {
        const videoId = yBlock.get('prop:videoId') as string;
        // prettier-ignore
        result.content = `
<iframe
type="text/html"
width="100%"
height="410px"
src="https://www.youtube.com/embed/${videoId}"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowfullscreen
credentialless>
</iframe>` + '\n\n';
        break;
      }
      case 'affine:bookmark': {
        const url = yBlock.get('prop:url') as string;
        const caption = yBlock.get('prop:caption') as string;
        result.content = `\n[](Bookmark,${url})\n\n`;
        Object.assign(result, {
          url,
          caption,
        });
        break;
      }
      case 'affine:embed-linked-doc':
      case 'affine:embed-synced-doc': {
        const pageId = yBlock.get('prop:pageId') as string;
        const caption = yBlock.get('prop:caption') as string | undefined;
        // Without a caption, fall back to the doc title (same lookup as inline
        // references) rather than emitting "[undefined](...)".
        const linkText = caption || context.renderDocTitle?.(pageId) || '';
        result.content = `\n[${linkText}](${context.buildDocUrl(pageId)})\n\n`;
        Object.assign(result, {
          pageId,
          caption,
        });
        break;
      }
      case 'affine:surface':
      case 'affine:page':
      case 'affine:note':
      case 'affine:frame': {
        result.content = '';
        break;
      }
      case 'affine:database': {
        const title = (yBlock.get('prop:title') as YText).toJSON();
        // Each child block is one row; its rendered Markdown is the value of
        // the 'title' column.
        const childrenTitleById = Object.fromEntries(
          childrenIds.map(cid => {
            const child = parseBlock(
              context,
              yBlocks.get(cid) as YBlock | undefined,
              yBlocks
            );
            if (!child) {
              return [cid, ''];
            }
            return [cid, parseBlockToMd(child)] as const;
          })
        );
        const cols = (
          yBlock.get('prop:columns') as YArray<ColumnDataType>
        ).toJSON() as ColumnDataType[];
        const cells = (
          yBlock.get('prop:cells') as YMap<SerializedCells>
        ).toJSON() as SerializedCells;

        const optionToTagHtml = (option: any) => {
          return `<span data-affine-option data-value="${option.id}" data-option-color="${option.color}">${option.value}</span>`;
        };
        // Options of a select-like column; empty when the column has none.
        const optionsOf = (col: ColumnDataType): any[] =>
          (col.data?.['options'] as any[] | undefined) ?? [];

        const dbRows: string[][] = childrenIds
          .map(cid => {
            const row = cells[cid];
            return cols.map(col => {
              const value = row?.[col.id]?.value;
              if (col.type !== 'title' && !value) {
                return '';
              }
              switch (col.type) {
                case 'title':
                  return childrenTitleById[cid];
                case 'select': {
                  // A cell may point at an option that is no longer in the
                  // column; render it empty instead of failing the block.
                  const option = optionsOf(col).find(
                    (opt: any) => opt.id === value
                  );
                  return option ? optionToTagHtml(option) : '';
                }
                case 'multi-select':
                  return optionsOf(col)
                    .filter((opt: any) => (value as string[]).includes(opt.id))
                    .map(optionToTagHtml)
                    .join('');
                default:
                  return value ?? '';
              }
            });
          })
          // drop rows in which every cell is empty
          .filter(row => !row.every(v => !v));

        const header = cols.map(col => col.name);
        const divider = cols.map(() => '---');

        // convert to markdown table
        result.content =
          [header, divider, ...dbRows].map(toMarkdownTableRow).join('\n') +
          '\n\n';
        Object.assign(result, {
          title,
          rows: dbRows.map(row => {
            return Object.fromEntries(row.map((v, i) => [cols[i].name, v]));
          }),
        });
        break;
      }
      case 'affine:table': {
        // Rows, columns and cells are stored as flattened keys on the block
        // ('prop:rows.<x>.rowId', 'prop:columns.<x>.columnId', ...).
        const blockEntries = Object.entries(yBlock.toJSON());

        // Extract row IDs and their order
        const rowEntries = blockEntries
          .filter(
            ([key]) => key.startsWith('prop:rows.') && key.endsWith('.rowId')
          )
          .map(([key, value]) => {
            const rowId = value as string;
            const orderKey = key.replace('.rowId', '.order');
            const order = yBlock.get(orderKey) as string;
            const backgroundColor = yBlock.get(
              key.replace('.rowId', '.backgroundColor')
            ) as string | undefined;
            return { rowId, order, backgroundColor };
          })
          .sort((a, b) => a.order.localeCompare(b.order));

        // Extract column IDs and their order
        const columnEntries = blockEntries
          .filter(
            ([key]) =>
              key.startsWith('prop:columns.') && key.endsWith('.columnId')
          )
          .map(([key, value]) => {
            const columnId = value as string;
            const orderKey = key.replace('.columnId', '.order');
            const order = yBlock.get(orderKey) as string;
            return { columnId, order };
          })
          .sort((a, b) => a.order.localeCompare(b.order));

        // Build the table rows with cell data
        const tableRows = rowEntries.map(({ rowId }) => {
          return columnEntries.map(({ columnId }) => {
            const cellKey = `prop:cells.${rowId}:${columnId}.text`;
            const cellText = yBlock.get(cellKey) as string | undefined;
            return cellText || '';
          });
        });

        // Store column IDs for reference
        const columnIds = columnEntries.map(({ columnId }) => columnId);

        // Use the first row as header and the rest as data rows
        if (tableRows.length > 0) {
          const headerRow = tableRows[0];
          const dataRows = tableRows.slice(1);
          const separators = headerRow.map(() => '---');
          // Convert to markdown table with first row as header
          result.content =
            [headerRow, separators, ...dataRows]
              .map(toMarkdownTableRow)
              .join('\n') + '\n\n';
        } else {
          // Handle empty table case
          result.content = '';
        }
        Object.assign(result, {
          columns: columnIds,
          rows: tableRows,
        });
        break;
      }
      default: {
        // console.warn("Unknown or unsupported flavour", flavour);
      }
    }

    // Database children were already consumed as rows above; every other
    // flavour keeps its non-empty children.
    result.children =
      flavour !== 'affine:database'
        ? childrenIds
            .map(cid =>
              parseBlock(
                context,
                yBlocks.get(cid) as YBlock | undefined,
                yBlocks
              )
            )
            .filter(
              (block): block is ParsedBlock =>
                !!block &&
                !(block.content === '' && block.children.length === 0)
            )
        : [];
  } catch (e) {
    console.warn('Error converting block to md', e);
  }
  return result;
}
/**
 * Parses a whole page document into its title, Markdown and parsed block tree.
 *
 * The root is the last entry of the doc's 'blocks' map whose flavour is
 * 'affine:page'. Only its 'affine:note' children are kept.
 *
 * @param ctx Parser context; `ctx.doc` is the document to read.
 * @returns `{ title: '', md: '' }` when no page block exists or it cannot be
 *   parsed; otherwise the title, the Markdown and the parsed root block. A
 *   page block without a title yields an empty title.
 */
export const parsePageDoc = (ctx: ParserContext): ParsedDoc => {
  const yBlocks: YBlocks = ctx.doc.getMap('blocks');
  // we assume that the last `affine:page` entry is the page block
  const maybePageBlock = Object.entries(yBlocks.toJSON()).findLast(
    ([_, b]) => b['sys:flavour'] === 'affine:page'
  );

  // there are cases that the page is empty due to some weird issues
  if (!maybePageBlock) {
    return {
      title: '',
      md: '',
    };
  }

  const yPage = yBlocks.get(maybePageBlock[0]) as YBlock;
  // `prop:title` may be missing; it is read defensively below.
  const title = yPage.get('prop:title') as YText | undefined;
  const rootBlock = parseBlock(ctx, yPage, yBlocks);
  if (!rootBlock) {
    return {
      title: '',
      md: '',
    };
  }

  rootBlock.children = rootBlock.children.filter(
    (block): block is BaseParsedBlock => block.flavour === 'affine:note'
  );
  const md = parseBlockToMd(rootBlock);
  return {
    title: title?.toJSON() ?? '',
    parsedBlock: rootBlock,
    md,
  };
};

View File

@@ -0,0 +1,152 @@
import { type CellDataType } from '@blocksuite/affine/model';
import { type Doc as YDoc, type Map as YMap } from 'yjs';
/**
 * Metadata record describing a page of a workspace.
 * NOTE(review): not referenced by the parser code in this commit; field
 * meanings below are inferred from their names and should be confirmed.
 */
export interface WorkspacePage {
id: string;
guid: string;
title: string;
// presumably a timestamp -- TODO confirm unit
createDate: number;
trash?: boolean;
favorite?: boolean;
properties?: Record<string, any>;
}
/** Builds a block flavour string by prefixing `T` with `affine:`. */
export type BaseFlavour<T extends string> = `affine:${T}`;
/**
 * Union of the block flavours known to the parser, e.g. 'affine:paragraph'.
 * It is the type of `BaseParsedBlock.flavour`.
 */
export type Flavour = BaseFlavour<
| 'page'
| 'frame'
| 'paragraph'
| 'code'
| 'note'
| 'list'
| 'divider'
| 'embed'
| 'image'
| 'surface'
| 'database'
| 'table'
| 'attachment'
| 'bookmark'
| 'embed-youtube'
| 'embed-linked-doc'
| 'embed-synced-doc'
>;
/** Fields shared by every parsed block; the flavour-specific interfaces extend it. */
export interface BaseParsedBlock {
// Block id.
id: string;
// Block flavour.
flavour: Flavour;
// Markdown generated for this block alone (children excluded); may be ''.
content: string;
// Parsed child blocks.
children: BaseParsedBlock[];
// Optional flavour-specific subtype.
type?: string;
}
/** Result of parsing a whole page document. */
export interface ParsedDoc {
// Page title.
title: string;
// Markdown rendering of the page.
md: string;
// Parsed root block; optional.
parsedBlock?: ParsedBlock;
}
/** Paragraph block; `type` selects a heading level or a quote. */
export interface ParagraphBlock extends BaseParsedBlock {
flavour: 'affine:paragraph';
type: 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6' | 'quote';
}
/** Divider block. */
export interface DividerBlock extends BaseParsedBlock {
flavour: 'affine:divider';
}
/** List item block. */
export interface ListBlock extends BaseParsedBlock {
flavour: 'affine:list';
type: 'bulleted' | 'numbered';
}
/** Code block. */
export interface CodeBlock extends BaseParsedBlock {
flavour: 'affine:code';
// Language of the code block.
language: string;
}
/** Image block. */
export interface ImageBlock extends BaseParsedBlock {
flavour: 'affine:image';
// Id of the stored image blob.
sourceId: string;
// URL built for `sourceId`.
blobUrl: string;
width?: number;
height?: number;
caption?: string;
}
/** Attachment block. */
export interface AttachmentBlock extends BaseParsedBlock {
flavour: 'affine:attachment';
// NOTE(review): looks like a media type (the parser checks for a 'video' prefix) -- confirm.
type: string;
// Id of the stored attachment blob.
sourceId: string;
}
/** Embedded YouTube video block. */
export interface EmbedYoutubeBlock extends BaseParsedBlock {
flavour: 'affine:embed-youtube';
videoId: string;
}
/** Bookmark block. */
export interface BookmarkBlock extends BaseParsedBlock {
flavour: 'affine:bookmark';
url: string;
}
/** Embedded linked-doc block pointing at another page. */
export interface EmbedLinkedDocBlock extends BaseParsedBlock {
flavour: 'affine:embed-linked-doc';
pageId: string;
}
/** Embedded synced-doc block pointing at another page. */
export interface EmbedSyncedDocBlock extends BaseParsedBlock {
flavour: 'affine:embed-synced-doc';
pageId: string;
}
/** Database block; each entry of `rows` is one row's values keyed by a string. */
export interface DatabaseBlock extends BaseParsedBlock {
title: string;
flavour: 'affine:database';
rows: Record<string, string>[];
}
/** Table block. */
export interface TableBlock extends BaseParsedBlock {
flavour: 'affine:table';
// Cell texts, one inner array per row.
rows: string[][];
// Column ids.
columns: string[];
}
/**
 * Any parsed block.
 * NOTE(review): `EmbedLinkedDocBlock` and `EmbedSyncedDocBlock` are not listed;
 * such values are still assignable through the `BaseParsedBlock` member.
 * Because `BaseParsedBlock` is part of the union, checking `flavour` alone
 * does not narrow to a specific member.
 */
export type ParsedBlock =
| ParagraphBlock
| DividerBlock
| ListBlock
| CodeBlock
| ImageBlock
| AttachmentBlock
| EmbedYoutubeBlock
| BookmarkBlock
| DatabaseBlock
| TableBlock
| BaseParsedBlock;
// NOTE(review): `ParsedDoc` is already declared earlier in this file with the
// same members; TypeScript merges the two declarations, so this one adds
// nothing and is a candidate for removal.
export interface ParsedDoc {
title: string;
md: string;
parsedBlock?: ParsedBlock;
}
/** Plain-object form of a database block's cells: row key, then column key, then cell data. */
export type SerializedCells = {
// row
[key: string]: {
// column
[key: string]: CellDataType;
};
};
/** A single block: a Yjs map of property names to values. */
export type YBlock = YMap<unknown>;
/** A Yjs map holding blocks as its values. */
export type YBlocks = YMap<YBlock>;
/** Environment handed to the doc parser. */
export interface ParserContext {
workspaceId: string;
// Yjs document to parse.
doc: YDoc;
// Returns the URL for a blob id.
buildBlobUrl: (blobId: string) => string;
// Returns the URL for a doc id.
buildDocUrl: (docId: string) => string;
// Optional lookup returning a doc's title for its id.
renderDocTitle?: (docId: string) => string;
}

View File

@@ -915,3 +915,5 @@ export function readAllDocIdsFromRootDoc(
}
return Array.from(docIds);
}
export { parseBlock, parseBlockToMd, parsePageDoc } from './doc-parser/parser';