mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-12 20:38:52 +00:00
feat(core): moving in affine-reader doc parsers (#12840)
fix AI-191 #### PR Dependency Tree * **PR #12840** 👈 This tree was auto-generated by [Charcoal](https://github.com/danerwilliams/charcoal) <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced the ability to convert rich text documents into Markdown, supporting a wide range of content types such as headings, lists, tables, images, code blocks, attachments, and embedded documents. - Added support for parsing collaborative document structures and rendering them as structured Markdown or parsed representations. - Enhanced handling of database and table blocks, including conversion to Markdown tables with headers and cell content. - **Documentation** - Added a README noting the use of a forked Markdown converter. - **Tests** - Added new test coverage for document parsing features. <!-- end of auto-generated comment: release notes by coderabbit.ai --> #### PR Dependency Tree * **PR #12840** 👈 This tree was auto-generated by [Charcoal](https://github.com/danerwilliams/charcoal)
This commit is contained in:
@@ -0,0 +1 @@
|
||||
A fork of https://github.com/frysztak/quill-delta-to-markdown
|
||||
@@ -0,0 +1,95 @@
|
||||
// eslint-disable
|
||||
// @ts-nocheck
|
||||
import { Node } from './utils/node';
|
||||
import { encodeLink } from './utils/url';
|
||||
|
||||
/**
 * Attribute payload of an inline reference found in a text delta.
 */
export interface InlineReference {
  /** Discriminator; `'LinkedPage'` is the only kind modelled here. */
  type: 'LinkedPage';
  /** Id of the referenced page. */
  pageId: string;
  /** Optional display title carried with the reference. */
  title?: string;
  /** Optional parameters; `mode` selects the `doc` or `edgeless` mode. */
  params?: { mode: 'doc' | 'edgeless' };
}
|
||||
|
||||
/**
 * Options accepted by `getConverters`.
 */
export interface ConverterOptions {
  /**
   * Maps an inline reference to the link text (`title`) and link target
   * (`link`) used when the reference is rendered as a Markdown link.
   * When omitted, `getConverters` falls back to its built-in default.
   */
  convertInlineReferenceLink?: (reference: InlineReference) => {
    title: string;
    link: string;
  };
}
|
||||
|
||||
/**
 * Default reference-to-link conversion.
 *
 * The link is `type:pageId[:mode]`: the reference type, the page id and, when
 * present, the mode, joined with `:` (empty parts are dropped). The title is
 * the reference's own title, or an empty string when it has none.
 */
const defaultConvertInlineReferenceLink = (reference: InlineReference) => {
  const link = [reference.type, reference.pageId, reference.params?.mode]
    .filter(Boolean)
    .join(':');

  return {
    title: reference.title || '',
    link,
  };
};
|
||||
|
||||
/**
 * Builds the converter table used to turn a Quill-style delta into Markdown.
 *
 * The table has three sections:
 * - `embed`: called with the current node as `this` for object inserts;
 * - `inline`: return an `[open, close]` pair wrapping the formatted text;
 * - `block`: called with the line node as `this`; a converter may instead be
 *   an object with `group()` / `line()` for blocks that group several lines.
 *
 * @param opts Optional overrides; `convertInlineReferenceLink` customizes how
 *   inline references are turned into link title and target.
 * @returns The converter table.
 */
export function getConverters(opts: ConverterOptions = {}) {
  const { convertInlineReferenceLink = defaultConvertInlineReferenceLink } =
    opts;

  return {
    embed: {
      image: function (src) {
        // Markdown image with an empty alt text; the URL is encoded so that
        // parentheses in it cannot terminate the link target early.
        this.append('![](' + encodeLink(src) + ')');
      },
      // Not a default Quill feature, converts custom divider embed blot added when
      // creating quill editor instance.
      // See https://quilljs.com/guides/cloning-medium-with-parchment/#dividers
      thematic_break: function () {
        this.open = '\n---\n' + this.open;
      },
    },

    inline: {
      italic: function () {
        return ['_', '_'];
      },
      bold: function () {
        return ['**', '**'];
      },
      link: function (url) {
        return ['[', '](' + url + ')'];
      },
      reference: function (reference: InlineReference) {
        const { title, link } = convertInlineReferenceLink(reference);
        // The title is emitted as part of the closing fragment, so it is
        // rendered after whatever text the reference node itself contains.
        return ['[', `${title}](${link})`];
      },
      strike: function () {
        return ['~~', '~~'];
      },
      code: function () {
        return ['`', '`'];
      },
    },

    block: {
      header: function ({ header }) {
        this.open = '#'.repeat(header) + ' ' + this.open;
      },
      blockquote: function () {
        this.open = '> ' + this.open;
      },
      list: {
        group: function () {
          return new Node(['', '\n']);
        },
        line: function (attrs, group) {
          if (attrs.list === 'bullet') {
            this.open = '- ' + this.open;
          } else if (attrs.list === 'checked') {
            this.open = '- [x] ' + this.open;
          } else if (attrs.list === 'unchecked') {
            this.open = '- [ ] ' + this.open;
          } else if (attrs.list === 'ordered') {
            // The running item number is kept on the shared group object.
            group.count = group.count || 0;
            const count = ++group.count;
            this.open = count + '. ' + this.open;
          }
        },
      },
    },
  };
}
|
||||
147
packages/common/reader/src/doc-parser/delta-to-md/delta-to-md.ts
Normal file
147
packages/common/reader/src/doc-parser/delta-to-md/delta-to-md.ts
Normal file
@@ -0,0 +1,147 @@
|
||||
// eslint-disable
|
||||
// @ts-nocheck
|
||||
import { Node } from './utils/node';
|
||||
|
||||
/**
 * Converts a delta (array of insert operations) to Markdown.
 *
 * @param delta The delta operations to convert.
 * @param converters Converter table, as produced by `getConverters`.
 * @returns The rendered Markdown with trailing whitespace removed and
 *   exactly one trailing newline appended.
 */
export const deltaToMd = (delta, converters) => {
  const rendered = convert(delta, converters).render();
  return rendered.trimEnd() + '\n';
};
|
||||
|
||||
/**
 * Builds a tree of `Node`s from delta operations.
 *
 * The root node holds one line node (`['', '\n']`) per output line; inline
 * formats become nested nodes inside the current line. Object inserts are
 * dispatched to `converters.embed`, string inserts are split on `\n`.
 * Operations whose `insert` is neither an object nor a string are skipped.
 *
 * @param ops Delta operations.
 * @param converters Converter table with `embed`, `inline` and `block`.
 * @returns The root node; call `render()` on it to get the Markdown.
 */
function convert(ops, converters) {
  // `el` is the node text is currently appended to; `line` is the current
  // line node; `group` tracks an open block group (e.g. a list).
  let group, line, el, activeInline, beginningOfLine;
  const root = new Node();

  function newLine() {
    el = line = new Node(['', '\n']);
    root.append(line);
    activeInline = {};
  }
  newLine();

  for (let i = 0; i < ops.length; i++) {
    const op = ops[i];

    if (typeof op.insert === 'object') {
      for (const k in op.insert) {
        if (converters.embed[k]) {
          applyInlineAttributes(op.attributes);
          converters.embed[k].call(el, op.insert[k], op.attributes);
        }
      }
    } else if (typeof op.insert === 'string') {
      const lines = op.insert.split('\n');

      if (hasBlockLevelAttribute(op.attributes, converters)) {
        // Some line-level styling (ie headings) is applied by inserting a \n
        // with the style; the style applies back to the previous \n.
        // There *should* only be one style in an insert operation.

        for (let j = 1; j < lines.length; j++) {
          for (const attr in op.attributes) {
            if (converters.block[attr]) {
              let fn = converters.block[attr];
              if (typeof fn === 'object') {
                // Grouping converter: a different block type ends the
                // current group.
                if (group && group.type !== attr) {
                  group = null;
                }
                if (!group && fn.group) {
                  group = {
                    el: fn.group(),
                    type: attr,
                    value: op.attributes[attr],
                    distance: 0,
                  };
                  root.append(group.el);
                }

                if (group) {
                  // Move the finished line under the group node.
                  group.el.append(line);
                  group.distance = 0;
                }
                fn = fn.line;
              }

              fn.call(line, op.attributes, group);
              newLine();
              // Only the first attribute with a block converter is applied.
              break;
            }
          }
        }
        beginningOfLine = true;
      } else {
        for (let l = 0; l < lines.length; l++) {
          // Each new plain line increases the distance from the open group;
          // the group is dropped once the distance reaches 2.
          if ((l > 0 || beginningOfLine) && group && ++group.distance >= 2) {
            group = null;
          }
          applyInlineAttributes(
            op.attributes,
            ops[i + 1] && ops[i + 1].attributes
          );
          el.append(lines[l]);
          if (l < lines.length - 1) {
            newLine();
          }
        }
        beginningOfLine = false;
      }
    }
  }

  return root;

  /**
   * Moves `el` so that it sits inside nodes for exactly the inline formats in
   * `attrs`: leaves format nodes that no longer apply, then opens the missing
   * ones. Formats shared with the `next` operation are opened first so they
   * become the outermost tags.
   */
  function applyInlineAttributes(attrs, next?: any) {
    const first: any[] = [];
    const then: any[] = [];
    attrs = attrs || {};

    let tag = el;
    const seen: Record<string, boolean> = {};
    // Walk up through the enclosing format nodes.
    while (tag._format) {
      seen[tag._format] = true;
      // Leave a format node when its format is no longer set, or when its
      // open and close fragments differ.
      if (!attrs[tag._format] || tag.open !== tag.close) {
        for (const k in seen) {
          delete activeInline[k];
        }
        el = tag.parent();
      }

      tag = tag.parent();
    }

    for (const attr in attrs) {
      if (converters.inline[attr] && attrs[attr]) {
        if (activeInline[attr] && activeInline[attr] === attrs[attr]) {
          continue; // do nothing -- we should already be inside this style's tag
        }

        if (next && attrs[attr] === next[attr]) {
          first.push(attr); // if the next operation has the same style, this should be the outermost tag
        } else {
          then.push(attr);
        }
        activeInline[attr] = attrs[attr];
      }
    }

    first.forEach(apply);
    then.forEach(apply);

    function apply(fmt) {
      let newEl = converters.inline[fmt].call(null, attrs[fmt]);
      if (Array.isArray(newEl)) {
        newEl = new Node(newEl);
      }
      newEl._format = fmt;
      el.append(newEl);
      el = newEl;
    }
  }
}
|
||||
|
||||
/**
 * Tells whether any attribute of an operation has a block-level converter.
 *
 * @param attrs Attributes of a delta operation; may be undefined.
 * @param converters Converter table; only `converters.block` is consulted.
 * @returns `true` when at least one attribute name is an own enumerable key
 *   of `converters.block`, otherwise `false`.
 */
function hasBlockLevelAttribute(attrs, converters) {
  for (const key in attrs) {
    // Same membership test as `Object.keys(block).includes(key)`, without
    // rebuilding and scanning the key array for every attribute.
    if (Object.prototype.propertyIsEnumerable.call(converters.block, key)) {
      return true;
    }
  }
  return false;
}
|
||||
@@ -0,0 +1,2 @@
|
||||
export { getConverters } from './delta-converters';
|
||||
export { deltaToMd } from './delta-to-md';
|
||||
@@ -0,0 +1,66 @@
|
||||
// eslint-disable
|
||||
// @ts-nocheck
|
||||
// Module-wide counter used to give every node a unique, increasing id.
let id = 0;

/**
 * A node of the Markdown output tree.
 *
 * A node is either a text leaf (`text`) or a container whose rendered
 * children are wrapped between `open` and `close`.
 */
export class Node {
  id = ++id;
  children: Node[];
  open: string;
  close: string;
  text: string;

  /** Name of the inline format this node represents, when it is one. */
  _format: string;
  _parent: Node;

  /**
   * @param data An `[open, close]` pair for a wrapper node, or a string for
   *   a text leaf. Omit it for a plain container.
   */
  constructor(data?: string[] | string) {
    if (Array.isArray(data)) {
      this.open = data[0];
      this.close = data[1];
    } else if (typeof data === 'string') {
      this.text = data;
    }
    this.children = [];
  }

  /**
   * Appends a child, detaching it from its previous parent first.
   * Anything that is not a `Node` is wrapped in a new `Node`.
   */
  append(e: Node | string) {
    const child = e instanceof Node ? e : new Node(e);
    if (child._parent) {
      const idx = child._parent.children.indexOf(child);
      // Guard against a stale parent link: `splice(-1, 1)` would otherwise
      // remove the old parent's last, unrelated child.
      if (idx !== -1) {
        child._parent.children.splice(idx, 1);
      }
    }
    child._parent = this;
    this.children.push(child);
  }

  /**
   * Renders this node and its subtree.
   *
   * When both `open` and `close` are non-empty, the inner text is trimmed
   * and a single leading / trailing space is moved outside the wrapper.
   * A wrapper with identical `open` and `close` and blank content renders
   * as an empty string.
   */
  render(): string {
    const inner =
      (this.text || '') + this.children.map(c => c.render()).join('');
    const trimmed = inner.trim();
    const isBlank = trimmed === '';
    const wrapped = Boolean(this.open && this.close);

    if (isBlank && wrapped && this.open === this.close) {
      return '';
    }

    const movesSpaces = wrapped && !isBlank;
    const leading = movesSpaces && inner.startsWith(' ') ? ' ' : '';
    const trailing = movesSpaces && inner.endsWith(' ') ? ' ' : '';

    return (
      leading +
      (this.open || '') +
      (wrapped ? trimmed : inner) +
      (this.close || '') +
      trailing
    );
  }

  /** Returns the node this node was last appended to, if any. */
  parent() {
    return this._parent;
  }
}
|
||||
@@ -0,0 +1,5 @@
|
||||
/**
 * Encodes a URL for use as a Markdown link target.
 *
 * The link is passed through `encodeURI`, parentheses are percent-encoded so
 * they cannot end the Markdown link target, and a
 * `response-content-disposition=attachment` query parameter is removed
 * together with everything that follows it.
 */
export const encodeLink = (link: string) =>
  encodeURI(link)
    .replace(/\(/g, '%28')
    .replace(/\)/g, '%29')
    .replace(/(\?|&)response-content-disposition=attachment.*$/, '');
|
||||
437
packages/common/reader/src/doc-parser/parser.ts
Normal file
437
packages/common/reader/src/doc-parser/parser.ts
Normal file
@@ -0,0 +1,437 @@
|
||||
import type { ColumnDataType } from '@blocksuite/affine/model';
|
||||
import { Array as YArray, type Map as YMap, type Text as YText } from 'yjs';
|
||||
|
||||
import { deltaToMd, getConverters } from './delta-to-md';
|
||||
import type {
|
||||
BaseParsedBlock,
|
||||
Flavour,
|
||||
ParsedBlock,
|
||||
ParsedDoc,
|
||||
ParserContext,
|
||||
SerializedCells,
|
||||
YBlock,
|
||||
YBlocks,
|
||||
} from './types';
|
||||
|
||||
/**
 * Renders a parsed block and its descendants as Markdown.
 *
 * A block with content contributes its content, every line prefixed with
 * `padding`, followed by a newline and then its children rendered with a
 * deeper padding. A block without content is transparent: only its children
 * are rendered, at the same padding.
 *
 * @param block The parsed block to render.
 * @param padding Prefix put in front of every content line.
 * @returns The Markdown for the block subtree.
 */
export const parseBlockToMd = (
  block: BaseParsedBlock,
  padding = ''
): string => {
  if (!block.content) {
    return block.children.map(child => parseBlockToMd(child, padding)).join('');
  }

  const ownContent = block.content
    .split('\n')
    .map(line => padding + line)
    .join('\n');

  // NOTE(review): the indent step is a single space as the source shows it;
  // confirm this is not a wider indent collapsed by whitespace normalization.
  const nested = block.children
    .map(child => parseBlockToMd(child, padding + ' '))
    .join('');

  return ownContent + '\n' + nested;
};
|
||||
|
||||
/** Markdown prefix for each paragraph `prop:type` that carries one. */
const PARAGRAPH_PREFIXES = new Map<string, string>([
  ['h1', '# '],
  ['h2', '## '],
  ['h3', '### '],
  ['h4', '#### '],
  ['h5', '##### '],
  ['h6', '###### '],
  ['quote', '> '],
]);

/**
 * Renders rows as Markdown table lines followed by a blank line.
 * Cells are stringified and trimmed; newlines inside a row become `<br />`.
 */
function renderMarkdownTable(rows: unknown[][]): string {
  const lines = rows.map(
    row =>
      '|' +
      row
        .map(cell => String(cell || '').trim())
        .join('|')
        .replace(/\n+/g, '<br />') +
      '|'
  );
  return lines.join('\n') + '\n\n';
}

/**
 * Parses one block of the document into a `ParsedBlock`.
 *
 * The block's Markdown goes into `content`; flavour-specific data (source
 * ids, URLs, table rows, ...) is attached to the result as extra properties.
 * Children are parsed recursively, except for databases, whose children are
 * consumed as row titles. Hidden blocks and blocks whose display mode is
 * `edgeless` are returned with empty content and no children.
 *
 * Conversion errors are logged with `console.warn` and do not propagate; the
 * result then keeps whatever had been filled in before the error.
 *
 * @param context Parser context supplying URL builders and doc titles.
 * @param yBlock The block to parse; `undefined` yields `null`.
 * @param yBlocks Map of all blocks, used to resolve children by id.
 * @returns The parsed block, or `null` when `yBlock` is undefined.
 */
export function parseBlock(
  context: ParserContext,
  yBlock: YBlock | undefined,
  yBlocks: YBlocks // all blocks
): ParsedBlock | null {
  if (!yBlock) {
    return null;
  }

  const deltaConverters = getConverters({
    convertInlineReferenceLink: ref => ({
      title: ref.title || context.renderDocTitle?.(ref.pageId) || '',
      link: context.buildDocUrl(ref.pageId),
    }),
  });

  const id = yBlock.get('sys:id') as string;
  const flavour = yBlock.get('sys:flavour') as Flavour;
  const type = yBlock.get('prop:type') as string;
  const toMd = () =>
    deltaToMd((yBlock.get('prop:text') as YText).toDelta(), deltaConverters);
  const hidden = yBlock.get('prop:hidden') as boolean;
  const displayMode = yBlock.get('prop:displayMode') as string;
  const rawChildren = yBlock.get('sys:children');
  const childrenIds: string[] =
    rawChildren instanceof YArray
      ? (rawChildren as YArray<string>).toJSON()
      : [];

  const result: ParsedBlock = {
    id,
    flavour,
    content: '',
    children: [],
    type,
  };

  if (hidden || displayMode === 'edgeless') {
    return result;
  }

  try {
    switch (flavour) {
      case 'affine:paragraph': {
        const prefix = PARAGRAPH_PREFIXES.get(type) ?? '';
        result.content = prefix + toMd() + '\n';
        break;
      }
      case 'affine:divider': {
        result.content = '\n---\n\n';
        break;
      }
      case 'affine:list': {
        // Every list type other than `bulleted` is rendered as a numbered item.
        result.content = (type === 'bulleted' ? '* ' : '1. ') + toMd() + '\n';
        break;
      }
      case 'affine:code': {
        const lang =
          (yBlock.get('prop:language') as string)?.toLowerCase() || 'txt';
        // do not transform to delta for code block
        const caption = yBlock.get('prop:caption') as string;
        result.content =
          '```' +
          lang +
          (caption ? ` ${caption}` : '') +
          '\n' +
          (yBlock.get('prop:text') as YText).toJSON() +
          '\n```\n\n';
        Object.assign(result, { language: lang });
        break;
      }
      case 'affine:image': {
        const sourceId = yBlock.get('prop:sourceId') as string;
        const width = yBlock.get('prop:width');
        const height = yBlock.get('prop:height');
        // fixme: this may not work if workspace is not public
        const blobUrl = context.buildBlobUrl(sourceId);
        const caption = yBlock.get('prop:caption') as string;
        // A missing caption must render as empty text, not as "undefined".
        const altText = caption ?? '';
        if (width || height || caption) {
          result.content =
            [
              '<img',
              `src="${blobUrl}"`,
              `alt="${altText}"`,
              `width="${width || 'auto'}"`,
              `height="${height || 'auto'}"`,
              '/>',
            ].join('\n') + '\n\n\n';
        } else {
          result.content = `\n![${altText}](${blobUrl})\n\n`;
        }
        Object.assign(result, {
          sourceId,
          width,
          height,
          caption,
          blobUrl,
        });

        break;
      }
      case 'affine:attachment': {
        const sourceId = yBlock.get('prop:sourceId') as string;
        const blobUrl = context.buildBlobUrl(sourceId);
        const caption = yBlock.get('prop:caption') as string;
        // `type` may be absent; treat that like any non-video attachment.
        if (type?.startsWith('video')) {
          result.content =
            [
              '<video muted autoplay loop preload="auto" playsinline>',
              `<source src="${blobUrl}" type="${type}" />`,
              '</video>',
            ].join('\n') + '\n\n\n';
        } else {
          // assume it is an image
          result.content = `\n![${caption ?? ''}](${blobUrl})\n\n`;
        }
        Object.assign(result, {
          sourceId,
          blobUrl,
          caption,
        });
        break;
      }
      case 'affine:embed-youtube': {
        const videoId = yBlock.get('prop:videoId') as string;
        result.content =
          [
            '',
            '<iframe',
            'type="text/html"',
            'width="100%"',
            'height="410px"',
            `src="https://www.youtube.com/embed/${videoId}"`,
            'frameborder="0"',
            'allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"',
            'allowfullscreen',
            'credentialless>',
            '</iframe>',
          ].join('\n') + '\n\n';
        Object.assign(result, { videoId });
        break;
      }
      case 'affine:bookmark': {
        const url = yBlock.get('prop:url') as string;
        const caption = yBlock.get('prop:caption') as string;
        result.content = `\n[](Bookmark,${url})\n\n`;
        Object.assign(result, {
          url,
          caption,
        });
        break;
      }
      case 'affine:embed-linked-doc':
      case 'affine:embed-synced-doc': {
        const pageId = yBlock.get('prop:pageId') as string;
        const caption = yBlock.get('prop:caption') as string;
        // Same fallback chain as inline references: caption, then doc title.
        const linkText = caption || context.renderDocTitle?.(pageId) || '';
        result.content = `\n[${linkText}](${context.buildDocUrl(pageId)})\n\n`;
        Object.assign(result, {
          pageId,
          caption,
        });
        break;
      }
      case 'affine:surface':
      case 'affine:page':
      case 'affine:note':
      case 'affine:frame': {
        result.content = '';
        break;
      }
      case 'affine:database': {
        const title = (yBlock.get('prop:title') as YText).toJSON();
        // Each child block is a row; its Markdown is the row's title cell.
        const childrenTitleById = Object.fromEntries(
          childrenIds.map(cid => {
            const child = parseBlock(
              context,
              yBlocks.get(cid) as YBlock | undefined,
              yBlocks
            );
            return [cid, child ? parseBlockToMd(child) : ''] as const;
          })
        );
        const cols = (
          yBlock.get('prop:columns') as YArray<ColumnDataType>
        ).toJSON() as ColumnDataType[];

        const cells = (
          yBlock.get('prop:cells') as YMap<SerializedCells>
        ).toJSON() as SerializedCells;

        const optionToTagHtml = (option: any) => {
          return `<span data-affine-option data-value="${option.id}" data-option-color="${option.color}">${option.value}</span>`;
        };

        const dbRows: string[][] = childrenIds
          .map(cid => {
            const row = cells[cid];
            return cols.map(col => {
              const value = row?.[col.id]?.value;

              if (col.type !== 'title' && !value) {
                return '';
              }

              switch (col.type) {
                case 'title':
                  return childrenTitleById[cid];
                case 'select': {
                  const options = (col.data?.['options'] as any[]) ?? [];
                  const option = options.find((opt: any) => opt.id === value);
                  // An id that matches no option renders as an empty cell
                  // instead of failing the whole table.
                  return option ? optionToTagHtml(option) : '';
                }
                case 'multi-select': {
                  const options = (col.data?.['options'] as any[]) ?? [];
                  return options
                    .filter((opt: any) => (value as string[]).includes(opt.id))
                    .map(optionToTagHtml)
                    .join('');
                }
                default:
                  return value ?? '';
              }
            });
          })
          // Drop rows in which every cell is empty.
          .filter(row => row.some(Boolean));

        const header = cols.map(col => col.name);
        const divider = cols.map(() => '---');

        // convert to markdown table
        result.content = renderMarkdownTable([header, divider, ...dbRows]);

        Object.assign(result, {
          title,
          rows: dbRows.map(row => {
            return Object.fromEntries(row.map((v, i) => [cols[i].name, v]));
          }),
        });
        break;
      }
      case 'affine:table': {
        // Rows, columns and cells are stored as flattened `prop:*` keys.
        const flatEntries = Object.entries(yBlock.toJSON());

        // Extract row IDs and their order
        const rowEntries = flatEntries
          .filter(
            ([key]) => key.startsWith('prop:rows.') && key.endsWith('.rowId')
          )
          .map(([key, value]) => {
            const rowId = value as string;
            const order = yBlock.get(key.replace('.rowId', '.order')) as string;
            const backgroundColor = yBlock.get(
              key.replace('.rowId', '.backgroundColor')
            ) as string | undefined;
            return { rowId, order, backgroundColor };
          })
          .sort((a, b) => a.order.localeCompare(b.order));

        // Extract column IDs and their order
        const columnEntries = flatEntries
          .filter(
            ([key]) =>
              key.startsWith('prop:columns.') && key.endsWith('.columnId')
          )
          .map(([key, value]) => {
            const columnId = value as string;
            const order = yBlock.get(
              key.replace('.columnId', '.order')
            ) as string;
            return { columnId, order };
          })
          .sort((a, b) => a.order.localeCompare(b.order));

        // Build the table rows with cell data
        const tableRows = rowEntries.map(({ rowId }) => {
          return columnEntries.map(({ columnId }) => {
            const cellKey = `prop:cells.${rowId}:${columnId}.text`;
            const cellText = yBlock.get(cellKey) as string | undefined;
            return cellText || '';
          });
        });

        // Store column IDs for reference
        const columnIds = columnEntries.map(({ columnId }) => columnId);

        // Use the first row as header and the rest as data rows
        if (tableRows.length > 0) {
          const [headerRow, ...dataRows] = tableRows;
          const separators = headerRow.map(() => '---');
          result.content = renderMarkdownTable([
            headerRow,
            separators,
            ...dataRows,
          ]);
        } else {
          // Handle empty table case
          result.content = '';
        }

        Object.assign(result, {
          columns: columnIds,
          rows: tableRows,
        });
        break;
      }
      default: {
        // console.warn("Unknown or unsupported flavour", flavour);
      }
    }

    // Children that produced neither content nor children are dropped.
    result.children =
      flavour !== 'affine:database'
        ? childrenIds
            .map(cid =>
              parseBlock(
                context,
                yBlocks.get(cid) as YBlock | undefined,
                yBlocks
              )
            )
            .filter(
              (block): block is ParsedBlock =>
                !!block &&
                !(block.content === '' && block.children.length === 0)
            )
        : [];
  } catch (e) {
    console.warn('Error converting block to md', e);
  }
  return result;
}
|
||||
|
||||
/**
 * Parses a page document into its title, Markdown and parsed block tree.
 *
 * The page block is the last entry of the `blocks` map whose flavour is
 * `affine:page`. Only its `affine:note` children are kept in the result.
 *
 * @param ctx Parser context; `ctx.doc` is the document to parse.
 * @returns The parsed document. Title and Markdown are empty, and
 *   `parsedBlock` is absent, when the document has no page block.
 */
export const parsePageDoc = (ctx: ParserContext): ParsedDoc => {
  const yBlocks: YBlocks = ctx.doc.getMap('blocks');

  // Look the page block up on the map itself instead of serializing every
  // block to JSON first. Later matches win, so the last page block is used.
  let yPage: YBlock | undefined;
  for (const [, block] of yBlocks.entries()) {
    if (block?.get?.('sys:flavour') === 'affine:page') {
      yPage = block;
    }
  }

  // there are cases that the page is empty due to some weird issues
  if (!yPage) {
    return {
      title: '',
      md: '',
    };
  }

  const title = yPage.get('prop:title') as YText | undefined;
  const rootBlock = parseBlock(ctx, yPage, yBlocks);
  if (!rootBlock) {
    return {
      title: '',
      md: '',
    };
  }

  rootBlock.children = rootBlock.children.filter(
    block => block.flavour === 'affine:note'
  );

  return {
    // A page block without a title yields an empty title.
    title: title?.toJSON() ?? '',
    parsedBlock: rootBlock,
    md: parseBlockToMd(rootBlock),
  };
};
|
||||
152
packages/common/reader/src/doc-parser/types.ts
Normal file
152
packages/common/reader/src/doc-parser/types.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
import { type CellDataType } from '@blocksuite/affine/model';
|
||||
import { type Doc as YDoc, type Map as YMap } from 'yjs';
|
||||
|
||||
/**
 * Metadata record of a page in a workspace.
 */
export interface WorkspacePage {
  id: string;
  guid: string;
  title: string;
  /** Creation date as a number. NOTE(review): presumably epoch milliseconds; confirm. */
  createDate: number;
  trash?: boolean;
  favorite?: boolean;
  /** Free-form additional properties. */
  properties?: Record<string, any>;
}
|
||||
|
||||
/** A block flavour name: the given suffix prefixed with `affine:`. */
export type BaseFlavour<T extends string> = `affine:${T}`;

/** The block flavours known to the parser. */
export type Flavour = BaseFlavour<
  | 'page'
  | 'frame'
  | 'paragraph'
  | 'code'
  | 'note'
  | 'list'
  | 'divider'
  | 'embed'
  | 'image'
  | 'surface'
  | 'database'
  | 'table'
  | 'attachment'
  | 'bookmark'
  | 'embed-youtube'
  | 'embed-linked-doc'
  | 'embed-synced-doc'
>;
|
||||
|
||||
/**
 * Fields shared by every parsed block.
 */
export interface BaseParsedBlock {
  id: string;
  flavour: Flavour;
  /** Markdown produced for this block alone, without its children. */
  content: string;
  children: BaseParsedBlock[];
  /** Flavour-specific subtype, taken from the block's `prop:type`. */
  type?: string;
}
|
||||
|
||||
/**
 * Result of parsing a whole page document.
 */
export interface ParsedDoc {
  title: string;
  /** Markdown rendering of the document. */
  md: string;
  /** Root of the parsed block tree; absent when no page block was found. */
  parsedBlock?: ParsedBlock;
}
|
||||
|
||||
/** A paragraph; `type` selects a heading level or a quote. */
export interface ParagraphBlock extends BaseParsedBlock {
  flavour: 'affine:paragraph';
  type: 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6' | 'quote';
}

/** A horizontal divider. */
export interface DividerBlock extends BaseParsedBlock {
  flavour: 'affine:divider';
}

/** A list item. */
export interface ListBlock extends BaseParsedBlock {
  flavour: 'affine:list';
  type: 'bulleted' | 'numbered';
}

/** A fenced code block. */
export interface CodeBlock extends BaseParsedBlock {
  flavour: 'affine:code';
  language: string;
}

/** An image stored as a blob. */
export interface ImageBlock extends BaseParsedBlock {
  flavour: 'affine:image';
  sourceId: string;
  blobUrl: string;
  width?: number;
  height?: number;
  caption?: string;
}

/** A file attachment; `type` is the block's `prop:type` value. */
export interface AttachmentBlock extends BaseParsedBlock {
  flavour: 'affine:attachment';
  type: string;
  sourceId: string;
  /** URL of the attachment blob, as set by the parser. */
  blobUrl?: string;
  caption?: string;
}

/** An embedded YouTube video. */
export interface EmbedYoutubeBlock extends BaseParsedBlock {
  flavour: 'affine:embed-youtube';
  videoId: string;
}

/** A bookmark of an external URL. */
export interface BookmarkBlock extends BaseParsedBlock {
  flavour: 'affine:bookmark';
  url: string;
  caption?: string;
}

/** A link to another document. */
export interface EmbedLinkedDocBlock extends BaseParsedBlock {
  flavour: 'affine:embed-linked-doc';
  pageId: string;
  caption?: string;
}

/** Another document embedded in place. */
export interface EmbedSyncedDocBlock extends BaseParsedBlock {
  flavour: 'affine:embed-synced-doc';
  pageId: string;
  caption?: string;
}

/** A database; each row maps a column name to the rendered cell. */
export interface DatabaseBlock extends BaseParsedBlock {
  title: string;
  flavour: 'affine:database';
  rows: Record<string, string>[];
}

/** A plain table; `rows` holds cell texts, `columns` the column ids. */
export interface TableBlock extends BaseParsedBlock {
  flavour: 'affine:table';
  rows: string[][];
  columns: string[];
}
|
||||
|
||||
/**
 * Any parsed block. Flavours without a dedicated interface are represented
 * by `BaseParsedBlock`.
 */
export type ParsedBlock =
  | ParagraphBlock
  | DividerBlock
  | ListBlock
  | CodeBlock
  | ImageBlock
  | AttachmentBlock
  | EmbedYoutubeBlock
  | BookmarkBlock
  | EmbedLinkedDocBlock
  | EmbedSyncedDocBlock
  | DatabaseBlock
  | TableBlock
  | BaseParsedBlock;
|
||||
|
||||
// NOTE: `ParsedDoc` is declared once, earlier in this file. The second,
// member-for-member identical declaration that used to be here was removed.
|
||||
|
||||
/**
 * Serialized database cells: row id, then column id, then the cell data.
 */
export type SerializedCells = {
  // row
  [key: string]: {
    // column
    [key: string]: CellDataType;
  };
};

/** A single block, stored as a Yjs map of its properties. */
export type YBlock = YMap<unknown>;
/** All blocks of a document, keyed by string. */
export type YBlocks = YMap<YBlock>;
|
||||
|
||||
/**
 * Everything the parser needs besides the blocks themselves.
 */
export interface ParserContext {
  workspaceId: string;
  /** The Yjs document to parse. */
  doc: YDoc;
  /** Builds the URL under which a blob can be fetched. */
  buildBlobUrl: (blobId: string) => string;
  /** Builds the URL of another document, used as a link target. */
  buildDocUrl: (docId: string) => string;
  /** Optional lookup of a document title, used as a link-text fallback. */
  renderDocTitle?: (docId: string) => string;
}
|
||||
@@ -915,3 +915,5 @@ export function readAllDocIdsFromRootDoc(
|
||||
}
|
||||
return Array.from(docIds);
|
||||
}
|
||||
|
||||
export { parseBlock, parseBlockToMd, parsePageDoc } from './doc-parser/parser';
|
||||
|
||||
Reference in New Issue
Block a user