Files
AFFiNE-Mirror/blocksuite/affine/widgets/linked-doc/src/transformers/markdown.ts
Francisco Jiménez 0b1a44863f feat(editor): add obsidian vault import support (#14593)
fix #14592 

### Description
> 🤖 **Note:** The code in this Pull Request were developed with the
assistance of AI, but have been thoroughly reviewed and manually tested.

> I noticed there's a check when opening an issue that asks _"Is your
content generated by AI?"_, so I mention it here in case it's a deal
breaker. If so I understand, you can close the PR, just wanted to share
this in case it's useful anyways.

This PR introduces **Obsidian Vault Import Support** to AFFiNE. 

Previously, users migrating from Obsidian had to rely on the generic
Markdown importer, which often resulted in broken cross-links, missing
directory structures, and metadata conflicts because Obsidian relies
heavily on proprietary structures not supported by standard Markdown.

This completely new feature makes migrating to AFFiNE easy.

**Key Features & Implementations:**

1. **Vault (Directory) Selection**
- Utilizes the `openDirectory` blocksuite utility in the import modal to
allow users to select an entire folder directly from their filesystem,
maintaining file context rather than forcing `.zip` uploads.

2. **Wikilink Resolution (Two-Pass Import)**
- Restructured the `importObsidianVault` process into a two-pass
architecture.
- **Pass 1:** Discovers all files, assigns new AFFiNE document IDs, and
maps them efficiently (by title, alias, and filename) into a
high-performance hash map.
- **Pass 2:** Processes the generic markdown AST and correctly maps
custom `[[wikilinks]]` to the actual pre-registered AFFiNE blocksuite
document IDs via `obsidianWikilinkToDeltaMatcher`.
- Safely strips leading emojis from wikilink aliases to prevent
duplicated page icons rendering mid-sentence.

3. **Emoji Metadata & State Fixes**
- Implemented an aggressive, single-pass RegExp to extract multiple
leading/combining emojis (`Emoji_Presentation` / `\ufe0f`) from H1
headers and Frontmatter. Emojis are assigned specifically to the page
icon metadata property and cleanly stripped from the visual document
title.
- Fixed a core mutation bug where the loop iterating over existing
`docMetas` was aggressively overwriting newly minted IDs for the current
import batch. This fully resolves the issue where imported pages
(especially re-imports) were incorrectly flagged as `trashed`.
   - Enforces explicit `trash: false` patch instructions.

4. **Syntax Conversion**
- Implemented conversion of Obsidian-style Callouts (`> [!NOTE] Title`)
into native AFFiNE block formats (`> 💡 **Title**`).
- Hardened the `blockquote` parser so that nested structures (like `> -
list items`) are fully preserved instead of discarded.

### UI Changes
- Updated the Import Modal to include the "Import Obsidian Vault" flow
utilizing the native filesystem directory picker.
- Regenerated and synced `i18n-completenesses.json` correctly up to 100%
across all supported locales for the new modal string additions.

### Testing Instructions
1. Navigate to the Workspace sidebar and click "Import".
2. Select "Obsidian" and use the directory picker to define a
comprehensive Vault folder.
3. Validate that cross-links between documents automatically resolve to
their specific AFFiNE instances.
4. Validate documents containing leading Emojis display exactly one
Emoji (in the page icon area), and none duplicated in the actual title
header.
5. Validate Callouts are rendered cleanly and correctly, and no
documents are incorrectly marked as "Trash".


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Import Obsidian vaults with wikilink resolution, emoji/title
preservation, asset handling, and automatic document creation.
* Folder-based imports via a Directory Picker (with hidden-input
fallback) integrated into the import dialog.

* **Localization**
  * Added Obsidian import label and tooltip translations.

* **Tests**
* Added end-to-end tests validating Obsidian vault import and asset
handling.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: DarkSky <25152247+darkskygit@users.noreply.github.com>
Co-authored-by: DarkSky <darksky2048@gmail.com>
2026-03-17 00:49:17 +08:00

542 lines
15 KiB
TypeScript

import {
defaultImageProxyMiddleware,
docLinkBaseURLMiddleware,
fileNameMiddleware,
filePathMiddleware,
MarkdownAdapter,
titleMiddleware,
} from '@blocksuite/affine-shared/adapters';
import { Container } from '@blocksuite/global/di';
import { BlockSuiteError, ErrorCode } from '@blocksuite/global/exceptions';
import { sha } from '@blocksuite/global/utils';
import type {
DocMeta,
ExtensionType,
Schema,
Store,
Workspace,
} from '@blocksuite/store';
import { extMimeMap, Transformer } from '@blocksuite/store';
import type { AssetMap, ImportedFileEntry, PathBlobIdMap } from './type.js';
import { createAssetsArchive, download, parseMatter, Unzip } from './utils.js';
export type ParsedFrontmatterMeta = Partial<
Pick<
DocMeta,
'title' | 'createDate' | 'updatedDate' | 'tags' | 'favorite' | 'trash'
>
>;
const FRONTMATTER_KEYS = {
title: ['title', 'name'],
created: [
'created',
'createdat',
'created_at',
'createddate',
'created_date',
'creationdate',
'date',
'time',
],
updated: [
'updated',
'updatedat',
'updated_at',
'updateddate',
'updated_date',
'modified',
'modifiedat',
'modified_at',
'lastmodified',
'last_modified',
'lastedited',
'last_edited',
'lasteditedtime',
'last_edited_time',
],
tags: ['tags', 'tag', 'categories', 'category', 'labels', 'keywords'],
favorite: ['favorite', 'favourite', 'star', 'starred', 'pinned'],
trash: ['trash', 'trashed', 'deleted', 'archived'],
};
const truthyStrings = new Set(['true', 'yes', 'y', '1', 'on']);
const falsyStrings = new Set(['false', 'no', 'n', '0', 'off']);
function parseBoolean(value: unknown): boolean | undefined {
if (typeof value === 'boolean') return value;
if (typeof value === 'number') {
if (value === 1) return true;
if (value === 0) return false;
}
if (typeof value === 'string') {
const normalized = value.trim().toLowerCase();
if (truthyStrings.has(normalized)) return true;
if (falsyStrings.has(normalized)) return false;
}
return undefined;
}
function parseTimestamp(value: unknown): number | undefined {
if (value && value instanceof Date) {
return value.getTime();
}
if (typeof value === 'number' && Number.isFinite(value)) {
return value > 1e10 ? value : Math.round(value * 1000);
}
if (typeof value === 'string') {
const num = Number(value);
if (!Number.isNaN(num)) {
return num > 1e10 ? num : Math.round(num * 1000);
}
const parsed = Date.parse(value);
if (!Number.isNaN(parsed)) {
return parsed;
}
}
return undefined;
}
function parseTags(value: unknown): string[] | undefined {
if (Array.isArray(value)) {
const tags = value
.map(v => (typeof v === 'string' ? v : String(v)))
.map(v => v.trim())
.filter(Boolean);
return tags.length ? [...new Set(tags)] : undefined;
}
if (typeof value === 'string') {
const tags = value
.split(/[,;]+/)
.map(v => v.trim())
.filter(Boolean);
return tags.length ? [...new Set(tags)] : undefined;
}
return undefined;
}
function buildMetaFromFrontmatter(
data: Record<string, unknown>
): ParsedFrontmatterMeta {
const meta: ParsedFrontmatterMeta = {};
for (const [rawKey, value] of Object.entries(data)) {
const key = rawKey.trim().toLowerCase();
if (FRONTMATTER_KEYS.title.includes(key) && typeof value === 'string') {
const title = value.trim();
if (title) meta.title = title;
continue;
}
if (FRONTMATTER_KEYS.created.includes(key)) {
const timestamp = parseTimestamp(value);
if (timestamp !== undefined) {
meta.createDate = timestamp;
}
continue;
}
if (FRONTMATTER_KEYS.updated.includes(key)) {
const timestamp = parseTimestamp(value);
if (timestamp !== undefined) {
meta.updatedDate = timestamp;
}
continue;
}
if (FRONTMATTER_KEYS.tags.includes(key)) {
const tags = parseTags(value);
if (tags) meta.tags = tags;
continue;
}
if (FRONTMATTER_KEYS.favorite.includes(key)) {
const favorite = parseBoolean(value);
if (favorite !== undefined) {
meta.favorite = favorite;
}
continue;
}
if (FRONTMATTER_KEYS.trash.includes(key)) {
const trash = parseBoolean(value);
if (trash !== undefined) {
meta.trash = trash;
}
continue;
}
}
return meta;
}
export function parseFrontmatter(markdown: string): {
content: string;
meta: ParsedFrontmatterMeta;
} {
try {
const parsed = parseMatter(markdown);
if (!parsed) {
return { content: markdown, meta: {} };
}
const content = parsed.body ?? markdown;
if (Array.isArray(parsed.metadata)) {
return { content: String(content), meta: {} };
}
const meta = buildMetaFromFrontmatter({ ...parsed.metadata });
return { content: String(content), meta };
} catch {
return { content: markdown, meta: {} };
}
}
export function applyMetaPatch(
collection: Workspace,
docId: string,
meta: ParsedFrontmatterMeta
) {
const metaPatch: Partial<DocMeta> = {};
if (meta.title) metaPatch.title = meta.title;
if (meta.createDate !== undefined) metaPatch.createDate = meta.createDate;
if (meta.updatedDate !== undefined) metaPatch.updatedDate = meta.updatedDate;
if (meta.tags) metaPatch.tags = meta.tags;
if (meta.favorite !== undefined) metaPatch.favorite = meta.favorite;
if (meta.trash !== undefined) metaPatch.trash = meta.trash;
if (Object.keys(metaPatch).length) {
collection.meta.setDocMeta(docId, metaPatch);
}
}
export function getProvider(extensions: ExtensionType[]) {
const container = new Container();
extensions.forEach(ext => {
ext.setup(container);
});
return container.provider();
}
type ImportMarkdownToBlockOptions = {
doc: Store;
markdown: string;
blockId: string;
extensions: ExtensionType[];
};
type ImportMarkdownToDocOptions = {
collection: Workspace;
schema: Schema;
markdown: string;
fileName?: string;
extensions: ExtensionType[];
};
type ImportMarkdownZipOptions = {
collection: Workspace;
schema: Schema;
imported: Blob;
extensions: ExtensionType[];
};
/**
* Filters hidden/system entries that should never participate in imports.
*/
export function isSystemImportPath(path: string) {
return path.includes('__MACOSX') || path.includes('.DS_Store');
}
/**
* Creates the doc CRUD bridge used by importer transformers.
*/
export function createCollectionDocCRUD(collection: Workspace) {
return {
create: (id: string) => collection.createDoc(id).getStore({ id }),
get: (id: string) => collection.getDoc(id)?.getStore({ id }) ?? null,
delete: (id: string) => collection.removeDoc(id),
};
}
type CreateMarkdownImportJobOptions = {
collection: Workspace;
schema: Schema;
preferredTitle?: string;
fullPath?: string;
};
/**
* Creates a markdown import job with the standard collection middlewares.
*/
export function createMarkdownImportJob({
collection,
schema,
preferredTitle,
fullPath,
}: CreateMarkdownImportJobOptions) {
return new Transformer({
schema,
blobCRUD: collection.blobSync,
docCRUD: createCollectionDocCRUD(collection),
middlewares: [
defaultImageProxyMiddleware,
fileNameMiddleware(preferredTitle),
docLinkBaseURLMiddleware(collection.id),
...(fullPath ? [filePathMiddleware(fullPath)] : []),
],
});
}
type StageImportedAssetOptions = {
pendingAssets: AssetMap;
pendingPathBlobIdMap: PathBlobIdMap;
path: string;
content: Blob;
fileName: string;
};
/**
* Hashes a non-markdown import file and stages it into the shared asset maps.
*/
export async function stageImportedAsset({
pendingAssets,
pendingPathBlobIdMap,
path,
content,
fileName,
}: StageImportedAssetOptions) {
const ext = path.split('.').at(-1) ?? '';
const mime = extMimeMap.get(ext.toLowerCase()) ?? '';
const key = await sha(await content.arrayBuffer());
pendingPathBlobIdMap.set(path, key);
pendingAssets.set(key, new File([content], fileName, { type: mime }));
}
/**
* Binds previously staged asset files into a transformer job before import.
*/
export function bindImportedAssetsToJob(
job: Transformer,
pendingAssets: AssetMap,
pendingPathBlobIdMap: PathBlobIdMap
) {
const pathBlobIdMap = job.assetsManager.getPathBlobIdMap();
// Iterate over all assets to be imported
for (const [assetPath, key] of pendingPathBlobIdMap.entries()) {
// Get the relative path of the asset to the markdown file
// Store the path to blobId map
pathBlobIdMap.set(assetPath, key);
// Store the asset to assets, the key is the blobId, the value is the file object
// In block adapter, it will use the blobId to get the file object
const assetFile = pendingAssets.get(key);
if (assetFile) {
job.assets.set(key, assetFile);
}
}
return pathBlobIdMap;
}
/**
* Exports a doc to a Markdown file or a zip archive containing Markdown and assets.
* @param doc The doc to export
* @returns A Promise that resolves when the export is complete
*/
async function exportDoc(doc: Store) {
const provider = doc.provider;
const job = doc.getTransformer([
docLinkBaseURLMiddleware(doc.workspace.id),
titleMiddleware(doc.workspace.meta.docMetas),
]);
const snapshot = job.docToSnapshot(doc);
const adapter = new MarkdownAdapter(job, provider);
if (!snapshot) {
return;
}
const markdownResult = await adapter.fromDocSnapshot({
snapshot,
assets: job.assetsManager,
});
let downloadBlob: Blob;
const docTitle = doc.meta?.title || 'Untitled';
let name: string;
const contentBlob = new Blob([markdownResult.file], { type: 'plain/text' });
if (markdownResult.assetsIds.length > 0) {
if (!job.assets) {
throw new BlockSuiteError(ErrorCode.ValueNotExists, 'No assets found');
}
const zip = await createAssetsArchive(job.assets, markdownResult.assetsIds);
await zip.file('index.md', contentBlob);
downloadBlob = await zip.generate();
name = `${docTitle}.zip`;
} else {
downloadBlob = contentBlob;
name = `${docTitle}.md`;
}
download(downloadBlob, name);
}
/**
* Imports Markdown content into a specific block within a doc.
* @param options Object containing import options
* @param options.doc The target doc
* @param options.markdown The Markdown content to import
* @param options.blockId The ID of the block where the content will be imported
* @returns A Promise that resolves when the import is complete
*/
async function importMarkdownToBlock({
doc,
markdown,
blockId,
extensions,
}: ImportMarkdownToBlockOptions) {
const provider = getProvider(extensions);
const job = doc.getTransformer([
defaultImageProxyMiddleware,
docLinkBaseURLMiddleware(doc.workspace.id),
]);
const adapter = new MarkdownAdapter(job, provider);
const snapshot = await adapter.toSliceSnapshot({
file: markdown,
assets: job.assetsManager,
workspaceId: doc.workspace.id,
pageId: doc.id,
});
if (!snapshot) {
throw new BlockSuiteError(
BlockSuiteError.ErrorCode.ValueNotExists,
'import markdown failed, expected to get a snapshot'
);
}
const blocks = snapshot.content.flatMap(x => x.children);
for (const block of blocks) {
await job.snapshotToBlock(block, doc, blockId);
}
return;
}
/**
* Imports Markdown content into a new doc within a collection.
* @param options Object containing import options
* @param options.collection The target doc collection
* @param options.schema The schema of the target doc collection
* @param options.markdown The Markdown content to import
* @param options.fileName Optional filename for the imported doc
* @returns A Promise that resolves to the ID of the newly created doc, or undefined if import fails
*/
async function importMarkdownToDoc({
collection,
schema,
markdown,
fileName,
extensions,
}: ImportMarkdownToDocOptions) {
const { content, meta } = parseFrontmatter(markdown);
const preferredTitle = meta.title ?? fileName;
const provider = getProvider(extensions);
const job = createMarkdownImportJob({
collection,
schema,
preferredTitle,
});
const mdAdapter = new MarkdownAdapter(job, provider);
const page = await mdAdapter.toDoc({
file: content,
assets: job.assetsManager,
});
if (!page) {
return;
}
applyMetaPatch(collection, page.id, meta);
return page.id;
}
/**
* Imports a zip file containing Markdown files and assets into a collection.
* @param options Object containing import options
* @param options.collection The target doc collection
* @param options.schema The schema of the target doc collection
* @param options.imported The zip file as a Blob
* @returns A Promise that resolves to an array of IDs of the newly created docs
*/
async function importMarkdownZip({
collection,
schema,
imported,
extensions,
}: ImportMarkdownZipOptions) {
const provider = getProvider(extensions);
const unzip = new Unzip();
await unzip.load(imported);
const docIds: string[] = [];
const pendingAssets: AssetMap = new Map();
const pendingPathBlobIdMap: PathBlobIdMap = new Map();
const markdownBlobs: ImportedFileEntry[] = [];
// Iterate over all files in the zip
for (const { path, content: blob } of unzip) {
// Skip the files that are not markdown files
if (isSystemImportPath(path)) {
continue;
}
// Get the file name
const fileName = path.split('/').pop() ?? '';
// If the file is a markdown file, store it to markdownBlobs
if (fileName.endsWith('.md')) {
markdownBlobs.push({
filename: fileName,
contentBlob: blob,
fullPath: path,
});
} else {
await stageImportedAsset({
pendingAssets,
pendingPathBlobIdMap,
path,
content: blob,
fileName,
});
}
}
await Promise.all(
markdownBlobs.map(async markdownFile => {
const { filename, contentBlob, fullPath } = markdownFile;
const fileNameWithoutExt = filename.replace(/\.[^/.]+$/, '');
const markdown = await contentBlob.text();
const { content, meta } = parseFrontmatter(markdown);
const preferredTitle = meta.title ?? fileNameWithoutExt;
const job = createMarkdownImportJob({
collection,
schema,
preferredTitle,
fullPath,
});
bindImportedAssetsToJob(job, pendingAssets, pendingPathBlobIdMap);
const mdAdapter = new MarkdownAdapter(job, provider);
const doc = await mdAdapter.toDoc({
file: content,
assets: job.assetsManager,
});
if (doc) {
applyMetaPatch(collection, doc.id, meta);
docIds.push(doc.id);
}
})
);
return docIds;
}
export const MarkdownTransformer = {
exportDoc,
importMarkdownToBlock,
importMarkdownToDoc,
importMarkdownZip,
};