mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-25 18:26:05 +08:00
feat: support get dynamic page meta data (#2632)
This commit is contained in:
49
apps/electron/layers/main/src/ui/get-meta-data/get-html.ts
Normal file
49
apps/electron/layers/main/src/ui/get-meta-data/get-html.ts
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
import { BrowserWindow } from 'electron';
|
||||||
|
|
||||||
|
import type { GetHTMLOptions } from './types';
|
||||||
|
|
||||||
|
/**
 * Reads the serialized markup of the document currently loaded in `win`.
 *
 * @param win - Window whose web contents are evaluated.
 * @returns The value of `document.documentElement.outerHTML` evaluated in the page.
 */
async function getHTMLFromWindow(win: BrowserWindow): Promise<string> {
  const { webContents } = win;
  const serialized = await webContents.executeJavaScript(
    `document.documentElement.outerHTML;`
  );
  return serialized;
}
|
||||||
|
|
||||||
|
/**
 * Loads `url` in a hidden `BrowserWindow` and resolves with the page's HTML.
 *
 * For normal web pages the HTML can be read as soon as loading finishes. Some
 * dynamic pages keep rendering after that point, so when
 * `options.shouldReGetHTML` is supplied the HTML is read again once per second
 * for as long as the predicate returns `true`.
 *
 * The returned promise never rejects: it always settles with the most recent
 * HTML snapshot (an empty string if nothing could be read) and the hidden
 * window is closed exactly once.
 *
 * @param url - Address of the page to load.
 * @param options - `timeout` caps the total wait in milliseconds (default
 *   10000); `shouldReGetHTML` decides whether the latest snapshot should be
 *   read again.
 * @returns The latest HTML snapshot of the page.
 */
export async function getHTMLByURL(
  url: string,
  options: GetHTMLOptions
): Promise<string> {
  return new Promise<string>(resolve => {
    const { timeout = 10000, shouldReGetHTML } = options;
    const window = new BrowserWindow({
      show: false,
    });
    let html = '';
    let settled = false;
    let pollTimer: ReturnType<typeof setTimeout> | undefined;

    // Single exit point: stops every timer, releases the window and resolves.
    // Safe to call more than once; only the first call has any effect.
    const finish = () => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timeoutTimer);
      if (pollTimer !== undefined) {
        clearTimeout(pollTimer);
      }
      // The window may already have been destroyed; closing it again would throw.
      if (!window.isDestroyed()) {
        window.close();
      }
      resolve(html);
    };

    const timeoutTimer = setTimeout(finish, timeout);

    async function loopHandle(): Promise<void> {
      if (settled) {
        return;
      }
      try {
        const snapshot = await getHTMLFromWindow(window);
        // The timeout may have fired while the snapshot was being read.
        if (settled) {
          return;
        }
        html = snapshot;

        if (shouldReGetHTML && (await shouldReGetHTML(html))) {
          if (!settled) {
            pollTimer = setTimeout(() => {
              void loopHandle();
            }, 1000);
          }
          return;
        }
      } catch {
        // Reading the page or evaluating the predicate failed; fall through
        // and settle with the last snapshot instead of leaking a rejection.
      }
      finish();
    }

    // `once` prevents later navigations from starting parallel polling loops.
    window.webContents.once('did-finish-load', () => {
      void loopHandle();
    });

    // loadURL rejects on load failures and on interrupted navigations (e.g.
    // redirects), so a rejection is not treated as final: it is swallowed here
    // and the timeout settles the promise if the page never finishes loading.
    window.loadURL(url).catch(() => {
      // intentionally ignored, see comment above
    });
  });
}
|
||||||
@@ -1,23 +1,15 @@
|
|||||||
import type { CheerioAPI, Element } from 'cheerio';
|
import type { CheerioAPI, Element } from 'cheerio';
|
||||||
import { load } from 'cheerio';
|
import { load } from 'cheerio';
|
||||||
import got from 'got';
|
|
||||||
|
|
||||||
import type { Context, MetaData, Options, RuleSet } from './types';
|
import type { Context, MetaData, Options, RuleSet } from './types';
|
||||||
|
|
||||||
export * from './types';
|
export * from './types';
|
||||||
|
|
||||||
|
import { getHTMLByURL } from './get-html';
|
||||||
import { metaDataRules } from './rules';
|
import { metaDataRules } from './rules';
|
||||||
|
import type { GetMetaDataOptions } from './types';
|
||||||
|
|
||||||
const defaultOptions = {
|
function runRule(ruleSet: RuleSet, $: CheerioAPI, context: Context) {
|
||||||
maxRedirects: 5,
|
|
||||||
ua: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
|
|
||||||
lang: '*',
|
|
||||||
timeout: 10000,
|
|
||||||
forceImageHttps: true,
|
|
||||||
customRules: {},
|
|
||||||
};
|
|
||||||
|
|
||||||
const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
|
|
||||||
let maxScore = 0;
|
let maxScore = 0;
|
||||||
let value;
|
let value;
|
||||||
|
|
||||||
@@ -58,61 +50,31 @@ const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return undefined;
|
return undefined;
|
||||||
};
|
}
|
||||||
|
|
||||||
const getMetaData = async function (
|
async function getMetaDataByHTML(
|
||||||
input: string | Partial<Options>,
|
html: string,
|
||||||
inputOptions: Partial<Options> = {}
|
url: string,
|
||||||
|
options: GetMetaDataOptions
|
||||||
) {
|
) {
|
||||||
let url;
|
const { customRules = {} } = options;
|
||||||
if (typeof input === 'object') {
|
|
||||||
inputOptions = input;
|
|
||||||
url = input.url || '';
|
|
||||||
} else {
|
|
||||||
url = input;
|
|
||||||
}
|
|
||||||
|
|
||||||
const options = Object.assign({}, defaultOptions, inputOptions);
|
|
||||||
|
|
||||||
const rules: Record<string, RuleSet> = { ...metaDataRules };
|
const rules: Record<string, RuleSet> = { ...metaDataRules };
|
||||||
Object.keys(options.customRules).forEach((key: string) => {
|
Object.keys(customRules).forEach((key: string) => {
|
||||||
rules[key] = {
|
rules[key] = {
|
||||||
rules: [...metaDataRules[key].rules, ...options.customRules[key].rules],
|
rules: [...metaDataRules[key].rules, ...customRules[key].rules],
|
||||||
defaultValue:
|
defaultValue:
|
||||||
options.customRules[key].defaultValue ||
|
customRules[key].defaultValue || metaDataRules[key].defaultValue,
|
||||||
metaDataRules[key].defaultValue,
|
processor: customRules[key].processor || metaDataRules[key].processor,
|
||||||
processor:
|
|
||||||
options.customRules[key].processor || metaDataRules[key].processor,
|
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
let html;
|
|
||||||
if (!options.html) {
|
|
||||||
const response = await got(url, {
|
|
||||||
headers: {
|
|
||||||
'User-Agent': options.ua,
|
|
||||||
'Accept-Language': options.lang,
|
|
||||||
},
|
|
||||||
timeout: options.timeout,
|
|
||||||
...(options.maxRedirects === 0
|
|
||||||
? { followRedirect: false }
|
|
||||||
: { maxRedirects: options.maxRedirects }),
|
|
||||||
});
|
|
||||||
html = response.body;
|
|
||||||
} else {
|
|
||||||
html = options.html;
|
|
||||||
}
|
|
||||||
|
|
||||||
const metadata: MetaData = {};
|
const metadata: MetaData = {};
|
||||||
const context: Context = {
|
const context: Context = {
|
||||||
url,
|
url,
|
||||||
options,
|
...options,
|
||||||
};
|
};
|
||||||
|
|
||||||
const $ = load(html);
|
const $ = load(html);
|
||||||
// console.log('===============================');
|
|
||||||
// console.log('html');
|
|
||||||
// console.log(doc);
|
|
||||||
|
|
||||||
Object.keys(rules).forEach((key: string) => {
|
Object.keys(rules).forEach((key: string) => {
|
||||||
const ruleSet = rules[key];
|
const ruleSet = rules[key];
|
||||||
@@ -120,6 +82,26 @@ const getMetaData = async function (
|
|||||||
});
|
});
|
||||||
|
|
||||||
return metadata;
|
return metadata;
|
||||||
};
|
}
|
||||||
|
|
||||||
export { getMetaData };
|
/**
 * Fetches the HTML of `url` through `getHTMLByURL` and extracts its metadata
 * with `getMetaDataByHTML`.
 *
 * While the page is loading, every HTML snapshot is converted to metadata and
 * handed to `options.shouldReGetHTML`; the page is read again for as long as
 * that callback returns `true`. Without a callback the first snapshot is used.
 *
 * @param url - Address of the page to inspect.
 * @param options - Rule options (`customRules`, `forceImageHttps`), the
 *   metadata-based `shouldReGetHTML` predicate, and remaining options that are
 *   forwarded to `getHTMLByURL`.
 * @returns Metadata extracted from the final HTML; if fetching the HTML fails,
 *   metadata extracted from an empty string.
 */
export async function getMetaData(url: string, options: Options = {}) {
  const { customRules, forceImageHttps, shouldReGetHTML, ...other } = options;

  // Runs the rule set against one HTML snapshot of this URL.
  const extract = (markup: string) =>
    getMetaDataByHTML(markup, url, {
      customRules,
      forceImageHttps,
    });

  let finalHTML: string;
  try {
    finalHTML = await getHTMLByURL(url, {
      ...other,
      shouldReGetHTML: async snapshot => {
        const meta = await extract(snapshot);
        if (!shouldReGetHTML) {
          return false;
        }
        return await shouldReGetHTML(meta);
      },
    });
  } catch {
    // TODO: report error
    finalHTML = '';
  }

  return await extract(finalHTML);
}
|
||||||
|
|||||||
@@ -591,7 +591,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
|||||||
],
|
],
|
||||||
],
|
],
|
||||||
processor: (imageUrl: any, context) =>
|
processor: (imageUrl: any, context) =>
|
||||||
context.options.forceImageHttps === true
|
context.forceImageHttps === true
|
||||||
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
||||||
: makeUrlAbsolute(context.url, imageUrl),
|
: makeUrlAbsolute(context.url, imageUrl),
|
||||||
},
|
},
|
||||||
@@ -625,7 +625,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
|||||||
},
|
},
|
||||||
defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'),
|
defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'),
|
||||||
processor: (iconUrl, context) =>
|
processor: (iconUrl, context) =>
|
||||||
context.options.forceImageHttps === true
|
context.forceImageHttps === true
|
||||||
? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl))
|
? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl))
|
||||||
: makeUrlAbsolute(context.url, iconUrl),
|
: makeUrlAbsolute(context.url, iconUrl),
|
||||||
},
|
},
|
||||||
@@ -654,7 +654,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
|||||||
['meta[name="og:video"][content]', element => element.attribs['content']],
|
['meta[name="og:video"][content]', element => element.attribs['content']],
|
||||||
],
|
],
|
||||||
processor: (imageUrl: any, context) =>
|
processor: (imageUrl: any, context) =>
|
||||||
context.options.forceImageHttps === true
|
context.forceImageHttps === true
|
||||||
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
||||||
: makeUrlAbsolute(context.url, imageUrl),
|
: makeUrlAbsolute(context.url, imageUrl),
|
||||||
},
|
},
|
||||||
@@ -683,7 +683,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
|||||||
['meta[name="og:audio"][content]', element => element.attribs['content']],
|
['meta[name="og:audio"][content]', element => element.attribs['content']],
|
||||||
],
|
],
|
||||||
processor: (imageUrl: any, context) =>
|
processor: (imageUrl: any, context) =>
|
||||||
context.options.forceImageHttps === true
|
context.forceImageHttps === true
|
||||||
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
||||||
: makeUrlAbsolute(context.url, imageUrl),
|
: makeUrlAbsolute(context.url, imageUrl),
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import type { Element } from 'cheerio';
|
import type { Element } from 'cheerio';
|
||||||
|
|
||||||
export interface MetaData {
|
export type MetaData = {
|
||||||
title?: string;
|
title?: string;
|
||||||
description?: string;
|
description?: string;
|
||||||
icon?: string;
|
icon?: string;
|
||||||
@@ -12,29 +12,32 @@ export interface MetaData {
|
|||||||
provider?: string;
|
provider?: string;
|
||||||
|
|
||||||
[x: string]: string | string[] | undefined;
|
[x: string]: string | string[] | undefined;
|
||||||
}
|
};
|
||||||
|
|
||||||
export type MetadataRule = [string, (el: Element) => string | null];
|
export type MetadataRule = [string, (el: Element) => string | null];
|
||||||
|
|
||||||
export interface Context {
|
export type Context = {
|
||||||
url: string;
|
url: string;
|
||||||
options: Options;
|
} & GetMetaDataOptions;
|
||||||
}
|
|
||||||
|
|
||||||
export interface RuleSet {
|
export type RuleSet = {
|
||||||
rules: MetadataRule[];
|
rules: MetadataRule[];
|
||||||
defaultValue?: (context: Context) => string | string[];
|
defaultValue?: (context: Context) => string | string[];
|
||||||
scorer?: (el: Element, score: any) => any;
|
scorer?: (el: Element, score: any) => any;
|
||||||
processor?: (input: any, context: Context) => any;
|
processor?: (input: any, context: Context) => any;
|
||||||
}
|
};
|
||||||
|
|
||||||
export interface Options {
|
export type GetMetaDataOptions = {
|
||||||
maxRedirects?: number;
|
|
||||||
ua?: string;
|
|
||||||
lang?: string;
|
|
||||||
timeout?: number;
|
|
||||||
forceImageHttps?: boolean;
|
|
||||||
html?: string;
|
|
||||||
url?: string;
|
|
||||||
customRules?: Record<string, RuleSet>;
|
customRules?: Record<string, RuleSet>;
|
||||||
}
|
forceImageHttps?: boolean;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Options accepted by `getHTMLByURL`. */
export type GetHTMLOptions = {
  /** Maximum total wait in milliseconds; `getHTMLByURL` falls back to 10000. */
  timeout?: number;
  /**
   * Called with the HTML just read from the page; return `true` (or a promise
   * of `true`) to have the HTML read again after a delay.
   */
  shouldReGetHTML?: (currentHTML: string) => Promise<boolean> | boolean;
};
|
||||||
|
|
||||||
|
/**
 * Options accepted by `getMetaData`: the rule options, the HTML-loading
 * options, and a `shouldReGetHTML` predicate that receives extracted metadata
 * rather than raw HTML.
 */
export type Options = GetMetaDataOptions &
  Omit<GetHTMLOptions, 'shouldReGetHTML'> & {
    /** Return `true` when the extracted metadata warrants reading the page again. */
    shouldReGetHTML?: (metaData: MetaData) => Promise<boolean> | boolean;
  };
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { app, BrowserWindow, nativeTheme, session } from 'electron';
|
import { app, BrowserWindow, nativeTheme } from 'electron';
|
||||||
|
|
||||||
import type { NamespaceHandlers } from '../type';
|
import type { NamespaceHandlers } from '../type';
|
||||||
import { isMacOS } from '../utils';
|
import { isMacOS } from '../utils';
|
||||||
@@ -42,7 +42,9 @@ export const uiHandlers = {
|
|||||||
},
|
},
|
||||||
getBookmarkDataByLink: async (_, url: string) => {
|
getBookmarkDataByLink: async (_, url: string) => {
|
||||||
return getMetaData(url, {
|
return getMetaData(url, {
|
||||||
ua: session.defaultSession.getUserAgent(),
|
shouldReGetHTML: metaData => {
|
||||||
|
return !metaData.title && !metaData.description;
|
||||||
|
},
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
} satisfies NamespaceHandlers;
|
} satisfies NamespaceHandlers;
|
||||||
|
|||||||
Reference in New Issue
Block a user