mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-13 21:05:19 +00:00
feat: support get dynamic page meta data (#2632)
This commit is contained in:
49
apps/electron/layers/main/src/ui/get-meta-data/get-html.ts
Normal file
49
apps/electron/layers/main/src/ui/get-meta-data/get-html.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
import { BrowserWindow } from 'electron';
|
||||
|
||||
import type { GetHTMLOptions } from './types';
|
||||
|
||||
/**
 * Returns the serialized markup of the document currently loaded in `win`.
 *
 * The snapshot is taken by evaluating `document.documentElement.outerHTML`
 * inside the window's web contents, so it reflects the DOM at the moment of
 * the call rather than the markup originally sent by the server.
 *
 * @param win - Window whose web contents are inspected.
 * @returns The outer HTML of the document's root element.
 */
async function getHTMLFromWindow(win: BrowserWindow): Promise<string> {
  const { webContents } = win;
  const markup: string = await webContents.executeJavaScript(
    `document.documentElement.outerHTML;`
  );
  return markup;
}
|
||||
|
||||
/**
 * Loads `url` in a hidden BrowserWindow and resolves with the page's HTML.
 *
 * For normal web pages the HTML can be read as soon as the page has finished
 * loading. Dynamic pages may still be rendering at that point, so after every
 * snapshot `options.shouldReGetHTML` (when provided) is asked whether the HTML
 * should be read again; while it answers `true`, a new snapshot is taken one
 * second later.
 *
 * The returned promise never rejects. It resolves with:
 * - the first snapshot, when no `shouldReGetHTML` is given;
 * - the first snapshot for which `shouldReGetHTML` answers `false`;
 * - the most recent snapshot (possibly `''`) once `options.timeout`
 *   milliseconds (default 10000) have elapsed, or when taking a snapshot or
 *   evaluating `shouldReGetHTML` fails.
 *
 * @param url - Address of the page to load.
 * @param options - Timeout and optional re-fetch predicate.
 * @returns The latest HTML snapshot obtained from the page.
 */
export async function getHTMLByURL(
  url: string,
  options: GetHTMLOptions
): Promise<string> {
  const { timeout = 10000, shouldReGetHTML } = options;

  return new Promise(resolve => {
    const window = new BrowserWindow({
      show: false,
    });
    let html = '';
    let settled = false;
    let retryTimer: ReturnType<typeof setTimeout> | undefined;

    // Overall deadline: whatever has been captured so far is returned.
    const timeoutTimer = setTimeout(() => finish(), timeout);

    // Single exit point: stops both timers, closes the window once and
    // resolves with the latest snapshot. Safe to call more than once.
    function finish(): void {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timeoutTimer);
      if (retryTimer !== undefined) {
        clearTimeout(retryTimer);
      }
      if (!window.isDestroyed()) {
        window.close();
      }
      resolve(html);
    }

    async function loopHandle(): Promise<void> {
      // The deadline may have fired (and closed the window) while a retry was
      // pending; touching a destroyed window would throw.
      if (settled || window.isDestroyed()) {
        return;
      }

      try {
        html = await getHTMLFromWindow(window);
        if (settled) {
          return;
        }

        if (shouldReGetHTML && (await shouldReGetHTML(html))) {
          if (!settled) {
            retryTimer = setTimeout(() => {
              void loopHandle();
            }, 1000);
          }
          return;
        }
      } catch {
        // Snapshot or predicate failed: polling cannot continue, so fall
        // through and return the best HTML obtained so far.
      }

      finish();
    }

    // `once`: the polling loop keeps running across later navigations, so a
    // second `did-finish-load` must not start a parallel loop.
    window.webContents.once('did-finish-load', () => {
      void loopHandle();
    });

    // A failed load is not fatal here: the deadline still resolves the
    // promise, the rejection just must not go unhandled.
    window.loadURL(url).catch(() => undefined);
  });
}
|
||||
@@ -1,23 +1,15 @@
|
||||
import type { CheerioAPI, Element } from 'cheerio';
|
||||
import { load } from 'cheerio';
|
||||
import got from 'got';
|
||||
|
||||
import type { Context, MetaData, Options, RuleSet } from './types';
|
||||
|
||||
export * from './types';
|
||||
|
||||
import { getHTMLByURL } from './get-html';
|
||||
import { metaDataRules } from './rules';
|
||||
import type { GetMetaDataOptions } from './types';
|
||||
|
||||
const defaultOptions = {
|
||||
maxRedirects: 5,
|
||||
ua: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
|
||||
lang: '*',
|
||||
timeout: 10000,
|
||||
forceImageHttps: true,
|
||||
customRules: {},
|
||||
};
|
||||
|
||||
const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
|
||||
function runRule(ruleSet: RuleSet, $: CheerioAPI, context: Context) {
|
||||
let maxScore = 0;
|
||||
let value;
|
||||
|
||||
@@ -58,61 +50,31 @@ const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
|
||||
}
|
||||
|
||||
return undefined;
|
||||
};
|
||||
}
|
||||
|
||||
const getMetaData = async function (
|
||||
input: string | Partial<Options>,
|
||||
inputOptions: Partial<Options> = {}
|
||||
async function getMetaDataByHTML(
|
||||
html: string,
|
||||
url: string,
|
||||
options: GetMetaDataOptions
|
||||
) {
|
||||
let url;
|
||||
if (typeof input === 'object') {
|
||||
inputOptions = input;
|
||||
url = input.url || '';
|
||||
} else {
|
||||
url = input;
|
||||
}
|
||||
|
||||
const options = Object.assign({}, defaultOptions, inputOptions);
|
||||
|
||||
const { customRules = {} } = options;
|
||||
const rules: Record<string, RuleSet> = { ...metaDataRules };
|
||||
Object.keys(options.customRules).forEach((key: string) => {
|
||||
Object.keys(customRules).forEach((key: string) => {
|
||||
rules[key] = {
|
||||
rules: [...metaDataRules[key].rules, ...options.customRules[key].rules],
|
||||
rules: [...metaDataRules[key].rules, ...customRules[key].rules],
|
||||
defaultValue:
|
||||
options.customRules[key].defaultValue ||
|
||||
metaDataRules[key].defaultValue,
|
||||
processor:
|
||||
options.customRules[key].processor || metaDataRules[key].processor,
|
||||
customRules[key].defaultValue || metaDataRules[key].defaultValue,
|
||||
processor: customRules[key].processor || metaDataRules[key].processor,
|
||||
};
|
||||
});
|
||||
|
||||
let html;
|
||||
if (!options.html) {
|
||||
const response = await got(url, {
|
||||
headers: {
|
||||
'User-Agent': options.ua,
|
||||
'Accept-Language': options.lang,
|
||||
},
|
||||
timeout: options.timeout,
|
||||
...(options.maxRedirects === 0
|
||||
? { followRedirect: false }
|
||||
: { maxRedirects: options.maxRedirects }),
|
||||
});
|
||||
html = response.body;
|
||||
} else {
|
||||
html = options.html;
|
||||
}
|
||||
|
||||
const metadata: MetaData = {};
|
||||
const context: Context = {
|
||||
url,
|
||||
options,
|
||||
...options,
|
||||
};
|
||||
|
||||
const $ = load(html);
|
||||
// console.log('===============================');
|
||||
// console.log('html');
|
||||
// console.log(doc);
|
||||
|
||||
Object.keys(rules).forEach((key: string) => {
|
||||
const ruleSet = rules[key];
|
||||
@@ -120,6 +82,26 @@ const getMetaData = async function (
|
||||
});
|
||||
|
||||
return metadata;
|
||||
};
|
||||
}
|
||||
|
||||
export { getMetaData };
|
||||
/**
 * Fetches the page at `url` and extracts its metadata.
 *
 * The HTML is obtained through `getHTMLByURL`. After each HTML snapshot the
 * metadata is extracted and, when `options.shouldReGetHTML` is provided, handed
 * to it to decide whether the HTML should be fetched again. If obtaining the
 * HTML fails, extraction runs on an empty string instead.
 *
 * @param url - Address of the page to inspect.
 * @param options - Extraction rules, HTML-loading options and the optional
 *   metadata-based re-fetch predicate.
 * @returns The metadata extracted from the final HTML snapshot.
 */
export async function getMetaData(url: string, options: Options = {}) {
  const {
    customRules,
    forceImageHttps,
    shouldReGetHTML: shouldRetry,
    ...htmlOptions
  } = options;

  // Builds a fresh options object per call, as each extraction did before.
  const extract = (source: string) =>
    getMetaDataByHTML(source, url, {
      customRules,
      forceImageHttps,
    });

  let html: string;
  try {
    html = await getHTMLByURL(url, {
      ...htmlOptions,
      shouldReGetHTML: async currentHTML => {
        const meta = await extract(currentHTML);
        if (!shouldRetry) {
          return false;
        }
        return await shouldRetry(meta);
      },
    });
  } catch {
    // TODO: report error
    html = '';
  }

  return await extract(html);
}
|
||||
|
||||
@@ -591,7 +591,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
||||
],
|
||||
],
|
||||
processor: (imageUrl: any, context) =>
|
||||
context.options.forceImageHttps === true
|
||||
context.forceImageHttps === true
|
||||
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
||||
: makeUrlAbsolute(context.url, imageUrl),
|
||||
},
|
||||
@@ -625,7 +625,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
||||
},
|
||||
defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'),
|
||||
processor: (iconUrl, context) =>
|
||||
context.options.forceImageHttps === true
|
||||
context.forceImageHttps === true
|
||||
? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl))
|
||||
: makeUrlAbsolute(context.url, iconUrl),
|
||||
},
|
||||
@@ -654,7 +654,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
||||
['meta[name="og:video"][content]', element => element.attribs['content']],
|
||||
],
|
||||
processor: (imageUrl: any, context) =>
|
||||
context.options.forceImageHttps === true
|
||||
context.forceImageHttps === true
|
||||
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
||||
: makeUrlAbsolute(context.url, imageUrl),
|
||||
},
|
||||
@@ -683,7 +683,7 @@ export const metaDataRules: Record<string, RuleSet> = {
|
||||
['meta[name="og:audio"][content]', element => element.attribs['content']],
|
||||
],
|
||||
processor: (imageUrl: any, context) =>
|
||||
context.options.forceImageHttps === true
|
||||
context.forceImageHttps === true
|
||||
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
|
||||
: makeUrlAbsolute(context.url, imageUrl),
|
||||
},
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import type { Element } from 'cheerio';
|
||||
|
||||
export interface MetaData {
|
||||
export type MetaData = {
|
||||
title?: string;
|
||||
description?: string;
|
||||
icon?: string;
|
||||
@@ -12,29 +12,32 @@ export interface MetaData {
|
||||
provider?: string;
|
||||
|
||||
[x: string]: string | string[] | undefined;
|
||||
}
|
||||
};
|
||||
|
||||
export type MetadataRule = [string, (el: Element) => string | null];
|
||||
|
||||
export interface Context {
|
||||
export type Context = {
|
||||
url: string;
|
||||
options: Options;
|
||||
}
|
||||
} & GetMetaDataOptions;
|
||||
|
||||
export interface RuleSet {
|
||||
export type RuleSet = {
|
||||
rules: MetadataRule[];
|
||||
defaultValue?: (context: Context) => string | string[];
|
||||
scorer?: (el: Element, score: any) => any;
|
||||
processor?: (input: any, context: Context) => any;
|
||||
}
|
||||
};
|
||||
|
||||
export interface Options {
|
||||
maxRedirects?: number;
|
||||
ua?: string;
|
||||
lang?: string;
|
||||
timeout?: number;
|
||||
forceImageHttps?: boolean;
|
||||
html?: string;
|
||||
url?: string;
|
||||
export type GetMetaDataOptions = {
|
||||
customRules?: Record<string, RuleSet>;
|
||||
}
|
||||
forceImageHttps?: boolean;
|
||||
};
|
||||
|
||||
/**
 * Options accepted by `getHTMLByURL`.
 */
export type GetHTMLOptions = {
|
||||
// Milliseconds after which `getHTMLByURL` resolves with the HTML captured so
// far; `getHTMLByURL` falls back to 10000 when this is omitted.
timeout?: number;
|
||||
// Called with each HTML snapshot. While it returns (or resolves to) true,
// `getHTMLByURL` reads the HTML again one second later.
shouldReGetHTML?: (currentHTML: string) => boolean | Promise<boolean>;
|
||||
};
|
||||
|
||||
/**
 * Options accepted by `getMetaData`: the metadata extraction options plus the
 * HTML-loading options, with the re-fetch predicate redefined to receive
 * extracted metadata instead of raw HTML.
 */
export type Options = {
|
||||
// Called with the metadata extracted from the current HTML snapshot; a true
// result asks for the HTML to be fetched again.
shouldReGetHTML?: (metaData: MetaData) => boolean | Promise<boolean>;
|
||||
} & GetMetaDataOptions &
|
||||
Omit<GetHTMLOptions, 'shouldReGetHTML'>;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { app, BrowserWindow, nativeTheme, session } from 'electron';
|
||||
import { app, BrowserWindow, nativeTheme } from 'electron';
|
||||
|
||||
import type { NamespaceHandlers } from '../type';
|
||||
import { isMacOS } from '../utils';
|
||||
@@ -42,7 +42,9 @@ export const uiHandlers = {
|
||||
},
|
||||
getBookmarkDataByLink: async (_, url: string) => {
|
||||
return getMetaData(url, {
|
||||
ua: session.defaultSession.getUserAgent(),
|
||||
shouldReGetHTML: metaData => {
|
||||
return !metaData.title && !metaData.description;
|
||||
},
|
||||
});
|
||||
},
|
||||
} satisfies NamespaceHandlers;
|
||||
|
||||
Reference in New Issue
Block a user