diff --git a/apps/electron/layers/main/src/ui/get-meta-data/get-html.ts b/apps/electron/layers/main/src/ui/get-meta-data/get-html.ts
new file mode 100644
index 0000000000..5ff9134f21
--- /dev/null
+++ b/apps/electron/layers/main/src/ui/get-meta-data/get-html.ts
@@ -0,0 +1,84 @@
+import { BrowserWindow } from 'electron';
+
+import type { GetHTMLOptions } from './types';
+
+/** Returns the serialized `outerHTML` of the document loaded in `win`. */
+async function getHTMLFromWindow(win: BrowserWindow): Promise<string> {
+  return win.webContents.executeJavaScript(
+    `document.documentElement.outerHTML;`
+  );
+}
+
+/**
+ * Loads `url` in a hidden BrowserWindow and resolves with the page's HTML.
+ *
+ * For normal web pages the HTML can be read as soon as loading finishes, but
+ * dynamic pages may still be rendering at that point. After every read,
+ * `options.shouldReGetHTML` decides whether the HTML is read again one second
+ * later; when it is omitted, the first read is returned right away.
+ *
+ * The promise never rejects: once `options.timeout` (default 10000 ms) elapses
+ * it resolves with the most recently read HTML, or '' if nothing was read.
+ */
+export async function getHTMLByURL(
+  url: string,
+  options: GetHTMLOptions
+): Promise<string> {
+  return new Promise(resolve => {
+    const { timeout = 10000, shouldReGetHTML } = options;
+    const window = new BrowserWindow({
+      show: false,
+    });
+    let html = '';
+    let settled = false;
+    let retryTimer: ReturnType<typeof setTimeout> | undefined;
+
+    // Single exit point: stops both timers, closes the hidden window and
+    // resolves exactly once with whatever HTML has been read so far.
+    function finish() {
+      if (settled) {
+        return;
+      }
+      settled = true;
+      clearTimeout(timer);
+      clearTimeout(retryTimer);
+      if (!window.isDestroyed()) {
+        window.close();
+      }
+      resolve(html);
+    }
+
+    const timer = setTimeout(finish, timeout);
+
+    async function loopHandle() {
+      // The timeout may have closed the window while a retry was pending.
+      if (settled || window.isDestroyed()) {
+        return;
+      }
+      try {
+        html = await getHTMLFromWindow(window);
+        if (shouldReGetHTML && (await shouldReGetHTML(html))) {
+          // Not ready yet: poll again in a second unless the timeout fired.
+          if (!settled) {
+            clearTimeout(retryTimer);
+            retryTimer = setTimeout(loopHandle, 1000);
+          }
+          return;
+        }
+      } catch {
+        // TODO: report error. Reading or checking the page failed; a later
+        // 'did-finish-load' or the timeout settles the promise instead.
+        return;
+      }
+      finish();
+    }
+
+    window.webContents.on('did-finish-load', () => {
+      void loopHandle();
+    });
+
+    // A failed load rejects this promise. The timeout already covers that
+    // case, so only keep the rejection from going unhandled.
+    window.loadURL(url).catch(() => undefined);
+  });
+}
diff --git a/apps/electron/layers/main/src/ui/get-meta-data/index.ts b/apps/electron/layers/main/src/ui/get-meta-data/index.ts index f59a7c3808..6aebb2ea62 100644 --- a/apps/electron/layers/main/src/ui/get-meta-data/index.ts +++ b/apps/electron/layers/main/src/ui/get-meta-data/index.ts @@ -1,23 +1,15 @@ import type { CheerioAPI, Element } from 'cheerio'; import { load } from 'cheerio'; -import got from 'got'; import type { Context, MetaData, Options, RuleSet } from './types'; export 
* from './types'; +import { getHTMLByURL } from './get-html'; import { metaDataRules } from './rules'; +import type { GetMetaDataOptions } from './types'; -const defaultOptions = { - maxRedirects: 5, - ua: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', - lang: '*', - timeout: 10000, - forceImageHttps: true, - customRules: {}, -}; - -const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) { +function runRule(ruleSet: RuleSet, $: CheerioAPI, context: Context) { let maxScore = 0; let value; @@ -58,61 +50,31 @@ const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) { } return undefined; -}; +} -const getMetaData = async function ( - input: string | Partial, - inputOptions: Partial = {} +async function getMetaDataByHTML( + html: string, + url: string, + options: GetMetaDataOptions ) { - let url; - if (typeof input === 'object') { - inputOptions = input; - url = input.url || ''; - } else { - url = input; - } - - const options = Object.assign({}, defaultOptions, inputOptions); - + const { customRules = {} } = options; const rules: Record = { ...metaDataRules }; - Object.keys(options.customRules).forEach((key: string) => { + Object.keys(customRules).forEach((key: string) => { rules[key] = { - rules: [...metaDataRules[key].rules, ...options.customRules[key].rules], + rules: [...metaDataRules[key].rules, ...customRules[key].rules], defaultValue: - options.customRules[key].defaultValue || - metaDataRules[key].defaultValue, - processor: - options.customRules[key].processor || metaDataRules[key].processor, + customRules[key].defaultValue || metaDataRules[key].defaultValue, + processor: customRules[key].processor || metaDataRules[key].processor, }; }); - let html; - if (!options.html) { - const response = await got(url, { - headers: { - 'User-Agent': options.ua, - 'Accept-Language': options.lang, - }, - timeout: options.timeout, - ...(options.maxRedirects === 0 - 
? { followRedirect: false } - : { maxRedirects: options.maxRedirects }), - }); - html = response.body; - } else { - html = options.html; - } - const metadata: MetaData = {}; const context: Context = { url, - options, + ...options, }; const $ = load(html); - // console.log('==============================='); - // console.log('html'); - // console.log(doc); Object.keys(rules).forEach((key: string) => { const ruleSet = rules[key]; @@ -120,6 +82,26 @@ const getMetaData = async function ( }); return metadata; -}; +} -export { getMetaData }; +export async function getMetaData(url: string, options: Options = {}) { + const { customRules, forceImageHttps, shouldReGetHTML, ...other } = options; + const html = await getHTMLByURL(url, { + ...other, + shouldReGetHTML: async html => { + const meta = await getMetaDataByHTML(html, url, { + customRules, + forceImageHttps, + }); + return shouldReGetHTML ? await shouldReGetHTML(meta) : false; + }, + }).catch(() => { + // TODO: report error + return ''; + }); + + return await getMetaDataByHTML(html, url, { + customRules, + forceImageHttps, + }); +} diff --git a/apps/electron/layers/main/src/ui/get-meta-data/rules.ts b/apps/electron/layers/main/src/ui/get-meta-data/rules.ts index 7800d67a9a..9f57ac11f3 100644 --- a/apps/electron/layers/main/src/ui/get-meta-data/rules.ts +++ b/apps/electron/layers/main/src/ui/get-meta-data/rules.ts @@ -591,7 +591,7 @@ export const metaDataRules: Record = { ], ], processor: (imageUrl: any, context) => - context.options.forceImageHttps === true + context.forceImageHttps === true ? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl)) : makeUrlAbsolute(context.url, imageUrl), }, @@ -625,7 +625,7 @@ export const metaDataRules: Record = { }, defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'), processor: (iconUrl, context) => - context.options.forceImageHttps === true + context.forceImageHttps === true 
? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl)) : makeUrlAbsolute(context.url, iconUrl), }, @@ -654,7 +654,7 @@ export const metaDataRules: Record = { ['meta[name="og:video"][content]', element => element.attribs['content']], ], processor: (imageUrl: any, context) => - context.options.forceImageHttps === true + context.forceImageHttps === true ? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl)) : makeUrlAbsolute(context.url, imageUrl), }, @@ -683,7 +683,7 @@ export const metaDataRules: Record = { ['meta[name="og:audio"][content]', element => element.attribs['content']], ], processor: (imageUrl: any, context) => - context.options.forceImageHttps === true + context.forceImageHttps === true ? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl)) : makeUrlAbsolute(context.url, imageUrl), }, diff --git a/apps/electron/layers/main/src/ui/get-meta-data/types.ts b/apps/electron/layers/main/src/ui/get-meta-data/types.ts index 4dc7171d59..9662f8fccc 100644 --- a/apps/electron/layers/main/src/ui/get-meta-data/types.ts +++ b/apps/electron/layers/main/src/ui/get-meta-data/types.ts @@ -1,6 +1,6 @@ import type { Element } from 'cheerio'; -export interface MetaData { +export type MetaData = { title?: string; description?: string; icon?: string; @@ -12,29 +12,32 @@ export interface MetaData { provider?: string; [x: string]: string | string[] | undefined; -} +}; export type MetadataRule = [string, (el: Element) => string | null]; -export interface Context { +export type Context = { url: string; - options: Options; -} +} & GetMetaDataOptions; -export interface RuleSet { +export type RuleSet = { rules: MetadataRule[]; defaultValue?: (context: Context) => string | string[]; scorer?: (el: Element, score: any) => any; processor?: (input: any, context: Context) => any; -} +}; -export interface Options { - maxRedirects?: number; - ua?: string; - lang?: string; - timeout?: number; - forceImageHttps?: boolean; - html?: string; - url?: string; +export type GetMetaDataOptions = { 
customRules?: Record; -} + forceImageHttps?: boolean; +}; + +export type GetHTMLOptions = { + timeout?: number; + shouldReGetHTML?: (currentHTML: string) => boolean | Promise; +}; + +export type Options = { + shouldReGetHTML?: (metaData: MetaData) => boolean | Promise; +} & GetMetaDataOptions & + Omit; diff --git a/apps/electron/layers/main/src/ui/index.ts b/apps/electron/layers/main/src/ui/index.ts index 2ea0d7cd19..4226fb38c6 100644 --- a/apps/electron/layers/main/src/ui/index.ts +++ b/apps/electron/layers/main/src/ui/index.ts @@ -1,4 +1,4 @@ -import { app, BrowserWindow, nativeTheme, session } from 'electron'; +import { app, BrowserWindow, nativeTheme } from 'electron'; import type { NamespaceHandlers } from '../type'; import { isMacOS } from '../utils'; @@ -42,7 +42,9 @@ export const uiHandlers = { }, getBookmarkDataByLink: async (_, url: string) => { return getMetaData(url, { - ua: session.defaultSession.getUserAgent(), + shouldReGetHTML: metaData => { + return !metaData.title && !metaData.description; + }, }); }, } satisfies NamespaceHandlers;