feat: support get dynamic page meta data (#2632)

This commit is contained in:
Qi
2023-06-01 11:03:16 +08:00
committed by Himself65
parent a018d50780
commit 369282e29e
5 changed files with 112 additions and 76 deletions

View File

@@ -0,0 +1,49 @@
import { BrowserWindow } from 'electron';
import type { GetHTMLOptions } from './types';
/**
 * Reads the markup of the page currently loaded in `win`.
 *
 * Evaluates `document.documentElement.outerHTML` inside the window's web
 * contents and resolves with whatever that evaluation yields.
 */
async function getHTMLFromWindow(win: BrowserWindow): Promise<string> {
  const script = `document.documentElement.outerHTML;`;
  const markup: string = await win.webContents.executeJavaScript(script);
  return markup;
}
/**
 * Loads `url` in a hidden BrowserWindow and resolves with the page's HTML.
 *
 * For ordinary pages the HTML captured right after `did-finish-load` is
 * enough. Dynamic pages may still be rendering at that point, so when
 * `options.shouldReGetHTML` is supplied it is called with each captured
 * snapshot; while it returns (or resolves to) `true` the HTML is captured
 * again one second later.
 *
 * The promise never rejects. It resolves with the most recent snapshot
 * (possibly the empty string) as soon as one of these happens:
 *  - a snapshot is captured and no `shouldReGetHTML` was supplied,
 *  - `shouldReGetHTML` reports that the snapshot is good enough,
 *  - capturing a snapshot or `shouldReGetHTML` throws,
 *  - `options.timeout` milliseconds (default 10000) have elapsed.
 *
 * @param url - Address to load in the hidden window.
 * @param options - Timeout and optional re-fetch predicate.
 * @returns The last HTML snapshot captured before finishing.
 */
export async function getHTMLByURL(
  url: string,
  options: GetHTMLOptions
): Promise<string> {
  const { timeout = 10000, shouldReGetHTML } = options;

  return new Promise(resolve => {
    const window = new BrowserWindow({
      show: false,
    });
    let html = '';
    let settled = false;
    let retryTimer: ReturnType<typeof setTimeout> | undefined;

    // Hard upper bound: whatever has been captured so far is returned.
    const timer = setTimeout(finish, timeout);

    // Single exit path: resolves once, cancels pending timers and closes the
    // hidden window. Safe to call repeatedly.
    function finish(): void {
      if (settled) {
        return;
      }
      settled = true;
      resolve(html);
      clearTimeout(timer);
      if (retryTimer !== undefined) {
        clearTimeout(retryTimer);
      }
      if (!window.isDestroyed()) {
        window.close();
      }
    }

    async function loopHandle(): Promise<void> {
      // The window may already be gone if the timeout fired in between polls.
      if (settled || window.isDestroyed()) {
        return;
      }
      html = await getHTMLFromWindow(window);
      if (settled) {
        return;
      }
      if (shouldReGetHTML && (await shouldReGetHTML(html))) {
        if (!settled) {
          retryTimer = setTimeout(poll, 1000);
        }
        return;
      }
      // Either no predicate was given or it accepted this snapshot.
      finish();
    }

    // Runs one polling step; a failure ends the fetch with the best HTML seen
    // so far instead of surfacing as an unhandled rejection.
    function poll(): void {
      loopHandle().catch(() => {
        // TODO: report error
        finish();
      });
    }

    window.webContents.on('did-finish-load', poll);

    // loadURL rejects when the navigation fails or is aborted (for example by
    // a redirect). A later navigation may still succeed, so the rejection is
    // only swallowed here and the timeout remains the final arbiter.
    window.loadURL(url).catch(() => {
      // TODO: report error
    });
  });
}

View File

@@ -1,23 +1,15 @@
import type { CheerioAPI, Element } from 'cheerio'; import type { CheerioAPI, Element } from 'cheerio';
import { load } from 'cheerio'; import { load } from 'cheerio';
import got from 'got';
import type { Context, MetaData, Options, RuleSet } from './types'; import type { Context, MetaData, Options, RuleSet } from './types';
export * from './types'; export * from './types';
import { getHTMLByURL } from './get-html';
import { metaDataRules } from './rules'; import { metaDataRules } from './rules';
import type { GetMetaDataOptions } from './types';
const defaultOptions = { function runRule(ruleSet: RuleSet, $: CheerioAPI, context: Context) {
maxRedirects: 5,
ua: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
lang: '*',
timeout: 10000,
forceImageHttps: true,
customRules: {},
};
const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
let maxScore = 0; let maxScore = 0;
let value; let value;
@@ -58,61 +50,31 @@ const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
} }
return undefined; return undefined;
}; }
const getMetaData = async function ( async function getMetaDataByHTML(
input: string | Partial<Options>, html: string,
inputOptions: Partial<Options> = {} url: string,
options: GetMetaDataOptions
) { ) {
let url; const { customRules = {} } = options;
if (typeof input === 'object') {
inputOptions = input;
url = input.url || '';
} else {
url = input;
}
const options = Object.assign({}, defaultOptions, inputOptions);
const rules: Record<string, RuleSet> = { ...metaDataRules }; const rules: Record<string, RuleSet> = { ...metaDataRules };
Object.keys(options.customRules).forEach((key: string) => { Object.keys(customRules).forEach((key: string) => {
rules[key] = { rules[key] = {
rules: [...metaDataRules[key].rules, ...options.customRules[key].rules], rules: [...metaDataRules[key].rules, ...customRules[key].rules],
defaultValue: defaultValue:
options.customRules[key].defaultValue || customRules[key].defaultValue || metaDataRules[key].defaultValue,
metaDataRules[key].defaultValue, processor: customRules[key].processor || metaDataRules[key].processor,
processor:
options.customRules[key].processor || metaDataRules[key].processor,
}; };
}); });
let html;
if (!options.html) {
const response = await got(url, {
headers: {
'User-Agent': options.ua,
'Accept-Language': options.lang,
},
timeout: options.timeout,
...(options.maxRedirects === 0
? { followRedirect: false }
: { maxRedirects: options.maxRedirects }),
});
html = response.body;
} else {
html = options.html;
}
const metadata: MetaData = {}; const metadata: MetaData = {};
const context: Context = { const context: Context = {
url, url,
options, ...options,
}; };
const $ = load(html); const $ = load(html);
// console.log('===============================');
// console.log('html');
// console.log(doc);
Object.keys(rules).forEach((key: string) => { Object.keys(rules).forEach((key: string) => {
const ruleSet = rules[key]; const ruleSet = rules[key];
@@ -120,6 +82,26 @@ const getMetaData = async function (
}); });
return metadata; return metadata;
}; }
export { getMetaData }; export async function getMetaData(url: string, options: Options = {}) {
const { customRules, forceImageHttps, shouldReGetHTML, ...other } = options;
const html = await getHTMLByURL(url, {
...other,
shouldReGetHTML: async html => {
const meta = await getMetaDataByHTML(html, url, {
customRules,
forceImageHttps,
});
return shouldReGetHTML ? await shouldReGetHTML(meta) : false;
},
}).catch(() => {
// TODO: report error
return '';
});
return await getMetaDataByHTML(html, url, {
customRules,
forceImageHttps,
});
}

View File

@@ -591,7 +591,7 @@ export const metaDataRules: Record<string, RuleSet> = {
], ],
], ],
processor: (imageUrl: any, context) => processor: (imageUrl: any, context) =>
context.options.forceImageHttps === true context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl)) ? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
: makeUrlAbsolute(context.url, imageUrl), : makeUrlAbsolute(context.url, imageUrl),
}, },
@@ -625,7 +625,7 @@ export const metaDataRules: Record<string, RuleSet> = {
}, },
defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'), defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'),
processor: (iconUrl, context) => processor: (iconUrl, context) =>
context.options.forceImageHttps === true context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl)) ? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl))
: makeUrlAbsolute(context.url, iconUrl), : makeUrlAbsolute(context.url, iconUrl),
}, },
@@ -654,7 +654,7 @@ export const metaDataRules: Record<string, RuleSet> = {
['meta[name="og:video"][content]', element => element.attribs['content']], ['meta[name="og:video"][content]', element => element.attribs['content']],
], ],
processor: (imageUrl: any, context) => processor: (imageUrl: any, context) =>
context.options.forceImageHttps === true context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl)) ? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
: makeUrlAbsolute(context.url, imageUrl), : makeUrlAbsolute(context.url, imageUrl),
}, },
@@ -683,7 +683,7 @@ export const metaDataRules: Record<string, RuleSet> = {
['meta[name="og:audio"][content]', element => element.attribs['content']], ['meta[name="og:audio"][content]', element => element.attribs['content']],
], ],
processor: (imageUrl: any, context) => processor: (imageUrl: any, context) =>
context.options.forceImageHttps === true context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl)) ? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
: makeUrlAbsolute(context.url, imageUrl), : makeUrlAbsolute(context.url, imageUrl),
}, },

View File

@@ -1,6 +1,6 @@
import type { Element } from 'cheerio'; import type { Element } from 'cheerio';
export interface MetaData { export type MetaData = {
title?: string; title?: string;
description?: string; description?: string;
icon?: string; icon?: string;
@@ -12,29 +12,32 @@ export interface MetaData {
provider?: string; provider?: string;
[x: string]: string | string[] | undefined; [x: string]: string | string[] | undefined;
} };
export type MetadataRule = [string, (el: Element) => string | null]; export type MetadataRule = [string, (el: Element) => string | null];
export interface Context { export type Context = {
url: string; url: string;
options: Options; } & GetMetaDataOptions;
}
export interface RuleSet { export type RuleSet = {
rules: MetadataRule[]; rules: MetadataRule[];
defaultValue?: (context: Context) => string | string[]; defaultValue?: (context: Context) => string | string[];
scorer?: (el: Element, score: any) => any; scorer?: (el: Element, score: any) => any;
processor?: (input: any, context: Context) => any; processor?: (input: any, context: Context) => any;
} };
export interface Options { export type GetMetaDataOptions = {
maxRedirects?: number;
ua?: string;
lang?: string;
timeout?: number;
forceImageHttps?: boolean;
html?: string;
url?: string;
customRules?: Record<string, RuleSet>; customRules?: Record<string, RuleSet>;
} forceImageHttps?: boolean;
};
export type GetHTMLOptions = {
timeout?: number;
shouldReGetHTML?: (currentHTML: string) => boolean | Promise<boolean>;
};
export type Options = {
shouldReGetHTML?: (metaData: MetaData) => boolean | Promise<boolean>;
} & GetMetaDataOptions &
Omit<GetHTMLOptions, 'shouldReGetHTML'>;

View File

@@ -1,4 +1,4 @@
import { app, BrowserWindow, nativeTheme, session } from 'electron'; import { app, BrowserWindow, nativeTheme } from 'electron';
import type { NamespaceHandlers } from '../type'; import type { NamespaceHandlers } from '../type';
import { isMacOS } from '../utils'; import { isMacOS } from '../utils';
@@ -42,7 +42,9 @@ export const uiHandlers = {
}, },
getBookmarkDataByLink: async (_, url: string) => { getBookmarkDataByLink: async (_, url: string) => {
return getMetaData(url, { return getMetaData(url, {
ua: session.defaultSession.getUserAgent(), shouldReGetHTML: metaData => {
return !metaData.title && !metaData.description;
},
}); });
}, },
} satisfies NamespaceHandlers; } satisfies NamespaceHandlers;