feat: support get dynamic page meta data (#2632)

This commit is contained in:
Qi
2023-06-01 11:03:16 +08:00
committed by Himself65
parent a018d50780
commit 369282e29e
5 changed files with 112 additions and 76 deletions

View File

@@ -0,0 +1,49 @@
import { BrowserWindow } from 'electron';
import type { GetHTMLOptions } from './types';
/**
 * Serializes the document currently loaded in `win`.
 *
 * The snapshot is produced by evaluating `document.documentElement.outerHTML`
 * inside the page through the window's `webContents`.
 *
 * @param win - Window whose page should be serialized.
 * @returns The value the page script evaluates to.
 */
async function getHTMLFromWindow(win: BrowserWindow): Promise<string> {
  const { webContents } = win;
  const markup = await webContents.executeJavaScript(
    `document.documentElement.outerHTML;`
  );
  return markup;
}
/**
 * Loads `url` in a hidden BrowserWindow and resolves with the page's HTML.
 *
 * For ordinary pages the HTML can be captured as soon as loading finishes.
 * Dynamic pages may still be rendering at that point, so after every capture
 * `options.shouldReGetHTML` is asked whether the HTML has to be fetched again;
 * while it answers `true` a new capture is made one second later.
 *
 * The returned promise never rejects: whatever happens, it resolves with the
 * most recent capture (or `''` if nothing was captured) no later than
 * `options.timeout` milliseconds after the call.
 *
 * @param url - Address to load.
 * @param options - `timeout` caps the total wait in milliseconds (default
 *   10000); `shouldReGetHTML` decides whether a capture must be retried. When
 *   it is omitted the first capture is returned immediately.
 * @returns The last captured HTML.
 */
export async function getHTMLByURL(
  url: string,
  options: GetHTMLOptions
): Promise<string> {
  return new Promise(resolve => {
    const { timeout = 10000, shouldReGetHTML } = options;
    const window = new BrowserWindow({
      show: false,
    });
    let html = '';
    let settled = false;
    let retryTimer: ReturnType<typeof setTimeout> | undefined;

    // Single exit point: stops both timers, releases the hidden window and
    // resolves exactly once, no matter which path gets here first.
    const finish = () => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timeoutTimer);
      clearTimeout(retryTimer);
      if (!window.isDestroyed()) {
        window.close();
      }
      resolve(html);
    };

    const timeoutTimer = setTimeout(finish, timeout);

    async function loopHandle() {
      if (settled) {
        return;
      }
      let retry: boolean;
      try {
        html = await getHTMLFromWindow(window);
        // Without a predicate there is nothing to wait for: take the first capture.
        retry = shouldReGetHTML ? await shouldReGetHTML(html) : false;
      } catch {
        // A failed capture or predicate is treated like "not ready yet"; the
        // overall timeout still bounds how long we keep trying.
        retry = true;
      }
      // The timeout may have fired while we were awaiting; the window is gone then.
      if (settled) {
        return;
      }
      if (retry) {
        retryTimer = setTimeout(loopHandle, 1000);
      } else {
        finish();
      }
    }

    // `once`: a page may emit did-finish-load several times, but a single
    // polling loop is enough because it always reads the current document.
    window.webContents.once('did-finish-load', () => {
      void loopHandle();
    });

    window.loadURL(url).catch(() => {
      // Load failures are not fatal here: the timeout resolves with whatever
      // was captured. Handling the rejection avoids an unhandled rejection.
    });
  });
}

View File

@@ -1,23 +1,15 @@
import type { CheerioAPI, Element } from 'cheerio';
import { load } from 'cheerio';
import got from 'got';
import type { Context, MetaData, Options, RuleSet } from './types';
export * from './types';
import { getHTMLByURL } from './get-html';
import { metaDataRules } from './rules';
import type { GetMetaDataOptions } from './types';
const defaultOptions = {
maxRedirects: 5,
ua: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
lang: '*',
timeout: 10000,
forceImageHttps: true,
customRules: {},
};
const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
function runRule(ruleSet: RuleSet, $: CheerioAPI, context: Context) {
let maxScore = 0;
let value;
@@ -58,61 +50,31 @@ const runRule = function (ruleSet: RuleSet, $: CheerioAPI, context: Context) {
}
return undefined;
};
}
const getMetaData = async function (
input: string | Partial<Options>,
inputOptions: Partial<Options> = {}
async function getMetaDataByHTML(
html: string,
url: string,
options: GetMetaDataOptions
) {
let url;
if (typeof input === 'object') {
inputOptions = input;
url = input.url || '';
} else {
url = input;
}
const options = Object.assign({}, defaultOptions, inputOptions);
const { customRules = {} } = options;
const rules: Record<string, RuleSet> = { ...metaDataRules };
Object.keys(options.customRules).forEach((key: string) => {
Object.keys(customRules).forEach((key: string) => {
rules[key] = {
rules: [...metaDataRules[key].rules, ...options.customRules[key].rules],
rules: [...metaDataRules[key].rules, ...customRules[key].rules],
defaultValue:
options.customRules[key].defaultValue ||
metaDataRules[key].defaultValue,
processor:
options.customRules[key].processor || metaDataRules[key].processor,
customRules[key].defaultValue || metaDataRules[key].defaultValue,
processor: customRules[key].processor || metaDataRules[key].processor,
};
});
let html;
if (!options.html) {
const response = await got(url, {
headers: {
'User-Agent': options.ua,
'Accept-Language': options.lang,
},
timeout: options.timeout,
...(options.maxRedirects === 0
? { followRedirect: false }
: { maxRedirects: options.maxRedirects }),
});
html = response.body;
} else {
html = options.html;
}
const metadata: MetaData = {};
const context: Context = {
url,
options,
...options,
};
const $ = load(html);
// console.log('===============================');
// console.log('html');
// console.log(doc);
Object.keys(rules).forEach((key: string) => {
const ruleSet = rules[key];
@@ -120,6 +82,26 @@ const getMetaData = async function (
});
return metadata;
};
}
export { getMetaData };
/**
 * Fetches `url` through `getHTMLByURL` and extracts its metadata.
 *
 * While the page is being fetched, every captured HTML snapshot is parsed and
 * the resulting metadata is handed to `options.shouldReGetHTML` (when given)
 * to decide whether the page should be captured again. If fetching fails, the
 * metadata is extracted from an empty document instead.
 *
 * @param url - Address of the page to inspect.
 * @param options - `customRules` and `forceImageHttps` are forwarded to the
 *   metadata extraction; `shouldReGetHTML` is the retry predicate; everything
 *   else is forwarded to `getHTMLByURL`.
 * @returns The metadata extracted from the final HTML.
 */
export async function getMetaData(url: string, options: Options = {}) {
  const {
    customRules,
    forceImageHttps,
    shouldReGetHTML: shouldRetry,
    ...htmlOptions
  } = options;

  // Both the interim and the final extraction use the same rule settings.
  const extract = (markup: string) =>
    getMetaDataByHTML(markup, url, {
      customRules,
      forceImageHttps,
    });

  let pageHTML: string;
  try {
    pageHTML = await getHTMLByURL(url, {
      ...htmlOptions,
      shouldReGetHTML: async currentHTML => {
        const interim = await extract(currentHTML);
        if (!shouldRetry) {
          return false;
        }
        return await shouldRetry(interim);
      },
    });
  } catch {
    // TODO: report error
    pageHTML = '';
  }

  return extract(pageHTML);
}

View File

@@ -591,7 +591,7 @@ export const metaDataRules: Record<string, RuleSet> = {
],
],
processor: (imageUrl: any, context) =>
context.options.forceImageHttps === true
context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
: makeUrlAbsolute(context.url, imageUrl),
},
@@ -625,7 +625,7 @@ export const metaDataRules: Record<string, RuleSet> = {
},
defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'),
processor: (iconUrl, context) =>
context.options.forceImageHttps === true
context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, iconUrl))
: makeUrlAbsolute(context.url, iconUrl),
},
@@ -654,7 +654,7 @@ export const metaDataRules: Record<string, RuleSet> = {
['meta[name="og:video"][content]', element => element.attribs['content']],
],
processor: (imageUrl: any, context) =>
context.options.forceImageHttps === true
context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
: makeUrlAbsolute(context.url, imageUrl),
},
@@ -683,7 +683,7 @@ export const metaDataRules: Record<string, RuleSet> = {
['meta[name="og:audio"][content]', element => element.attribs['content']],
],
processor: (imageUrl: any, context) =>
context.options.forceImageHttps === true
context.forceImageHttps === true
? makeUrlSecure(makeUrlAbsolute(context.url, imageUrl))
: makeUrlAbsolute(context.url, imageUrl),
},

View File

@@ -1,6 +1,6 @@
import type { Element } from 'cheerio';
export interface MetaData {
export type MetaData = {
title?: string;
description?: string;
icon?: string;
@@ -12,29 +12,32 @@ export interface MetaData {
provider?: string;
[x: string]: string | string[] | undefined;
}
};
export type MetadataRule = [string, (el: Element) => string | null];
export interface Context {
export type Context = {
url: string;
options: Options;
}
} & GetMetaDataOptions;
export interface RuleSet {
export type RuleSet = {
rules: MetadataRule[];
defaultValue?: (context: Context) => string | string[];
scorer?: (el: Element, score: any) => any;
processor?: (input: any, context: Context) => any;
}
};
export interface Options {
maxRedirects?: number;
ua?: string;
lang?: string;
timeout?: number;
forceImageHttps?: boolean;
html?: string;
url?: string;
export type GetMetaDataOptions = {
customRules?: Record<string, RuleSet>;
}
forceImageHttps?: boolean;
};
export type GetHTMLOptions = {
timeout?: number;
shouldReGetHTML?: (currentHTML: string) => boolean | Promise<boolean>;
};
export type Options = {
shouldReGetHTML?: (metaData: MetaData) => boolean | Promise<boolean>;
} & GetMetaDataOptions &
Omit<GetHTMLOptions, 'shouldReGetHTML'>;

View File

@@ -1,4 +1,4 @@
import { app, BrowserWindow, nativeTheme, session } from 'electron';
import { app, BrowserWindow, nativeTheme } from 'electron';
import type { NamespaceHandlers } from '../type';
import { isMacOS } from '../utils';
@@ -42,7 +42,9 @@ export const uiHandlers = {
},
getBookmarkDataByLink: async (_, url: string) => {
return getMetaData(url, {
ua: session.defaultSession.getUserAgent(),
shouldReGetHTML: metaData => {
return !metaData.title && !metaData.description;
},
});
},
} satisfies NamespaceHandlers;