feat: replace electron with puppeteer (#2700)

Co-authored-by: himself65 <himself65@outlook.com>
This commit is contained in:
Qi
2023-06-08 17:51:45 +08:00
committed by GitHub
parent de8af5f114
commit fda89b05e7
15 changed files with 134 additions and 982 deletions

View File

@@ -9,15 +9,13 @@
},
"dependencies": {
"@toeverything/plugin-infra": "workspace:*",
"cheerio": "^1.0.0-rc.12"
"link-preview-js": "^3.0.4"
},
"devDependencies": {
"electron": "=25.0.1",
"react": "18.3.0-canary-16d053d59-20230506",
"react-dom": "18.3.0-canary-16d053d59-20230506"
},
"peerDependencies": {
"electron": "*",
"react": "*",
"react-dom": "*"
},

View File

@@ -1,11 +1,56 @@
import { getMetaData } from './server/get-meta-data';
import { getLinkPreview } from 'link-preview-js';
/**
 * Shape of the bookmark metadata returned by `getBookmarkDataByLink`.
 * All named fields are optional; extra string / string-list fields are allowed.
 */
type MetaData = {
  title?: string;
  description?: string;
  icon?: string;
  image?: string;
  // Catch-all for any additional field.
  [x: string]: undefined | string | Array<string>;
};
/**
 * Shape that the result of `getLinkPreview` is cast to in this module.
 * Only `title`, `description`, `favicons` and `images` are read by
 * `getBookmarkDataByLink`.
 */
export interface PreviewType {
  url: string;
  title: string;
  siteName: undefined | string;
  description: undefined | string;
  mediaType: string;
  contentType: undefined | string;
  images: Array<string>;
  videos: Array<{
    url: undefined | string;
    secureUrl: undefined | null | string;
    type: undefined | null | string;
    width: undefined | string;
    height: undefined | string;
  }>;
  favicons: Array<string>;
}
/**
 * Module default export: an object exposing the `getBookmarkDataByLink` handler.
 *
 * NOTE(review): this span appears to interleave lines from two revisions of the
 * handler (a `getMetaData`-based one and a `getLinkPreview`-based one) and is
 * not well-formed as written — confirm against the actual file before editing.
 */
export default {
// Variant that delegates to `getMetaData` and asks for the HTML to be read
// again while neither a title nor a description has been found.
getBookmarkDataByLink: async (_: unknown, url: string) => {
return getMetaData(url, {
shouldReGetHTML: metaData => {
return !metaData.title && !metaData.description;
// Variant that awaits `getLinkPreview(url, ...)` and maps the result to `MetaData`.
getBookmarkDataByLink: async (_: unknown, url: string): Promise<MetaData> => {
const previewData = (await getLinkPreview(url, {
// presumably milliseconds — confirm against the link-preview-js documentation
timeout: 6000,
headers: {
'user-agent': 'googlebot',
},
});
followRedirects: 'follow',
}).catch(() => {
// Any rejection is swallowed and replaced by this empty preview, so the
// mapping below yields an empty title/description and undefined icon/image.
return {
title: '',
siteName: '',
description: '',
images: [],
videos: [],
contentType: `text/html`,
favicons: [],
};
})) as PreviewType;
// Only the first favicon and the first image are surfaced to the caller.
return {
title: previewData.title,
description: previewData.description,
icon: previewData.favicons[0],
image: previewData.images[0],
};
},
};

View File

@@ -1,49 +0,0 @@
import { BrowserWindow } from 'electron';
import type { GetHTMLOptions } from './types';
/**
 * Evaluates `document.documentElement.outerHTML` inside the window's web
 * contents and returns the resulting markup.
 *
 * @param win - Window whose current document should be serialized.
 * @returns The value produced by the evaluated script.
 */
async function getHTMLFromWindow(win: BrowserWindow): Promise<string> {
  const markup: string = await win.webContents.executeJavaScript(
    `document.documentElement.outerHTML;`
  );
  return markup;
}
/**
 * Loads `url` in a hidden `BrowserWindow` and resolves with the page's HTML.
 *
 * For ordinary pages the HTML is returned as soon as the page has finished
 * loading. For dynamic pages the caller can pass `shouldReGetHTML`: while it
 * returns `true` the HTML is read again once per second, until it returns
 * `false` or `timeout` elapses.
 *
 * The returned promise settles exactly once, and the window and all timers are
 * released at that moment. Failures while reading or judging the HTML end the
 * wait early with whatever HTML was captured last (possibly `''`).
 *
 * @param url - Address to load.
 * @param options - `timeout` in milliseconds (default 10000) and the optional
 *   `shouldReGetHTML` predicate.
 * @returns The most recently captured HTML.
 */
export async function getHTMLByURL(
  url: string,
  options: GetHTMLOptions
): Promise<string> {
  return new Promise(resolve => {
    const { timeout = 10000, shouldReGetHTML } = options;
    const window = new BrowserWindow({
      show: false,
    });
    let html = '';
    let settled = false;
    let retryTimer: ReturnType<typeof setTimeout> | undefined;

    // Single exit point: cancel pending timers, close the window, resolve once.
    const finish = () => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timeoutTimer);
      if (retryTimer !== undefined) {
        clearTimeout(retryTimer);
      }
      if (!window.isDestroyed()) {
        window.close();
      }
      resolve(html);
    };
    const timeoutTimer = setTimeout(finish, timeout);

    async function loopHandle(): Promise<void> {
      if (settled) {
        return;
      }
      try {
        html = await getHTMLFromWindow(window);
        if (settled) {
          return;
        }
        if (shouldReGetHTML && (await shouldReGetHTML(html))) {
          // Content is not ready yet: poll again unless the timeout already won.
          if (!settled) {
            retryTimer = setTimeout(() => {
              void loopHandle();
            }, 1000);
          }
          return;
        }
      } catch {
        // Reading or judging the HTML failed; settle with what we have so far.
      }
      finish();
    }

    window.webContents.on('did-finish-load', () => {
      void loopHandle();
    });
    // A failed navigation must not surface as an unhandled rejection; the
    // timeout still guarantees that the promise settles.
    window.loadURL(url).catch(() => undefined);
  });
}

View File

@@ -1,107 +0,0 @@
import type { CheerioAPI, Element } from 'cheerio';
import { load } from 'cheerio';
import type { Context, MetaData, Options, RuleSet } from './types';
export * from './types';
import { getHTMLByURL } from './get-html';
import { metaDataRules } from './rules';
import type { GetMetaDataOptions } from './types';
/**
 * Evaluates one rule set against a parsed document and returns the winning value.
 *
 * Each rule starts with a score of `rules.length - index`, so earlier rules
 * outrank later ones; a truthy result from `ruleSet.scorer` replaces that
 * score. The extractor of the strictly highest-scoring element supplies the
 * value. A truthy value is passed through `ruleSet.processor` when present;
 * otherwise `ruleSet.defaultValue` is consulted, and `undefined` is returned
 * when neither applies.
 */
function runRule(ruleSet: RuleSet, $: CheerioAPI, context: Context) {
  let bestScore = 0;
  let picked;

  for (const [index, [selector, extract]] of ruleSet.rules.entries()) {
    for (const node of Array.from($(selector))) {
      const baseScore = ruleSet.rules.length - index;
      const score =
        (ruleSet.scorer && ruleSet.scorer(node as Element, baseScore)) ||
        baseScore;
      if (score > bestScore) {
        bestScore = score;
        picked = extract(node as Element);
      }
    }
  }

  if (picked) {
    return ruleSet.processor ? ruleSet.processor(picked, context) : picked;
  }
  return ruleSet.defaultValue ? ruleSet.defaultValue(context) : undefined;
}
/**
 * Extracts metadata from an HTML string by running every rule set over it.
 *
 * Custom rule sets are merged onto the built-in `metaDataRules`: custom rules
 * are appended after the built-in ones for the same key, and a custom
 * `defaultValue`, `scorer` or `processor` takes precedence over the built-in
 * one. A custom key with no built-in counterpart is used on its own.
 *
 * @param html - Markup to analyse.
 * @param url - Address the markup came from; exposed to rules via the context.
 * @param options - Custom rules and the `forceImageHttps` flag.
 * @returns One entry per rule-set key; keys without a result hold `undefined`.
 */
async function getMetaDataByHTML(
  html: string,
  url: string,
  options: GetMetaDataOptions
) {
  const { customRules = {} } = options;
  const rules: Record<string, RuleSet> = { ...metaDataRules };
  for (const [key, custom] of Object.entries(customRules)) {
    // A custom key may not exist among the built-in rules.
    const base: RuleSet | undefined = metaDataRules[key];
    rules[key] = {
      rules: [...(base ? base.rules : []), ...custom.rules],
      defaultValue: custom.defaultValue || (base && base.defaultValue),
      // Keep the built-in scorer (e.g. icon size ranking) unless overridden.
      scorer: custom.scorer || (base && base.scorer),
      processor: custom.processor || (base && base.processor),
    };
  }

  const metadata: MetaData = {};
  const context: Context = {
    url,
    ...options,
  };

  const $ = load(html);
  for (const key of Object.keys(rules)) {
    metadata[key] = runRule(rules[key], $, context) || undefined;
  }

  return metadata;
}
/**
 * Fetches `url` and returns the metadata extracted from its HTML.
 *
 * While the page is being read, the metadata is recomputed from each HTML
 * snapshot and handed to `options.shouldReGetHTML` (when provided) to decide
 * whether the HTML should be read again. If fetching the HTML fails, the
 * metadata is extracted from an empty string instead.
 */
export async function getMetaData(url: string, options: Options = {}) {
  const { customRules, forceImageHttps, shouldReGetHTML, ...other } = options;
  const extract = (markup: string) =>
    getMetaDataByHTML(markup, url, {
      customRules,
      forceImageHttps,
    });

  let html: string;
  try {
    html = await getHTMLByURL(url, {
      ...other,
      shouldReGetHTML: async markup => {
        const meta = await extract(markup);
        if (!shouldReGetHTML) {
          return false;
        }
        return await shouldReGetHTML(meta);
      },
    });
  } catch {
    // TODO: report error
    html = '';
  }

  return extract(html);
}

View File

@@ -1,690 +0,0 @@
import type { RuleSet } from './types';
import { getProvider, makeUrlAbsolute, makeUrlSecure, parseUrl } from './utils';
// Local aliases derived from `RuleSet`, so no additional imports are required.
type SelectorRule = RuleSet['rules'][number];
type RuleNode = Parameters<SelectorRule[1]>[0];
type RuleContext = Parameters<NonNullable<RuleSet['processor']>>[1];

/** Builds an extractor that reads the attribute `name` from the matched element. */
const attr = (name: string) => (node: RuleNode) => node.attribs[name];
const contentOf = attr('content');
const hrefOf = attr('href');
const datetimeOf = attr('datetime');
/** Extractor that reads the element's `text` property. */
const textOf = (node: any) => node.text;

/**
 * Returns the `property=` / `name=` pair of `<meta>` rules for one key, in that
 * order. `modifier` is inserted inside the attribute selector (e.g. `' i'`).
 */
function metaPair(key: string, modifier = ''): SelectorRule[] {
  return [
    [`meta[property="${key}"${modifier}][content]`, contentOf],
    [`meta[name="${key}"${modifier}][content]`, contentOf],
  ];
}

/** `<meta>` rule pairs for each key, matched exactly as written. */
const meta = (...keys: string[]): SelectorRule[] =>
  keys.flatMap(key => metaPair(key));

/** `<meta>` rule pairs for each key, using the ` i` attribute-selector flag. */
const metaAnyCase = (...keys: string[]): SelectorRule[] =>
  keys.flatMap(key => metaPair(key, ' i'));

/** Splits a comma-separated string into trimmed items. */
const splitCommaList = (list: any) =>
  list.split(',').map((item: string) => item.trim());

/** Converts a parseable date value to an ISO string, otherwise `undefined`. */
const toIsoTimestamp = (value: any) =>
  Date.parse(value.toString())
    ? new Date(value.toString()).toISOString()
    : undefined;

/** Makes an asset URL absolute and, when `forceImageHttps` is set, https. */
const resolveAssetUrl = (raw: any, context: RuleContext) => {
  const absolute = makeUrlAbsolute(context.url, raw);
  return context.forceImageHttps === true ? makeUrlSecure(absolute) : absolute;
};

/**
 * Built-in extraction rules, keyed by metadata field. Rule order matters:
 * earlier rules in a set take priority over later ones.
 */
export const metaDataRules: Record<string, RuleSet> = {
  title: {
    rules: [
      ...meta('og:title', 'twitter:title', 'parsely-title', 'sailthru.title'),
      ['title', textOf],
    ],
  },
  description: {
    rules: [
      ...meta('og:description'),
      ...metaAnyCase('description'),
      ...meta('sailthru.description', 'twitter:description'),
      ...metaAnyCase('summary'),
    ],
  },
  language: {
    rules: [
      ['html[lang]', attr('lang')],
      ...metaAnyCase('language'),
      ...meta('og:locale'),
    ],
    // Keep only the part before the first '-'.
    processor: (language: any) => language.split('-')[0],
  },
  type: {
    rules: meta('og:type', 'parsely-type', 'medium'),
  },
  url: {
    rules: [
      ...meta('og:url', 'al:web:url', 'parsely-link'),
      ['a.amp-canurl', hrefOf],
      ['link[rel="canonical"][href]', hrefOf],
    ],
    defaultValue: context => context.url,
    processor: (url: any, context) => makeUrlAbsolute(context.url, url),
  },
  provider: {
    rules: [
      ...meta('og:site_name'),
      ...metaAnyCase('publisher', 'application-name'),
      ...meta(
        'al:android:app_name',
        'al:iphone:app_name',
        'al:ipad:app_name',
        'al:ios:app_name',
        'twitter:app:name:iphone',
        'twitter:app:name:ipad',
        'twitter:app:name:googleplay'
      ),
    ],
    defaultValue: context => getProvider(parseUrl(context.url)),
  },
  keywords: {
    rules: [
      ...metaAnyCase('keywords'),
      ...meta('parsely-tags', 'sailthru.tags'),
      ...metaAnyCase('article:tag', 'book:tag', 'topic'),
    ],
    processor: splitCommaList,
  },
  section: {
    rules: meta('article:section', 'category'),
  },
  author: {
    rules: [
      ...metaAnyCase('author'),
      ...meta(
        'article:author',
        'book:author',
        'parsely-author',
        'sailthru.author'
      ),
      ['a[class*="author" i]', textOf],
      ['[rel="author"]', textOf],
      ...meta('twitter:creator', 'profile:username'),
    ],
  },
  published: {
    rules: [
      ...meta(
        'article:published_time',
        'published_time',
        'parsely-pub-date',
        'sailthru.date'
      ),
      ...metaAnyCase('date', 'release_date'),
      ['time[datetime]', datetimeOf],
      ['time[datetime][pubdate]', datetimeOf],
    ],
    processor: toIsoTimestamp,
  },
  modified: {
    rules: [
      ...meta('og:updated_time', 'article:modified_time'),
      ...metaAnyCase('updated_time'),
      ...meta('modified_time', 'revised'),
    ],
    processor: toIsoTimestamp,
  },
  robots: {
    rules: metaAnyCase('robots'),
    processor: splitCommaList,
  },
  copyright: {
    rules: metaAnyCase('copyright'),
  },
  email: {
    rules: metaAnyCase('email', 'reply-to'),
  },
  twitter: {
    rules: meta('twitter:site'),
  },
  facebook: {
    rules: meta('fb:pages'),
  },
  image: {
    rules: meta(
      'og:image:secure_url',
      'og:image:url',
      'og:image',
      'twitter:image',
      'twitter:image:src',
      'thumbnail',
      'parsely-image-url',
      'sailthru.image.full'
    ),
    processor: resolveAssetUrl,
  },
  icon: {
    rules: [
      ['link[rel="apple-touch-icon"][href]', hrefOf],
      ['link[rel="apple-touch-icon-precomposed"][href]', hrefOf],
      ['link[rel="icon" i][href]', hrefOf],
      ['link[rel="fluid-icon"][href]', hrefOf],
      ['link[rel="shortcut icon"][href]', hrefOf],
      ['link[rel="Shortcut Icon"][href]', hrefOf],
      ['link[rel="mask-icon"][href]', hrefOf],
    ],
    // Score an icon by the first number found in its `sizes` attribute.
    scorer: node => {
      const firstNumber = node.attribs['sizes']?.match(/\d+/g)?.[0];
      if (firstNumber === undefined) {
        return undefined;
      }
      const parsed = parseInt(firstNumber);
      return isNaN(parsed) ? undefined : parsed;
    },
    defaultValue: context => makeUrlAbsolute(context.url, '/favicon.ico'),
    processor: resolveAssetUrl,
  },
  video: {
    rules: meta('og:video:secure_url', 'og:video:url', 'og:video'),
    processor: resolveAssetUrl,
  },
  audio: {
    rules: meta('og:audio:secure_url', 'og:audio:url', 'og:audio'),
    processor: resolveAssetUrl,
  },
};

View File

@@ -1,43 +0,0 @@
import type { Element } from 'cheerio';
/**
 * Metadata extracted for a page. Every named field is optional, and further
 * string / string-list fields may be present under arbitrary keys.
 */
export type MetaData = {
  title?: string;
  description?: string;
  icon?: string;
  image?: string;
  keywords?: Array<string>;
  language?: string;
  type?: string;
  url?: string;
  provider?: string;
  [x: string]: undefined | string | Array<string>;
};

/** A selector paired with the function that extracts a value from a match. */
export type MetadataRule = [
  selector: string,
  extract: (el: Element) => string | null,
];

/** Extraction options plus the URL of the page being analysed. */
export type Context = GetMetaDataOptions & {
  url: string;
};

/** An ordered list of rules together with optional post-processing hooks. */
export type RuleSet = {
  /** Rules to try; their position determines the starting score. */
  rules: Array<MetadataRule>;
  /** Supplies a value when no rule produced one. */
  defaultValue?: (context: Context) => string | Array<string>;
  /** May replace the positional score of a matched element. */
  scorer?: (el: Element, score: any) => any;
  /** Transforms the extracted value before it is returned. */
  processor?: (input: any, context: Context) => any;
};

/** Options that influence how metadata is extracted from HTML. */
export type GetMetaDataOptions = {
  customRules?: Record<string, RuleSet>;
  forceImageHttps?: boolean;
};

/** Options that influence how the HTML itself is obtained. */
export type GetHTMLOptions = {
  timeout?: number;
  /** Return `true` to have the HTML read again. */
  shouldReGetHTML?: (currentHTML: string) => boolean | Promise<boolean>;
};

/**
 * Combined options for `getMetaData`: here `shouldReGetHTML` receives the
 * extracted metadata rather than the raw HTML.
 */
export type Options = {
  shouldReGetHTML?: (metaData: MetaData) => boolean | Promise<boolean>;
} & GetMetaDataOptions &
  Omit<GetHTMLOptions, 'shouldReGetHTML'>;

View File

@@ -1,28 +0,0 @@
import { parse, resolve } from 'node:url';
/**
 * Returns `relative` unchanged when it already carries a host; otherwise
 * resolves it against `base`.
 */
export function makeUrlAbsolute(base: string, relative: string): string {
  const { host } = parse(relative);
  if (host !== null) {
    return relative;
  }
  return resolve(base, relative);
}
/** Rewrites a leading `http:` scheme to `https:`; other input is returned as is. */
export function makeUrlSecure(url: string): string {
  const insecureScheme = 'http:';
  if (!url.startsWith(insecureScheme)) {
    return url;
  }
  return `https:${url.slice(insecureScheme.length)}`;
}
/** Returns the hostname of `url`, or an empty string when it has none. */
export function parseUrl(url: string): string {
  const { hostname } = parse(url);
  return hostname ? hostname : '';
}
/**
 * Derives a human-readable provider name from a hostname.
 *
 * A `www…` label is dropped, `.co.` is collapsed to `.`, the last label is
 * removed, and the remaining labels are joined with spaces, e.g.
 * `www.bbc.co.uk` becomes `bbc`.
 *
 * @param host - Hostname such as `www.example.com`.
 * @returns The provider name, or `''` when nothing remains.
 */
export function getProvider(host: string): string {
  return (
    host
      // Only strip `www…` when it starts a label; an unanchored match would
      // also eat the middle of names such as `mywwwsite.com`.
      .replace(/(^|\.)www[a-zA-Z0-9]*\./, '$1')
      .replace('.co.', '.')
      .split('.')
      .slice(0, -1)
      .join(' ')
  );
}