From 981b4efecff23964a1577e59eb0ecebadc24b7b4 Mon Sep 17 00:00:00 2001 From: darkskygit Date: Fri, 14 Feb 2025 09:47:57 +0000 Subject: [PATCH] feat(server): worker improve (#10176) fix AF-2225 --- .../__tests__/__snapshots__/worker.e2e.ts.md | 52 ++++++++++++++ .../__snapshots__/worker.e2e.ts.snap | Bin 674 -> 996 bytes .../server/src/__tests__/worker.e2e.ts | 52 ++++++++++++++ .../server/src/plugins/worker/controller.ts | 67 +++++++++++++++--- .../server/src/plugins/worker/types.ts | 1 + .../src/plugins/worker/utils/encoding.ts | 45 ++++++++++++ 6 files changed, 207 insertions(+), 10 deletions(-) create mode 100644 packages/backend/server/src/plugins/worker/utils/encoding.ts diff --git a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md index bd6979b853..3e91ea9277 100644 --- a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md +++ b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md @@ -76,3 +76,55 @@ Generated by [AVA](https://avajs.dev). url: 'http://example.com/page', videos: [], } + +> Snapshot 5 + + { + charset: 'gbk', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: '你好,世界。', + url: 'http://example.com/gb2312', + videos: [], + } + +> Snapshot 6 + + { + charset: 'shift_jis', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: 'こんにちは、世界。', + url: 'http://example.com/shift-jis', + videos: [], + } + +> Snapshot 7 + + { + charset: 'big5', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: '你好,世界。', + url: 'http://example.com/big5', + videos: [], + } + +> Snapshot 8 + + { + charset: 'euc-kr', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: '안녕하세요, 세계.', + url: 'http://example.com/euc-kr', + videos: [], + } diff --git a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.snap b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.snap index 7d2ba025969c18e95caa4ecbcca8df475de466ee..3136f79eda357589e64ae795f7b9d9d2ea819ebd 100644 GIT binary patch literal 996 zcmVh!usJDg~ucJPGySB?pV(RrKho2(qn(Hqx7T@L=a< zXR;Y)qHDCJ#XZf;_wD<>`Tzdkd$U|D7?xHM^Pe)wrNJCgwrYl+u^sEwY=&29iE&|4 zTa+y+7M69LN;;j5qX+JfY{L$c*iI6!6izTr65?N`)9JP3*U7+oHc9LUFb2Q?@EL&b z0NeyNSq9``kZzC^c97U3Bo5NOopfoI&b*txQL1Mq*vT3bGLY9(WkS%BpZe$#E(AAA znO!x<*p;!VL#s?OC%C*xt8L4Vj2)jicy#LEtCNAegpgF$gu3mDy-$M6$XPpd8Bdzt zC}}PN_!hv=cTbxBm(4mgxSlC9s^23;rY&EPr%3#H051VJ;)^g^D;AmKijbV*rhH+< zap-I+odS6p10-3b6}IMCrA)ARvM?*zVP=+Od96Q^Y`I?eu4i@5W-4< zQc1rN;EzC3d}GpvSl5PFQf(9uxo!I`Ot*S9ER7mvOUQl0d-m?hQJd#xET_VpoRVcw zostJ?j`5=V2az2f$>zti`SG@Q%;v{i;tp}m3NQECzgkY^4%ueO8@*q`yjB5l2EfO6 zHrVe0#5(s{k5;CRVA7{uDpVBB$0MzEphIg7T)T4V`sY_}e|_%S%7q(C=bCf#n^#sMxE^Kz_ozJ! z;1d8pxUsDppsT=3&naPdMzY`kpALN3nc_EfEP{6Z&T(=2!J&OSjJ6425x}L^mqg#z zX(w1s8>~1=+GitqAL@|zW2;Mxx6Um6y7bZN!piE!4|Zo(7glb5wJ_A-Hs98q9>IN% S0B7%i?tcQ&3BSv-4*&q@D)R3D literal 674 zcmV;T0$u$9zHSj4{7p1P0kGfO8^1@?*V)U@E!P#8_-!$0aU^i<<3zaRG6S4Px?IB zj4d+XUE@z;u1!Wa+~HcYW-2|uDzp}1)18i0FzrT6u?{y}W!;CYGvIz<>CxKqy^ZDj z>lvM+4KuN}+o9Ym%DOSV=twsSn$hFXyan(Hz_*h_Gkz!RvOxH5i!=X}h-}If$OXz@ z1#k<%Y6@X7ZZx>+Lnv;D&|F(kiuFq666hi(sn}p`zSpysOg4CNt7rHM51Xd7mmd}P zyI!YV&%1(eyMYMXf7|xXakYH|;3I&qCs*6Cq_!JtE2q@z|8WfB$aw=eh8F?c2JjTX zYXBbrd;u^@e?;PB0+0DzdrCw`$S^r8uXAnOr2_+Koahn(iDIO|y26v8wwC&U#j)e0 zGf@h5YSOu~{@43Q8C^bBL3;JJjEUx}RldVIQNU+B*{Mc$RL6 { fetchSpy.restore(); } + + { + const encoded = [ + { + content: 'xOO6w6OsysC956Gj', + charset: 'gb2312', + }, + { + content: 'grGC8YLJgr+CzYFBkKKKRYFC', + charset: 'shift-jis', + }, + { + content: 'p0GmbqFBpUCsyaFD', + charset: 'big5', + }, + { + content: 'vsiz58fPvLy/5CwgvLyw6C4=', + charset: 'euc-kr', + }, + ]; + + for (const { content, charset } of encoded) { + const before = Buffer.from(` + + + + + + `); + const fakeHTML = new Response(Buffer.concat([before, encoded, after])); + + Object.defineProperty(fakeHTML, 'url', { + value: `http://example.com/${charset}`, + }); + + const fetchSpy = Sinon.stub(global, 'fetch').resolves(fakeHTML); + + await assertAndSnapshot( + '/api/worker/link-preview', + 'should decode HTML content with charset', + { + status: 200, + method: 'POST', + body: { url: `http://example.com/${charset}` }, + } + ); + + fetchSpy.restore(); + } + } }); diff --git a/packages/backend/server/src/plugins/worker/controller.ts b/packages/backend/server/src/plugins/worker/controller.ts index 636a7d2f4e..ef92cfec66 100644 --- a/packages/backend/server/src/plugins/worker/controller.ts +++ b/packages/backend/server/src/plugins/worker/controller.ts @@ -24,6 +24,10 @@ import { parseJson, reduceUrls, } from './utils'; +import { decodeWithCharset } from './utils/encoding'; + +// cache for 30 minutes +const CACHE_TTL = 1000 * 60 * 30; @Public() @Controller('/api/worker') @@ -67,6 +71,25 @@ export class WorkerController { throw new BadRequest(`Invalid URL`); } + const cachedUrl = `image-proxy:${targetURL.toString()}`; + const cachedResponse = await this.cache.get(cachedUrl); + if (cachedResponse) { + const buffer = Buffer.from(cachedResponse, 'base64'); + // if cached response is empty, it means the request is rejected by server previously + if (buffer.length === 0) { + return resp.status(404).header(getCorsHeaders(origin)).send(); + } + return resp + .status(200) + .header({ + 'Access-Control-Allow-Origin': origin, + Vary: 'Origin', + 'Access-Control-Allow-Methods': 'GET', + 'Content-Type': 'image/*', + }) + .send(buffer); + } + const response = await fetch( new Request(targetURL.toString(), { method: 'GET', @@ -75,8 +98,12 @@ export class WorkerController { ); if (response.ok) { const contentType = response.headers.get('Content-Type'); - const contentDisposition = response.headers.get('Content-Disposition'); if (contentType?.startsWith('image/')) { + const buffer = Buffer.from(await response.arrayBuffer()); + await this.cache.set(cachedUrl, buffer.toString('base64'), { + ttl: CACHE_TTL, + }); + const contentDisposition = response.headers.get('Content-Disposition'); return resp .status(200) .header({ @@ -86,11 +113,17 @@ export class WorkerController { 'Content-Type': contentType, 'Content-Disposition': contentDisposition, }) - .send(Buffer.from(await response.arrayBuffer())); + .send(buffer); } else { throw new BadRequest('Invalid content type'); } } else { + if (response.status >= 400 && response.status < 500) { + // rejected by server, cache a empty response + await this.cache.set(cachedUrl, Buffer.from([]).toString('base64'), { + ttl: CACHE_TTL, + }); + } this.logger.error('Failed to fetch image', { origin, url: imageURL, @@ -130,18 +163,19 @@ export class WorkerController { this.logger.debug('Received request', { origin, method: request.method }); - const targetBody = parseJson(request.body); - const targetURL = fixUrl(targetBody?.url); + const requestBody = parseJson(request.body); + const targetURL = fixUrl(requestBody?.url); // not allow same site preview if (!targetURL || isOriginAllowed(targetURL.origin, this.allowedOrigin)) { - this.logger.error('Invalid URL', { origin, url: targetBody?.url }); + this.logger.error('Invalid URL', { origin, url: requestBody?.url }); throw new BadRequest('Invalid URL'); } this.logger.debug('Processing request', { origin, url: targetURL }); try { - const cachedResponse = await this.cache.get(targetURL.toString()); + const cachedUrl = `link-preview:${targetURL.toString()}`; + const cachedResponse = await this.cache.get(cachedUrl); if (cachedResponse) { return resp .status(200) @@ -155,12 +189,23 @@ export class WorkerController { const response = await fetch(targetURL, { headers: cloneHeader(request.headers), }); - this.logger.error('Fetched URL', { + this.logger.debug('Fetched URL', { origin, url: targetURL, status: response.status, }); + if (requestBody?.head) { + return resp + .status( + response.status >= 200 && response.status < 400 + ? 204 + : response.status + ) + .header(getCorsHeaders(origin)) + .send(); + } + const res: LinkPreviewResponse = { url: response.url, images: [], @@ -170,6 +215,8 @@ export class WorkerController { const baseUrl = new URL(request.url, this.url.baseUrl).toString(); if (response.body) { + const resp = await decodeWithCharset(response, res); + const rewriter = new HTMLRewriter() .on('meta', { element(element) { @@ -230,11 +277,11 @@ export class WorkerController { }, }); - await rewriter.transform(response).text(); + await rewriter.transform(resp).text(); res.images = await reduceUrls(baseUrl, res.images); - this.logger.error('Processed response with HTMLRewriter', { + this.logger.debug('Processed response with HTMLRewriter', { origin, url: response.url, }); @@ -259,7 +306,7 @@ export class WorkerController { responseSize: json.length, }); - await this.cache.set(targetURL.toString(), res); + await this.cache.set(cachedUrl, res, { ttl: CACHE_TTL }); return resp .status(200) .header({ diff --git a/packages/backend/server/src/plugins/worker/types.ts b/packages/backend/server/src/plugins/worker/types.ts index 27765946c9..004767a943 100644 --- a/packages/backend/server/src/plugins/worker/types.ts +++ b/packages/backend/server/src/plugins/worker/types.ts @@ -1,5 +1,6 @@ export type LinkPreviewRequest = { url: string; + head?: boolean; }; export type LinkPreviewResponse = { diff --git a/packages/backend/server/src/plugins/worker/utils/encoding.ts b/packages/backend/server/src/plugins/worker/utils/encoding.ts new file mode 100644 index 0000000000..0d86c4bb92 --- /dev/null +++ b/packages/backend/server/src/plugins/worker/utils/encoding.ts @@ -0,0 +1,45 @@ +import { HTMLRewriter } from 'htmlrewriter'; + +import { LinkPreviewResponse } from '../types'; + +export async function decodeWithCharset( + response: Response, + res: LinkPreviewResponse +): Promise { + let charset: string | undefined; + const rewriter = new HTMLRewriter() + .on('html', { + element(element) { + charset = element.getAttribute('lang') || undefined; + }, + }) + .on('meta', { + element(element) { + const property = + element.getAttribute('property') ?? + element.getAttribute('name') ?? + element.getAttribute('http-equiv'); + const content = element.getAttribute('content'); + if (property && content) { + switch (property.toLowerCase()) { + case 'content-type': + charset = content + .split(';') + .find(x => x.includes('charset=')) + ?.trim() + ?.split('=')[1]; + break; + } + } + }, + }); + const body = await rewriter.transform(response).arrayBuffer(); + + if (charset) { + const decoder = new TextDecoder(charset); + res.charset = decoder.encoding; + return new Response(decoder.decode(body), response); + } else { + return new Response(body, response); + } +}