diff --git a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md index bd6979b853..3e91ea9277 100644 --- a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md +++ b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.md @@ -76,3 +76,55 @@ Generated by [AVA](https://avajs.dev). url: 'http://example.com/page', videos: [], } + +> Snapshot 5 + + { + charset: 'gbk', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: '你好,世界。', + url: 'http://example.com/gb2312', + videos: [], + } + +> Snapshot 6 + + { + charset: 'shift_jis', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: 'こんにちは、世界。', + url: 'http://example.com/shift-jis', + videos: [], + } + +> Snapshot 7 + + { + charset: 'big5', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: '你好,世界。', + url: 'http://example.com/big5', + videos: [], + } + +> Snapshot 8 + + { + charset: 'euc-kr', + favicons: [ + 'http://localhost:3010/api/worker/image-proxy?url=https%3A%2F%2Fexample.com%2Ffavicon.ico', + ], + images: [], + title: '안녕하세요, 세계.', + url: 'http://example.com/euc-kr', + videos: [], + } diff --git a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.snap b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.snap index 7d2ba02596..3136f79eda 100644 Binary files a/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.snap and b/packages/backend/server/src/__tests__/__snapshots__/worker.e2e.ts.snap differ diff --git a/packages/backend/server/src/__tests__/worker.e2e.ts b/packages/backend/server/src/__tests__/worker.e2e.ts index f03a6c2008..8ccfd423d1 100644 --- a/packages/backend/server/src/__tests__/worker.e2e.ts 
+++ b/packages/backend/server/src/__tests__/worker.e2e.ts @@ -171,4 +171,56 @@ test('should preview link', async t => { fetchSpy.restore(); } + + { + const encoded = [ + { + content: 'xOO6w6OsysC956Gj', + charset: 'gb2312', + }, + { + content: 'grGC8YLJgr+CzYFBkKKKRYFC', + charset: 'shift-jis', + }, + { + content: 'p0GmbqFBpUCsyaFD', + charset: 'big5', + }, + { + content: 'vsiz58fPvLy/5CwgvLyw6C4=', + charset: 'euc-kr', + }, + ]; + + for (const { content, charset } of encoded) { + const before = Buffer.from(` + + + + + + `); + const fakeHTML = new Response(Buffer.concat([before, encoded, after])); + + Object.defineProperty(fakeHTML, 'url', { + value: `http://example.com/${charset}`, + }); + + const fetchSpy = Sinon.stub(global, 'fetch').resolves(fakeHTML); + + await assertAndSnapshot( + '/api/worker/link-preview', + 'should decode HTML content with charset', + { + status: 200, + method: 'POST', + body: { url: `http://example.com/${charset}` }, + } + ); + + fetchSpy.restore(); + } + } }); diff --git a/packages/backend/server/src/plugins/worker/controller.ts b/packages/backend/server/src/plugins/worker/controller.ts index 636a7d2f4e..ef92cfec66 100644 --- a/packages/backend/server/src/plugins/worker/controller.ts +++ b/packages/backend/server/src/plugins/worker/controller.ts @@ -24,6 +24,10 @@ import { parseJson, reduceUrls, } from './utils'; +import { decodeWithCharset } from './utils/encoding'; + +// cache for 30 minutes +const CACHE_TTL = 1000 * 60 * 30; @Public() @Controller('/api/worker') @@ -67,6 +71,25 @@ export class WorkerController { throw new BadRequest(`Invalid URL`); } + const cachedUrl = `image-proxy:${targetURL.toString()}`; + const cachedResponse = await this.cache.get(cachedUrl); + if (cachedResponse) { + const buffer = Buffer.from(cachedResponse, 'base64'); + // if cached response is empty, it means the request is rejected by server previously + if (buffer.length === 0) { + return resp.status(404).header(getCorsHeaders(origin)).send(); + } + return 
resp + .status(200) + .header({ + 'Access-Control-Allow-Origin': origin, + Vary: 'Origin', + 'Access-Control-Allow-Methods': 'GET', + 'Content-Type': 'image/*', + }) + .send(buffer); + } + const response = await fetch( new Request(targetURL.toString(), { method: 'GET', @@ -75,8 +98,12 @@ export class WorkerController { ); if (response.ok) { const contentType = response.headers.get('Content-Type'); - const contentDisposition = response.headers.get('Content-Disposition'); if (contentType?.startsWith('image/')) { + const buffer = Buffer.from(await response.arrayBuffer()); + await this.cache.set(cachedUrl, buffer.toString('base64'), { + ttl: CACHE_TTL, + }); + const contentDisposition = response.headers.get('Content-Disposition'); return resp .status(200) .header({ @@ -86,11 +113,17 @@ export class WorkerController { 'Content-Type': contentType, 'Content-Disposition': contentDisposition, }) - .send(Buffer.from(await response.arrayBuffer())); + .send(buffer); } else { throw new BadRequest('Invalid content type'); } } else { + if (response.status >= 400 && response.status < 500) { + // rejected by server, cache a empty response + await this.cache.set(cachedUrl, Buffer.from([]).toString('base64'), { + ttl: CACHE_TTL, + }); + } this.logger.error('Failed to fetch image', { origin, url: imageURL, @@ -130,18 +163,19 @@ export class WorkerController { this.logger.debug('Received request', { origin, method: request.method }); - const targetBody = parseJson(request.body); - const targetURL = fixUrl(targetBody?.url); + const requestBody = parseJson(request.body); + const targetURL = fixUrl(requestBody?.url); // not allow same site preview if (!targetURL || isOriginAllowed(targetURL.origin, this.allowedOrigin)) { - this.logger.error('Invalid URL', { origin, url: targetBody?.url }); + this.logger.error('Invalid URL', { origin, url: requestBody?.url }); throw new BadRequest('Invalid URL'); } this.logger.debug('Processing request', { origin, url: targetURL }); try { - const 
cachedResponse = await this.cache.get(targetURL.toString()); + const cachedUrl = `link-preview:${targetURL.toString()}`; + const cachedResponse = await this.cache.get(cachedUrl); if (cachedResponse) { return resp .status(200) @@ -155,12 +189,23 @@ export class WorkerController { const response = await fetch(targetURL, { headers: cloneHeader(request.headers), }); - this.logger.error('Fetched URL', { + this.logger.debug('Fetched URL', { origin, url: targetURL, status: response.status, }); + if (requestBody?.head) { + return resp + .status( + response.status >= 200 && response.status < 400 + ? 204 + : response.status + ) + .header(getCorsHeaders(origin)) + .send(); + } + const res: LinkPreviewResponse = { url: response.url, images: [], @@ -170,6 +215,8 @@ export class WorkerController { const baseUrl = new URL(request.url, this.url.baseUrl).toString(); if (response.body) { + const resp = await decodeWithCharset(response, res); + const rewriter = new HTMLRewriter() .on('meta', { element(element) { @@ -230,11 +277,11 @@ export class WorkerController { }, }); - await rewriter.transform(response).text(); + await rewriter.transform(resp).text(); res.images = await reduceUrls(baseUrl, res.images); - this.logger.error('Processed response with HTMLRewriter', { + this.logger.debug('Processed response with HTMLRewriter', { origin, url: response.url, }); @@ -259,7 +306,7 @@ export class WorkerController { responseSize: json.length, }); - await this.cache.set(targetURL.toString(), res); + await this.cache.set(cachedUrl, res, { ttl: CACHE_TTL }); return resp .status(200) .header({ diff --git a/packages/backend/server/src/plugins/worker/types.ts b/packages/backend/server/src/plugins/worker/types.ts index 27765946c9..004767a943 100644 --- a/packages/backend/server/src/plugins/worker/types.ts +++ b/packages/backend/server/src/plugins/worker/types.ts @@ -1,5 +1,6 @@ export type LinkPreviewRequest = { url: string; + head?: boolean; }; export type LinkPreviewResponse = { diff --git 
// packages/backend/server/src/plugins/worker/utils/encoding.ts
import { HTMLRewriter } from 'htmlrewriter';

import { LinkPreviewResponse } from '../types';

/**
 * Matches the `charset` parameter of a Content-Type value, e.g.
 * `text/html; charset=gbk`. Case-insensitive; tolerates whitespace around
 * `=` and single or double quotes around the label.
 */
const CHARSET_PARAM = /charset\s*=\s*["']?([^"'\s;]+)/i;

/** Extracts the charset label from a Content-Type value, if it has one. */
function parseCharsetParam(content: string): string | undefined {
  return CHARSET_PARAM.exec(content)?.[1];
}

/**
 * Builds a decoder for `label`, or returns `undefined` when there is no label
 * or the runtime does not recognise it. `new TextDecoder(label)` throws a
 * `RangeError` for unknown labels; that must not abort the whole preview.
 */
function createDecoder(label: string | undefined): TextDecoder | undefined {
  if (!label) {
    return undefined;
  }
  try {
    return new TextDecoder(label);
  } catch {
    return undefined;
  }
}

/**
 * Re-decodes an HTML response according to the charset the document declares.
 *
 * The body is scanned once with `HTMLRewriter` for a
 * `<meta http-equiv="Content-Type" content="...; charset=X">` style tag (the
 * key is read from `property`, `name` or `http-equiv`). When a usable charset
 * is found, the raw bytes are decoded with it and wrapped in a new `Response`,
 * and the canonical encoding name is recorded on `res.charset`. Otherwise the
 * bytes are passed through untouched and `res` is not modified.
 *
 * @param response - Fetched response; its body is consumed by this call.
 * @param res - Link preview result; `charset` is set when decoding happened.
 * @returns A new `Response` built with the original response as its init.
 */
export async function decodeWithCharset(
  response: Response,
  res: LinkPreviewResponse
): Promise<Response> {
  let charset: string | undefined;
  const rewriter = new HTMLRewriter()
    .on('html', {
      element(element) {
        // NOTE(review): `lang` is a language tag (e.g. "en"), not an encoding
        // label. It is kept as the lowest-priority hint for backward
        // compatibility and is simply ignored below when it is not a valid
        // encoding label — confirm whether this hint is wanted at all.
        charset = element.getAttribute('lang') || undefined;
      },
    })
    .on('meta', {
      element(element) {
        const property =
          element.getAttribute('property') ??
          element.getAttribute('name') ??
          element.getAttribute('http-equiv');
        const content = element.getAttribute('content');
        if (property && content && property.toLowerCase() === 'content-type') {
          // A Content-Type meta overrides the `lang` hint, even when it
          // carries no charset parameter.
          charset = parseCharsetParam(content);
        }
      },
    });
  const body = await rewriter.transform(response).arrayBuffer();

  const decoder = createDecoder(charset);
  if (!decoder) {
    // No declared charset, or one the runtime cannot decode: hand the
    // original bytes on unchanged.
    return new Response(body, response);
  }

  res.charset = decoder.encoding;
  return new Response(decoder.decode(body), response);
}