Skip to content

Commit

Permalink
Merge branch 'master' into enh-27p
Browse files Browse the repository at this point in the history
  • Loading branch information
kakkokari-gtyih authored Feb 2, 2025
2 parents 4d94afe + e9547a5 commit 9c6ad07
Show file tree
Hide file tree
Showing 14 changed files with 145 additions and 45 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ jobs:
pnpm build
- name: Test
run: |
pnpm test
SKIP_NETWORK_TEST=true pnpm test
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
------------------
* センシティブフラグの判定を `<meta property="rating">` および `rating` ヘッダでも行うように
* レスポンスに`Cache-Control`ヘッダを含むように
* Bluesky(bsky.app)のプレビューに対応
* `fediverse:creator` のパースに対応
* 依存関係の更新
* eslintの設定を更新

Expand Down
19 changes: 10 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,16 @@ A Promise of an Object that contains properties below:

| Property | Type | Description |
|:----------------|:-------------------|:-----------------------------------------------------------|
| **title** | *string* \| *null* | The title of the web page |
| **icon** | *string* \| *null* | The url of the icon of the web page |
| **description** | *string* \| *null* | The description of the web page |
| **thumbnail** | *string* \| *null* | The url of the thumbnail of the web page |
| **sitename** | *string* \| *null* | The name of the web site |
| **player** | *Player* | The player of the web page |
| **sensitive** | *boolean* | Whether the url is sensitive |
| **activityPub** | *string* \| *null* | The url of the ActivityPub representation of that web page |
| **url** | *string* | The url of the web page |
| **title** | *string* \| *null* | The title of the web page |
| **icon** | *string* \| *null* | The url of the icon of the web page |
| **description** | *string* \| *null* | The description of the web page |
| **thumbnail** | *string* \| *null* | The url of the thumbnail of the web page |
| **sitename** | *string* \| *null* | The name of the web site |
| **player** | *Player* | The player of the web page |
| **sensitive** | *boolean* | Whether the url is sensitive |
| **activityPub** | *string* \| *null* | The url of the ActivityPub representation of that web page |
| **fediverseCreator** | *string* \| *null* | The fediverse handle of the page's creator |
| **url** | *string* | The url of the web page |

#### Summary

Expand Down
36 changes: 23 additions & 13 deletions src/general.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,19 +138,7 @@ export type GeneralScrapingOptions = {
contentLengthRequired?: boolean;
}

/**
 * Reports whether a header value equals `search`, ignoring letter case.
 *
 * @param search - The value to look for.
 * @param headerValue - The raw header value: a single string, a list of strings, or `undefined`.
 * @returns `true` when the single string, or at least one entry of the list,
 *   equals `search` case-insensitively; `false` otherwise, including when the
 *   header value is missing or an empty string.
 */
function headerEqualValueContains(search: string, headerValue: string | string[] | undefined) {
	if (!headerValue) return false;

	const needle = search.toLowerCase();
	// Treat a single header value as a one-element list so both shapes share one comparison.
	const candidates = Array.isArray(headerValue) ? headerValue : [headerValue];
	return candidates.some(candidate => candidate.toLowerCase() === needle);
}

export async function parseGeneral(_url: URL | string, opts?: GeneralScrapingOptions): Promise<Summary | null> {
export async function general(_url: URL | string, opts?: GeneralScrapingOptions): Promise<Summary | null> {
let lang = opts?.lang;
if (lang && !lang.match(/^[\w-]+(\s*,\s*[\w-]+)*$/)) lang = null;

Expand All @@ -164,6 +152,24 @@ export async function parseGeneral(_url: URL | string, opts?: GeneralScrapingOpt
contentLengthLimit: opts?.contentLengthLimit,
contentLengthRequired: opts?.contentLengthRequired,
});

return await parseGeneral(url, res);
}

/**
 * Reports whether a header value equals `search`, ignoring letter case.
 *
 * @param search - The value to look for.
 * @param headerValue - The raw header value: a single string, a list of strings, or `undefined`.
 * @returns `true` when the single string, or at least one entry of the list,
 *   equals `search` case-insensitively; `false` otherwise, including when the
 *   header value is missing or an empty string.
 */
function headerEqualValueContains(search: string, headerValue: string | string[] | undefined) {
	if (!headerValue) return false;

	const needle = search.toLowerCase();
	// Treat a single header value as a one-element list so both shapes share one comparison.
	const candidates = Array.isArray(headerValue) ? headerValue : [headerValue];
	return candidates.some(candidate => candidate.toLowerCase() === needle);
}

export async function parseGeneral(_url: URL | string, res: Awaited<ReturnType<typeof scpaping>>): Promise<Summary | null> {
const url = typeof _url === 'string' ? new URL(_url) : _url;
const $ = res.$;
const twitterCard =
$('meta[name="twitter:card"]').attr('content') ||
Expand Down Expand Up @@ -245,6 +251,9 @@ export async function parseGeneral(_url: URL | string, opts?: GeneralScrapingOpt
const activityPub =
$('link[rel="alternate"][type="application/activity+json"]').attr('href') || null;

const fediverseCreator: string | null =
$('meta[name=\'fediverse:creator\']').attr('content') || null;

// https://developer.mixi.co.jp/connect/mixi_plugin/mixi_check/spec_mixi_check/#toc-18-
const sensitive =
$('meta[property=\'mixi:content-rating\']').attr('content') === '1' ||
Expand Down Expand Up @@ -293,5 +302,6 @@ export async function parseGeneral(_url: URL | string, opts?: GeneralScrapingOpt
sitename: siteName || null,
sensitive,
activityPub,
fediverseCreator,
};
}
4 changes: 2 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { got, type Agents as GotAgents } from 'got';
import type { FastifyInstance } from 'fastify';
import { SummalyResult } from '@/summary.js';
import { SummalyPlugin as _SummalyPlugin } from '@/iplugin.js';
import { parseGeneral, type GeneralScrapingOptions } from '@/general.js';
import { general, type GeneralScrapingOptions } from '@/general.js';
import { DEFAULT_OPERATION_TIMEOUT, DEFAULT_RESPONSE_TIMEOUT, agent, setAgent } from '@/utils/got.js';
import { plugins as builtinPlugins } from '@/plugins/index.js';

Expand Down Expand Up @@ -125,7 +125,7 @@ export const summaly = async (url: string, options?: SummalyOptions): Promise<Su
};

// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
const summary = await (match ? match.summarize : parseGeneral)(_url, scrapingOptions);
const summary = await (match ? match.summarize : general)(_url, scrapingOptions);

if (summary == null) {
throw new Error('failed summarize');
Expand Down
1 change: 1 addition & 0 deletions src/plugins/amazon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,6 @@ export async function summarize(url: URL): Promise<summary> {
},
sitename: 'Amazon',
activityPub: null,
fediverseCreator: null,
};
}
26 changes: 26 additions & 0 deletions src/plugins/bluesky.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import * as cheerio from 'cheerio';
import type Summary from '@/summary.js';
import { getResponse, getGotOptions } from '@/utils/got.js';
import { parseGeneral, type GeneralScrapingOptions } from '@/general.js';

/**
 * Decides whether this plugin handles the given URL.
 *
 * @param url - The URL being summarized.
 * @returns `true` only when the URL's hostname is exactly `bsky.app`.
 */
export function test(url: URL): boolean {
	const { hostname } = url;
	return hostname === 'bsky.app';
}

/**
 * Summarizes a bsky.app page by fetching it with a single GET request and
 * handing the loaded HTML to the general parser.
 *
 * @param url - The URL to summarize.
 * @param opts - Optional scraping options used to build the request options.
 * @returns Whatever `parseGeneral` produces for the fetched page.
 */
export async function summarize(url: URL, opts?: GeneralScrapingOptions): Promise<Summary | null> {
	// A HEAD request returns 404 here, so the page is fetched with GET only.
	const response = await getResponse({
		...getGotOptions(url.href, opts),
		method: 'GET',
	});
	const { body } = response;

	return await parseGeneral(url, {
		body,
		$: cheerio.load(body),
		response,
	});
}
4 changes: 2 additions & 2 deletions src/plugins/branchio-deeplinks.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { parseGeneral, type GeneralScrapingOptions } from '@/general.js';
import { general, type GeneralScrapingOptions } from '@/general.js';
import Summary from '@/summary.js';

export function test(url: URL): boolean {
Expand All @@ -12,5 +12,5 @@ export async function summarize(url: URL, opts?: GeneralScrapingOptions): Promis
// Web版に強制リダイレクトすることでbranch.ioの独自ページが開くのを防ぐ
url.searchParams.append('$web_only', 'true');

return await parseGeneral(url, opts);
return await general(url, opts);
}
2 changes: 2 additions & 0 deletions src/plugins/index.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import * as amazon from './amazon.js';
import * as bluesky from './bluesky.js';
import * as wikipedia from './wikipedia.js';
import * as branchIoDeeplinks from './branchio-deeplinks.js';
import { SummalyPlugin } from '@/iplugin.js';

// Plugin modules shipped with the library, exported as a single list.
// NOTE(review): presumably consulted in array order when matching a URL — confirm against the caller.
export const plugins: SummalyPlugin[] = [
amazon,
bluesky,
wikipedia,
branchIoDeeplinks,
];
1 change: 1 addition & 0 deletions src/plugins/wikipedia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@ export async function summarize(url: URL): Promise<summary> {
},
sitename: 'Wikipedia',
activityPub: null,
fediverseCreator: null,
};
}
5 changes: 5 additions & 0 deletions src/summary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ type Summary = {
* The url of the ActivityPub representation of that web page
*/
activityPub: string | null;

/**
* The @ handle of a fediverse user (https://blog.joinmastodon.org/2024/07/highlighting-journalism-on-mastodon/)
*/
fediverseCreator: string | null;
};

export type SummalyResult = Summary & {
Expand Down
30 changes: 14 additions & 16 deletions src/utils/got.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import { readFileSync } from 'node:fs';
import got, * as Got from 'got';
import * as cheerio from 'cheerio';
import PrivateIp from 'private-ip';
import { StatusError } from './status-error.js';
import { detectEncoding, toUtf8 } from './encoding.js';
import type { GeneralScrapingOptions } from '@/general.js';
import { StatusError } from '@/utils/status-error.js';
import { detectEncoding, toUtf8 } from '@/utils/encoding.js';

const _filename = fileURLToPath(import.meta.url);
const _dirname = dirname(_filename);
Expand Down Expand Up @@ -36,30 +37,27 @@ export const DEFAULT_OPERATION_TIMEOUT = 60 * 1000;
export const DEFAULT_MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
export const DEFAULT_BOT_UA = `SummalyBot/${repo.version}`;

export async function scpaping(
url: string,
opts?: {
lang?: string;
userAgent?: string;
responseTimeout?: number;
operationTimeout?: number;
contentLengthLimit?: number;
contentLengthRequired?: boolean;
},
) {
const args: Omit<GotOptions, 'method'> = {
export function getGotOptions(url: string, opts?: GeneralScrapingOptions): Omit<GotOptions, 'method'> {
return {
url,
headers: {
'accept': 'text/html,application/xhtml+xml',
'user-agent': opts?.userAgent ?? DEFAULT_BOT_UA,
'accept-language': opts?.lang,
'accept-language': opts?.lang ?? undefined,
},
typeFilter: /^(text\/html|application\/xhtml\+xml)/,
responseTimeout: opts?.responseTimeout,
operationTimeout: opts?.operationTimeout,
contentLengthLimit: opts?.contentLengthLimit,
contentLengthRequired: opts?.contentLengthRequired,
};
}

export async function scpaping(
url: string,
opts?: GeneralScrapingOptions,
) {
const args = getGotOptions(url, opts);

const headResponse = await getResponse({
...args,
Expand Down Expand Up @@ -110,7 +108,7 @@ export async function head(url: string) {
});
}

async function getResponse(args: GotOptions) {
export async function getResponse(args: GotOptions) {
const timeout = args.responseTimeout ?? DEFAULT_RESPONSE_TIMEOUT;
const operationTimeout = args.operationTimeout ?? DEFAULT_OPERATION_TIMEOUT;

Expand Down
13 changes: 13 additions & 0 deletions test/htmls/fediverse-creator.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!doctype html>

<html lang="en">
<head>
<meta charset="utf-8">
<meta name="fediverse:creator" content="@[email protected]">
<title>Meow</title>
</head>
<body>
<h1>Hellooo!</h1>
<p>:3</p>
</body>
</html>
45 changes: 43 additions & 2 deletions test/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import { dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { Agent as httpAgent } from 'node:http';
import { Agent as httpsAgent } from 'node:https';
import { expect, test, describe, beforeEach, afterEach } from '@jest/globals';
import { expect, test, describe, beforeEach, afterEach, xtest } from '@jest/globals';
import fastify, { type FastifyInstance } from 'fastify';
import { summaly } from '../src/index.js';
import { StatusError } from '../src/utils/status-error.js';
Expand All @@ -36,6 +36,15 @@ process.on('unhandledRejection', console.dir);

let app: FastifyInstance | null = null;

/**
 * Registers a test that can be switched off through the environment.
 *
 * When the `SKIP_NETWORK_TEST` environment variable is exactly `'true'`, a
 * `[SKIP]` line is logged and the test is registered through `xtest`;
 * otherwise it is registered through `test`.
 *
 * @param name - The test name.
 * @param fn - The test body.
 */
function skippableTest(name: string, fn: () => void) {
	const skipRequested = process.env.SKIP_NETWORK_TEST === 'true';
	if (!skipRequested) {
		test(name, fn);
		return;
	}

	console.log(`[SKIP] ${name}`);
	xtest(name, fn);
}
afterEach(async () => {
if (app) {
await app.close();
Expand Down Expand Up @@ -73,10 +82,11 @@ test('basic', async () => {
sensitive: false,
url: host + '/',
activityPub: null,
fediverseCreator: null,
});
});

test('Stage Bye Stage', async () => {
skippableTest('Stage Bye Stage', async () => {
// If this test fails, you must rewrite the result data and the example in README.md.

const summary = await summaly('https://www.youtube.com/watch?v=NMIEAhH_fTU');
Expand All @@ -102,6 +112,7 @@ test('Stage Bye Stage', async () => {
'sitename': 'YouTube',
'sensitive': false,
'activityPub': null,
'fediverseCreator': null,
'url': 'https://www.youtube.com/watch?v=NMIEAhH_fTU',
},
);
Expand Down Expand Up @@ -507,6 +518,36 @@ describe('ActivityPub', () => {
});
});

describe('Fediverse Creator', () => {
	// Starts the local server so that every request is answered with the given
	// fixture file (path relative to the test directory), served as text/html.
	const serveFixture = async (fixturePath: string) => {
		app = fastify();
		app.get('*', (request, reply) => {
			const content = fs.readFileSync(_dirname + fixturePath);
			reply.header('content-length', content.length);
			reply.header('content-type', 'text/html');
			return reply.send(content);
		});
		await app.listen({ port });
	};

	// A page carrying the fediverse:creator meta tag exposes its content.
	test('Basic', async () => {
		await serveFixture('/htmls/fediverse-creator.html');

		const summary = await summaly(host);
		expect(summary.fediverseCreator).toBe('@[email protected]');
	});

	// A page without the meta tag yields null.
	test('Null', async () => {
		await serveFixture('/htmls/basic.html');

		const summary = await summaly(host);
		expect(summary.fediverseCreator).toBeNull();
	});
});

describe('sensitive', () => {
test('default', async () => {
app = fastify();
Expand Down

0 comments on commit 9c6ad07

Please sign in to comment.