Skip to content

Commit

Permalink
fix(encoding): ISO-2022-JPのページで文字化けが発生する問題を修正
Browse files Browse the repository at this point in the history
  • Loading branch information
kakkokari-gtyih committed Nov 19, 2024
1 parent 7fbab86 commit 490e230
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 2 deletions.
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"@swc/jest": "^0.2.37",
"@types/cheerio": "0.22.35",
"@types/debug": "4.1.12",
"@types/encoding-japanese": "^2.2.1",
"@types/escape-regexp": "^0.0.3",
"@types/node": "22.9.0",
"@typescript-eslint/eslint-plugin": "^7.17.0",
Expand All @@ -42,6 +43,7 @@
},
"dependencies": {
"cheerio": "1.0.0",
"encoding-japanese": "^2.2.0",
"escape-regexp": "0.0.1",
"got": "^14.4.4",
"html-entities": "2.5.2",
Expand Down
17 changes: 17 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 49 additions & 1 deletion src/utils/encoding.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,47 @@
import iconv from 'iconv-lite';
import Encoding from 'encoding-japanese';
import jschardet from 'jschardet';

import type { Response } from 'got';

// Case-insensitive matcher for a `charset=...` declaration (optionally quoted);
// capture group 1 holds the charset name.
const regCharset = /charset\s*=\s*["']?([\w-]+)/i;

/**
 * Marker prepended to an encoding name to signal that the body should be
 * decoded with encoding-japanese rather than iconv-lite (toUtf8 checks for
 * this prefix and strips it before use).
 */
export const ENCODING_JAPANESE_ENCODING_PREFIX = '__EJ__';

/**
 * Encoding names accepted by the encoding-japanese decoding path in toUtf8.
 * The `satisfies` clause makes the compiler verify that every entry is a
 * valid `Encoding.Encoding` name, while the variable itself stays typed as
 * `string[]` so arbitrary strings can be tested with `.includes()`.
 */
const ENCODING_JAPANESE_SUPPORTED_ENCODING: string[] = [
'UTF32',
'UTF16',
'UTF16BE',
'UTF16LE',
'BINARY',
'ASCII',
'JIS',
'UTF8',
'EUCJP',
'SJIS',
'UNICODE',
'AUTO',
] satisfies Encoding.Encoding[];

/**
* Detect HTML encoding
* @param res got Response; the charset in its Content-Type header is tried first, then detection on the raw body
* @returns encoding
*/
export function detectEncoding(body: Buffer): string {
export function detectEncoding(res: Response): string {
// From header
const contentType = res.headers['content-type'];
if (contentType) {
const matchHeader = contentType.match(regCharset);
if (matchHeader) {
const candicate = matchHeader[1];
const encoding = toEncoding(candicate);
if (encoding != null) return encoding;
}
}

const body = res.rawBody;

// By detection
const detected = jschardet.detect(body, { minimumThreshold: 0.99 });
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
Expand All @@ -30,10 +63,25 @@ export function detectEncoding(body: Buffer): string {
}

/**
 * Decode a raw body into a JavaScript string.
 *
 * When `encoding` starts with ENCODING_JAPANESE_ENCODING_PREFIX and the rest
 * of the name is listed in ENCODING_JAPANESE_SUPPORTED_ENCODING, the body is
 * converted with encoding-japanese. In every other case (no prefix, or a
 * prefixed name that is not in the list) the body is handed to iconv-lite
 * together with the `encoding` string exactly as it was received.
 *
 * @param body Raw bytes to decode
 * @param encoding Encoding name, optionally carrying the encoding-japanese prefix
 * @returns The decoded text
 */
export function toUtf8(body: Buffer, encoding: string): string {
  // Type guard: narrows a plain string to an encoding-japanese encoding name.
  const isEncodingJapaneseName = (name: string): name is Encoding.Encoding =>
    ENCODING_JAPANESE_SUPPORTED_ENCODING.includes(name);

  // Name with the marker prefix removed, or null when the marker is absent.
  const bareName = encoding.startsWith(ENCODING_JAPANESE_ENCODING_PREFIX)
    ? encoding.slice(ENCODING_JAPANESE_ENCODING_PREFIX.length)
    : null;

  if (bareName === null || !isEncodingJapaneseName(bareName)) {
    return iconv.decode(body, encoding);
  }

  const unicodeCodes = Encoding.convert(body, 'UNICODE', bareName);
  return Encoding.codeToString(unicodeCodes);
}

function toEncoding(candicate: string): string | null {
// iconvで処理できない
// https://github.com/ashtuchkin/iconv-lite/issues/60
if (candicate.toLowerCase() === 'iso-2022-jp') return '__EJ__JIS';

if (iconv.encodingExists(candicate)) {
if (['shift_jis', 'shift-jis', 'windows-31j', 'x-sjis'].includes(candicate.toLowerCase())) return 'cp932';
return candicate;
Expand Down
2 changes: 1 addition & 1 deletion src/utils/got.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ export async function scpaping(
method: 'GET',
});

const encoding = detectEncoding(response.rawBody);
const encoding = detectEncoding(response);
const body = toUtf8(response.rawBody, encoding);
const $ = cheerio.load(body);

Expand Down

0 comments on commit 490e230

Please sign in to comment.