-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathigoworker.js
141 lines (131 loc) · 5.88 KB
/
igoworker.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// Entries that loadTagger() reads from the unpacked dictionary archive.
var DICT_FILES = ['char.category', 'code2category', 'word2id', 'word.dat', 'word.ary.idx', 'word.inf', 'matrix.bin'];
// Shared tagger instance; remains null until the dictionary has been loaded.
var TAGGER = null;
console.log('WORKER!!!');
// initialize IGO-JS
igo.getServerFileToArrayBufffer("res/ipadic.zip", function(buffer) {
  // Initialization failures are logged (with the error object, so the stack
  // is kept) and leave TAGGER as null.
  const reportFailure = function(err) {
    console.error('Failed to initialize igo.js:', err);
  };
  try {
    const blob = new Blob([new Uint8Array(buffer)]);
    const reader = new FileReader();
    reader.onload = function() {
      // This callback runs after the enclosing try block has already exited,
      // so it needs its own handler for unzip/load errors.
      try {
        console.log('Unzipping data for igo.js...');
        const dic = Zip.inflate(new Uint8Array(reader.result));
        console.log('Loading data for igo.js...');
        TAGGER = loadTagger(dic);
        console.log('igo.js is ready.');
      } catch (err) {
        reportFailure(err);
      }
    };
    reader.onerror = function() {
      reportFailure(reader.error);
    };
    reader.readAsArrayBuffer(blob);
    console.log('Initialize data for igo.js....');
  } catch (e) {
    reportFailure(e);
  }
});
/**
 * Builds an igo.Tagger from an unpacked dictionary.
 *
 * Every entry named in DICT_FILES is looked up in `dicdir.files` and
 * inflated; the results are handed to the igo constructors.
 *
 * @param {Object} dicdir - object exposing a `files` map whose entries
 *     provide an `inflate()` method.
 * @returns {igo.Tagger} tagger built from the word dictionary, the
 *     unknown-word handler and the matrix.
 */
function loadTagger(dicdir) {
  // Inflate each required entry once, keyed by its file name.
  const inflated = {};
  DICT_FILES.forEach((name) => {
    inflated[name] = dicdir.files[name].inflate();
  });

  const charCategory = new igo.CharCategory(inflated['code2category'], inflated['char.category']);
  const wordDic = new igo.WordDic(
    inflated['word2id'],
    inflated['word.dat'],
    inflated['word.ary.idx'],
    inflated['word.inf']
  );
  const unknown = new igo.Unknown(charCategory);
  const matrix = new igo.Matrix(inflated['matrix.bin']);
  return new igo.Tagger(wordDic, unknown, matrix);
}
/**
 * Worker message handler: adds ruby (furigana) markup to a map of texts.
 *
 * Reads from the message data:
 *   - options.yomiStyle, options.preferLongerKanjiSegments,
 *     options.filterOkurigana, options.furiganaType
 *   - textMapNeedsFuriganaize: map of key -> text to annotate
 *   - reqId: echoed back unchanged in the reply
 *
 * Posts back `{ reqId, furiganaizedTextMap }`, where furiganaizedTextMap has
 * the same keys as the input map. If the dictionary has not finished loading
 * (TAGGER is still null) the texts are returned unchanged instead of throwing,
 * so the request still gets an answer.
 */
onmessage = (_request) => {
  const req = _request.data;
  const { yomiStyle, preferLongerKanjiSegments, filterOkurigana, furiganaType } = req.options;
  const textMap = req.textMapNeedsFuriganaize;
  const FURIGANAIZED = {};
  const kanjiChar = /[\u3400-\u9FBF]/;
  // Matches the (possibly empty) run of kanji at the start of a surface.
  const leadingKanji = /([\u3400-\u9FBF]*)/;

  const tagger = TAGGER;
  if (!tagger) {
    console.warn('igo.js is not ready yet; returning text without furigana.');
  }

  for (const key in textMap) {
    FURIGANAIZED[key] = textMap[key];
    if (!tagger) {
      continue;
    }
    const tagged = tagger.parse(textMap[key]);

    // override numeric term (dates, ages etc) readings
    // TODO: implement override

    if (preferLongerKanjiSegments) {
      // sort tagged in order to add furigana
      // for the longer Kanji series first
      tagged.sort((a, b) => {
        const aKanji = a.surface.match(leadingKanji)[0];
        const bKanji = b.surface.match(leadingKanji)[0];
        return bKanji.length - aKanji.length;
      });
    }

    // addRuby() receives `processed` by value and therefore cannot record
    // what it has handled, so repeated bases are tracked here. Each base is
    // annotated once per text; addRuby already rewrites every occurrence, and
    // a second pass could wrap a base that is already inside a ruby.
    const processed = '';
    const seenBases = new Set();
    const annotate = (base, reading) => {
      if (seenBases.has(base)) {
        return;
      }
      seenBases.add(base);
      addRuby(FURIGANAIZED, base, reading, key, processed, yomiStyle, furiganaType, preferLongerKanjiSegments);
    };

    tagged.forEach((t) => {
      if (!kanjiChar.test(t.surface)) {
        return;
      }
      // The reading is taken from the second-to-last feature field.
      const fields = t.feature.split(',');
      let kanji = t.surface;
      let yomi = fields[fields.length - 2];

      if (!filterOkurigana) {
        annotate(kanji, yomi);
        return;
      }

      //filter okurigana (word endings)
      //separate kanji and kana characters in the string using diff
      //and inject furigana only into kanji part
      const diff = JsDiff.diffChars(kanji, wanakana.toHiragana(yomi));
      let kanjiFound = false;
      let yomiFound = false;
      diff.forEach((part) => {
        if (part.added) {
          yomi = wanakana.toKatakana(part.value);
          yomiFound = true;
        }
        if (part.removed) {
          kanji = part.value;
          kanjiFound = true;
        }
        if (kanjiFound && yomiFound) {
          annotate(kanji, yomi);
          kanjiFound = false;
          yomiFound = false;
        }
      });
    });
  }

  postMessage({
    reqId: req.reqId,
    furiganaizedTextMap: FURIGANAIZED
  });
};
/**
 * Ruby tag injector: wraps occurrences of `kanji` inside `furiganized[key]`
 * in <ruby> markup that carries `yomi` as the reading.
 *
 * @param {Object} furiganized - map of texts; the entry at `key` is rewritten
 *     in place.
 * @param {string} kanji - base text to annotate. It is matched literally
 *     (regular-expression metacharacters are escaped).
 * @param {string} yomi - reading for the base; converted according to
 *     `furiganaType`.
 * @param {string} key - which entry of `furiganized` to rewrite.
 * @param {string} processed - bases that were already handled; when it
 *     contains `kanji` nothing is changed. Strings are passed by value, so
 *     this function cannot update the caller's copy and only reads it.
 * @param {string} yomiStyle - inserted verbatim into the style attribute of
 *     the <rt> element.
 * @param {string} furiganaType - "hira" converts the reading with
 *     wanakana.toHiragana, "roma" with wanakana.toRomaji; any other value
 *     leaves the reading as given.
 * @param {boolean} preferLongerKanjiSegments - when truthy, occurrences that
 *     are followed by a closing </rb> before the next tag (i.e. already part
 *     of a ruby base) are skipped.
 */
function addRuby(furiganized, kanji, yomi, key, processed, yomiStyle, furiganaType, preferLongerKanjiSegments) {
  if (processed.includes(kanji)) {
    return;
  }

  //furigana can be displayed in either hiragana, katakana or romaji
  let reading = yomi;
  switch (furiganaType) {
    case "hira":
      reading = wanakana.toHiragana(yomi);
      break;
    case "roma":
      reading = wanakana.toRomaji(yomi);
      break;
    default:
      break;
  }

  // Escape the base so characters such as "(" or "." match themselves.
  const base = kanji.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const rubyHtml = `<ruby><rb>${kanji}</rb><rt style="${yomiStyle}">${reading}</rt></ruby>`;
  // A replacer function keeps "$&", "$1", ... in the style or reading literal.
  const insertRuby = () => rubyHtml;

  const text = furiganized[key];

  //inject furigana into text nodes
  //a different regex is used for repeat passes to avoid having multiple rubies on the same base
  const rubyPatt = new RegExp(`<ruby><rb>${base}<\\/rb><rt[ style=]*.*?>([\\u3400-\\u9FBF]+)<\\/rt><\\/ruby>`, 'g');
  if (text.search(rubyPatt) !== -1) {
    furiganized[key] = text.replace(rubyPatt, insertRuby);
    return;
  }

  const barePatt = new RegExp(preferLongerKanjiSegments ? `${base}(?![^<]*</rb>)` : base, 'g');
  furiganized[key] = text.replace(barePatt, insertRuby);
}