
Commit 7abc7d1

Merge pull request #158 from KINGNEWBLUSH/dev
[FEATURE] Update tokenizers
2 parents: 855e250 + e86a5e6

File tree: 4 files changed, +126 -14 lines


AUTHORS.md (+1)

@@ -24,4 +24,5 @@
 
 [Heng Yu](https://github.com/GNEHUY)
 
+[Tianyun Ji](https://github.com/KINGNEWBLUSH)
 The stared contributors are the corresponding authors.

EduNLP/SIF/tokenization/text/tokenization.py (+78, -14)

@@ -2,7 +2,14 @@
 # 2021/5/18 @ tongshiwei
 import logging
 import jieba
+from nltk.tokenize import word_tokenize
+import nltk
+import spacy
+import tokenizers as huggingface_tokenizer
+from tokenizers.trainers import BpeTrainer
 from .stopwords import DEFAULT_STOPWORDS
+from tokenizers import Tokenizer as HGTokenizer
+
 
 jieba.setLogLevel(logging.INFO)
 
@@ -15,7 +22,13 @@ def is_chinese(word):
     return True
 
 
-def tokenize(text, granularity="word", stopwords="default"):
+def tokenize(text,
+             granularity="word",
+             stopwords="default",
+             tokenizer="jieba",
+             tok_model="en_core_web_sm",
+             bpe_json='bpe.tokenizer.json',
+             bpe_trainfile=None):
     """
     Using jieba library to tokenize item by word or char.
 
@@ -37,17 +50,68 @@ def tokenize(text, granularity="word", stopwords="default"):
     """
     stopwords = DEFAULT_STOPWORDS if stopwords == "default" else stopwords
     stopwords = stopwords if stopwords is not None else {}
-    if granularity == "word":
-        return [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
-    elif granularity == "char":
-        jieba_tokens = [token for token in jieba.cut(text) if token not in stopwords and token.strip()]
-        # Use jieba_tokens to hangle sentence with mixed chinese and english.
-        split_tokens = []
-        for token in jieba_tokens:
-            if is_chinese(token):
-                split_tokens.extend(list(token))
-            else:
-                split_tokens.append(token)
-        return split_tokens
+
+    if (tokenizer == 'jieba'):
+        if granularity == "word":
+            return [
+                token for token in jieba.cut(text)
+                if token not in stopwords and token.strip()
+            ]
+        elif granularity == "char":
+            jieba_tokens = [
+                token for token in jieba.cut(text)
+                if token not in stopwords and token.strip()
+            ]
+            # Use jieba_tokens to hangle sentence with mixed chinese and english.
+            split_tokens = []
+            for token in jieba_tokens:
+                if is_chinese(token):
+                    split_tokens.extend(list(token))
+                else:
+                    split_tokens.append(token)
+            return split_tokens
+        else:
+            raise TypeError("Unknown granularity %s" % granularity)
+
+    elif (tokenizer == 'nltk'):
+        try:
+            return [
+                token for token in word_tokenize(text)
+                if token not in stopwords and token.strip()
+            ]
+        except LookupError:
+            nltk.download('punkt')
+            return [
+                token for token in word_tokenize(text)
+                if token not in stopwords and token.strip()
+            ]
+
+    elif (tokenizer == 'spacy'):
+        try:
+            spacy_tokenizer = spacy.load(tok_model)
+        except OSError:
+            spacy.cli.download(tok_model)
+            spacy_tokenizer = spacy.load(tok_model)
+        output = spacy_tokenizer(str(text))
+        return [
+            token.text for token in output
+            if token.text not in stopwords
+        ]
+
+    elif (tokenizer == 'bpe'):
+        try:
+            tokenizer = HGTokenizer.from_file(bpe_json)
+        except Exception:
+            tokenizer = huggingface_tokenizer.Tokenizer(
+                huggingface_tokenizer.models.BPE())
+            if (bpe_trainfile is None):
+                raise LookupError("bpe train file not found, using %s." % bpe_trainfile)
+            trainer = BpeTrainer(
+                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+            tokenizer.train(files=[bpe_trainfile], trainer=trainer)
+            tokenizer.save(bpe_json, pretty=True)
+        output = tokenizer.encode(text)
+        output = output.tokens
+        return output[0]
     else:
-        raise TypeError("Unknown granularity %s" % granularity)
+        raise TypeError("Invalid Spliter: %s" % tokenizer)

setup.py (+3)

@@ -61,6 +61,9 @@
         'networkx',
         'numpy>=1.17.0',
         'jieba',
+        'nltk',
+        'spacy',
+        'tokenizers',
         'js2py',
         'EduData>=0.0.16',
         'PyBaize>=0.0.3'
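
The new install_requires entries pull in the libraries only; the models they need at runtime are fetched lazily by the fallbacks in tokenization.py (nltk.download('punkt'), spacy.cli.download(...)). An optional sketch to pre-fetch them, assuming network access and the default en_core_web_sm model named in the diff:

```python
# Optional pre-fetch of the resources the new backends otherwise download on demand.
import nltk
import spacy

nltk.download("punkt")                # needed by nltk.word_tokenize
spacy.cli.download("en_core_web_sm")  # default value of tok_model
```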

tests/test_tokenizer/test_tokenizer.py (+44)

@@ -4,6 +4,7 @@
 import pytest
 from EduNLP.Tokenizer import get_tokenizer
 from EduNLP.Pretrain import DisenQTokenizer
+from EduNLP.utils import abs_current_dir, path_append
 
 
 def test_tokenizer():
@@ -50,6 +51,49 @@ def test_CharTokenizer():
     assert ret == ans
 
 
+def test_TokenizerNLTK():
+    items = ["The stationery store has 600 exercise books, and after selling\
+        some, there are still 4 packs left, 25 each, how many are sold?"]
+    ans = [
+        'The', 'stationery', 'store', 'has', '600', 'exercise',
+        'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
+        '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
+    ]
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'nltk', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
+def test_TokenizerSpacy():
+    items = ["The stationery store has 600 exercise books, and after selling\
+        some, there are still 4 packs left, 25 each, how many are sold?"]
+    ans = [
+        'The', 'stationery', 'store', 'has', '600', 'exercise',
+        'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still',
+        '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
+    ]
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'spacy', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
+def test_TokenizerBPE():
+    items = ['The stationery store has $600$ exercise books, and after selling some,\
+        there are still $4$ packs left, $25$ each, how many are sold?']
+    ans = ['h', '600', ' ', '4', ' ', '25', ' ']
+    data_path = path_append(abs_current_dir(__file__),
+                            "../../static/test_data/standard_luna_data.json", to_str=True)
+    tokenizer = get_tokenizer("pure_text", text_params={"tokenizer": 'bpe', "stopwords": set(",?"),
+                                                        "bpe_trainfile": data_path})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
 def test_SpaceTokenizer():
     items = ['文具店有 $600$ 本练习本,卖出一些后,还剩 $4$ 包,每包 $25$ 本,卖出多少本?']
     tokenizer = get_tokenizer("space", stop_words=[])
