6
6
7
7
# Current Version: 1.0.0
8
8
# Date : 2020-05-31
9
+ # Current Version: 1.0.1
10
+ # Date : 2022-12-12
9
11
##################################################################################################
10
12
"""
11
- import os
12
13
import copy
13
14
import numpy as np
14
15
18
19
@PIPELINES .register_module ()
19
20
class CharPadTokenize ():
20
21
"""Tokenize texts in characters and return their indexes ( padded if required)."""
22
+
21
23
def __init__ (self , vocab , targets , max_length = None , map_target_prefix = None ):
22
24
"""
23
25
Args:
@@ -32,15 +34,18 @@ def __init__(self, vocab, targets, max_length=None, map_target_prefix=None):
32
34
self .max_length = max_length
33
35
self .map_target_prefix = map_target_prefix
34
36
35
- if os . path . exists ( self .vocab ) :
37
+ if self .vocab is not None :
36
38
with open (self .vocab , 'r' , encoding = 'utf8' ) as read_f :
37
39
all_words = read_f .readline ().strip ()
38
- default_token = ['[PAD]' , '[UNK]' ]
39
40
all_words = list (all_words )
40
- self .character = default_token + all_words
41
+ else :
42
+ all_words = []
43
+
44
+ default_token = ['[PAD]' , '[UNK]' ]
45
+ self .character = default_token + all_words
41
46
42
- # default 0 to pad
43
- self .word2idx = {char : idx for idx , char in enumerate (self .character )}
47
+ # default 0 to pad
48
+ self .word2idx = {char : idx for idx , char in enumerate (self .character )}
44
49
45
50
def __call__ (self , results ):
46
51
"""Forward process, including tokenization and (optional) padding.
@@ -60,14 +65,14 @@ def __call__(self, results):
60
65
# pad to max length if required
61
66
if self .max_length is not None :
62
67
if len (tmp_per_line ) > self .max_length :
63
- per_target_token = tmp_per_line [:self .max_length ]
68
+ tmp_per_line = tmp_per_line [:self .max_length ]
64
69
else :
65
- tmp_per_line .extend ([self .word2idx ['[PAD]' ]]* (self .max_length - len (tmp_per_line )))
70
+ tmp_per_line .extend ([self .word2idx ['[PAD]' ]] * (self .max_length - len (tmp_per_line )))
66
71
per_target_token .append (np .array (tmp_per_line ))
67
72
68
73
# add map_target to results if required.
69
74
if self .map_target_prefix is not None :
70
- results [self .map_target_prefix + key ] = np .array (per_target_token )
75
+ results [self .map_target_prefix + key ] = np .array (per_target_token )
71
76
else :
72
77
results [key ] = np .array (per_target_token )
73
78
0 commit comments