Skip to content

Commit aa2a7d2

Browse files
committed
release code of ctunet
1 parent 49614e4 commit aa2a7d2

38 files changed

+3262
-54
lines changed

davarocr/davar_ie/datasets/pipelines/tokenizer.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
77
# Current Version: 1.0.0
88
# Date : 2020-05-31
9+
# Current Version: 1.0.1
10+
# Date : 2022-12-12
911
##################################################################################################
1012
"""
11-
import os
1213
import copy
1314
import numpy as np
1415

@@ -18,6 +19,7 @@
1819
@PIPELINES.register_module()
1920
class CharPadTokenize():
2021
"""Tokenize texts in characters and return their indexes ( padded if required)."""
22+
2123
def __init__(self, vocab, targets, max_length=None, map_target_prefix=None):
2224
"""
2325
Args:
@@ -32,15 +34,18 @@ def __init__(self, vocab, targets, max_length=None, map_target_prefix=None):
3234
self.max_length = max_length
3335
self.map_target_prefix = map_target_prefix
3436

35-
if os.path.exists(self.vocab):
37+
if self.vocab is not None:
3638
with open(self.vocab, 'r', encoding='utf8') as read_f:
3739
all_words = read_f.readline().strip()
38-
default_token = ['[PAD]', '[UNK]']
3940
all_words = list(all_words)
40-
self.character = default_token + all_words
41+
else:
42+
all_words = []
43+
44+
default_token = ['[PAD]', '[UNK]']
45+
self.character = default_token + all_words
4146

42-
# default 0 to pad
43-
self.word2idx = {char: idx for idx, char in enumerate(self.character)}
47+
# default 0 to pad
48+
self.word2idx = {char: idx for idx, char in enumerate(self.character)}
4449

4550
def __call__(self, results):
4651
"""Forward process, including tokenization and (optional) padding.
@@ -60,14 +65,14 @@ def __call__(self, results):
6065
# pad to max length if required
6166
if self.max_length is not None:
6267
if len(tmp_per_line) > self.max_length:
63-
per_target_token = tmp_per_line[:self.max_length]
68+
tmp_per_line = tmp_per_line[:self.max_length]
6469
else:
65-
tmp_per_line.extend([self.word2idx['[PAD]']]*(self.max_length - len(tmp_per_line)))
70+
tmp_per_line.extend([self.word2idx['[PAD]']] * (self.max_length - len(tmp_per_line)))
6671
per_target_token.append(np.array(tmp_per_line))
6772

6873
# add map_target to results if required.
6974
if self.map_target_prefix is not None:
70-
results[self.map_target_prefix+key] = np.array(per_target_token)
75+
results[self.map_target_prefix + key] = np.array(per_target_token)
7176
else:
7277
results[key] = np.array(per_target_token)
7378

davarocr/davar_ie/models/connects/__init__.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
# Abstract :
66
77
# Current Version: 1.0.0
8-
# Date : 2020-05-31
8+
# Date : 2022-11-22
99
##################################################################################################
1010
"""
1111
from .util import BertConfig
1212
from .multimodal_feature_merge import MultiModalFusion
1313
from .multimodal_context_module import MultiModalContextModule
14+
from .multimodal_context_module_plusplus import MultiModalContextModulePlusPlus
1415
from .relation_module import BertEncoder
1516

16-
__all__ = ['MultiModalFusion', 'MultiModalContextModule', 'BertConfig', 'BertEncoder']
17+
__all__ = ['MultiModalFusion', 'MultiModalContextModule', 'BertConfig', 'BertEncoder', 'MultiModalContextModulePlusPlus']

0 commit comments

Comments
 (0)