
Commit 47bfce8

Merge pull request #155 from KenelmQLH/dev
[FEATURE] Update D2V, AutoTokenizer, and pretraining scripts
2 parents: 598d788 + d675143


50 files changed: +1940 / -587 lines

EduNLP/Formula/Formula.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 
 from .ast import str2ast, get_edges, link_variable
 
-CONST_MATHORD = {r"\pi"}
+CONST_MATHORD = {"\\pi"}
 
 __all__ = ["Formula", "FormulaGroup", "CONST_MATHORD", "link_formulas"]
 
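The change above only swaps the raw-string spelling of the constant for an explicitly escaped one; the two literals denote the same three-character string, so the behavior is unchanged. A minimal check (illustrative only, not part of the commit):

assert r"\pi" == "\\pi"   # raw-string and backslash-escaped spellings are the same Python string
assert len("\\pi") == 3   # backslash, 'p', 'i'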

EduNLP/I2V/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 # 2021/8/1 @ tongshiwei
 
 from .i2v import I2V, get_pretrained_i2v
-from .i2v import D2V, W2V, Elmo, Bert, DisenQ, QuesNet
+from .i2v import D2V, W2V, Elmo, Bert, HfAuto, DisenQ, QuesNet

EduNLP/I2V/i2v.py

Lines changed: 79 additions & 10 deletions

@@ -11,10 +11,10 @@
 from longling import path_append
 from EduData import get_data
 from ..Tokenizer import Tokenizer, get_tokenizer
-from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
+from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, HfAutoTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
 from EduNLP import logger
 
-__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "DisenQ", "QuesNet", "get_pretrained_i2v"]
+__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "HfAuto", "DisenQ", "QuesNet", "get_pretrained_i2v"]
 
 
 class I2V(object):
@@ -51,8 +51,8 @@ class I2V(object):
     (...)
     >>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True)
     >>> i2v = D2V("pure_text", "d2v", filepath=path, pretrained_t2v=False)
-    >>> i2v(item)
-    ([array([ ...dtype=float32)], None)
+    >>> i2v(item) # doctest: +SKIP
+    ([array([ ...dtype=float32)], [[array([ ...dtype=float32)]])
 
     Returns
     -------
@@ -69,6 +69,9 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
         if tokenizer == 'bert':
             self.tokenizer = BertTokenizer.from_pretrained(
                 **tokenizer_kwargs if tokenizer_kwargs is not None else {})
+        elif tokenizer == 'hf_auto':
+            self.tokenizer = HfAutoTokenizer.from_pretrained(
+                **tokenizer_kwargs if tokenizer_kwargs is not None else {})
         elif tokenizer == 'quesnet':
             self.tokenizer = QuesNetTokenizer.from_pretrained(
                 **tokenizer_kwargs if tokenizer_kwargs is not None else {})
@@ -189,8 +192,8 @@ class D2V(I2V):
     (...)
     >>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True)
     >>> i2v = D2V("pure_text","d2v",filepath=path, pretrained_t2v = False)
-    >>> i2v(item)
-    ([array([ ...dtype=float32)], None)
+    >>> i2v(item) # doctest: +SKIP
+    # ([array([ ...dtype=float32)], [[array([ ...dtype=float32)]])
 
     Returns
     -------
@@ -221,7 +224,7 @@ def infer_vector(self, items, tokenize=True, key=lambda x: x, *args,
         """
         tokens = self.tokenize(items, key=key) if tokenize is True else items
         tokens = [token for token in tokens]
-        return self.t2v(tokens, *args, **kwargs), None
+        return self.t2v(tokens, *args, **kwargs), self.t2v.infer_tokens(tokens, *args, **kwargs)
 
     @classmethod
     def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
@@ -426,6 +429,71 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwarg
                    tokenizer_kwargs=tokenizer_kwargs)
 
 
+class HfAuto(I2V):
+    """
+    The model aims to transfer item and tokens to vector with Bert.
+
+    Bases
+    -------
+    I2V
+
+    Parameters
+    -----------
+    tokenizer: str
+        the tokenizer name
+    t2v: str
+        the name of token2vector model
+    args:
+        the parameters passed to t2v
+    tokenizer_kwargs: dict
+        the parameters passed to tokenizer
+    pretrained_t2v: bool
+        True: use pretrained t2v model
+        False: use your own t2v model
+    kwargs:
+        the parameters passed to t2v
+
+    Returns
+    -------
+    i2v model: Bert
+    """
+
+    def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
+                     *args, key=lambda x: x, return_tensors='pt', **kwargs) -> tuple:
+        """
+        It is a function to switch item to vector. And before using the function, it is nesseary to load model.
+
+        Parameters
+        -----------
+        items : str or dict or list
+            the item of question, or question list
+        return_tensors: str
+            tensor type used in tokenizer
+        args:
+            the parameters passed to t2v
+        kwargs:
+            the parameters passed to t2v
+
+        Returns
+        --------
+        vector:list
+        """
+        is_batch = isinstance(items, list)
+        items = items if is_batch else [items]
+        inputs = self.tokenize(items, key=key, return_tensors=return_tensors)
+        return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
+        model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
+        for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
+            model_path = model_path.replace(i, "")
+        logger.info("model_path: %s" % model_path)
+        tokenizer_kwargs = {"tokenizer_config_dir": model_path}
+        return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, device=device,
+                   tokenizer_kwargs=tokenizer_kwargs)
+
+
 class DisenQ(I2V):
     """
     The model aims to transfer item and tokens to vector with DisenQ.
@@ -542,6 +610,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwarg
     "w2v": W2V,
     "d2v": D2V,
     "bert": Bert,
+    "hf_auto": HfAuto,
     "disenq": DisenQ,
     "quesnet": QuesNet,
     "elmo": Elmo
@@ -579,13 +648,13 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'):
     >>> (); i2v = get_pretrained_i2v("d2v_test_256", "examples/test_model/d2v"); () # doctest: +SKIP
     (...)
     >>> print(i2v(item)) # doctest: +SKIP
-    ([array([ ...dtype=float32)], None)
+    ([array([ ...dtype=float32)], [[array([ ...dtype=float32)]])
     """
     pretrained_models = get_all_pretrained_models()
     if name not in pretrained_models:
         raise KeyError(
             "Unknown model name %s, use one of the provided models: %s" % (name, ", ".join(pretrained_models))
         )
-    _, t2v = get_pretrained_model_info(name)
-    _class, *params = MODEL_MAP[t2v], name
+    _, i2v = get_pretrained_model_info(name)
+    _class, *params = MODEL_MAP[i2v], name
     return _class.from_pretrained(*params, model_dir=model_dir, device=device)
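
Taken together, the i2v.py changes do two things: D2V.infer_vector now returns token-level vectors alongside the item-level vectors (instead of None), and the new HfAuto class, the 'hf_auto' tokenizer branch, and the MODEL_MAP entry expose HuggingFace AutoModel checkpoints through the same I2V interface. A rough usage sketch; the file paths, model name, and item text below are placeholders rather than values taken from this commit:

from EduNLP.I2V import D2V, HfAuto

# D2V(...) now yields (item_vectors, token_vectors); the second element used to be None.
i2v = D2V("pure_text", "d2v", filepath="path/to/d2v_model.bin", pretrained_t2v=False)  # hypothetical path
item_vectors, token_vectors = i2v([{"stem": "已知集合 $A=\\{x \\mid x^{2}-3x-4<0\\}$"}], key=lambda x: x["stem"])

# HfAuto follows the same (item_vectors, token_vectors) convention; from_pretrained
# resolves a published model name to a local directory, strips archive suffixes,
# and points the tokenizer at that directory, as the new classmethod shows.
hf_i2v = HfAuto.from_pretrained("some_published_model", model_dir="path/to/models")  # hypothetical name and dir
hf_item_vectors, hf_token_vectors = hf_i2v(["已知集合 $A$ ……"])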

EduNLP/ModelZoo/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,5 +1,6 @@
 from .utils import *
 from .bert import *
+from .hf_model import *
 from .rnn import *
 from .disenqnet import *
 from .quesnet import *
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from .hf_model import *
Lines changed: 165 additions & 0 deletions

@@ -0,0 +1,165 @@
+import torch
+from torch import nn
+import json
+import os
+from transformers import AutoModel, PretrainedConfig, AutoConfig
+from typing import List
+from EduNLP.utils.log import logger
+from ..base_model import BaseModel
+from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput
+from ..rnn.harnn import HAM
+
+
+__all__ = ["HfModelForPropertyPrediction", "HfModelForKnowledgePrediction"]
+
+
+class HfModelForPropertyPrediction(BaseModel):
+    def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True):
+        super(HfModelForPropertyPrediction, self).__init__()
+        bert_config = AutoConfig.from_pretrained(pretrained_model_dir)
+        if init:
+            logger.info(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
+            self.model = AutoModel.from_pretrained(pretrained_model_dir)
+        else:
+            logger.info(f'Load AutoModel from config: {pretrained_model_dir}')
+            self.model = AutoModel(bert_config)
+        self.hidden_size = self.model.config.hidden_size
+        self.head_dropout = head_dropout
+        self.dropout = nn.Dropout(head_dropout)
+        self.classifier = nn.Linear(self.hidden_size, 1)
+        self.sigmoid = nn.Sigmoid()
+        self.criterion = nn.MSELoss()
+
+        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]}
+        self.config['architecture'] = 'HfModelForPropertyPrediction'
+        self.config = PretrainedConfig.from_dict(self.config)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                labels=None):
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+        item_embeds = outputs.last_hidden_state[:, 0, :]
+        item_embeds = self.dropout(item_embeds)
+
+        logits = self.sigmoid(self.classifier(item_embeds)).squeeze(1)
+        loss = None
+        if labels is not None:
+            loss = self.criterion(logits, labels) if labels is not None else None
+        return PropertyPredictionOutput(
+            loss=loss,
+            logits=logits,
+        )
+
+    @classmethod
+    def from_config(cls, config_path, **kwargs):
+        config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
+        with open(config_path, "r", encoding="utf-8") as rf:
+            model_config = json.load(rf)
+            model_config['pretrained_model_dir'] = os.path.dirname(config_path)
+            model_config.update(kwargs)
+            return cls(
+                pretrained_model_dir=model_config['pretrained_model_dir'],
+                head_dropout=model_config.get("head_dropout", 0.5),
+                init=model_config.get('init', False)
+            )
+
+    def save_config(self, config_dir):
+        config_path = os.path.join(config_dir, "model_config.json")
+        with open(config_path, "w", encoding="utf-8") as wf:
+            json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
+        self.model.config.save_pretrained(config_dir)
+
+
+class HfModelForKnowledgePrediction(BaseModel):
+    def __init__(self,
+                 pretrained_model_dir=None,
+                 num_classes_list: List[int] = None,
+                 num_total_classes: int = None,
+                 head_dropout=0.5,
+                 flat_cls_weight=0.5,
+                 attention_unit_size=256,
+                 fc_hidden_size=512,
+                 beta=0.5,
+                 init=True
+                 ):
+        super(HfModelForKnowledgePrediction, self).__init__()
+        bert_config = AutoConfig.from_pretrained(pretrained_model_dir)
+        if init:
+            logger.info(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
+            self.model = AutoModel.from_pretrained(pretrained_model_dir)
+        else:
+            logger.info(f'Load AutoModel from config: {pretrained_model_dir}')
+            self.model = AutoModel(bert_config)
+        self.hidden_size = self.model.config.hidden_size
+        self.head_dropout = head_dropout
+        self.dropout = nn.Dropout(head_dropout)
+        self.sigmoid = nn.Sigmoid()
+        self.criterion = nn.MSELoss()
+        self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes)
+        self.ham_classifier = HAM(
+            num_classes_list=num_classes_list,
+            num_total_classes=num_total_classes,
+            sequence_model_hidden_size=self.model.config.hidden_size,
+            attention_unit_size=attention_unit_size,
+            fc_hidden_size=fc_hidden_size,
+            beta=beta,
+            dropout_rate=head_dropout
+        )
+        self.flat_cls_weight = flat_cls_weight
+        self.num_classes_list = num_classes_list
+        self.num_total_classes = num_total_classes
+
+        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]}
+        self.config['architecture'] = 'HfModelForKnowledgePrediction'
+        self.config = PretrainedConfig.from_dict(self.config)
+
+    def forward(self,
+                input_ids=None,
+                attention_mask=None,
+                token_type_ids=None,
+                labels=None):
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+        item_embeds = outputs.last_hidden_state[:, 0, :]
+        item_embeds = self.dropout(item_embeds)
+        tokens_embeds = outputs.last_hidden_state
+        tokens_embeds = self.dropout(tokens_embeds)
+        flat_logits = self.sigmoid(self.flat_classifier(item_embeds))
+        ham_outputs = self.ham_classifier(tokens_embeds)
+        ham_logits = self.sigmoid(ham_outputs.scores)
+        logits = self.flat_cls_weight * flat_logits + (1 - self.flat_cls_weight) * ham_logits
+        loss = None
+        if labels is not None:
+            labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1)
+            labels = labels.float()
+            loss = self.criterion(logits, labels) if labels is not None else None
+        return KnowledgePredictionOutput(
+            loss=loss,
+            logits=logits,
+        )
+
+    @classmethod
+    def from_config(cls, config_path, **kwargs):
+        config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
+        with open(config_path, "r", encoding="utf-8") as rf:
+            model_config = json.load(rf)
+            model_config['pretrained_model_dir'] = os.path.dirname(config_path)
+            model_config.update(kwargs)
+            return cls(
+                pretrained_model_dir=model_config['pretrained_model_dir'],
+                head_dropout=model_config.get("head_dropout", 0.5),
+                num_classes_list=model_config.get('num_classes_list'),
+                num_total_classes=model_config.get('num_total_classes'),
+                flat_cls_weight=model_config.get('flat_cls_weight', 0.5),
+                attention_unit_size=model_config.get('attention_unit_size', 256),
+                fc_hidden_size=model_config.get('fc_hidden_size', 512),
+                beta=model_config.get('beta', 0.5),
+                init=model_config.get('init', False)
+            )
+
+    def save_config(self, config_dir):
+        config_path = os.path.join(config_dir, "model_config.json")
+        with open(config_path, "w", encoding="utf-8") as wf:
+            json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
+        self.model.config.save_pretrained(config_dir)
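
The new module mirrors the existing BERT downstream heads but loads any encoder through transformers.AutoModel: HfModelForPropertyPrediction regresses a property (e.g. difficulty) from the [CLS] embedding with an MSE loss, and HfModelForKnowledgePrediction mixes a flat classifier with the hierarchical HAM head. A minimal sketch of exercising the property head; the checkpoint directory and the dummy batch are placeholders, not values from this commit:

import torch
from EduNLP.ModelZoo.hf_model import HfModelForPropertyPrediction

# Hypothetical local HuggingFace checkpoint directory; any AutoModel-compatible
# encoder that exposes config.hidden_size should work.
model = HfModelForPropertyPrediction(pretrained_model_dir="path/to/hf_checkpoint", head_dropout=0.5)

# Dummy batch: token ids, attention mask, and difficulty-style labels in [0, 1].
batch = {
    "input_ids": torch.randint(1, 1000, (2, 16)),
    "attention_mask": torch.ones(2, 16, dtype=torch.long),
    "labels": torch.tensor([0.3, 0.7]),
}
out = model(**batch)                # PropertyPredictionOutput(loss=..., logits=...)
print(out.loss, out.logits.shape)   # scalar MSE loss, logits of shape (2,)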
