Commit ddbb96f

Added BERT_PyTorch (pclubiitk#25)
* Add files via upload
* Create readme.md
* Create data_loader_for_pretrain.py
* Create preprocess_pretraining.py
* Create model_pretrain.py
* Update data_loader_for_pretrain.py
* Create pretrain.py
* Update preprocess_pretraining.py
* Update data_loader_for_pretrain.py
* Update pretrain.py
* Update preprocess_pretraining.py
* Update data_loader_for_pretrain.py
* Update pretrain.py
* Create utils.py
* Create classify_cola.py
* Update readme.md
* Create readme.md
* Add files via upload
* Update readme.md
* Update readme.md
* Add files via upload
* Update classify_cola.py
* Update readme.md
* Update classify_cola.py
* Update pretrain.py
* Update utils.py
* Update preprocess_pretraining.py
* Update preprocess_pretraining.py
* Update preprocess_pretraining.py
* Update data_loader_for_pretrain.py
* Update data_loader_for_pretrain.py
* Update pretrain.py
* Update pretrain.py
* Update pretrain.py
* Update model_pretrain.py
* Update model_pretrain.py
* Update pretrain.py
* Update data_loader_for_pretrain.py
* Update classify_cola.py
* Update readme.md
* Update readme.md
1 parent d4eb29f commit ddbb96f

File tree

13 files changed (+700 additions, 0 deletions)


NLP/BERT_PyTorch/assets/img1.png (75.1 KB)

NLP/BERT_PyTorch/assets/img2.png (8.11 KB)

NLP/BERT_PyTorch/assets/img3.png (157 KB)

NLP/BERT_PyTorch/assets/img4.png (32.4 KB)

NLP/BERT_PyTorch/assets/img5.png (55.7 KB)

NLP/BERT_PyTorch/assets/readme.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
assets

NLP/BERT_PyTorch/classify_cola.py

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
import torch
import torch.nn as nn
from random import randint, shuffle
from random import random as rand
from pytorch_pretrained_bert.tokenization import BertTokenizer
import random
import math
import os
import argparse
import numpy as np
import model_pretrain
import pandas as pd
from utils import load

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser()
# model config
parser.add_argument('--dim', type=int, default=768)
parser.add_argument('--max_len', type=int, default=512)
parser.add_argument('--heads', type=int, default=12)
parser.add_argument('--n_segs', type=int, default=2)

parser.add_argument('--pretrain_file', type=str, required=True)
parser.add_argument('--dataset', type=str, required=True)  # CoLA dataset in TSV format
parser.add_argument('--epochs', type=int, default=4)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--lr', type=float, default=0.00002)
parser.add_argument('--beta1', type=float, default=0.9)
parser.add_argument('--beta2', type=float, default=0.999)
parser.add_argument('--decay', type=float, default=0.01)

args = parser.parse_args()

df = pd.read_csv(args.dataset, delimiter='\t', header=None,
                 names=['sentence_source', 'label', 'label_notes', 'sentence'])
sentences = df.sentence.values
labels = df.label.values

train_sent = sentences[0:6000]
train_label = labels[0:6000]
test_sent = sentences[6000:]
test_label = labels[6000:]


class PreprocessCola():
    """ Pre-processing steps for fine-tuning on CoLA """
    def __init__(self, max_len=512):
        super().__init__()
        self.indexer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = max_len

    def __call__(self, data):
        token, label = data

        # Add special tokens
        tokens = ['[CLS]'] + token + ['[SEP]']
        segment_ids = [0] * (len(token) + 2)
        input_mask = [1] * len(tokens)

        # Token indexing
        input_ids = self.indexer.convert_tokens_to_ids(tokens)

        # Zero padding up to max_len
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0] * int(n_pad))
        segment_ids.extend([0] * int(n_pad))
        input_mask.extend([0] * int(n_pad))

        return (input_ids, segment_ids, input_mask, label)


class DataLoaderCola():
    """ Load labelled sentences from a CoLA split """
    def __init__(self, sent, label, batch_size, max_len, short_sampling_prob=0.1):
        super().__init__()
        self.sent = sent
        self.label = label
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = max_len
        self.short_sampling_prob = short_sampling_prob
        self.batch_size = batch_size
        self.preproc = PreprocessCola(max_len)

    def __iter__(self):  # iterator to load data
        k = 0
        while True:
            batch = []
            for i in range(self.batch_size):
                if k >= len(self.sent):  # end of this split reached
                    return

                tokens = self.tokenizer.tokenize(self.sent[k])
                label = self.label[k]
                k = k + 1
                data = self.preproc((tokens, label))
                batch.append(data)

            batch_tensors = [torch.tensor(x, dtype=torch.long) for x in zip(*batch)]
            yield batch_tensors


data_train = DataLoaderCola(train_sent, train_label, args.batch_size, args.max_len)
data_test = DataLoaderCola(test_sent, test_label, args.batch_size, args.max_len)


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


class ColaClassifier(nn.Module):
    def __init__(self, dim, heads, max_len, n_seg):
        super().__init__()
        self.allenc = model_pretrain.AllEncode(dim, heads, max_len, n_seg)
        self.fc1 = nn.Linear(dim, dim)
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(dim, 2)

    def forward(self, batch):
        input_ids, segment_ids, input_mask, label = batch
        out = self.allenc(input_ids, input_mask, segment_ids)

        # Classify from the final hidden state of the [CLS] token
        out1 = self.fc1(out[:, 0])
        out1 = self.tanh(out1)
        out1 = self.fc2(out1)
        return out1


modelcls = ColaClassifier(args.dim, args.heads, args.max_len, args.n_segs).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.AdamW(modelcls.parameters(), lr=args.lr,
                              betas=(args.beta1, args.beta2), weight_decay=args.decay)

# Initialise the encoder with the pre-trained weights
load(args.pretrain_file, modelcls.allenc)


def loss_func(model, batch):
    input_ids, segment_ids, input_mask, label = batch
    clsf = model(batch)
    lossclf = criterion(clsf, label)
    return lossclf


for epoch in range(args.epochs):
    modelcls.train()
    train_loss = 0
    n_train_batches = 0
    for i, batch in enumerate(data_train):
        batch = [t.to(device) for t in batch]
        optimizer.zero_grad()
        loss = loss_func(modelcls, batch)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        n_train_batches += 1

    avg_train_loss = train_loss / n_train_batches
    print(" Average training loss: {0:.2f}".format(avg_train_loss))

    modelcls.eval()
    total_eval_accuracy = 0
    n_eval_batches = 0

    for batch in data_test:
        batch = [t.to(device) for t in batch]
        input_ids, segment_ids, input_mask, label = batch
        with torch.no_grad():
            clsf = modelcls(batch)

        total_eval_accuracy += flat_accuracy(clsf.detach().cpu().numpy(),
                                             label.cpu().numpy())
        n_eval_batches += 1

    avg_val_accuracy = total_eval_accuracy / n_eval_batches
    print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
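For reference, here is a minimal smoke-test sketch (not part of the commit) of what PreprocessCola produces for a single sentence; the example sentence and label are made up, and it assumes pytorch_pretrained_bert is installed so the bert-base-uncased vocabulary can be fetched.

# Hypothetical usage sketch: run one example sentence through PreprocessCola.
from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
preproc = PreprocessCola(max_len=512)

tokens = tokenizer.tokenize("The boy quickly ran to the store.")  # made-up sentence, label 1 = acceptable
input_ids, segment_ids, input_mask, label = preproc((tokens, 1))

# Every sequence field is padded to max_len, so batches can be stacked into long tensors.
assert len(input_ids) == len(segment_ids) == len(input_mask) == 512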
NLP/BERT_PyTorch/data_loader_for_pretrain.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
import os
from pytorch_pretrained_bert.tokenization import BertTokenizer
import preprocess_pretraining
import torch
from utils import seek_random_offset
from random import random as rand
from random import randint, shuffle


class DataLoader():
    """ Load sentence pairs from the pre-training corpus """
    def __init__(self, file, batch_size, max_len, short_sampling_prob=0.1):
        super().__init__()
        self.f_pos = open(file, "r", encoding='utf-8', errors='ignore')  # reads the "next" sentence
        self.f_neg = open(file, "r", encoding='utf-8', errors='ignore')  # reads a random "not next" sentence
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = max_len
        self.short_sampling_prob = short_sampling_prob
        self.batch_size = batch_size
        self.preproc = preprocess_pretraining.PreProcess(max_len * 0.15, 0.15, max_len)

    def read_tokens(self, f, length, discard_last_and_restart=True):
        """ Read tokens from file pointer with limited length """
        tokens = []
        while len(tokens) < length:
            line = f.readline()
            if not line:  # end of file
                return None
            if not line.strip():  # blank line marks a document boundary
                if discard_last_and_restart:
                    tokens = []  # discard the incomplete sentence and restart
                    continue
                else:
                    return tokens
            tokens.extend(self.tokenizer.tokenize(line.strip()))
        return tokens

    def __iter__(self):  # iterator to load data
        while True:
            batch = []
            for i in range(self.batch_size):
                # Occasionally sample a shorter sequence so the model also sees short inputs
                len_tokens = randint(1, int(self.max_len / 2)) \
                    if rand() < self.short_sampling_prob \
                    else int(self.max_len / 2)

                is_next = rand() < 0.5  # whether token_b is next to token_a or not

                tokens_a = self.read_tokens(self.f_pos, len_tokens, True)
                seek_random_offset(self.f_neg)
                f_next = self.f_pos if is_next else self.f_neg
                tokens_b = self.read_tokens(f_next, len_tokens, False)

                if tokens_a is None or tokens_b is None:  # end of file: rewind and stop this pass
                    self.f_pos.seek(0, 0)
                    return

                data = (is_next, tokens_a, tokens_b)
                data = self.preproc(data)

                batch.append(data)

            batch_tensors = [torch.tensor(x, dtype=torch.long) for x in zip(*batch)]
            yield batch_tensors
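A hedged usage sketch (not part of the commit) of how this loader might be driven during pre-training; corpus.txt is a hypothetical path to a plain-text corpus read line by line, with blank lines treated as document boundaries as in read_tokens above.

# Hypothetical usage sketch: iterate next-sentence-prediction batches from a corpus file.
loader = DataLoader(file='corpus.txt', batch_size=32, max_len=512)

for step, batch in enumerate(loader):
    # batch is a list of long tensors, one per field produced by
    # preprocess_pretraining.PreProcess, each with a leading batch dimension.
    print(step, [t.shape for t in batch])
    if step == 2:  # only peek at a few batches
        break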
