-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbatch_functions.py
89 lines (70 loc) · 2.89 KB
/
batch_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# imports
import pandas as pd
from data_rnn import load_imdb
import torch
def data2df(final) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load the IMDb data and return it as two length-sorted dataframes.

    ``final`` is forwarded unchanged to ``load_imdb``.
    NOTE(review): presumably it switches the second split between
    validation and test data -- confirm against ``data_rnn``.

    Returns a ``(df_train, df_test)`` tuple. The first frame has the
    columns ``x_train``, ``y_train`` and ``len``; the second has
    ``x_test``, ``y_test`` and ``len``. Rows are sorted by ascending
    sequence length, and every sequence and label is stored as a
    ``torch.long`` tensor.
    """
    def as_long(value):
        # wrap a raw python value (list of ints or a single int) in a long tensor
        return torch.tensor(value, dtype=torch.long)

    def build(sequences, labels, seq_col, label_col):
        # one frame per split: measure lengths, sort by them, then tensorize
        frame = pd.DataFrame({seq_col: sequences, label_col: labels})
        frame['len'] = frame[seq_col].apply(len)
        frame = frame.sort_values(by='len')
        frame[seq_col] = frame[seq_col].apply(as_long)
        frame[label_col] = frame[label_col].apply(as_long)
        return frame

    # the vocabulary mappings and the last returned value are not needed here
    (train_x, train_y), (held_x, held_y), (_i2w, _w2i), _ = load_imdb(final=final)

    train_frame = build(train_x, train_y, 'x_train', 'y_train')
    held_frame = build(held_x, held_y, 'x_test', 'y_test')
    return train_frame, held_frame
def padding(batch):
    """
    Right-pad every sequence in a batch with zeros up to the longest one.

    Each element of ``batch`` is read as ``item[0]`` (the sequence tensor,
    assumed to be 1-dimensional) and ``item[1]`` (its label). Sequences
    shorter than the longest one in the batch get zeros appended; the
    filler uses the dtype and device of the sequence it extends, so
    integer token ids stay integer after padding.

    No truncation is performed here: limiting a batch to a token budget
    is left to the caller, who can use the returned token count for it.

    Returns a ``(padded_batch, batch_length)`` tuple. ``padded_batch`` is
    the input object itself when nothing had to be padded (including an
    empty batch), otherwise a new list in which padded items are
    ``(sequence, label)`` tuples and the remaining items are passed
    through unchanged. ``batch_length`` is the total number of tokens in
    the padded batch.
    """
    # an empty batch has nothing to pad; max() below would raise on it
    if len(batch) == 0:
        return batch, 0

    lengths = [len(item[0]) for item in batch]
    longest = max(lengths)

    if all(length == longest for length in lengths):
        # every sequence already has the same length
        padded_batch = batch
    else:
        padded_batch = []
        for item, length in zip(batch, lengths):
            missing = longest - length
            if missing > 0:
                # new_zeros keeps the dtype/device of the sequence; a plain
                # torch.zeros would be float32 and turn the ids into floats
                filler = item[0].new_zeros(missing)
                padded_batch.append((torch.cat((item[0], filler)), item[1]))
            else:
                padded_batch.append(item)  # already at full length

    # amount of tokens in the padded batch
    batch_length = sum(len(item[0]) for item in padded_batch)
    return padded_batch, batch_length
def formatted(batch):
    """
    Collapse a batch of (sequence, label) items into two stacked tensors.

    The first returned tensor stacks all sequences along a new leading
    dimension, which requires them to share the same shape. The second
    tensor stacks all class labels in the same order.

    Returns a two-element list ``[sequences, labels]``.
    """
    stacked_sequences = torch.stack(tuple(sample[0] for sample in batch), dim=0)
    stacked_labels = torch.stack(tuple(sample[1] for sample in batch))
    return [stacked_sequences, stacked_labels]
def get_device():
    """
    Pick the torch device to run on, print it, and return it.

    The first CUDA device is chosen when CUDA is available; otherwise
    the CPU is used.
    """
    target = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(target)
    print(device)
    return device