-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathword_tokenizer.py
61 lines (45 loc) · 2.06 KB
/
word_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import nltk
import re
import math
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
def tokenize_sentence(sentence):
    """Lower-case one sentence, strip punctuation, and split it on whitespace.

    Args:
        sentence: The text to tokenize. A missing value (anything for which
            ``pd.isna`` is true, such as ``None`` or ``NaN``) is treated as
            empty text.

    Returns:
        list[str]: The cleaned tokens. Empty when ``sentence`` is missing or
        holds nothing but removable characters.
    """
    if pd.isna(sentence):
        return []

    # Quotes and slashes are dropped first, then the text is trimmed and
    # lower-cased before the punctuation pass.
    text = (
        sentence.replace('"', '')
        .replace('/', '')
        .replace('\\', '')
        .replace("'", '')
        .strip()
        .lower()
    )

    # Raw string so ``\[`` / ``\]`` reach the regex engine as written instead
    # of being invalid Python string escapes.
    # NOTE(review): inside the class, ``:-\[`` is a *range* from ':' to '['
    # (it also matches ``< = > @``), so a literal '-' is never removed.
    # Kept as-is to stay consistent with the module-level ``bioclean``;
    # confirm whether hyphens were meant to be stripped.
    cleaned = re.sub(r'[.,?;*!%^&_+():-\[\]{}]', '', text)
    return cleaned.split()
# Load the IU X-Ray table; the "Findings" column is the text corpus that gets
# tokenized below.
data = pd.read_csv("IU-XRay/all_data.csv")
findings = data["Findings"].tolist()
Image_Indexes = data["Image Index"].tolist()


def bioclean(t):
    """Lower-case ``t``, strip quotes/slashes/punctuation, and split on whitespace."""
    # Raw string so ``\[`` / ``\]`` reach the regex engine as written instead
    # of being invalid Python string escapes.
    # NOTE(review): ``:-\[`` inside the class is a range from ':' to '[', so a
    # literal '-' is never removed -- confirm whether that is intended.
    return re.sub(
        r'[.,?;*!%^&_+():-\[\]{}]',
        '',
        t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').strip().lower(),
    ).split()


# Replace every finding with its token list; missing findings become [].
for i, finding in enumerate(findings):
    findings[i] = bioclean("" if pd.isna(finding) else finding)

# Build the vocabulary from the tokenized findings.
print("Tokenizing dataset..")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(findings)  # give each word a unique id
print("Tokenizing is complete.")

word_index = tokenizer.word_index  # final word -> id mapping
print("word index: " + str(word_index))

train_seq = tokenizer.texts_to_sequences(findings)  # convert dataset to ids
print("train_seq is complete.")

# Pad every id sequence to a fixed length (zeros appended at the end).
max_review_length = 170
print("Padding dataset..")
# NOTE(review): ``truncating`` is left at the Keras default ('pre'), so
# sequences longer than ``max_review_length`` lose their leading tokens --
# confirm this is intended.
train_pad = pad_sequences(train_seq, maxlen=max_review_length, padding='post')

# Count how many sequences exceed the pad length (i.e. get truncated).
review_lengths_longer_than_pad = sum(
    1 for seq in train_seq if len(seq) > max_review_length
)
print("Number of reviews longer than pad length({}): {}".format(max_review_length, review_lengths_longer_than_pad))
print("train_pad is complete.")