# textprocessing.py
import json
import os
import numpy as np
import pandas as pd
# Directory containing the dialog JSON files; '.' is a placeholder -- point it
# at wherever the data actually lives.
files_path = '.'

# Collect the names of all JSON files in the data directory.
json_files = []
for each_json_file in os.listdir(files_path):
    if each_json_file.endswith('.json'):
        json_files.append(each_json_file)

# Split every dialog into participant1 messages (inputs) and
# participant2 messages (targets).
input_texts = []
target_texts = []
for each_file in json_files:
    with open(os.path.join(files_path, each_file), 'r') as f:
        file = json.load(f)
    text_data = []
    for dictionary in file:
        temp_list = []
        for dialog in dictionary['dialog']:
            temp_list.append((dialog['sender'], dialog['text']))
        text_data.append(temp_list)
    for each_sent in text_data:
        for each_message in each_sent:
            if each_message[0] == 'participant1':
                input_texts.append(each_message[1])
            elif each_message[0] == 'participant2':
                target_texts.append(each_message[1])
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from nltk.corpus import stopwords

# The tokenizer, stopword list and word corpus need their NLTK data packages;
# uncomment these downloads on a fresh environment.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('words')

stop_words = set(stopwords.words('english'))
english_words = set(nltk.corpus.words.words())
# Treat common punctuation marks as stopwords as well; drop this update if
# punctuation should be kept.
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
def tokenize(text):
    """Sentence-split each message, then return one list of lowercased word
    tokens per sentence, with stopwords, punctuation and unrecognised
    alphabetic tokens filtered out."""
    tokenized_list = []
    for each_sent in text:
        sentences = sent_tokenize(each_sent)
        for sentence in sentences:
            temp_word_list = []
            for each_word in word_tokenize(sentence.lower()):
                # Skip stopwords and bare punctuation tokens.
                if each_word in stop_words or each_word in string.punctuation:
                    continue
                # Keep recognised English words, plus tokens that are not
                # purely alphabetic (numbers, contractions, etc.).
                if each_word in english_words or not each_word.isalpha():
                    temp_word_list.append(each_word)
            tokenized_list.append(temp_word_list)
    return tokenized_list
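

# A minimal usage sketch (assumes the JSON files above were found and parsed,
# so input_texts / target_texts are non-empty); prints a few token lists as a
# sanity check.
if __name__ == '__main__':
    input_tokens = tokenize(input_texts)
    target_tokens = tokenize(target_texts)
    print('input token lists:', len(input_tokens))
    print('target token lists:', len(target_tokens))
    for tokens in input_tokens[:3]:
        print(tokens)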