Commit 27e69bd

Code init commit
1 parent b2b5e91 commit 27e69bd

5 files changed, +1362 -0 lines changed


README.md (+21 lines)
# delayed-memory-update-entnet

Recurrent Entity Networks with Delayed Memory Update for Targeted Aspect-based Sentiment Analysis, published at NAACL 2018.

Dependencies:

```
Python-2.7.12
TensorFlow-1.4.1
Numpy-1.14.2
```

To run the model, point `--embedding_file` at a pre-trained GloVe embedding file:

```shell
$ python main.py --embedding_file PATH/TO/GLOVE_EMBEDDING_FILE
```

If you use this code, please cite:

```
@InProceedings{Liu+:2018,
  author    = {Liu, Fei and Cohn, Trevor and Baldwin, Timothy},
  title     = {Recurrent Entity Networks with Delayed Memory Update for Targeted Aspect-based Sentiment Analysis},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL HLT 2018)},
  year      = {2018},
  address   = {New Orleans, USA},
  pages     = {278--283}
}
```

data_utils_sentihood.py (+144 lines)
from __future__ import absolute_import

import os, sys
import re
import numpy as np
import xml.etree.ElementTree
from collections import defaultdict
import nltk
# from vocab_processor import *
import operator
import json


def vectorize_data(sentences, max_sentence_len, max_target_len, max_aspect_len,
                   word_processor, label_processor):
    # Convert (sent_id, text, target, aspect, label) tuples into padded index
    # arrays using the supplied word/label processors.
    ret_sentences = word_processor.transform(
        [text for _, text, _, _, _ in sentences]
    )
    # [None, max_sentence_len]
    assert ret_sentences.shape[1] == max_sentence_len

    # 0 for LOCATION1, 1 for LOCATION2
    ret_loc_indicator = np.zeros((len(sentences), 1), dtype=np.int32)
    for i, (_, _, target, _, _) in enumerate(sentences):
        assert target.lower() in ['location1', 'location2']
        ret_loc_indicator[i, :] = [0 if target.lower() == 'location1' else 1]

    ret_targets = word_processor.transform(
        [[target] for _, _, target, _, _ in sentences]
    )
    assert ret_targets.shape[1] == max_sentence_len
    ret_targets = ret_targets[:, :max_target_len]

    ret_aspects = word_processor.transform(
        [aspect_term for _, _, _, aspect_term, _ in sentences]
    )
    assert ret_aspects.shape[1] == max_sentence_len
    ret_aspects = ret_aspects[:, :max_aspect_len]

    ret_label = label_processor.transform(
        [label for _, _, _, _, label in sentences]
    )
    # [None, 1]

    ret_ids = [sent_id for sent_id, _, _, _, _ in sentences]
    return ret_sentences, ret_targets, ret_loc_indicator, ret_aspects, ret_label, np.array(ret_ids, dtype=np.object)


def load_task(data_dir, aspect2idx):
    # Load the Sentihood train/dev/test JSON files and convert each into
    # per-(target, aspect) examples plus the corresponding aspect indices.
    in_file = os.path.join(data_dir, 'sentihood-train.json')
    train = parse_sentihood_json(in_file)
    in_file = os.path.join(data_dir, 'sentihood-dev.json')
    dev = parse_sentihood_json(in_file)
    in_file = os.path.join(data_dir, 'sentihood-test.json')
    test = parse_sentihood_json(in_file)

    train = convert_input(train, aspect2idx)
    train_aspect_idx = get_aspect_idx(train, aspect2idx)
    train = tokenize(train)
    dev = convert_input(dev, aspect2idx)
    dev_aspect_idx = get_aspect_idx(dev, aspect2idx)
    dev = tokenize(dev)
    test = convert_input(test, aspect2idx)
    test_aspect_idx = get_aspect_idx(test, aspect2idx)
    test = tokenize(test)

    return (train, train_aspect_idx), (dev, dev_aspect_idx), (test, test_aspect_idx)


def get_aspect_idx(data, aspect2idx):
    # Map each example's aspect string to its integer index.
    ret = []
    for _, _, _, aspect, _ in data:
        ret.append(aspect2idx[aspect])
    assert len(data) == len(ret)
    return np.array(ret)


def remove_replacement(data, replacement):
    # Drop the first occurrence of `replacement` from each tokenized sentence
    # and record the position at which it occurred.
    ret_data = []
    ret_indices = []
    for sent in data:
        text = sent[0]
        assert replacement in text
        index = text.index(replacement)
        new_text = text[:index] + text[index+1:]
        ret_data.append((
            new_text, sent[1], sent[2]
        ))
        ret_indices.append(index)
    return ret_data, ret_indices


def lower_case(data):
    # Lower-case sentence tokens, targets and aspect tokens; labels are untouched.
    ret = []
    for sent_id, text, target, aspect, sentiment in data:
        new_text = [x.lower() for x in text]
        new_aspect = [x.lower() for x in aspect]
        ret.append((sent_id, new_text, target.lower(), new_aspect, sentiment))
    return ret


def parse_sentihood_json(in_file):
    # Read a Sentihood JSON file into (sent_id, text, [(target, aspect, sentiment), ...]) tuples.
    with open(in_file) as f:
        data = json.load(f)
    ret = []
    for d in data:
        text = d['text']
        sent_id = d['id']
        opinions = []
        targets = set()
        for opinion in d['opinions']:
            sentiment = opinion['sentiment']
            aspect = opinion['aspect']
            target_entity = opinion['target_entity']
            targets.add(target_entity)
            opinions.append((target_entity, aspect, sentiment))
        ret.append((sent_id, text, opinions))
    return ret


def get_all_aspects(data):
    # Collect the set of aspects annotated anywhere in the parsed data.
    aspects = set()
    for sent_id, text, opinions in data:
        for target_entity, aspect, sentiment in opinions:
            aspects.add(aspect)
    return aspects


def convert_input(data, all_aspects):
    # Expand each sentence into one example per annotated (target, aspect)
    # opinion, then add a 'None'-sentiment example for every aspect of each
    # location mention that carries no annotation.
    ret = []
    for sent_id, text, opinions in data:
        for target_entity, aspect, sentiment in opinions:
            if aspect not in all_aspects:
                continue
            ret.append((sent_id, text, target_entity, aspect, sentiment))
        assert 'LOCATION1' in text
        targets = set(['LOCATION1'])
        if 'LOCATION2' in text:
            targets.add('LOCATION2')
        for target in targets:
            aspects = set([a for t, a, _ in opinions if t == target])
            none_aspects = [a for a in all_aspects if a not in aspects]
            for aspect in none_aspects:
                ret.append((sent_id, text, target, aspect, 'None'))
    return ret


def tokenize(data):
    # Word-tokenize the sentence text and split hyphenated aspect names
    # (e.g. 'transit-location' -> ['transit', 'location']).
    ret = []
    for sent_id, text, target_entity, aspect, sentiment in data:
        new_text = nltk.word_tokenize(text)
        new_aspect = aspect.split('-')
        ret.append((sent_id, new_text, target_entity, new_aspect, sentiment))
    return ret
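
For orientation, here is a minimal usage sketch of these helpers; it is not part of the commit. It assumes the word and label processors are simple objects exposing a transform() method that returns fixed-width index arrays, in the spirit of the commented-out vocab_processor import; SimpleWordProcessor, LabelProcessor, the data/sentihood path and the four-aspect aspect2idx mapping are illustrative assumptions rather than the repository's actual implementation.

```python
# Illustrative only: SimpleWordProcessor, LabelProcessor, the data path and the
# aspect2idx mapping below are assumptions, not part of this commit.
import numpy as np

from data_utils_sentihood import load_task, lower_case, vectorize_data


class SimpleWordProcessor(object):
    """Toy stand-in for the project's vocab processor: pads/truncates token
    lists to a fixed length and maps tokens to integer ids (0 = padding/OOV)."""

    def __init__(self, max_len, vocab):
        self.max_len = max_len
        self.vocab = {w: i + 1 for i, w in enumerate(sorted(vocab))}

    def transform(self, sequences):
        out = np.zeros((len(sequences), self.max_len), dtype=np.int32)
        for i, seq in enumerate(sequences):
            for j, tok in enumerate(seq[:self.max_len]):
                out[i, j] = self.vocab.get(tok, 0)
        return out


class LabelProcessor(object):
    """Toy label processor: maps each label string to a single id, shape [N, 1]."""

    def __init__(self, labels):
        self.label2idx = {l: i for i, l in enumerate(sorted(labels))}

    def transform(self, labels):
        return np.array([[self.label2idx[l]] for l in labels], dtype=np.int32)


# Assumed aspect subset; the real mapping is defined elsewhere in the repository.
aspect2idx = {'general': 0, 'price': 1, 'safety': 2, 'transit-location': 3}

(train, _), (dev, _), (test, _) = load_task('data/sentihood', aspect2idx)
train = lower_case(train)

max_sentence_len = max(len(s[1]) for s in train)
vocab = {tok for s in train for tok in s[1]}

word_processor = SimpleWordProcessor(max_sentence_len, vocab)
label_processor = LabelProcessor({s[4] for s in train})

sents, targets, loc_ind, aspects, labels, ids = vectorize_data(
    train, max_sentence_len, max_target_len=1, max_aspect_len=2,
    word_processor=word_processor, label_processor=label_processor)
print(sents.shape, targets.shape, loc_ind.shape, aspects.shape, labels.shape)
```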
