from __future__ import absolute_import

import json
import os

import nltk
import numpy as np

def vectorize_data(sentences, max_sentence_len, max_target_len, max_aspect_len,
                   word_processor, label_processor):
    """Turn (id, text, target, aspect, sentiment) tuples into padded index
    arrays using the fitted word and label processors."""
    # [None, max_sentence_len]
    ret_sentences = word_processor.transform(
        [text for _, text, _, _, _ in sentences]
    )
    assert ret_sentences.shape[1] == max_sentence_len

    # 0 for LOCATION1, 1 for LOCATION2
    ret_loc_indicator = np.zeros((len(sentences), 1), dtype=np.int32)
    for i, (_, _, target, _, _) in enumerate(sentences):
        assert target.lower() in ['location1', 'location2']
        ret_loc_indicator[i, :] = [0 if target.lower() == 'location1' else 1]

    ret_targets = word_processor.transform(
        [[target] for _, _, target, _, _ in sentences]
    )
    assert ret_targets.shape[1] == max_sentence_len
    ret_targets = ret_targets[:, :max_target_len]

    ret_aspects = word_processor.transform(
        [aspect_term for _, _, _, aspect_term, _ in sentences]
    )
    assert ret_aspects.shape[1] == max_sentence_len
    ret_aspects = ret_aspects[:, :max_aspect_len]

    # [None, 1]
    ret_label = label_processor.transform(
        [label for _, _, _, _, label in sentences]
    )

    ret_ids = [sent_id for sent_id, _, _, _, _ in sentences]
    # np.object was removed in NumPy 1.24+; the builtin object works everywhere
    return (ret_sentences, ret_targets, ret_loc_indicator, ret_aspects,
            ret_label, np.array(ret_ids, dtype=object))
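
# A minimal usage sketch, assuming `word_processor` and `label_processor`
# follow a VocabularyProcessor-style fit/transform API that maps token lists
# to padded index arrays; the variable names here are illustrative, not part
# of this module:
#
#   sents, targets, loc, aspects, labels, ids = vectorize_data(
#       train, max_sentence_len=80, max_target_len=1, max_aspect_len=4,
#       word_processor=word_processor, label_processor=label_processor)
#   # sents: [n, 80] word ids; loc: [n, 1] 0/1 flags; ids: [n] sentence ids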

def load_task(data_dir, aspect2idx):
    """Load the Sentihood train/dev/test JSON splits and convert each into
    tokenized examples plus the matching array of aspect indices."""
    in_file = os.path.join(data_dir, 'sentihood-train.json')
    train = parse_sentihood_json(in_file)
    in_file = os.path.join(data_dir, 'sentihood-dev.json')
    dev = parse_sentihood_json(in_file)
    in_file = os.path.join(data_dir, 'sentihood-test.json')
    test = parse_sentihood_json(in_file)

    train = convert_input(train, aspect2idx)
    train_aspect_idx = get_aspect_idx(train, aspect2idx)
    train = tokenize(train)
    dev = convert_input(dev, aspect2idx)
    dev_aspect_idx = get_aspect_idx(dev, aspect2idx)
    dev = tokenize(dev)
    test = convert_input(test, aspect2idx)
    test_aspect_idx = get_aspect_idx(test, aspect2idx)
    test = tokenize(test)

    return (train, train_aspect_idx), (dev, dev_aspect_idx), (test, test_aspect_idx)
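
# Hypothetical invocation; the aspect names follow the Sentihood annotation
# scheme, but which subset to model is the caller's choice:
#
#   aspect2idx = {'general': 0, 'price': 1, 'transit-location': 2, 'safety': 3}
#   (train, train_aspect_idx), (dev, _), (test, _) = load_task('data', aspect2idx)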

def get_aspect_idx(data, aspect2idx):
    # One aspect index per example, aligned with the order of `data`
    ret = []
    for _, _, _, aspect, _ in data:
        ret.append(aspect2idx[aspect])
    assert len(data) == len(ret)
    return np.array(ret)

def remove_replacement(data, replacement):
    """Drop the first occurrence of `replacement` from each token list and
    remember its position, so it can be restored or attended to later."""
    ret_data = []
    ret_indices = []
    for sent in data:
        text = sent[0]
        assert replacement in text
        index = text.index(replacement)
        # text is a token list, so this removes exactly one token
        new_text = text[:index] + text[index+1:]
        ret_data.append((
            new_text, sent[1], sent[2]
        ))
        ret_indices.append(index)
    return ret_data, ret_indices

def lower_case(data):
    ret = []
    for sent_id, text, target, aspect, sentiment in data:
        # Use list comprehensions rather than map(): under Python 3, map()
        # returns a lazy iterator, which breaks later len()/indexing
        new_text = [w.lower() for w in text]
        new_aspect = [w.lower() for w in aspect]
        ret.append((sent_id, new_text, target.lower(), new_aspect, sentiment))
    return ret

def parse_sentihood_json(in_file):
    """Read a Sentihood JSON split into (sent_id, text, opinions) tuples,
    where each opinion is a (target_entity, aspect, sentiment) triple."""
    with open(in_file) as f:
        data = json.load(f)
    ret = []
    for d in data:
        text = d['text']
        sent_id = d['id']
        opinions = []
        for opinion in d['opinions']:
            sentiment = opinion['sentiment']
            aspect = opinion['aspect']
            target_entity = opinion['target_entity']
            opinions.append((target_entity, aspect, sentiment))
        ret.append((sent_id, text, opinions))
    return ret
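
# For reference, each record in the Sentihood JSON is shaped roughly like the
# following (field names as read above; the concrete values are illustrative):
#
#   {"id": 42,
#    "text": "LOCATION1 is lovely but LOCATION2 is too pricey.",
#    "opinions": [{"target_entity": "LOCATION1", "aspect": "general",
#                  "sentiment": "Positive"},
#                 {"target_entity": "LOCATION2", "aspect": "price",
#                  "sentiment": "Negative"}]}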

def get_all_aspects(data):
    aspects = set()
    for sent_id, text, opinions in data:
        for target_entity, aspect, sentiment in opinions:
            aspects.add(aspect)
    return aspects

def convert_input(data, all_aspects):
    """Flatten (sent_id, text, opinions) records into one example per
    (target, aspect) pair, adding explicit 'None' labels for every aspect
    of a mentioned location that carries no annotation."""
    ret = []
    for sent_id, text, opinions in data:
        # Keep only opinions whose aspect is in the chosen aspect set
        for target_entity, aspect, sentiment in opinions:
            if aspect not in all_aspects:
                continue
            ret.append((sent_id, text, target_entity, aspect, sentiment))
        assert 'LOCATION1' in text
        targets = set(['LOCATION1'])
        if 'LOCATION2' in text:
            targets.add('LOCATION2')
        # Pad out the unannotated (target, aspect) pairs with 'None'
        for target in targets:
            aspects = set([a for t, a, _ in opinions if t == target])
            none_aspects = [a for a in all_aspects if a not in aspects]
            for aspect in none_aspects:
                ret.append((sent_id, text, target, aspect, 'None'))
    return ret
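
# Worked example: if a sentence mentions both locations but is annotated only
# with (LOCATION1, general, Positive), convert_input emits that tuple plus
# (LOCATION1, a, 'None') for every other aspect a in the aspect set and
# (LOCATION2, a, 'None') for all aspects, so each (target, aspect) pair gets
# an explicit label.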

def tokenize(data):
    ret = []
    for sent_id, text, target_entity, aspect, sentiment in data:
        new_text = nltk.word_tokenize(text)
        # Hyphenated aspect names (e.g. 'transit-location') become word lists
        new_aspect = aspect.split('-')
        ret.append((sent_id, new_text, target_entity, new_aspect, sentiment))
    return ret
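
# Example: 'transit-location' splits into ['transit', 'location'], and
# "LOCATION1 is great." tokenizes to ['LOCATION1', 'is', 'great', '.'].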