flypythoncom
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎Constants.py
+9 b/‎Constants.py
+9
diff --git a/‎LICENSE
+21 b/‎LICENSE
+21
diff --git a/‎README.md
+43 b/‎README.md
+43
diff --git a/‎config.py
+69 b/‎config.py
+69
diff --git a/‎dataset.py
+214 b/‎dataset.py
+214
diff --git a/‎fetch_and_preprocess.sh
+8 b/‎fetch_and_preprocess.sh
+8
@@ -0,0 +1 @@
+.idea
@@ -0,0 +1,9 @@
+# Integer constants paired by name with the *_WORD strings below.
+# NOTE(review): presumably the vocabulary indices reserved for those special
+# tokens -- confirm against the vocab module, which is not shown here.
+PAD = 0
+UNK = 1
+BOS = 2
+EOS = 3
+
+# String forms of the special tokens.  dataset.py passes UNK_WORD as the
+# second argument to vocab.convertToIdx when converting a tokenized line.
+PAD_WORD = '<blank>'
+UNK_WORD = '<unk>'
+BOS_WORD = '<s>'
+EOS_WORD = '</s>'
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Riddhiman Dasgupta
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,43 @@
+# Tree-Structured Long Short-Term Memory Networks
+A [PyTorch](http://pytorch.org/) based implementation of Tree-LSTM from Kai Sheng Tai's paper
+[Improved Semantic Representations From Tree-Structured Long Short-Term Memory
+Networks](http://arxiv.org/abs/1503.00075).
+
+### Requirements
+- [PyTorch](http://pytorch.org/) Deep learning library
+- [tqdm](https://github.com/tqdm/tqdm): display progress bar
+- [meowlogtool](https://pypi.python.org/pypi/meowlogtool): a logger that writes all console output to a file
+- Java >= 8 (for Stanford CoreNLP utilities)
+- Python >= 2.7
+
+## Usage
+First run the script `./fetch_and_preprocess.sh`
+
+This downloads the following data:
+  - [Stanford Sentiment Treebank](http://nlp.stanford.edu/sentiment/index.html) (sentiment classification task)
+  - [Glove word vectors](http://nlp.stanford.edu/projects/glove/) (Common Crawl 840B) -- **Warning:** this is a 2GB download!
+
+and the following libraries:
+
+  - [Stanford Parser](http://nlp.stanford.edu/software/lex-parser.shtml)
+  - [Stanford POS Tagger](http://nlp.stanford.edu/software/tagger.shtml)
+
+### Sentiment classification
+
+```
+python sentiment.py --name <name_of_log_file> --model_name <constituency|dependency> --epochs 10
+```
+We have not fully tested fine-grained classification yet. Binary classification accuracy for both models matches the results reported in the original paper.
+
+### Acknowledgements
+[Kai Sheng Tai](https://github.com/kaishengtai/) for the [original LuaTorch implementation](https://github.com/stanfordnlp/treelstm) <br>
+[Pytorch team](https://github.com/pytorch/pytorch#the-team) for Python library<br>
+[Riddhiman Dasgupta](https://researchweb.iiit.ac.in/~riddhiman.dasgupta/) for his implementation of the semantic relatedness task, [https://github.com/dasguptar/treelstm.pytorch](https://github.com/dasguptar/treelstm.pytorch), which I used as starter code.
+
+
+
+
+
+
+### License
+MIT
@@ -0,0 +1,69 @@
+import argparse
+
+def parse_args(type=0):
+    if type == 0:
+        parser = argparse.ArgumentParser(description='PyTorch TreeLSTM for Sentence Similarity on Dependency Trees')
+        parser.add_argument('--data', default='data/sick/',
+                            help='path to dataset')
+        parser.add_argument('--glove', default='data/glove/',
+                            help='directory with GLOVE embeddings')
+        parser.add_argument('--batchsize', default=25, type=int,
+                            help='batchsize for optimizer updates')
+        parser.add_argument('--epochs', default=15, type=int,
+                            help='number of total epochs to run')
+        parser.add_argument('--lr', default=0.01, type=float,
+                            metavar='LR', help='initial learning rate')
+        parser.add_argument('--wd', default=1e-4, type=float,
+                            help='weight decay (default: 1e-4)')
+        parser.add_argument('--optim', default='adam',
+                            help='optimizer (default: adam)')
+        parser.add_argument('--seed', default=123, type=int,
+                            help='random seed (default: 123)')
+        cuda_parser = parser.add_mutually_exclusive_group(required=False)
+        cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
+        cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false')
+        parser.set_defaults(cuda=True)
+
+        args = parser.parse_args()
+        return args
+    else: # for sentiment classification on SST
+        parser = argparse.ArgumentParser(description='PyTorch TreeLSTM for Sentiment Analysis Trees')
+        parser.add_argument('--name', default='default_name',
+                            help='name for log and saved models')
+        parser.add_argument('--saved', default='saved_model',
+                            help='name for log and saved models')
+
+        parser.add_argument('--model_name', default='constituency',
+                            help='model name constituency or dependency')
+        parser.add_argument('--data', default='data/sst/',
+                            help='path to dataset')
+        parser.add_argument('--glove', default='data/glove/',
+                            help='directory with GLOVE embeddings')
+        parser.add_argument('--batchsize', default=25, type=int,
+                            help='batchsize for optimizer updates')
+        parser.add_argument('--epochs', default=10, type=int,
+                            help='number of total epochs to run')
+        parser.add_argument('--lr', default=0.05, type=float,
+                            metavar='LR', help='initial learning rate')
+        parser.add_argument('--emblr', default=0.1, type=float,
+                            metavar='EMLR', help='initial embedding learning rate')
+        parser.add_argument('--wd', default=1e-4, type=float,
+                            help='weight decay (default: 1e-4)')
+        parser.add_argument('--reg', default=1e-4, type=float,
+                            help='l2 regularization (default: 1e-4)')
+        parser.add_argument('--optim', default='adagrad',
+                            help='optimizer (default: adagrad)')
+        parser.add_argument('--seed', default=123, type=int,
+                            help='random seed (default: 123)')
+        parser.add_argument('--fine_grain', default=0, type=int,
+                            help='fine grained (default 0 - binary mode)')
+                            # untest on fine_grain yet.
+        cuda_parser = parser.add_mutually_exclusive_group(required=False)
+        cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
+        cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false')
+        cuda_parser.add_argument('--lower', dest='cuda', action='store_true')
+        parser.set_defaults(cuda=True)
+        parser.set_defaults(lower=True)
+
+        args = parser.parse_args()
+        return args
@@ -0,0 +1,214 @@
+import os
+from copy import deepcopy
+from tqdm import tqdm
+import torch
+import torch.utils.data as data
+from tree import Tree
+from vocab import Vocab
+import Constants
+import utils
+
+# Dataset class for SICK dataset
+class SICKDataset(data.Dataset):
+    def __init__(self, path, vocab, num_classes):
+        super(SICKDataset, self).__init__()
+        self.vocab = vocab
+        self.num_classes = num_classes
+
+        self.lsentences = self.read_sentences(os.path.join(path,'a.toks'))
+        self.rsentences = self.read_sentences(os.path.join(path,'b.toks'))
+
+        self.ltrees = self.read_trees(os.path.join(path,'a.parents'))
+        self.rtrees = self.read_trees(os.path.join(path,'b.parents'))
+
+        self.labels = self.read_labels(os.path.join(path,'sim.txt'))
+
+        self.size = self.labels.size(0)
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, index):
+        ltree = deepcopy(self.ltrees[index])
+        rtree = deepcopy(self.rtrees[index])
+        lsent = deepcopy(self.lsentences[index])
+        rsent = deepcopy(self.rsentences[index])
+        label = deepcopy(self.labels[index])
+        return (ltree,lsent,rtree,rsent,label)
+
+    def read_sentences(self, filename):
+        with open(filename,'r') as f:
+            sentences = [self.read_sentence(line) for line in tqdm(f.readlines())]
+        return sentences
+
+    def read_sentence(self, line):
+        indices = self.vocab.convertToIdx(line.split(), Constants.UNK_WORD)
+        return torch.LongTensor(indices)
+
+    def read_trees(self, filename):
+        with open(filename,'r') as f:
+            trees = [self.read_tree(line) for line in tqdm(f.readlines())]
+        return trees
+
+    def read_tree(self, line):
+        parents = map(int,line.split())
+        trees = dict()
+        root = None
+        for i in xrange(1,len(parents)+1):
+            #if not trees[i-1] and parents[i-1]!=-1:
+            if i-1 not in trees.keys() and parents[i-1]!=-1:
+                idx = i
+                prev = None
+                while True:
+                    parent = parents[idx-1]
+                    if parent == -1:
+                        break
+                    tree = Tree()
+                    if prev is not None:
+                        tree.add_child(prev)
+                    trees[idx-1] = tree
+                    tree.idx = idx-1
+                    #if trees[parent-1] is not None:
+                    if parent-1 in trees.keys():
+                        trees[parent-1].add_child(tree)
+                        break
+                    elif parent==0:
+                        root = tree
+                        break
+                    else:
+                        prev = tree
+                        idx = parent
+        return root
+
+    def read_labels(self, filename):
+        with open(filename,'r') as f:
+            labels = map(lambda x: float(x), f.readlines())
+            labels = torch.Tensor(labels)
+        return labels
+
+# Dataset class for SST (Stanford Sentiment Treebank) dataset
+class SSTDataset(data.Dataset):
+    def __init__(self, path, vocab, num_classes, fine_grain, model_name):
+        super(SSTDataset, self).__init__()
+        self.vocab = vocab
+        self.num_classes = num_classes
+        self.fine_grain = fine_grain
+        self.model_name = model_name
+
+        temp_sentences = self.read_sentences(os.path.join(path,'sents.toks'))
+        if model_name == "dependency":
+            temp_trees = self.read_trees(os.path.join(path,'dparents.txt'), os.path.join(path,'dlabels.txt'))
+        else:
+            temp_trees = self.read_trees(os.path.join(path, 'parents.txt'), os.path.join(path, 'labels.txt'))
+
+        # self.labels = self.read_labels(os.path.join(path,'dlabels.txt'))
+        self.labels = []
+
+        if not self.fine_grain:
+            # only get pos or neg
+            new_trees = []
+            new_sentences = []
+            for i in range(len(temp_trees)):
+                if temp_trees[i].gold_label != 1: # 0 neg, 1 neutral, 2 pos
+                    new_trees.append(temp_trees[i])
+                    new_sentences.append(temp_sentences[i])
+            self.trees = new_trees
+            self.sentences = new_sentences
+        else:
+            self.trees = temp_trees
+            self.sentences = temp_sentences
+
+        for i in xrange(0, len(self.trees)):
+            self.labels.append(self.trees[i].gold_label)
+        self.labels = torch.Tensor(self.labels) # let labels be tensor
+        self.size = len(self.trees)
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, index):
+        # ltree = deepcopy(self.ltrees[index])
+        # rtree = deepcopy(self.rtrees[index])
+        # lsent = deepcopy(self.lsentences[index])
+        # rsent = deepcopy(self.rsentences[index])
+        # label = deepcopy(self.labels[index])
+        tree = deepcopy(self.trees[index])
+        sent = deepcopy(self.sentences[index])
+        label = deepcopy(self.labels[index])
+        return (tree, sent, label)
+
+    def read_sentences(self, filename):
+        with open(filename,'r') as f:
+            sentences = [self.read_sentence(line) for line in tqdm(f.readlines())]
+        return sentences
+
+    def read_sentence(self, line):
+        indices = self.vocab.convertToIdx(line.split(), Constants.UNK_WORD)
+        return torch.LongTensor(indices)
+
+    def read_trees(self, filename_parents, filename_labels):
+        pfile = open(filename_parents, 'r') # parent node
+        lfile = open(filename_labels, 'r') # label node
+        p = pfile.readlines()
+        l = lfile.readlines()
+        pl = zip(p, l) # (parent, label) tuple
+        trees = [self.read_tree(p_line, l_line) for p_line, l_line in tqdm(pl)]
+
+        return trees
+
+    def parse_dlabel_token(self, x):
+        if x == '#':
+            return None
+        else:
+            if self.fine_grain: # -2 -1 0 1 2 => 0 1 2 3 4
+                return int(x)+2
+            else: # # -2 -1 0 1 2 => 0 1 2
+                tmp = int(x)
+                if tmp < 0:
+                    return 0
+                elif tmp == 0:
+                    return 1
+                elif tmp >0 :
+                    return 2
+
+    def read_tree(self, line, label_line):
+        # FIXED: tree.idx, also tree dict() use base 1 as it was in dataset
+        # parents is list base 0, keep idx-1
+        # labels is list base 0, keep idx-1
+        parents = map(int,line.split()) # split each number and turn to int
+        trees = dict() # this is dict
+        root = None
+        labels = map(self.parse_dlabel_token, label_line.split())
+        for i in xrange(1,len(parents)+1):
+            #if not trees[i-1] and parents[i-1]!=-1:
+            if i not in trees.keys() and parents[i-1]!=-1:
+                idx = i
+                prev = None
+                while True:
+                    parent = parents[idx-1]
+                    if parent == -1:
+                        break
+                    tree = Tree()
+                    if prev is not None:
+                        tree.add_child(prev)
+                    trees[idx] = tree
+                    tree.idx = idx # -1 remove -1 here to prevent embs[tree.idx -1] = -1 while tree.idx = 0
+                    tree.gold_label = labels[idx-1] # add node label
+                    #if trees[parent-1] is not None:
+                    if parent in trees.keys():
+                        trees[parent].add_child(tree)
+                        break
+                    elif parent==0:
+                        root = tree
+                        break
+                    else:
+                        prev = tree
+                        idx = parent
+        return root
+
+    def read_labels(self, filename):
+        # Not in used
+        with open(filename,'r') as f:
+            labels = map(lambda x: float(x), f.readlines())
+            labels = torch.Tensor(labels)
+        return labels
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Run the download script, compile the Java helpers in lib/ against the
+# Stanford parser jars, then preprocess the SST data.
+set -e  # stop at the first failing command
+python2.7 scripts/download.py
+
+CLASSPATH="lib:lib/stanford-parser/stanford-parser.jar:lib/stanford-parser/stanford-parser-3.5.1-models.jar"
+# Quoted so the classpath is always passed to javac as a single argument.
+javac -cp "$CLASSPATH" lib/*.java
+python2.7 scripts/preprocess-sst.py
+