[week5] Simplify the project format to ipython notebook

aborg-dev · aborg-dev · commit b53ea4d0852c · 2020-06-28T15:36:01.000+01:00
diff --git a/README.md b/README.md
@@ -46,4 +46,4 @@ Two options here:
 
 It might take a significant amount of time and resources to run the assignments code, but we expect that an average laptop is enough to accomplish the tasks. All assignments were tested in the Docker on Mac with 8GB RAM. If you have memory errors, that could be caused by not tested configurations or inefficient code. Consider reporting these cases or double-checking your code.
 
-For the final project, you will need to set up AWS machine - see [AWS tutorial here](AWS-tutorial.md). You are also welcome to try it out earlier during the course.
+If you want to run the course code on an AWS machine, we have prepared an [AWS tutorial](AWS-tutorial.md).
diff --git a/week5/dialogue_manager.py b/week5/dialogue_manager.py
@@ -0,0 +1,90 @@
+import os
+from sklearn.metrics.pairwise import pairwise_distances_argmin
+
+from chatterbot import ChatBot
+from chatterbot.trainers import ChatterBotCorpusTrainer
+from utils import *
+
+
+class ThreadRanker(object):
+    """Picks the most similar thread for a question among the threads of one tag.
+
+    Word embeddings are loaded once in the constructor; the thread embeddings
+    of a tag are unpickled on every call of `get_best_thread`.
+    """
+    def __init__(self, paths):
+        """Loads the word embeddings and remembers the thread embeddings folder.
+
+        Args:
+          paths - mapping that provides the 'WORD_EMBEDDINGS' and
+            'THREAD_EMBEDDINGS_FOLDER' entries.
+        """
+        self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS'])
+        self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER']
+
+    def __load_embeddings_by_tag(self, tag_name):
+        """Unpickles (thread_ids, thread_embeddings) from '<folder>/<tag_name>.pkl'."""
+        embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl")
+        thread_ids, thread_embeddings = unpickle_file(embeddings_path)
+        return thread_ids, thread_embeddings
+
+    def get_best_thread(self, question, tag_name):
+        """ Returns id of the most similar thread for the question.
+            The search is performed across the threads with a given tag.
+
+            Template method: the two placeholder assignments below have to be
+            filled in; `best_thread` is used as an index into `thread_ids`.
+        """
+        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
+
+        # HINT: you have already implemented a similar routine in the 3rd assignment.
+        
+        question_vec = #### YOUR CODE HERE ####
+        best_thread = #### YOUR CODE HERE ####
+        
+        return thread_ids[best_thread]
+
+
+class DialogueManager(object):
+    """Answers a question with either the chit-chat bot or a StackOverflow thread.
+
+    Template class: the assignments marked `#### YOUR CODE HERE ####` have to
+    be filled in before this module is valid Python.
+    """
+    def __init__(self, paths):
+        """Unpickles the models listed in `paths` and sets up the chit-chat bot.
+
+        Args:
+          paths - mapping that provides the 'INTENT_RECOGNIZER',
+            'TFIDF_VECTORIZER' and 'TAG_CLASSIFIER' entries, plus the entries
+            read by ThreadRanker.
+        """
+        print("Loading resources...")
+
+        # Intent recognition:
+        self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER'])
+        self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER'])
+
+        # Filled with (tag, thread_id) in generate_answer.
+        self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s'
+
+        # Goal-oriented part:
+        self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER'])
+        self.thread_ranker = ThreadRanker(paths)
+        self.__init_chitchat_bot()
+
+    def __init_chitchat_bot(self):
+        """Initializes self.chitchat_bot with some conversational model.
+
+        Raises:
+          NotImplementedError - until the placeholder below is implemented and
+            the `raise` statement is removed.
+        """
+
+        # Hint: you might want to create and train chatterbot.ChatBot here.
+        # Create an instance of the ChatBot class.
+        # Create a trainer (chatterbot.trainers.ChatterBotCorpusTrainer) for the ChatBot.
+        # Train the ChatBot with "chatterbot.corpus.english" param.
+
+        ########################
+        #### YOUR CODE HERE ####
+        ########################
+
+        # remove this when you're done
+        # The literals are concatenated as-is, so "download " needs its trailing space.
+        raise NotImplementedError(
+            "Open dialogue_manager.py and fill with your code. In case of Google Colab, download "
+            "(https://github.com/hse-aml/natural-language-processing/blob/master/project/dialogue_manager.py), "
+            "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD")
+
+    def generate_answer(self, question):
+        """Combines stackoverflow and chitchat parts using intent recognition.
+
+        Returns the chit-chat response when the recognized intent equals
+        'dialogue'; otherwise returns ANSWER_TEMPLATE filled with the predicted
+        tag and the id of the best matching thread.
+        """
+
+        # Recognize intent of the question using `intent_recognizer`.
+        # Don't forget to prepare question and calculate features for the question.
+
+        prepared_question = #### YOUR CODE HERE ####
+        features = #### YOUR CODE HERE ####
+        intent = #### YOUR CODE HERE ####
+
+        # Chit-chat part:
+        if intent == 'dialogue':
+            # Pass question to chitchat_bot to generate a response.
+            response = #### YOUR CODE HERE ####
+            return response
+
+        # Goal-oriented part:
+        else:
+            # Pass features to tag_classifier to get predictions.
+            tag = #### YOUR CODE HERE ####
+
+            # Pass prepared_question to thread_ranker to get predictions.
+            thread_id = #### YOUR CODE HERE ####
+
+            return self.ANSWER_TEMPLATE % (tag, thread_id)
diff --git a/week5/utils.py b/week5/utils.py
@@ -0,0 +1,76 @@
+import nltk
+import pickle
+import re
+import numpy as np
+
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+
+# Paths for all resources for the bot.
+# The keys match the entries that the dialogue manager code looks up in its
+# `paths` argument (e.g. paths['WORD_EMBEDDINGS']).
+# NOTE(review): the values look relative to the working directory - confirm.
+RESOURCE_PATH = {
+    'INTENT_RECOGNIZER': 'intent_recognizer.pkl',
+    'TAG_CLASSIFIER': 'tag_classifier.pkl',
+    'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl',
+    'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags',
+    'WORD_EMBEDDINGS': 'data/word_embeddings.tsv',
+}
+
+
+def text_prepare(text):
+    """Performs tokenization and simple preprocessing.
+
+    Lowercases the text, replaces each of the characters /(){}[]|@,; with a
+    space, removes every character other than 0-9, a-z, space, '#', '+' and
+    '_', and drops English stopwords.
+
+    Args:
+      text - string to preprocess.
+    Returns:
+      the remaining tokens joined by single spaces.
+    """
+    # Raw strings: `\[`, `\]` and `\|` are regex escapes, not string escapes.
+    # In a plain literal they are invalid escape sequences that Python only
+    # passes through with a warning (SyntaxWarning since 3.12).
+    replace_by_space_re = re.compile(r'[/(){}\[\]\|@,;]')
+    good_symbols_re = re.compile(r'[^0-9a-z #+_]')
+    stopwords_set = set(stopwords.words('english'))
+
+    text = text.lower()
+    text = replace_by_space_re.sub(' ', text)
+    text = good_symbols_re.sub('', text)
+
+    # split() never yields empty tokens, so the joined string has no leading,
+    # trailing or repeated whitespace and needs no extra strip().
+    return ' '.join(x for x in text.split() if x not in stopwords_set)
+
+
+def load_embeddings(embeddings_path):
+    """Loads pre-trained word embeddings from tsv file.
+    Args:
+      embeddings_path - path to the embeddings file.
+    Returns:
+      embeddings - dict mapping words to vectors;
+      embeddings_dim - dimension of the vectors.
+    Raises:
+      NotImplementedError - until the placeholder below is implemented and
+        the `raise` statement is removed.
+    """
+
+    # Hint: you have already implemented a similar routine in the 3rd assignment.
+    # Note that here you also need to know the dimension of the loaded embeddings.
+    # When you load the embeddings, use numpy.float32 type as dtype
+
+    ########################
+    #### YOUR CODE HERE ####
+    ########################
+
+    # remove this when you're done
+    # The literals are concatenated as-is, so "download " needs its trailing space.
+    raise NotImplementedError(
+        "Open utils.py and fill with your code. In case of Google Colab, download "
+        "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), "
+        "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD")
+
+
+def question_to_vec(question, embeddings, dim):
+    """Transforms a string to an embedding by averaging word embeddings.
+    Args:
+      question - string to transform;
+      embeddings - word embeddings to average (presumably the dict returned
+        by load_embeddings - confirm against the caller);
+      dim - dimension of the embedding vectors.
+    Raises:
+      NotImplementedError - until the placeholder below is implemented and
+        the `raise` statement is removed.
+    """
+
+    # Hint: you have already implemented exactly this function in the 3rd assignment.
+
+    ########################
+    #### YOUR CODE HERE ####
+    ########################
+
+    # remove this when you're done
+    # The literals are concatenated as-is, so "download " needs its trailing space.
+    raise NotImplementedError(
+        "Open utils.py and fill with your code. In case of Google Colab, download "
+        "(https://github.com/hse-aml/natural-language-processing/blob/master/project/utils.py), "
+        "edit locally and upload using '> arrow on the left edge' -> Files -> UPLOAD")
+
+
+def unpickle_file(filename):
+    """Opens `filename` in binary mode and returns the object pickled in it."""
+    with open(filename, mode='rb') as pickled:
+        contents = pickle.load(pickled)
+    return contents
diff --git a/week5/week5-project.ipynb b/week5/week5-project.ipynb

Original file line number	Diff line number	Diff line change
`@@ -46,4 +46,4 @@ Two options here:`
`46`	`46`
`47`	`47`	`It might take a significant amount of time and resources to run the assignments code, but we expect that an average laptop is enough to accomplish the tasks. All assignments were tested in the Docker on Mac with 8GB RAM. If you have memory errors, that could be caused by not tested configurations or inefficient code. Consider reporting these cases or double-checking your code.`
`48`	`48`
`49`		`-For the final project, you will need to set up AWS machine - see [AWS tutorial here](AWS-tutorial.md). You are also welcome to try it out earlier during the course.`
	`49`	`+If you want to run the course code on an AWS machine, we have prepared an [AWS tutorial](AWS-tutorial.md).`