Add file

susanli2016 · web-flow · commit 328d43bc1ac4 · 2017-07-16T21:54:52.000-04:00
diff --git a/Topic Modeling.ipynb b/Topic Modeling.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Toy corpus: five short documents with overlapping vocabulary.\n",
+    "doc_complete = [\n",
+    "    \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\",\n",
+    "    \"My father spends a lot of time driving my sister around to dance practice.\",\n",
+    "    \"Doctors suggest that driving may cause increased stress and blood pressure.\",\n",
+    "    \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\",\n",
+    "    \"Health experts say that Sugar is not good for your lifestyle.\",\n",
+    "]\n",
+    "\n",
+    "# Keep the individual documents addressable by name as well.\n",
+    "doc1, doc2, doc3, doc4, doc5 = doc_complete"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import string\n",
+    "stop = set(stopwords.words('english'))\n",
+    "exclude = set(string.punctuation)\n",
+    "lemma = WordNetLemmatizer()\n",
+    "\n",
+    "\n",
+    "def clean(doc):\n",
+    "    \"\"\"Normalize one document for topic modeling.\n",
+    "\n",
+    "    The text is lower-cased and split on whitespace. Stop words are dropped,\n",
+    "    punctuation characters are deleted from the remaining tokens, and each\n",
+    "    resulting word is lemmatized.\n",
+    "\n",
+    "    Args:\n",
+    "        doc: Raw document text.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned words joined by single spaces.\n",
+    "    \"\"\"\n",
+    "    words = []\n",
+    "    for token in doc.lower().split():\n",
+    "        # Check the stop list against the raw token (catches contractions such\n",
+    "        # as \"don't\") and against the token with edge punctuation trimmed, so\n",
+    "        # that \"it.\" or \"(the\" is filtered just like \"it\" or \"the\".\n",
+    "        if token in stop or token.strip(string.punctuation) in stop:\n",
+    "            continue\n",
+    "        word = ''.join(ch for ch in token if ch not in exclude)\n",
+    "        if word:  # tokens made only of punctuation disappear entirely\n",
+    "            words.append(lemma.lemmatize(word))\n",
+    "    return ' '.join(words)\n",
+    "\n",
+    "\n",
+    "doc_clean = [clean(doc).split() for doc in doc_complete]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gensim\n",
+    "from gensim import corpora\n",
+    "# Build the gensim vocabulary from the tokenized documents.\n",
+    "dictionary = corpora.Dictionary(doc_clean)\n",
+    "\n",
+    "# Convert every tokenized document into its bag-of-words representation.\n",
+    "doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "Lda = gensim.models.ldamodel.LdaModel\n",
+    "\n",
+    "# LDA training is stochastic; fixing random_state makes the learned topics\n",
+    "# reproducible across kernel restarts (Restart & Run All).\n",
+    "ldamodel = Lda(\n",
+    "    doc_term_matrix,\n",
+    "    num_topics=3,\n",
+    "    id2word=dictionary,\n",
+    "    passes=50,\n",
+    "    random_state=42,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Show 4 words for each of the 3 topics.\n",
+    "topic_summaries = ldamodel.print_topics(num_topics=3, num_words=4)\n",
+    "print(topic_summaries)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}