Skip to content

Commit 328d43b

Browse files
authored
Add file
1 parent 79b38df commit 328d43b

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

Topic Modeling.ipynb

+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 17,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": [
11+
"doc1 = \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\"\n",
12+
"doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n",
13+
"doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n",
14+
"doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n",
15+
"doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n",
16+
"\n",
17+
"doc_complete = [doc1, doc2, doc3, doc4, doc5]"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 18,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"from nltk.corpus import stopwords\n",
27+
"from nltk.stem.wordnet import WordNetLemmatizer\n",
28+
"import string\n",
29+
"stop = set(stopwords.words('english'))\n",
30+
"exclude = set(string.punctuation)\n",
31+
"lemma = WordNetLemmatizer()\n",
32+
"\n",
33+
"def clean(doc):  # lowercase, drop stop words and punctuation, lemmatize; returns one space-joined string\n",
34+
" tokens = [tok.strip(string.punctuation) for tok in doc.lower().split()]  # trim edge punctuation so a stop word followed by ',' or '.' is still filtered\n",
35+
" stop_free = ' '.join(tok for tok in tokens if tok and tok not in stop)  # skip tokens that were only punctuation\n",
36+
" punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  # remove punctuation remaining inside words\n",
37+
" return ' '.join(lemma.lemmatize(word) for word in punc_free.split())\n",
38+
"doc_clean = [clean(doc).split() for doc in doc_complete]"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 19,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"import gensim\n",
48+
"from gensim import corpora\n",
49+
"dictionary = corpora.Dictionary(doc_clean)\n",
50+
"doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": 20,
56+
"metadata": {
57+
"collapsed": true
58+
},
59+
"outputs": [],
60+
"source": [
61+
"Lda = gensim.models.ldamodel.LdaModel  # short alias for the LDA model class\n",
62+
"ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50, random_state=42)  # seeded so topics are reproducible; re-run to refresh any saved output"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 27,
68+
"metadata": {},
69+
"outputs": [
70+
{
71+
"name": "stdout",
72+
"output_type": "stream",
73+
"text": [
74+
"[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
75+
]
76+
}
77+
],
78+
"source": [
79+
"print(ldamodel.print_topics(num_topics=3, num_words=4))"
80+
]
81+
}
82+
],
83+
"metadata": {
84+
"kernelspec": {
85+
"display_name": "Python 3",
86+
"language": "python",
87+
"name": "python3"
88+
},
89+
"language_info": {
90+
"codemirror_mode": {
91+
"name": "ipython",
92+
"version": 3
93+
},
94+
"file_extension": ".py",
95+
"mimetype": "text/x-python",
96+
"name": "python",
97+
"nbconvert_exporter": "python",
98+
"pygments_lexer": "ipython3",
99+
"version": "3.6.1"
100+
}
101+
},
102+
"nbformat": 4,
103+
"nbformat_minor": 2
104+
}

0 commit comments

Comments
 (0)