# -*- coding: utf-8 -*-
"""
Created on 10 March 2022
@author: Amin
"""
# Import libraries and functions
import argparse
import pickle
import numpy as np
import matplotlib.pyplot as plt
import gensim.corpora as corpora
from gensim.models import CoherenceModel, LdaMulticore
import sys
from random import sample
from time import strftime, gmtime
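# Redirect all subsequent print() output to a log file on disk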
sys.stdout = open("LDA_train_log.txt", "w")
"""
=============================================================================
This module trains multiple LDA models on the corpora and finds the best model based on the Coherence measure.
Training is done in 2 stapes:
Initial Training: Search for the optimal number of topics over a large interval, by training LDA models on a
fraction of the whole dataset. A large step size is used.
Fine Tuning: The more precise optimal number of topics is found by searching around the intial number of topics
using a smaller step size and training the LDA models on the dataset.
To run:
>>> python LDA_train.py --dictionary RF_lda_dict --tokens RF_tokens.txt --passes 10 --chunksize 0.25 --njobs 15
=============================================================================
"""
parser = argparse.ArgumentParser(description='LDA model trainer')
### data and file related arguments
parser.add_argument('--dictionary', type=str, default='RF_lda_dict', help='path to the saved dictionary object')
parser.add_argument('--tokens', type=str, default='RF_tokens.txt', help='path to the tokenized text data')
parser.add_argument('--save_model', type=str, default='RF_lda_model', help='path to save the trained LDA model')
### arguments related to Initial Training
parser.add_argument('--num_topics_start', type=int, default=100, help='min number of topics to be trained')
parser.add_argument('--num_topics_limit', type=int, default=500, help='max number of topics to be trained')
parser.add_argument('--topic_lr_L', type=int, default=25, help='step size for the number of topics during initial training')
parser.add_argument('--init_passes', type=int, default=2, help='Number of passes through the corpus during initial training')
parser.add_argument('--sample_size', type=float, default=0.25, help='fraction of documents to be used in initial training stage')
### arguments related to Fine Tuning
parser.add_argument('--topic_lr_s', type=int, default=5, help='step size for the number of topics during fine tuning')
parser.add_argument('--passes', type=int, default=10, help='Number of passes through the corpus during fine tuning')
parser.add_argument('--chunksize', type=float, default=None, help='fraction of documents to be used in each training chunk')
### Model and optimization arguments
parser.add_argument('--stop_factor', type=int, default=3, help='early-stopping window: training stops when the current score is worse than the previous stop_factor scores')
parser.add_argument('--njobs', type=int, default=None, help='number of cpu cores to be used for training')
args = parser.parse_args()
print(f"{strftime('%D %H:%M', gmtime())} | Loading data ...\n")
# Load saved dictionary object
lda_dict = corpora.Dictionary.load(args.dictionary)
# Load text data tokens
with open(args.tokens, "rb") as fp:
    tokens = pickle.load(fp)
# Create corpus from Dictionary
lda_corpus = [lda_dict.doc2bow(text) for text in tokens]
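# Each document is now a bag-of-words: a list of (token_id, token_count) tuples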
if args.chunksize:
    # Set the number of documents to be used in each training chunk
    chunksize = int(len(tokens)*args.chunksize)
else:
    chunksize = 4000  # default chunk size from Hoffman et al. (2010)
def coherence_optimizer(dictionary, corpus, limit, start, step, npass = args.init_passes, init_train=False):
"""
Compute coherence for num_topics in a specific range
Parameters:
----------
dictionary : Gensim dictionary
corpus : Gensim corpus
limit : Max num of topics
Returns:
-------
coherence_models : Coherence values corresponding to the LDA model with respective number of topics
"""
    # Create initial list of coherence scores
    CVs = args.stop_factor*[np.inf]
    coherence_models = {}
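    # Seeding the history with +inf guarantees at least stop_factor models are
    # trained before the early-stopping check below can trigger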
    for num_topics in range(start, limit, step):
        # Train a multicore LDA model
        model = LdaMulticore(
            corpus=corpus, id2word=dictionary, num_topics=num_topics,
            random_state=101, workers=args.njobs,
            alpha='asymmetric', passes=npass,
            chunksize=chunksize, decay=0.5, offset=64  # decay and offset from Hoffman et al. (2010)
        )
        cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        cm_score = cm.get_coherence()
        CVs.append(cm_score)
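        # Note: u_mass coherence is typically negative; this script treats lower
        # scores as better, which is why min() is used for model selection below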
        if init_train:
            coherence_models[num_topics] = cm_score
        else:
            coherence_models[num_topics] = (cm_score, model)
        print(f"{strftime('%D %H:%M', gmtime())} | Model with {num_topics} topics >>> Coherence score: {cm_score:.5f}")
        # Stop early if the current score is worse (higher) than all of the previous stop_factor scores
        if all(cm_score > i for i in CVs[-args.stop_factor-1:-1]):
            print(f"\n {strftime('%D %H:%M', gmtime())} | Stopping criteria met ... \n")
            break
    return coherence_models
print(f"{strftime('%D %H:%M', gmtime())} | Training LDA model started.\n")
print(f"Finding optimal number of topics with learning rate = {args.topic_lr_L}\n")
# Create a random sample of the corpus
init_lda_corpus = sample(lda_corpus, k=int(args.sample_size*len(lda_corpus)))
init_coherences = coherence_optimizer(
    lda_dict, init_lda_corpus,
    limit=args.num_topics_limit, start=args.num_topics_start, step=args.topic_lr_L,
    npass=args.init_passes, init_train=True
)
# Identify the initial optimal number of topics based on coherence scores
init_best_NT = min(init_coherences, key=lambda x: init_coherences[x])
print(f"{strftime('%D %H:%M', gmtime())} | Fine tuning the number of topics with learning rate = {args.topic_lr_s}\n")
# Set new range for number of topics
num_topics_start = max(int(init_best_NT - args.topic_lr_L), 2)
num_topics_limit = int(init_best_NT + args.topic_lr_L) + 1
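# The fine-tuning search covers ±topic_lr_L topics around the initial optimum,
# scanned with the smaller step size topic_lr_s on the full corpus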
coherence_model = coherence_optimizer(
    lda_dict, lda_corpus,
    limit=num_topics_limit, start=num_topics_start, step=args.topic_lr_s,
    npass=args.passes, init_train=False
)
print(f"{strftime('%D %H:%M', gmtime())} | Training ended.\n")
# Identify the best model based on coherence scores
best_num_topics = min(coherence_model, key=lambda x: coherence_model[x][0])
print(f"Number of topics for the best model: {best_num_topics}\n")
best_model = coherence_model[best_num_topics][1]
best_model.save(args.save_model)
print(f"{strftime('%D %H:%M', gmtime())} | Best LDA model saved to disk.\n")
sys.stdout.close()
sys.stdout = sys.__stdout__
num_topics = list(coherence_model.keys())
coherence_values = [-x[0] for x in coherence_model.values()]
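# Scores are negated so that higher values in the plot indicate better coherence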
fig = plt.figure(figsize=(10,6))
plt.plot(num_topics, coherence_values, alpha=0.5)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score (negated u_mass)")
plt.legend(["coherence values"], loc='best')
plt.savefig('models.png')
plt.show()