RetrievalEval.cpp
/*==========================================================================
* Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved.
*
* Use of the Lemur Toolkit for Language Modeling and Information Retrieval
* is subject to the terms of the software license set forth in the LICENSE
* file included with this software, and also available at
* http://www.lemurproject.org/license.html
*
*==========================================================================
*/
/*! \page Retrieval Evaluation Application within the light Lemur toolkit

 This application runs retrieval experiments to evaluate different retrieval models.

 Usage: RetrievalEval parameter_file

 Please refer to the namespace LocalParameter for the parameters that can be set
 in the parameter_file.
*/
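/* A minimal parameter_file sketch (the paths are hypothetical placeholders;
   the "key = value;" layout follows the usual Lemur parameter-file
   conventions):

     index = /path/to/index.key;
     query = /path/to/query.stream;
     result = res.okapi;
     weightScheme = Okapi;
     resultCount = 100;

   The result file is written as a TREC-style ranked list, one line per
   retrieved document.
*/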
#include "common_headers.hpp"
#include "IndexManager.hpp"
#include "BasicDocStream.hpp"
#include "Param.hpp"
#include "String.hpp"
#include "IndexedReal.hpp"
#include "ScoreAccumulator.hpp"
#include "ResultFile.hpp"
#include "TextQueryRep.hpp"
using namespace lemur::api;
namespace LocalParameter {
  std::string databaseIndex; // the index of the documents
  std::string queryStream;   // the file containing the query stream
  std::string resultFile;    // the name of the result file
  std::string weightScheme;  // the weighting scheme
  int resultCount;           // the number of top-ranked documents to return for each query

  void get() {
    // the quoted strings are the parameter names to use in the parameter file
    databaseIndex = ParamGetString("index");
    queryStream = ParamGetString("query");
    resultFile = ParamGetString("result", "res");
    weightScheme = ParamGetString("weightScheme", "Okapi"); // change this parameter to select a different algorithm
    resultCount = ParamGetInt("resultCount", 100);
  }
};
void GetAppParam()
{
  LocalParameter::get();
}
// compute the weight of a matched term
double computeRawTFWeight(int docID,
                          int termID,
                          int docTermFreq,
                          double qryTermWeight,
                          Index *ind)
{
  // raw TF weighting scheme: weight = tf(t,d) * qtf(t,q)
  return docTermFreq * qryTermWeight;
}
// compute the weight of a matched term
double computeRawTFIDFWeight(int docID,
                             int termID,
                             int docTermFreq,
                             double qryTermWeight,
                             Index *ind)
{
  // raw TF * IDF weighting scheme: weight = tf(t,d) * qtf(t,q) * log(N / df(t))
  int totalNoDocs = ind->docCount();
  int noDocsContainTerm = ind->docCount(termID);
  // cast to double to avoid integer division in the IDF ratio
  return docTermFreq * qryTermWeight * log((double)totalNoDocs / noDocsContainTerm);
}
// compute the weight of a matched term
double computeLogTFIDFWeight(int docID,
                             int termID,
                             int docTermFreq,
                             double qryTermWeight,
                             Index *ind)
{
  // log TF * IDF weighting scheme: weight = (log(tf(t,d)) + 1) * qtf(t,q) * log(N / df(t))
  int totalNoDocs = ind->docCount();
  int noDocsContainTerm = ind->docCount(termID);
  // cast to double to avoid integer division in the IDF ratio
  return (log((double)docTermFreq) + 1) * qryTermWeight * log((double)totalNoDocs / noDocsContainTerm);
}
// compute the weight of a matched term
double computeOkapiWeight(int docID,
                          int termID,
                          int docTermFreq,
                          double qryTermWeight,
                          Index *ind)
{
  // Okapi (BM25-style) weighting scheme: weight = TF * QTF * IDF, where
  //   TF  = tf / (tf + 0.5 + 1.5 * docLen / avgDocLen)
  //   QTF = (8 + qtf) / (7 + qtf)
  //   IDF = log((N - df + 0.5) / (df + 0.5))
  double avgDocLen = ind->docLengthAvg();
  int totalNoDocs = ind->docCount();
  int noDocsContainTerm = ind->docCount(termID);
  int length = ind->docLength(docID);
  double tfComponent = docTermFreq / (docTermFreq + 0.5 + 1.5 * length / avgDocLen);
  double qtfComponent = (8 + qryTermWeight) / (7 + qryTermWeight);
  double idfComponent = log((totalNoDocs - noDocsContainTerm + 0.5) / (noDocsContainTerm + 0.5));
  return tfComponent * qtfComponent * idfComponent;
}
// compute the weight of a matched term
double computeCustomWeight(int docID,
                           int termID,
                           int docTermFreq,
                           double qryTermWeight,
                           Index *ind)
{
  /*!!!!! Implement customized weighting scheme !!!!!*/
  // placeholder return so the function has defined behavior until a custom
  // scheme is implemented; falls back to the raw TF weight
  return computeRawTFWeight(docID, termID, docTermFreq, qryTermWeight, ind);
}
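/* A minimal sketch of one possible custom scheme (not part of the original
   code): pivoted-length-normalized log TF combined with the same log-based
   IDF used above. The name computePivotedWeight and the slope value 0.75 are
   illustrative choices, not tuned or prescribed values.

double computePivotedWeight(int docID,
                            int termID,
                            int docTermFreq,
                            double qryTermWeight,
                            Index *ind)
{
  double slope = 0.75;                    // illustrative pivot slope
  double avgDocLen = ind->docLengthAvg(); // average document length
  int length = ind->docLength(docID);     // length of this document
  // pivoted length normalization: longer-than-average documents are penalized
  double norm = 1 - slope + slope * length / avgDocLen;
  double tf = (log((double)docTermFreq) + 1) / norm;
  double idf = log((double)ind->docCount() / ind->docCount(termID));
  return tf * qryTermWeight * idf;
}
*/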
// compute the adjusted score
double computeAdjustedScore(double origScore, // the score from the accumulator
                            int docID,        // doc ID
                            Index *ind)       // index
{
  // no adjustment for now
  return origScore;
}
// compute the adjusted score
double computeCustomAdjustedScore(double origScore, // the score from the accumulator
                                  int docID,        // doc ID
                                  Index *ind)       // index
{
  /*!!!!! Implement customized method for adjusting score !!!!!*/
  return origScore;
}
void ComputeQryArr(Document *qryDoc, double *qryArr, Index *ind) {
  // compute the array representation of the query; it holds the term
  // frequencies of the original query, e.g., a term occurring twice in the
  // query gets qryArr[termID] == 2
  for (int t = 1; t <= ind->termCountUnique(); t++) {
    qryArr[t] = 0;
  }
  qryDoc->startTermIteration();
  while (qryDoc->hasMore()) {
    const Term *qryTerm = qryDoc->nextTerm();
    int qryTermID = ind->term(qryTerm->spelling());
    if (qryTermID > 0) { // skip out-of-vocabulary terms, for which term() returns 0
      qryArr[qryTermID]++;
    }
  }
}
void Retrieval(double *qryArr, IndexedRealVector &results, Index *ind) {
  // retrieve documents with respect to the array representation of the query
  lemur::retrieval::ArrayAccumulator scoreAccumulator(ind->docCount());
  scoreAccumulator.reset();
  for (int t = 1; t <= ind->termCountUnique(); t++) {
    if (qryArr[t] > 0) {
      // fetch the inverted entries for a specific query term
      DocInfoList *docList = ind->docInfoList(t);
      // iterate over all matching documents
      docList->startIteration();
      while (docList->hasMore()) {
        DocInfo *matchInfo = docList->nextEntry();
        // for each matched term, calculate the evidence
        double wt;
        if (LocalParameter::weightScheme == "RawTF") {
          wt = computeRawTFWeight(matchInfo->docID(),     // doc ID
                                  t,                      // term ID
                                  matchInfo->termCount(), // freq of term t in this doc
                                  qryArr[t],              // freq of term t in the query
                                  ind);
        } else if (LocalParameter::weightScheme == "RawTFIDF") {
          wt = computeRawTFIDFWeight(matchInfo->docID(), t, matchInfo->termCount(), qryArr[t], ind);
        } else if (LocalParameter::weightScheme == "LogTFIDF") {
          wt = computeLogTFIDFWeight(matchInfo->docID(), t, matchInfo->termCount(), qryArr[t], ind);
        } else if (LocalParameter::weightScheme == "Okapi") {
          wt = computeOkapiWeight(matchInfo->docID(), t, matchInfo->termCount(), qryArr[t], ind);
        } else if (LocalParameter::weightScheme == "Custom") {
          wt = computeCustomWeight(matchInfo->docID(), t, matchInfo->termCount(), qryArr[t], ind);
        } else {
          cerr << "The weighting scheme " << LocalParameter::weightScheme << " is not supported" << endl;
          exit(1);
        }
        scoreAccumulator.incScore(matchInfo->docID(), wt);
      }
      delete docList;
    }
  }
  // adjust the scores of the documents when necessary
  double s;
  for (int d = 1; d <= ind->docCount(); d++) {
    if (!scoreAccumulator.findScore(d, s)) {
      s = 0;
    }
    if (LocalParameter::weightScheme == "Custom") {
      results.PushValue(d, computeCustomAdjustedScore(s,     // the score from the accumulator
                                                      d,     // doc ID
                                                      ind)); // index
    } else if (LocalParameter::weightScheme == "RawTF" ||
               LocalParameter::weightScheme == "RawTFIDF" ||
               LocalParameter::weightScheme == "LogTFIDF" ||
               LocalParameter::weightScheme == "Okapi") {
      // the built-in schemes all share the same (identity) score adjustment
      results.PushValue(d, computeAdjustedScore(s, d, ind));
    } else {
      cerr << "The weighting scheme " << LocalParameter::weightScheme << " is not supported" << endl;
      exit(1);
    }
  }
}
/// A retrieval evaluation program
int AppMain(int argc, char *argv[]) {
  // Step 1: Open the index file
  Index *ind;
  try {
    ind = IndexManager::openIndex(LocalParameter::databaseIndex);
  }
  catch (Exception &ex) {
    ex.writeMessage(cerr);
    throw Exception("RetEval", "Can't open index, check parameter index");
  }
  // Step 2: Open the query file
  DocStream *qryStream;
  try {
    qryStream = new lemur::parse::BasicDocStream(LocalParameter::queryStream);
  }
  catch (Exception &ex) {
    ex.writeMessage(cerr);
    throw Exception("RetEval", "Can't open query file, check parameter query");
  }
  // Step 3: Create the result file
  ofstream result(LocalParameter::resultFile.c_str());
  ResultFile resultFile(1);
  resultFile.openForWrite(result, *ind);
  // go through each query
  qryStream->startDocIteration();
  while (qryStream->hasMore()) {
    Document *qryDoc = qryStream->nextDoc();
    const char *queryID = qryDoc->getID();
    cout << "query: " << queryID << endl;
    // the array that contains the weights of the query terms (for the original query)
    double *queryArr = new double[ind->termCountUnique() + 1];
    ComputeQryArr(qryDoc, queryArr, ind);
    IndexedRealVector results(ind->docCount());
    results.clear();
    Retrieval(queryArr, results, ind);
    results.Sort();
    resultFile.writeResults(queryID, &results, LocalParameter::resultCount);
    delete [] queryArr; // use array delete for memory allocated with new[]
  }
  result.close();
  delete qryStream;
  delete ind;
  return 0;
}