-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdm.pl
executable file
·132 lines (116 loc) · 4.44 KB
/
dm.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/perl
#
# Perl subroutine that generates Indri dependence model queries.
#
# Written by: Don Metzler ([email protected])
# Last update: 06/27/2005
#
# Feel free to distribute, edit, modify, or mangle this code as you see fit. If you make any interesting
# changes please email me a copy.
#
# For more technical details, see:
#
# * Metzler, D. and Croft, W.B., "A Markov Random Field Model for Term Dependencies," ACM SIGIR 2005.
#
# * Metzler, D., Strohman, T., Turtle, H., and Croft, W.B., "Indri at TREC 2004: Terabyte Track," TREC 2004.
#
# * http://ciir.cs.umass.edu/~metzler/
#
# MODIFICATIONS
# - Updated by Jamie Callan: 02/11/2015
# Modified to support a less cryptic Indri-like query language.
# #combine --> #and, #1 --> #near/1, #weight --> #wand, and #uw --> #window/
#
# NOTES
#
# * this script assumes that the query string has already been parsed and that all characters
# that are not compatible with Indri's query language have been removed.
#
# * it is not advisable to do a 'full dependence' variant on long strings because of the exponential
# number of terms that will result. it is suggested that the 'sequential dependence' variant be
# used for long strings. either that, or split up long strings into smaller cohesive chunks and
# apply the 'full dependence' variant to each of the chunks.
#
# * the unordered features use a window size of 4 * number of terms within the phrase. this has been
# found to work well across a wide range of collections and topics. however, this may need to be
# modified on an individual basis.
#
# command-line driver: rewrites every <text>...</text> query found in an Indri
# query file as a dependence model query and prints the result to STDOUT.
#
# usage: dm.pl <query-file> <sd|fd> <term-weight> <ordered-weight> <unordered-weight>
#
# lines that do not contain a <text> element are echoed unchanged.
use strict;
use warnings;

my $file  = $ARGV[0];
my $model = $ARGV[1];    # sd or fd
my $w1    = $ARGV[2];    # weight of the term features
my $w2    = $ARGV[3];    # weight of the ordered features
my $w3    = $ARGV[4];    # weight of the unordered features

die "usage: $0 <query-file> <sd|fd> <term-weight> <ordered-weight> <unordered-weight>\n"
    unless defined $file;

# three-argument open with a lexical handle; fail loudly instead of silently
# producing empty output when the file cannot be read
open( my $in, '<', $file ) or die "cannot open '$file': $!\n";

while ( my $line = <$in> ) {
    chomp($line);

    # capture the text around the element as well, so that markup sharing the
    # line with <text>...</text> is written back instead of being dropped
    if ( $line =~ m{\A(.*?)<text>([^<]*)</text>(.*)\z}s ) {
        my ( $before, $text, $after ) = ( $1, $2, $3 );

        # periods are replaced by spaces before the query is split into terms
        $text =~ s/\./ /g;

        print $before
            . "<text>"
            . formulate_query( lc($text), $model, $w1, $w2, $w3 )
            . "</text>"
            . $after . "\n";
    }
    else {
        print "$line\n";
    }
}

close($in);
# example usage
#print formulate_query( "white house rose garden", "sd", 0.5, 0.25, 0.25 ) . "\n\n";
#print formulate_query( "white house rose garden", "fd", 0.8, 0.1, 0.1 ) . "\n\n";
#
# formulates an Indri dependence model query from query text and feature weights
#
# arguments:
# * query - string containing original query terms separated by whitespace
# * type - string. "sd" for the sequential dependence variant; any other value
#   (including undef) selects the full dependence variant.
# * wt[0] - weight assigned to term features
# * wt[1] - weight assigned to ordered (#1) features
# * wt[2] - weight assigned to unordered (#uwN) features
#   missing weights are treated as 0.
#
# returns:
# * "" when the query contains no terms
# * "#combine( t1 t2 ... )" when there is a single term or when both dependence
#   weights are 0
# * otherwise "#weight( ... )" holding one weighted #combine(...) per feature
#   class whose weight is non-zero
#
# notes:
# * "sd" emits one #1 and one #uw8 feature per pair of adjacent terms, listed
#   from the last pair to the first. the pairs are built by index, so the number
#   of terms is not limited by the width of a bitmask.
# * "fd" enumerates every subset of two or more terms (2 ** number of terms
#   iterations). contiguous subsets additionally yield a #1 feature; every
#   subset yields a #uwN feature with N = 4 * subset size.
#
sub formulate_query {
    my ( $q, $type, @wt ) = @_;

    # trim whitespace from beginning and end of query string
    $q = "" unless defined $q;
    $q =~ s/^\s+|\s+$//g;

    my @terms     = split( /\s+/, $q );
    my $num_terms = @terms;

    # nothing to formulate for an empty query, whatever the weights are
    return "" if $num_terms == 0;

    # missing weights count as 0; supplied weights keep their original spelling
    # so that they are printed exactly as given
    my @w = map { defined $wt[$_] ? $wt[$_] : 0 } 0 .. 2;

    # term features (f_T)
    my $queryT = "#combine( " . join( " ", @terms ) . " ";

    # skip the rest of the processing if we're just interested in term
    # features or if we only have 1 term
    if ( ( $w[1] == 0.0 && $w[2] == 0.0 ) || $num_terms == 1 ) {
        return $queryT . ")";
    }

    my $queryO = "#combine(";
    my $queryU = "#combine(";

    if ( defined $type && $type eq "sd" ) {

        # sequential dependence: adjacent pairs only, last pair first
        for ( my $j = $num_terms - 2 ; $j >= 0 ; $j-- ) {
            my ( $left, $right ) = @terms[ $j, $j + 1 ];
            my $pair = "$left $right ";
            $queryO .= " #1( $pair) ";
            $queryU .= " #uw8( $pair) ";    # window = 4 * 2 terms
        }
    }
    else {

        # full dependence: bit j of the mask (most significant first) selects term j
        for ( my $i = 1 ; $i < 2**$num_terms ; $i++ ) {
            my $bits   = sprintf( "%0${num_terms}b", $i );
            my @subset = map { $terms[$_] }
                grep { substr( $bits, $_, 1 ) eq "1" } 0 .. $num_terms - 1;

            # skip these, since we already took care of the term features
            next if @subset == 1;

            my $extracted_terms = join( " ", @subset ) . " ";

            # a single run of 'on' bits means the words form a contiguous
            # phrase: ordered features (f_O)
            if ( $bits =~ /^0*11+0*$/ ) {
                $queryO .= " #1( $extracted_terms) ";
            }

            # every subset of terms: unordered features (f_U)
            my $window = 4 * scalar(@subset);
            $queryU .= " #uw$window( $extracted_terms) ";
        }
    }

    my $query = "#weight(";
    if ( $w[0] != 0.0 ) { $query .= " $w[0] $queryT)"; }
    if ( $w[1] != 0.0 && $queryO ne "#combine(" ) { $query .= " $w[1] $queryO)"; }
    if ( $w[2] != 0.0 && $queryU ne "#combine(" ) { $query .= " $w[2] $queryU)"; }

    # return "" if we couldn't formulate anything
    if ( $query eq "#weight(" ) { return ""; }

    return $query . " )";
}