1 | ? diffs
|
---|
2 | ? matcher/multimatch.cc.new
|
---|
3 | Index: ChangeLog
|
---|
4 | ===================================================================
|
---|
5 | RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
|
---|
6 | retrieving revision 1.2316
|
---|
7 | diff -u -r1.2316 ChangeLog
|
---|
8 | --- ChangeLog 22 May 2004 23:57:31 -0000 1.2316
|
---|
9 | +++ ChangeLog 26 May 2004 02:02:11 -0000
|
---|
10 | @@ -1,3 +1,16 @@
|
---|
11 | +Wed May 26 01:10:07 BST 2004 Richard Boulton <richard@tartarus.org>
|
---|
12 | +
|
---|
13 | + * matcher/multimatch.cc: When collapsing, keep track of the number
|
---|
14 | + of collapses performed, and use this information to modify the
|
---|
15 | + bounds and estimate of the number of matches.
|
---|
16 | + * include/xapian/enquire.h: Update documentation comments for
|
---|
17 | + MSet::get_matches_*() functions to make clear that collapsing and
|
---|
18 | + cutoffs are taken into account. (Previously, the most likely
|
---|
19 | + interpretation of the comments was that they wouldn't be taken
|
---|
20 | + into account, but the implementation was that percentage cutoffs
|
---|
21 | + were taken into account.) Due to this ambiguity, I think it is
|
---|
22 | + reasonable to say this isn't an API change.
|
---|
23 | +
|
---|
24 | Sun May 23 00:56:41 BST 2004 Olly Betts <olly@survex.com>
|
---|
25 |
|
---|
26 | * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
|
---|
27 | Index: include/xapian/enquire.h
|
---|
28 | ===================================================================
|
---|
29 | RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
|
---|
30 | retrieving revision 1.18
|
---|
31 | diff -u -r1.18 enquire.h
|
---|
32 | --- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18
|
---|
33 | +++ include/xapian/enquire.h 26 May 2004 02:02:15 -0000
|
---|
34 | @@ -130,7 +130,10 @@
|
---|
35 | Xapian::doccount get_firstitem() const;
|
---|
36 |
|
---|
37 | /** A lower bound on the number of documents in the database which
|
---|
38 | - * have a weight greater than zero.
|
---|
39 | + * match the query.
|
---|
40 | + *
|
---|
41 | + * This figure takes into account collapsing of duplicates,
|
---|
42 | + * and weighting cutoff values.
|
---|
43 | *
|
---|
44 | * This number is usually considerably less than the actual number
|
---|
45 | * of documents which match the query.
|
---|
46 | @@ -138,7 +141,10 @@
|
---|
47 | Xapian::doccount get_matches_lower_bound() const;
|
---|
48 |
|
---|
49 | /** An estimate for the number of documents in the database which
|
---|
50 | - * have a weight greater than zero.
|
---|
51 | + * match the query.
|
---|
52 | + *
|
---|
53 | + * This figure takes into account collapsing of duplicates,
|
---|
54 | + * and weighting cutoff values.
|
---|
55 | *
|
---|
56 | * This value is returned because there is sometimes a request to
|
---|
57 | * display such information. However, our experience is that
|
---|
58 | @@ -148,8 +154,11 @@
|
---|
59 | */
|
---|
60 | Xapian::doccount get_matches_estimated() const;
|
---|
61 |
|
---|
62 | - /** An upper bound on the number of documents in the database with
|
---|
63 | - * a weight greater than zero.
|
---|
64 | + /** An upper bound on the number of documents in the database which
|
---|
65 | + * match the query.
|
---|
66 | + *
|
---|
67 | + * This figure takes into account collapsing of duplicates,
|
---|
68 | + * and weighting cutoff values.
|
---|
69 | *
|
---|
70 | * This number is usually considerably greater than the actual
|
---|
71 | * number of documents which match the query.
|
---|
72 | Index: matcher/multimatch.cc
|
---|
73 | ===================================================================
|
---|
74 | RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
|
---|
75 | retrieving revision 1.162
|
---|
76 | diff -u -r1.162 multimatch.cc
|
---|
77 | --- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162
|
---|
78 | +++ matcher/multimatch.cc 26 May 2004 02:02:19 -0000
|
---|
79 | @@ -431,11 +431,19 @@
|
---|
80 | Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
|
---|
81 | Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
|
---|
82 | Xapian::doccount matches_estimated = pl->get_termfreq_est();
|
---|
83 | + Xapian::doccount duplicates_found = 0;
|
---|
84 | + Xapian::doccount documents_considered = 0;
|
---|
85 |
|
---|
86 | // Check if any results have been asked for (might just be wanting
|
---|
87 | // maxweight)
|
---|
88 | if (maxitems == 0) {
|
---|
89 | delete pl;
|
---|
90 | + if (collapse_key != Xapian::valueno(-1)) {
|
---|
91 | + // Lower bound must be set to no more than 1, since it's possible that
|
---|
92 | + // all hits will be collapsed to a single hit.
|
---|
93 | + if (matches_lower_bound > 1) matches_lower_bound = 1;
|
---|
94 | + }
|
---|
95 | +
|
---|
96 | mset = Xapian::MSet(new Xapian::MSet::Internal(
|
---|
97 | first,
|
---|
98 | matches_upper_bound,
|
---|
99 | @@ -550,6 +558,7 @@
|
---|
100 | }
|
---|
101 |
|
---|
102 | bool pushback = true;
|
---|
103 | + documents_considered++;
|
---|
104 |
|
---|
105 | // Perform collapsing on key if requested.
|
---|
106 | if (collapse_key != Xapian::valueno(-1)) {
|
---|
107 | @@ -567,6 +576,7 @@
|
---|
108 | collapse_tab.insert(make_pair(new_item.collapse_key,
|
---|
109 | make_pair(new_item, Xapian::weight(0))));
|
---|
110 | } else {
|
---|
111 | + duplicates_found++;
|
---|
112 | const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
|
---|
113 | // FIXME: what about sort_bands == 1 case here?
|
---|
114 | if (mcmp(old_item, new_item)) {
|
---|
115 | @@ -613,7 +623,7 @@
|
---|
116 | }
|
---|
117 | }
|
---|
118 | }
|
---|
119 | -
|
---|
120 | +
|
---|
121 | // OK, actually add the item to the mset.
|
---|
122 | if (pushback) {
|
---|
123 | ++docs_matched;
|
---|
124 | @@ -764,7 +774,7 @@
|
---|
125 | double denom = 0;
|
---|
126 | for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
|
---|
127 | denom += i->second.termweight;
|
---|
128 | -
|
---|
129 | +
|
---|
130 | DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
|
---|
131 | Assert(percent_scale <= denom);
|
---|
132 | denom *= greatest_wt;
|
---|
133 | @@ -802,7 +812,7 @@
|
---|
134 | }
|
---|
135 | percent_scale *= 100.0;
|
---|
136 | }
|
---|
137 | -
|
---|
138 | +
|
---|
139 | if (sort_bands > 1) {
|
---|
140 | sort(items.begin(), items.end(),
|
---|
141 | MSetSortCmp(db, sort_bands, percent_scale,
|
---|
142 | @@ -818,21 +828,49 @@
|
---|
143 | Assert(percent_cutoff || docs_matched == items.size());
|
---|
144 | matches_lower_bound = matches_upper_bound = matches_estimated
|
---|
145 | = items.size();
|
---|
146 | - } else if (percent_cutoff) {
|
---|
147 | - // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
|
---|
148 | - Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
|
---|
149 | - matches_estimated);
|
---|
150 | - matches_estimated = max((size_t)new_est, items.size());
|
---|
151 | - // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
|
---|
152 | -
|
---|
153 | - // Very likely an underestimate, but we can't really do better without
|
---|
154 | - // checking further matches... Only possibility would be to track how
|
---|
155 | - // many docs made the min weight test but didn't make the candidate set
|
---|
156 | - // since the last greatest_wt change, which we could use if the top
|
---|
157 | - // documents matched all the prob terms.
|
---|
158 | - matches_lower_bound = items.size();
|
---|
159 | - // matches_upper_bound could be reduced by the number of documents
|
---|
160 | - // which fail the min weight test
|
---|
161 | + } else {
|
---|
162 | + if (percent_cutoff) {
|
---|
163 | + // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
|
---|
164 | + Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
|
---|
165 | + matches_estimated);
|
---|
166 | + matches_estimated = max((size_t)new_est, items.size());
|
---|
167 | + // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
|
---|
168 | +
|
---|
169 | + // Very likely an underestimate, but we can't really do better without
|
---|
170 | + // checking further matches... Only possibility would be to track how
|
---|
171 | + // many docs made the min weight test but didn't make the candidate set
|
---|
172 | + // since the last greatest_wt change, which we could use if the top
|
---|
173 | + // documents matched all the prob terms.
|
---|
174 | + matches_lower_bound = items.size();
|
---|
175 | + // matches_upper_bound could be reduced by the number of documents
|
---|
176 | + // which fail the min weight test
|
---|
177 | + }
|
---|
178 | +
|
---|
179 | + if (collapse_key != Xapian::valueno(-1)) {
|
---|
180 | + // Lower bound must be set to no more than the number of hits
|
---|
181 | + // we've got, since it's possible that all further hits will be
|
---|
182 | + // collapsed to a single value, and that all the futher
|
---|
183 | + // that all hits will be collapsed to a single hit.
|
---|
184 | + matches_lower_bound = items.size();
|
---|
185 | +
|
---|
186 | + // The estimate for the number of hits can be modified by
|
---|
187 | + // multiplying it by the rate at which we've been finding
|
---|
188 | + // duplicates.
|
---|
189 | + if (documents_considered > 0) {
|
---|
190 | + double collapse_rate =
|
---|
191 | + double(duplicates_found) / double(documents_considered);
|
---|
192 | + matches_estimated =
|
---|
193 | + (Xapian::doccount)(double(matches_estimated) *
|
---|
194 | + collapse_rate);
|
---|
195 | + }
|
---|
196 | +
|
---|
197 | + // We can safely reduce the upper bound by the number of
|
---|
198 | + // duplicates we've seen.
|
---|
199 | + matches_upper_bound -= duplicates_found;
|
---|
200 | +
|
---|
201 | + matches_estimated = max(matches_estimated, matches_lower_bound);
|
---|
202 | + matches_estimated = min(matches_estimated, matches_upper_bound);
|
---|
203 | + }
|
---|
204 | }
|
---|
205 |
|
---|
206 | DEBUGLINE(MATCH, items.size() << " items in potential mset");
|
---|
207 | @@ -856,7 +894,9 @@
|
---|
208 | "docs_matched = " << docs_matched << ", " <<
|
---|
209 | "matches_lower_bound = " << matches_lower_bound << ", " <<
|
---|
210 | "matches_estimated = " << matches_estimated << ", " <<
|
---|
211 | - "matches_upper_bound = " << matches_upper_bound);
|
---|
212 | + "matches_upper_bound = " << matches_upper_bound << ", " <<
|
---|
213 | + "duplicates_found = " << duplicates_found << ", " <<
|
---|
214 | + "documents_considered = " << documents_considered);
|
---|
215 |
|
---|
216 | if (sort_bands <= 1 && !items.empty()) {
|
---|
217 | DEBUGLINE(MATCH, "sorting");
|
---|
218 | Index: tests/api_db.cc
|
---|
219 | ===================================================================
|
---|
220 | RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
|
---|
221 | retrieving revision 1.169
|
---|
222 | diff -u -r1.169 api_db.cc
|
---|
223 | --- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169
|
---|
224 | +++ tests/api_db.cc 26 May 2004 02:02:26 -0000
|
---|
225 | @@ -1173,6 +1173,88 @@
|
---|
226 | return true;
|
---|
227 | }
|
---|
228 |
|
---|
229 | +// tests that collapse-on-key modifies the predicted bounds for the number of
|
---|
230 | +// matches appropriately.
|
---|
231 | +static bool test_collapsekey3()
|
---|
232 | +{
|
---|
233 | + Xapian::Enquire enquire(get_simple_database());
|
---|
234 | + init_simple_enquire(enquire);
|
---|
235 | +
|
---|
236 | + Xapian::MSet mymset1 = enquire.get_mset(0, 3);
|
---|
237 | +
|
---|
238 | + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
|
---|
239 | + enquire.set_collapse_key(value_no);
|
---|
240 | + Xapian::MSet mymset = enquire.get_mset(0, 3);
|
---|
241 | +
|
---|
242 | + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
|
---|
243 | + "Lower bound was not lower when performing collapse: don't know whether it worked.");
|
---|
244 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
|
---|
245 | + "Upper bound was not lower when performing collapse: don't know whether it worked.");
|
---|
246 | +
|
---|
247 | + map<string, Xapian::docid> values;
|
---|
248 | + Xapian::MSetIterator i = mymset.begin();
|
---|
249 | + for ( ; i != mymset.end(); ++i) {
|
---|
250 | + string value = i.get_document().get_value(value_no);
|
---|
251 | + TEST(values[value] == 0 || value == "");
|
---|
252 | + values[value] = *i;
|
---|
253 | + }
|
---|
254 | + }
|
---|
255 | +
|
---|
256 | + // Test that, if no duplicates are found (eg, by collapsing on key 1000,
|
---|
257 | + // which has no entries), the upper bound stays the same, but the lower
|
---|
258 | + // bound drops.
|
---|
259 | + {
|
---|
260 | + Xapian::valueno value_no = 1000;
|
---|
261 | + enquire.set_collapse_key(value_no);
|
---|
262 | + Xapian::MSet mymset = enquire.get_mset(0, 3);
|
---|
263 | +
|
---|
264 | + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
|
---|
265 | + "Lower bound was not lower when performing collapse: don't know whether it worked.");
|
---|
266 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
|
---|
267 | + "Upper bound was not equal when collapse turned on, but no duplicates found.");
|
---|
268 | +
|
---|
269 | + map<string, Xapian::docid> values;
|
---|
270 | + Xapian::MSetIterator i = mymset.begin();
|
---|
271 | + for ( ; i != mymset.end(); ++i) {
|
---|
272 | + string value = i.get_document().get_value(value_no);
|
---|
273 | + TEST(values[value] == 0 || value == "");
|
---|
274 | + values[value] = *i;
|
---|
275 | + }
|
---|
276 | + }
|
---|
277 | +
|
---|
278 | + return true;
|
---|
279 | +}
|
---|
280 | +
|
---|
281 | +// tests that collapse-on-key modifies the predicted bounds for the number of
|
---|
282 | +// matches appropriately even when no results are requested.
|
---|
283 | +static bool test_collapsekey4()
|
---|
284 | +{
|
---|
285 | + Xapian::Enquire enquire(get_simple_database());
|
---|
286 | + init_simple_enquire(enquire);
|
---|
287 | +
|
---|
288 | + Xapian::MSet mymset1 = enquire.get_mset(0, 0);
|
---|
289 | +
|
---|
290 | + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
|
---|
291 | + enquire.set_collapse_key(value_no);
|
---|
292 | + Xapian::MSet mymset = enquire.get_mset(0, 0);
|
---|
293 | +
|
---|
294 | + TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
|
---|
295 | + "Lower bound was not 1 when performing collapse but not asking for any results.");
|
---|
296 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
|
---|
297 | + "Upper bound was changed when performing collapse but not asking for any results.");
|
---|
298 | +
|
---|
299 | + map<string, Xapian::docid> values;
|
---|
300 | + Xapian::MSetIterator i = mymset.begin();
|
---|
301 | + for ( ; i != mymset.end(); ++i) {
|
---|
302 | + string value = i.get_document().get_value(value_no);
|
---|
303 | + TEST(values[value] == 0 || value == "");
|
---|
304 | + values[value] = *i;
|
---|
305 | + }
|
---|
306 | + }
|
---|
307 | +
|
---|
308 | + return true;
|
---|
309 | +}
|
---|
310 | +
|
---|
311 | // tests a reversed boolean query
|
---|
312 | static bool test_reversebool1()
|
---|
313 | {
|
---|
314 | @@ -3299,6 +3381,8 @@
|
---|
315 | /// The tests which require a database which supports values > 0 sensibly
|
---|
316 | test_desc multivalue_tests[] = {
|
---|
317 | {"collapsekey1", test_collapsekey1},
|
---|
318 | + {"collapsekey3", test_collapsekey3},
|
---|
319 | + {"collapsekey4", test_collapsekey4},
|
---|
320 | {0, 0}
|
---|
321 | };
|
---|
322 |
|
---|