Ticket #31: diffs

File diffs, 12.4 KB (added by Richard Boulton, 20 years ago)

Patch containing proposed fix

Line 
1? diffs
2? matcher/multimatch.cc.new
3Index: ChangeLog
4===================================================================
5RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
6retrieving revision 1.2316
7diff -u -r1.2316 ChangeLog
8--- ChangeLog 22 May 2004 23:57:31 -0000 1.2316
9+++ ChangeLog 26 May 2004 02:02:11 -0000
10@@ -1,3 +1,16 @@
11+Wed May 26 01:10:07 BST 2004 Richard Boulton <richard@tartarus.org>
12+
13+ * matcher/multimatch.cc: When collapsing, keep track of the number
14+ of collapses performed, and use this information to modify the
15+ bounds and estimate of the number of matches.
16+ * include/xapian/enquire.h: Update documentation comments for
17+ MSet::get_matches_*() functions to make clear that collapsing and
18+ cutoffs are taken into account. (Previously, the most likely
19+ interpretation of the comments was that they wouldn't be taken
20+ into account, but the implementation was that percentage cutoffs
21+ were taken into account.) Due to this ambiguity, I think it is
22+ reasonable to say this isn't an API change.
23+
24 Sun May 23 00:56:41 BST 2004 Olly Betts <olly@survex.com>
25
26 * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
27Index: include/xapian/enquire.h
28===================================================================
29RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
30retrieving revision 1.18
31diff -u -r1.18 enquire.h
32--- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18
33+++ include/xapian/enquire.h 26 May 2004 02:02:15 -0000
34@@ -130,7 +130,10 @@
35 Xapian::doccount get_firstitem() const;
36
37 /** A lower bound on the number of documents in the database which
38- * have a weight greater than zero.
39+ * match the query.
40+ *
41+ * This figure takes into account collapsing of duplicates,
42+ * and weighting cutoff values.
43 *
44 * This number is usually considerably less than the actual number
45 * of documents which match the query.
46@@ -138,7 +141,10 @@
47 Xapian::doccount get_matches_lower_bound() const;
48
49 /** An estimate for the number of documents in the database which
50- * have a weight greater than zero.
51+ * match the query.
52+ *
53+ * This figure takes into account collapsing of duplicates,
54+ * and weighting cutoff values.
55 *
56 * This value is returned because there is sometimes a request to
57 * display such information. However, our experience is that
58@@ -148,8 +154,11 @@
59 */
60 Xapian::doccount get_matches_estimated() const;
61
62- /** An upper bound on the number of documents in the database with
63- * a weight greater than zero.
64+ /** An upper bound on the number of documents in the database which
65+ * match the query.
66+ *
67+ * This figure takes into account collapsing of duplicates,
68+ * and weighting cutoff values.
69 *
70 * This number is usually considerably greater than the actual
71 * number of documents which match the query.
72Index: matcher/multimatch.cc
73===================================================================
74RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
75retrieving revision 1.162
76diff -u -r1.162 multimatch.cc
77--- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162
78+++ matcher/multimatch.cc 26 May 2004 02:02:19 -0000
79@@ -431,11 +431,19 @@
80 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
81 Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
82 Xapian::doccount matches_estimated = pl->get_termfreq_est();
83+ Xapian::doccount duplicates_found = 0;
84+ Xapian::doccount documents_considered = 0;
85
86 // Check if any results have been asked for (might just be wanting
87 // maxweight)
88 if (maxitems == 0) {
89 delete pl;
90+ if (collapse_key != Xapian::valueno(-1)) {
91+ // Lower bound must be set to no more than 1, since it's possible that
92+ // all hits will be collapsed to a single hit.
93+ if (matches_lower_bound > 1) matches_lower_bound = 1;
94+ }
95+
96 mset = Xapian::MSet(new Xapian::MSet::Internal(
97 first,
98 matches_upper_bound,
99@@ -550,6 +558,7 @@
100 }
101
102 bool pushback = true;
103+ documents_considered++;
104
105 // Perform collapsing on key if requested.
106 if (collapse_key != Xapian::valueno(-1)) {
107@@ -567,6 +576,7 @@
108 collapse_tab.insert(make_pair(new_item.collapse_key,
109 make_pair(new_item, Xapian::weight(0))));
110 } else {
111+ duplicates_found++;
112 const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
113 // FIXME: what about sort_bands == 1 case here?
114 if (mcmp(old_item, new_item)) {
115@@ -613,7 +623,7 @@
116 }
117 }
118 }
119-
120+
121 // OK, actually add the item to the mset.
122 if (pushback) {
123 ++docs_matched;
124@@ -764,7 +774,7 @@
125 double denom = 0;
126 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
127 denom += i->second.termweight;
128-
129+
130 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
131 Assert(percent_scale <= denom);
132 denom *= greatest_wt;
133@@ -802,7 +812,7 @@
134 }
135 percent_scale *= 100.0;
136 }
137-
138+
139 if (sort_bands > 1) {
140 sort(items.begin(), items.end(),
141 MSetSortCmp(db, sort_bands, percent_scale,
142@@ -818,21 +828,49 @@
143 Assert(percent_cutoff || docs_matched == items.size());
144 matches_lower_bound = matches_upper_bound = matches_estimated
145 = items.size();
146- } else if (percent_cutoff) {
147- // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
148- Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
149- matches_estimated);
150- matches_estimated = max((size_t)new_est, items.size());
151- // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
152-
153- // Very likely an underestimate, but we can't really do better without
154- // checking further matches... Only possibility would be to track how
155- // many docs made the min weight test but didn't make the candidate set
156- // since the last greatest_wt change, which we could use if the top
157- // documents matched all the prob terms.
158- matches_lower_bound = items.size();
159- // matches_upper_bound could be reduced by the number of documents
160- // which fail the min weight test
161+ } else {
162+ if (percent_cutoff) {
163+ // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
164+ Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
165+ matches_estimated);
166+ matches_estimated = max((size_t)new_est, items.size());
167+ // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
168+
169+ // Very likely an underestimate, but we can't really do better without
170+ // checking further matches... Only possibility would be to track how
171+ // many docs made the min weight test but didn't make the candidate set
172+ // since the last greatest_wt change, which we could use if the top
173+ // documents matched all the prob terms.
174+ matches_lower_bound = items.size();
175+ // matches_upper_bound could be reduced by the number of documents
176+ // which fail the min weight test
177+ }
178+
179+ if (collapse_key != Xapian::valueno(-1)) {
180+ // Lower bound must be set to no more than the number of hits
181+ // we've got, since it's possible that all further hits will be
182+ // collapsed to a single value, and that all the futher
183+ // that all hits will be collapsed to a single hit.
184+ matches_lower_bound = items.size();
185+
186+ // The estimate for the number of hits can be modified by
187+ // multiplying it by the rate at which we've been finding
188+ // duplicates.
189+ if (documents_considered > 0) {
190+ double collapse_rate =
191+ double(duplicates_found) / double(documents_considered);
192+ matches_estimated =
193+ (Xapian::doccount)(double(matches_estimated) *
194+ collapse_rate);
195+ }
196+
197+ // We can safely reduce the upper bound by the number of
198+ // duplicates we've seen.
199+ matches_upper_bound -= duplicates_found;
200+
201+ matches_estimated = max(matches_estimated, matches_lower_bound);
202+ matches_estimated = min(matches_estimated, matches_upper_bound);
203+ }
204 }
205
206 DEBUGLINE(MATCH, items.size() << " items in potential mset");
207@@ -856,7 +894,9 @@
208 "docs_matched = " << docs_matched << ", " <<
209 "matches_lower_bound = " << matches_lower_bound << ", " <<
210 "matches_estimated = " << matches_estimated << ", " <<
211- "matches_upper_bound = " << matches_upper_bound);
212+ "matches_upper_bound = " << matches_upper_bound << ", " <<
213+ "duplicates_found = " << duplicates_found << ", " <<
214+ "documents_considered = " << documents_considered);
215
216 if (sort_bands <= 1 && !items.empty()) {
217 DEBUGLINE(MATCH, "sorting");
218Index: tests/api_db.cc
219===================================================================
220RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
221retrieving revision 1.169
222diff -u -r1.169 api_db.cc
223--- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169
224+++ tests/api_db.cc 26 May 2004 02:02:26 -0000
225@@ -1173,6 +1173,88 @@
226 return true;
227 }
228
229+// tests that collapse-on-key modifies the predicted bounds for the number of
230+// matches appropriately.
231+static bool test_collapsekey3()
232+{
233+ Xapian::Enquire enquire(get_simple_database());
234+ init_simple_enquire(enquire);
235+
236+ Xapian::MSet mymset1 = enquire.get_mset(0, 3);
237+
238+ for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
239+ enquire.set_collapse_key(value_no);
240+ Xapian::MSet mymset = enquire.get_mset(0, 3);
241+
242+ TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
243+ "Lower bound was not lower when performing collapse: don't know whether it worked.");
244+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
245+ "Upper bound was not lower when performing collapse: don't know whether it worked.");
246+
247+ map<string, Xapian::docid> values;
248+ Xapian::MSetIterator i = mymset.begin();
249+ for ( ; i != mymset.end(); ++i) {
250+ string value = i.get_document().get_value(value_no);
251+ TEST(values[value] == 0 || value == "");
252+ values[value] = *i;
253+ }
254+ }
255+
256+ // Test that, if no duplicates are found (eg, by collapsing on key 1000,
257+ // which has no entries), the upper bound stays the same, but the lower
258+ // bound drops.
259+ {
260+ Xapian::valueno value_no = 1000;
261+ enquire.set_collapse_key(value_no);
262+ Xapian::MSet mymset = enquire.get_mset(0, 3);
263+
264+ TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
265+ "Lower bound was not lower when performing collapse: don't know whether it worked.");
266+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
267+ "Upper bound was not equal when collapse turned on, but no duplicates found.");
268+
269+ map<string, Xapian::docid> values;
270+ Xapian::MSetIterator i = mymset.begin();
271+ for ( ; i != mymset.end(); ++i) {
272+ string value = i.get_document().get_value(value_no);
273+ TEST(values[value] == 0 || value == "");
274+ values[value] = *i;
275+ }
276+ }
277+
278+ return true;
279+}
280+
281+// tests that collapse-on-key modifies the predicted bounds for the number of
282+// matches appropriately even when no results are requested.
283+static bool test_collapsekey4()
284+{
285+ Xapian::Enquire enquire(get_simple_database());
286+ init_simple_enquire(enquire);
287+
288+ Xapian::MSet mymset1 = enquire.get_mset(0, 0);
289+
290+ for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
291+ enquire.set_collapse_key(value_no);
292+ Xapian::MSet mymset = enquire.get_mset(0, 0);
293+
294+ TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
295+ "Lower bound was not 1 when performing collapse but not asking for any results.");
296+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
297+ "Upper bound was changed when performing collapse but not asking for any results.");
298+
299+ map<string, Xapian::docid> values;
300+ Xapian::MSetIterator i = mymset.begin();
301+ for ( ; i != mymset.end(); ++i) {
302+ string value = i.get_document().get_value(value_no);
303+ TEST(values[value] == 0 || value == "");
304+ values[value] = *i;
305+ }
306+ }
307+
308+ return true;
309+}
310+
311 // tests a reversed boolean query
312 static bool test_reversebool1()
313 {
314@@ -3299,6 +3381,8 @@
315 /// The tests which require a database which supports values > 0 sensibly
316 test_desc multivalue_tests[] = {
317 {"collapsekey1", test_collapsekey1},
318+ {"collapsekey3", test_collapsekey3},
319+ {"collapsekey4", test_collapsekey4},
320 {0, 0}
321 };
322