Ticket #31: diffs2

File diffs2, 12.3 KB (added by Richard Boulton, 20 years ago)

Updated patch (corrects mistake in estimation)

Line 
1Index: ChangeLog
2===================================================================
3RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
4retrieving revision 1.2316
5diff -u -r1.2316 ChangeLog
6--- ChangeLog 22 May 2004 23:57:31 -0000 1.2316
7+++ ChangeLog 26 May 2004 15:07:53 -0000
8@@ -1,3 +1,16 @@
9+Wed May 26 01:10:07 BST 2004 Richard Boulton <richard@tartarus.org>
10+
11+ * matcher/multimatch.cc: When collapsing, keep track of the number
12+ of collapses performed, and use this information to modify the
13+ bounds and estimate of the number of matches.
14+ * include/xapian/enquire.h: Update documentation comments for
15+ MSet::get_matches_*() functions to make clear that collapsing and
16+ cutoffs are taken into account. (Previously, the most likely
17+ interpretation of the comments was that they wouldn't be taken
18+ into account, but the implementation was that percentage cutoffs
19+ were taken into account.) Due to this ambiguity, I think it is
20+ reasonable to say this isn't an API change.
21+
22 Sun May 23 00:56:41 BST 2004 Olly Betts <olly@survex.com>
23
24 * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
25Index: include/xapian/enquire.h
26===================================================================
27RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
28retrieving revision 1.18
29diff -u -r1.18 enquire.h
30--- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18
31+++ include/xapian/enquire.h 26 May 2004 15:07:58 -0000
32@@ -130,7 +130,10 @@
33 Xapian::doccount get_firstitem() const;
34
35 /** A lower bound on the number of documents in the database which
36- * have a weight greater than zero.
37+ * match the query.
38+ *
39+ * This figure takes into account collapsing of duplicates,
40+ * and weighting cutoff values.
41 *
42 * This number is usually considerably less than the actual number
43 * of documents which match the query.
44@@ -138,7 +141,10 @@
45 Xapian::doccount get_matches_lower_bound() const;
46
47 /** An estimate for the number of documents in the database which
48- * have a weight greater than zero.
49+ * match the query.
50+ *
51+ * This figure takes into account collapsing of duplicates,
52+ * and weighting cutoff values.
53 *
54 * This value is returned because there is sometimes a request to
55 * display such information. However, our experience is that
56@@ -148,8 +154,11 @@
57 */
58 Xapian::doccount get_matches_estimated() const;
59
60- /** An upper bound on the number of documents in the database with
61- * a weight greater than zero.
62+ /** An upper bound on the number of documents in the database which
63+ * match the query.
64+ *
65+ * This figure takes into account collapsing of duplicates,
66+ * and weighting cutoff values.
67 *
68 * This number is usually considerably greater than the actual
69 * number of documents which match the query.
70Index: matcher/multimatch.cc
71===================================================================
72RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
73retrieving revision 1.162
74diff -u -r1.162 multimatch.cc
75--- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162
76+++ matcher/multimatch.cc 26 May 2004 15:08:05 -0000
77@@ -431,11 +431,19 @@
78 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
79 Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
80 Xapian::doccount matches_estimated = pl->get_termfreq_est();
81+ Xapian::doccount duplicates_found = 0;
82+ Xapian::doccount documents_considered = 0;
83
84 // Check if any results have been asked for (might just be wanting
85 // maxweight)
86 if (maxitems == 0) {
87 delete pl;
88+ if (collapse_key != Xapian::valueno(-1)) {
89+ // Lower bound must be set to no more than 1, since it's possible that
90+ // all hits will be collapsed to a single hit.
91+ if (matches_lower_bound > 1) matches_lower_bound = 1;
92+ }
93+
94 mset = Xapian::MSet(new Xapian::MSet::Internal(
95 first,
96 matches_upper_bound,
97@@ -550,6 +558,7 @@
98 }
99
100 bool pushback = true;
101+ documents_considered++;
102
103 // Perform collapsing on key if requested.
104 if (collapse_key != Xapian::valueno(-1)) {
105@@ -567,6 +576,7 @@
106 collapse_tab.insert(make_pair(new_item.collapse_key,
107 make_pair(new_item, Xapian::weight(0))));
108 } else {
109+ duplicates_found++;
110 const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
111 // FIXME: what about sort_bands == 1 case here?
112 if (mcmp(old_item, new_item)) {
113@@ -613,7 +623,7 @@
114 }
115 }
116 }
117-
118+
119 // OK, actually add the item to the mset.
120 if (pushback) {
121 ++docs_matched;
122@@ -764,7 +774,7 @@
123 double denom = 0;
124 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
125 denom += i->second.termweight;
126-
127+
128 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
129 Assert(percent_scale <= denom);
130 denom *= greatest_wt;
131@@ -802,7 +812,7 @@
132 }
133 percent_scale *= 100.0;
134 }
135-
136+
137 if (sort_bands > 1) {
138 sort(items.begin(), items.end(),
139 MSetSortCmp(db, sort_bands, percent_scale,
140@@ -818,21 +828,49 @@
141 Assert(percent_cutoff || docs_matched == items.size());
142 matches_lower_bound = matches_upper_bound = matches_estimated
143 = items.size();
144- } else if (percent_cutoff) {
145- // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
146- Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
147- matches_estimated);
148- matches_estimated = max((size_t)new_est, items.size());
149- // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
150-
151- // Very likely an underestimate, but we can't really do better without
152- // checking further matches... Only possibility would be to track how
153- // many docs made the min weight test but didn't make the candidate set
154- // since the last greatest_wt change, which we could use if the top
155- // documents matched all the prob terms.
156- matches_lower_bound = items.size();
157- // matches_upper_bound could be reduced by the number of documents
158- // which fail the min weight test
159+ } else {
160+ if (percent_cutoff) {
161+ // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
162+ Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
163+ matches_estimated);
164+ matches_estimated = max((size_t)new_est, items.size());
165+ // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
166+
167+ // Very likely an underestimate, but we can't really do better without
168+ // checking further matches... Only possibility would be to track how
169+ // many docs made the min weight test but didn't make the candidate set
170+ // since the last greatest_wt change, which we could use if the top
171+ // documents matched all the prob terms.
172+ matches_lower_bound = items.size();
173+ // matches_upper_bound could be reduced by the number of documents
174+ // which fail the min weight test
175+ }
176+
177+ if (collapse_key != Xapian::valueno(-1)) {
178+ // Lower bound must be set to no more than the number of hits
179+ // we've got, since it's possible that all further hits will be
180+ // collapsed to a single value, and that all the futher
181+ // that all hits will be collapsed to a single hit.
182+ matches_lower_bound = items.size();
183+
184+ // The estimate for the number of hits can be modified by
185+ // multiplying it by the rate at which we've been finding
186+ // duplicates.
187+ if (documents_considered > 0) {
188+ double collapse_rate =
189+ double(duplicates_found) / double(documents_considered);
190+ matches_estimated -=
191+ (Xapian::doccount)(double(matches_estimated) *
192+ collapse_rate);
193+ }
194+
195+ // We can safely reduce the upper bound by the number of
196+ // duplicates we've seen.
197+ matches_upper_bound -= duplicates_found;
198+
199+ matches_estimated = max(matches_estimated, matches_lower_bound);
200+ matches_estimated = min(matches_estimated, matches_upper_bound);
201+ }
202 }
203
204 DEBUGLINE(MATCH, items.size() << " items in potential mset");
205@@ -856,7 +894,9 @@
206 "docs_matched = " << docs_matched << ", " <<
207 "matches_lower_bound = " << matches_lower_bound << ", " <<
208 "matches_estimated = " << matches_estimated << ", " <<
209- "matches_upper_bound = " << matches_upper_bound);
210+ "matches_upper_bound = " << matches_upper_bound << ", " <<
211+ "duplicates_found = " << duplicates_found << ", " <<
212+ "documents_considered = " << documents_considered);
213
214 if (sort_bands <= 1 && !items.empty()) {
215 DEBUGLINE(MATCH, "sorting");
216Index: tests/api_db.cc
217===================================================================
218RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
219retrieving revision 1.169
220diff -u -r1.169 api_db.cc
221--- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169
222+++ tests/api_db.cc 26 May 2004 15:08:10 -0000
223@@ -1173,6 +1173,88 @@
224 return true;
225 }
226
227+// tests that collapse-on-key modifies the predicted bounds for the number of
228+// matches appropriately.
229+static bool test_collapsekey3()
230+{
231+ Xapian::Enquire enquire(get_simple_database());
232+ init_simple_enquire(enquire);
233+
234+ Xapian::MSet mymset1 = enquire.get_mset(0, 3);
235+
236+ for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
237+ enquire.set_collapse_key(value_no);
238+ Xapian::MSet mymset = enquire.get_mset(0, 3);
239+
240+ TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
241+ "Lower bound was not lower when performing collapse: don't know whether it worked.");
242+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
243+ "Upper bound was not lower when performing collapse: don't know whether it worked.");
244+
245+ map<string, Xapian::docid> values;
246+ Xapian::MSetIterator i = mymset.begin();
247+ for ( ; i != mymset.end(); ++i) {
248+ string value = i.get_document().get_value(value_no);
249+ TEST(values[value] == 0 || value == "");
250+ values[value] = *i;
251+ }
252+ }
253+
254+ // Test that, if no duplicates are found (eg, by collapsing on key 1000,
255+ // which has no entries), the upper bound stays the same, but the lower
256+ // bound drops.
257+ {
258+ Xapian::valueno value_no = 1000;
259+ enquire.set_collapse_key(value_no);
260+ Xapian::MSet mymset = enquire.get_mset(0, 3);
261+
262+ TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
263+ "Lower bound was not lower when performing collapse: don't know whether it worked.");
264+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
265+ "Upper bound was not equal when collapse turned on, but no duplicates found.");
266+
267+ map<string, Xapian::docid> values;
268+ Xapian::MSetIterator i = mymset.begin();
269+ for ( ; i != mymset.end(); ++i) {
270+ string value = i.get_document().get_value(value_no);
271+ TEST(values[value] == 0 || value == "");
272+ values[value] = *i;
273+ }
274+ }
275+
276+ return true;
277+}
278+
279+// tests that collapse-on-key modifies the predicted bounds for the number of
280+// matches appropriately even when no results are requested.
281+static bool test_collapsekey4()
282+{
283+ Xapian::Enquire enquire(get_simple_database());
284+ init_simple_enquire(enquire);
285+
286+ Xapian::MSet mymset1 = enquire.get_mset(0, 0);
287+
288+ for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
289+ enquire.set_collapse_key(value_no);
290+ Xapian::MSet mymset = enquire.get_mset(0, 0);
291+
292+ TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
293+ "Lower bound was not 1 when performing collapse but not asking for any results.");
294+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
295+ "Upper bound was changed when performing collapse but not asking for any results.");
296+
297+ map<string, Xapian::docid> values;
298+ Xapian::MSetIterator i = mymset.begin();
299+ for ( ; i != mymset.end(); ++i) {
300+ string value = i.get_document().get_value(value_no);
301+ TEST(values[value] == 0 || value == "");
302+ values[value] = *i;
303+ }
304+ }
305+
306+ return true;
307+}
308+
309 // tests a reversed boolean query
310 static bool test_reversebool1()
311 {
312@@ -3299,6 +3381,8 @@
313 /// The tests which require a database which supports values > 0 sensibly
314 test_desc multivalue_tests[] = {
315 {"collapsekey1", test_collapsekey1},
316+ {"collapsekey3", test_collapsekey3},
317+ {"collapsekey4", test_collapsekey4},
318 {0, 0}
319 };
320