Ticket #31: diffs3

File diffs3, 13.1 KB (added by Richard Boulton, 20 years ago)

Argh. Another patch, this time thoroughly tested.

Line 
1Index: ChangeLog
2===================================================================
3RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
4retrieving revision 1.2316
5diff -u -r1.2316 ChangeLog
6--- ChangeLog 22 May 2004 23:57:31 -0000 1.2316
7+++ ChangeLog 26 May 2004 15:43:31 -0000
8@@ -1,3 +1,16 @@
9+Wed May 26 01:10:07 BST 2004 Richard Boulton <richard@tartarus.org>
10+
11+ * matcher/multimatch.cc: When collapsing, keep track of the number
12+ of collapses performed, and use this information to modify the
13+ bounds and estimate of the number of matches.
14+ * include/xapian/enquire.h: Update documentation comments for
15+ MSet::get_matches_*() functions to make clear that collapsing and
16+ cutoffs are taken into account. (Previously, the most likely
17+ interpretation of the comments was that they wouldn't be taken
18+ into account, but the implementation was that percentage cutoffs
19+ were taken into account.) Due to this ambiguity, I think it is
20+ reasonable to say this isn't an API change.
21+
22 Sun May 23 00:56:41 BST 2004 Olly Betts <olly@survex.com>
23
24 * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
25Index: include/xapian/enquire.h
26===================================================================
27RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
28retrieving revision 1.18
29diff -u -r1.18 enquire.h
30--- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18
31+++ include/xapian/enquire.h 26 May 2004 15:43:35 -0000
32@@ -130,7 +130,10 @@
33 Xapian::doccount get_firstitem() const;
34
35 /** A lower bound on the number of documents in the database which
36- * have a weight greater than zero.
37+ * match the query.
38+ *
39+ * This figure takes into account collapsing of duplicates,
40+ * and weighting cutoff values.
41 *
42 * This number is usually considerably less than the actual number
43 * of documents which match the query.
44@@ -138,7 +141,10 @@
45 Xapian::doccount get_matches_lower_bound() const;
46
47 /** An estimate for the number of documents in the database which
48- * have a weight greater than zero.
49+ * match the query.
50+ *
51+ * This figure takes into account collapsing of duplicates,
52+ * and weighting cutoff values.
53 *
54 * This value is returned because there is sometimes a request to
55 * display such information. However, our experience is that
56@@ -148,8 +154,11 @@
57 */
58 Xapian::doccount get_matches_estimated() const;
59
60- /** An upper bound on the number of documents in the database with
61- * a weight greater than zero.
62+ /** An upper bound on the number of documents in the database which
63+ * match the query.
64+ *
65+ * This figure takes into account collapsing of duplicates,
66+ * and weighting cutoff values.
67 *
68 * This number is usually considerably greater than the actual
69 * number of documents which match the query.
70Index: matcher/multimatch.cc
71===================================================================
72RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
73retrieving revision 1.162
74diff -u -r1.162 multimatch.cc
75--- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162
76+++ matcher/multimatch.cc 26 May 2004 15:43:39 -0000
77@@ -431,11 +431,19 @@
78 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
79 Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
80 Xapian::doccount matches_estimated = pl->get_termfreq_est();
81+ Xapian::doccount duplicates_found = 0;
82+ Xapian::doccount documents_considered = 0;
83
84 // Check if any results have been asked for (might just be wanting
85 // maxweight)
86 if (maxitems == 0) {
87 delete pl;
88+ if (collapse_key != Xapian::valueno(-1)) {
89+ // Lower bound must be set to no more than 1, since it's possible that
90+ // all hits will be collapsed to a single hit.
91+ if (matches_lower_bound > 1) matches_lower_bound = 1;
92+ }
93+
94 mset = Xapian::MSet(new Xapian::MSet::Internal(
95 first,
96 matches_upper_bound,
97@@ -550,6 +558,7 @@
98 }
99
100 bool pushback = true;
101+ documents_considered++;
102
103 // Perform collapsing on key if requested.
104 if (collapse_key != Xapian::valueno(-1)) {
105@@ -567,6 +576,7 @@
106 collapse_tab.insert(make_pair(new_item.collapse_key,
107 make_pair(new_item, Xapian::weight(0))));
108 } else {
109+ duplicates_found++;
110 const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
111 // FIXME: what about sort_bands == 1 case here?
112 if (mcmp(old_item, new_item)) {
113@@ -613,7 +623,7 @@
114 }
115 }
116 }
117-
118+
119 // OK, actually add the item to the mset.
120 if (pushback) {
121 ++docs_matched;
122@@ -764,7 +774,7 @@
123 double denom = 0;
124 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
125 denom += i->second.termweight;
126-
127+
128 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
129 Assert(percent_scale <= denom);
130 denom *= greatest_wt;
131@@ -802,7 +812,7 @@
132 }
133 percent_scale *= 100.0;
134 }
135-
136+
137 if (sort_bands > 1) {
138 sort(items.begin(), items.end(),
139 MSetSortCmp(db, sort_bands, percent_scale,
140@@ -818,21 +828,54 @@
141 Assert(percent_cutoff || docs_matched == items.size());
142 matches_lower_bound = matches_upper_bound = matches_estimated
143 = items.size();
144- } else if (percent_cutoff) {
145- // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
146- Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
147- matches_estimated);
148- matches_estimated = max((size_t)new_est, items.size());
149- // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
150-
151- // Very likely an underestimate, but we can't really do better without
152- // checking further matches... Only possibility would be to track how
153- // many docs made the min weight test but didn't make the candidate set
154- // since the last greatest_wt change, which we could use if the top
155- // documents matched all the prob terms.
156- matches_lower_bound = items.size();
157- // matches_upper_bound could be reduced by the number of documents
158- // which fail the min weight test
159+ } else {
160+ if (percent_cutoff) {
161+ // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
162+ Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
163+ matches_estimated);
164+ matches_estimated = max((size_t)new_est, items.size());
165+ // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
166+
167+ // Very likely an underestimate, but we can't really do better without
168+ // checking further matches... Only possibility would be to track how
169+ // many docs made the min weight test but didn't make the candidate set
170+ // since the last greatest_wt change, which we could use if the top
171+ // documents matched all the prob terms.
172+ matches_lower_bound = items.size();
173+ // matches_upper_bound could be reduced by the number of documents
174+ // which fail the min weight test
175+ }
176+
177+ if (collapse_key != Xapian::valueno(-1)) {
178+ // Lower bound must be set to no more than the number of collapse
179+ // values we've got, since it's possible that all further hits
180+ // will be collapsed to a single value.
181+ matches_lower_bound = collapse_tab.size();
182+
183+ // The estimate for the number of hits can be modified by
184+ // multiplying it by the rate at which we've been finding
185+ // duplicates.
186+ if (documents_considered > 0) {
187+ double collapse_rate =
188+ double(duplicates_found) / double(documents_considered);
189+ matches_estimated -=
190+ (Xapian::doccount)(double(matches_estimated) *
191+ collapse_rate);
192+ }
193+
194+ // We can safely reduce the upper bound by the number of
195+ // duplicates we've seen.
196+ matches_upper_bound -= duplicates_found;
197+
198+ matches_estimated = max(matches_estimated, matches_lower_bound);
199+ matches_estimated = min(matches_estimated, matches_upper_bound);
200+ } else if (!percent_cutoff) {
201+ Assert(docs_matched <= matches_upper_bound);
202+ if (docs_matched > matches_lower_bound)
203+ matches_lower_bound = docs_matched;
204+ if (docs_matched > matches_estimated)
205+ matches_estimated = docs_matched;
206+ }
207 }
208
209 DEBUGLINE(MATCH, items.size() << " items in potential mset");
210@@ -856,7 +899,9 @@
211 "docs_matched = " << docs_matched << ", " <<
212 "matches_lower_bound = " << matches_lower_bound << ", " <<
213 "matches_estimated = " << matches_estimated << ", " <<
214- "matches_upper_bound = " << matches_upper_bound);
215+ "matches_upper_bound = " << matches_upper_bound << ", " <<
216+ "duplicates_found = " << duplicates_found << ", " <<
217+ "documents_considered = " << documents_considered);
218
219 if (sort_bands <= 1 && !items.empty()) {
220 DEBUGLINE(MATCH, "sorting");
221@@ -870,14 +915,6 @@
222
223 Assert(matches_estimated >= matches_lower_bound);
224 Assert(matches_estimated <= matches_upper_bound);
225-
226- if (!percent_cutoff) {
227- Assert(docs_matched <= matches_upper_bound);
228- if (docs_matched > matches_lower_bound)
229- matches_lower_bound = docs_matched;
230- if (docs_matched > matches_estimated)
231- matches_estimated = docs_matched;
232- }
233
234 // We may need to qualify any collapse_count to see if the highest weight
235 // collapsed item would have qualified percent_cutoff
236Index: tests/api_db.cc
237===================================================================
238RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
239retrieving revision 1.169
240diff -u -r1.169 api_db.cc
241--- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169
242+++ tests/api_db.cc 26 May 2004 15:43:46 -0000
243@@ -1173,6 +1173,88 @@
244 return true;
245 }
246
247+// tests that collapse-on-key modifies the predicted bounds for the number of
248+// matches appropriately.
249+static bool test_collapsekey3()
250+{
251+ Xapian::Enquire enquire(get_simple_database());
252+ init_simple_enquire(enquire);
253+
254+ Xapian::MSet mymset1 = enquire.get_mset(0, 3);
255+
256+ for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
257+ enquire.set_collapse_key(value_no);
258+ Xapian::MSet mymset = enquire.get_mset(0, 3);
259+
260+ TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
261+ "Lower bound was not lower when performing collapse: don't know whether it worked.");
262+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
263+ "Upper bound was not lower when performing collapse: don't know whether it worked.");
264+
265+ map<string, Xapian::docid> values;
266+ Xapian::MSetIterator i = mymset.begin();
267+ for ( ; i != mymset.end(); ++i) {
268+ string value = i.get_document().get_value(value_no);
269+ TEST(values[value] == 0 || value == "");
270+ values[value] = *i;
271+ }
272+ }
273+
274+ // Test that, if no duplicates are found (eg, by collapsing on key 1000,
275+ // which has no entries), the upper bound stays the same, but the lower
276+ // bound drops.
277+ {
278+ Xapian::valueno value_no = 1000;
279+ enquire.set_collapse_key(value_no);
280+ Xapian::MSet mymset = enquire.get_mset(0, 3);
281+
282+ TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
283+ "Lower bound was not lower when performing collapse: don't know whether it worked.");
284+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
285+ "Upper bound was not equal when collapse turned on, but no duplicates found.");
286+
287+ map<string, Xapian::docid> values;
288+ Xapian::MSetIterator i = mymset.begin();
289+ for ( ; i != mymset.end(); ++i) {
290+ string value = i.get_document().get_value(value_no);
291+ TEST(values[value] == 0 || value == "");
292+ values[value] = *i;
293+ }
294+ }
295+
296+ return true;
297+}
298+
299+// tests that collapse-on-key modifies the predicted bounds for the number of
300+// matches appropriately even when no results are requested.
301+static bool test_collapsekey4()
302+{
303+ Xapian::Enquire enquire(get_simple_database());
304+ init_simple_enquire(enquire);
305+
306+ Xapian::MSet mymset1 = enquire.get_mset(0, 0);
307+
308+ for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
309+ enquire.set_collapse_key(value_no);
310+ Xapian::MSet mymset = enquire.get_mset(0, 0);
311+
312+ TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
313+ "Lower bound was not 1 when performing collapse but not asking for any results.");
314+ TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
315+ "Upper bound was changed when performing collapse but not asking for any results.");
316+
317+ map<string, Xapian::docid> values;
318+ Xapian::MSetIterator i = mymset.begin();
319+ for ( ; i != mymset.end(); ++i) {
320+ string value = i.get_document().get_value(value_no);
321+ TEST(values[value] == 0 || value == "");
322+ values[value] = *i;
323+ }
324+ }
325+
326+ return true;
327+}
328+
329 // tests a reversed boolean query
330 static bool test_reversebool1()
331 {
332@@ -3299,6 +3381,8 @@
333 /// The tests which require a database which supports values > 0 sensibly
334 test_desc multivalue_tests[] = {
335 {"collapsekey1", test_collapsekey1},
336+ {"collapsekey3", test_collapsekey3},
337+ {"collapsekey4", test_collapsekey4},
338 {0, 0}
339 };
340