1 | Index: ChangeLog
|
---|
2 | ===================================================================
|
---|
3 | RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
|
---|
4 | retrieving revision 1.2316
|
---|
5 | diff -u -r1.2316 ChangeLog
|
---|
6 | --- ChangeLog 22 May 2004 23:57:31 -0000 1.2316
|
---|
7 | +++ ChangeLog 26 May 2004 15:43:31 -0000
|
---|
8 | @@ -1,3 +1,16 @@
|
---|
9 | +Wed May 26 01:10:07 BST 2004 Richard Boulton <richard@tartarus.org>
|
---|
10 | +
|
---|
11 | + * matcher/multimatch.cc: When collapsing, keep track of the number
|
---|
12 | + of collapses performed, and use this information to modify the
|
---|
13 | + bounds and estimate of the number of matches.
|
---|
14 | + * include/xapian/enquire.h: Update documentation comments for
|
---|
15 | + MSet::get_matches_*() functions to make clear that collapsing and
|
---|
16 | + cutoffs are taken into account. (Previously, the most likely
|
---|
17 | + interpretation of the comments was that they wouldn't be taken
|
---|
18 | + into account, but the implementation was that percentage cutoffs
|
---|
19 | + were taken into account.) Due to this ambiguity, I think it is
|
---|
20 | + reasonable to say this isn't an API change.
|
---|
21 | +
|
---|
22 | Sun May 23 00:56:41 BST 2004 Olly Betts <olly@survex.com>
|
---|
23 |
|
---|
24 | * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
|
---|
25 | Index: include/xapian/enquire.h
|
---|
26 | ===================================================================
|
---|
27 | RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
|
---|
28 | retrieving revision 1.18
|
---|
29 | diff -u -r1.18 enquire.h
|
---|
30 | --- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18
|
---|
31 | +++ include/xapian/enquire.h 26 May 2004 15:43:35 -0000
|
---|
32 | @@ -130,7 +130,10 @@
|
---|
33 | Xapian::doccount get_firstitem() const;
|
---|
34 |
|
---|
35 | /** A lower bound on the number of documents in the database which
|
---|
36 | - * have a weight greater than zero.
|
---|
37 | + * match the query.
|
---|
38 | + *
|
---|
39 | + * This figure takes into account collapsing of duplicates,
|
---|
40 | + * and weighting cutoff values.
|
---|
41 | *
|
---|
42 | * This number is usually considerably less than the actual number
|
---|
43 | * of documents which match the query.
|
---|
44 | @@ -138,7 +141,10 @@
|
---|
45 | Xapian::doccount get_matches_lower_bound() const;
|
---|
46 |
|
---|
47 | /** An estimate for the number of documents in the database which
|
---|
48 | - * have a weight greater than zero.
|
---|
49 | + * match the query.
|
---|
50 | + *
|
---|
51 | + * This figure takes into account collapsing of duplicates,
|
---|
52 | + * and weighting cutoff values.
|
---|
53 | *
|
---|
54 | * This value is returned because there is sometimes a request to
|
---|
55 | * display such information. However, our experience is that
|
---|
56 | @@ -148,8 +154,11 @@
|
---|
57 | */
|
---|
58 | Xapian::doccount get_matches_estimated() const;
|
---|
59 |
|
---|
60 | - /** An upper bound on the number of documents in the database with
|
---|
61 | - * a weight greater than zero.
|
---|
62 | + /** An upper bound on the number of documents in the database which
|
---|
63 | + * match the query.
|
---|
64 | + *
|
---|
65 | + * This figure takes into account collapsing of duplicates,
|
---|
66 | + * and weighting cutoff values.
|
---|
67 | *
|
---|
68 | * This number is usually considerably greater than the actual
|
---|
69 | * number of documents which match the query.
|
---|
70 | Index: matcher/multimatch.cc
|
---|
71 | ===================================================================
|
---|
72 | RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
|
---|
73 | retrieving revision 1.162
|
---|
74 | diff -u -r1.162 multimatch.cc
|
---|
75 | --- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162
|
---|
76 | +++ matcher/multimatch.cc 26 May 2004 15:43:39 -0000
|
---|
77 | @@ -431,11 +431,19 @@
|
---|
78 | Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
|
---|
79 | Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
|
---|
80 | Xapian::doccount matches_estimated = pl->get_termfreq_est();
|
---|
81 | + Xapian::doccount duplicates_found = 0;
|
---|
82 | + Xapian::doccount documents_considered = 0;
|
---|
83 |
|
---|
84 | // Check if any results have been asked for (might just be wanting
|
---|
85 | // maxweight)
|
---|
86 | if (maxitems == 0) {
|
---|
87 | delete pl;
|
---|
88 | + if (collapse_key != Xapian::valueno(-1)) {
|
---|
89 | + // Lower bound must be set to no more than 1, since it's possible that
|
---|
90 | + // all hits will be collapsed to a single hit.
|
---|
91 | + if (matches_lower_bound > 1) matches_lower_bound = 1;
|
---|
92 | + }
|
---|
93 | +
|
---|
94 | mset = Xapian::MSet(new Xapian::MSet::Internal(
|
---|
95 | first,
|
---|
96 | matches_upper_bound,
|
---|
97 | @@ -550,6 +558,7 @@
|
---|
98 | }
|
---|
99 |
|
---|
100 | bool pushback = true;
|
---|
101 | + documents_considered++;
|
---|
102 |
|
---|
103 | // Perform collapsing on key if requested.
|
---|
104 | if (collapse_key != Xapian::valueno(-1)) {
|
---|
105 | @@ -567,6 +576,7 @@
|
---|
106 | collapse_tab.insert(make_pair(new_item.collapse_key,
|
---|
107 | make_pair(new_item, Xapian::weight(0))));
|
---|
108 | } else {
|
---|
109 | + duplicates_found++;
|
---|
110 | const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
|
---|
111 | // FIXME: what about sort_bands == 1 case here?
|
---|
112 | if (mcmp(old_item, new_item)) {
|
---|
113 | @@ -613,7 +623,7 @@
|
---|
114 | }
|
---|
115 | }
|
---|
116 | }
|
---|
117 | -
|
---|
118 | +
|
---|
119 | // OK, actually add the item to the mset.
|
---|
120 | if (pushback) {
|
---|
121 | ++docs_matched;
|
---|
122 | @@ -764,7 +774,7 @@
|
---|
123 | double denom = 0;
|
---|
124 | for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
|
---|
125 | denom += i->second.termweight;
|
---|
126 | -
|
---|
127 | +
|
---|
128 | DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
|
---|
129 | Assert(percent_scale <= denom);
|
---|
130 | denom *= greatest_wt;
|
---|
131 | @@ -802,7 +812,7 @@
|
---|
132 | }
|
---|
133 | percent_scale *= 100.0;
|
---|
134 | }
|
---|
135 | -
|
---|
136 | +
|
---|
137 | if (sort_bands > 1) {
|
---|
138 | sort(items.begin(), items.end(),
|
---|
139 | MSetSortCmp(db, sort_bands, percent_scale,
|
---|
140 | @@ -818,21 +828,54 @@
|
---|
141 | Assert(percent_cutoff || docs_matched == items.size());
|
---|
142 | matches_lower_bound = matches_upper_bound = matches_estimated
|
---|
143 | = items.size();
|
---|
144 | - } else if (percent_cutoff) {
|
---|
145 | - // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
|
---|
146 | - Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
|
---|
147 | - matches_estimated);
|
---|
148 | - matches_estimated = max((size_t)new_est, items.size());
|
---|
149 | - // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
|
---|
150 | -
|
---|
151 | - // Very likely an underestimate, but we can't really do better without
|
---|
152 | - // checking further matches... Only possibility would be to track how
|
---|
153 | - // many docs made the min weight test but didn't make the candidate set
|
---|
154 | - // since the last greatest_wt change, which we could use if the top
|
---|
155 | - // documents matched all the prob terms.
|
---|
156 | - matches_lower_bound = items.size();
|
---|
157 | - // matches_upper_bound could be reduced by the number of documents
|
---|
158 | - // which fail the min weight test
|
---|
159 | + } else {
|
---|
160 | + if (percent_cutoff) {
|
---|
161 | + // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
|
---|
162 | + Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
|
---|
163 | + matches_estimated);
|
---|
164 | + matches_estimated = max((size_t)new_est, items.size());
|
---|
165 | + // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
|
---|
166 | +
|
---|
167 | + // Very likely an underestimate, but we can't really do better without
|
---|
168 | + // checking further matches... Only possibility would be to track how
|
---|
169 | + // many docs made the min weight test but didn't make the candidate set
|
---|
170 | + // since the last greatest_wt change, which we could use if the top
|
---|
171 | + // documents matched all the prob terms.
|
---|
172 | + matches_lower_bound = items.size();
|
---|
173 | + // matches_upper_bound could be reduced by the number of documents
|
---|
174 | + // which fail the min weight test
|
---|
175 | + }
|
---|
176 | +
|
---|
177 | + if (collapse_key != Xapian::valueno(-1)) {
|
---|
178 | + // Lower bound must be set to no more than the number of collapse
|
---|
179 | + // values we've got, since it's possible that all further hits
|
---|
180 | + // will be collapsed to a single value.
|
---|
181 | + matches_lower_bound = collapse_tab.size();
|
---|
182 | +
|
---|
183 | + // The estimate for the number of hits can be modified by
|
---|
184 | + // multiplying it by the rate at which we've been finding
|
---|
185 | + // duplicates.
|
---|
186 | + if (documents_considered > 0) {
|
---|
187 | + double collapse_rate =
|
---|
188 | + double(duplicates_found) / double(documents_considered);
|
---|
189 | + matches_estimated -=
|
---|
190 | + (Xapian::doccount)(double(matches_estimated) *
|
---|
191 | + collapse_rate);
|
---|
192 | + }
|
---|
193 | +
|
---|
194 | + // We can safely reduce the upper bound by the number of
|
---|
195 | + // duplicates we've seen.
|
---|
196 | + matches_upper_bound -= duplicates_found;
|
---|
197 | +
|
---|
198 | + matches_estimated = max(matches_estimated, matches_lower_bound);
|
---|
199 | + matches_estimated = min(matches_estimated, matches_upper_bound);
|
---|
200 | + } else if (!percent_cutoff) {
|
---|
201 | + Assert(docs_matched <= matches_upper_bound);
|
---|
202 | + if (docs_matched > matches_lower_bound)
|
---|
203 | + matches_lower_bound = docs_matched;
|
---|
204 | + if (docs_matched > matches_estimated)
|
---|
205 | + matches_estimated = docs_matched;
|
---|
206 | + }
|
---|
207 | }
|
---|
208 |
|
---|
209 | DEBUGLINE(MATCH, items.size() << " items in potential mset");
|
---|
210 | @@ -856,7 +899,9 @@
|
---|
211 | "docs_matched = " << docs_matched << ", " <<
|
---|
212 | "matches_lower_bound = " << matches_lower_bound << ", " <<
|
---|
213 | "matches_estimated = " << matches_estimated << ", " <<
|
---|
214 | - "matches_upper_bound = " << matches_upper_bound);
|
---|
215 | + "matches_upper_bound = " << matches_upper_bound << ", " <<
|
---|
216 | + "duplicates_found = " << duplicates_found << ", " <<
|
---|
217 | + "documents_considered = " << documents_considered);
|
---|
218 |
|
---|
219 | if (sort_bands <= 1 && !items.empty()) {
|
---|
220 | DEBUGLINE(MATCH, "sorting");
|
---|
221 | @@ -870,14 +915,6 @@
|
---|
222 |
|
---|
223 | Assert(matches_estimated >= matches_lower_bound);
|
---|
224 | Assert(matches_estimated <= matches_upper_bound);
|
---|
225 | -
|
---|
226 | - if (!percent_cutoff) {
|
---|
227 | - Assert(docs_matched <= matches_upper_bound);
|
---|
228 | - if (docs_matched > matches_lower_bound)
|
---|
229 | - matches_lower_bound = docs_matched;
|
---|
230 | - if (docs_matched > matches_estimated)
|
---|
231 | - matches_estimated = docs_matched;
|
---|
232 | - }
|
---|
233 |
|
---|
234 | // We may need to qualify any collapse_count to see if the highest weight
|
---|
235 | // collapsed item would have qualified percent_cutoff
|
---|
236 | Index: tests/api_db.cc
|
---|
237 | ===================================================================
|
---|
238 | RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
|
---|
239 | retrieving revision 1.169
|
---|
240 | diff -u -r1.169 api_db.cc
|
---|
241 | --- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169
|
---|
242 | +++ tests/api_db.cc 26 May 2004 15:43:46 -0000
|
---|
243 | @@ -1173,6 +1173,88 @@
|
---|
244 | return true;
|
---|
245 | }
|
---|
246 |
|
---|
247 | +// tests that collapse-on-key modifies the predicted bounds for the number of
|
---|
248 | +// matches appropriately.
|
---|
249 | +static bool test_collapsekey3()
|
---|
250 | +{
|
---|
251 | + Xapian::Enquire enquire(get_simple_database());
|
---|
252 | + init_simple_enquire(enquire);
|
---|
253 | +
|
---|
254 | + Xapian::MSet mymset1 = enquire.get_mset(0, 3);
|
---|
255 | +
|
---|
256 | + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
|
---|
257 | + enquire.set_collapse_key(value_no);
|
---|
258 | + Xapian::MSet mymset = enquire.get_mset(0, 3);
|
---|
259 | +
|
---|
260 | + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
|
---|
261 | + "Lower bound was not lower when performing collapse: don't know whether it worked.");
|
---|
262 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
|
---|
263 | + "Upper bound was not lower when performing collapse: don't know whether it worked.");
|
---|
264 | +
|
---|
265 | + map<string, Xapian::docid> values;
|
---|
266 | + Xapian::MSetIterator i = mymset.begin();
|
---|
267 | + for ( ; i != mymset.end(); ++i) {
|
---|
268 | + string value = i.get_document().get_value(value_no);
|
---|
269 | + TEST(values[value] == 0 || value == "");
|
---|
270 | + values[value] = *i;
|
---|
271 | + }
|
---|
272 | + }
|
---|
273 | +
|
---|
274 | + // Test that, if no duplicates are found (eg, by collapsing on key 1000,
|
---|
275 | + // which has no entries), the upper bound stays the same, but the lower
|
---|
276 | + // bound drops.
|
---|
277 | + {
|
---|
278 | + Xapian::valueno value_no = 1000;
|
---|
279 | + enquire.set_collapse_key(value_no);
|
---|
280 | + Xapian::MSet mymset = enquire.get_mset(0, 3);
|
---|
281 | +
|
---|
282 | + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
|
---|
283 | + "Lower bound was not lower when performing collapse: don't know whether it worked.");
|
---|
284 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
|
---|
285 | + "Upper bound was not equal when collapse turned on, but no duplicates found.");
|
---|
286 | +
|
---|
287 | + map<string, Xapian::docid> values;
|
---|
288 | + Xapian::MSetIterator i = mymset.begin();
|
---|
289 | + for ( ; i != mymset.end(); ++i) {
|
---|
290 | + string value = i.get_document().get_value(value_no);
|
---|
291 | + TEST(values[value] == 0 || value == "");
|
---|
292 | + values[value] = *i;
|
---|
293 | + }
|
---|
294 | + }
|
---|
295 | +
|
---|
296 | + return true;
|
---|
297 | +}
|
---|
298 | +
|
---|
299 | +// tests that collapse-on-key modifies the predicted bounds for the number of
|
---|
300 | +// matches appropriately even when no results are requested.
|
---|
301 | +static bool test_collapsekey4()
|
---|
302 | +{
|
---|
303 | + Xapian::Enquire enquire(get_simple_database());
|
---|
304 | + init_simple_enquire(enquire);
|
---|
305 | +
|
---|
306 | + Xapian::MSet mymset1 = enquire.get_mset(0, 0);
|
---|
307 | +
|
---|
308 | + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
|
---|
309 | + enquire.set_collapse_key(value_no);
|
---|
310 | + Xapian::MSet mymset = enquire.get_mset(0, 0);
|
---|
311 | +
|
---|
312 | + TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
|
---|
313 | + "Lower bound was not 1 when performing collapse but not asking for any results.");
|
---|
314 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
|
---|
315 | + "Upper bound was changed when performing collapse but not asking for any results.");
|
---|
316 | +
|
---|
317 | + map<string, Xapian::docid> values;
|
---|
318 | + Xapian::MSetIterator i = mymset.begin();
|
---|
319 | + for ( ; i != mymset.end(); ++i) {
|
---|
320 | + string value = i.get_document().get_value(value_no);
|
---|
321 | + TEST(values[value] == 0 || value == "");
|
---|
322 | + values[value] = *i;
|
---|
323 | + }
|
---|
324 | + }
|
---|
325 | +
|
---|
326 | + return true;
|
---|
327 | +}
|
---|
328 | +
|
---|
329 | // tests a reversed boolean query
|
---|
330 | static bool test_reversebool1()
|
---|
331 | {
|
---|
332 | @@ -3299,6 +3381,8 @@
|
---|
333 | /// The tests which require a database which supports values > 0 sensibly
|
---|
334 | test_desc multivalue_tests[] = {
|
---|
335 | {"collapsekey1", test_collapsekey1},
|
---|
336 | + {"collapsekey3", test_collapsekey3},
|
---|
337 | + {"collapsekey4", test_collapsekey4},
|
---|
338 | {0, 0}
|
---|
339 | };
|
---|
340 |
|
---|