? diffs ? matcher/multimatch.cc.new Index: ChangeLog =================================================================== RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v retrieving revision 1.2316 diff -u -r1.2316 ChangeLog --- ChangeLog 22 May 2004 23:57:31 -0000 1.2316 +++ ChangeLog 26 May 2004 02:02:11 -0000 @@ -1,3 +1,16 @@ +Wed May 26 01:10:07 BST 2004 Richard Boulton + + * matcher/multimatch.cc: When collapsing, keep track of the number + of collapses performed, and use this information to modify the + bounds and estimate of the number of matches. + * include/xapian/enquire.h: Update documentation comments for + MSet::get_matches_*() functions to make clear that collapsing and + cutoffs are taken into account. (Previously, the most likely + interpretation of the comments was that they wouldn't be taken + into account, but the implementation was that percentage cutoffs + were taken into account.) Due to this ambiguity, I think it is + reasonable to say this isn't an API change. + Sun May 23 00:56:41 BST 2004 Olly Betts * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off() Index: include/xapian/enquire.h =================================================================== RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v retrieving revision 1.18 diff -u -r1.18 enquire.h --- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18 +++ include/xapian/enquire.h 26 May 2004 02:02:15 -0000 @@ -130,7 +130,10 @@ Xapian::doccount get_firstitem() const; /** A lower bound on the number of documents in the database which - * have a weight greater than zero. + * match the query. + * + * This figure takes into account collapsing of duplicates, + * and weighting cutoff values. * * This number is usually considerably less than the actual number * of documents which match the query. @@ -138,7 +141,10 @@ Xapian::doccount get_matches_lower_bound() const; /** An estimate for the number of documents in the database which - * have a weight greater than zero. + * match the query. + * + * This figure takes into account collapsing of duplicates, + * and weighting cutoff values. * * This value is returned because there is sometimes a request to * display such information. However, our experience is that @@ -148,8 +154,11 @@ */ Xapian::doccount get_matches_estimated() const; - /** An upper bound on the number of documents in the database with - * a weight greater than zero. + /** An upper bound on the number of documents in the database which + * match the query. + * + * This figure takes into account collapsing of duplicates, + * and weighting cutoff values. * * This number is usually considerably greater than the actual * number of documents which match the query. Index: matcher/multimatch.cc =================================================================== RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v retrieving revision 1.162 diff -u -r1.162 multimatch.cc --- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162 +++ matcher/multimatch.cc 26 May 2004 02:02:19 -0000 @@ -431,11 +431,19 @@ Xapian::doccount matches_upper_bound = pl->get_termfreq_max(); Xapian::doccount matches_lower_bound = pl->get_termfreq_min(); Xapian::doccount matches_estimated = pl->get_termfreq_est(); + Xapian::doccount duplicates_found = 0; + Xapian::doccount documents_considered = 0; // Check if any results have been asked for (might just be wanting // maxweight) if (maxitems == 0) { delete pl; + if (collapse_key != Xapian::valueno(-1)) { + // Lower bound must be set to no more than 1, since it's possible that + // all hits will be collapsed to a single hit. + if (matches_lower_bound > 1) matches_lower_bound = 1; + } + mset = Xapian::MSet(new Xapian::MSet::Internal( first, matches_upper_bound, @@ -550,6 +558,7 @@ } bool pushback = true; + documents_considered++; // Perform collapsing on key if requested. if (collapse_key != Xapian::valueno(-1)) { @@ -567,6 +576,7 @@ collapse_tab.insert(make_pair(new_item.collapse_key, make_pair(new_item, Xapian::weight(0)))); } else { + duplicates_found++; const Xapian::Internal::MSetItem &old_item = oldkey->second.first; // FIXME: what about sort_bands == 1 case here? if (mcmp(old_item, new_item)) { @@ -613,7 +623,7 @@ } } } - + // OK, actually add the item to the mset. if (pushback) { ++docs_matched; @@ -764,7 +774,7 @@ double denom = 0; for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i) denom += i->second.termweight; - + DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale); Assert(percent_scale <= denom); denom *= greatest_wt; @@ -802,7 +812,7 @@ } percent_scale *= 100.0; } - + if (sort_bands > 1) { sort(items.begin(), items.end(), MSetSortCmp(db, sort_bands, percent_scale, @@ -818,21 +828,49 @@ Assert(percent_cutoff || docs_matched == items.size()); matches_lower_bound = matches_upper_bound = matches_estimated = items.size(); - } else if (percent_cutoff) { - // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt); - Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) * - matches_estimated); - matches_estimated = max((size_t)new_est, items.size()); - // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size()); - - // Very likely an underestimate, but we can't really do better without - // checking further matches... Only possibility would be to track how - // many docs made the min weight test but didn't make the candidate set - // since the last greatest_wt change, which we could use if the top - // documents matched all the prob terms. - matches_lower_bound = items.size(); - // matches_upper_bound could be reduced by the number of documents - // which fail the min weight test + } else { + if (percent_cutoff) { + // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt); + Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) * + matches_estimated); + matches_estimated = max((size_t)new_est, items.size()); + // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size()); + + // Very likely an underestimate, but we can't really do better without + // checking further matches... Only possibility would be to track how + // many docs made the min weight test but didn't make the candidate set + // since the last greatest_wt change, which we could use if the top + // documents matched all the prob terms. + matches_lower_bound = items.size(); + // matches_upper_bound could be reduced by the number of documents + // which fail the min weight test + } + + if (collapse_key != Xapian::valueno(-1)) { + // Lower bound must be set to no more than the number of hits + // we've got, since it's possible that all further hits will be + // collapsed to a single value, and that all the futher + // that all hits will be collapsed to a single hit. + matches_lower_bound = items.size(); + + // The estimate for the number of hits can be modified by + // multiplying it by the rate at which we've been finding + // duplicates. + if (documents_considered > 0) { + double collapse_rate = + double(duplicates_found) / double(documents_considered); + matches_estimated = + (Xapian::doccount)(double(matches_estimated) * + collapse_rate); + } + + // We can safely reduce the upper bound by the number of + // duplicates we've seen. + matches_upper_bound -= duplicates_found; + + matches_estimated = max(matches_estimated, matches_lower_bound); + matches_estimated = min(matches_estimated, matches_upper_bound); + } } DEBUGLINE(MATCH, items.size() << " items in potential mset"); @@ -856,7 +894,9 @@ "docs_matched = " << docs_matched << ", " << "matches_lower_bound = " << matches_lower_bound << ", " << "matches_estimated = " << matches_estimated << ", " << - "matches_upper_bound = " << matches_upper_bound); + "matches_upper_bound = " << matches_upper_bound << ", " << + "duplicates_found = " << duplicates_found << ", " << + "documents_considered = " << documents_considered); if (sort_bands <= 1 && !items.empty()) { DEBUGLINE(MATCH, "sorting"); Index: tests/api_db.cc =================================================================== RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v retrieving revision 1.169 diff -u -r1.169 api_db.cc --- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169 +++ tests/api_db.cc 26 May 2004 02:02:26 -0000 @@ -1173,6 +1173,88 @@ return true; } +// tests that collapse-on-key modifies the predicted bounds for the number of +// matches appropriately. +static bool test_collapsekey3() +{ + Xapian::Enquire enquire(get_simple_database()); + init_simple_enquire(enquire); + + Xapian::MSet mymset1 = enquire.get_mset(0, 3); + + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) { + enquire.set_collapse_key(value_no); + Xapian::MSet mymset = enquire.get_mset(0, 3); + + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(), + "Lower bound was not lower when performing collapse: don't know whether it worked."); + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(), + "Upper bound was not lower when performing collapse: don't know whether it worked."); + + map values; + Xapian::MSetIterator i = mymset.begin(); + for ( ; i != mymset.end(); ++i) { + string value = i.get_document().get_value(value_no); + TEST(values[value] == 0 || value == ""); + values[value] = *i; + } + } + + // Test that, if no duplicates are found (eg, by collapsing on key 1000, + // which has no entries), the upper bound stays the same, but the lower + // bound drops. + { + Xapian::valueno value_no = 1000; + enquire.set_collapse_key(value_no); + Xapian::MSet mymset = enquire.get_mset(0, 3); + + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(), + "Lower bound was not lower when performing collapse: don't know whether it worked."); + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(), + "Upper bound was not equal when collapse turned on, but no duplicates found."); + + map values; + Xapian::MSetIterator i = mymset.begin(); + for ( ; i != mymset.end(); ++i) { + string value = i.get_document().get_value(value_no); + TEST(values[value] == 0 || value == ""); + values[value] = *i; + } + } + + return true; +} + +// tests that collapse-on-key modifies the predicted bounds for the number of +// matches appropriately even when no results are requested. +static bool test_collapsekey4() +{ + Xapian::Enquire enquire(get_simple_database()); + init_simple_enquire(enquire); + + Xapian::MSet mymset1 = enquire.get_mset(0, 0); + + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) { + enquire.set_collapse_key(value_no); + Xapian::MSet mymset = enquire.get_mset(0, 0); + + TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1, + "Lower bound was not 1 when performing collapse but not asking for any results."); + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(), + "Upper bound was changed when performing collapse but not asking for any results."); + + map values; + Xapian::MSetIterator i = mymset.begin(); + for ( ; i != mymset.end(); ++i) { + string value = i.get_document().get_value(value_no); + TEST(values[value] == 0 || value == ""); + values[value] = *i; + } + } + + return true; +} + // tests a reversed boolean query static bool test_reversebool1() { @@ -3299,6 +3381,8 @@ /// The tests which require a database which supports values > 0 sensibly test_desc multivalue_tests[] = { {"collapsekey1", test_collapsekey1}, + {"collapsekey3", test_collapsekey3}, + {"collapsekey4", test_collapsekey4}, {0, 0} };