Index: ChangeLog
===================================================================
RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
retrieving revision 1.2316
diff -u -r1.2316 ChangeLog
--- ChangeLog	22 May 2004 23:57:31 -0000	1.2316
+++ ChangeLog	26 May 2004 15:43:31 -0000
@@ -1,3 +1,16 @@
+Wed May 26 01:10:07 BST 2004  Richard Boulton <richard@tartarus.org>
+
+        * matcher/multimatch.cc: When collapsing, keep track of the number
+          of collapses performed, and use this information to modify the
+          bounds and estimate of the number of matches.
+        * include/xapian/enquire.h: Update documentation comments for
+          MSet::get_matches_*() functions to make clear that collapsing and
+          cutoffs are taken into account.  (Previously, the most likely
+          interpretation of the comments was that they wouldn't be taken
+          into account, but the implementation was that percentage cutoffs
+          were taken into account.)  Due to this ambiguity, I think it is
+          reasonable to say this isn't an API change.
+
 Sun May 23 00:56:41 BST 2004  Olly Betts <olly@survex.com>
 
 	* backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
Index: include/xapian/enquire.h
===================================================================
RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
retrieving revision 1.18
diff -u -r1.18 enquire.h
--- include/xapian/enquire.h	8 Apr 2004 14:33:04 -0000	1.18
+++ include/xapian/enquire.h	26 May 2004 15:43:35 -0000
@@ -130,7 +130,10 @@
 	Xapian::doccount get_firstitem() const;
 
 	/** A lower bound on the number of documents in the database which
-	 *  have a weight greater than zero.
+	 *  match the query.
+	 *
+	 *  This figure takes into account collapsing of duplicates,
+	 *  and weighting cutoff values.
 	 *
 	 *  This number is usually considerably less than the actual number
 	 *  of documents which match the query.
@@ -138,7 +141,10 @@
 	Xapian::doccount get_matches_lower_bound() const;
 
 	/** An estimate for the number of documents in the database which
-	 *  have a weight greater than zero.
+	 *  match the query.
+	 *
+	 *  This figure takes into account collapsing of duplicates,
+	 *  and weighting cutoff values.
 	 *
 	 *  This value is returned because there is sometimes a request to
 	 *  display such information.  However, our experience is that
@@ -148,8 +154,11 @@
 	 */
 	Xapian::doccount get_matches_estimated() const;
 
-	/** An upper bound on the number of documents in the database with
-	 *  a weight greater than zero.
+	/** An upper bound on the number of documents in the database which
+	 *  match the query.
+	 *
+	 *  This figure takes into account collapsing of duplicates,
+	 *  and weighting cutoff values.
 	 *
 	 *  This number is usually considerably greater than the actual
 	 *  number of documents which match the query.
Index: matcher/multimatch.cc
===================================================================
RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
retrieving revision 1.162
diff -u -r1.162 multimatch.cc
--- matcher/multimatch.cc	11 Aug 2003 11:11:00 -0000	1.162
+++ matcher/multimatch.cc	26 May 2004 15:43:39 -0000
@@ -431,11 +431,19 @@
     Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
     Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
     Xapian::doccount matches_estimated   = pl->get_termfreq_est();
+    Xapian::doccount duplicates_found = 0;
+    Xapian::doccount documents_considered = 0;
 
     // Check if any results have been asked for (might just be wanting
     // maxweight)
     if (maxitems == 0) {
 	delete pl;
+	if (collapse_key != Xapian::valueno(-1)) {
+	    // Lower bound must be set to no more than 1, since it's possible that
+	    // all hits will be collapsed to a single hit.
+	    if (matches_lower_bound > 1) matches_lower_bound = 1;
+	}
+
 	mset = Xapian::MSet(new Xapian::MSet::Internal(
 					   first,
 					   matches_upper_bound,
@@ -550,6 +558,7 @@
 	}
 
 	bool pushback = true;
+	documents_considered++;
 
 	// Perform collapsing on key if requested.
 	if (collapse_key != Xapian::valueno(-1)) {
@@ -567,6 +576,7 @@
 		    collapse_tab.insert(make_pair(new_item.collapse_key,
 					make_pair(new_item, Xapian::weight(0))));
 		} else {
+		    duplicates_found++;
 		    const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
 		    // FIXME: what about sort_bands == 1 case here?
 		    if (mcmp(old_item, new_item)) {
@@ -613,7 +623,7 @@
 		}
 	    }
 	}
-	    
+
 	// OK, actually add the item to the mset.
 	if (pushback) {
 	    ++docs_matched;
@@ -764,7 +774,7 @@
 	    double denom = 0;
 	    for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
 		denom += i->second.termweight;
-	   
+
 	    DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
 	    Assert(percent_scale <= denom);
 	    denom *= greatest_wt;
@@ -802,7 +812,7 @@
 	}
 	percent_scale *= 100.0;
     }
-    
+
     if (sort_bands > 1) {
 	sort(items.begin(), items.end(),
 	     MSetSortCmp(db, sort_bands, percent_scale,
@@ -818,21 +828,54 @@
 	Assert(percent_cutoff || docs_matched == items.size());
 	matches_lower_bound = matches_upper_bound = matches_estimated
 	    = items.size();
-    } else if (percent_cutoff) {
-	// another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
-	Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
-					    matches_estimated);
-	matches_estimated = max((size_t)new_est, items.size());
-	// and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
-	
-	// Very likely an underestimate, but we can't really do better without
-	// checking further matches...  Only possibility would be to track how
-	// many docs made the min weight test but didn't make the candidate set
-	// since the last greatest_wt change, which we could use if the top
-	// documents matched all the prob terms.
-	matches_lower_bound = items.size();
-	// matches_upper_bound could be reduced by the number of documents
-	// which fail the min weight test
+    } else {
+	if (percent_cutoff) {
+	    // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
+	    Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
+							  matches_estimated);
+	    matches_estimated = max((size_t)new_est, items.size());
+	    // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
+
+	    // Very likely an underestimate, but we can't really do better without
+	    // checking further matches...  Only possibility would be to track how
+	    // many docs made the min weight test but didn't make the candidate set
+	    // since the last greatest_wt change, which we could use if the top
+	    // documents matched all the prob terms.
+	    matches_lower_bound = items.size();
+	    // matches_upper_bound could be reduced by the number of documents
+	    // which fail the min weight test
+	}
+
+	if (collapse_key != Xapian::valueno(-1)) {
+	    // Lower bound must be set to no more than the number of collapse
+            // values we've got, since it's possible that all further hits
+            // will be collapsed to a single value.
+	    matches_lower_bound = collapse_tab.size();
+
+	    // The estimate for the number of hits can be modified by
+	    // multiplying it by the rate at which we've been finding
+	    // duplicates.
+	    if (documents_considered > 0) {
+		double collapse_rate =
+			double(duplicates_found) / double(documents_considered);
+		matches_estimated -=
+			(Xapian::doccount)(double(matches_estimated) *
+					   collapse_rate);
+	    }
+
+	    // We can safely reduce the upper bound by the number of
+	    // duplicates we've seen.
+	    matches_upper_bound -= duplicates_found;
+
+	    matches_estimated = max(matches_estimated, matches_lower_bound);
+	    matches_estimated = min(matches_estimated, matches_upper_bound);
+        } else if (!percent_cutoff) {
+            Assert(docs_matched <= matches_upper_bound);
+            if (docs_matched > matches_lower_bound)
+                matches_lower_bound = docs_matched;
+            if (docs_matched > matches_estimated)
+                matches_estimated = docs_matched;
+        }
     }
 
     DEBUGLINE(MATCH, items.size() << " items in potential mset");
@@ -856,7 +899,9 @@
 	      "docs_matched = " << docs_matched << ", " <<
 	      "matches_lower_bound = " << matches_lower_bound << ", " <<
 	      "matches_estimated = " << matches_estimated << ", " <<
-	      "matches_upper_bound = " << matches_upper_bound);
+	      "matches_upper_bound = " << matches_upper_bound << ", " <<
+	      "duplicates_found = " << duplicates_found << ", " <<
+	      "documents_considered = " << documents_considered);
 
     if (sort_bands <= 1 && !items.empty()) {
 	DEBUGLINE(MATCH, "sorting");
@@ -870,14 +915,6 @@
 
     Assert(matches_estimated >= matches_lower_bound);
     Assert(matches_estimated <= matches_upper_bound);
-
-    if (!percent_cutoff) {
-	Assert(docs_matched <= matches_upper_bound);
-	if (docs_matched > matches_lower_bound)
-	    matches_lower_bound = docs_matched;
-	if (docs_matched > matches_estimated)
-	    matches_estimated = docs_matched;
-    }
 
     // We may need to qualify any collapse_count to see if the highest weight
     // collapsed item would have qualified percent_cutoff 
Index: tests/api_db.cc
===================================================================
RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
retrieving revision 1.169
diff -u -r1.169 api_db.cc
--- tests/api_db.cc	27 Apr 2004 13:37:20 -0000	1.169
+++ tests/api_db.cc	26 May 2004 15:43:46 -0000
@@ -1173,6 +1173,88 @@
     return true;
 }
 
+// tests that collapse-on-key modifies the predicted bounds for the number of
+// matches appropriately.
+static bool test_collapsekey3()
+{
+    Xapian::Enquire enquire(get_simple_database());
+    init_simple_enquire(enquire);
+
+    Xapian::MSet mymset1 = enquire.get_mset(0, 3);
+
+    for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
+	enquire.set_collapse_key(value_no);
+	Xapian::MSet mymset = enquire.get_mset(0, 3);
+
+	TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
+			 "Lower bound was not lower when performing collapse: don't know whether it worked.");
+	TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
+			 "Upper bound was not lower when performing collapse: don't know whether it worked.");
+
+	map<string, Xapian::docid> values;
+	Xapian::MSetIterator i = mymset.begin();
+	for ( ; i != mymset.end(); ++i) {
+	    string value = i.get_document().get_value(value_no);
+	    TEST(values[value] == 0 || value == "");
+	    values[value] = *i;
+	}
+    }
+
+    // Test that, if no duplicates are found (eg, by collapsing on key 1000,
+    // which has no entries), the upper bound stays the same, but the lower
+    // bound drops.
+    {
+        Xapian::valueno value_no = 1000;
+	enquire.set_collapse_key(value_no);
+	Xapian::MSet mymset = enquire.get_mset(0, 3);
+
+	TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
+			 "Lower bound was not lower when performing collapse: don't know whether it worked.");
+	TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
+			 "Upper bound was not equal when collapse turned on, but no duplicates found.");
+
+	map<string, Xapian::docid> values;
+	Xapian::MSetIterator i = mymset.begin();
+	for ( ; i != mymset.end(); ++i) {
+	    string value = i.get_document().get_value(value_no);
+	    TEST(values[value] == 0 || value == "");
+	    values[value] = *i;
+	}
+    }
+
+    return true;
+}
+
+// tests that collapse-on-key modifies the predicted bounds for the number of
+// matches appropriately even when no results are requested.
+static bool test_collapsekey4()
+{
+    Xapian::Enquire enquire(get_simple_database());
+    init_simple_enquire(enquire);
+
+    Xapian::MSet mymset1 = enquire.get_mset(0, 0);
+
+    for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
+	enquire.set_collapse_key(value_no);
+	Xapian::MSet mymset = enquire.get_mset(0, 0);
+
+	TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
+			 "Lower bound was not 1 when performing collapse but not asking for any results.");
+	TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
+			 "Upper bound was changed when performing collapse but not asking for any results.");
+
+	map<string, Xapian::docid> values;
+	Xapian::MSetIterator i = mymset.begin();
+	for ( ; i != mymset.end(); ++i) {
+	    string value = i.get_document().get_value(value_no);
+	    TEST(values[value] == 0 || value == "");
+	    values[value] = *i;
+	}
+    }
+
+    return true;
+}
+
 // tests a reversed boolean query
 static bool test_reversebool1()
 {
@@ -3299,6 +3381,8 @@
 /// The tests which require a database which supports values > 0 sensibly
 test_desc multivalue_tests[] = {
     {"collapsekey1",	   test_collapsekey1},
+    {"collapsekey3",	   test_collapsekey3},
+    {"collapsekey4",	   test_collapsekey4},
     {0, 0}
 };
 
