| 1 | Index: ChangeLog
|
|---|
| 2 | ===================================================================
|
|---|
| 3 | RCS file: /home/cvs/xapian/xapian-core/ChangeLog,v
|
|---|
| 4 | retrieving revision 1.2316
|
|---|
| 5 | diff -u -r1.2316 ChangeLog
|
|---|
| 6 | --- ChangeLog 22 May 2004 23:57:31 -0000 1.2316
|
|---|
| 7 | +++ ChangeLog 26 May 2004 15:07:53 -0000
|
|---|
| 8 | @@ -1,3 +1,16 @@
|
|---|
| 9 | +Wed May 26 01:10:07 BST 2004 Richard Boulton <richard@tartarus.org>
|
|---|
| 10 | +
|
|---|
| 11 | + * matcher/multimatch.cc: When collapsing, keep track of the number
|
|---|
| 12 | + of collapses performed, and use this information to modify the
|
|---|
| 13 | + bounds and estimate of the number of matches.
|
|---|
| 14 | + * include/xapian/enquire.h: Update documentation comments for
|
|---|
| 15 | + MSet::get_matches_*() functions to make clear that collapsing and
|
|---|
| 16 | + cutoffs are taken into account. (Previously, the most likely
|
|---|
| 17 | + interpretation of the comments was that they wouldn't be taken
|
|---|
| 18 | + into account, but the implementation was that percentage cutoffs
|
|---|
| 19 | + were taken into account.) Due to this ambiguity, I think it is
|
|---|
| 20 | + reasonable to say this isn't an API change.
|
|---|
| 21 | +
|
|---|
| 22 | Sun May 23 00:56:41 BST 2004 Olly Betts <olly@survex.com>
|
|---|
| 23 |
|
|---|
| 24 | * backends/quartz/btree.cc,backends/quartz/btree.h: Merge split_off()
|
|---|
| 25 | Index: include/xapian/enquire.h
|
|---|
| 26 | ===================================================================
|
|---|
| 27 | RCS file: /home/cvs/xapian/xapian-core/include/xapian/enquire.h,v
|
|---|
| 28 | retrieving revision 1.18
|
|---|
| 29 | diff -u -r1.18 enquire.h
|
|---|
| 30 | --- include/xapian/enquire.h 8 Apr 2004 14:33:04 -0000 1.18
|
|---|
| 31 | +++ include/xapian/enquire.h 26 May 2004 15:07:58 -0000
|
|---|
| 32 | @@ -130,7 +130,10 @@
|
|---|
| 33 | Xapian::doccount get_firstitem() const;
|
|---|
| 34 |
|
|---|
| 35 | /** A lower bound on the number of documents in the database which
|
|---|
| 36 | - * have a weight greater than zero.
|
|---|
| 37 | + * match the query.
|
|---|
| 38 | + *
|
|---|
| 39 | + * This figure takes into account collapsing of duplicates,
|
|---|
| 40 | + * and weighting cutoff values.
|
|---|
| 41 | *
|
|---|
| 42 | * This number is usually considerably less than the actual number
|
|---|
| 43 | * of documents which match the query.
|
|---|
| 44 | @@ -138,7 +141,10 @@
|
|---|
| 45 | Xapian::doccount get_matches_lower_bound() const;
|
|---|
| 46 |
|
|---|
| 47 | /** An estimate for the number of documents in the database which
|
|---|
| 48 | - * have a weight greater than zero.
|
|---|
| 49 | + * match the query.
|
|---|
| 50 | + *
|
|---|
| 51 | + * This figure takes into account collapsing of duplicates,
|
|---|
| 52 | + * and weighting cutoff values.
|
|---|
| 53 | *
|
|---|
| 54 | * This value is returned because there is sometimes a request to
|
|---|
| 55 | * display such information. However, our experience is that
|
|---|
| 56 | @@ -148,8 +154,11 @@
|
|---|
| 57 | */
|
|---|
| 58 | Xapian::doccount get_matches_estimated() const;
|
|---|
| 59 |
|
|---|
| 60 | - /** An upper bound on the number of documents in the database with
|
|---|
| 61 | - * a weight greater than zero.
|
|---|
| 62 | + /** An upper bound on the number of documents in the database which
|
|---|
| 63 | + * match the query.
|
|---|
| 64 | + *
|
|---|
| 65 | + * This figure takes into account collapsing of duplicates,
|
|---|
| 66 | + * and weighting cutoff values.
|
|---|
| 67 | *
|
|---|
| 68 | * This number is usually considerably greater than the actual
|
|---|
| 69 | * number of documents which match the query.
|
|---|
| 70 | Index: matcher/multimatch.cc
|
|---|
| 71 | ===================================================================
|
|---|
| 72 | RCS file: /home/cvs/xapian/xapian-core/matcher/multimatch.cc,v
|
|---|
| 73 | retrieving revision 1.162
|
|---|
| 74 | diff -u -r1.162 multimatch.cc
|
|---|
| 75 | --- matcher/multimatch.cc 11 Aug 2003 11:11:00 -0000 1.162
|
|---|
| 76 | +++ matcher/multimatch.cc 26 May 2004 15:08:05 -0000
|
|---|
| 77 | @@ -431,11 +431,19 @@
|
|---|
| 78 | Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
|
|---|
| 79 | Xapian::doccount matches_lower_bound = pl->get_termfreq_min();
|
|---|
| 80 | Xapian::doccount matches_estimated = pl->get_termfreq_est();
|
|---|
| 81 | + Xapian::doccount duplicates_found = 0;
|
|---|
| 82 | + Xapian::doccount documents_considered = 0;
|
|---|
| 83 |
|
|---|
| 84 | // Check if any results have been asked for (might just be wanting
|
|---|
| 85 | // maxweight)
|
|---|
| 86 | if (maxitems == 0) {
|
|---|
| 87 | delete pl;
|
|---|
| 88 | + if (collapse_key != Xapian::valueno(-1)) {
|
|---|
| 89 | + // Lower bound must be set to no more than 1, since it's possible that
|
|---|
| 90 | + // all hits will be collapsed to a single hit.
|
|---|
| 91 | + if (matches_lower_bound > 1) matches_lower_bound = 1;
|
|---|
| 92 | + }
|
|---|
| 93 | +
|
|---|
| 94 | mset = Xapian::MSet(new Xapian::MSet::Internal(
|
|---|
| 95 | first,
|
|---|
| 96 | matches_upper_bound,
|
|---|
| 97 | @@ -550,6 +558,7 @@
|
|---|
| 98 | }
|
|---|
| 99 |
|
|---|
| 100 | bool pushback = true;
|
|---|
| 101 | + documents_considered++;
|
|---|
| 102 |
|
|---|
| 103 | // Perform collapsing on key if requested.
|
|---|
| 104 | if (collapse_key != Xapian::valueno(-1)) {
|
|---|
| 105 | @@ -567,6 +576,7 @@
|
|---|
| 106 | collapse_tab.insert(make_pair(new_item.collapse_key,
|
|---|
| 107 | make_pair(new_item, Xapian::weight(0))));
|
|---|
| 108 | } else {
|
|---|
| 109 | + duplicates_found++;
|
|---|
| 110 | const Xapian::Internal::MSetItem &old_item = oldkey->second.first;
|
|---|
| 111 | // FIXME: what about sort_bands == 1 case here?
|
|---|
| 112 | if (mcmp(old_item, new_item)) {
|
|---|
| 113 | @@ -613,7 +623,7 @@
|
|---|
| 114 | }
|
|---|
| 115 | }
|
|---|
| 116 | }
|
|---|
| 117 | -
|
|---|
| 118 | +
|
|---|
| 119 | // OK, actually add the item to the mset.
|
|---|
| 120 | if (pushback) {
|
|---|
| 121 | ++docs_matched;
|
|---|
| 122 | @@ -764,7 +774,7 @@
|
|---|
| 123 | double denom = 0;
|
|---|
| 124 | for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
|
|---|
| 125 | denom += i->second.termweight;
|
|---|
| 126 | -
|
|---|
| 127 | +
|
|---|
| 128 | DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
|
|---|
| 129 | Assert(percent_scale <= denom);
|
|---|
| 130 | denom *= greatest_wt;
|
|---|
| 131 | @@ -802,7 +812,7 @@
|
|---|
| 132 | }
|
|---|
| 133 | percent_scale *= 100.0;
|
|---|
| 134 | }
|
|---|
| 135 | -
|
|---|
| 136 | +
|
|---|
| 137 | if (sort_bands > 1) {
|
|---|
| 138 | sort(items.begin(), items.end(),
|
|---|
| 139 | MSetSortCmp(db, sort_bands, percent_scale,
|
|---|
| 140 | @@ -818,21 +828,49 @@
|
|---|
| 141 | Assert(percent_cutoff || docs_matched == items.size());
|
|---|
| 142 | matches_lower_bound = matches_upper_bound = matches_estimated
|
|---|
| 143 | = items.size();
|
|---|
| 144 | - } else if (percent_cutoff) {
|
|---|
| 145 | - // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
|
|---|
| 146 | - Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
|
|---|
| 147 | - matches_estimated);
|
|---|
| 148 | - matches_estimated = max((size_t)new_est, items.size());
|
|---|
| 149 | - // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
|
|---|
| 150 | -
|
|---|
| 151 | - // Very likely an underestimate, but we can't really do better without
|
|---|
| 152 | - // checking further matches... Only possibility would be to track how
|
|---|
| 153 | - // many docs made the min weight test but didn't make the candidate set
|
|---|
| 154 | - // since the last greatest_wt change, which we could use if the top
|
|---|
| 155 | - // documents matched all the prob terms.
|
|---|
| 156 | - matches_lower_bound = items.size();
|
|---|
| 157 | - // matches_upper_bound could be reduced by the number of documents
|
|---|
| 158 | - // which fail the min weight test
|
|---|
| 159 | + } else {
|
|---|
| 160 | + if (percent_cutoff) {
|
|---|
| 161 | + // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
|
|---|
| 162 | + Xapian::doccount new_est = (Xapian::doccount)((1 - percent_factor) *
|
|---|
| 163 | + matches_estimated);
|
|---|
| 164 | + matches_estimated = max((size_t)new_est, items.size());
|
|---|
| 165 | + // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
|
|---|
| 166 | +
|
|---|
| 167 | + // Very likely an underestimate, but we can't really do better without
|
|---|
| 168 | + // checking further matches... Only possibility would be to track how
|
|---|
| 169 | + // many docs made the min weight test but didn't make the candidate set
|
|---|
| 170 | + // since the last greatest_wt change, which we could use if the top
|
|---|
| 171 | + // documents matched all the prob terms.
|
|---|
| 172 | + matches_lower_bound = items.size();
|
|---|
| 173 | + // matches_upper_bound could be reduced by the number of documents
|
|---|
| 174 | + // which fail the min weight test
|
|---|
| 175 | + }
|
|---|
| 176 | +
|
|---|
| 177 | + if (collapse_key != Xapian::valueno(-1)) {
|
|---|
| 178 | + // Lower bound must be set to no more than the number of hits
|
|---|
| 179 | + // we've got, since it's possible that all further hits will be
|
|---|
| 180 | + // collapsed to a single value, and that all the futher
|
|---|
| 181 | + // that all hits will be collapsed to a single hit.
|
|---|
| 182 | + matches_lower_bound = items.size();
|
|---|
| 183 | +
|
|---|
| 184 | + // The estimate for the number of hits can be modified by
|
|---|
| 185 | + // multiplying it by the rate at which we've been finding
|
|---|
| 186 | + // duplicates.
|
|---|
| 187 | + if (documents_considered > 0) {
|
|---|
| 188 | + double collapse_rate =
|
|---|
| 189 | + double(duplicates_found) / double(documents_considered);
|
|---|
| 190 | + matches_estimated -=
|
|---|
| 191 | + (Xapian::doccount)(double(matches_estimated) *
|
|---|
| 192 | + collapse_rate);
|
|---|
| 193 | + }
|
|---|
| 194 | +
|
|---|
| 195 | + // We can safely reduce the upper bound by the number of
|
|---|
| 196 | + // duplicates we've seen.
|
|---|
| 197 | + matches_upper_bound -= duplicates_found;
|
|---|
| 198 | +
|
|---|
| 199 | + matches_estimated = max(matches_estimated, matches_lower_bound);
|
|---|
| 200 | + matches_estimated = min(matches_estimated, matches_upper_bound);
|
|---|
| 201 | + }
|
|---|
| 202 | }
|
|---|
| 203 |
|
|---|
| 204 | DEBUGLINE(MATCH, items.size() << " items in potential mset");
|
|---|
| 205 | @@ -856,7 +894,9 @@
|
|---|
| 206 | "docs_matched = " << docs_matched << ", " <<
|
|---|
| 207 | "matches_lower_bound = " << matches_lower_bound << ", " <<
|
|---|
| 208 | "matches_estimated = " << matches_estimated << ", " <<
|
|---|
| 209 | - "matches_upper_bound = " << matches_upper_bound);
|
|---|
| 210 | + "matches_upper_bound = " << matches_upper_bound << ", " <<
|
|---|
| 211 | + "duplicates_found = " << duplicates_found << ", " <<
|
|---|
| 212 | + "documents_considered = " << documents_considered);
|
|---|
| 213 |
|
|---|
| 214 | if (sort_bands <= 1 && !items.empty()) {
|
|---|
| 215 | DEBUGLINE(MATCH, "sorting");
|
|---|
| 216 | Index: tests/api_db.cc
|
|---|
| 217 | ===================================================================
|
|---|
| 218 | RCS file: /home/cvs/xapian/xapian-core/tests/api_db.cc,v
|
|---|
| 219 | retrieving revision 1.169
|
|---|
| 220 | diff -u -r1.169 api_db.cc
|
|---|
| 221 | --- tests/api_db.cc 27 Apr 2004 13:37:20 -0000 1.169
|
|---|
| 222 | +++ tests/api_db.cc 26 May 2004 15:08:10 -0000
|
|---|
| 223 | @@ -1173,6 +1173,88 @@
|
|---|
| 224 | return true;
|
|---|
| 225 | }
|
|---|
| 226 |
|
|---|
| 227 | +// tests that collapse-on-key modifies the predicted bounds for the number of
|
|---|
| 228 | +// matches appropriately.
|
|---|
| 229 | +static bool test_collapsekey3()
|
|---|
| 230 | +{
|
|---|
| 231 | + Xapian::Enquire enquire(get_simple_database());
|
|---|
| 232 | + init_simple_enquire(enquire);
|
|---|
| 233 | +
|
|---|
| 234 | + Xapian::MSet mymset1 = enquire.get_mset(0, 3);
|
|---|
| 235 | +
|
|---|
| 236 | + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
|
|---|
| 237 | + enquire.set_collapse_key(value_no);
|
|---|
| 238 | + Xapian::MSet mymset = enquire.get_mset(0, 3);
|
|---|
| 239 | +
|
|---|
| 240 | + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
|
|---|
| 241 | + "Lower bound was not lower when performing collapse: don't know whether it worked.");
|
|---|
| 242 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() > mymset.get_matches_upper_bound(),
|
|---|
| 243 | + "Upper bound was not lower when performing collapse: don't know whether it worked.");
|
|---|
| 244 | +
|
|---|
| 245 | + map<string, Xapian::docid> values;
|
|---|
| 246 | + Xapian::MSetIterator i = mymset.begin();
|
|---|
| 247 | + for ( ; i != mymset.end(); ++i) {
|
|---|
| 248 | + string value = i.get_document().get_value(value_no);
|
|---|
| 249 | + TEST(values[value] == 0 || value == "");
|
|---|
| 250 | + values[value] = *i;
|
|---|
| 251 | + }
|
|---|
| 252 | + }
|
|---|
| 253 | +
|
|---|
| 254 | + // Test that, if no duplicates are found (eg, by collapsing on key 1000,
|
|---|
| 255 | + // which has no entries), the upper bound stays the same, but the lower
|
|---|
| 256 | + // bound drops.
|
|---|
| 257 | + {
|
|---|
| 258 | + Xapian::valueno value_no = 1000;
|
|---|
| 259 | + enquire.set_collapse_key(value_no);
|
|---|
| 260 | + Xapian::MSet mymset = enquire.get_mset(0, 3);
|
|---|
| 261 | +
|
|---|
| 262 | + TEST_AND_EXPLAIN(mymset1.get_matches_lower_bound() > mymset.get_matches_lower_bound(),
|
|---|
| 263 | + "Lower bound was not lower when performing collapse: don't know whether it worked.");
|
|---|
| 264 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
|
|---|
| 265 | + "Upper bound was not equal when collapse turned on, but no duplicates found.");
|
|---|
| 266 | +
|
|---|
| 267 | + map<string, Xapian::docid> values;
|
|---|
| 268 | + Xapian::MSetIterator i = mymset.begin();
|
|---|
| 269 | + for ( ; i != mymset.end(); ++i) {
|
|---|
| 270 | + string value = i.get_document().get_value(value_no);
|
|---|
| 271 | + TEST(values[value] == 0 || value == "");
|
|---|
| 272 | + values[value] = *i;
|
|---|
| 273 | + }
|
|---|
| 274 | + }
|
|---|
| 275 | +
|
|---|
| 276 | + return true;
|
|---|
| 277 | +}
|
|---|
| 278 | +
|
|---|
| 279 | +// tests that collapse-on-key modifies the predicted bounds for the number of
|
|---|
| 280 | +// matches appropriately even when no results are requested.
|
|---|
| 281 | +static bool test_collapsekey4()
|
|---|
| 282 | +{
|
|---|
| 283 | + Xapian::Enquire enquire(get_simple_database());
|
|---|
| 284 | + init_simple_enquire(enquire);
|
|---|
| 285 | +
|
|---|
| 286 | + Xapian::MSet mymset1 = enquire.get_mset(0, 0);
|
|---|
| 287 | +
|
|---|
| 288 | + for (Xapian::valueno value_no = 1; value_no < 7; ++value_no) {
|
|---|
| 289 | + enquire.set_collapse_key(value_no);
|
|---|
| 290 | + Xapian::MSet mymset = enquire.get_mset(0, 0);
|
|---|
| 291 | +
|
|---|
| 292 | + TEST_AND_EXPLAIN(mymset.get_matches_lower_bound() == 1,
|
|---|
| 293 | + "Lower bound was not 1 when performing collapse but not asking for any results.");
|
|---|
| 294 | + TEST_AND_EXPLAIN(mymset1.get_matches_upper_bound() == mymset.get_matches_upper_bound(),
|
|---|
| 295 | + "Upper bound was changed when performing collapse but not asking for any results.");
|
|---|
| 296 | +
|
|---|
| 297 | + map<string, Xapian::docid> values;
|
|---|
| 298 | + Xapian::MSetIterator i = mymset.begin();
|
|---|
| 299 | + for ( ; i != mymset.end(); ++i) {
|
|---|
| 300 | + string value = i.get_document().get_value(value_no);
|
|---|
| 301 | + TEST(values[value] == 0 || value == "");
|
|---|
| 302 | + values[value] = *i;
|
|---|
| 303 | + }
|
|---|
| 304 | + }
|
|---|
| 305 | +
|
|---|
| 306 | + return true;
|
|---|
| 307 | +}
|
|---|
| 308 | +
|
|---|
| 309 | // tests a reversed boolean query
|
|---|
| 310 | static bool test_reversebool1()
|
|---|
| 311 | {
|
|---|
| 312 | @@ -3299,6 +3381,8 @@
|
|---|
| 313 | /// The tests which require a database which supports values > 0 sensibly
|
|---|
| 314 | test_desc multivalue_tests[] = {
|
|---|
| 315 | {"collapsekey1", test_collapsekey1},
|
|---|
| 316 | + {"collapsekey3", test_collapsekey3},
|
|---|
| 317 | + {"collapsekey4", test_collapsekey4},
|
|---|
| 318 | {0, 0}
|
|---|
| 319 | };
|
|---|
| 320 |
|
|---|