Ticket #86: bug86-fix.patch

File bug86-fix.patch, 8.1 KB (added by Olly Betts, 18 years ago)

Fix for this bug (I hope!)

  • matcher/multimatch.cc

     
    351351    Xapian::doccount max_msize = first + check_at_least;
    352352    items.reserve(max_msize + 1);
    353353
    354     // Set the minimum item, used to compare against to see if an item
    355     // should be considered for the mset.
    356     Xapian::Internal::MSetItem min_item(weight_cutoff, 0);
     354    // Tracks the minimum item currently eligible for the MSet - we compare
     355    // candidate items against this.
     356    Xapian::Internal::MSetItem min_item(0.0, 0);
     357
     358    // Minimum weight an item must have to be worth considering.
     359    Xapian::weight min_weight = weight_cutoff;
    357360
    358361    Xapian::weight percent_factor = percent_cutoff / 100.0;
    359362
     
    381384
    382385    while (true) {
    383386        if (recalculate_w_max) {
    384             if (min_item.wt > 0.0 && getorrecalc_maxweight(pl) < min_item.wt) {
     387            if (min_weight > 0.0 && getorrecalc_maxweight(pl) < min_weight) {
    385388                DEBUGLINE(MATCH, "*** TERMINATING EARLY (1)");
    386389                break;
    387390            }
    388391        }
    389392
    390         if (next_handling_prune(pl, min_item.wt, this)) {
     393        if (next_handling_prune(pl, min_weight, this)) {
    391394            DEBUGLINE(MATCH, "*** REPLACING ROOT");
    392395
    393             // no need for a full recalc (unless we've got to do one because
    394             // of a prune elsewhere) - we're just switching to a subtree
    395             if (getorrecalc_maxweight(pl) < min_item.wt) {
    396                 DEBUGLINE(MATCH, "*** TERMINATING EARLY (2)");
    397                 break;
     396            if (min_weight > 0.0) {
     397                // No need for a full recalc (unless we've got to do one
     398                // because of a prune elsewhere) - we're just switching to a
     399                // subtree.
     400                if (getorrecalc_maxweight(pl) < min_weight) {
     401                    DEBUGLINE(MATCH, "*** TERMINATING EARLY (2)");
     402                    break;
     403                }
    398404            }
    399405        }
    400406
     
    403409            break;
    404410        }
    405411
    406         Xapian::docid did = pl->get_docid();
     412        // Only calculate the weight if we need it for mcmp, or there's a
     413        // percentage or weight cutoff in effect.  Otherwise we calculate it
     414        // below if we haven't already rejected this candidate.
    407415        Xapian::weight wt = 0.0;
    408         // Only calculate the weight if we need it for mcmp - otherwise
    409         // we calculate it below only if we keep the item
    410         if (min_item.wt > 0.0) wt = pl->get_weight();
     416        bool calculated_weight = false;
     417        if (sort_by != VAL || min_weight > 0.0) {
     418            wt = pl->get_weight();
     419            if (wt < min_weight) continue;
     420            calculated_weight = true;
     421        }
    411422
     423        Xapian::docid did = pl->get_docid();
    412424        DEBUGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
    413425        Xapian::Internal::MSetItem new_item(wt, did);
    414426        if (sort_by != REL) {
     
    418430            Xapian::docid m = (new_item.did - 1) / multiplier + 1; // real docid in that database
    419431            Xapian::Internal::RefCntPtr<Xapian::Document::Internal> doc(db.internal[n]->open_document(m, true));
    420432            new_item.sort_key = doc->get_value(sort_key);
    421         }
    422433
    423         // Test if item has high enough weight (or sort key) to get into
    424         // proto-mset.
    425         if (sort_by != REL || min_item.wt > 0.0)
     434            // We're sorting by value (in part at least), so compare the item
     435            // against the lowest currently in the proto-mset.  If sort_by is
     436            // VAL, then new_item.wt won't yet be set, but that doesn't
     437            // matter since it's not used by the sort function.
    426438            if (!mcmp(new_item, min_item)) continue;
     439        }
    427440
    428441        Xapian::Internal::RefCntPtr<Xapian::Document::Internal> doc;
    429442
     
    446459            }
    447460        }
    448461
    449         if (min_item.wt <= 0.0) {
     462        if (!calculated_weight) {
    450463            // we didn't calculate the weight above, but now we will need it
    451464            wt = pl->get_weight();
    452465            new_item.wt = wt;
     
    489502                    // replacement item
    490503                    new_item.collapse_count = old_item.collapse_count + 1;
    491504
    492                     // This is best match with this key so far:
    493                     // remove the old one from the MSet
    494                     if ((sort_by != VAL && min_item.wt <= 0.0) ||
    495                         mcmp(old_item, min_item)) {
    496                         // Old one hasn't fallen out of MSet yet
    497                         // Scan through (unsorted) MSet looking for entry
     505                    // There was a previous item in the collapse tab so
     506                    // the MSet can't be empty.
     507                    Assert(!items.empty());
     508
     509                    // This is best potential MSet entry with this key which
     510                    // we've seen so far.  Check if the previous best entry
     511                    // with this key might still be in the proto-MSet.  If it
     512                    // might be, we need to check through for it.
     513                    if (old_item.wt >= min_weight && mcmp(old_item, min_item)) {
     514                        // Scan through (unsorted) MSet looking for entry.
    498515                        // FIXME: more efficient way than just scanning?
    499516                        Xapian::docid olddid = old_item.did;
    500                         DEBUGLINE(MATCH, "collapsem: removing " << olddid <<
    501                                   ": " << new_item.collapse_key);
    502517                        vector<Xapian::Internal::MSetItem>::iterator i;
    503                         for (i = items.begin(); i->did != olddid; ++i) {
    504                             Assert(i != items.end());
     518                        for (i = items.begin(); i != items.end(); ++i) {
     519                            if (i->did == olddid) {
     520                                DEBUGLINE(MATCH, "collapsem: removing " <<
     521                                          olddid << ": " <<
     522                                          new_item.collapse_key);
     523                                // We can replace an arbitrary element in
     524                                // O(log N) but have to do it by hand (in this
     525                                // case the new elt is bigger, so we just swap
     526                                // down the tree).
     527                                // FIXME: implement this, and clean up is_heap
     528                                // handling
     529                                *i = new_item;
     530                                pushback = false;
     531                                is_heap = false;
     532                                break;
     533                            }
    505534                        }
    506                         *i = new_item;
    507                         pushback = false;
    508                         // We can replace an arbitrary element in O(log N)
    509                         // but have to do it by hand (in this case the new
    510                         // elt is bigger, so we just swap down the tree)
    511                         // FIXME: implement this, and clean up is_heap
    512                         // handling
    513                         is_heap = false;
    514535                    }
    515536
    516537                    // Keep the old weight as it is now second best so far
     
    536557                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
    537558                         MSetCmp>(items.begin(), items.end(), mcmp);
    538559                items.pop_back();
     560
     561                min_item = items.front();
    539562                if (sort_by == REL || sort_by == REL_VAL) {
    540                     Xapian::weight tmp = min_item.wt;
    541                     min_item = items.front();
    542                     min_item.wt = tmp;
    543                 } else {
    544                     min_item = items.front();
     563                    if (min_item.wt > min_weight) min_weight = min_item.wt;
    545564                }
    546                 if (getorrecalc_maxweight(pl) < min_item.wt) {
     565                if (getorrecalc_maxweight(pl) < min_weight) {
    547566                    DEBUGLINE(MATCH, "*** TERMINATING EARLY (3)");
    548567                    break;
    549568                }
     
    563582            greatest_wt = wt;
    564583            if (percent_cutoff) {
    565584                Xapian::weight w = wt * percent_factor;
    566                 if (w > min_item.wt) {
    567                     min_item.wt = w;
    568                     min_item.did = 0;
     585                if (w > min_weight) {
     586                    min_weight = w;
    569587                    if (!is_heap) {
    570588                        is_heap = true;
    571589                        make_heap<vector<Xapian::Internal::MSetItem>::iterator,
    572590                                  MSetCmp>(items.begin(), items.end(), mcmp);
    573591                    }
    574                     while (!items.empty() && items.front().wt < min_item.wt) {
     592                    while (!items.empty() && items.front().wt < min_weight) {
    575593                        pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
    576594                                 MSetCmp>(items.begin(), items.end(), mcmp);
    577                         Assert(items.back().wt < min_item.wt);
     595                        Assert(items.back().wt < min_weight);
    578596                        items.pop_back();
    579597                    }
    580598#ifdef XAPIAN_DEBUG_PARANOID
    581599                    vector<Xapian::Internal::MSetItem>::const_iterator i;
    582600                    for (i = items.begin(); i != items.end(); ++i) {
    583                         Assert(i->wt >= min_item.wt);
     601                        Assert(i->wt >= min_weight);
    584602                    }
    585603#endif
    586604                }
     
    665683            = items.size();
    666684    } else {
    667685        if (percent_cutoff) {
    668             // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_item.wt / greatest_wt);
     686            // another approach: Xapian::doccount new_est = items.size() * (1 - percent_factor) / (1 - min_weight / greatest_wt);
    669687            Xapian::doccount new_est;
    670688            new_est = Xapian::doccount((1 - percent_factor) * matches_estimated);
    671689            matches_estimated = max(size_t(new_est), items.size());
    672             // and another: items.size() + (1 - greatest_wt * percent_factor / min_item.wt) * (matches_estimated - items.size());
     690            // and another: items.size() + (1 - greatest_wt * percent_factor / min_weight) * (matches_estimated - items.size());
    673691
    674692            // Very likely an underestimate, but we can't really do better without
    675693            // checking further matches...  Only possibility would be to track how