Ticket #394: phrase-settling-pond-update-20120911.patch

File phrase-settling-pond-update-20120911.patch, 24.6 KB (added by Olly Betts, 12 years ago)

updated for trunk as of 2012-09-11

  • xapian-core/api/leafpostlist.cc

    diff --git a/xapian-core/api/leafpostlist.cc b/xapian-core/api/leafpostlist.cc
    index 0bbeedb..c5e969b 100644
    a b LeafPostList::count_matching_subqs() const  
    102102{
    103103    return 1;
    104104}
     105
     106std::string
     107LeafPostList::get_termname() const
     108{
            // Report the name stored in this object's `term` member (returned by
            // value, so the caller gets its own copy).
     109    return term;
     110}
  • xapian-core/api/leafpostlist.h

    diff --git a/xapian-core/api/leafpostlist.h b/xapian-core/api/leafpostlist.h
    index bf107ca..34c05fa 100644
    a b class LeafPostList : public PostList {  
    8686    TermFreqs get_termfreq_est_using_stats(
    8787        const Xapian::Weight::Internal & stats) const;
    8888
     89    virtual std::string get_termname() const;
     90
    8991    Xapian::termcount count_matching_subqs() const;
    9092};
    9193
  • xapian-core/api/postlist.cc

    diff --git a/xapian-core/api/postlist.cc b/xapian-core/api/postlist.cc
    index 2684913..886c29f 100644
    a b PostList::count_matching_subqs() const  
    7878    return 0;
    7979}
    8080
     81std::string
     82PostList::get_termname() const
     83{
           // Base-class version of the virtual method (LeafPostList overrides
           // it): there is no term name to report, so return an empty string.
     84    return std::string();
     85}
     86
    8187}
  • xapian-core/api/postlist.h

    diff --git a/xapian-core/api/postlist.h b/xapian-core/api/postlist.h
    index 0c7ca1f..8fd67e6 100644
    a b class Xapian::PostingIterator::Internal : public Xapian::Internal::intrusive_bas  
    194194    /// Count the number of leaf subqueries which match at the current position.
    195195    virtual Xapian::termcount count_matching_subqs() const;
    196196
     197    /// If this is a term, return the name, otherwise return empty string.
     198    virtual std::string get_termname() const;
     199
    197200    /// Return a string description of this object.
    198201    virtual std::string get_description() const = 0;
    199202};
  • xapian-core/api/queryinternal.cc

    diff --git a/xapian-core/api/queryinternal.cc b/xapian-core/api/queryinternal.cc
    index 9359e48..0bc5daa 100644
    a b class AndContext : public Context {  
    248248                  Xapian::termcount window_)
    249249            : op_(op__), begin(begin_), end(end_), window(window_) { }
    250250
    251         PostList * postlist(PostList * pl, const vector<PostList*>& pls) const;
     251        PostList * postlist(PostList * pl, const vector<PostList*>& pls,
     252                            QueryOptimiser * qopt) const;
    252253    };
    253254
    254255    list<PosFilter> pos_filters;
    class AndContext : public Context {  
    264265};
    265266
    266267PostList *
    267 AndContext::PosFilter::postlist(PostList * pl, const vector<PostList*>& pls) const
     268AndContext::PosFilter::postlist(PostList * pl, const vector<PostList*>& pls,
     269                                QueryOptimiser * qopt) const
    268270try {
    269271    vector<PostList *>::const_iterator terms_begin = pls.begin() + begin;
    270272    vector<PostList *>::const_iterator terms_end = pls.begin() + end;
    try {  
    273275        pl = new NearPostList(pl, window, terms_begin, terms_end);
    274276    } else if (window == end - begin) {
    275277        AssertEq(op_, Xapian::Query::OP_PHRASE);
    276         pl = new ExactPhrasePostList(pl, terms_begin, terms_end);
     278        if (qopt->top_and) {
     279            vector<PostList *>::const_iterator j;
     280            for (j = terms_begin; j != terms_end; ++j) {
     281                const string & term = (*j)->get_termname();
     282                if (term.empty()) {
     283                    // FIXME: Currently all the subqueries must be terms.
     284                    qopt->pool_terms.clear();
     285                    goto cannot_pool;
     286                }
     287                qopt->pool_terms.push_back(term);
     288            }
     289            // We can currently only handle hoisting out one phrase check.
     290            // FIXME: Gather a list of checks, not a list of the terms in one
     291            // check.
     292            qopt->top_and = false;
     293        } else {
     294cannot_pool:
     295            pl = new ExactPhrasePostList(pl, terms_begin, terms_end);
     296        }
    277297    } else {
    278298        AssertEq(op_, Xapian::Query::OP_PHRASE);
    279299        pl = new PhrasePostList(pl, window, terms_begin, terms_end);
    AndContext::postlist(QueryOptimiser* qopt)  
    308328    list<PosFilter>::const_iterator i;
    309329    for (i = pos_filters.begin(); i != pos_filters.end(); ++i) {
    310330        const PosFilter & filter = *i;
    311         pl.reset(filter.postlist(pl.release(), pls));
     331        pl.reset(filter.postlist(pl.release(), pls, qopt));
    312332    }
    313333
    314334    // Empty pls so our destructor doesn't delete them all!
    Query::Internal::postlist_sub_or_like(OrContext& ctx,  
    491511                                      QueryOptimiser * qopt,
    492512                                      double factor) const
    493513{
     514    bool top_and = qopt->top_and;
     515    qopt->top_and = false;
    494516    ctx.add_postlist(postlist(qopt, factor));
     517    qopt->top_and = top_and;
    495518}
    496519
    497520void
    Query::Internal::postlist_sub_xor(XorContext& ctx,  
    499522                                  QueryOptimiser * qopt,
    500523                                  double factor) const
    501524{
     525    bool top_and = qopt->top_and;
     526    qopt->top_and = false;
    502527    ctx.add_postlist(postlist(qopt, factor));
     528    qopt->top_and = top_and;
    503529}
    504530
    505531namespace Internal {
    QueryAndNot::postlist(QueryOptimiser * qopt, double factor) const  
    11471173    LOGCALL(QUERY, PostingIterator::Internal *, "QueryAndNot::postlist", qopt | factor);
    11481174    // FIXME: Combine and-like side with and-like stuff above.
    11491175    AutoPtr<PostList> l(subqueries[0].internal->postlist(qopt, factor));
     1176    bool top_and = qopt->top_and;
     1177    qopt->top_and = false;
    11501178    OrContext ctx(subqueries.size() - 1);
    11511179    do_or_like(ctx, qopt, 0.0, 0, 1);
    11521180    AutoPtr<PostList> r(ctx.postlist(qopt));
     1181    qopt->top_and = top_and;
    11531182    RETURN(new AndNotPostList(l.release(), r.release(),
    11541183                              qopt->matcher, qopt->db_size));
    11551184}
    QueryAndMaybe::postlist(QueryOptimiser * qopt, double factor) const  
    11801209    LOGCALL(QUERY, PostingIterator::Internal *, "QueryAndMaybe::postlist", qopt | factor);
    11811210    // FIXME: Combine and-like side with and-like stuff above.
    11821211    AutoPtr<PostList> l(subqueries[0].internal->postlist(qopt, factor));
     1212    bool top_and = qopt->top_and;
     1213    qopt->top_and = false;
    11831214    OrContext ctx(subqueries.size() - 1);
    11841215    do_or_like(ctx, qopt, factor, 0, 1);
    11851216    AutoPtr<PostList> r(ctx.postlist(qopt));
     1217    qopt->top_and = top_and;
    11861218    RETURN(new AndMaybePostList(l.release(), r.release(),
    11871219                                qopt->matcher, qopt->db_size));
    11881220}
  • xapian-core/common/submatch.h

    diff --git a/xapian-core/common/submatch.h b/xapian-core/common/submatch.h
    index c90eee0..bdd16f0 100644
    a b class SubMatch : public Xapian::Internal::intrusive_base {  
    7676    virtual PostList * get_postlist_and_term_info(MultiMatch *matcher,
    7777        std::map<std::string,
    7878                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
    79         Xapian::termcount * total_subqs_ptr)
     79        Xapian::termcount * total_subqs_ptr,
     80        std::vector<std::string> & pool_terms)
    8081        = 0;
    8182};
    8283
  • xapian-core/matcher/Makefile.mk

    diff --git a/xapian-core/matcher/Makefile.mk b/xapian-core/matcher/Makefile.mk
    index 0bae22c..85dc522 100644
    a b noinst_HEADERS +=\  
    44        matcher/branchpostlist.h\
    55        matcher/collapser.h\
    66        matcher/const_database_wrapper.h\
     7        matcher/exactphrasecheck.h\
    78        matcher/exactphrasepostlist.h\
    89        matcher/externalpostlist.h\
    910        matcher/extraweightpostlist.h\
    lib_src +=\  
    4243        matcher/branchpostlist.cc\
    4344        matcher/collapser.cc\
    4445        matcher/const_database_wrapper.cc\
     46        matcher/exactphrasecheck.cc\
    4547        matcher/exactphrasepostlist.cc\
    4648        matcher/externalpostlist.cc\
    4749        matcher/localsubmatch.cc\
  • new file xapian-core/matcher/exactphrasecheck.cc

    diff --git a/xapian-core/matcher/exactphrasecheck.cc b/xapian-core/matcher/exactphrasecheck.cc
    new file mode 100644
    index 0000000..c6eade9
    - +  
     1/** @file exactphrasecheck.cc
     2 * @brief Check if terms form a particular exact phrase.
     3 */
     4/* Copyright (C) 2006,2007,2009,2012 Olly Betts
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21// FIXME: this could probably share code with ExactPhrasePostList.
     22
     23#include <config.h>
     24
     25#include "exactphrasecheck.h"
     26
     27#include "debuglog.h"
     28#include "omassert.h"
     29#include "backends/positionlist.h"
     30
     31#include <algorithm>
     32#include <vector>
     33
     34using namespace std;
     35
     36class TermCompare {
     37    const Xapian::Database & db;
     38    vector<string> & terms;
     39
     40  public:
     41    TermCompare(const Xapian::Database & db_,
     42                vector<string> & terms_)
     43        : db(db_), terms(terms_) { }
     44
     45    bool operator()(unsigned a, unsigned b) const {
     46        return db.get_collection_freq(terms[a]) < db.get_collection_freq(terms[b]);
     47    }
     48};
     49
     50ExactPhraseCheck::ExactPhraseCheck(const Xapian::Database & db_,
     51                                   const vector<string> &terms_)
     52    : db(db_), terms(terms_)
     53{
     54    if (terms.empty()) {
     55        poslists = NULL;
     56        order = NULL;
     57        return;
     58    }
     59
     60    AssertRel(terms.size(),>,1);
     61    size_t n = terms_.size();
     62    poslists = new PositionList*[n];
     63    try {
     64        order = new unsigned[n];
     65    } catch (...) {
     66        delete [] poslists;
     67        throw;
     68    }
     69    for (size_t i = 0; i < n; ++i) {
     70        poslists[i] = NULL;
     71        order[i] = unsigned(i);
     72    }
     73
     74    // We often don't need to read all the position lists, so rather than using
     75    // the shortest position lists first, we approximate by using the terms
     76    // with the lowest collection freq first.  Overall this should give a
     77    // similar order.
     78    sort(order, order + terms.size(), TermCompare(db, terms));
     79}
     80
     81ExactPhraseCheck::~ExactPhraseCheck()
     82{
           // Only the two arrays are freed here; operator() deletes the
           // individual position lists itself before it returns.
     83    delete [] poslists;
     84    delete [] order;
     85}
     86
     87bool
     88ExactPhraseCheck::start_position_list(unsigned i, Xapian::docid did)
     89{
           // Open the position list of the i-th term in checking order (that is,
           // terms[order[i]]) for document did and store it in poslists[i].
           //
           // Returns false, leaving poslists[i] untouched, if the iterator
           // obtained from the database has no internal object; true otherwise.
     90    AssertRel(i,<,terms.size());
     91    unsigned index = order[i];
     92    // FIXME: nasty hacking around with internals and ref counts - we should
     93    // just add a new Database::Internal method to do what we want.
     94    Xapian::PositionIterator p = db.positionlist_begin(did, terms[index]);
     95    PositionList * tmp = p.internal;
     96    if (!tmp)
     97        return false;
     98    ++tmp->_refs;
           // Hand whatever was previously in slot i to the temporary iterator and
           // keep the newly opened list in its place.
     99    p.internal = poslists[i];
     100    poslists[i] = tmp;
            // Record the term's offset within the phrase on the list itself;
            // operator() adds it to the phrase start to get the required position.
     101    poslists[i]->index = index;
     102    return true;
     103}
     104
     105bool
     106ExactPhraseCheck::operator()(Xapian::docid did)
     107{
     108    LOGCALL(MATCH, bool, "ExactPhraseCheck::operator()", did);
     109
     110    if (terms.size() <= 1) RETURN(true);
     111
     112    // We often don't need to read all the position lists, so rather than using
     113
     114    AssertRel(terms.size(),>,1);
     115
     116    bool result = false;
     117    // If the first term we check only occurs too close to the start of the
     118    // document, we only need to read one term's positions.  E.g. search for
     119    // "ripe mango" when the only occurrence of 'mango' in the current document
     120    // is at position 0.
     121    if (!start_position_list(0, did))
     122        goto done;
     123    poslists[0]->skip_to(poslists[0]->index);
     124    if (poslists[0]->at_end()) goto done;
     125
     126    // If we get here, we'll need to read the positionlists for at least two
     127    // terms, so check the true positionlist length for the two terms with the
     128    // lowest wdf and if necessary swap them so the true shorter one is first.
     129    if (!start_position_list(1, did))
     130        goto done;
     131    if (poslists[0]->get_size() < poslists[1]->get_size()) {
     132        poslists[1]->skip_to(poslists[1]->index);
     133        if (poslists[1]->at_end()) goto done;
     134        swap(poslists[0], poslists[1]);
     135    }
     136
     137    {
     138        unsigned read_hwm = 1;
     139        Xapian::termpos idx0 = poslists[0]->index;
     140        do {
     141            Xapian::termpos base = poslists[0]->get_position() - idx0;
     142            unsigned i = 1;
     143            while (true) {
     144                if (i > read_hwm) {
     145                    read_hwm = i;
     146                    if (!start_position_list(i, did))
     147                        goto done;
     148                    // FIXME: consider comparing with poslist[0] and swapping
     149                    // if less common.  Should we allow for the number of positions
     150                    // we've read from poslist[0] already?
     151                }
     152                Xapian::termpos required = base + poslists[i]->index;
     153                poslists[i]->skip_to(required);
     154                if (poslists[i]->at_end()) goto done;
     155                if (poslists[i]->get_position() != required) break;
     156                if (++i == terms.size()) {
     157                    result = true;
     158                    goto done;
     159                }
     160            }
     161            poslists[0]->next();
     162        } while (!poslists[0]->at_end());
     163    }
     164done:
     165    for (size_t i = 0; i < terms.size(); ++i) {
     166        delete poslists[i];
     167        poslists[i] = NULL;
     168    }
     169    RETURN(result);
     170}
  • new file xapian-core/matcher/exactphrasecheck.h

    diff --git a/xapian-core/matcher/exactphrasecheck.h b/xapian-core/matcher/exactphrasecheck.h
    new file mode 100644
    index 0000000..52b9e9e
    - +  
     1/** @file exactphrasecheck.h
     2 * @brief Check if terms form a particular exact phrase.
     3 */
     4/* Copyright (C) 2006,2012 Olly Betts
     5 * Copyright (C) 2009 Lemur Consulting Ltd
     6 *
     7 * This program is free software; you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation; either version 2 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * This program is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with this program; if not, write to the Free Software
     19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     20 */
     21
     22#ifndef XAPIAN_INCLUDED_EXACTPHRASEPOSTLIST_H
     23#define XAPIAN_INCLUDED_EXACTPHRASEPOSTLIST_H
     24
     25#include "xapian/database.h"
     26
     27#include <string>
     28#include <vector>
     29
     30typedef Xapian::PositionIterator::Internal PositionList;
     31
     32/** Check for an exact phrase using positional information.
     33 *
     34 *  Tests if the terms occur somewhere in the document in the order given
     35 *  and at adjacent term positions.
     36 */
     37class ExactPhraseCheck {
     38    Xapian::Database db;
     39
     40    std::vector<std::string> terms;
     41
     42    PositionList ** poslists;
     43
     44    unsigned * order;
     45
     46    /// Start reading from the i-th position list.
     47    bool start_position_list(unsigned i, Xapian::docid did);
     48
     49  public:
     50    ExactPhraseCheck(const Xapian::Database & db_,
     51                     const std::vector<std::string> &terms_);
     52
     53    ~ExactPhraseCheck();
     54
     55    /// Test if the specified document contains the terms as an exact phrase.
     56    bool operator()(Xapian::docid did);
     57};
     58
     59#endif
  • xapian-core/matcher/localsubmatch.cc

    diff --git a/xapian-core/matcher/localsubmatch.cc b/xapian-core/matcher/localsubmatch.cc
    index 10c648f..763fdcc 100644
    a b LocalSubMatch::start_match(Xapian::doccount first,  
    6868PostList *
    6969LocalSubMatch::get_postlist_and_term_info(MultiMatch * matcher,
    7070        map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts,
    71         Xapian::termcount * total_subqs_ptr)
     71        Xapian::termcount * total_subqs_ptr,
     72        std::vector<std::string> & pool_terms)
    7273{
    7374    LOGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist_and_term_info", matcher | termfreqandwts | total_subqs_ptr);
    7475    (void)matcher;
    LocalSubMatch::get_postlist_and_term_info(MultiMatch * matcher,  
    8283
    8384    PostList * pl;
    8485    {
    85         QueryOptimiser opt(*db, *this, matcher);
     86        QueryOptimiser opt(*db, *this, matcher, pool_terms);
    8687        pl = query.internal->postlist(&opt, 1.0);
    8788        *total_subqs_ptr = opt.get_total_subqs();
    8889    }
  • xapian-core/matcher/localsubmatch.h

    diff --git a/xapian-core/matcher/localsubmatch.h b/xapian-core/matcher/localsubmatch.h
    index 8e92416..ea50a2c 100644
    a b class LocalSubMatch : public SubMatch {  
    8989    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    9090        std::map<std::string,
    9191                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
    92         Xapian::termcount * total_subqs_ptr);
     92        Xapian::termcount * total_subqs_ptr,
     93        std::vector<std::string> & pool_terms);
    9394
    9495    /** Convert a postlist into a synonym postlist.
    9596     */
  • xapian-core/matcher/multimatch.cc

    diff --git a/xapian-core/matcher/multimatch.cc b/xapian-core/matcher/multimatch.cc
    index 654908a..a221ce8 100644
    a b  
    4646#include "valuestreamdocument.h"
    4747#include "weight/weightinternal.h"
    4848
     49#include "exactphrasecheck.h"
     50
    4951#include <xapian/errorhandler.h>
    5052#include <xapian/matchspy.h>
    5153#include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
    MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,  
    355357    map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
    356358    termfreqandwts_ptr = &termfreqandwts;
    357359
     360    vector<string> pool_terms;
    358361    Xapian::termcount total_subqs = 0;
    359362    // Keep a count of matches which we know exist, but we won't see.  This
    360363    // occurs when a submatch is remote, and returns a lower bound on the
    MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,  
    364367    for (size_t i = 0; i != leaves.size(); ++i) {
    365368        PostList *pl;
    366369        try {
     370            if (!is_remote[i]) pool_terms.clear();
    367371            pl = leaves[i]->get_postlist_and_term_info(this,
    368372                                                       termfreqandwts_ptr,
    369                                                        &total_subqs);
     373                                                       &total_subqs,
     374                                                       pool_terms);
    370375            if (termfreqandwts_ptr && !termfreqandwts.empty())
    371376                termfreqandwts_ptr = NULL;
    372377            if (is_remote[i]) {
    MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,  
    525530    // Is the mset a valid heap?
    526531    bool is_heap = false;
    527532
     533    size_t SETTLING_POND_SIZE = 0;
     534    if (!pool_terms.empty()) {
     535        const char * sps = getenv("POND_SIZE");
     536        SETTLING_POND_SIZE = sps ? atoi(sps) : 100000;
     537    }
     538    ExactPhraseCheck phrase_check(db, pool_terms);
     539    // FIXME: a min/max heap is probably a better choice here (notably more
     540    // compact) but the STL doesn't provide one so we'd have to find an
     541    // implementation or write one.
     542    multimap<double, Xapian::Internal::MSetItem> settling_pond;
    528543    while (true) {
    529544        bool pushback;
    530545
    MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,  
    646661            new_item.wt = wt;
    647662        }
    648663
     664        if (SETTLING_POND_SIZE) {
     665            if (items.size() >= max_msize) {
     666                // Settling pond handling...
     667                multimap<double, Xapian::Internal::MSetItem>::iterator it;
     668                it = settling_pond.upper_bound(-min_weight);
     669                settling_pond.erase(it, settling_pond.end());
     670
     671                settling_pond.insert(make_pair(-new_item.wt, new_item));
     672                if (settling_pond.size() < SETTLING_POND_SIZE) {
     673                    continue;
     674                }
     675
     676                // Take the first item out of the pond: it is keyed on negated
     677                // weight, so this is the highest weighted item being held back.
     678                it = settling_pond.begin();
     679                swap(new_item, it->second);
     680                settling_pond.erase(it);
     681            }
     682            if (!phrase_check(new_item.did)) continue;
     683        }
     684
    649685        pushback = true;
    650686
    651687        // Perform collapsing on key if requested.
    new_greatest_weight:  
    808844        }
    809845    }
    810846
     847    multimap<double, Xapian::Internal::MSetItem>::iterator it;
     848    for (it = settling_pond.begin(); it != settling_pond.end(); ++it) {
     849        const Xapian::Internal::MSetItem & new_item = it->second;
     850        if (new_item.wt < min_weight) break;
     851        if (!phrase_check(new_item.did)) continue;
     852
     853        {
     854            ++docs_matched;
     855            if (items.size() >= max_msize) {
     856                items.push_back(new_item);
     857                if (!is_heap) {
     858                    is_heap = true;
     859                    make_heap(items.begin(), items.end(), mcmp);
     860                } else {
     861                    push_heap<vector<Xapian::Internal::MSetItem>::iterator,
     862                              MSetCmp>(items.begin(), items.end(), mcmp);
     863                }
     864                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     865                         MSetCmp>(items.begin(), items.end(), mcmp);
     866                items.pop_back();
     867
     868                min_item = items.front();
     869                if (sort_by == REL || sort_by == REL_VAL) {
     870                    if (docs_matched >= check_at_least) {
     871                        if (sort_by == REL) {
     872                            // We're done if this is a forward boolean match
     873                            // with only one database (bodgetastic, FIXME
     874                            // better if we can!)
     875                            if (rare(max_possible == 0 && sort_forward)) {
     876                                // In the multi database case, MergePostList
     877                                // currently processes each database
     878                                // sequentially (which actually may well be
     879                                // more efficient) so the docids in general
     880                                // won't arrive in order.
     881                                // FIXME: is this still good here:
     882                                // if (leaves.size() == 1) break;
     883                            }
     884                        }
     885                        if (min_item.wt > min_weight) {
     886                            LOGLINE(MATCH, "Setting min_weight to " <<
     887                                    min_item.wt << " from " << min_weight);
     888                            min_weight = min_item.wt;
     889                        }
     890                    }
     891                }
     892            } else {
     893                items.push_back(new_item);
     894                is_heap = false;
     895                if (sort_by == REL && items.size() == max_msize) {
     896                    if (docs_matched >= check_at_least) {
     897                        // We're done if this is a forward boolean match
     898                        // with only one database (bodgetastic, FIXME
     899                        // better if we can!)
     900                        if (rare(max_possible == 0 && sort_forward)) {
     901                            // In the multi database case, MergePostList
     902                            // currently processes each database
     903                            // sequentially (which actually may well be
     904                            // more efficient) so the docids in general
     905                            // won't arrive in order.
     906                            // FIXME: if (leaves.size() == 1) break;
     907                        }
     908                    }
     909                }
     910            }
     911        }
     912
     913        // Keep a track of the greatest weight we've seen.
     914        if (new_item.wt > greatest_wt) {
     915            greatest_wt = new_item.wt;
     916#ifdef XAPIAN_HAS_REMOTE_BACKEND
     917            const unsigned int multiplier = db.internal.size();
     918            unsigned int db_num = (new_item.did - 1) % multiplier;
     919            if (is_remote[db_num]) {
     920                // Note that the greatest weighted document came from a remote
     921                // database, and which one.
     922                greatest_wt_subqs_db_num = db_num;
     923            } else
     924#endif
     925            {
     926                greatest_wt_subqs_matched = pl->count_matching_subqs();
     927#ifdef XAPIAN_HAS_REMOTE_BACKEND
     928                greatest_wt_subqs_db_num = UINT_MAX;
     929#endif
     930            }
     931            if (percent_cutoff) {
     932                double w = new_item.wt * percent_cutoff_factor;
     933                if (w > min_weight) {
     934                    min_weight = w;
     935                    if (!is_heap) {
     936                        is_heap = true;
     937                        make_heap<vector<Xapian::Internal::MSetItem>::iterator,
     938                                  MSetCmp>(items.begin(), items.end(), mcmp);
     939                    }
     940                    while (!items.empty() && items.front().wt < min_weight) {
     941                        pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     942                                 MSetCmp>(items.begin(), items.end(), mcmp);
     943                        Assert(items.back().wt < min_weight);
     944                        items.pop_back();
     945                    }
     946#ifdef XAPIAN_ASSERTIONS_PARANOID
     947                    vector<Xapian::Internal::MSetItem>::const_iterator i;
     948                    for (i = items.begin(); i != items.end(); ++i) {
     949                        Assert(i->wt >= min_weight);
     950                    }
     951#endif
     952                }
     953            }
     954        }
     955    }
     956
     957
    811958    // done with posting list tree
    812959    pl.reset(NULL);
    813960
  • xapian-core/matcher/queryoptimiser.h

    diff --git a/xapian-core/matcher/queryoptimiser.h b/xapian-core/matcher/queryoptimiser.h
    index 7147b76..cfa6409 100644
    a b class QueryOptimiser {  
    4949    Xapian::termcount total_subqs;
    5050
    5151  public:
     52    std::vector<std::string> & pool_terms;
     53
     54    bool top_and;
     55
    5256    const Xapian::Database::Internal & db;
    5357
    5458    Xapian::doccount db_size;
    class QueryOptimiser {  
    5761
    5862    QueryOptimiser(const Xapian::Database::Internal & db_,
    5963                   LocalSubMatch & localsubmatch_,
    60                    MultiMatch * matcher_)
     64                   MultiMatch * matcher_,
     65                   std::vector<std::string> & pool_terms_)
    6166        : localsubmatch(localsubmatch_), total_subqs(0),
    62           db(db_), db_size(db.get_doccount()), matcher(matcher_) { }
     67          pool_terms(pool_terms_), top_and(true), db(db_),
     68          db_size(db.get_doccount()), matcher(matcher_) { }
    6369
    6470    void inc_total_subqs() { ++total_subqs; }
    6571
  • xapian-core/matcher/remotesubmatch.cc

    diff --git a/xapian-core/matcher/remotesubmatch.cc b/xapian-core/matcher/remotesubmatch.cc
    index ff5184e..4e6efa4 100644
    a b RemoteSubMatch::start_match(Xapian::doccount first,  
    6262PostList *
    6363RemoteSubMatch::get_postlist_and_term_info(MultiMatch *,
    6464        map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts,
    65         Xapian::termcount * total_subqs_ptr)
     65        Xapian::termcount * total_subqs_ptr,
     66        std::vector<std::string> &)
    6667{
    6768    LOGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info", Literal("[matcher]") | termfreqandwts | total_subqs_ptr);
    6869    Xapian::MSet mset;
  • xapian-core/matcher/remotesubmatch.h

    diff --git a/xapian-core/matcher/remotesubmatch.h b/xapian-core/matcher/remotesubmatch.h
    index 1198d8a..7d29e16 100644
    a b class RemoteSubMatch : public SubMatch {  
    7272    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    7373        std::map<std::string,
    7474                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
    75         Xapian::termcount * total_subqs_ptr);
     75        Xapian::termcount * total_subqs_ptr,
     76        std::vector<std::string> & pool_terms);
    7677
    7778    /// Get percentage factor - only valid after get_postlist_and_term_info().
    7879    double get_percent_factor() const { return percent_factor; }