Ticket #394: phrase-settling-pond-update-20130517.patch

File phrase-settling-pond-update-20130517.patch, 30.7 KB (added by Olly Betts, 2 years ago)

2013 version of pond patch

  • xapian-core-1.3.1

    Description: Use a settling pond for faster phrase processing
     Somewhat experimental.
    Author: Olly Betts <olly@survex.com>
    
    ---
    Origin: upstream
    Bug: http://trac.xapian.org/ticket/394
    Last-Update: 2013-05-17
    
    old new  
    470470        languages/steminternal.cc matcher/remotesubmatch.cc \
    471471        matcher/andmaybepostlist.cc matcher/andnotpostlist.cc \
    472472        matcher/branchpostlist.cc matcher/collapser.cc \
    473         matcher/const_database_wrapper.cc \
     473        matcher/const_database_wrapper.cc matcher/exactphrasecheck.cc \
    474474        matcher/exactphrasepostlist.cc matcher/externalpostlist.cc \
    475475        matcher/localsubmatch.cc matcher/mergepostlist.cc \
    476476        matcher/msetcmp.cc matcher/msetpostlist.cc \
     
    618618        languages/stem.lo languages/steminternal.lo $(am__objects_14) \
    619619        matcher/andmaybepostlist.lo matcher/andnotpostlist.lo \
    620620        matcher/branchpostlist.lo matcher/collapser.lo \
    621         matcher/const_database_wrapper.lo \
     621        matcher/const_database_wrapper.lo matcher/exactphrasecheck.lo \
    622622        matcher/exactphrasepostlist.lo matcher/externalpostlist.lo \
    623623        matcher/localsubmatch.lo matcher/mergepostlist.lo \
    624624        matcher/msetcmp.lo matcher/msetpostlist.lo \
     
    886886        languages/steminternal.h matcher/andmaybepostlist.h \
    887887        matcher/andnotpostlist.h matcher/branchpostlist.h \
    888888        matcher/collapser.h matcher/const_database_wrapper.h \
    889         matcher/exactphrasepostlist.h matcher/externalpostlist.h \
    890         matcher/extraweightpostlist.h matcher/localsubmatch.h \
    891         matcher/mergepostlist.h matcher/msetcmp.h \
    892         matcher/msetpostlist.h matcher/multiandpostlist.h \
    893         matcher/multimatch.h matcher/multixorpostlist.h \
    894         matcher/orpostlist.h matcher/phrasepostlist.h \
    895         matcher/queryoptimiser.h matcher/remotesubmatch.h \
    896         matcher/selectpostlist.h matcher/synonympostlist.h \
    897         matcher/valuegepostlist.h matcher/valuerangepostlist.h \
    898         matcher/valuestreamdocument.h net/length.h net/progclient.h \
    899         net/remoteconnection.h net/remoteserver.h \
    900         net/remotetcpclient.h net/remotetcpserver.h \
     889        matcher/exactphrasecheck.h matcher/exactphrasepostlist.h \
     890        matcher/externalpostlist.h matcher/extraweightpostlist.h \
     891        matcher/localsubmatch.h matcher/mergepostlist.h \
     892        matcher/msetcmp.h matcher/msetpostlist.h \
     893        matcher/multiandpostlist.h matcher/multimatch.h \
     894        matcher/multixorpostlist.h matcher/orpostlist.h \
     895        matcher/phrasepostlist.h matcher/queryoptimiser.h \
     896        matcher/remotesubmatch.h matcher/selectpostlist.h \
     897        matcher/synonympostlist.h matcher/valuegepostlist.h \
     898        matcher/valuerangepostlist.h matcher/valuestreamdocument.h \
     899        net/length.h net/progclient.h net/remoteconnection.h \
     900        net/remoteserver.h net/remotetcpclient.h net/remotetcpserver.h \
    901901        net/replicatetcpclient.h net/replicatetcpserver.h \
    902902        net/serialise.h net/tcpclient.h net/tcpserver.h \
    903903        queryparser/cjk-tokenizer.h queryparser/queryparser_internal.h \
     
    11651165        languages/steminternal.h matcher/andmaybepostlist.h \
    11661166        matcher/andnotpostlist.h matcher/branchpostlist.h \
    11671167        matcher/collapser.h matcher/const_database_wrapper.h \
    1168         matcher/exactphrasepostlist.h matcher/externalpostlist.h \
    1169         matcher/extraweightpostlist.h matcher/localsubmatch.h \
    1170         matcher/mergepostlist.h matcher/msetcmp.h \
    1171         matcher/msetpostlist.h matcher/multiandpostlist.h \
    1172         matcher/multimatch.h matcher/multixorpostlist.h \
    1173         matcher/orpostlist.h matcher/phrasepostlist.h \
    1174         matcher/queryoptimiser.h matcher/remotesubmatch.h \
    1175         matcher/selectpostlist.h matcher/synonympostlist.h \
    1176         matcher/valuegepostlist.h matcher/valuerangepostlist.h \
    1177         matcher/valuestreamdocument.h net/length.h net/progclient.h \
    1178         net/remoteconnection.h net/remoteserver.h \
    1179         net/remotetcpclient.h net/remotetcpserver.h \
     1168        matcher/exactphrasecheck.h matcher/exactphrasepostlist.h \
     1169        matcher/externalpostlist.h matcher/extraweightpostlist.h \
     1170        matcher/localsubmatch.h matcher/mergepostlist.h \
     1171        matcher/msetcmp.h matcher/msetpostlist.h \
     1172        matcher/multiandpostlist.h matcher/multimatch.h \
     1173        matcher/multixorpostlist.h matcher/orpostlist.h \
     1174        matcher/phrasepostlist.h matcher/queryoptimiser.h \
     1175        matcher/remotesubmatch.h matcher/selectpostlist.h \
     1176        matcher/synonympostlist.h matcher/valuegepostlist.h \
     1177        matcher/valuerangepostlist.h matcher/valuestreamdocument.h \
     1178        net/length.h net/progclient.h net/remoteconnection.h \
     1179        net/remoteserver.h net/remotetcpclient.h net/remotetcpserver.h \
    11801180        net/replicatetcpclient.h net/replicatetcpserver.h \
    11811181        net/serialise.h net/tcpclient.h net/tcpserver.h \
    11821182        queryparser/cjk-tokenizer.h queryparser/queryparser_internal.h \
     
    12721272        languages/stem.cc languages/steminternal.cc $(am__append_23) \
    12731273        matcher/andmaybepostlist.cc matcher/andnotpostlist.cc \
    12741274        matcher/branchpostlist.cc matcher/collapser.cc \
    1275         matcher/const_database_wrapper.cc \
     1275        matcher/const_database_wrapper.cc matcher/exactphrasecheck.cc \
    12761276        matcher/exactphrasepostlist.cc matcher/externalpostlist.cc \
    12771277        matcher/localsubmatch.cc matcher/mergepostlist.cc \
    12781278        matcher/msetcmp.cc matcher/msetpostlist.cc \
     
    18941894        matcher/$(DEPDIR)/$(am__dirstamp)
    18951895matcher/const_database_wrapper.lo: matcher/$(am__dirstamp) \
    18961896        matcher/$(DEPDIR)/$(am__dirstamp)
     1897matcher/exactphrasecheck.lo: matcher/$(am__dirstamp) \
     1898        matcher/$(DEPDIR)/$(am__dirstamp)
    18971899matcher/exactphrasepostlist.lo: matcher/$(am__dirstamp) \
    18981900        matcher/$(DEPDIR)/$(am__dirstamp)
    18991901matcher/externalpostlist.lo: matcher/$(am__dirstamp) \
     
    24042406@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/branchpostlist.Plo@am__quote@
    24052407@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/collapser.Plo@am__quote@
    24062408@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/const_database_wrapper.Plo@am__quote@
     2409@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/exactphrasecheck.Plo@am__quote@
    24072410@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/exactphrasepostlist.Plo@am__quote@
    24082411@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/externalpostlist.Plo@am__quote@
    24092412@AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/localsubmatch.Plo@am__quote@
  • common/submatch.h

    old new  
    7676    virtual PostList * get_postlist_and_term_info(MultiMatch *matcher,
    7777        std::map<std::string,
    7878                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
    79         Xapian::termcount * total_subqs_ptr)
     79        Xapian::termcount * total_subqs_ptr,
     80        std::vector<std::string> & pool_terms)
    8081        = 0;
    8182};
    8283
  • matcher/remotesubmatch.h

    old new  
    7272    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    7373        std::map<std::string,
    7474                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
    75         Xapian::termcount * total_subqs_ptr);
     75        Xapian::termcount * total_subqs_ptr,
     76        std::vector<std::string> & pool_terms);
    7677
    7778    /// Get percentage factor - only valid after get_postlist_and_term_info().
    7879    double get_percent_factor() const { return percent_factor; }
  • matcher/multimatch.cc

    old new  
    4646#include "valuestreamdocument.h"
    4747#include "weight/weightinternal.h"
    4848
     49#include "exactphrasecheck.h"
     50
    4951#include <xapian/errorhandler.h>
    5052#include <xapian/matchspy.h>
    5153#include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
     
    355357    map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
    356358    termfreqandwts_ptr = &termfreqandwts;
    357359
     360    vector<string> pool_terms;
    358361    Xapian::termcount total_subqs = 0;
    359362    // Keep a count of matches which we know exist, but we won't see.  This
    360363    // occurs when a submatch is remote, and returns a lower bound on the
     
    364367    for (size_t i = 0; i != leaves.size(); ++i) {
    365368        PostList *pl;
    366369        try {
     370            if (!is_remote[i]) pool_terms.clear();
    367371            pl = leaves[i]->get_postlist_and_term_info(this,
    368372                                                       termfreqandwts_ptr,
    369                                                        &total_subqs);
     373                                                       &total_subqs,
     374                                                       pool_terms);
    370375            if (termfreqandwts_ptr && !termfreqandwts.empty())
    371376                termfreqandwts_ptr = NULL;
    372377            if (is_remote[i]) {
     
    525530    // Is the mset a valid heap?
    526531    bool is_heap = false;
    527532
     533    size_t SETTLING_POND_SIZE = 0;
     534    if (!pool_terms.empty()) {
     535        const char * sps = getenv("POND_SIZE");
     536        SETTLING_POND_SIZE = sps ? atoi(sps) : 100000;
     537    }
     538    ExactPhraseCheck phrase_check(db, pool_terms);
     539    // FIXME: a min/max heap is probably a better choice here (notably more
     540    // compact) but the STL doesn't provide one so we'd have to find an
     541    // implementation or write one.
     542    multimap<double, Xapian::Internal::MSetItem> settling_pond;
    528543    while (true) {
    529544        bool pushback;
    530545
     
    646661            new_item.wt = wt;
    647662        }
    648663
     664        if (SETTLING_POND_SIZE) {
     665            if (items.size() >= max_msize) {
     666                // Settling pond handling...
     667                multimap<double, Xapian::Internal::MSetItem>::iterator it;
     668                it = settling_pond.upper_bound(-min_weight);
     669                settling_pond.erase(it, settling_pond.end());
     670
     671                settling_pond.insert(make_pair(-new_item.wt, new_item));
     672                if (settling_pond.size() < SETTLING_POND_SIZE) {
     673                    continue;
     674                }
     675
     676                // Take the first item out of the pond; as the pond is keyed
     677                // on negated weight, this is the highest weighted item in it.
     678                it = settling_pond.begin();
     679                swap(new_item, it->second);
     680                settling_pond.erase(it);
     681            }
     682            if (!phrase_check(new_item.did)) continue;
     683        }
     684
    649685        pushback = true;
    650686
    651687        // Perform collapsing on key if requested.
     
    808844        }
    809845    }
    810846
     847    multimap<double, Xapian::Internal::MSetItem>::iterator it;
     848    for (it = settling_pond.begin(); it != settling_pond.end(); ++it) {
     849        const Xapian::Internal::MSetItem & new_item = it->second;
     850        if (new_item.wt < min_weight) break;
     851        if (!phrase_check(new_item.did)) continue;
     852
     853        {
     854            ++docs_matched;
     855            if (items.size() >= max_msize) {
     856                items.push_back(new_item);
     857                if (!is_heap) {
     858                    is_heap = true;
     859                    make_heap(items.begin(), items.end(), mcmp);
     860                } else {
     861                    push_heap<vector<Xapian::Internal::MSetItem>::iterator,
     862                              MSetCmp>(items.begin(), items.end(), mcmp);
     863                }
     864                pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     865                         MSetCmp>(items.begin(), items.end(), mcmp);
     866                items.pop_back();
     867
     868                min_item = items.front();
     869                if (sort_by == REL || sort_by == REL_VAL) {
     870                    if (docs_matched >= check_at_least) {
     871                        if (sort_by == REL) {
     872                            // We're done if this is a forward boolean match
     873                            // with only one database (bodgetastic, FIXME
     874                            // better if we can!)
     875                            if (rare(max_possible == 0 && sort_forward)) {
     876                                // In the multi database case, MergePostList
     877                                // currently processes each database
     878                                // sequentially (which actually may well be
     879                                // more efficient) so the docids in general
     880                                // won't arrive in order.
     881                                // FIXME: is this still good here:
     882                                // if (leaves.size() == 1) break;
     883                            }
     884                        }
     885                        if (min_item.wt > min_weight) {
     886                            LOGLINE(MATCH, "Setting min_weight to " <<
     887                                    min_item.wt << " from " << min_weight);
     888                            min_weight = min_item.wt;
     889                        }
     890                    }
     891                }
     892            } else {
     893                items.push_back(new_item);
     894                is_heap = false;
     895                if (sort_by == REL && items.size() == max_msize) {
     896                    if (docs_matched >= check_at_least) {
     897                        // We're done if this is a forward boolean match
     898                        // with only one database (bodgetastic, FIXME
     899                        // better if we can!)
     900                        if (rare(max_possible == 0 && sort_forward)) {
     901                            // In the multi database case, MergePostList
     902                            // currently processes each database
     903                            // sequentially (which actually may well be
     904                            // more efficient) so the docids in general
     905                            // won't arrive in order.
     906                            // FIXME: if (leaves.size() == 1) break;
     907                        }
     908                    }
     909                }
     910            }
     911        }
     912
     913        // Keep a track of the greatest weight we've seen.
     914        if (new_item.wt > greatest_wt) {
     915            greatest_wt = new_item.wt;
     916#ifdef XAPIAN_HAS_REMOTE_BACKEND
     917            const unsigned int multiplier = db.internal.size();
     918            unsigned int db_num = (new_item.did - 1) % multiplier;
     919            if (is_remote[db_num]) {
     920                // Note that the greatest weighted document came from a remote
     921                // database, and which one.
     922                greatest_wt_subqs_db_num = db_num;
     923            } else
     924#endif
     925            {
     926                greatest_wt_subqs_matched = pl->count_matching_subqs();
     927#ifdef XAPIAN_HAS_REMOTE_BACKEND
     928                greatest_wt_subqs_db_num = UINT_MAX;
     929#endif
     930            }
     931            if (percent_cutoff) {
     932                double w = new_item.wt * percent_cutoff_factor;
     933                if (w > min_weight) {
     934                    min_weight = w;
     935                    if (!is_heap) {
     936                        is_heap = true;
     937                        make_heap<vector<Xapian::Internal::MSetItem>::iterator,
     938                                  MSetCmp>(items.begin(), items.end(), mcmp);
     939                    }
     940                    while (!items.empty() && items.front().wt < min_weight) {
     941                        pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     942                                 MSetCmp>(items.begin(), items.end(), mcmp);
     943                        Assert(items.back().wt < min_weight);
     944                        items.pop_back();
     945                    }
     946#ifdef XAPIAN_ASSERTIONS_PARANOID
     947                    vector<Xapian::Internal::MSetItem>::const_iterator i;
     948                    for (i = items.begin(); i != items.end(); ++i) {
     949                        Assert(i->wt >= min_weight);
     950                    }
     951#endif
     952                }
     953            }
     954        }
     955    }
     956
     957
    811958    // done with posting list tree
    812959    pl.reset(NULL);
    813960
  • matcher/localsubmatch.cc

    old new  
    6868PostList *
    6969LocalSubMatch::get_postlist_and_term_info(MultiMatch * matcher,
    7070        map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts,
    71         Xapian::termcount * total_subqs_ptr)
     71        Xapian::termcount * total_subqs_ptr,
     72        std::vector<std::string> & pool_terms)
    7273{
    7374    LOGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist_and_term_info", matcher | termfreqandwts | total_subqs_ptr);
    7475    (void)matcher;
     
    8283
    8384    PostList * pl;
    8485    {
    85         QueryOptimiser opt(*db, *this, matcher);
     86        QueryOptimiser opt(*db, *this, matcher, pool_terms);
    8687        pl = query.internal->postlist(&opt, 1.0);
    8788        *total_subqs_ptr = opt.get_total_subqs();
    8889    }
  • matcher/Makefile.mk

    old new  
    44        matcher/branchpostlist.h\
    55        matcher/collapser.h\
    66        matcher/const_database_wrapper.h\
     7        matcher/exactphrasecheck.h\
    78        matcher/exactphrasepostlist.h\
    89        matcher/externalpostlist.h\
    910        matcher/extraweightpostlist.h\
     
    4243        matcher/branchpostlist.cc\
    4344        matcher/collapser.cc\
    4445        matcher/const_database_wrapper.cc\
     46        matcher/exactphrasecheck.cc\
    4547        matcher/exactphrasepostlist.cc\
    4648        matcher/externalpostlist.cc\
    4749        matcher/localsubmatch.cc\
  • new file xapian-core-1.3.1/matcher/exactphrasecheck.cc

    - +  
     1/** @file exactphrasecheck.cc
     2 * @brief Check if terms form a particular exact phrase.
     3 */
     4/* Copyright (C) 2006,2007,2009,2012 Olly Betts
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21// FIXME: this could probably share code with ExactPhrasePostList.
     22
     23#include <config.h>
     24
     25#include "exactphrasecheck.h"
     26
     27#include "debuglog.h"
     28#include "omassert.h"
     29#include "backends/positionlist.h"
     30
     31#include <algorithm>
     32#include <vector>
     33
     34using namespace std;
     35
     36class TermCompare {
     37    const Xapian::Database & db;
     38    vector<string> & terms;
     39
     40  public:
     41    TermCompare(const Xapian::Database & db_,
     42                vector<string> & terms_)
     43        : db(db_), terms(terms_) { }
     44
     45    bool operator()(unsigned a, unsigned b) const {
     46        return db.get_collection_freq(terms[a]) < db.get_collection_freq(terms[b]);
     47    }
     48};
     49
     50ExactPhraseCheck::ExactPhraseCheck(const Xapian::Database & db_,
     51                                   const vector<string> &terms_)
     52    : db(db_), terms(terms_)
     53{
     54    if (terms.empty()) {
     55        poslists = NULL;
     56        order = NULL;
     57        return;
     58    }
     59
     60    AssertRel(terms.size(),>,1);
     61    size_t n = terms_.size();
     62    poslists = new PositionList*[n];
     63    try {
     64        order = new unsigned[n];
     65    } catch (...) {
     66        delete [] poslists;
     67        throw;
     68    }
     69    for (size_t i = 0; i < n; ++i) {
     70        poslists[i] = NULL;
     71        order[i] = unsigned(i);
     72    }
     73
     74    // We often don't need to read all the position lists, so rather than using
     75    // the shortest position lists first, we approximate by using the terms
     76    // with the lowest collection freq first.  Overall this should give a
     77    // similar order.
     78    sort(order, order + terms.size(), TermCompare(db, terms));
     79}
     80
     81ExactPhraseCheck::~ExactPhraseCheck()
     82{
     83    delete [] poslists;
     84    delete [] order;
     85}
     86
     87bool
     88ExactPhraseCheck::start_position_list(unsigned i, Xapian::docid did)
     89{
     90    AssertRel(i,<,terms.size());
     91    unsigned index = order[i];
     92    // FIXME: nasty hacking around with internals and ref counts - we should
     93    // just add a new Database::Internal method to do what we want.
     94    Xapian::PositionIterator p = db.positionlist_begin(did, terms[index]);
     95    PositionList * tmp = p.internal;
     96    if (!tmp)
     97        return false;
     98    ++tmp->_refs;
     99    p.internal = poslists[i];
     100    poslists[i] = tmp;
     101    poslists[i]->index = index;
     102    return true;
     103}
     104
     105bool
     106ExactPhraseCheck::operator()(Xapian::docid did)
     107{
     108    LOGCALL(MATCH, bool, "ExactPhraseCheck::operator()", did);
     109
     110    if (terms.size() <= 1) RETURN(true);
     111
     112    // We often don't need to read all the position lists, so rather than using
     113
     114    AssertRel(terms.size(),>,1);
     115
     116    bool result = false;
     117    // If the first term we check only occurs too close to the start of the
     118    // document, we only need to read one term's positions.  E.g. search for
     119    // "ripe mango" when the only occurrence of 'mango' in the current document
     120    // is at position 0.
     121    if (!start_position_list(0, did))
     122        goto done;
     123    poslists[0]->skip_to(poslists[0]->index);
     124    if (poslists[0]->at_end()) goto done;
     125
     126    // If we get here, we'll need to read the positionlists for at least two
     127    // terms, so check the true positionlist length for the two terms with the
     128    // lowest wdf and if necessary swap them so the true shorter one is first.
     129    if (!start_position_list(1, did))
     130        goto done;
     131    if (poslists[0]->get_size() < poslists[1]->get_size()) {
     132        poslists[1]->skip_to(poslists[1]->index);
     133        if (poslists[1]->at_end()) goto done;
     134        swap(poslists[0], poslists[1]);
     135    }
     136
     137    {
     138        unsigned read_hwm = 1;
     139        Xapian::termpos idx0 = poslists[0]->index;
     140        do {
     141            Xapian::termpos base = poslists[0]->get_position() - idx0;
     142            unsigned i = 1;
     143            while (true) {
     144                if (i > read_hwm) {
     145                    read_hwm = i;
     146                    if (!start_position_list(i, did))
     147                        goto done;
     148                    // FIXME: consider comparing with poslist[0] and swapping
     149                    // if less common.  Should we allow for the number of positions
     150                    // we've read from poslist[0] already?
     151                }
     152                Xapian::termpos required = base + poslists[i]->index;
     153                poslists[i]->skip_to(required);
     154                if (poslists[i]->at_end()) goto done;
     155                if (poslists[i]->get_position() != required) break;
     156                if (++i == terms.size()) {
     157                    result = true;
     158                    goto done;
     159                }
     160            }
     161            poslists[0]->next();
     162        } while (!poslists[0]->at_end());
     163    }
     164done:
     165    for (size_t i = 0; i < terms.size(); ++i) {
     166        delete poslists[i];
     167        poslists[i] = NULL;
     168    }
     169    RETURN(result);
     170}
  • matcher/queryoptimiser.h

    old new  
    4949    Xapian::termcount total_subqs;
    5050
    5151  public:
     52    std::vector<std::string> & pool_terms;
     53
     54    bool top_and;
     55
    5256    const Xapian::Database::Internal & db;
    5357
    5458    Xapian::doccount db_size;
     
    5761
    5862    QueryOptimiser(const Xapian::Database::Internal & db_,
    5963                   LocalSubMatch & localsubmatch_,
    60                    MultiMatch * matcher_)
     64                   MultiMatch * matcher_,
     65                   std::vector<std::string> & pool_terms_)
    6166        : localsubmatch(localsubmatch_), total_subqs(0),
    62           db(db_), db_size(db.get_doccount()), matcher(matcher_) { }
     67          pool_terms(pool_terms_), top_and(true), db(db_),
     68          db_size(db.get_doccount()), matcher(matcher_) { }
    6369
    6470    void inc_total_subqs() { ++total_subqs; }
    6571
  • matcher/remotesubmatch.cc

    old new  
    6262PostList *
    6363RemoteSubMatch::get_postlist_and_term_info(MultiMatch *,
    6464        map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts,
    65         Xapian::termcount * total_subqs_ptr)
     65        Xapian::termcount * total_subqs_ptr,
     66        std::vector<std::string> &)
    6667{
    6768    LOGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info", Literal("[matcher]") | termfreqandwts | total_subqs_ptr);
    6869    Xapian::MSet mset;
  • matcher/localsubmatch.h

    old new  
    8989    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    9090        std::map<std::string,
    9191                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
    92         Xapian::termcount * total_subqs_ptr);
     92        Xapian::termcount * total_subqs_ptr,
     93        std::vector<std::string> & pool_terms);
    9394
    9495    /** Convert a postlist into a synonym postlist.
    9596     */
  • new file xapian-core-1.3.1/matcher/exactphrasecheck.h

    - +  
     1/** @file exactphrasecheck.h
     2 * @brief Check if terms form a particular exact phrase.
     3 */
     4/* Copyright (C) 2006,2012 Olly Betts
     5 * Copyright (C) 2009 Lemur Consulting Ltd
     6 *
     7 * This program is free software; you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation; either version 2 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * This program is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with this program; if not, write to the Free Software
     19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     20 */
     21
     22#ifndef XAPIAN_INCLUDED_EXACTPHRASEPOSTLIST_H
     23#define XAPIAN_INCLUDED_EXACTPHRASEPOSTLIST_H
     24
     25#include "xapian/database.h"
     26
     27#include <string>
     28#include <vector>
     29
     30typedef Xapian::PositionIterator::Internal PositionList;
     31
     32/** Check for an exact phrase using positional information.
     33 *
     34 *  Tests if the terms occur somewhere in the document in the order given
     35 *  and at adjacent term positions.
     36 */
     37class ExactPhraseCheck {
     38    Xapian::Database db;
     39
     40    std::vector<std::string> terms;
     41
     42    PositionList ** poslists;
     43
     44    unsigned * order;
     45
     46    /// Start reading from the i-th position list.
     47    bool start_position_list(unsigned i, Xapian::docid did);
     48
     49  public:
     50    ExactPhraseCheck(const Xapian::Database & db_,
     51                     const std::vector<std::string> &terms_);
     52
     53    ~ExactPhraseCheck();
     54
     55    /// Test if the specified document contains the terms as an exact phrase.
     56    bool operator()(Xapian::docid did);
     57};
     58
     59#endif
  • api/queryinternal.cc

    old new  
    249249                  Xapian::termcount window_)
    250250            : op_(op__), begin(begin_), end(end_), window(window_) { }
    251251
    252         PostList * postlist(PostList * pl, const vector<PostList*>& pls) const;
     252        PostList * postlist(PostList * pl, const vector<PostList*>& pls,
     253                            QueryOptimiser * qopt) const;
    253254    };
    254255
    255256    list<PosFilter> pos_filters;
     
    265266};
    266267
    267268PostList *
    268 AndContext::PosFilter::postlist(PostList * pl, const vector<PostList*>& pls) const
     269AndContext::PosFilter::postlist(PostList * pl, const vector<PostList*>& pls,
     270                                QueryOptimiser * qopt) const
    269271try {
    270272    vector<PostList *>::const_iterator terms_begin = pls.begin() + begin;
    271273    vector<PostList *>::const_iterator terms_end = pls.begin() + end;
     
    274276        pl = new NearPostList(pl, window, terms_begin, terms_end);
    275277    } else if (window == end - begin) {
    276278        AssertEq(op_, Xapian::Query::OP_PHRASE);
    277         pl = new ExactPhrasePostList(pl, terms_begin, terms_end);
     279        if (qopt->top_and) {
     280            vector<PostList *>::const_iterator j;
     281            for (j = terms_begin; j != terms_end; ++j) {
     282                const string & term = (*j)->get_termname();
     283                if (term.empty()) {
     284                    // FIXME: Currently all the subqueries must be terms.
     285                    qopt->pool_terms.clear();
     286                    goto cannot_pool;
     287                }
     288                qopt->pool_terms.push_back(term);
     289            }
     290            // We can currently only handle hoisting out one phrase check.
     291            // FIXME: Gather a list of checks, not a list of the terms in one
     292            // check.
     293            qopt->top_and = false;
     294        } else {
     295cannot_pool:
     296            pl = new ExactPhrasePostList(pl, terms_begin, terms_end);
     297        }
    278298    } else {
    279299        AssertEq(op_, Xapian::Query::OP_PHRASE);
    280300        pl = new PhrasePostList(pl, window, terms_begin, terms_end);
     
    309329    list<PosFilter>::const_iterator i;
    310330    for (i = pos_filters.begin(); i != pos_filters.end(); ++i) {
    311331        const PosFilter & filter = *i;
    312         pl.reset(filter.postlist(pl.release(), pls));
     332        pl.reset(filter.postlist(pl.release(), pls, qopt));
    313333    }
    314334
    315335    // Empty pls so our destructor doesn't delete them all!
     
    492512                                      QueryOptimiser * qopt,
    493513                                      double factor) const
    494514{
     515    bool top_and = qopt->top_and;
     516    qopt->top_and = false;
    495517    ctx.add_postlist(postlist(qopt, factor));
     518    qopt->top_and = top_and;
    496519}
    497520
    498521void
     
    500523                                  QueryOptimiser * qopt,
    501524                                  double factor) const
    502525{
     526    bool top_and = qopt->top_and;
     527    qopt->top_and = false;
    503528    ctx.add_postlist(postlist(qopt, factor));
     529    qopt->top_and = top_and;
    504530}
    505531
    506532namespace Internal {
     
    11481174    LOGCALL(QUERY, PostingIterator::Internal *, "QueryAndNot::postlist", qopt | factor);
    11491175    // FIXME: Combine and-like side with and-like stuff above.
    11501176    AutoPtr<PostList> l(subqueries[0].internal->postlist(qopt, factor));
     1177    bool top_and = qopt->top_and;
     1178    qopt->top_and = false;
    11511179    OrContext ctx(subqueries.size() - 1);
    11521180    do_or_like(ctx, qopt, 0.0, 0, 1);
    11531181    AutoPtr<PostList> r(ctx.postlist(qopt));
     1182    qopt->top_and = top_and;
    11541183    RETURN(new AndNotPostList(l.release(), r.release(),
    11551184                              qopt->matcher, qopt->db_size));
    11561185}
     
    11811210    LOGCALL(QUERY, PostingIterator::Internal *, "QueryAndMaybe::postlist", qopt | factor);
    11821211    // FIXME: Combine and-like side with and-like stuff above.
    11831212    AutoPtr<PostList> l(subqueries[0].internal->postlist(qopt, factor));
     1213    bool top_and = qopt->top_and;
     1214    qopt->top_and = false;
    11841215    OrContext ctx(subqueries.size() - 1);
    11851216    do_or_like(ctx, qopt, factor, 0, 1);
    11861217    AutoPtr<PostList> r(ctx.postlist(qopt));
     1218    qopt->top_and = top_and;
    11871219    RETURN(new AndMaybePostList(l.release(), r.release(),
    11881220                                qopt->matcher, qopt->db_size));
    11891221}
  • xapian-core-1.3.1

    old new  
    7878    return 0;
    7979}
    8080
     81std::string
     82PostList::get_termname() const
     83{
     84    return std::string();
     85}
     86
    8187}
  • api/leafpostlist.cc

    old new  
    102102{
    103103    return weight ? 1 : 0;
    104104}
     105
     106std::string
     107LeafPostList::get_termname() const
     108{
     109    return term;
     110}
  • xapian-core-1.3.1

    old new  
    194194    /// Count the number of leaf subqueries which match at the current position.
    195195    virtual Xapian::termcount count_matching_subqs() const;
    196196
     197    /// If this is a term, return its name; otherwise return an empty string.
     198    virtual std::string get_termname() const;
     199
    197200    /// Return a string description of this object.
    198201    virtual std::string get_description() const = 0;
    199202};
  • api/leafpostlist.h

    old new  
    8686    TermFreqs get_termfreq_est_using_stats(
    8787        const Xapian::Weight::Internal & stats) const;
    8888
     89    virtual std::string get_termname() const;
     90
    8991    Xapian::termcount count_matching_subqs() const;
    9092};
    9193