Ticket #157: boolgroups.patch

File boolgroups.patch, 11.4 KB (added by Richard Boulton, 17 years ago)

Updated implementation of ORing filter terms with the same prefix

  • queryparser/queryparser.lemony

     
    2525#include "queryparser_internal.h"
    2626#include <xapian/unicode.h>
    2727#include "utils.h"
     28#include "autoptr.h"
    2829
    2930// Include the list of token values lemon generates.
    3031#include "queryparser_token.h"
     
    100101    /// Flag, true iff this represents a "MatchNothing" query.
    101102    bool match_nothing;
    102103
     104    /// The value number that this represents, if it's a range query.
     105    /// (Needed, since we can't get the value number back out of the "q" member.)
     106    Xapian::valueno valno;
     107
    103108  public:
    104109    QpQuery(const QpQuery & tocopy)
    105         : q(tocopy.q), match_nothing(tocopy.match_nothing)
     110        : q(tocopy.q), match_nothing(tocopy.match_nothing),
     111          valno(Xapian::BAD_VALUENO)
    106112    {}
    107113
    108114    QpQuery & operator=(const QpQuery & tocopy)
     
    115121    /** A query consisting of a single term. */
    116122    QpQuery(const std::string & tname, Xapian::termcount wqf,
    117123            Xapian::termpos pos)
    118         : q(tname, wqf, pos), match_nothing(false)
     124        : q(tname, wqf, pos), match_nothing(false),
     125          valno(Xapian::BAD_VALUENO)
    119126    {}
    120127
    121128    /** A query consisting of two subqueries, combined with operator op. */
    122129    QpQuery(Query::op op, const QpQuery & left, const QpQuery & right)
    123         : q(op, left.get(), right.get()), match_nothing(false)
     130        : q(op, left.get(), right.get()), match_nothing(false),
     131          valno(Xapian::BAD_VALUENO)
    124132    {}
    125133
    126     QpQuery(Query::op op_, Xapian::valueno valno,
     134    QpQuery(Query::op op_, Xapian::valueno valno_,
    127135          const std::string &begin, const std::string &end)
    128         : q(op_, valno, begin, end), match_nothing(false)
     136        : q(op_, valno_, begin, end), match_nothing(false),
     137          valno(valno_)
    129138    {}
    130139
    131     QpQuery(const Query & q_) : q(q_), match_nothing(false) {}
    132     QpQuery() : q(), match_nothing(false) {}
    133     QpQuery(bool m) : q(), match_nothing(m) {}
     140    QpQuery(const Query & q_) : q(q_), match_nothing(false), valno(Xapian::BAD_VALUENO) {}
     141    QpQuery() : q(), match_nothing(false), valno(Xapian::BAD_VALUENO) {}
     142    QpQuery(bool m) : q(), match_nothing(m), valno(Xapian::BAD_VALUENO) {}
    134143    Query & get() { return q; }
    135144    const Query & get() const { return q; }
     145    Xapian::valueno get_valno() const { return valno; }
    136146
    137147    /// True iff the query is not empty, and doesn't explicitly match nothing.
    138148    bool can_match() { return (!q.empty() && !match_nothing); }
     
    160170#endif
    161171};
    162172
     173/// A structure identifying a group of filter terms
     174struct filter_group_id {
     175    /** The prefix of the filter terms.
     176     *  This is used for boolean filter terms.
     177     */
     178    string prefix;
     179
     180    /** The value number of the filter terms.
     181     *  This is used for value range terms.
     182     */
     183    Xapian::valueno valno;
     184
     185    /// Make a new filter_group_id for boolean filter terms.
     186    explicit filter_group_id(const string & prefix_) : prefix(prefix_), valno(Xapian::BAD_VALUENO) {}
     187
     188    /// Make a new filter_group_id for value range terms.
     189    explicit filter_group_id(Xapian::valueno valno_) : prefix(), valno(valno_) {}
     190
     191    /// Compare to another filter_group_id.
     192    bool operator<(const filter_group_id & other) const {
     193        if (prefix != other.prefix) {
     194            return prefix < other.prefix;
     195        }
     196        return valno < other.valno;
     197    }
     198};
     199
    163200/** Class used to pass information about a token from lexer to parser.
    164201 *
    165202 *  Generally this class carries term information, but it can be used for the
     
    177214    bool stem;
    178215    termpos pos;
    179216
    180     std::string make_term() const;
    181 
    182217  public:
    183218    Term(const string &name_, termpos pos_) : name(name_), stem(false), pos(pos_) { }
    184219    Term(const string &name_) : name(name_), stem(false), pos(0) { }
     220    Term(const string &name_, const string &prefix_)
     221        : name(name_), prefix(prefix_), stem(false), pos(0) { }
    185222    Term(termpos pos_) : stem(false), pos(pos_) { }
    186223    Term(State * state_, const string &name_, const string &prefix_,
    187224         const string &unstemmed_, bool stem_, termpos pos_)
    188225        : state(state_), name(name_), prefix(prefix_), unstemmed(unstemmed_),
    189226          stem(stem_), pos(pos_) { }
    190227
     228    std::string make_term() const;
     229
    191230    void dont_stem() { stem = false; }
    192231
    193232    termpos get_termpos() const { return pos; }
    194233
     234    filter_group_id get_filter_group_id() const { return filter_group_id(prefix); }
     235
    195236    QpQuery * as_query() const { return new QpQuery(make_term(), 1, pos); }
    196237
    197238    QpQuery * as_wildcarded_query(State * state) const;
     
    589630                        // until the next space or ')' as part of the boolean
    590631                        // term.
    591632                        it = p;
    592                         if (prefix_needs_colon(prefix, *it))
    593                             prefix += ':';
    594                         string term;
     633                        string name;
    595634                        while (it != end && *it > ' ' && *it != ')')
    596                             Unicode::append_utf8(term, *it++);
    597                         prefix += term;
     635                            Unicode::append_utf8(name, *it++);
     636                        AutoPtr<Term> term(new Term(name, prefix));
    598637                        field += ':';
    599                         field += term;
    600                         unstem.insert(make_pair(prefix, field));
    601                         Parse(pParser, BOOLEAN_FILTER, new Term(prefix),
    602                               &state);
     638                        field += name;
     639                        unstem.insert(make_pair(term->make_term(), field));
     640                        Parse(pParser, BOOLEAN_FILTER, term.release(), &state);
    603641                        continue;
    604642                    }
    605643
     
    858896    QpQuery query;
    859897    QpQuery love;
    860898    QpQuery hate;
    861     QpQuery filter;
     899
     900    // filter is a map from filter group (a term prefix, or a value number
     901    // for value ranges) to a query for that group.  Queries in the same
     902    // group are combined with OR, and the groups are combined with AND to get the full filter.
     903    map<filter_group_id, QpQuery> filter;
     904
     905    QpQuery merge_filters() const {
     906        QpQuery q;
     907        for (map<filter_group_id, QpQuery>::const_iterator i = filter.begin();
     908             i != filter.end(); ++i)
     909        {
     910            add_to_query(q, Query::OP_AND, i->second.get());
     911        }
     912        return q;
     913    }
    862914};
    863915
    864916class TermList {
     
    10781130    // Handle any boolean filters.
    10791131    if (!P->filter.empty()) {
    10801132        if (E->empty()) {
    1081             *E = P->filter;
     1133            *E = P->merge_filters();
    10821134            // FIXME and make the query boolean somehow...
    10831135        } else {
    1084             *E = QpQuery(Query::OP_FILTER, *E, P->filter);
     1136            *E = QpQuery(Query::OP_FILTER, *E, P->merge_filters());
    10851137        }
    10861138    }
    10871139    // FIXME what if E->empty() (all terms are stopwords)?
     
    11081160        yy_parse_failed(yypParser);
    11091161        return;
    11101162    }
     1163    Xapian::valueno valno = range->get_valno();
    11111164    P = new ProbQuery;
    1112     P->filter = *range;
     1165    P->filter[filter_group_id(valno)] = *range;
    11131166    delete range;
    11141167}
    11151168
     
    11211174        yy_parse_failed(yypParser);
    11221175        return;
    11231176    }
     1177    Xapian::valueno valno = range->get_valno();
    11241178    P = Q;
    1125     add_to_query(P->filter, Query::OP_AND, *range);
     1179    add_to_query(P->filter[filter_group_id(valno)], Query::OP_OR, *range);
    11261180    delete range;
    11271181}
    11281182
     
    11921246
    11931247prob(P) ::= BOOLEAN_FILTER(T). {
    11941248    P = new ProbQuery;
    1195     P->filter = T->as_query_object();
     1249    P->filter[T->get_filter_group_id()] = T->as_query_object();
    11961250    delete T;
    11971251}
    11981252
    11991253prob(P) ::= stop_prob(Q) BOOLEAN_FILTER(T). {
    12001254    P = Q;
    1201     // FIXME we should OR filters with the same prefix...
    1202     add_to_query(P->filter, Query::OP_AND, T->as_query_object());
     1255    // We OR filters with the same prefix...
     1256    add_to_query(P->filter[T->get_filter_group_id()], Query::OP_OR, T->as_query_object());
    12031257    delete T;
    12041258}
    12051259
    12061260prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    12071261    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    12081262    P = new ProbQuery;
    1209     P->filter = T->as_query_object();
     1263    P->filter[T->get_filter_group_id()] = T->as_query_object();
    12101264    delete T;
    12111265}
    12121266
    12131267prob(P) ::= stop_prob(Q) LOVE BOOLEAN_FILTER(T). {
    12141268    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    12151269    P = Q;
    1216     // FIXME we should OR filters with the same prefix...
    1217     add_to_query(P->filter, Query::OP_AND, T->as_query_object());
     1270    // We OR filters with the same prefix...
     1271    add_to_query(P->filter[T->get_filter_group_id()], Query::OP_OR, T->as_query_object());
    12181272    delete T;
    12191273}
    12201274
  • tests/harness/testsuite.h

     
    225225
    226226/// Test for equality of two things.
    227227#define TEST_EQUAL(a, b) TEST_AND_EXPLAIN(((a) == (b)), \
    228         "Expected `"STRINGIZE(a)"' and `"STRINGIZE(b)"' to be equal:" \
    229         " were " << (a) << " and " << (b))
     228        "Expected `"STRINGIZE(a)"' and `"STRINGIZE(b)"' to be equal:\n" \
     229        " were " << (a) << "\n and " << (b))
    230230
    231231/** Test for equality of two strings.
    232232 *
  • tests/queryparsertest.cc

     
    532532    { "- NEAR 12V voeding", "(near:(pos=1) OR 12v:(pos=2) OR Zvoed:(pos=3))" },
    533533    { "waarom \"~\" in directorynaam", "(Zwaarom:(pos=1) OR Zin:(pos=2) OR Zdirectorynaam:(pos=3))" },
    534534    { "cd'r NEAR toebehoren", "(cd'r:(pos=1) NEAR 11 toebehoren:(pos=2))" },
     535    { "site:1 site:2", "(H1 OR H2)" },
     536    { "site:1 site2:2", "(H1 AND J2)" },
     537    { "site:1 site:2 site2:2", "((H1 OR H2) AND J2)" },
     538    { "site:1 OR site:2", "(H1 OR H2)" },
     539    { "site:1 AND site:2", "(H1 AND H2)" },
     540#if 0
     541    { "A site:1 site:2", "(a FILTER (H1 OR H2))" },
     542    { "A (site:1 OR site:2)", "(a FILTER (H1 OR H2))" },
     543    { "A (site:1 OR site:2)", "(a FILTER (H1 OR H2))" },
     544    { "A site:1 site2:2", "(a FILTER (H1 AND J2))" },
     545    { "A site:1 site:2 site2:2", "(a FILTER ((H1 OR H2) AND J2))" },
     546    { "A site:1 OR site:2", "(a FILTER (H1 OR H2))" },
     547    { "A site:1 AND site:2", "(a FILTER (H1 AND H2))" },
     548#endif
     549    { "site:xapian.org OR site:www.xapian.org", "(Hxapian.org OR Hwww.xapian.org)" },
     550    { "site:xapian.org site:www.xapian.org", "(Hxapian.org OR Hwww.xapian.org)" },
     551    { "site:xapian.org AND site:www.xapian.org", "(Hxapian.org AND Hwww.xapian.org)" },
     552    { "Xapian site:xapian.org site:www.xapian.org", "(xapian:(pos=1) FILTER (Hxapian.org OR Hwww.xapian.org))" },
    535553    { NULL, NULL }
    536554};
    537555
     
    576594    queryparser.add_prefix("title", "XT");
    577595    queryparser.add_prefix("subject", "XT");
    578596    queryparser.add_boolean_prefix("site", "H");
     597    queryparser.add_boolean_prefix("site2", "J");
    579598    for (test *p = test_or_queries; p->query; ++p) {
    580599        string expect, parsed;
    581600        if (p->expect)
     
    969988    { "hello a..b", "(hello:(pos=1) FILTER VALUE_RANGE 1 a b)" },
    970989    { "hello a..b world", "((hello:(pos=1) OR world:(pos=2)) FILTER VALUE_RANGE 1 a b)" },
    971990    { "hello a..b test:foo", "(hello:(pos=1) FILTER (VALUE_RANGE 1 a b AND XTESTfoo))" },
     991    { "hello a..b test:foo test:bar", "(hello:(pos=1) FILTER (VALUE_RANGE 1 a b AND (XTESTfoo OR XTESTbar)))" },
     992    { "hello a..b c..d test:foo", "(hello:(pos=1) FILTER ((VALUE_RANGE 1 a b OR VALUE_RANGE 1 c d) AND XTESTfoo))" },
     993    { "hello a..b c..d test:foo test:bar", "(hello:(pos=1) FILTER ((VALUE_RANGE 1 a b OR VALUE_RANGE 1 c d) AND (XTESTfoo OR XTESTbar)))" },
    972994    { "-5..7", "VALUE_RANGE 1 -5 7" },
    973995    { "hello -5..7", "(hello:(pos=1) FILTER VALUE_RANGE 1 -5 7)" },
    974996    { "-5..7 hello", "(hello:(pos=1) FILTER VALUE_RANGE 1 -5 7)" },
     
    10201042    { "12/03/99..12/04/01", "VALUE_RANGE 1 19990312 20010412" },
    10211043    { "03-12-99..04-14-01", "VALUE_RANGE 1 19990312 20010414" },
    10221044    { "(test:a..test:b hello)", "(hello:(pos=1) FILTER VALUE_RANGE 3 test:a test:b)" },
     1045    { "12..42kg 5..6kg 1..12", "(VALUE_RANGE 2 1 12 AND (VALUE_RANGE 5 12 42 OR VALUE_RANGE 5 5 6))" },
    10231046    { NULL, NULL }
    10241047};
    10251048