Ticket #157: boolgroups.4.patch

File boolgroups.4.patch, 11.6 KB (added by Richard Boulton, 17 years ago)

Further update - adds documentation and a test case

  • queryparser/queryparser.lemony

     
    2525#include "queryparser_internal.h"
    2626#include <xapian/unicode.h>
    2727#include "utils.h"
     28#include "autoptr.h"
    2829
    2930// Include the list of token values lemon generates.
    3031#include "queryparser_token.h"
     
    133134    QpQuery(bool m) : q(), match_nothing(m) {}
    134135    Query & get() { return q; }
    135136    const Query & get() const { return q; }
     137    Xapian::valueno get_valno() const { return q.internal->get_parameter(); }
    136138
    137139    /// True iff the query is not empty, and doesn't explicitly match nothing.
    138140    bool can_match() { return (!q.empty() && !match_nothing); }
     
    160162#endif
    161163};
    162164
     165/// A structure identifying a group of filter terms
     166struct filter_group_id {
     167    /** The prefix of the filter terms.
     168     *  This is used for boolean filter terms.
     169     */
     170    string prefix;
     171
     172    /** The value number of the filter terms.
     173     *  This is used for value range terms.
     174     */
     175    Xapian::valueno valno;
     176
     177    /// Make a new filter_group_id for boolean filter terms.
     178    explicit filter_group_id(const string & prefix_) : prefix(prefix_), valno(Xapian::BAD_VALUENO) {}
     179
     180    /// Make a new filter_group_id for value range terms.
     181    explicit filter_group_id(Xapian::valueno valno_) : prefix(), valno(valno_) {}
     182
     183    /// Compare to another filter_group_id.
     184    bool operator<(const filter_group_id & other) const {
     185        if (prefix != other.prefix) {
     186            return prefix < other.prefix;
     187        }
     188        return valno < other.valno;
     189    }
     190};
     191
    163192/** Class used to pass information about a token from lexer to parser.
    164193 *
    165194 *  Generally this class carries term information, but it can be used for the
     
    177206    bool stem;
    178207    termpos pos;
    179208
    180     std::string make_term() const;
    181 
    182209  public:
    183210    Term(const string &name_, termpos pos_) : name(name_), stem(false), pos(pos_) { }
    184211    Term(const string &name_) : name(name_), stem(false), pos(0) { }
     212    Term(const string &name_, const string &prefix_)
     213        : name(name_), prefix(prefix_), stem(false), pos(0) { }
    185214    Term(termpos pos_) : stem(false), pos(pos_) { }
    186215    Term(State * state_, const string &name_, const string &prefix_,
    187216         const string &unstemmed_, bool stem_, termpos pos_)
    188217        : state(state_), name(name_), prefix(prefix_), unstemmed(unstemmed_),
    189218          stem(stem_), pos(pos_) { }
    190219
     220    std::string make_term() const;
     221
    191222    void dont_stem() { stem = false; }
    192223
    193224    termpos get_termpos() const { return pos; }
    194225
     226    filter_group_id get_filter_group_id() const { return filter_group_id(prefix); }
     227
    195228    QpQuery * as_query() const { return new QpQuery(make_term(), 1, pos); }
    196229
    197230    QpQuery * as_wildcarded_query(State * state) const;
     
    587620                        // until the next space or ')' as part of the boolean
    588621                        // term.
    589622                        it = p;
    590                         if (prefix_needs_colon(prefix, *it))
    591                             prefix += ':';
    592                         string term;
     623                        string name;
    593624                        while (it != end && *it > ' ' && *it != ')')
    594                             Unicode::append_utf8(term, *it++);
    595                         prefix += term;
     625                            Unicode::append_utf8(name, *it++);
     626                        AutoPtr<Term> term(new Term(name, prefix));
    596627                        field += ':';
    597                         field += term;
    598                         unstem.insert(make_pair(prefix, field));
    599                         Parse(pParser, BOOLEAN_FILTER, new Term(prefix),
    600                               &state);
     628                        field += name;
     629                        unstem.insert(make_pair(term->make_term(), field));
     630                        Parse(pParser, BOOLEAN_FILTER, term.release(), &state);
    601631                        continue;
    602632                    }
    603633
     
    856886    QpQuery query;
    857887    QpQuery love;
    858888    QpQuery hate;
    859     QpQuery filter;
     889
     890    // filter maps each filter group (a term prefix for boolean filters, or a
     891    // value number for value ranges) to its query.  Queries in the same group
     892    // are combined with OR; merge_filters() combines the groups with AND.
     893    map<filter_group_id, QpQuery> filter;
     894
     895    QpQuery merge_filters() const {
     896        QpQuery q;
     897        for (map<filter_group_id, QpQuery>::const_iterator i = filter.begin();
     898             i != filter.end(); ++i)
     899        {
     900            add_to_query(q, Query::OP_AND, i->second.get());
     901        }
     902        return q;
     903    }
    860904};
    861905
    862906class TermList {
     
    10761120    // Handle any boolean filters.
    10771121    if (!P->filter.empty()) {
    10781122        if (E->empty()) {
    1079             *E = P->filter;
     1123            *E = P->merge_filters();
    10801124            // FIXME and make the query boolean somehow...
    10811125        } else {
    1082             *E = QpQuery(Query::OP_FILTER, *E, P->filter);
     1126            *E = QpQuery(Query::OP_FILTER, *E, P->merge_filters());
    10831127        }
    10841128    }
    10851129    // FIXME what if E->empty() (all terms are stopwords)?
     
    11061150        yy_parse_failed(yypParser);
    11071151        return;
    11081152    }
     1153    Xapian::valueno valno = range->get_valno();
    11091154    P = new ProbQuery;
    1110     P->filter = *range;
     1155    P->filter[filter_group_id(valno)] = *range;
    11111156    delete range;
    11121157}
    11131158
     
    11191164        yy_parse_failed(yypParser);
    11201165        return;
    11211166    }
     1167    Xapian::valueno valno = range->get_valno();
    11221168    P = Q;
    1123     add_to_query(P->filter, Query::OP_AND, *range);
     1169    add_to_query(P->filter[filter_group_id(valno)], Query::OP_OR, *range);
    11241170    delete range;
    11251171}
    11261172
     
    11901236
    11911237prob(P) ::= BOOLEAN_FILTER(T). {
    11921238    P = new ProbQuery;
    1193     P->filter = T->as_query_object();
     1239    P->filter[T->get_filter_group_id()] = T->as_query_object();
    11941240    delete T;
    11951241}
    11961242
    11971243prob(P) ::= stop_prob(Q) BOOLEAN_FILTER(T). {
    11981244    P = Q;
    1199     // FIXME we should OR filters with the same prefix...
    1200     add_to_query(P->filter, Query::OP_AND, T->as_query_object());
     1245    // Filters with the same prefix are combined with OR.
     1246    add_to_query(P->filter[T->get_filter_group_id()], Query::OP_OR, T->as_query_object());
    12011247    delete T;
    12021248}
    12031249
    12041250prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    12051251    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    12061252    P = new ProbQuery;
    1207     P->filter = T->as_query_object();
     1253    P->filter[T->get_filter_group_id()] = T->as_query_object();
    12081254    delete T;
    12091255}
    12101256
    12111257prob(P) ::= stop_prob(Q) LOVE BOOLEAN_FILTER(T). {
    12121258    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    12131259    P = Q;
    1214     // FIXME we should OR filters with the same prefix...
    1215     add_to_query(P->filter, Query::OP_AND, T->as_query_object());
     1260    // Filters with the same prefix are combined with OR.
     1261    add_to_query(P->filter[T->get_filter_group_id()], Query::OP_OR, T->as_query_object());
    12161262    delete T;
    12171263}
    12181264
  • tests/queryparsertest.cc

     
    532532    { "- NEAR 12V voeding", "(near:(pos=1) OR 12v:(pos=2) OR Zvoed:(pos=3))" },
    533533    { "waarom \"~\" in directorynaam", "(Zwaarom:(pos=1) OR Zin:(pos=2) OR Zdirectorynaam:(pos=3))" },
    534534    { "cd'r NEAR toebehoren", "(cd'r:(pos=1) NEAR 11 toebehoren:(pos=2))" },
     535    { "site:1 site:2", "(H1 OR H2)" },
     536    { "site:1 site2:2", "(H1 AND J2)" },
     537    { "site:1 site:2 site2:2", "((H1 OR H2) AND J2)" },
     538    { "site:1 OR site:2", "(H1 OR H2)" },
     539    { "site:1 AND site:2", "(H1 AND H2)" },
     540#if 0
     541    { "A site:1 site:2", "(a FILTER (H1 OR H2))" },
     542    { "A (site:1 OR site:2)", "(a FILTER (H1 OR H2))" },
     543    { "A (site:1 OR site:2)", "(a FILTER (H1 OR H2))" },
     544    { "A site:1 site2:2", "(a FILTER (H1 AND J2))" },
     545    { "A site:1 site:2 site2:2", "(a FILTER ((H1 OR H2) AND J2))" },
     546    { "A site:1 OR site:2", "(a FILTER (H1 OR H2))" },
     547    { "A site:1 AND site:2", "(a FILTER (H1 AND H2))" },
     548#endif
     549    { "site:xapian.org OR site:www.xapian.org", "(Hxapian.org OR Hwww.xapian.org)" },
     550    { "site:xapian.org site:www.xapian.org", "(Hxapian.org OR Hwww.xapian.org)" },
     551    { "site:xapian.org AND site:www.xapian.org", "(Hxapian.org AND Hwww.xapian.org)" },
     552    { "Xapian site:xapian.org site:www.xapian.org", "(xapian:(pos=1) FILTER (Hxapian.org OR Hwww.xapian.org))" },
     553    { "author:richard author:olly writer:charlie", "(ZArichard:(pos=1) OR ZAolli:(pos=2) OR ZAcharli:(pos=3))"},
    535554    { NULL, NULL }
    536555};
    537556
     
    573592    queryparser.set_stemmer(Xapian::Stem("english"));
    574593    queryparser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
    575594    queryparser.add_prefix("author", "A");
     595    queryparser.add_prefix("writer", "A");
    576596    queryparser.add_prefix("title", "XT");
    577597    queryparser.add_prefix("subject", "XT");
    578598    queryparser.add_boolean_prefix("site", "H");
     599    queryparser.add_boolean_prefix("site2", "J");
    579600    for (test *p = test_or_queries; p->query; ++p) {
    580601        string expect, parsed;
    581602        if (p->expect)
     
    977998    { "hello a..b", "(hello:(pos=1) FILTER VALUE_RANGE 1 a b)" },
    978999    { "hello a..b world", "((hello:(pos=1) OR world:(pos=2)) FILTER VALUE_RANGE 1 a b)" },
    9791000    { "hello a..b test:foo", "(hello:(pos=1) FILTER (VALUE_RANGE 1 a b AND XTESTfoo))" },
     1001    { "hello a..b test:foo test:bar", "(hello:(pos=1) FILTER (VALUE_RANGE 1 a b AND (XTESTfoo OR XTESTbar)))" },
     1002    { "hello a..b c..d test:foo", "(hello:(pos=1) FILTER ((VALUE_RANGE 1 a b OR VALUE_RANGE 1 c d) AND XTESTfoo))" },
     1003    { "hello a..b c..d test:foo test:bar", "(hello:(pos=1) FILTER ((VALUE_RANGE 1 a b OR VALUE_RANGE 1 c d) AND (XTESTfoo OR XTESTbar)))" },
    9801004    { "-5..7", "VALUE_RANGE 1 -5 7" },
    9811005    { "hello -5..7", "(hello:(pos=1) FILTER VALUE_RANGE 1 -5 7)" },
    9821006    { "-5..7 hello", "(hello:(pos=1) FILTER VALUE_RANGE 1 -5 7)" },
     
    10301054    { "12/03/99..12/04/01", "VALUE_RANGE 1 19990312 20010412" },
    10311055    { "03-12-99..04-14-01", "VALUE_RANGE 1 19990312 20010414" },
    10321056    { "(test:a..test:b hello)", "(hello:(pos=1) FILTER VALUE_RANGE 3 test:a test:b)" },
     1057    { "12..42kg 5..6kg 1..12", "(VALUE_RANGE 2 1 12 AND (VALUE_RANGE 5 12 42 OR VALUE_RANGE 5 5 6))" },
    10331058    { NULL, NULL }
    10341059};
    10351060
  • include/xapian/query.h

     
    377377         */
    378378        std::string get_description() const;
    379379
     380        /** Get the numeric parameter used in this query.
     381         *
     382         *  This is used by the queryparser to get the value number for
     383         *  VALUE_RANGE queries.  It should be replaced by a public method on
     384         *  the Query class at some point, but the API which should be used for
     385         *  that is unclear, so this is a temporary workaround.
     386         */
     387        Xapian::termcount get_parameter() const {  return parameter; }
     388
    380389        /** Get the length of the query, used by some ranking formulae.
    381390         *  This value is calculated automatically - if you want to override
    382391         *  it you can pass a different value to Enquire::set_query().
  • include/xapian/queryparser.h

     
    238238     *  will be converted to Hxapian.org combined with any probabilistic
    239239     *  query with OP_FILTER.
    240240     *
    241      *  Multiple fields can be mapped to the same prefix (so you can
    242      *  e.g. make site: and domain: aliases for each other).
     241     *  If multiple boolean filters are specified in a query for the same
     242     *  prefix, they will be combined with the OR operator.  Then, if there are
     243     *  boolean filters for different prefixes, they will be combined with the
     244     *  AND operator.
    243245     *
     246     *  Multiple fields can be mapped to the same prefix (so you can e.g. make
     247     *  site: and domain: aliases for each other).  Instances of fields with
     248     *  different aliases but the same prefix will still be combined with the
     249     *  OR operator.
     250     *
     251     *  For example, if "site" and "domain" map to "H", but author maps to "A",
     252     *  a search for "site:Foo domain:Bar author:Fred" will map to
     253     *  "(Hfoo OR Hbar) AND Afred".
     254     *
    244255     *  @param field   The user visible field name
    245256     *  @param prefix  The term prefix to map this to
    246257     */