Context Navigation

Back to Ticket #245

Ticket #245: bug-245-fix.patch

File bug-245-fix.patch, 10.8 KB (added by Olly Betts, 15 years ago)
More sophisticated fix, but a bit ugly

queryparser/queryparser.lemony

     Database get_database() const {
         return qpi->db;
+    }
+    const Stopper * get_stopper() const {
+        return qpi->stopper;
+    }
+    size_t stoplist_size() const {
+        return qpi->stoplist.size();
+    }
+    void stoplist_resize(size_t s) {
+        qpi->stoplist.resize(s);
+    }
 };
 string
 …
 main_lex_loop:
     enum {
         DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
         EXPLICIT_SYNONYM
+        IN_GROUP2, EXPLICIT_SYNONYM
     } mode = DEFAULT;
     while (it != end && !state.error) {
         bool last_was_operator = false;
+        bool last_was_operator_needing_term = false;
         if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
         if (false) {
 just_had_operator:
             if (it == end) break;
             mode = DEFAULT;
+just_had_synonym_operator:
+            last_was_operator_needing_term = false;
+            last_was_operator = true;
+        }
+        if (false) {
+just_had_operator_needing_term:
+            last_was_operator_needing_term = true;
             last_was_operator = true;
+        }
         if (mode == IN_PHRASED_TERM) mode = DEFAULT;
 …
             if (it == end) break;
+        }
+        if ((mode == DEFAULT || mode == IN_GROUP) && value_ranges) {
+        if (value_ranges &&
+            (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
             // Scan forward to see if this could be the "start of range"
             // token.  Sadly this has O(n^2) tendencies, though at least
             // "n" is the number of words in a query which is likely to
 …
             unsigned ch = *it++;
             newprev = ch;
             // Drop out of IN_GROUP mode.
+            if (mode == IN_GROUP) mode = DEFAULT;
+            if (mode == IN_GROUP || mode == IN_GROUP2)
+                mode = DEFAULT;
             switch (ch) {
               case '"': // Quoted phrase.
                 if (mode == DEFAULT) {
 …
                         token = HATE;
+                    }
                     Parse(pParser, token, NULL, &state);
                     goto just_had_operator;
+                    goto just_had_operator_needing_term;
+                }
                 // Need to prevent the term after a LOVE or HATE starting a
                 // term group...
 …
+                    }
                     Parse(pParser, SYNONYM, NULL, &state);
                     mode = EXPLICIT_SYNONYM;
                     goto just_had_synonym_operator;
+                    goto just_had_operator_needing_term;
+                }
                 break;
+            }
 …
         // A term, a prefix, or a boolean operator.
         const PrefixInfo * prefix_info = NULL;
         if ((mode == DEFAULT || mode == IN_GROUP || mode == EXPLICIT_SYNONYM) &&
+        if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
             !prefixmap.empty()) {
             // Check for a fieldname prefix (e.g. title:historical).
             Utf8Iterator p = find_if(it, end, is_not_wordchar);
 …
                     if (prefix_info->type != NON_BOOLEAN) {
                         // Drop out of IN_GROUP if we're in it.
                         if (mode == IN_GROUP)
+                        if (mode == IN_GROUP || mode == IN_GROUP2)
                             mode = DEFAULT;
                         it = p;
                         string name;
 …
         string term = parse_term(it, end, was_acronym);
         // Boolean operators.
         if ((mode == DEFAULT || mode == IN_GROUP) &&
+        if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
             (flags & FLAG_BOOLEAN) &&
             // Don't want to interpret A.N.D. as an AND operator.
             !was_acronym &&
 …
             Term * term_obj = new Term(&state, term, prefix_info,
                                        unstemmed_term, stem_term, term_pos++);
             if (mode == DEFAULT || mode == IN_GROUP) {
+            if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
                 if (it != end) {
                     if ((flags & FLAG_WILDCARD) && *it == '*') {
                         Utf8Iterator p(it);
                         ++p;
                         if (p == end || !is_wordchar(*p)) {
                             it = p;
+                            // Drop out of IN_GROUP if we are in it.
+                            mode = DEFAULT;
+                            if (mode == IN_GROUP || mode == IN_GROUP2) {
+                                // Drop out of IN_GROUP and flag that the group
+                                // can be all stopwords.
+                                if (mode == IN_GROUP2)
+                                    Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
+                                mode = DEFAULT;
+                            }
                             // Wildcard at end of term (also known as
                             // "right truncation").
                             Parse(pParser, WILD_TERM, term_obj, &state);
 …
+                    }
                 } else {
                     if (flags & FLAG_PARTIAL) {
+                        if (mode == IN_GROUP || mode == IN_GROUP2) {
+                            // Drop out of IN_GROUP and flag that the group
+                            // can be all stopwords.
+                            if (mode == IN_GROUP2)
+                                Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
+                            mode = DEFAULT;
+                        }
                         // Final term of a partial match query, with no
                         // following characters - treat as a wildcard.
                         Parse(pParser, PARTIAL_TERM, term_obj, &state);
 …
             } else {
                 // See if the next token will be PHR_TERM - if so, this one
                 // needs to be TERM not GROUP_TERM.
+                if (mode == IN_GROUP && is_phrase_generator(*it)) {
+                if ((mode == IN_GROUP || mode == IN_GROUP2) &&
+                    is_phrase_generator(*it)) {
                     // FIXME: can we clean this up?
                     Utf8Iterator p = it;
                     do {
 …
+                    }
+                }
+                Parse(pParser, (mode == IN_GROUP ? GROUP_TERM : TERM),
+                      term_obj, &state);
+                if (mode != DEFAULT && mode != IN_GROUP) continue;
+                int token = TERM;
+                if (mode == IN_GROUP || mode == IN_GROUP2) {
+                    mode = IN_GROUP2;
+                    token = GROUP_TERM;
+                }
+                Parse(pParser, token, term_obj, &state);
+                if (token == TERM && mode != DEFAULT)
+                    continue;
+            }
+        }
 …
                 term_start_index = it.raw() - qs.data();
                 goto phrased_term;
+            }
+        } else if (mode == DEFAULT || mode == IN_GROUP) {
+        } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
+            int old_mode = mode;
             mode = DEFAULT;
             if (!last_was_operator && is_whitespace(*it)) {
+            if (!last_was_operator_needing_term && is_whitespace(*it)) {
                 newprev = ' ';
                 // Skip multiple whitespace.
                 do {
 …
                 // Don't generate a group unless the terms are only separated
                 // by whitespace.
                 if (it != end && is_wordchar(*it)) {
+                    mode = IN_GROUP;
+                    if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
+                        mode = IN_GROUP2;
+                    } else {
+                        mode = IN_GROUP;
+                    }
+                }
+            }
+        }
 …
 class TermGroup {
     vector<Term *> terms;
+    bool empty_ok;
   public:
     TermGroup() { }
+    TermGroup() : empty_ok(false) { }
     /// Add a Term object to this TermGroup object.
     void add_term(Term * term) {
         terms.push_back(term);
+    }
+    void set_empty_ok() { empty_ok = true; }
     /// Convert to a Xapian::Query * using default_op.
     Query * as_group(State *state) const;
 …
 Query *
 TermGroup::as_group(State *state) const
+{
+    const Xapian::Stopper * stopper = state->get_stopper();
+    size_t stoplist_size = state->stoplist_size();
+reprocess:
     Query::op default_op = state->default_op();
     vector<Query> subqs;
     subqs.reserve(terms.size());
 …
             TermIterator synend(db.synonym_keys_end((*i)->name));
             if (synkey == synend) {
                 // No multi-synonym matches.
                 if (state->is_stopword(*i)) {
+                if (stopper && (*stopper)((*i)->name)) {
                     state->add_to_stoplist(*i);
                 } else {
                     subqs.push_back((*i)->get_query_with_auto_synonyms());
 …
+            }
             if (i == begin) {
                 // No multi-synonym matches.
                 if (state->is_stopword(*i)) {
+                if (stopper && (*stopper)((*i)->name)) {
                     state->add_to_stoplist(*i);
                 } else {
                     subqs.push_back((*i)->get_query_with_auto_synonyms());
 …
             vector<Query> subqs2;
             vector<Term*>::const_iterator j;
             for (j = begin; j != i; ++j) {
                 if (state->is_stopword(*j)) {
+                if (stopper && (*stopper)((*j)->name)) {
                     state->add_to_stoplist(*j);
                 } else {
                     subqs2.push_back((*j)->get_query());
 …
     } else {
         vector<Term*>::const_iterator i;
         for (i = terms.begin(); i != terms.end(); ++i) {
             if (state->is_stopword(*i)) {
+            if (stopper && (*stopper)((*i)->name)) {
                 state->add_to_stoplist(*i);
             } else {
                 subqs.push_back((*i)->get_query_with_auto_synonyms());
 …
+        }
+    }
+    if (!empty_ok && stopper && subqs.empty() &&
+        stoplist_size < state->stoplist_size()) {
+        // This group is all stopwords, so roll back, disable stopper
+        // temporarily, and reprocess this group.
+        state->stoplist_resize(stoplist_size);
+        stopper = NULL;
+        goto reprocess;
+    }
     delete this;
     if (subqs.empty()) return NULL;
 …
+        }
         *E = Query(Query::OP_AND_NOT, *E, *P->hate);
+    }
-    // FIXME what if E && E->empty() (all terms are stopwords)?
     delete P;
+}
 …
     P->add_term(T);
+}
+group(P) ::= group(Q) EMPTY_GROUP_OK. {
+    P = Q;
+    P->set_empty_ok();
+}
 // near_expr - 2 or more terms with NEAR in between.  There must be at least 2
 // terms in order for there to be any NEAR operators!

tests/queryparsertest.cc

     { "XOR", "Syntax: <expression> XOR <expression>" },
     { "hard\xa0space", "(Zhard:(pos=1) OR Zspace:(pos=2))" },
     { " white\r\nspace\ttest ", "(Zwhite:(pos=1) OR Zspace:(pos=2) OR Ztest:(pos=3))" },
+    { "one AND two three", "(Zone:(pos=1) AND (Ztwo:(pos=2) OR Zthree:(pos=3)))" },
+    { "one two AND three", "((Zone:(pos=1) OR Ztwo:(pos=2)) AND Zthree:(pos=3))" },
     { "one AND two/three", "(Zone:(pos=1) AND (two:(pos=2) PHRASE 2 three:(pos=3)))" },
     { "one AND /two/three", "(Zone:(pos=1) AND (two:(pos=2) PHRASE 2 three:(pos=3)))" },
     { "one AND/two/three", "(Zone:(pos=1) AND (two:(pos=2) PHRASE 2 three:(pos=3)))" },
 …
     // parse.
     { "test AND the AND queryparser", "(test:(pos=1) AND the:(pos=2) AND queryparser:(pos=3))" },
     // 0.9.6 and earlier ignored a stopword even if it was the only term.
+    // We don't ignore it in this case, which is probably better.  But
+    // an all-stopword query with multiple terms doesn't work, which
+    // prevents 'to be or not to be' for being searchable unless made
+    // into a phrase query.
+    // More recent versions don't ever treat a single term as a stopword.
     { "the", "the:(pos=1)" },
+    // 1.2.2 and earlier ignored an all-stopword query with multiple terms,
+    // which prevented 'to be or not to be' from being searchable unless the
+    // user made it into a phrase query or prefixed all terms with '+'
+    // (ticket#245).
+    { "an the a", "(an:(pos=1) AND the:(pos=2) AND a:(pos=3))" },
+    // Regression test for bug in initial version of the patch for the
+    // "all-stopword" case.
+    { "the AND a an", "(the:(pos=1) AND a:(pos=2) AND an:(pos=3))" },
     { NULL, NULL }
 };

Download in other formats:

Original Format