Ticket #245: bug-245-fix.patch
File bug-245-fix.patch, 10.8 KB (added 14 years ago)
-
queryparser/queryparser.lemony
239 239 Database get_database() const { 240 240 return qpi->db; 241 241 } 242 243 const Stopper * get_stopper() const { 244 return qpi->stopper; 245 } 246 247 size_t stoplist_size() const { 248 return qpi->stoplist.size(); 249 } 250 251 void stoplist_resize(size_t s) { 252 qpi->stoplist.resize(s); 253 } 242 254 }; 243 255 244 256 string … … 634 646 main_lex_loop: 635 647 enum { 636 648 DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP, 637 EXPLICIT_SYNONYM649 IN_GROUP2, EXPLICIT_SYNONYM 638 650 } mode = DEFAULT; 639 651 while (it != end && !state.error) { 640 652 bool last_was_operator = false; 653 bool last_was_operator_needing_term = false; 641 654 if (mode == EXPLICIT_SYNONYM) mode = DEFAULT; 642 655 if (false) { 643 656 just_had_operator: 644 657 if (it == end) break; 645 658 mode = DEFAULT; 646 just_had_synonym_operator: 659 last_was_operator_needing_term = false; 660 last_was_operator = true; 661 } 662 if (false) { 663 just_had_operator_needing_term: 664 last_was_operator_needing_term = true; 647 665 last_was_operator = true; 648 666 } 649 667 if (mode == IN_PHRASED_TERM) mode = DEFAULT; … … 654 672 if (it == end) break; 655 673 } 656 674 657 if ((mode == DEFAULT || mode == IN_GROUP) && value_ranges) { 675 if (value_ranges && 676 (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) { 658 677 // Scan forward to see if this could be the "start of range" 659 678 // token. Sadly this has O(n^2) tendencies, though at least 660 679 // "n" is the number of words in a query which is likely to … … 709 728 unsigned ch = *it++; 710 729 newprev = ch; 711 730 // Drop out of IN_GROUP mode. 712 if (mode == IN_GROUP) mode = DEFAULT; 731 if (mode == IN_GROUP || mode == IN_GROUP2) 732 mode = DEFAULT; 713 733 switch (ch) { 714 734 case '"': // Quoted phrase. 
715 735 if (mode == DEFAULT) { … … 766 786 token = HATE; 767 787 } 768 788 Parse(pParser, token, NULL, &state); 769 goto just_had_operator ;789 goto just_had_operator_needing_term; 770 790 } 771 791 // Need to prevent the term after a LOVE or HATE starting a 772 792 // term group... … … 816 836 } 817 837 Parse(pParser, SYNONYM, NULL, &state); 818 838 mode = EXPLICIT_SYNONYM; 819 goto just_had_ synonym_operator;839 goto just_had_operator_needing_term; 820 840 } 821 841 break; 822 842 } … … 832 852 833 853 // A term, a prefix, or a boolean operator. 834 854 const PrefixInfo * prefix_info = NULL; 835 if ((mode == DEFAULT || mode == IN_GROUP || mode == EXPLICIT_SYNONYM) &&855 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) && 836 856 !prefixmap.empty()) { 837 857 // Check for a fieldname prefix (e.g. title:historical). 838 858 Utf8Iterator p = find_if(it, end, is_not_wordchar); … … 851 871 852 872 if (prefix_info->type != NON_BOOLEAN) { 853 873 // Drop out of IN_GROUP if we're in it. 854 if (mode == IN_GROUP )874 if (mode == IN_GROUP || mode == IN_GROUP2) 855 875 mode = DEFAULT; 856 876 it = p; 857 877 string name; … … 923 943 string term = parse_term(it, end, was_acronym); 924 944 925 945 // Boolean operators. 926 if ((mode == DEFAULT || mode == IN_GROUP ) &&946 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) && 927 947 (flags & FLAG_BOOLEAN) && 928 948 // Don't want to interpret A.N.D. as an AND operator. 929 949 !was_acronym && … … 1020 1040 Term * term_obj = new Term(&state, term, prefix_info, 1021 1041 unstemmed_term, stem_term, term_pos++); 1022 1042 1023 if (mode == DEFAULT || mode == IN_GROUP ) {1043 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) { 1024 1044 if (it != end) { 1025 1045 if ((flags & FLAG_WILDCARD) && *it == '*') { 1026 1046 Utf8Iterator p(it); 1027 1047 ++p; 1028 1048 if (p == end || !is_wordchar(*p)) { 1029 1049 it = p; 1030 // Drop out of IN_GROUP if we are in it. 
1031 mode = DEFAULT; 1050 if (mode == IN_GROUP || mode == IN_GROUP2) { 1051 // Drop out of IN_GROUP and flag that the group 1052 // can be all stopwords. 1053 if (mode == IN_GROUP2) 1054 Parse(pParser, EMPTY_GROUP_OK, NULL, &state); 1055 mode = DEFAULT; 1056 } 1032 1057 // Wildcard at end of term (also known as 1033 1058 // "right truncation"). 1034 1059 Parse(pParser, WILD_TERM, term_obj, &state); … … 1037 1062 } 1038 1063 } else { 1039 1064 if (flags & FLAG_PARTIAL) { 1065 if (mode == IN_GROUP || mode == IN_GROUP2) { 1066 // Drop out of IN_GROUP and flag that the group 1067 // can be all stopwords. 1068 if (mode == IN_GROUP2) 1069 Parse(pParser, EMPTY_GROUP_OK, NULL, &state); 1070 mode = DEFAULT; 1071 } 1040 1072 // Final term of a partial match query, with no 1041 1073 // following characters - treat as a wildcard. 1042 1074 Parse(pParser, PARTIAL_TERM, term_obj, &state); … … 1074 1106 } else { 1075 1107 // See if the next token will be PHR_TERM - if so, this one 1076 1108 // needs to be TERM not GROUP_TERM. 1077 if (mode == IN_GROUP && is_phrase_generator(*it)) { 1109 if ((mode == IN_GROUP || mode == IN_GROUP2) && 1110 is_phrase_generator(*it)) { 1078 1111 // FIXME: can we clean this up? 1079 1112 Utf8Iterator p = it; 1080 1113 do { … … 1087 1120 } 1088 1121 } 1089 1122 1090 Parse(pParser, (mode == IN_GROUP ? 
GROUP_TERM : TERM), 1091 term_obj, &state); 1092 if (mode != DEFAULT && mode != IN_GROUP) continue; 1123 int token = TERM; 1124 if (mode == IN_GROUP || mode == IN_GROUP2) { 1125 mode = IN_GROUP2; 1126 token = GROUP_TERM; 1127 } 1128 Parse(pParser, token, term_obj, &state); 1129 if (token == TERM && mode != DEFAULT) 1130 continue; 1093 1131 } 1094 1132 } 1095 1133 … … 1107 1145 term_start_index = it.raw() - qs.data(); 1108 1146 goto phrased_term; 1109 1147 } 1110 } else if (mode == DEFAULT || mode == IN_GROUP) { 1148 } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) { 1149 int old_mode = mode; 1111 1150 mode = DEFAULT; 1112 if (!last_was_operator && is_whitespace(*it)) {1151 if (!last_was_operator_needing_term && is_whitespace(*it)) { 1113 1152 newprev = ' '; 1114 1153 // Skip multiple whitespace. 1115 1154 do { … … 1118 1157 // Don't generate a group unless the terms are only separated 1119 1158 // by whitespace. 1120 1159 if (it != end && is_wordchar(*it)) { 1121 mode = IN_GROUP; 1160 if (old_mode == IN_GROUP || old_mode == IN_GROUP2) { 1161 mode = IN_GROUP2; 1162 } else { 1163 mode = IN_GROUP; 1164 } 1122 1165 } 1123 1166 } 1124 1167 } … … 1188 1231 class TermGroup { 1189 1232 vector<Term *> terms; 1190 1233 1234 bool empty_ok; 1235 1191 1236 public: 1192 TermGroup() { }1237 TermGroup() : empty_ok(false) { } 1193 1238 1194 1239 /// Add a Term object to this TermGroup object. 1195 1240 void add_term(Term * term) { 1196 1241 terms.push_back(term); 1197 1242 } 1198 1243 1244 void set_empty_ok() { empty_ok = true; } 1245 1199 1246 /// Convert to a Xapian::Query * using default_op. 
1200 1247 Query * as_group(State *state) const; 1201 1248 … … 1219 1266 Query * 1220 1267 TermGroup::as_group(State *state) const 1221 1268 { 1269 const Xapian::Stopper * stopper = state->get_stopper(); 1270 size_t stoplist_size = state->stoplist_size(); 1271 reprocess: 1222 1272 Query::op default_op = state->default_op(); 1223 1273 vector<Query> subqs; 1224 1274 subqs.reserve(terms.size()); … … 1234 1284 TermIterator synend(db.synonym_keys_end((*i)->name)); 1235 1285 if (synkey == synend) { 1236 1286 // No multi-synonym matches. 1237 if (state->is_stopword(*i)) { 1287 if (stopper && (*stopper)((*i)->name)) { 1238 1288 state->add_to_stoplist(*i); 1239 1289 } else { 1240 1290 subqs.push_back((*i)->get_query_with_auto_synonyms()); … … 1261 1311 } 1262 1312 if (i == begin) { 1263 1313 // No multi-synonym matches. 1264 if (state->is_stopword(*i)) { 1314 if (stopper && (*stopper)((*i)->name)) { 1265 1315 state->add_to_stoplist(*i); 1266 1316 } else { 1267 1317 subqs.push_back((*i)->get_query_with_auto_synonyms()); … … 1273 1323 vector<Query> subqs2; 1274 1324 vector<Term*>::const_iterator j; 1275 1325 for (j = begin; j != i; ++j) { 1276 if (state->is_stopword(*j)) { 1326 if (stopper && (*stopper)((*j)->name)) { 1277 1327 state->add_to_stoplist(*j); 1278 1328 } else { 1279 1329 subqs2.push_back((*j)->get_query()); … … 1306 1356 } else { 1307 1357 vector<Term*>::const_iterator i; 1308 1358 for (i = terms.begin(); i != terms.end(); ++i) { 1309 if (state->is_stopword(*i)) { 1359 if (stopper && (*stopper)((*i)->name)) { 1310 1360 state->add_to_stoplist(*i); 1311 1361 } else { 1312 1362 subqs.push_back((*i)->get_query_with_auto_synonyms()); … … 1314 1364 } 1315 1365 } 1316 1366 1367 if (!empty_ok && stopper && subqs.empty() && 1368 stoplist_size < state->stoplist_size()) { 1369 // This group is all stopwords, so roll-back, disable stopper 1370 // temporarily, and reprocess this group.
1371 state->stoplist_resize(stoplist_size); 1372 stopper = NULL; 1373 goto reprocess; 1374 } 1375 1317 1376 delete this; 1318 1377 1319 1378 if (subqs.empty()) return NULL; … … 1618 1677 } 1619 1678 *E = Query(Query::OP_AND_NOT, *E, *P->hate); 1620 1679 } 1621 // FIXME what if E && E->empty() (all terms are stopwords)?1622 1680 delete P; 1623 1681 } 1624 1682 … … 1881 1939 P->add_term(T); 1882 1940 } 1883 1941 1942 group(P) ::= group(Q) EMPTY_GROUP_OK. { 1943 P = Q; 1944 P->set_empty_ok(); 1945 } 1946 1884 1947 // near_expr - 2 or more terms with NEAR in between. There must be at least 2 1885 1948 // terms in order for there to be any NEAR operators! 1886 1949 -
tests/queryparsertest.cc
141 141 { "XOR", "Syntax: <expression> XOR <expression>" }, 142 142 { "hard\xa0space", "(Zhard:(pos=1) OR Zspace:(pos=2))" }, 143 143 { " white\r\nspace\ttest ", "(Zwhite:(pos=1) OR Zspace:(pos=2) OR Ztest:(pos=3))" }, 144 { "one AND two three", "(Zone:(pos=1) AND (Ztwo:(pos=2) OR Zthree:(pos=3)))" }, 145 { "one two AND three", "((Zone:(pos=1) OR Ztwo:(pos=2)) AND Zthree:(pos=3))" }, 144 146 { "one AND two/three", "(Zone:(pos=1) AND (two:(pos=2) PHRASE 2 three:(pos=3)))" }, 145 147 { "one AND /two/three", "(Zone:(pos=1) AND (two:(pos=2) PHRASE 2 three:(pos=3)))" }, 146 148 { "one AND/two/three", "(Zone:(pos=1) AND (two:(pos=2) PHRASE 2 three:(pos=3)))" }, … … 1069 1071 // parse. 1070 1072 { "test AND the AND queryparser", "(test:(pos=1) AND the:(pos=2) AND queryparser:(pos=3))" }, 1071 1073 // 0.9.6 and earlier ignored a stopword even if it was the only term. 1072 // We don't ignore it in this case, which is probably better. But 1073 // an all-stopword query with multiple terms doesn't work, which 1074 // prevents 'to be or not to be' for being searchable unless made 1075 // into a phrase query. 1074 // More recent versions don't ever treat a single term as a stopword. 1076 1075 { "the", "the:(pos=1)" }, 1076 // 1.2.2 and earlier ignored an all-stopword query with multiple terms, 1077 // which prevents 'to be or not to be' for being searchable unless the 1078 // user made it into a phrase query or prefixed all terms with '+' 1079 // (ticket#245). 1080 { "an the a", "(an:(pos=1) AND the:(pos=2) AND a:(pos=3))" }, 1081 // Regression test for bug in initial version of the patch for the 1082 // "all-stopword" case. 1083 { "the AND a an", "(the:(pos=1) AND a:(pos=2) AND an:(pos=3))" }, 1077 1084 { NULL, NULL } 1078 1085 }; 1079 1086