Ticket #50: patch
File patch, 18.0 KB (added by , 17 years ago) |
---|
-
matcher/Makefile.mk
15 15 matcher/phrasepostlist.h\ 16 16 matcher/remotesubmatch.h\ 17 17 matcher/selectpostlist.h\ 18 matcher/synonympostlist.h\ 18 19 matcher/valuerangepostlist.h\ 19 20 matcher/xorpostlist.h 20 21 … … 47 48 matcher/phrasepostlist.cc\ 48 49 matcher/rset.cc\ 49 50 matcher/selectpostlist.cc\ 51 matcher/synonympostlist.cc\ 50 52 matcher/stats.cc\ 51 53 matcher/tradweight.cc\ 52 54 matcher/valuerangepostlist.cc\ -
matcher/localmatch.cc
3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts 6 * Copyright 2007 Lemur Consulting Ltd 6 7 * 7 8 * This program is free software; you can redistribute it and/or 8 9 * modify it under the terms of the GNU General Public License as … … 38 39 #include "mergepostlist.h" 39 40 #include "extraweightpostlist.h" 40 41 #include "valuerangepostlist.h" 42 #include "synonympostlist.h" 41 43 42 44 #include "omqueryinternal.h" 43 45 … … 262 264 } 263 265 } 264 266 267 // Convert a list of subqueries into a vector of postlists. 268 void 269 LocalSubMatch::postlists_from_queries(std::vector<PostList *> &result, 270 const Xapian::Query::Internal::subquery_list &queries, 271 MultiMatch * matcher, bool is_bool) 272 { 273 Assert(queries.size() >= 2); 274 275 // Open a postlist for each query, and store these postlists in a vector. 276 result.reserve(queries.size()); 277 278 Xapian::Query::Internal::subquery_list::const_iterator q; 279 for (q = queries.begin(); q != queries.end(); q++) { 280 result.push_back(postlist_from_query(*q, matcher, is_bool)); 281 DEBUGLINE(MATCH, "Made postlist for " << (*q)->get_description() << 282 ": termfreq is: (min, est, max) = (" << 283 result.back()->get_termfreq_min() << ", " << 284 result.back()->get_termfreq_est() << ", " << 285 result.back()->get_termfreq_max() << ")"); 286 } 287 } 288 265 289 // Make a postlist from the subqueries of a query objects. 266 290 // Operation must be either AND, OR, XOR, PHRASE, NEAR, or ELITE_SET. 267 291 // Optimise query by building tree carefully. … … 270 294 const Xapian::Query::Internal *query, MultiMatch *matcher, bool is_bool) 271 295 { 272 296 DEBUGCALL(MATCH, PostList *, "LocalSubMatch::postlist_from_queries", op << ", " << query << ", " << matcher << ", " << is_bool); 273 Assert(op == Xapian::Query::OP_OR || op == Xapian::Query::OP_AND || 297 Assert(op == Xapian::Query::OP_OR || 298 op == Xapian::Query::OP_AND || 274 299 op == Xapian::Query::OP_XOR || 275 op == Xapian::Query::OP_NEAR || op == Xapian::Query::OP_PHRASE || 300 op == Xapian::Query::OP_NEAR || 301 op == Xapian::Query::OP_PHRASE || 276 302 op == Xapian::Query::OP_ELITE_SET); 277 const Xapian::Query::Internal::subquery_list &queries = query->subqs;278 Assert(queries.size() >= 2);279 303 280 304 // Open a postlist for each query, and store these postlists in a vector. 281 305 std::vector<PostList *> postlists; 282 postlists .reserve(queries.size());306 postlists_from_queries(postlists, query->subqs, matcher, is_bool); 283 307 284 Xapian::Query::Internal::subquery_list::const_iterator q;285 for (q = queries.begin(); q != queries.end(); q++) {286 postlists.push_back(postlist_from_query(*q, matcher, is_bool));287 DEBUGLINE(MATCH, "Made postlist for " << (*q)->get_description() <<288 ": termfreq is: (min, est, max) = (" <<289 postlists.back()->get_termfreq_min() << ", " <<290 postlists.back()->get_termfreq_est() << ", " <<291 postlists.back()->get_termfreq_max() << ")");292 }293 294 308 // Build tree 295 309 switch (op) { 296 310 case Xapian::Query::OP_XOR: … … 427 441 pl->set_termweight(wt); 428 442 RETURN(pl); 429 443 } 444 case Xapian::Query::OP_SYNONYM: 445 { 446 if (is_bool) { 447 // An or postlist returns the same documents as a synonym 448 // postlist, and doesn't have the overhead of calculating the 449 // term frequency, so is more efficient than a synonym postlist 450 // if we don't care about the weights. 451 RETURN(postlist_from_queries(Xapian::Query::OP_OR, query, matcher, is_bool)); 452 } else { 453 AutoPtr<Xapian::Weight> wt; 454 // Use a wqf of 1, since we don't have a specific value. 455 // Set the term name to "", since we don't have one of them, either. 456 wt = wt_factory->create(&statssource, qlen, 1, ""); 457 458 std::vector<PostList *> postlists; 459 postlists_from_queries(postlists, query->subqs, matcher, is_bool); 460 461 // build_or_tree empties "postlists", but we need to have them 462 // available to get statistics, so we need to keep a copy 463 // FIXME: there must be a cleaner way for this to work... 464 std::vector<PostList *> postlists_orig = postlists; 465 PostList *res = build_or_tree(postlists, matcher); 466 RETURN(new SynonymPostList(res, postlists_orig, matcher, wt.release())); 467 } 468 } 430 469 case Xapian::Query::OP_PHRASE: 431 470 case Xapian::Query::OP_NEAR: 432 471 // If no positional information in this sub-database, change the -
matcher/localmatch.h
76 76 PostList * build_xor_tree(std::vector<PostList *> &postlists, 77 77 MultiMatch *matcher); 78 78 79 80 /** Convert a list of subqueries into a vector of postlists. 81 * 82 * FIXME - expand documentation comment. 83 */ 84 void postlists_from_queries(std::vector<PostList *> &result, 85 const Xapian::Query::Internal::subquery_list &queries, 86 MultiMatch *matcher, 87 bool is_bool); 88 79 89 /** Convert the sub-queries of a Query into an optimised PostList tree. 80 90 * 81 91 * We take the sub-queries from @a query, but use @op instead of -
matcher/synonympostlist.h
1 /* synonympostlist.h: Combine subqueries, weighting as if they are synonyms 2 * 3 * Copyright 2007 Lemur Consulting Ltd 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 */ 19 20 #ifndef XAPIAN_INCLUDED_SYNONYMPOSTLIST_H 21 #define XAPIAN_INCLUDED_SYNONYMPOSTLIST_H 22 23 #include "multimatch.h" 24 #include "postlist.h" 25 #include <vector> 26 27 /** A postlist comprising several postlists SYNONYMed together. 28 * 29 * This postlist returns all postings in the OR of the sub postlists, but 30 * returns weights as if they represented a single term. The term frequency 31 * portion of the weight is approximated. 32 */ 33 class SynonymPostList : public PostList { 34 private: 35 PostList * subtree; 36 std::vector<PostList *> terms; 37 38 /** The object which is using this postlist to perform 39 * a match. This object needs to be notified when the 40 * tree changes such that the maximum weights need to be 41 * recalculated. 42 */ 43 MultiMatch *matcher; 44 45 const Xapian::Weight * wt; 46 bool want_doclength; 47 48 public: 49 SynonymPostList(PostList *subtree_, 50 const std::vector<PostList *> & terms_, 51 MultiMatch * matcher_, 52 const Xapian::Weight * wt_) 53 : subtree(subtree_), 54 terms(terms_), 55 matcher(matcher_), 56 wt(wt_), 57 want_doclength(wt_->get_sumpart_needs_doclength()) 58 { 59 } 60 61 PostList *next(Xapian::weight w_min); 62 PostList *skip_to(Xapian::docid did, Xapian::weight w_min); 63 64 Xapian::weight get_weight() const; 65 Xapian::weight get_maxweight() const; 66 Xapian::weight recalc_maxweight(); 67 Xapian::termcount get_wdf() const; 68 69 // The following methods just call through to the subtree. 70 Xapian::doccount get_termfreq_min() const { 71 return subtree->get_termfreq_min(); 72 } 73 Xapian::doccount get_termfreq_est() const { 74 return subtree->get_termfreq_est(); 75 } 76 Xapian::doccount get_termfreq_max() const { 77 return subtree->get_termfreq_max(); 78 } 79 Xapian::docid get_docid() const { 80 return subtree->get_docid(); 81 } 82 Xapian::doclength get_doclength() const { 83 return subtree->get_doclength(); 84 } 85 PositionList * read_position_list() { 86 return subtree->read_position_list(); 87 } 88 PositionList * open_position_list() const { 89 return subtree->open_position_list(); 90 } 91 bool at_end() const { 92 return subtree->at_end(); 93 } 94 95 std::string get_description() const { 96 return "(Synonym " + subtree->get_description() + ")"; 97 } 98 }; 99 100 #endif /* XAPIAN_INCLUDED_SYNONYMPOSTLIST_H */ -
matcher/synonympostlist.cc
Property changes on: matcher/synonympostlist.h ___________________________________________________________________ Name: svn:eol-style + native
1 /* synonympostlist.cc: Combine subqueries, weighting as if they are synonyms 2 * 3 * Copyright 2007 Lemur Consulting Ltd 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation; either version 2 of the 8 * License, or (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 18 * USA 19 */ 20 21 #include <config.h> 22 23 #include "synonympostlist.h" 24 #include "branchpostlist.h" 25 #include "omassert.h" 26 #include "omdebug.h" 27 28 PostList * 29 SynonymPostList::next(Xapian::weight w_min) 30 { 31 DEBUGCALL(MATCH, PostList *, "SynonymPostList::next", w_min); 32 next_handling_prune(subtree, w_min, matcher); 33 RETURN(NULL); 34 } 35 36 PostList * 37 SynonymPostList::skip_to(Xapian::docid did, Xapian::weight w_min) 38 { 39 DEBUGCALL(MATCH, PostList *, "SynonymPostList::skip_to", did << ", " << w_min); 40 skip_to_handling_prune(subtree, did, w_min, matcher); 41 RETURN(NULL); 42 } 43 44 Xapian::weight 45 SynonymPostList::get_weight() const 46 { 47 return wt->get_sumpart(get_wdf(), want_doclength ? get_doclength() : 0); 48 } 49 50 Xapian::weight 51 SynonymPostList::get_maxweight() const 52 { 53 return wt->get_maxpart(); 54 } 55 56 Xapian::weight 57 SynonymPostList::recalc_maxweight() 58 { 59 return SynonymPostList::get_maxweight(); 60 } 61 62 Xapian::termcount 63 SynonymPostList::get_wdf() const { 64 std::vector<PostList *>::const_iterator i; 65 Xapian::termcount wdf = 0; 66 Xapian::docid did = get_docid(); 67 for (i = terms.begin(); i != terms.end(); ++i) { 68 if ((*i)->get_docid() == did) 69 wdf += (*i)->get_wdf(); 70 } 71 return wdf; 72 } 73 -
tests/api_db.cc
Property changes on: matcher/synonympostlist.cc ___________________________________________________________________ Name: svn:eol-style + native
1129 1129 return true; 1130 1130 } 1131 1131 1132 // Check a synonym search 1133 static bool test_synonym1() 1134 { 1135 Xapian::Database db(get_database("etext")); 1136 Xapian::Enquire enquire(db); 1137 enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, 1138 Xapian::Query("date"), 1139 Xapian::Query("sky"))); 1140 Xapian::doccount lots = 214; 1141 Xapian::MSet ormset = enquire.get_mset(0, lots); 1142 1143 enquire.set_query(Xapian::Query(Xapian::Query::OP_SYNONYM, 1144 Xapian::Query("date"), 1145 Xapian::Query("sky"))); 1146 Xapian::MSet mset = enquire.get_mset(0, lots); 1147 1148 TEST_NOT_EQUAL(mset.size(), 0); 1149 TEST_EQUAL(mset.size(), ormset.size()); 1150 for (Xapian::doccount i = 0; i < mset.size(); ++i) { 1151 printf("%d,\t%f,\t%d,\t%f\n", 1152 *mset[i], mset[i].get_weight(), 1153 *ormset[i], ormset[i].get_weight()); 1154 //TEST_EQUAL(*mset[i], *ormset[i]); 1155 //TEST_EQUAL_DOUBLE(mset[i].get_weight(), ormset[i].get_weight()); 1156 } 1157 return true; 1158 } 1159 1132 1160 // tests that specifying a nonexistent input file throws an exception. 1133 1161 static bool test_quartzdatabaseopeningerror1() 1134 1162 { … … 1707 1735 // with that, and testing it there doesn't actually improve the test 1708 1736 // coverage really. 1709 1737 {"consistency1", test_consistency1}, 1738 {"synonym1", test_synonym1}, 1710 1739 // Would work with remote if we registered the weighting scheme. 1711 1740 // FIXME: do this so we also test that functionality... 1712 1741 {"userweight1", test_userweight1}, … … 1731 1760 {"keepalive1", test_keepalive1}, 1732 1761 {"termstats", test_termstats}, 1733 1762 {"sortvalue1", test_sortvalue1}, 1763 {"synonym1", test_synonym1}, 1734 1764 {"sortrel1", test_sortrel1}, 1735 1765 {"netstats1", test_netstats1}, 1736 1766 {0, 0} -
include/xapian/query.h
4 4 /* Copyright 1999,2000,2001 BrightStation PLC 5 5 * Copyright 2002 Ananova Ltd 6 6 * Copyright 2003,2004,2005,2006,2007 Olly Betts 7 * Copyright 2006 Lemur Consulting Ltd7 * Copyright 2006,2007 Lemur Consulting Ltd 8 8 * 9 9 * This program is free software; you can redistribute it and/or 10 10 * modify it under the terms of the GNU General Public License as … … 96 96 /** Filter by a range test on a document value. */ 97 97 OP_VALUE_RANGE, 98 98 99 /** Treat a set of queries as synonyms. 100 * 101 * This returns all results which match at least one of the 102 * queries, but weighting as if all the sub-queries are instances 103 * of the same term: so multiple matching terms for a document 104 * increase the wdf value used, and the term frequency is based on 105 * the number of documents which would match an OR of all the 106 * subqueries. 107 * 108 * The term frequency used will usually be an approximation, 109 * because calculating the precise combined term frequency would 110 * be overly expensive. 111 * 112 * Identical to OP_OR, except for the weightings returned. 113 */ 114 OP_SYNONYM, 115 99 116 /** Select an elite set from the subqueries, and perform 100 117 * a query with these combined as an OR query. 101 118 */ -
api/omqueryinternal.cc
3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts 6 * Copyright 2006 Lemur Consulting Ltd6 * Copyright 2006,2007 Lemur Consulting Ltd 7 7 * 8 8 * This program is free software; you can redistribute it and/or 9 9 * modify it under the terms of the GNU General Public License as … … 57 57 case Xapian::Query::OP_PHRASE: 58 58 case Xapian::Query::OP_ELITE_SET: 59 59 case Xapian::Query::OP_VALUE_RANGE: 60 case Xapian::Query::OP_SYNONYM: 60 61 return 0; 61 62 case Xapian::Query::OP_FILTER: 62 63 case Xapian::Query::OP_AND_MAYBE: … … 85 86 case Xapian::Query::OP_NEAR: 86 87 case Xapian::Query::OP_PHRASE: 87 88 case Xapian::Query::OP_ELITE_SET: 89 case Xapian::Query::OP_SYNONYM: 88 90 return UINT_MAX; 89 91 default: 90 92 Assert(false); … … 177 179 result += str_parameter; 178 180 result += om_tostring(parameter); 179 181 break; 182 case Xapian::Query::OP_SYNONYM: 183 result += "="; 184 break; 180 185 } 181 186 } 182 187 return result; … … 202 207 case Xapian::Query::OP_PHRASE: name = "PHRASE"; break; 203 208 case Xapian::Query::OP_ELITE_SET: name = "ELITE_SET"; break; 204 209 case Xapian::Query::OP_VALUE_RANGE: name = "VALUE_RANGE"; break; 210 case Xapian::Query::OP_SYNONYM: name = "SYNONYM"; break; 205 211 } 206 212 return name; 207 213 } … … 451 457 return new Xapian::Query::Internal(Xapian::Query::OP_VALUE_RANGE, valno, 452 458 start, stop); 453 459 } 460 case '=': 461 return qint_from_vector(Xapian::Query::OP_SYNONYM, subqs); 454 462 default: 455 463 DEBUGLINE(UNKNOWN, "Can't parse remainder `" << p - 1 << "'"); 456 464 throw Xapian::InvalidArgumentError("Invalid query string"); … … 617 625 case OP_ELITE_SET: 618 626 case OP_OR: 619 627 case OP_XOR: 628 case OP_SYNONYM: 620 629 // Doing an "OR" type operation - if we've got any MatchNothing 621 630 // subnodes, drop them; except that we mustn't become an empty 622 631 // node due to this, so we never drop a MatchNothing subnode … … 690 699 } 691 700 } 692 701 break; 693 case OP_OR: case OP_AND: case OP_XOR: 702 case OP_OR: case OP_AND: case OP_XOR: case OP_SYNONYM: 694 703 // Remove duplicates if we can. 695 704 if (subqs.size() > 1) collapse_subqs(); 696 705 break; … … 734 743 void 735 744 Xapian::Query::Internal::collapse_subqs() 736 745 { 737 Assert(op == OP_OR || op == OP_AND || op == OP_XOR );746 Assert(op == OP_OR || op == OP_AND || op == OP_XOR || op == OP_SYNONYM); 738 747 typedef set<Xapian::Query::Internal *, SortPosName> subqtable; 739 748 subqtable sqtab; 740 749 … … 809 818 Assert(!is_leaf(op)); 810 819 if (subq == 0) { 811 820 subqs.push_back(0); 812 } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR )) {821 } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR || op == OP_SYNONYM)) { 813 822 // Distribute the subquery. 814 823 for (subquery_list::const_iterator i = subq->subqs.begin(); 815 824 i != subq->subqs.end(); i++) {