Ticket #363: xapian-percent-scaling-without-termlists.patch
File xapian-percent-scaling-without-termlists.patch, 23.4 KB (added 15 years ago) |
---|
-
api/leafpostlist.cc
97 97 Assert(i != stats.termfreqs.end()); 98 98 RETURN(i->second); 99 99 } 100 101 Xapian::termcount 102 LeafPostList::count_matching_subqs() const 103 { 104 return 1; 105 } -
api/postlist.cc
1 1 /** @file postlist.cc 2 2 * @brief Abstract base class for postlists. 3 3 */ 4 /* Copyright (C) 2007 Olly Betts 4 /* Copyright (C) 2007,2009 Olly Betts 5 5 * 6 6 * This program is free software; you can redistribute it and/or 7 7 * modify it under the terms of the GNU General Public License as … … 20 20 21 21 #include <config.h> 22 22 23 #include "postlist.h" 24 23 25 #include <xapian/error.h> 24 26 25 #include "postlist.h" 27 #include "omassert.h" 26 28 27 29 using namespace std; 28 30 … … 69 71 return skip_to(did, w_min); 70 72 } 71 73 74 Xapian::termcount 75 PostList::count_matching_subqs() const 76 { 77 Assert(false); 78 return 0; 79 } 80 72 81 } -
common/leafpostlist.h
1 1 /** @file leafpostlist.h 2 2 * @brief Abstract base class for leaf postlists. 3 3 */ 4 /* Copyright (C) 2007 Olly Betts 4 /* Copyright (C) 2007,2009 Olly Betts 5 5 * Copyright (C) 2009 Lemur Consulting Ltd 6 6 * 7 7 * This program is free software; you can redistribute it and/or … … 85 85 86 86 TermFreqs get_termfreq_est_using_stats( 87 87 const Xapian::Weight::Internal & stats) const; 88 89 Xapian::termcount count_matching_subqs() const; 88 90 }; 89 91 90 92 #endif // XAPIAN_INCLUDED_LEAFPOSTLIST_H -
common/postlist.h
192 192 */ 193 193 Internal * skip_to(Xapian::docid did) { return skip_to(did, 0.0); } 194 194 195 /// Count the number of leaf subqueries which match at the current position. 196 virtual Xapian::termcount count_matching_subqs() const; 197 195 198 /// Return a string description of this object. 196 199 virtual std::string get_description() const = 0; 197 200 }; -
common/submatch.h
75 75 /// Get PostList and term info. 76 76 virtual PostList * get_postlist_and_term_info(MultiMatch *matcher, 77 77 std::map<std::string, 78 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts) 78 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts, 79 Xapian::termcount * total_subqs_ptr) 79 80 = 0; 80 81 }; 81 82 -
matcher/extraweightpostlist.h
113 113 delete pl; 114 114 delete wt; 115 115 } 116 117 Xapian::termcount count_matching_subqs() const { 118 return pl->count_matching_subqs(); 119 } 116 120 }; 117 121 118 122 #endif /* OM_HGUARD_EXTRAWEIGHTPOSTLIST_H */ -
matcher/multimatch.cc
326 326 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr; 327 327 termfreqandwts_ptr = &termfreqandwts; 328 328 329 Xapian::termcount total_subqs = 0; 329 330 // Keep a count of matches which we know exist, but we won't see. This 330 331 // occurs when a submatch is remote, and returns a lower bound on the 331 332 // number of matching documents which is higher than the number of … … 335 336 PostList *pl; 336 337 try { 337 338 pl = leaves[i]->get_postlist_and_term_info(this, 338 termfreqandwts_ptr); 339 termfreqandwts_ptr, 340 &total_subqs); 339 341 if (termfreqandwts_ptr && !termfreqandwts.empty()) 340 342 termfreqandwts_ptr = NULL; 341 343 if (is_remote[i]) { … … 384 386 // Empty result set 385 387 Xapian::doccount docs_matched = 0; 386 388 Xapian::weight greatest_wt = 0; 389 Xapian::termcount greatest_wt_subqs_matched = 0; 387 390 vector<Xapian::Internal::MSetItem> items; 388 391 389 392 // maximum weight a document could possibly have … … 715 718 if (wt > greatest_wt) { 716 719 new_greatest_weight: 717 720 greatest_wt = wt; 721 greatest_wt_subqs_matched = pl->count_matching_subqs(); 718 722 if (percent_cutoff) { 719 723 Xapian::weight w = wt * percent_cutoff_factor; 720 724 if (w > min_weight) { … … 764 768 percent_scale = rem_match->get_percent_factor(); 765 769 } else 766 770 #endif 767 if (termfreqandwts.size() > 1) { 768 Xapian::termcount matching_terms = 0; 769 map<string, 770 Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator i; 771 772 // Special case for MatchAll queries. 
773 i = termfreqandwts.find(string()); 774 if (i != termfreqandwts.end()) { 775 percent_scale += i->second.termweight; 776 ++matching_terms; 777 } 778 779 Xapian::TermIterator docterms = db.termlist_begin(best->did); 780 Xapian::TermIterator docterms_end = db.termlist_end(best->did); 781 while (docterms != docterms_end) { 782 i = termfreqandwts.find(*docterms); 783 if (i != termfreqandwts.end()) { 784 LOGLINE(MATCH, "adding " << i->second.termweight << 785 " to percent_scale for term '" << 786 *docterms << "'"); 787 percent_scale += i->second.termweight; 788 ++matching_terms; 789 if (matching_terms == termfreqandwts.size()) break; 790 } 791 ++docterms; 792 } 793 794 if (matching_terms < termfreqandwts.size()) { 795 if (percent_scale == 0.0) { 796 // This happens if the only matching terms are synonyms. 797 percent_scale = 1.0; 798 } else { 799 // OK, work out weight corresponding to 100% 800 double denom = 0; 801 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i) 802 denom += i->second.termweight; 803 804 LOGVALUE(MATCH, denom); 805 LOGVALUE(MATCH, percent_scale); 806 AssertRel(percent_scale,<=,denom); 807 if (denom == 0) { 808 // This happens if the top-level operator is OP_SYNONYM. 809 percent_scale = 1.0 / greatest_wt; 810 } else { 811 denom *= greatest_wt; 812 AssertRel(denom,>,0); 813 percent_scale /= denom; 814 } 815 } 816 } else { 817 // If all the terms match, the 2 sums of weights cancel. 818 percent_scale = 1.0 / greatest_wt; 819 } 820 } else { 821 // If there's only a single term in the query, the top document 822 // must score 100%. 823 percent_scale = 1.0 / greatest_wt; 771 { 772 percent_scale = greatest_wt_subqs_matched / double(total_subqs); 773 percent_scale /= greatest_wt; 824 774 } 825 775 Assert(percent_scale > 0); 826 776 if (percent_cutoff) { -
matcher/localmatch.cc
88 88 89 89 PostList * 90 90 LocalSubMatch::get_postlist_and_term_info(MultiMatch * matcher, 91 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts) 91 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts, 92 Xapian::termcount * total_subqs_ptr) 92 93 { 93 94 DEBUGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist_and_term_info", 94 matcher << ", [termfreqandwts]"); 95 matcher << ", [termfreqandwts], [total_subqs_ptr]"); 95 96 term_info = termfreqandwts; 96 97 97 98 // Build the postlist tree for the query. This calls … … 99 100 // which builds term_info as a side effect. 100 101 QueryOptimiser opt(*db, *this, matcher); 101 102 PostList * pl = opt.optimise_query(&orig_query); 103 *total_subqs_ptr = opt.get_total_subqueries(); 102 104 103 105 // We only need an ExtraWeightPostList if there's an extra weight 104 106 // contribution. -
matcher/localmatch.h
80 80 81 81 /// Get PostList and term info. 82 82 PostList * get_postlist_and_term_info(MultiMatch *matcher, 83 std::map<string, Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts); 83 std::map<string, Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts, 84 Xapian::termcount * total_subqs_ptr); 84 85 85 86 /** Convert a postlist into a synonym postlist. 86 87 */ -
matcher/xorpostlist.h
77 77 * which is at the current document. 78 78 */ 79 79 Xapian::termcount get_wdf() const; 80 81 Xapian::termcount count_matching_subqs() const; 80 82 }; 81 83 82 84 #endif /* OM_HGUARD_XORPOSTLIST_H */ -
matcher/synonympostlist.h
2 2 * @brief Combine subqueries, weighting as if they are synonyms 3 3 */ 4 4 /* Copyright 2007,2009 Lemur Consulting Ltd 5 * Copyright 2009 Olly Betts 5 6 * 6 7 * This program is free software; you can redistribute it and/or modify 7 8 * it under the terms of the GNU General Public License as published by … … 89 90 Xapian::termcount get_doclength() const; 90 91 bool at_end() const; 91 92 93 Xapian::termcount count_matching_subqs() const; 94 92 95 std::string get_description() const; 93 96 }; 94 97 -
matcher/remotesubmatch.cc
58 58 59 59 PostList * 60 60 RemoteSubMatch::get_postlist_and_term_info(MultiMatch *, 61 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts) 61 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts, 62 Xapian::termcount * total_subqs_ptr) 62 63 { 63 64 DEBUGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info", 64 "[matcher], " << (void*)termfreqandwts); 65 "[matcher], " << (void*)termfreqandwts << ", " << (void*)total_subqs_ptr); 65 66 Xapian::MSet mset; 66 67 db->get_mset(mset); 67 68 percent_factor = mset.internal->percent_factor; 68 69 if (termfreqandwts) *termfreqandwts = mset.internal->termfreqandwts; 70 (void)total_subqs_ptr; // FIXME hmm... 69 71 return new MSetPostList(mset, decreasing_relevance); 70 72 } -
matcher/selectpostlist.h
56 56 PositionList * open_position_list() const { return source->open_position_list(); } 57 57 bool at_end() const { return source->at_end(); } 58 58 59 Xapian::termcount count_matching_subqs() const { 60 return source->count_matching_subqs(); 61 } 62 59 63 std::string get_description() const; 60 64 61 65 SelectPostList(PostList *source_) : source(source_) { } -
matcher/externalpostlist.cc
188 188 RETURN(source == NULL); 189 189 } 190 190 191 Xapian::termcount 192 ExternalPostList::count_matching_subqs() const 193 { 194 return 1; 195 } 196 191 197 string 192 198 ExternalPostList::get_description() const 193 199 { -
matcher/orpostlist.h
76 76 * when the OR is part of a synonym. 77 77 */ 78 78 Xapian::termcount get_wdf() const; 79 80 Xapian::termcount count_matching_subqs() const; 79 81 }; 80 82 81 83 #endif /* OM_HGUARD_ORPOSTLIST_H */ -
matcher/andmaybepostlist.h
113 113 * when the ANDMAYBE is part of a synonym. 114 114 */ 115 115 Xapian::termcount get_wdf() const; 116 117 Xapian::termcount count_matching_subqs() const; 116 118 }; 117 119 118 120 #endif /* OM_HGUARD_ANDMAYBEPOSTLIST_H */ -
matcher/andnotpostlist.cc
214 214 DEBUGCALL(MATCH, Xapian::termcount, "AndNotPostList::get_wdf", ""); 215 215 RETURN(l->get_wdf()); 216 216 } 217 218 Xapian::termcount 219 AndNotPostList::count_matching_subqs() const 220 { 221 DEBUGCALL(MATCH, Xapian::termcount, "AndNotPostList::count_matching_subqs", ""); 222 RETURN(l->count_matching_subqs()); 223 } -
matcher/andnotpostlist.h
77 77 * side. 78 78 */ 79 79 Xapian::termcount get_wdf() const; 80 81 Xapian::termcount count_matching_subqs() const; 80 82 }; 81 83 82 84 #endif /* OM_HGUARD_ANDNOTPOSTLIST_H */ -
matcher/valuerangepostlist.cc
166 166 return (db == NULL); 167 167 } 168 168 169 Xapian::termcount 170 ValueRangePostList::count_matching_subqs() const 171 { 172 return 1; 173 } 174 169 175 string 170 176 ValueRangePostList::get_description() const 171 177 { -
matcher/valuerangepostlist.h
85 85 86 86 bool at_end() const; 87 87 88 Xapian::termcount count_matching_subqs() const; 89 88 90 string get_description() const; 89 91 }; 90 92 -
matcher/queryoptimiser.cc
59 59 60 60 switch (query->op) { 61 61 case Xapian::Query::Internal::OP_LEAF: 62 ++total_subqs; 62 63 if (query->tname.empty()) factor = 0.0; 63 64 RETURN(localsubmatch.postlist_from_op_leaf_query(query, factor)); 64 65 65 66 case Xapian::Query::Internal::OP_EXTERNAL_SOURCE: { 67 ++total_subqs; 66 68 Assert(query->external_source); 67 69 Xapian::Database wrappeddb(new ConstDatabaseWrapper(&db)); 68 70 RETURN(new ExternalPostList(wrappeddb, … … 81 83 case Xapian::Query::OP_ELITE_SET: 82 84 RETURN(do_or_like(query, factor)); 83 85 84 case Xapian::Query::OP_SYNONYM: 85 RETURN(do_synonym(query, factor)); 86 case Xapian::Query::OP_SYNONYM: { 87 // Save and restore total_subqs so we only add one for the whole 88 // OP_SYNONYM subquery. 89 Xapian::termcount save_total_subqs = total_subqs; 90 PostList * pl = do_synonym(query, factor); 91 total_subqs = save_total_subqs + 1; 92 RETURN(pl); 93 } 86 94 87 95 case Xapian::Query::OP_AND_NOT: { 88 96 AssertEq(query->subqs.size(), 2); 89 97 PostList * l = do_subquery(query->subqs[0], factor); 98 Xapian::termcount save_total_subqs = total_subqs; 90 99 PostList * r = do_subquery(query->subqs[1], 0.0); 100 total_subqs = save_total_subqs; 91 101 RETURN(new AndNotPostList(l, r, matcher, db_size)); 92 102 } 93 103 … … 99 109 } 100 110 101 111 case Xapian::Query::OP_VALUE_RANGE: { 112 ++total_subqs; 102 113 Xapian::valueno valno(query->parameter); 103 114 const string & range_begin = query->tname; 104 115 const string & range_end = query->str_parameter; … … 106 117 } 107 118 108 119 case Xapian::Query::OP_VALUE_GE: { 120 ++total_subqs; 109 121 Xapian::valueno valno(query->parameter); 110 122 const string & range_begin = query->tname; 111 123 RETURN(new ValueGePostList(&db, valno, range_begin)); 112 124 } 113 125 114 126 case Xapian::Query::OP_VALUE_LE: { 127 ++total_subqs; 115 128 Xapian::valueno valno(query->parameter); 116 129 const string & range_end = query->tname; 117 130 RETURN(new ValueRangePostList(&db, valno, "", range_end)); … 
… 393 406 if (factor == 0.0) { 394 407 // If we have a factor of 0, we don't care about the weights, so 395 408 // we're just like a normal OR query. 409 // FIXME: what about count_matching_subqs()? 396 410 RETURN(do_or_like(query, 0.0)); 397 411 } 398 412 -
matcher/mergepostlist.cc
227 227 Assert(current != -1); 228 228 return plists[current]->get_doclength(); 229 229 } 230 231 Xapian::termcount 232 MergePostList::count_matching_subqs() const 233 { 234 DEBUGCALL(MATCH, Xapian::termcount, "MergePostList::count_matching_subqs", ""); 235 RETURN(plists[current]->count_matching_subqs()); 236 } -
matcher/queryoptimiser.h
44 44 45 45 MultiMatch * matcher; 46 46 47 /** How many leaf subqueries there are. 48 * 49 * Used for scaling percentages when the highest weighted document doesn't 50 * "match all terms". 51 */ 52 Xapian::termcount total_subqs; 53 47 54 /** Optimise a Xapian::Query::Internal subtree into a PostList subtree. 48 55 * 49 56 * @param query The subtree to optimise. … … 102 109 LocalSubMatch & localsubmatch_, 103 110 MultiMatch * matcher_) 104 111 : db(db_), db_size(db.get_doccount()), localsubmatch(localsubmatch_), 105 matcher(matcher_) { } 112 matcher(matcher_), total_subqs(0) { } 106 113 107 114 PostList * optimise_query(Xapian::Query::Internal * query) { 108 115 return do_subquery(query, 1.0); 109 116 } 117 118 Xapian::termcount get_total_subqueries() const { return total_subqs; } 110 119 }; 111 120 112 121 #endif // XAPIAN_INCLUDED_QUERYOPTIMISER_H -
matcher/mergepostlist.h
73 73 */ 74 74 virtual Xapian::termcount get_doclength() const; 75 75 76 Xapian::termcount count_matching_subqs() const; 77 76 78 MergePostList(vector<PostList *> plists_, 77 79 MultiMatch *matcher, 78 80 Xapian::ErrorHandler * errorhandler_); -
matcher/xorpostlist.cc
335 335 if (lhead < rhead) RETURN(l->get_wdf()); 336 336 RETURN(r->get_wdf()); 337 337 } 338 339 Xapian::termcount 340 XorPostList::count_matching_subqs() const 341 { 342 DEBUGCALL(MATCH, Xapian::termcount, "XorPostList::count_matching_subqs", ""); 343 if (lhead < rhead) RETURN(l->count_matching_subqs()); 344 RETURN(r->count_matching_subqs()); 345 } -
matcher/synonympostlist.cc
2 2 * @brief Combine subqueries, weighting as if they are synonyms 3 3 */ 4 4 /* Copyright 2007,2009 Lemur Consulting Ltd 5 * Copyright 2009 Olly Betts 5 6 * 6 7 * This program is free software; you can redistribute it and/or 7 8 * modify it under the terms of the GNU General Public License as … … 147 148 RETURN(subtree->at_end()); 148 149 } 149 150 151 Xapian::termcount 152 SynonymPostList::count_matching_subqs() const 153 { 154 return 1; 155 } 156 150 157 std::string 151 158 SynonymPostList::get_description() const 152 159 { -
matcher/multiandpostlist.cc
250 250 } 251 251 return totwdf; 252 252 } 253 254 Xapian::termcount 255 MultiAndPostList::count_matching_subqs() const 256 { 257 Xapian::termcount total = 0; 258 for (size_t i = 0; i < n_kids; ++i) { 259 total += plist[i]->count_matching_subqs(); 260 } 261 return total; 262 } -
matcher/multiandpostlist.h
194 194 * that in general. 195 195 */ 196 196 Xapian::termcount get_wdf() const; 197 198 Xapian::termcount count_matching_subqs() const; 197 199 }; 198 200 199 201 #endif // XAPIAN_INCLUDED_MULTIANDPOSTLIST_H -
matcher/orpostlist.cc
299 299 if (lhead > rhead) RETURN(r->get_wdf()); 300 300 RETURN(l->get_wdf() + r->get_wdf()); 301 301 } 302 303 Xapian::termcount 304 OrPostList::count_matching_subqs() const 305 { 306 DEBUGCALL(MATCH, Xapian::termcount, "OrPostList::count_matching_subqs", ""); 307 if (lhead < rhead) RETURN(l->count_matching_subqs()); 308 if (lhead > rhead) RETURN(r->count_matching_subqs()); 309 RETURN(l->count_matching_subqs() + r->count_matching_subqs()); 310 } -
matcher/andmaybepostlist.cc
193 193 if (lhead == rhead) RETURN(l->get_wdf() + r->get_wdf()); 194 194 RETURN(l->get_wdf()); 195 195 } 196 197 Xapian::termcount 198 AndMaybePostList::count_matching_subqs() const 199 { 200 if (lhead == rhead) 201 RETURN(l->count_matching_subqs() + r->count_matching_subqs()); 202 RETURN(l->count_matching_subqs()); 203 } -
matcher/remotesubmatch.h
62 62 /// Get PostList and term info. 63 63 PostList * get_postlist_and_term_info(MultiMatch *matcher, 64 64 std::map<std::string, 65 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts); 65 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts, 66 Xapian::termcount * total_subqs_ptr); 66 67 67 68 /// Get percentage factor - only valid after get_postlist_and_term_info(). 68 69 double get_percent_factor() const { return percent_factor; } -
matcher/externalpostlist.h
86 86 87 87 bool at_end() const; 88 88 89 Xapian::termcount count_matching_subqs() const; 90 89 91 string get_description() const; 90 92 }; 91 93 -
tests/api_opsynonym.cc
24 24 25 25 #include "api_opsynonym.h" 26 26 27 #include<iostream> 27 28 #include <map> 28 29 #include <set> 29 30 #include <vector> … … 387 388 Xapian::Query query2(*i, or_query, date_query); 388 389 389 390 enquire.set_query(query1); 391 cout << query1.get_description() << endl; 390 392 tout << "query1:" << query1 << '\n'; 391 393 Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount()); 392 394 tout << "mset1:" << mset1 << '\n'; 393 395 enquire.set_query(query2); 396 cout << query2.get_description() << endl; 394 397 tout << "query2:" << query2 << '\n'; 395 398 Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount()); 396 399 tout << "mset2:" << mset2 << '\n'; 397 400 398 401 TEST_NOT_EQUAL(mset1.size(), 0); 399 TEST_EQUAL(mset1[0].get_percent(), 100.0); 402 tout << mset2[0].get_percent() << endl; 403 if (*i != Xapian::Query::OP_XOR) { 404 TEST_EQUAL(mset1[0].get_percent(), 100.0); 405 } 400 406 check_msets_contain_same_docs(mset1, mset2); 401 407 } 402 408 -
tests/api_anydb.cc
539 539 TEST(i != mymset.end()); 540 540 pct = mymset.convert_to_percent(i); 541 541 TEST_REL(pct,>,60); 542 TEST_REL(pct,<,75); 542 TEST_REL(pct,<,76); 543 543 544 544 ++i; 545 545