Ticket #394: phrase-settling-pond-update-20130517.patch
File phrase-settling-pond-update-20130517.patch, 30.7 KB (added by , 2 years ago) |
---|
-
xapian-core-1.3.1
Description: Use a settling pond for faster phrase processing. Somewhat experimental. Author: Olly Betts <olly@survex.com> --- Origin: upstream Bug: http://trac.xapian.org/ticket/394 Last-Update: 2013-05-17
old new 470 470 languages/steminternal.cc matcher/remotesubmatch.cc \ 471 471 matcher/andmaybepostlist.cc matcher/andnotpostlist.cc \ 472 472 matcher/branchpostlist.cc matcher/collapser.cc \ 473 matcher/const_database_wrapper.cc \473 matcher/const_database_wrapper.cc matcher/exactphrasecheck.cc \ 474 474 matcher/exactphrasepostlist.cc matcher/externalpostlist.cc \ 475 475 matcher/localsubmatch.cc matcher/mergepostlist.cc \ 476 476 matcher/msetcmp.cc matcher/msetpostlist.cc \ … … 618 618 languages/stem.lo languages/steminternal.lo $(am__objects_14) \ 619 619 matcher/andmaybepostlist.lo matcher/andnotpostlist.lo \ 620 620 matcher/branchpostlist.lo matcher/collapser.lo \ 621 matcher/const_database_wrapper.lo \621 matcher/const_database_wrapper.lo matcher/exactphrasecheck.lo \ 622 622 matcher/exactphrasepostlist.lo matcher/externalpostlist.lo \ 623 623 matcher/localsubmatch.lo matcher/mergepostlist.lo \ 624 624 matcher/msetcmp.lo matcher/msetpostlist.lo \ … … 886 886 languages/steminternal.h matcher/andmaybepostlist.h \ 887 887 matcher/andnotpostlist.h matcher/branchpostlist.h \ 888 888 matcher/collapser.h matcher/const_database_wrapper.h \ 889 matcher/exactphrase postlist.h matcher/externalpostlist.h \890 matcher/ext raweightpostlist.h matcher/localsubmatch.h \891 matcher/ mergepostlist.h matcher/msetcmp.h \892 matcher/mset postlist.h matcher/multiandpostlist.h \893 matcher/multi match.h matcher/multixorpostlist.h \894 matcher/ orpostlist.h matcher/phrasepostlist.h \895 matcher/ queryoptimiser.h matcher/remotesubmatch.h \896 matcher/ selectpostlist.h matcher/synonympostlist.h \897 matcher/ valuegepostlist.h matcher/valuerangepostlist.h \898 matcher/value streamdocument.h net/length.h net/progclient.h \899 net/ remoteconnection.h net/remoteserver.h \900 net/remote tcpclient.h net/remotetcpserver.h \889 matcher/exactphrasecheck.h matcher/exactphrasepostlist.h \ 890 matcher/externalpostlist.h matcher/extraweightpostlist.h \ 891 matcher/localsubmatch.h 
matcher/mergepostlist.h \ 892 matcher/msetcmp.h matcher/msetpostlist.h \ 893 matcher/multiandpostlist.h matcher/multimatch.h \ 894 matcher/multixorpostlist.h matcher/orpostlist.h \ 895 matcher/phrasepostlist.h matcher/queryoptimiser.h \ 896 matcher/remotesubmatch.h matcher/selectpostlist.h \ 897 matcher/synonympostlist.h matcher/valuegepostlist.h \ 898 matcher/valuerangepostlist.h matcher/valuestreamdocument.h \ 899 net/length.h net/progclient.h net/remoteconnection.h \ 900 net/remoteserver.h net/remotetcpclient.h net/remotetcpserver.h \ 901 901 net/replicatetcpclient.h net/replicatetcpserver.h \ 902 902 net/serialise.h net/tcpclient.h net/tcpserver.h \ 903 903 queryparser/cjk-tokenizer.h queryparser/queryparser_internal.h \ … … 1165 1165 languages/steminternal.h matcher/andmaybepostlist.h \ 1166 1166 matcher/andnotpostlist.h matcher/branchpostlist.h \ 1167 1167 matcher/collapser.h matcher/const_database_wrapper.h \ 1168 matcher/exactphrase postlist.h matcher/externalpostlist.h \1169 matcher/ext raweightpostlist.h matcher/localsubmatch.h \1170 matcher/ mergepostlist.h matcher/msetcmp.h \1171 matcher/mset postlist.h matcher/multiandpostlist.h \1172 matcher/multi match.h matcher/multixorpostlist.h \1173 matcher/ orpostlist.h matcher/phrasepostlist.h \1174 matcher/ queryoptimiser.h matcher/remotesubmatch.h \1175 matcher/ selectpostlist.h matcher/synonympostlist.h \1176 matcher/ valuegepostlist.h matcher/valuerangepostlist.h \1177 matcher/value streamdocument.h net/length.h net/progclient.h \1178 net/ remoteconnection.h net/remoteserver.h \1179 net/remote tcpclient.h net/remotetcpserver.h \1168 matcher/exactphrasecheck.h matcher/exactphrasepostlist.h \ 1169 matcher/externalpostlist.h matcher/extraweightpostlist.h \ 1170 matcher/localsubmatch.h matcher/mergepostlist.h \ 1171 matcher/msetcmp.h matcher/msetpostlist.h \ 1172 matcher/multiandpostlist.h matcher/multimatch.h \ 1173 matcher/multixorpostlist.h matcher/orpostlist.h \ 1174 matcher/phrasepostlist.h 
matcher/queryoptimiser.h \ 1175 matcher/remotesubmatch.h matcher/selectpostlist.h \ 1176 matcher/synonympostlist.h matcher/valuegepostlist.h \ 1177 matcher/valuerangepostlist.h matcher/valuestreamdocument.h \ 1178 net/length.h net/progclient.h net/remoteconnection.h \ 1179 net/remoteserver.h net/remotetcpclient.h net/remotetcpserver.h \ 1180 1180 net/replicatetcpclient.h net/replicatetcpserver.h \ 1181 1181 net/serialise.h net/tcpclient.h net/tcpserver.h \ 1182 1182 queryparser/cjk-tokenizer.h queryparser/queryparser_internal.h \ … … 1272 1272 languages/stem.cc languages/steminternal.cc $(am__append_23) \ 1273 1273 matcher/andmaybepostlist.cc matcher/andnotpostlist.cc \ 1274 1274 matcher/branchpostlist.cc matcher/collapser.cc \ 1275 matcher/const_database_wrapper.cc \1275 matcher/const_database_wrapper.cc matcher/exactphrasecheck.cc \ 1276 1276 matcher/exactphrasepostlist.cc matcher/externalpostlist.cc \ 1277 1277 matcher/localsubmatch.cc matcher/mergepostlist.cc \ 1278 1278 matcher/msetcmp.cc matcher/msetpostlist.cc \ … … 1894 1894 matcher/$(DEPDIR)/$(am__dirstamp) 1895 1895 matcher/const_database_wrapper.lo: matcher/$(am__dirstamp) \ 1896 1896 matcher/$(DEPDIR)/$(am__dirstamp) 1897 matcher/exactphrasecheck.lo: matcher/$(am__dirstamp) \ 1898 matcher/$(DEPDIR)/$(am__dirstamp) 1897 1899 matcher/exactphrasepostlist.lo: matcher/$(am__dirstamp) \ 1898 1900 matcher/$(DEPDIR)/$(am__dirstamp) 1899 1901 matcher/externalpostlist.lo: matcher/$(am__dirstamp) \ … … 2404 2406 @AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/branchpostlist.Plo@am__quote@ 2405 2407 @AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/collapser.Plo@am__quote@ 2406 2408 @AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/const_database_wrapper.Plo@am__quote@ 2409 @AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/exactphrasecheck.Plo@am__quote@ 2407 2410 @AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/exactphrasepostlist.Plo@am__quote@ 2408 2411 @AMDEP_TRUE@@am__include@ 
@am__quote@matcher/$(DEPDIR)/externalpostlist.Plo@am__quote@ 2409 2412 @AMDEP_TRUE@@am__include@ @am__quote@matcher/$(DEPDIR)/localsubmatch.Plo@am__quote@ -
common/submatch.h
old new 76 76 virtual PostList * get_postlist_and_term_info(MultiMatch *matcher, 77 77 std::map<std::string, 78 78 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts, 79 Xapian::termcount * total_subqs_ptr) 79 Xapian::termcount * total_subqs_ptr, 80 std::vector<std::string> & pool_terms) 80 81 = 0; 81 82 }; 82 83 -
matcher/remotesubmatch.h
old new 72 72 PostList * get_postlist_and_term_info(MultiMatch *matcher, 73 73 std::map<std::string, 74 74 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts, 75 Xapian::termcount * total_subqs_ptr); 75 Xapian::termcount * total_subqs_ptr, 76 std::vector<std::string> & pool_terms); 76 77 77 78 /// Get percentage factor - only valid after get_postlist_and_term_info(). 78 79 double get_percent_factor() const { return percent_factor; } -
matcher/multimatch.cc
old new 46 46 #include "valuestreamdocument.h" 47 47 #include "weight/weightinternal.h" 48 48 49 #include "exactphrasecheck.h" 50 49 51 #include <xapian/errorhandler.h> 50 52 #include <xapian/matchspy.h> 51 53 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND … … 355 357 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr; 356 358 termfreqandwts_ptr = &termfreqandwts; 357 359 360 vector<string> pool_terms; 358 361 Xapian::termcount total_subqs = 0; 359 362 // Keep a count of matches which we know exist, but we won't see. This 360 363 // occurs when a submatch is remote, and returns a lower bound on the … … 364 367 for (size_t i = 0; i != leaves.size(); ++i) { 365 368 PostList *pl; 366 369 try { 370 if (!is_remote[i]) pool_terms.clear(); 367 371 pl = leaves[i]->get_postlist_and_term_info(this, 368 372 termfreqandwts_ptr, 369 &total_subqs); 373 &total_subqs, 374 pool_terms); 370 375 if (termfreqandwts_ptr && !termfreqandwts.empty()) 371 376 termfreqandwts_ptr = NULL; 372 377 if (is_remote[i]) { … … 525 530 // Is the mset a valid heap? 526 531 bool is_heap = false; 527 532 533 size_t SETTLING_POND_SIZE = 0; 534 if (!pool_terms.empty()) { 535 const char * sps = getenv("POND_SIZE"); 536 SETTLING_POND_SIZE = sps ? atoi(sps) : 100000; 537 } 538 ExactPhraseCheck phrase_check(db, pool_terms); 539 // FIXME: a min/max heap is probably a better choice here (notably more 540 // compact) but the STL doesn't provide one so we'd have to find an 541 // implementation or write one. 542 multimap<double, Xapian::Internal::MSetItem> settling_pond; 528 543 while (true) { 529 544 bool pushback; 530 545 … … 646 661 new_item.wt = wt; 647 662 } 648 663 664 if (SETTLING_POND_SIZE) { 665 if (items.size() >= max_msize) { 666 // Settling pond handling... 
667 multimap<double, Xapian::Internal::MSetItem>::iterator it; 668 it = settling_pond.upper_bound(-min_weight); 669 settling_pond.erase(it, settling_pond.end()); 670 671 settling_pond.insert(make_pair(-new_item.wt, new_item)); 672 if (settling_pond.size() < SETTLING_POND_SIZE) { 673 continue; 674 } 675 676 // Take the last item off the heap, which will have a reasonably 677 // high weight in general. 678 it = settling_pond.begin(); 679 swap(new_item, it->second); 680 settling_pond.erase(it); 681 } 682 if (!phrase_check(new_item.did)) continue; 683 } 684 649 685 pushback = true; 650 686 651 687 // Perform collapsing on key if requested. … … 808 844 } 809 845 } 810 846 847 multimap<double, Xapian::Internal::MSetItem>::iterator it; 848 for (it = settling_pond.begin(); it != settling_pond.end(); ++it) { 849 const Xapian::Internal::MSetItem & new_item = it->second; 850 if (new_item.wt < min_weight) break; 851 if (!phrase_check(new_item.did)) continue; 852 853 { 854 ++docs_matched; 855 if (items.size() >= max_msize) { 856 items.push_back(new_item); 857 if (!is_heap) { 858 is_heap = true; 859 make_heap(items.begin(), items.end(), mcmp); 860 } else { 861 push_heap<vector<Xapian::Internal::MSetItem>::iterator, 862 MSetCmp>(items.begin(), items.end(), mcmp); 863 } 864 pop_heap<vector<Xapian::Internal::MSetItem>::iterator, 865 MSetCmp>(items.begin(), items.end(), mcmp); 866 items.pop_back(); 867 868 min_item = items.front(); 869 if (sort_by == REL || sort_by == REL_VAL) { 870 if (docs_matched >= check_at_least) { 871 if (sort_by == REL) { 872 // We're done if this is a forward boolean match 873 // with only one database (bodgetastic, FIXME 874 // better if we can!) 875 if (rare(max_possible == 0 && sort_forward)) { 876 // In the multi database case, MergePostList 877 // currently processes each database 878 // sequentially (which actually may well be 879 // more efficient) so the docids in general 880 // won't arrive in order. 
881 // FIXME: is this still good here: 882 // if (leaves.size() == 1) break; 883 } 884 } 885 if (min_item.wt > min_weight) { 886 LOGLINE(MATCH, "Setting min_weight to " << 887 min_item.wt << " from " << min_weight); 888 min_weight = min_item.wt; 889 } 890 } 891 } 892 } else { 893 items.push_back(new_item); 894 is_heap = false; 895 if (sort_by == REL && items.size() == max_msize) { 896 if (docs_matched >= check_at_least) { 897 // We're done if this is a forward boolean match 898 // with only one database (bodgetastic, FIXME 899 // better if we can!) 900 if (rare(max_possible == 0 && sort_forward)) { 901 // In the multi database case, MergePostList 902 // currently processes each database 903 // sequentially (which actually may well be 904 // more efficient) so the docids in general 905 // won't arrive in order. 906 // FIXME: if (leaves.size() == 1) break; 907 } 908 } 909 } 910 } 911 } 912 913 // Keep a track of the greatest weight we've seen. 914 if (new_item.wt > greatest_wt) { 915 greatest_wt = new_item.wt; 916 #ifdef XAPIAN_HAS_REMOTE_BACKEND 917 const unsigned int multiplier = db.internal.size(); 918 unsigned int db_num = (new_item.did - 1) % multiplier; 919 if (is_remote[db_num]) { 920 // Note that the greatest weighted document came from a remote 921 // database, and which one. 
922 greatest_wt_subqs_db_num = db_num; 923 } else 924 #endif 925 { 926 greatest_wt_subqs_matched = pl->count_matching_subqs(); 927 #ifdef XAPIAN_HAS_REMOTE_BACKEND 928 greatest_wt_subqs_db_num = UINT_MAX; 929 #endif 930 } 931 if (percent_cutoff) { 932 double w = new_item.wt * percent_cutoff_factor; 933 if (w > min_weight) { 934 min_weight = w; 935 if (!is_heap) { 936 is_heap = true; 937 make_heap<vector<Xapian::Internal::MSetItem>::iterator, 938 MSetCmp>(items.begin(), items.end(), mcmp); 939 } 940 while (!items.empty() && items.front().wt < min_weight) { 941 pop_heap<vector<Xapian::Internal::MSetItem>::iterator, 942 MSetCmp>(items.begin(), items.end(), mcmp); 943 Assert(items.back().wt < min_weight); 944 items.pop_back(); 945 } 946 #ifdef XAPIAN_ASSERTIONS_PARANOID 947 vector<Xapian::Internal::MSetItem>::const_iterator i; 948 for (i = items.begin(); i != items.end(); ++i) { 949 Assert(i->wt >= min_weight); 950 } 951 #endif 952 } 953 } 954 } 955 } 956 957 811 958 // done with posting list tree 812 959 pl.reset(NULL); 813 960 -
matcher/localsubmatch.cc
old new 68 68 PostList * 69 69 LocalSubMatch::get_postlist_and_term_info(MultiMatch * matcher, 70 70 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts, 71 Xapian::termcount * total_subqs_ptr) 71 Xapian::termcount * total_subqs_ptr, 72 std::vector<std::string> & pool_terms) 72 73 { 73 74 LOGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist_and_term_info", matcher | termfreqandwts | total_subqs_ptr); 74 75 (void)matcher; … … 82 83 83 84 PostList * pl; 84 85 { 85 QueryOptimiser opt(*db, *this, matcher );86 QueryOptimiser opt(*db, *this, matcher, pool_terms); 86 87 pl = query.internal->postlist(&opt, 1.0); 87 88 *total_subqs_ptr = opt.get_total_subqs(); 88 89 } -
matcher/Makefile.mk
old new 4 4 matcher/branchpostlist.h\ 5 5 matcher/collapser.h\ 6 6 matcher/const_database_wrapper.h\ 7 matcher/exactphrasecheck.h\ 7 8 matcher/exactphrasepostlist.h\ 8 9 matcher/externalpostlist.h\ 9 10 matcher/extraweightpostlist.h\ … … 42 43 matcher/branchpostlist.cc\ 43 44 matcher/collapser.cc\ 44 45 matcher/const_database_wrapper.cc\ 46 matcher/exactphrasecheck.cc\ 45 47 matcher/exactphrasepostlist.cc\ 46 48 matcher/externalpostlist.cc\ 47 49 matcher/localsubmatch.cc\ -
new file xapian-core-1.3.1/matcher/exactphrasecheck.cc
- + 1 /** @file exactphrasecheck.cc 2 * @brief Check if terms form a particular exact phrase. 3 */ 4 /* Copyright (C) 2006,2007,2009,2012 Olly Betts 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 */ 20 21 // FIXME: this could probably share code with ExactPhrasePostList. 22 23 #include <config.h> 24 25 #include "exactphrasecheck.h" 26 27 #include "debuglog.h" 28 #include "omassert.h" 29 #include "backends/positionlist.h" 30 31 #include <algorithm> 32 #include <vector> 33 34 using namespace std; 35 36 class TermCompare { 37 const Xapian::Database & db; 38 vector<string> & terms; 39 40 public: 41 TermCompare(const Xapian::Database & db_, 42 vector<string> & terms_) 43 : db(db_), terms(terms_) { } 44 45 bool operator()(unsigned a, unsigned b) const { 46 return db.get_collection_freq(terms[a]) < db.get_collection_freq(terms[b]); 47 } 48 }; 49 50 ExactPhraseCheck::ExactPhraseCheck(const Xapian::Database & db_, 51 const vector<string> &terms_) 52 : db(db_), terms(terms_) 53 { 54 if (terms.empty()) { 55 poslists = NULL; 56 order = NULL; 57 return; 58 } 59 60 AssertRel(terms.size(),>,1); 61 size_t n = terms_.size(); 62 poslists = new PositionList*[n]; 63 try { 64 order = new unsigned[n]; 65 } catch (...) 
{ 66 delete [] poslists; 67 throw; 68 } 69 for (size_t i = 0; i < n; ++i) { 70 poslists[i] = NULL; 71 order[i] = unsigned(i); 72 } 73 74 // We often don't need to read all the position lists, so rather than using 75 // the shortest position lists first, we approximate by using the terms 76 // with the lowest collection freq first. Overall this should give a 77 // similar order. 78 sort(order, order + terms.size(), TermCompare(db, terms)); 79 } 80 81 ExactPhraseCheck::~ExactPhraseCheck() 82 { 83 delete [] poslists; 84 delete [] order; 85 } 86 87 bool 88 ExactPhraseCheck::start_position_list(unsigned i, Xapian::docid did) 89 { 90 AssertRel(i,<,terms.size()); 91 unsigned index = order[i]; 92 // FIXME: nasty hacking around with internals and ref counts - we should 93 // just add a new Database::Internal method to do what we want. 94 Xapian::PositionIterator p = db.positionlist_begin(did, terms[index]); 95 PositionList * tmp = p.internal; 96 if (!tmp) 97 return false; 98 ++tmp->_refs; 99 p.internal = poslists[i]; 100 poslists[i] = tmp; 101 poslists[i]->index = index; 102 return true; 103 } 104 105 bool 106 ExactPhraseCheck::operator()(Xapian::docid did) 107 { 108 LOGCALL(MATCH, bool, "ExactPhraseCheck::operator()", did); 109 110 if (terms.size() <= 1) RETURN(true); 111 112 // We often don't need to read all the position lists, so rather than using 113 114 AssertRel(terms.size(),>,1); 115 116 bool result = false; 117 // If the first term we check only occurs too close to the start of the 118 // document, we only need to read one term's positions. E.g. search for 119 // "ripe mango" when the only occurrence of 'mango' in the current document 120 // is at position 0. 
121 if (!start_position_list(0, did)) 122 goto done; 123 poslists[0]->skip_to(poslists[0]->index); 124 if (poslists[0]->at_end()) goto done; 125 126 // If we get here, we'll need to read the positionlists for at least two 127 // terms, so check the true positionlist length for the two terms with the 128 // lowest wdf and if necessary swap them so the true shorter one is first. 129 if (!start_position_list(1, did)) 130 goto done; 131 if (poslists[0]->get_size() < poslists[1]->get_size()) { 132 poslists[1]->skip_to(poslists[1]->index); 133 if (poslists[1]->at_end()) goto done; 134 swap(poslists[0], poslists[1]); 135 } 136 137 { 138 unsigned read_hwm = 1; 139 Xapian::termpos idx0 = poslists[0]->index; 140 do { 141 Xapian::termpos base = poslists[0]->get_position() - idx0; 142 unsigned i = 1; 143 while (true) { 144 if (i > read_hwm) { 145 read_hwm = i; 146 if (!start_position_list(i, did)) 147 goto done; 148 // FIXME: consider comparing with poslist[0] and swapping 149 // if less common. Should we allow for the number of positions 150 // we've read from poslist[0] already? 151 } 152 Xapian::termpos required = base + poslists[i]->index; 153 poslists[i]->skip_to(required); 154 if (poslists[i]->at_end()) goto done; 155 if (poslists[i]->get_position() != required) break; 156 if (++i == terms.size()) { 157 result = true; 158 goto done; 159 } 160 } 161 poslists[0]->next(); 162 } while (!poslists[0]->at_end()); 163 } 164 done: 165 for (size_t i = 0; i < terms.size(); ++i) { 166 delete poslists[i]; 167 poslists[i] = NULL; 168 } 169 RETURN(result); 170 } -
matcher/queryoptimiser.h
old new 49 49 Xapian::termcount total_subqs; 50 50 51 51 public: 52 std::vector<std::string> & pool_terms; 53 54 bool top_and; 55 52 56 const Xapian::Database::Internal & db; 53 57 54 58 Xapian::doccount db_size; … … 57 61 58 62 QueryOptimiser(const Xapian::Database::Internal & db_, 59 63 LocalSubMatch & localsubmatch_, 60 MultiMatch * matcher_) 64 MultiMatch * matcher_, 65 std::vector<std::string> & pool_terms_) 61 66 : localsubmatch(localsubmatch_), total_subqs(0), 62 db(db_), db_size(db.get_doccount()), matcher(matcher_) { } 67 pool_terms(pool_terms_), top_and(true), db(db_), 68 db_size(db.get_doccount()), matcher(matcher_) { } 63 69 64 70 void inc_total_subqs() { ++total_subqs; } 65 71 -
matcher/remotesubmatch.cc
old new 62 62 PostList * 63 63 RemoteSubMatch::get_postlist_and_term_info(MultiMatch *, 64 64 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts, 65 Xapian::termcount * total_subqs_ptr) 65 Xapian::termcount * total_subqs_ptr, 66 std::vector<std::string> &) 66 67 { 67 68 LOGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info", Literal("[matcher]") | termfreqandwts | total_subqs_ptr); 68 69 Xapian::MSet mset; -
matcher/localsubmatch.h
old new 89 89 PostList * get_postlist_and_term_info(MultiMatch *matcher, 90 90 std::map<std::string, 91 91 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts, 92 Xapian::termcount * total_subqs_ptr); 92 Xapian::termcount * total_subqs_ptr, 93 std::vector<std::string> & pool_terms); 93 94 94 95 /** Convert a postlist into a synonym postlist. 95 96 */ -
new file xapian-core-1.3.1/matcher/exactphrasecheck.h
- + 1 /** @file exactphrasecheck.h 2 * @brief Check if terms form a particular exact phrase. 3 */ 4 /* Copyright (C) 2006,2012 Olly Betts 5 * Copyright (C) 2009 Lemur Consulting Ltd 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22 #ifndef XAPIAN_INCLUDED_EXACTPHRASEPOSTLIST_H 23 #define XAPIAN_INCLUDED_EXACTPHRASEPOSTLIST_H 24 25 #include "xapian/database.h" 26 27 #include <string> 28 #include <vector> 29 30 typedef Xapian::PositionIterator::Internal PositionList; 31 32 /** Check for an exact phrase using positional information. 33 * 34 * Tests if the terms occur somewhere in the document in the order given 35 * and at adjacent term positions. 36 */ 37 class ExactPhraseCheck { 38 Xapian::Database db; 39 40 std::vector<std::string> terms; 41 42 PositionList ** poslists; 43 44 unsigned * order; 45 46 /// Start reading from the i-th position list. 47 bool start_position_list(unsigned i, Xapian::docid did); 48 49 public: 50 ExactPhraseCheck(const Xapian::Database & db_, 51 const std::vector<std::string> &terms_); 52 53 ~ExactPhraseCheck(); 54 55 /// Test if the specified document contains the terms as an exact phrase. 56 bool operator()(Xapian::docid did); 57 }; 58 59 #endif -
api/queryinternal.cc
old new 249 249 Xapian::termcount window_) 250 250 : op_(op__), begin(begin_), end(end_), window(window_) { } 251 251 252 PostList * postlist(PostList * pl, const vector<PostList*>& pls) const; 252 PostList * postlist(PostList * pl, const vector<PostList*>& pls, 253 QueryOptimiser * qopt) const; 253 254 }; 254 255 255 256 list<PosFilter> pos_filters; … … 265 266 }; 266 267 267 268 PostList * 268 AndContext::PosFilter::postlist(PostList * pl, const vector<PostList*>& pls) const 269 AndContext::PosFilter::postlist(PostList * pl, const vector<PostList*>& pls, 270 QueryOptimiser * qopt) const 269 271 try { 270 272 vector<PostList *>::const_iterator terms_begin = pls.begin() + begin; 271 273 vector<PostList *>::const_iterator terms_end = pls.begin() + end; … … 274 276 pl = new NearPostList(pl, window, terms_begin, terms_end); 275 277 } else if (window == end - begin) { 276 278 AssertEq(op_, Xapian::Query::OP_PHRASE); 277 pl = new ExactPhrasePostList(pl, terms_begin, terms_end); 279 if (qopt->top_and) { 280 vector<PostList *>::const_iterator j; 281 for (j = terms_begin; j != terms_end; ++j) { 282 const string & term = (*j)->get_termname(); 283 if (term.empty()) { 284 // FIXME: Currently all the subqueries must be terms. 285 qopt->pool_terms.clear(); 286 goto cannot_pool; 287 } 288 qopt->pool_terms.push_back(term); 289 } 290 // We can currently only handle hoisting out one phrase check. 291 // FIXME: Gather a list of checks, not a list of the terms in one 292 // check. 
293 qopt->top_and = false; 294 } else { 295 cannot_pool: 296 pl = new ExactPhrasePostList(pl, terms_begin, terms_end); 297 } 278 298 } else { 279 299 AssertEq(op_, Xapian::Query::OP_PHRASE); 280 300 pl = new PhrasePostList(pl, window, terms_begin, terms_end); … … 309 329 list<PosFilter>::const_iterator i; 310 330 for (i = pos_filters.begin(); i != pos_filters.end(); ++i) { 311 331 const PosFilter & filter = *i; 312 pl.reset(filter.postlist(pl.release(), pls ));332 pl.reset(filter.postlist(pl.release(), pls, qopt)); 313 333 } 314 334 315 335 // Empty pls so our destructor doesn't delete them all! … … 492 512 QueryOptimiser * qopt, 493 513 double factor) const 494 514 { 515 bool top_and = qopt->top_and; 516 qopt->top_and = false; 495 517 ctx.add_postlist(postlist(qopt, factor)); 518 qopt->top_and = top_and; 496 519 } 497 520 498 521 void … … 500 523 QueryOptimiser * qopt, 501 524 double factor) const 502 525 { 526 bool top_and = qopt->top_and; 527 qopt->top_and = false; 503 528 ctx.add_postlist(postlist(qopt, factor)); 529 qopt->top_and = top_and; 504 530 } 505 531 506 532 namespace Internal { … … 1148 1174 LOGCALL(QUERY, PostingIterator::Internal *, "QueryAndNot::postlist", qopt | factor); 1149 1175 // FIXME: Combine and-like side with and-like stuff above. 1150 1176 AutoPtr<PostList> l(subqueries[0].internal->postlist(qopt, factor)); 1177 bool top_and = qopt->top_and; 1178 qopt->top_and = false; 1151 1179 OrContext ctx(subqueries.size() - 1); 1152 1180 do_or_like(ctx, qopt, 0.0, 0, 1); 1153 1181 AutoPtr<PostList> r(ctx.postlist(qopt)); 1182 qopt->top_and = top_and; 1154 1183 RETURN(new AndNotPostList(l.release(), r.release(), 1155 1184 qopt->matcher, qopt->db_size)); 1156 1185 } … … 1181 1210 LOGCALL(QUERY, PostingIterator::Internal *, "QueryAndMaybe::postlist", qopt | factor); 1182 1211 // FIXME: Combine and-like side with and-like stuff above. 
1183 1212 AutoPtr<PostList> l(subqueries[0].internal->postlist(qopt, factor)); 1213 bool top_and = qopt->top_and; 1214 qopt->top_and = false; 1184 1215 OrContext ctx(subqueries.size() - 1); 1185 1216 do_or_like(ctx, qopt, factor, 0, 1); 1186 1217 AutoPtr<PostList> r(ctx.postlist(qopt)); 1218 qopt->top_and = top_and; 1187 1219 RETURN(new AndMaybePostList(l.release(), r.release(), 1188 1220 qopt->matcher, qopt->db_size)); 1189 1221 } -
xapian-core-1.3.1
old new 78 78 return 0; 79 79 } 80 80 81 std::string 82 PostList::get_termname() const 83 { 84 return std::string(); 85 } 86 81 87 } -
api/leafpostlist.cc
old new 102 102 { 103 103 return weight ? 1 : 0; 104 104 } 105 106 std::string 107 LeafPostList::get_termname() const 108 { 109 return term; 110 } -
xapian-core-1.3.1
old new 194 194 /// Count the number of leaf subqueries which match at the current position. 195 195 virtual Xapian::termcount count_matching_subqs() const; 196 196 197 /// If this is a term, return the name, otherwise return empty string. 198 virtual std::string get_termname() const; 199 197 200 /// Return a string description of this object. 198 201 virtual std::string get_description() const = 0; 199 202 }; -
api/leafpostlist.h
old new 86 86 TermFreqs get_termfreq_est_using_stats( 87 87 const Xapian::Weight::Internal & stats) const; 88 88 89 virtual std::string get_termname() const; 90 89 91 Xapian::termcount count_matching_subqs() const; 90 92 }; 91 93