Ticket #199: matchspy.patch
File matchspy.patch, 90.2 KB (added by , 15 years ago) |
---|
-
xapian-maintainer-tools/win32msvc/win32_api.mak
23 23 $(INTDIR)/errorhandler.obj \ 24 24 $(INTDIR)/expanddecider.obj \ 25 25 $(INTDIR)/leafpostlist.obj \ 26 $(INTDIR)/matchspy.obj \ 26 27 $(INTDIR)/omdatabase.obj \ 27 28 $(INTDIR)/omdocument.obj \ 28 29 $(INTDIR)/omenquire.obj \ … … 51 52 $(INTDIR)/errorhandler.cc\ 52 53 $(INTDIR)/expanddecider.cc\ 53 54 $(INTDIR)/leafpostlist.cc\ 55 $(INTDIR)/matchspy.cc \ 54 56 $(INTDIR)/omdatabase.cc\ 55 57 $(INTDIR)/omdocument.cc\ 56 58 $(INTDIR)/omenquire.cc\ -
xapian-core/matcher/multimatch.cc
5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 6 6 * Copyright 2003 Orange PCS Ltd 7 7 * Copyright 2003 Sam Liddicott 8 * Copyright 2007,2008 Lemur Consulting Ltd8 * Copyright 2007,2008,2009 Lemur Consulting Ltd 9 9 * 10 10 * This program is free software; you can redistribute it and/or 11 11 * modify it under the terms of the GNU General Public License as … … 27 27 28 28 #include "multimatch.h" 29 29 30 #include "autoptr.h" 30 31 #include "collapser.h" 31 32 #include "submatch.h" 32 33 #include "localmatch.h" … … 47 48 #include "weightinternal.h" 48 49 49 50 #include <xapian/errorhandler.h> 51 #include <xapian/matchspy.h> 50 52 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND 51 53 52 54 #ifdef XAPIAN_HAS_REMOTE_BACKEND … … 174 176 } 175 177 } 176 178 179 /// Class which applies several match spies in turn. 180 class MultipleMatchSpy : public Xapian::MatchSpy { 181 private: 182 /// List of match spies to call, in order. 183 std::vector<Xapian::MatchSpy *> spies; 184 185 public: 186 /// Add a match spy to the end of the list to be called. 187 void append(Xapian::MatchSpy * spy) { 188 spies.push_back(spy); 189 } 190 191 /** Implementation of virtual operator(). 192 * 193 * This implementation calls all the spies in turn. 
194 */ 195 void operator()(const Xapian::Document &doc, Xapian::weight wt); 196 }; 197 198 void 199 MultipleMatchSpy::operator()(const Xapian::Document &doc, Xapian::weight wt) { 200 LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc << ", " << wt); 201 vector<Xapian::MatchSpy *>::const_iterator i; 202 for (i = spies.begin(); i != spies.end(); i++) { 203 (**i)(doc, wt); 204 } 205 } 206 177 207 //////////////////////////////////// 178 208 // Initialisation and cleaning up // 179 209 //////////////////////////////////// … … 191 221 const Xapian::Sorter * sorter_, 192 222 Xapian::ErrorHandler * errorhandler_, 193 223 Xapian::Weight::Internal & stats, 194 const Xapian::Weight * weight_) 224 const Xapian::Weight * weight_, 225 const vector<Xapian::MatchSpy *> & matchspies_) 195 226 : db(db_), query(query_), 196 227 collapse_max(collapse_max_), collapse_key(collapse_key_), 197 228 percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_), … … 199 230 sort_key(sort_key_), sort_by(sort_by_), 200 231 sort_value_forward(sort_value_forward_), sorter(sorter_), 201 232 errorhandler(errorhandler_), weight(weight_), 202 is_remote(db.internal.size()) 233 is_remote(db.internal.size()), 234 matchspies(matchspies_) 203 235 { 204 236 DEBUGCALL(MATCH, void, "MultiMatch", db_ << ", " << query_ << ", " << 205 237 qlen << ", " << (omrset ? 
*omrset : Xapian::RSet()) << ", " << … … 229 261 rem_db->set_query(query, qlen, collapse_max, collapse_key, 230 262 order, sort_key, sort_by, sort_value_forward, 231 263 percent_cutoff, weight_cutoff, weight, 232 subrsets[i] );264 subrsets[i], matchspies); 233 265 bool decreasing_relevance = 234 266 (sort_by == REL || sort_by == REL_VAL); 235 smatch = new RemoteSubMatch(rem_db, decreasing_relevance );267 smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies); 236 268 is_remote[i] = true; 237 269 } else { 238 270 #endif /* XAPIAN_HAS_REMOTE_BACKEND */ … … 277 309 Xapian::MSet & mset, 278 310 const Xapian::Weight::Internal & stats, 279 311 const Xapian::MatchDecider *mdecider, 280 const Xapian::MatchDecider *matchspy )312 const Xapian::MatchDecider *matchspy_legacy) 281 313 { 282 314 DEBUGCALL(MATCH, void, "MultiMatch::get_mset", first << ", " << maxitems 283 315 << ", " << check_at_least << ", ..."); … … 403 435 Xapian::doccount matches_lower_bound = 0; 404 436 Xapian::doccount matches_estimated = pl->get_termfreq_est(); 405 437 406 if (mdecider == NULL && matchspy == NULL) {438 if (mdecider == NULL && matchspy_legacy == NULL) { 407 439 // If we have a matcher decider or match spy, the lower bound must be 408 440 // set to 0 as we could discard all hits. Otherwise set it to the 409 441 // minimum number of entries which the postlist could return. 410 442 matches_lower_bound = pl->get_termfreq_min(); 411 443 } 412 444 445 // Prepare the matchspy 446 Xapian::MatchSpy *matchspy = NULL; 447 AutoPtr<MultipleMatchSpy> multispy; 448 if (!matchspies.empty()) { 449 if (matchspies.size() == 1) { 450 matchspy = matchspies[0]; 451 } else { 452 multispy.reset(new MultipleMatchSpy); 453 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.begin(); 454 i != matchspies.end(); ++i) { 455 multispy->append(*i); 456 } 457 matchspy = multispy.get(); 458 } 459 } 460 413 461 // Check if any results have been asked for (might just be wanting 414 462 // maxweight). 
415 463 if (check_at_least == 0) { … … 437 485 return; 438 486 } 439 487 440 // Number of documents considered by a decider or matchspy .488 // Number of documents considered by a decider or matchspy_legacy. 441 489 Xapian::doccount decider_considered = 0; 442 // Number of documents denied by the decider or matchspy .490 // Number of documents denied by the decider or matchspy_legacy. 443 491 Xapian::doccount decider_denied = 0; 444 492 445 493 // Set max number of results that we want - this is used to decide … … 545 593 // VAL, then new_item.wt won't yet be set, but that doesn't 546 594 // matter since it's not used by the sort function. 547 595 if (!mcmp(new_item, min_item)) { 548 if (matchspy == NULL && mdecider == NULL && !collapser) {596 if (matchspy_legacy == NULL && mdecider == NULL && !collapser) { 549 597 // Document was definitely suitable for mset - no more 550 598 // processing needed. 551 599 LOGLINE(MATCH, "Making note of match item which sorts lower than min_item"); … … 563 611 continue; 564 612 } 565 613 // We can't drop the item, because we need to show it 566 // to the matchspy , test whether the mdecider would614 // to the matchspy_legacy, test whether the mdecider would 567 615 // accept it, and/or test whether it would be collapsed. 568 616 LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation"); 569 617 } 570 618 } 571 619 572 620 // Use the match spy and/or decision functors (if specified). 
573 if (matchspy != NULL || m decider != NULL) {621 if (matchspy != NULL || matchspy_legacy != NULL || mdecider != NULL) { 574 622 const unsigned int multiplier = db.internal.size(); 575 623 Assert(multiplier != 0); 576 624 Xapian::doccount n = (did - 1) % multiplier; // which actual database … … 584 632 Xapian::Document mydoc(doc.get()); 585 633 586 634 ++decider_considered; 587 if (matchspy && !matchspy->operator()(mydoc)) {635 if (matchspy_legacy && !matchspy_legacy->operator()(mydoc)) { 588 636 ++decider_denied; 589 637 continue; 590 638 } … … 592 640 ++decider_denied; 593 641 continue; 594 642 } 643 if (matchspy) { 644 if (!calculated_weight) { 645 wt = pl->get_weight(); 646 new_item.wt = wt; 647 calculated_weight = wt; 648 } 649 matchspy->operator()(mydoc, wt); 650 } 595 651 } 596 652 } 597 653 … … 882 938 ", matches_upper_bound=" << matches_upper_bound); 883 939 } 884 940 885 if (matchspy || mdecider) {941 if (matchspy_legacy || mdecider) { 886 942 if (!percent_cutoff) { 887 943 if (!collapser) { 888 944 // We're not collapsing or doing a percentage cutoff, so … … 946 1002 matches_estimated = matches_lower_bound; 947 1003 } 948 1004 949 if (collapser || matchspy || mdecider) {1005 if (collapser || matchspy_legacy || mdecider) { 950 1006 LOGLINE(MATCH, "Clamping estimate between bounds: " 951 1007 "matches_lower_bound = " << matches_lower_bound << 952 1008 ", matches_estimated = " << matches_estimated << … … 962 1018 matches_estimated = docs_matched; 963 1019 } 964 1020 965 if (collapser && !matchspy && !mdecider && !percent_cutoff) {1021 if (collapser && !matchspy_legacy && !mdecider && !percent_cutoff) { 966 1022 AssertRel(docs_matched,<=,uncollapsed_upper_bound); 967 1023 if (docs_matched > uncollapsed_lower_bound) 968 1024 uncollapsed_lower_bound = docs_matched; -
xapian-core/matcher/remotesubmatch.cc
27 27 #include "remote-database.h" 28 28 #include "weightinternal.h" 29 29 30 RemoteSubMatch::RemoteSubMatch(RemoteDatabase *db_, bool decreasing_relevance_) 31 : db(db_), decreasing_relevance(decreasing_relevance_) 30 RemoteSubMatch::RemoteSubMatch(RemoteDatabase *db_, 31 bool decreasing_relevance_, 32 const vector<Xapian::MatchSpy *> & matchspies_) 33 : db(db_), 34 decreasing_relevance(decreasing_relevance_), 35 matchspies(matchspies_) 32 36 { 33 37 DEBUGCALL(MATCH, void, "RemoteSubMatch", 34 db_ << ", " << decreasing_relevance_); 38 db_ << ", " << decreasing_relevance_ << ", " << 39 "matchspies"); 35 40 } 36 41 37 42 bool … … 64 69 DEBUGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info", 65 70 "[matcher], " << (void*)termfreqandwts << ", " << (void*)total_subqs_ptr); 66 71 Xapian::MSet mset; 67 db->get_mset(mset );72 db->get_mset(mset, matchspies); 68 73 percent_factor = mset.internal->percent_factor; 69 74 if (termfreqandwts) *termfreqandwts = mset.internal->termfreqandwts; 70 75 // For remote databases we report percent_factor rather than counting the -
xapian-core/matcher/remotesubmatch.h
26 26 #include "remote-database.h" 27 27 #include "xapian/weight.h" 28 28 29 namespace Xapian { class MatchSpy; } 30 29 31 /// Class for performing matching on a remote database. 30 32 class RemoteSubMatch : public SubMatch { 31 33 /// Don't allow assignment. … … 46 48 /// The factor to use to convert weights to percentages. 47 49 double percent_factor; 48 50 51 /// The matchspies to use. 52 const vector<Xapian::MatchSpy *> & matchspies; 53 49 54 public: 50 55 /// Constructor. 51 RemoteSubMatch(RemoteDatabase *db_, bool decreasing_relevance_); 56 RemoteSubMatch(RemoteDatabase *db_, 57 bool decreasing_relevance_, 58 const vector<Xapian::MatchSpy *> & matchspies); 52 59 53 60 /// Fetch and collate statistics. 54 61 bool prepare_match(bool nowait, Xapian::Weight::Internal & total_stats); … … 69 76 double get_percent_factor() const { return percent_factor; } 70 77 71 78 /// Short-cut for single remote match. 72 void get_mset(Xapian::MSet & mset) { db->get_mset(mset ); }79 void get_mset(Xapian::MSet & mset) { db->get_mset(mset, matchspies); } 73 80 }; 74 81 75 82 #endif /* XAPIAN_INCLUDED_REMOTESUBMATCH_H */ -
xapian-core/docs/categorisation.rst
1 1 2 2 .. Copyright (C) 2007 Olly Betts 3 .. Copyright (C) 2009 Lemur Consulting Ltd 3 4 4 5 ============================= 5 6 Xapian Categorisation Support … … 14 15 lists of category values which feature in matching documents. There are 15 16 numerous potential uses this can be put to, but a common one is to offer the 16 17 user the ability to narrow down their search by filtering it to only include 17 documents with a particular value of a particular category. 18 documents with a particular value of a particular category. This is often 19 referred to as ``faceted search``. 18 20 19 21 Some categories are numeric and can take many different values (examples 20 22 include price, width, and height). The number of different values will often … … 43 45 Searching 44 46 --------- 45 47 46 At search time, you need to pass a ``Xapian::MatchSpy`` object to 47 ``Xapian::Enquire::get_mset()``, like so:: 48 At search time, you need to pass a ``Xapian::ValueCountMatchSpy`` object for 49 each category you want to look at to ``Xapian::Enquire::add_matchspy()``, like 50 so:: 48 51 49 Xapian::MatchSpy spy; 52 Xapian::ValueCountMatchSpy spy0(0); 53 Xapian::ValueCountMatchSpy spy1(1); 54 Xapian::ValueCountMatchSpy spy3(3); 50 55 51 spy.add_category(0);52 spy.add_category(1);53 spy.add_category(3);54 55 56 Xapian::Enquire enq(db); 57 enq.add_matchspy(&spy0); 58 enq.add_matchspy(&spy1); 59 enq.add_matchspy(&spy3); 56 60 57 61 enq.set_query(query); 58 62 59 Xapian::MSet mset = enq.get_mset(0, 10, 10000, NULL, NULL, &spy); 63 Xapian::MSet mset = enq.get_mset(0, 10, 10000); 60 64 61 The ``10000`` in the call to ``get_mset `` tells Xapian to check at least65 The ``10000`` in the call to ``get_mset()`` tells Xapian to check at least 62 10000 documents, so the ``spy`` object will be passed at least 10000 documents 66 10000 documents, so each spy object will be passed at least 10000 documents 63 to tally category information from (unless less than 10000 documents match 64 the query, in which case it will see all of them). 
Setting this higher will 65 make the counts exact, but Xapian will have to do more work for most queries 66 s o searches will be slower.67 to tally category information from (unless less than 10000 documents match the 68 query, in which case it will see all of them). Setting this higher will make 69 the counts exact, but Xapian will have to do more work for most queries so 70 searches will be slower. 67 71 68 The ``spy`` object now contains the category information. You can find out 69 how many documents it looked at by calling ``spy.get_total()``. You can 70 read the values for category ``cat_no`` like this:: 72 The ``spy`` objects now contain the category information. You can find out how 73 many documents they looked at by calling ``spy0.get_total()``. (All the spies 74 will have looked at the same number of documents.) You can read the values 75 from, say, ``spy0`` like this:: 71 76 72 const map<string, size_t> & cat = spy .get_categories(cat_no);77 const map<string, size_t> & cat = spy0.get_values(); 73 78 map<string, size_t>::const_iterator i; 74 79 for (i = cat.begin(); i != cat.end(); ++i) { 75 80 cout << i->first << ": " << i->second << endl; 76 81 } 77 82 78 You calculate the score for category ``cat_no`` like so:: 83 You can calculate a score to indicate how evenly spread the values are using 84 the ``score_evenness`` function like so:: 79 85 80 double score = spy.score_categorisation(cat_num);86 double score = Xapian::score_evenness(spy0); 81 87 82 88 Or if you prefer categories with 4 or 5 values:: 83 89 84 double score = spy.score_categorisation(cat_num, 4.5);90 double score = Xapian::score_evenness(spy0, 4.5); 85 91 86 92 The smaller the score, the better - a perfectly even split with exactly the 87 93 number of entries asked (or with no preference given for the number of entries) … … 89 95 application, but to give you a rough idea, a suitable threshold is likely to be 90 96 less than one. 
91 97 92 The scoring uses a sum of squared differences (currently that is - this should98 The scoring uses a sum of squared differences (currently, that is - this should 93 99 probably be regarded as an implementation detail which could change in the 94 100 future if we find a better algorithm). 95 101 96 You would build ranges from numeric values for value ``cat_no``, asking for at97 most ``num_ranges`` ranges like so::102 You can build ranges from numeric values for the values returned from spy 103 ``spy0``, asking for at most ``num_ranges`` ranges like so:: 98 104 99 bool result = spy.build_numeric_ranges(cat_no, num_ranges); 105 std::map<Xapian::NumericRange, Xapian::doccount> result; 106 Xapian::doccount values_seen; 107 values_seen = build_numeric_ranges(result, spy0.get_values(), num_ranges); 100 108 101 If ranges could not be built (for example, because all documents have the 102 same value for ``cat_no``), ``false`` is returned. Otherwise ``true`` is 103 returned, and the spy object's category map for value ``cat_no`` is modified 104 to consist of ranges. Keys are now built of strings returned by 105 ``Xapian::sortable_serialise()`` - either a single string if there is only 106 one number in a particular range, or for a range a string padded to 9 bytes 107 with zero bytes, with a second string appended. 109 Here, ``result`` will be filled with a set of numeric ranges (holding at most 110 ``num_ranges`` ranges), and ``values_seen`` will be the count of the number of 111 values seen (note - this may be different from the number of documents seen by 112 the matchspy, since some may have no value stored in the slot). 108 113 114 If there are no values seen by the spy, ``result`` will be empty. If all the 115 values seen by the spy are the same, ``result`` will contain a single entry, 116 with a single range with the same start and end points. 
117 109 118 Restricting by category values 110 119 ------------------------------ 111 120 112 If you're using the categorisation to offer the user choices for narrowing 113 down their search results, you then need to be able to apply a suitable 114 filter. 121 If you're using the categorisation to offer the user choices for narrowing down 122 their search results, you then need to be able to apply a suitable filter. 115 123 116 For a range, the best way is to use ``Xapian::Query::OP_VALUE_RANGE`` to124 For a range, the easiest way is to use ``Xapian::Query::OP_VALUE_RANGE`` to 117 125 build a filter query, and then combine this with the user's query using 118 126 ``Xapian::Query::OP_FILTER``. 119 127 120 For a single value, you could use ``Xapian::Query::OP_VALUE_RANGE`` with 121 the same start and end, or ``Xapian::MatchDecider``, but it's probably 122 most efficient to also index the categories as suitably prefixed boolean 123 terms anduse those for filtering.128 For a single value, you could use ``Xapian::Query::OP_VALUE_RANGE`` with the 129 same start and end, or ``Xapian::MatchDecider``, but it's probably most 130 efficient to also index the categories as suitably prefixed boolean terms and 131 use those for filtering. 124 132 125 133 Current Limitations 126 134 =================== 127 135 128 It's not currently possible to build logarithmic ranges without writing 129 your own subclass. 130 131 It's not possible to try building different ranges because the original 132 data is overwritten. If it's actually useful to do this, the API needs 133 adjusting. 136 It's not currently possible to build logarithmic ranges with 137 ``build_numeric_ranges``. -
xapian-core/docs/Makefile.am
17 17 bm25.html code_structure.html queryparser.html \ 18 18 quickstartexpand.cc.html quickstartindex.cc.html quickstartsearch.cc.html 19 19 20 RSTDOCS = admin_notes.rst deprecation.rst glossary.rst \20 RSTDOCS = admin_notes.rst categorisation.rst deprecation.rst glossary.rst \ 21 21 postingsource.rst replication.rst replication_protocol.rst \ 22 22 sorting.rst serialisation.rst spelling.rst synonyms.rst \ 23 23 termgenerator.rst valueranges.rst -
xapian-core/tests/api_matchspy.cc
1 /** @file api_matchspy.cc 2 * @brief tests of MatchSpy usage 3 */ 4 /* Copyright 2007,2009 Lemur Consulting Ltd 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License as 8 * published by the Free Software Foundation; either version 2 of the 9 * License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 19 * USA 20 */ 21 22 #include <config.h> 23 24 #include "api_matchspy.h" 25 26 #include <xapian.h> 27 28 #include "str.h" 29 #include <cmath> 30 #include <map> 31 #include <vector> 32 33 #include "backendmanager.h" 34 #include "testsuite.h" 35 #include "testutils.h" 36 #include "apitest.h" 37 38 using namespace std; 39 40 // ####################################################################### 41 // # Tests start here 42 43 class SimpleMatchSpy : public Xapian::MatchSpy { 44 public: 45 // Vector which will be filled with all the document contents seen. 46 std::vector<std::string> seen; 47 48 void operator()(const Xapian::Document &doc, 49 Xapian::weight) { 50 // Note that this is not recommended usage of get_data() - you 51 // generally shouldn't call get_data() from inside a MatchSpy, because 52 // it is (likely to be) a slow operation resulting in considerable IO. 53 seen.push_back(doc.get_data()); 54 } 55 }; 56 57 // Basic test of a matchspy. 
58 DEFINE_TESTCASE(matchspy1, backend && !remote) { 59 Xapian::Database db(get_database("apitest_simpledata")); 60 Xapian::Enquire enquire(db); 61 enquire.set_query(Xapian::Query("this")); 62 63 SimpleMatchSpy myspy; 64 65 Xapian::MSet nospymset = enquire.get_mset(0, 100); 66 enquire.add_matchspy(&myspy); 67 Xapian::MSet spymset = enquire.get_mset(0, 100); 68 69 // Check that the match estimates aren't affected by the matchspy. 70 TEST_EQUAL(nospymset, spymset); 71 72 vector<bool> docid_checked(db.get_lastdocid()); 73 74 // Check that we get the expected number of matches, and that the stored 75 // document contents are right. 76 Xapian::MSetIterator i = spymset.begin(); 77 TEST(i != spymset.end()); 78 TEST_EQUAL(spymset.size(), 6); 79 TEST_EQUAL(myspy.seen.size(), spymset.size()); 80 81 std::sort(myspy.seen.begin(), myspy.seen.end()); 82 83 std::vector<std::string> seen2; 84 for ( ; i != spymset.end(); ++i) { 85 const Xapian::Document doc(i.get_document()); 86 seen2.push_back(doc.get_data()); 87 } 88 std::sort(seen2.begin(), seen2.end()); 89 90 TEST_EQUAL(myspy.seen.size(), seen2.size()); 91 std::vector<std::string>::const_iterator j = myspy.seen.begin(); 92 std::vector<std::string>::const_iterator j2 = seen2.begin(); 93 for (; j != myspy.seen.end(); ++j, ++j2) { 94 TEST_EQUAL(*j, *j2); 95 } 96 97 return true; 98 } 99 100 static string values_to_repr(const map<string, Xapian::doccount> & cat) { 101 string resultrepr("|"); 102 map<string, Xapian::doccount>::const_iterator i; 103 for (i = cat.begin(); i != cat.end(); ++i) { 104 resultrepr += i->first; 105 resultrepr += ':'; 106 resultrepr += str(i->second); 107 resultrepr += '|'; 108 } 109 return resultrepr; 110 } 111 112 DEFINE_TESTCASE(matchspy2, writable) 113 { 114 if (get_dbtype() == "remotetcp" || get_dbtype() == "remoteprog") { 115 SKIP_TEST("Test not supported for remote backend"); 116 } 117 118 Xapian::WritableDatabase db = get_writable_database(""); 119 for (int c = 1; c <= 25; ++c) { 120 Xapian::Document 
doc; 121 doc.set_data("Document " + str(c)); 122 int factors = 0; 123 for (int factor = 1; factor <= c; ++factor) { 124 doc.add_term("all"); 125 if (c % factor == 0) { 126 doc.add_term("XFACT" + str(factor)); 127 ++factors; 128 } 129 } 130 131 // Number of factors. 132 doc.add_value(0, str(factors)); 133 // Units digits. 134 doc.add_value(1, str(c % 10)); 135 // Constant. 136 doc.add_value(2, "fish"); 137 // Number of digits. 138 doc.add_value(3, str(str(c).size())); 139 140 db.add_document(doc); 141 } 142 143 Xapian::ValueCountMatchSpy spy0(0); 144 Xapian::ValueCountMatchSpy spy1(1); 145 Xapian::ValueCountMatchSpy spy3(3); 146 147 Xapian::Enquire enq(db); 148 149 enq.set_query(Xapian::Query("all")); 150 151 enq.add_matchspy(&spy0); 152 enq.add_matchspy(&spy1); 153 enq.add_matchspy(&spy3); 154 Xapian::MSet mset = enq.get_mset(0, 10); 155 156 TEST_EQUAL(spy0.get_total(), 25); 157 TEST_EQUAL(spy1.get_total(), 25); 158 TEST_EQUAL(spy3.get_total(), 25); 159 160 static const char * results[] = { 161 "|1:1|2:9|3:3|4:7|5:1|6:3|8:1|", 162 "|0:2|1:3|2:3|3:3|4:3|5:3|6:2|7:2|8:2|9:2|", 163 "|1:9|2:16|", 164 }; 165 TEST_STRINGS_EQUAL(values_to_repr(spy0.get_values()), results[0]); 166 TEST_STRINGS_EQUAL(values_to_repr(spy1.get_values()), results[1]); 167 TEST_STRINGS_EQUAL(values_to_repr(spy3.get_values()), results[2]); 168 169 { 170 // Test scoring evenness returns scores with the natural ordering. 171 double score0 = Xapian::score_evenness(spy0); 172 tout << "score0 = " << score0 << endl; 173 double score1 = Xapian::score_evenness(spy1); 174 tout << "score1 = " << score1 << endl; 175 double score3 = Xapian::score_evenness(spy3); 176 tout << "score3 = " << score3 << endl; 177 // 1 is obviously best, and 0 obviously worst. 178 TEST(score1 < score3); 179 TEST(score3 < score0); 180 181 // Check that the using the expanded form gives the same results. 
182 double score0_check = Xapian::score_evenness(spy0.get_values(), spy0.get_total()); 183 tout << "score0_check = " << score0_check << endl; 184 TEST_EQUAL(score0, score0_check); 185 } 186 187 { 188 // Test scoring evenness and about 7 categories returns scores with the 189 // natural ordering. 190 double score0 = Xapian::score_evenness(spy0, 7); 191 tout << "score0 = " << score0 << endl; 192 double score1 = Xapian::score_evenness(spy1, 7); 193 tout << "score1 = " << score1 << endl; 194 double score3 = Xapian::score_evenness(spy3, 7); 195 tout << "score3 = " << score3 << endl; 196 // 3 is clearly worst - 0 is arguably a little better than 1 (0 is the 197 // requested size, but 1 has a much more even split). 198 TEST(score0 < score1); 199 TEST(score1 < score3); 200 201 // Check that the using the expanded form gives the same results. 202 double score0_check = Xapian::score_evenness(spy0.get_values(), spy0.get_total()); 203 tout << "score0_check = " << score0_check << endl; 204 TEST_EQUAL(score0, score0_check); 205 } 206 207 return true; 208 } 209 210 DEFINE_TESTCASE(matchspy3, writable) 211 { 212 if (get_dbtype() == "remotetcp" || get_dbtype() == "remoteprog") { 213 SKIP_TEST("Test not supported for remote backend"); 214 } 215 216 Xapian::WritableDatabase db = get_writable_database(""); 217 for (int c = 1; c <= 25; ++c) { 218 Xapian::Document doc; 219 doc.set_data("Document " + str(c)); 220 int factors = 0; 221 for (int factor = 1; factor <= c; ++factor) { 222 doc.add_term("all"); 223 if (c % factor == 0) { 224 doc.add_term("XFACT" + str(factor)); 225 ++factors; 226 } 227 } 228 229 // Number of factors. 230 doc.add_value(0, Xapian::sortable_serialise(factors)); 231 // Units digits. 232 doc.add_value(1, Xapian::sortable_serialise(c % 10)); 233 // (x + 1/3)*(x + 1/3). 234 doc.add_value(2, Xapian::sortable_serialise((c + 1.0/3.0) * (c + 1.0/3.0))); 235 // Reciprocal. 
236 doc.add_value(3, Xapian::sortable_serialise(floor(100.0 / c))); 237 238 db.add_document(doc); 239 } 240 241 Xapian::ValueCountMatchSpy spy0(0); 242 Xapian::ValueCountMatchSpy spy1(1); 243 Xapian::ValueCountMatchSpy spy2(2); 244 Xapian::ValueCountMatchSpy spy3(3); 245 246 Xapian::Enquire enq(db); 247 248 enq.set_query(Xapian::Query("all")); 249 250 enq.add_matchspy(&spy0); 251 enq.add_matchspy(&spy1); 252 enq.add_matchspy(&spy2); 253 enq.add_matchspy(&spy3); 254 Xapian::MSet mset = enq.get_mset(0, 10); 255 256 TEST_EQUAL(spy0.get_total(), 25); 257 TEST_EQUAL(spy1.get_total(), 25); 258 TEST_EQUAL(spy2.get_total(), 25); 259 TEST_EQUAL(spy3.get_total(), 25); 260 261 static const string results[] = { 262 "|100:1|200:9|300:3|400:7|500:1|600:3|800:1|", 263 "|0..200:8|300..400:6|500..700:7|800..900:4|", 264 "|177..8711:9|10677..17777:4|20544..26677:3|30044..37377:3|41344..49877:3|54444..59211:2|64177:1|", 265 "|400..900:15|1000..1600:5|2000..2500:2|3300:1|5000:1|10000:1|", 266 "" 267 }; 268 std::vector<Xapian::ValueCountMatchSpy *> spies; 269 spies.push_back(&spy0); 270 spies.push_back(&spy1); 271 spies.push_back(&spy2); 272 spies.push_back(&spy3); 273 for (Xapian::valueno v = 0; !results[v].empty(); ++v) { 274 Xapian::doccount total_seen; 275 std::map<Xapian::NumericRange, Xapian::doccount> ranges; 276 total_seen = Xapian::build_numeric_ranges(ranges, spies[v]->get_values(), 7); 277 if (results[v] == "|") { 278 TEST_EQUAL(total_seen, 0); 279 continue; 280 } 281 TEST_NOT_EQUAL(total_seen, 0); 282 TEST(ranges.size() <= 7); 283 string resultrepr("|"); 284 map<Xapian::NumericRange, Xapian::doccount>::const_iterator i; 285 for (i = ranges.begin(); i != ranges.end(); ++i) { 286 if (i->first.lower != i->first.upper) { 287 resultrepr += str(floor(i->first.lower * 100)); 288 resultrepr += ".."; 289 resultrepr += str(floor(i->first.upper * 100)); 290 } else { 291 double start = floor(i->first.lower * 100); 292 resultrepr += str(start); 293 } 294 resultrepr += ':'; 295 
resultrepr += str(i->second); 296 resultrepr += '|'; 297 } 298 tout << "value " << v << endl; 299 TEST_STRINGS_EQUAL(resultrepr, results[v]); 300 } 301 302 return true; 303 } 304 305 DEFINE_TESTCASE(matchspy4, writable) 306 { 307 if (get_dbtype() == "remotetcp" || get_dbtype() == "remoteprog") { 308 SKIP_TEST("Test not supported for remote backend"); 309 } 310 311 Xapian::WritableDatabase db = get_writable_database(""); 312 for (int c = 1; c <= 25; ++c) { 313 Xapian::Document doc; 314 doc.set_data("Document " + str(c)); 315 int factors = 0; 316 for (int factor = 1; factor <= c; ++factor) { 317 doc.add_term("all"); 318 if (c % factor == 0) { 319 doc.add_term("XFACT" + str(factor)); 320 ++factors; 321 } 322 } 323 324 // Number of factors. 325 doc.add_value(0, str(factors)); 326 // Units digits. 327 doc.add_value(1, str(c % 10)); 328 // Constant. 329 doc.add_value(2, "fish"); 330 // Number of digits. 331 doc.add_value(3, str(str(c).size())); 332 333 db.add_document(doc); 334 } 335 336 Xapian::ValueCountMatchSpy spy0(0); 337 Xapian::ValueCountMatchSpy spy1(1); 338 Xapian::ValueCountMatchSpy spy3(3); 339 340 Xapian::Enquire enq(db); 341 342 enq.set_query(Xapian::Query("all")); 343 344 enq.add_matchspy(&spy0); 345 enq.add_matchspy(&spy1); 346 enq.add_matchspy(&spy3); 347 Xapian::MSet mset = enq.get_mset(0, 10); 348 349 TEST_EQUAL(spy0.get_total(), 25); 350 TEST_EQUAL(spy1.get_total(), 25); 351 TEST_EQUAL(spy3.get_total(), 25); 352 353 static const char * results[] = { 354 "|2:9|4:7|3:3|6:3|1:1|5:1|8:1|", 355 "|1:3|2:3|3:3|4:3|5:3|0:2|6:2|7:2|8:2|9:2|", 356 "|", 357 "|2:16|1:9|", 358 NULL 359 }; 360 std::vector<Xapian::ValueCountMatchSpy *> spies; 361 spies.push_back(&spy0); 362 spies.push_back(&spy1); 363 spies.push_back(NULL); 364 spies.push_back(&spy3); 365 for (Xapian::valueno v = 0; results[v]; ++v) { 366 tout << "value " << v << endl; 367 std::vector<Xapian::StringAndFrequency> allvals; 368 369 Xapian::ValueCountMatchSpy * spy = spies[v]; 370 if (spy != NULL) 371 
spy->get_top_values(allvals, 100); 372 string allvals_str("|"); 373 for (size_t i = 0; i < allvals.size(); i++) { 374 allvals_str += allvals[i].str; 375 allvals_str += ':'; 376 allvals_str += str(allvals[i].frequency); 377 allvals_str += '|'; 378 } 379 tout << allvals_str << endl; 380 TEST_STRINGS_EQUAL(allvals_str, results[v]); 381 382 std::vector<Xapian::StringAndFrequency> vals; 383 for (size_t i = 0; i < allvals.size(); i++) { 384 tout << "i " << i << endl; 385 if (spy != NULL) 386 spy->get_top_values(vals, i); 387 for (size_t j = 0; j < vals.size(); j++) { 388 tout << "j " << j << endl; 389 TEST_EQUAL(vals[j].str, allvals[j].str); 390 TEST_EQUAL(vals[j].frequency, allvals[j].frequency); 391 } 392 } 393 } 394 395 return true; 396 } 397 398 // Test builtin match spies 399 DEFINE_TESTCASE(matchspy5, backend) 400 { 401 Xapian::Database db(get_database("apitest_simpledata")); 402 Xapian::Enquire enquire(db); 403 enquire.set_query(Xapian::Query("this")); 404 405 Xapian::ValueCountMatchSpy myspy1(1); 406 Xapian::ValueCountMatchSpy myspy2(1); 407 408 enquire.add_matchspy(&myspy1); 409 enquire.add_matchspy(&myspy2); 410 Xapian::MSet mymset = enquire.get_mset(0, 100); 411 TEST_EQUAL(mymset.size(), 6); 412 413 const std::map<std::string, Xapian::doccount> & vals1 = myspy1.get_values(); 414 const std::map<std::string, Xapian::doccount> & vals2 = myspy2.get_values(); 415 416 TEST_EQUAL(vals1.size(), 2); 417 TEST(vals1.find("h") != vals1.end()); 418 TEST(vals1.find("n") != vals1.end()); 419 TEST_EQUAL(vals1.find("h")->second, 5); 420 TEST_EQUAL(vals1.find("n")->second, 1); 421 422 TEST_EQUAL(vals2.size(), 2); 423 TEST(vals2.find("h") != vals2.end()); 424 TEST(vals2.find("n") != vals2.end()); 425 TEST_EQUAL(vals2.find("h")->second, 5); 426 TEST_EQUAL(vals2.find("n")->second, 1); 427 428 return true; 429 } -
xapian-core/tests/Makefile.am
Property changes on: xapian-core/tests/api_matchspy.cc ___________________________________________________________________ Added: svn:eol-style + native
115 115 api_collapse.cc \ 116 116 api_db.cc \ 117 117 api_generated.cc \ 118 api_matchspy.cc \ 118 119 api_metadata.cc \ 119 120 api_nodb.cc \ 120 121 api_opsynonym.cc \ -
xapian-core/include/xapian/enquire.h
4 4 /* Copyright 1999,2000,2001 BrightStation PLC 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 7 * Copyright 2009 Lemur Consulting Ltd 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 39 40 class Document; 40 41 class ErrorHandler; 41 42 class ExpandDecider; 43 class MatchSpy; 42 44 class MSetIterator; 43 45 class Query; 44 46 class Weight; … … 689 691 */ 690 692 const Xapian::Query & get_query() const; 691 693 694 /** Add a matchspy. 695 * 696 * This matchspy will be called with some of the documents which match 697 * the query, during the match process. Exactly which of the matching 698 * documents are passed to it depends on exactly when certain 699 * optimisations occur during the match process, but it can be 700 * controlled to some extent by setting the @a checkatleast parameter 701 * to @a get_mset(). 702 * 703 * In particular, if there are enough matching documents, at least the 704 * number specified by @a checkatleast will be passed to the matchspy. 705 * This means that you can force the matchspy to be shown all matching 706 * documents by setting @a checkatleast to the number of documents in 707 * the database. 708 * 709 * @param spy The MatchSpy object to add. The caller must 710 * ensure that this remains valid while the Enquire 711 * object remains active, or until @a 712 * clear_matchspies() is called. 713 */ 714 void add_matchspy(MatchSpy * spy); 715 716 /** Remove all the matchspies. 717 */ 718 void clear_matchspies(); 719 692 720 /** Set the weighting scheme to use for queries. 693 721 * 694 722 * @param weight_ the new weighting scheme. If no weighting scheme -
xapian-core/include/xapian/serialisationcontext.h
31 31 // Forward declarations. 32 32 class Weight; 33 33 class PostingSource; 34 class MatchSpy; 34 35 35 36 /** A context for serialisation. 36 37 * … … 92 93 */ 93 94 const Xapian::PostingSource * 94 95 get_posting_source(const std::string & name) const; 96 97 /// Register a user-defined match spy class. 98 void register_match_spy(const Xapian::MatchSpy &spy); 99 100 /** Get a match spy given a name. 101 * 102 * The returned match spy is owned by the context object. 103 * 104 * Returns NULL if the match spy could not be found. 105 */ 106 const Xapian::MatchSpy * 107 get_match_spy(const std::string & name) const; 95 108 }; 96 109 97 110 } -
xapian-core/include/xapian/matchspy.h
1 /** @file matchspy.h 2 * @brief MatchSpy implementation. 3 */ 4 /* Copyright (C) 2007,2008 Olly Betts 5 * Copyright (C) 2007,2009 Lemur Consulting Ltd 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22 #ifndef XAPIAN_INCLUDED_MATCHSPY_H 23 #define XAPIAN_INCLUDED_MATCHSPY_H 24 25 #include <xapian/enquire.h> 26 #include <xapian/visibility.h> 27 28 #include <string> 29 #include <map> 30 #include <set> 31 #include <string> 32 #include <vector> 33 34 namespace Xapian { 35 36 class Document; 37 class SerialisationContext; 38 39 /** Abstract base class for match spies. 40 * 41 * The subclasses will generally accumulate information seen during the match, 42 * to calculate aggregate functions, or other profiles of the matching 43 * documents. 44 */ 45 class XAPIAN_VISIBILITY_DEFAULT MatchSpy { 46 private: 47 /// Don't allow assignment. 48 void operator=(const MatchSpy &); 49 50 /// Don't allow copying. 51 MatchSpy(const MatchSpy &); 52 53 protected: 54 /// Default constructor, needed by subclass constructors. 55 MatchSpy() {} 56 57 public: 58 /** Virtual destructor, because we have virtual methods. */ 59 virtual ~MatchSpy(); 60 61 /** Register a document with the match spy. 
62 * 63 * This is called by the matcher once with each document seen by the 64 * matcher during the match process. Note that the matcher will often not 65 * see all the documents which match the query, due to optimisations which 66 * allow low-weighted documents to be skipped, and allow the match process 67 * to be terminated early. 68 * 69 * @param doc The document seen by the match spy. 70 * @param wt The weight of the document. 71 */ 72 virtual void operator()(const Xapian::Document &doc, 73 Xapian::weight wt) = 0; 74 75 /** Clone the match spy. 76 * 77 * The clone should inherit the configuration of the parent, but need not 78 * inherit the state. ie, the clone does not need to be passed 79 * information about the results seen by the parent. 80 * 81 * If you don't want to support the remote backend in your match spy, you 82 * can use the default implementation which simply throws 83 * Xapian::UnimplementedError. 84 * 85 * Note that the returned object will be deallocated by Xapian after use 86 * with "delete". It must therefore have been allocated with "new". 87 */ 88 virtual MatchSpy * clone() const; 89 90 /** Return the name of this match spy. 91 * 92 * This name is used by the remote backend. It is passed with the 93 * serialised parameters to the remote server so that it knows which class 94 * to create. 95 * 96 * Return the full namespace-qualified name of your class here - if your 97 * class is called MyApp::FooMatchSpy, return "MyApp::FooMatchSpy" from 98 * this method. 99 * 100 * If you don't want to support the remote backend in your match spy, you 101 * can use the default implementation which simply throws 102 * Xapian::UnimplementedError. 103 */ 104 virtual std::string name() const; 105 106 /** Return this object's parameters serialised as a single string. 107 * 108 * If you don't want to support the remote backend in your match spy, you 109 * can use the default implementation which simply throws 110 * Xapian::UnimplementedError. 
111 */ 112 virtual std::string serialise() const; 113 114 /** Unserialise parameters. 115 * 116 * This method unserialises parameters serialised by the @a serialise() 117 * method and allocates and returns a new object initialised with them. 118 * 119 * If you don't want to support the remote backend in your match spy, you 120 * can use the default implementation which simply throws 121 * Xapian::UnimplementedError. 122 * 123 * Note that the returned object will be deallocated by Xapian after use 124 * with "delete". It must therefore have been allocated with "new". 125 */ 126 virtual MatchSpy * unserialise(const std::string & s, 127 const SerialisationContext & context) const; 128 129 /** Serialise the results of this match spy. 130 * 131 * If you don't want to support the remote backend in your match spy, you 132 * can use the default implementation which simply throws 133 * Xapian::UnimplementedError. 134 */ 135 virtual std::string serialise_results() const; 136 137 /** Unserialise some results, and merge them into this matchspy. 138 * 139 * The order in which results are merged should not be significant, since 140 * this order is not specified (and will vary depending on the speed of 141 * the search in each sub-database). 142 * 143 * If you don't want to support the remote backend in your match spy, you 144 * can use the default implementation which simply throws 145 * Xapian::UnimplementedError. 146 */ 147 virtual void merge_results(const std::string & s); 148 149 /** Return a string describing this object. 150 * 151 * This default implementation returns a generic answer, to avoid forcing 152 * those deriving their own MatchSpy subclasses from having to implement 153 * this (they may not care what get_description() gives for their 154 * subclass). 155 */ 156 virtual std::string get_description() const; 157 }; 158 159 160 /** A string with a corresponding frequency. 
161 */ 162 struct XAPIAN_VISIBILITY_DEFAULT StringAndFrequency { 163 std::string str; 164 Xapian::doccount frequency; 165 StringAndFrequency(std::string str_, Xapian::doccount frequency_) 166 : str(str_), frequency(frequency_) {} 167 }; 168 169 170 /// Class for counting the frequencies of values in the matching documents. 171 class XAPIAN_VISIBILITY_DEFAULT ValueCountMatchSpy : public MatchSpy { 172 protected: 173 /// The slot to count. 174 Xapian::valueno slot; 175 176 /// Total number of documents seen by the match spy. 177 Xapian::doccount total; 178 179 /// The values seen so far, together with their frequency. 180 std::map<std::string, Xapian::doccount> values; 181 182 public: 183 /// Construct an empty ValueCountMatchSpy. 184 ValueCountMatchSpy() : slot(-1), total(0) {} 185 186 /** Construct a MatchSpy which counts the values in a particular slot. 187 * 188 * @param slot_ The value slot to count. 189 */ 190 ValueCountMatchSpy(Xapian::valueno slot_) 191 : slot(slot_), total(0) { 192 } 193 194 /// Return the values seen in the slot. 195 const std::map<std::string, Xapian::doccount> & get_values() const { 196 return values; 197 } 198 199 /** Return the total number of documents tallied. */ 200 size_t get_total() const { 201 return total; 202 } 203 204 /** Get the most frequent values in the slot. 205 * 206 * @param result A vector which will be filled with the most frequent 207 * values, in descending order of frequency. Values with 208 * the same frequency will be sorted in ascending 209 * alphabetical order. 210 * 211 * @param maxvalues The maximum number of values to return. 212 */ 213 void get_top_values(std::vector<StringAndFrequency> & result, 214 size_t maxvalues) const; 215 216 /** Implementation of virtual operator(). 217 * 218 * This implementation tallies values for a matching document. 
219 */ 220 void operator()(const Xapian::Document &doc, Xapian::weight wt); 221 222 virtual MatchSpy * clone() const; 223 virtual std::string name() const; 224 virtual std::string serialise() const; 225 virtual MatchSpy * unserialise(const std::string & s, 226 const SerialisationContext & context) const; 227 virtual std::string serialise_results() const; 228 virtual void merge_results(const std::string & s); 229 virtual std::string get_description() const; 230 }; 231 232 233 /** A numeric range. 234 * 235 * This is used to represent ranges of values returned by the match spies. 236 */ 237 struct XAPIAN_VISIBILITY_DEFAULT NumericRange { 238 /// The lower value in the range. 239 double lower; 240 241 /// The upper value in the range. 242 double upper; 243 244 /// Compare NumericRanges, lower first, then upper if equal. 245 bool operator<(const NumericRange & other) const { 246 if (lower < other.lower) return true; 247 if (lower > other.lower) return false; 248 return (upper < other.upper); 249 } 250 }; 251 252 253 /** Return a score reflecting how evenly divided a set of values is. 254 * 255 * If you don't want to show a poor categorisation, or have multiple 256 * categories and only space in your user interface to show a few, you want to 257 * be able to decide how "good" a categorisation is. One definition of "good" 258 * is that it offers a fairly even split of the available values, and 259 * (optionally) about a specified number of options. 260 * 261 * @param values The values making up the categorisation, together with their 262 * frequencies. 263 * 264 * @param total The total number of documents seen. 265 * 266 * @param desired_no_of_categories The desired number of categories - this is 267 * a floating point value, so you can ask for 5.5 if you'd like "about 5 or 6 268 * categories". The default is to desire the number of categories that there 269 * actually are, so the score then only reflects how even the split is. 
270 271 * @return A score for the categorisation for the value - lower is better, 272 * with a perfectly even split across the right number of categories scoring 273 * 0. 274 */ 275 //@{ 276 double XAPIAN_VISIBILITY_DEFAULT score_evenness( 277 const std::map<std::string, Xapian::doccount> & values, 278 Xapian::doccount total, 279 double desired_no_of_categories = 0.0); 280 double XAPIAN_VISIBILITY_DEFAULT score_evenness( 281 const std::map<Xapian::NumericRange, Xapian::doccount> & values, 282 Xapian::doccount total, 283 double desired_no_of_categories = 0.0); 284 double XAPIAN_VISIBILITY_DEFAULT score_evenness( 285 const ValueCountMatchSpy & spy, 286 double desired_no_of_categories = 0.0); 287 //@} 288 289 290 /** Turn a category containing sort-encoded numeric values into a set of 291 * ranges. 292 * 293 * For "continuous" values (such as price, height, weight, etc), there will 294 * usually be too many different values to offer the user, and the user won't 295 * want to restrict to an exact value anyway. 296 * 297 * This method produces a set of NumericRange objects for a particular value 298 * number. 299 * 300 * @param result Used to return the resulting ranges. 301 * @param values The values representing the initial numbers. 302 * @param max_ranges Group into at most this many ranges. 303 * 304 * @return The number of values seen. 305 */ 306 doccount XAPIAN_VISIBILITY_DEFAULT build_numeric_ranges( 307 std::map<Xapian::NumericRange, Xapian::doccount> & result, 308 const std::map<std::string, Xapian::doccount> & values, 309 size_t max_ranges); 310 311 } 312 313 #endif // XAPIAN_INCLUDED_MATCHSPY_H -
xapian-core/include/Makefile.mk
Property changes on: xapian-core/include/xapian/matchspy.h ___________________________________________________________________ Added: svn:eol-style + native
19 19 include/xapian/enquire.h\ 20 20 include/xapian/errorhandler.h\ 21 21 include/xapian/expanddecider.h\ 22 include/xapian/matchspy.h\ 22 23 include/xapian/positioniterator.h\ 23 24 include/xapian/postingiterator.h\ 24 25 include/xapian/postingsource.h\ -
xapian-core/include/xapian.h
45 45 // Searching 46 46 #include <xapian/enquire.h> 47 47 #include <xapian/expanddecider.h> 48 #include <xapian/matchspy.h> 48 49 #include <xapian/postingsource.h> 49 50 #include <xapian/query.h> 50 51 #include <xapian/queryparser.h> -
xapian-core/net/serialise.cc
203 203 const map<string, Xapian::MSet::Internal::TermFreqAndWeight> &termfreqandwts 204 204 = mset.internal->termfreqandwts; 205 205 206 result += encode_length(termfreqandwts.size()); 206 207 map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator j; 207 208 for (j = termfreqandwts.begin(); j != termfreqandwts.end(); ++j) { 208 209 result += encode_length(j->first.size()); … … 215 216 } 216 217 217 218 Xapian::MSet 218 unserialise_mset(const string &s)219 unserialise_mset(const char ** p, const char * p_end) 219 220 { 220 const char * p = s.data(); 221 const char * p_end = p + s.size(); 221 Xapian::doccount firstitem = decode_length(p, p_end, false); 222 Xapian::doccount matches_lower_bound = decode_length(p, p_end, false); 223 Xapian::doccount matches_estimated = decode_length(p, p_end, false); 224 Xapian::doccount matches_upper_bound = decode_length(p, p_end, false); 225 Xapian::doccount uncollapsed_lower_bound = decode_length(p, p_end, false); 226 Xapian::doccount uncollapsed_estimated = decode_length(p, p_end, false); 227 Xapian::doccount uncollapsed_upper_bound = decode_length(p, p_end, false); 228 Xapian::weight max_possible = unserialise_double(p, p_end); 229 Xapian::weight max_attained = unserialise_double(p, p_end); 222 230 223 Xapian::doccount firstitem = decode_length(&p, p_end, false); 224 Xapian::doccount matches_lower_bound = decode_length(&p, p_end, false); 225 Xapian::doccount matches_estimated = decode_length(&p, p_end, false); 226 Xapian::doccount matches_upper_bound = decode_length(&p, p_end, false); 227 Xapian::doccount uncollapsed_lower_bound = decode_length(&p, p_end, false); 228 Xapian::doccount uncollapsed_estimated = decode_length(&p, p_end, false); 229 Xapian::doccount uncollapsed_upper_bound = decode_length(&p, p_end, false); 230 Xapian::weight max_possible = unserialise_double(&p, p_end); 231 Xapian::weight max_attained = unserialise_double(&p, p_end); 231 double percent_factor = unserialise_double(p, p_end); 232 232 233 
double percent_factor = unserialise_double(&p, p_end);234 235 233 vector<Xapian::Internal::MSetItem> items; 236 size_t msize = decode_length( &p, p_end, false);234 size_t msize = decode_length(p, p_end, false); 237 235 while (msize-- > 0) { 238 Xapian::weight wt = unserialise_double( &p, p_end);239 Xapian::docid did = decode_length( &p, p_end, false);240 size_t len = decode_length( &p, p_end, true);241 string key( p, len);242 p += len;236 Xapian::weight wt = unserialise_double(p, p_end); 237 Xapian::docid did = decode_length(p, p_end, false); 238 size_t len = decode_length(p, p_end, true); 239 string key(*p, len); 240 *p += len; 243 241 items.push_back(Xapian::Internal::MSetItem(wt, did, key, 244 decode_length( &p, p_end, false)));242 decode_length(p, p_end, false))); 245 243 } 246 244 245 size_t terminfosize = decode_length(p, p_end, false); 247 246 map<string, Xapian::MSet::Internal::TermFreqAndWeight> terminfo; 248 while ( p != p_end) {247 while (terminfosize-- > 0) { 249 248 Xapian::MSet::Internal::TermFreqAndWeight tfaw; 250 size_t len = decode_length( &p, p_end, true);251 string term( p, len);252 p += len;253 tfaw.termfreq = decode_length( &p, p_end, false);254 tfaw.termweight = unserialise_double( &p, p_end);249 size_t len = decode_length(p, p_end, true); 250 string term(*p, len); 251 *p += len; 252 tfaw.termfreq = decode_length(p, p_end, false); 253 tfaw.termweight = unserialise_double(p, p_end); 255 254 terminfo.insert(make_pair(term, tfaw)); 256 255 } 257 256 -
xapian-core/net/remoteserver.cc
25 25 #include "xapian/database.h" 26 26 #include "xapian/enquire.h" 27 27 #include "xapian/error.h" 28 #include "xapian/matchspy.h" 28 29 #include "xapian/valueiterator.h" 29 30 30 31 #include "safeerrno.h" … … 354 355 send_message(REPLY_UPDATE, message); 355 356 } 356 357 358 /** Structure holding a list of match spies. 359 * 360 * The main reason for the existence of this structure is to make it easy to 361 * ensure that the match spies are all deleted after use. 362 */ 363 struct MatchSpyList { 364 vector<Xapian::MatchSpy *> spies; 365 366 ~MatchSpyList() { 367 vector<Xapian::MatchSpy *>::const_iterator i; 368 for (i = spies.begin(); i != spies.end(); ++i) { 369 delete *i; 370 } 371 } 372 }; 373 357 374 void 358 375 RemoteServer::msg_query(const string &message_in) 359 376 { … … 405 422 406 423 // Unserialise the Weight object. 407 424 len = decode_length(&p, p_end, true); 408 const Xapian::Weight * wttype = ctx.get_weighting_scheme(string(p, len)); 425 string wtname(p, len); 426 p += len; 427 428 const Xapian::Weight * wttype = ctx.get_weighting_scheme(wtname); 409 429 if (wttype == NULL) { 410 430 // Note: user weighting schemes should be registered by adding them to 411 431 // a SerialisationContext, and setting the context using 412 432 // RemoteServer::set_context(). 413 433 throw Xapian::InvalidArgumentError("Weighting scheme " + 414 string(p, len)+ " not registered");434 wtname + " not registered"); 415 435 } 416 p += len;417 436 418 437 len = decode_length(&p, p_end, true); 419 438 AutoPtr<Xapian::Weight> wt(wttype->unserialise(string(p, len))); 420 439 p += len; 421 440 422 441 // Unserialise the RSet object. 423 Xapian::RSet rset = unserialise_rset(string(p, p_end - p)); 442 len = decode_length(&p, p_end, true); 443 Xapian::RSet rset = unserialise_rset(string(p, len)); 444 p += len; 424 445 446 // Unserialise the MatchSpy objects. 
447 vector<Xapian::MatchSpy *>::size_type spycount = decode_length(&p, p_end, false); 448 MatchSpyList matchspies; 449 while (spycount != 0) { 450 len = decode_length(&p, p_end, true); 451 string spytype(p, len); 452 const Xapian::MatchSpy * spyclass = ctx.get_match_spy(spytype); 453 if (spyclass == NULL) { 454 throw Xapian::InvalidArgumentError("Match spy " + spytype + 455 " not registered"); 456 } 457 p += len; 458 459 len = decode_length(&p, p_end, true); 460 matchspies.spies.push_back(spyclass->unserialise(string(p, len), ctx)); 461 p += len; 462 463 --spycount; 464 } 465 425 466 Xapian::Weight::Internal local_stats; 426 467 MultiMatch match(*db, query.get(), qlen, &rset, collapse_max, collapse_key, 427 468 percent_cutoff, weight_cutoff, order, 428 469 sort_key, sort_by, sort_value_forward, NULL, 429 NULL, local_stats, wt.get() );470 NULL, local_stats, wt.get(), matchspies.spies); 430 471 431 472 send_message(REPLY_STATS, serialise_stats(local_stats)); 432 473 … … 448 489 Xapian::MSet mset; 449 490 match.get_mset(first, maxitems, check_at_least, mset, total_stats, 0, 0); 450 491 451 send_message(REPLY_RESULTS, serialise_mset(mset)); 492 message = serialise_mset(mset); 493 494 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.spies.begin(); 495 i != matchspies.spies.end(); ++i) { 496 string spy_results = (*i)->serialise_results(); 497 message += encode_length(spy_results.size()); 498 message += spy_results; 499 } 500 send_message(REPLY_RESULTS, message); 452 501 } 453 502 454 503 void -
xapian-core/common/omenquireinternal.h
3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2001,2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 6 * Copyright 2009 Lemur Consulting Ltd 6 7 * 7 8 * This program is free software; you can redistribute it and/or 8 9 * modify it under the terms of the GNU General Public License as … … 159 160 */ 160 161 mutable Weight * weight; 161 162 163 vector<MatchSpy *> spies; 164 162 165 Internal(const Xapian::Database &databases, ErrorHandler * errorhandler_); 163 166 ~Internal(); 164 167 … … 174 177 const Query & get_query(); 175 178 MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, 176 179 Xapian::doccount check_at_least, 177 const RSet *omrset, const MatchDecider *mdecider, 178 const MatchDecider *matchspy) const; 180 const RSet *omrset, 181 const MatchDecider *mdecider, 182 const MatchDecider *matchspy_legacy) const; 179 183 ESet get_eset(Xapian::termcount maxitems, const RSet & omrset, int flags, 180 184 double k, const ExpandDecider *edecider) const; 181 185 -
xapian-core/common/multimatch.h
2 2 * 3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2002,2003,2004,2005,2006,2007,2009 Olly Betts 5 * Copyright 2009 Lemur Consulting Ltd 5 6 * 6 7 * This program is free software; you can redistribute it and/or 7 8 * modify it under the terms of the GNU General Public License as … … 71 72 /** Is each sub-database remote? */ 72 73 vector<bool> is_remote; 73 74 75 /// The matchspies to use. 76 const vector<Xapian::MatchSpy *> & matchspies; 77 74 78 /** get the maxweight that the postlist pl may return, calling 75 79 * recalc_maxweight if recalculate_w_max is set, and unsetting it. 76 80 * Must only be called on the top of the postlist tree. … … 110 114 const Xapian::Sorter * sorter_, 111 115 Xapian::ErrorHandler * errorhandler, 112 116 Xapian::Weight::Internal & stats, 113 const Xapian::Weight *wtscheme); 117 const Xapian::Weight *wtscheme, 118 const vector<Xapian::MatchSpy *> & matchspies_); 114 119 115 120 void get_mset(Xapian::doccount first, 116 121 Xapian::doccount maxitems, … … 118 123 Xapian::MSet & mset, 119 124 const Xapian::Weight::Internal & stats, 120 125 const Xapian::MatchDecider * mdecider, 121 const Xapian::MatchDecider * matchspy );126 const Xapian::MatchDecider * matchspy_legacy); 122 127 123 128 /** Called by postlists to indicate that they've rearranged themselves 124 129 * and the maxweight now possible is smaller. -
xapian-core/common/remote-database.h
143 143 * @param weight_cutoff Weight cutoff. 144 144 * @param wtscheme Weighting scheme. 145 145 * @param omrset The rset. 146 * @param matchspies The matchspies to use. Empty if none. 146 147 */ 147 148 void set_query(const Xapian::Query::Internal *query, 148 149 Xapian::termcount qlen, … … 154 155 bool sort_value_forward, 155 156 int percent_cutoff, Xapian::weight weight_cutoff, 156 157 const Xapian::Weight *wtscheme, 157 const Xapian::RSet &omrset); 158 const Xapian::RSet &omrset, 159 const vector<Xapian::MatchSpy *> & matchspies); 158 160 159 161 /** Get the stats from the remote server. 160 162 * … … 169 171 const Xapian::Weight::Internal &stats); 170 172 171 173 /// Get the MSet from the remote server. 172 void get_mset(Xapian::MSet &mset); 174 void get_mset(Xapian::MSet &mset, 175 const vector<Xapian::MatchSpy *> & matchspies); 173 176 174 177 /// Get remote termlist. 175 178 TermList * open_term_list(Xapian::docid did) const; -
xapian-core/common/remoteprotocol.h
41 41 // 30.6: Support for OP_VALUE_GE and OP_VALUE_LE in query serialisation 42 42 // 31: Clean up for Xapian 1.1.0 43 43 // 32: Serialise termfreq and reltermfreqs together in serialise_stats. 44 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 32 44 // 33: Support for passing matchspies over the remote connection. 45 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 33 45 46 #define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 0 46 47 47 48 /** Message types (client -> server). -
xapian-core/common/serialisationcontextinternal.h
31 31 namespace Xapian { 32 32 class Weight; 33 33 class PostingSource; 34 class MatchSpy; 34 35 } 35 36 36 37 class Xapian::SerialisationContext::Internal … … 41 42 /// Registered external posting sources. 42 43 std::map<std::string, Xapian::PostingSource *> postingsources; 43 44 45 /// Registered match spies. 46 std::map<std::string, Xapian::MatchSpy *> matchspies; 47 44 48 /// Add the standard default weighting schemes and posting sources. 45 49 void add_defaults(); 46 50 … … 50 54 /// Clear all registered posting sources from the context. 51 55 void clear_posting_sources(); 52 56 57 /// Clear all registered match spies from the context. 58 void clear_match_spies(); 59 53 60 public: 54 61 Internal(); 55 62 ~Internal(); … … 77 84 */ 78 85 const Xapian::PostingSource * 79 86 get_posting_source(const std::string & name) const; 87 88 /// Register a user-defined match spy class. 89 void register_match_spy(const Xapian::MatchSpy &spy); 90 91 /** Get a match spy given a name. 92 * 93 * The returned match spy is owned by the context object. 94 * 95 * Returns NULL if the match spy could not be found. 96 */ 97 const Xapian::MatchSpy * 98 get_match_spy(const std::string & name) const; 99 80 100 }; 81 101 82 102 #endif // XAPIAN_INCLUDED_SERIALISATIONCONTEXTINTERNAL_H -
xapian-core/common/serialise.h
132 132 133 133 /** Unserialise a serialised Xapian::MSet object. 134 134 * 135 * @param s The string to unserialise. 135 * @param p Pointer to pointer to start of the string to unserialise. 136 * @param p_end Pointer to end of the string to unserialise. 136 137 * 137 138 * @return The unserialised Xapian::MSet object. 138 139 */ 139 Xapian::MSet unserialise_mset(const std::string &s);140 Xapian::MSet unserialise_mset(const char ** p, const char * p_end); 140 141 141 142 /** Serialise a Xapian::RSet object. 142 143 * -
xapian-core/api/Makefile.mk
16 16 api/errorhandler.cc\ 17 17 api/expanddecider.cc\ 18 18 api/leafpostlist.cc\ 19 api/matchspy.cc\ 19 20 api/omdatabase.cc\ 20 21 api/omdocument.cc\ 21 22 api/omenquire.cc\ -
xapian-core/api/omenquire.cc
3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2001,2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 6 * Copyright 2007 Lemur Consulting Ltd6 * Copyright 2007,2009 Lemur Consulting Ltd 7 7 * 8 8 * This program is free software; you can redistribute it and/or 9 9 * modify it under the terms of the GNU General Public License as … … 642 642 Enquire::Internal::get_mset(Xapian::doccount first, Xapian::doccount maxitems, 643 643 Xapian::doccount check_at_least, const RSet *rset, 644 644 const MatchDecider *mdecider, 645 const MatchDecider *matchspy ) const645 const MatchDecider *matchspy_legacy) const 646 646 { 647 647 DEBUGCALL(API, MSet, "Enquire::Internal::get_mset", first << ", " << 648 648 maxitems << ", " << check_at_least << ", " << rset << ", " << 649 mdecider << ", " << matchspy );649 mdecider << ", " << matchspy_legacy); 650 650 651 651 if (percent_cutoff && (sort_by == VAL || sort_by == VAL_REL)) { 652 652 throw Xapian::UnimplementedError("Use of a percentage cutoff while sorting primary by value isn't currently supported"); … … 661 661 collapse_max, collapse_key, 662 662 percent_cutoff, weight_cutoff, 663 663 order, sort_key, sort_by, sort_value_forward, sorter, 664 errorhandler, stats, weight );664 errorhandler, stats, weight, spies); 665 665 // Run query and put results into supplied Xapian::MSet object. 
666 666 MSet retval; 667 667 match.get_mset(first, maxitems, check_at_least, retval, 668 stats, mdecider, matchspy );668 stats, mdecider, matchspy_legacy); 669 669 670 670 Assert(weight->name() != "bool" || retval.get_max_possible() == 0); 671 671 … … 893 893 } 894 894 895 895 void 896 Enquire::add_matchspy(MatchSpy * spy) { 897 DEBUGAPICALL(void, "Xapian::Enquire::add_matchspy", spy); 898 internal->spies.push_back(spy); 899 } 900 901 void 902 Enquire::clear_matchspies() { 903 DEBUGAPICALL(void, "Xapian::Enquire::clear_matchspies", ""); 904 internal->spies.clear(); 905 } 906 907 void 896 908 Enquire::set_weighting_scheme(const Weight &weight_) 897 909 { 898 910 DEBUGAPICALL(void, "Xapian::Enquire::set_weighting_scheme", "[Weight]"); -
xapian-core/api/serialisationcontext.cc
24 24 #include "xapian/serialisationcontext.h" 25 25 26 26 #include "xapian/error.h" 27 #include "xapian/matchspy.h" 27 28 #include "xapian/postingsource.h" 28 29 #include "xapian/weight.h" 29 30 … … 78 79 RETURN(internal->get_weighting_scheme(name)); 79 80 } 80 81 81 82 82 void 83 83 SerialisationContext::register_posting_source(const Xapian::PostingSource &source) 84 84 { … … 93 93 RETURN(internal->get_posting_source(name)); 94 94 } 95 95 96 void 97 SerialisationContext::register_match_spy(const Xapian::MatchSpy &spy) 98 { 99 LOGCALL_VOID(API, "Xapian::SerialisationContext::register_match_spy", spy.name()); 100 internal->register_match_spy(spy); 101 } 96 102 103 const Xapian::MatchSpy * 104 SerialisationContext::get_match_spy(const string & name) const 105 { 106 LOGCALL(API, const Xapian::MatchSpy *, "Xapian::SerialisationContext::get_match_spy", name); 107 RETURN(internal->get_match_spy(name)); 108 } 109 110 97 111 SerialisationContext::Internal::Internal() 98 112 : Xapian::Internal::RefCntBase(), 99 113 wtschemes(), … … 106 120 { 107 121 clear_weighting_schemes(); 108 122 clear_posting_sources(); 123 clear_match_spies(); 109 124 } 110 125 111 126 void … … 128 143 postingsources[source->name()] = source; 129 144 source = new Xapian::FixedWeightPostingSource(0.0); 130 145 postingsources[source->name()] = source; 146 147 Xapian::MatchSpy * spy; 148 spy = new Xapian::ValueCountMatchSpy(); 149 matchspies[spy->name()] = spy; 131 150 } 132 151 133 152 void … … 149 168 } 150 169 151 170 void 171 SerialisationContext::Internal::clear_match_spies() 172 { 173 map<string, Xapian::MatchSpy *>::const_iterator i; 174 for (i = matchspies.begin(); i != matchspies.end(); ++i) { 175 delete i->second; 176 } 177 } 178 179 void 152 180 SerialisationContext::Internal::register_weighting_scheme(const Xapian::Weight &wt) 153 181 { 154 182 string wtname = wt.name(); … … 220 248 return i->second; 221 249 } 222 250 251 void 252 SerialisationContext::Internal::register_match_spy(const 
Xapian::MatchSpy &spy) 253 { 254 string spyname = spy.name(); 255 if (spyname.empty()) { 256 throw Xapian::InvalidOperationError("Unable to register match spy - name() method returns empty string."); 257 } 258 259 map<string, Xapian::MatchSpy *>::const_iterator i; 260 i = matchspies.find(spyname); 261 if (i != matchspies.end()) { 262 delete i->second; 263 } 264 265 Xapian::MatchSpy * spyclone = spy.clone(); 266 if (!spyclone) { 267 matchspies.erase(spyname); 268 throw Xapian::InvalidOperationError("Unable to register match spy - clone() method returns NULL."); 269 } 270 try { 271 matchspies[spyname] = spyclone; 272 } catch(...) { 273 delete spyclone; 274 matchspies.erase(spyname); 275 throw; 276 } 223 277 } 278 279 const Xapian::MatchSpy * 280 SerialisationContext::Internal::get_match_spy(const string & name) const 281 { 282 map<string, Xapian::MatchSpy *>::const_iterator i; 283 i = matchspies.find(name); 284 if (i == matchspies.end()) { 285 return NULL; 286 } 287 return i->second; 288 } 289 290 } -
xapian-core/api/matchspy.cc
1 /** @file matchspy.cc 2 * @brief MatchSpy implementation. 3 */ 4 /* Copyright (C) 2007,2008,2009 Olly Betts 5 * Copyright (C) 2007,2009 Lemur Consulting Ltd 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22 #include <config.h> 23 #include <xapian/matchspy.h> 24 25 #include <xapian/document.h> 26 #include <xapian/error.h> 27 #include <xapian/queryparser.h> 28 #include <xapian/serialisationcontext.h> 29 30 #include <map> 31 #include <string> 32 #include <vector> 33 34 #include "autoptr.h" 35 #include "debuglog.h" 36 #include "omassert.h" 37 #include "serialise.h" 38 #include "stringutils.h" 39 #include "str.h" 40 41 #include <float.h> 42 #include <math.h> 43 44 45 using namespace std; 46 47 namespace Xapian { 48 49 MatchSpy::~MatchSpy() {} 50 51 MatchSpy * 52 MatchSpy::clone() const { 53 throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented"); 54 } 55 56 string 57 MatchSpy::name() const { 58 throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented"); 59 } 60 61 string 62 MatchSpy::serialise() const { 63 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented"); 64 } 65 66 MatchSpy * 67 MatchSpy::unserialise(const string 
&, const SerialisationContext &) const { 68 throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented"); 69 } 70 71 string 72 MatchSpy::serialise_results() const { 73 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented"); 74 } 75 76 void 77 MatchSpy::merge_results(const string &) { 78 throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented"); 79 } 80 81 string 82 MatchSpy::get_description() const { 83 return "Xapian::MatchSpy()"; 84 } 85 86 87 /** Compare two StringAndFrequency objects. 88 * 89 * The comparison is firstly by frequency (higher is better), then by string 90 * (earlier lexicographic sort is better). 91 */ 92 class StringAndFreqCmpByFreq { 93 public: 94 /// Default constructor 95 StringAndFreqCmpByFreq() {} 96 97 /// Return true if a has a higher frequency than b. 98 /// If equal, compare by the str, to provide a stable sort order. 99 bool operator()(const StringAndFrequency &a, 100 const StringAndFrequency &b) const { 101 if (a.frequency > b.frequency) return true; 102 if (a.frequency < b.frequency) return false; 103 if (a.str > b.str) return false; 104 return true; 105 } 106 }; 107 108 109 /** Get the most frequent items from a map from string to frequency. 110 * 111 * This takes input such as that returned by @a 112 * ValueCountMatchSpy::get_values(), and returns a vector of the most 113 * frequent items in the input. 114 * 115 * @param result A vector which will be filled with the most frequent 116 * items, in descending order of frequency. Items with 117 * the same frequency will be sorted in ascending 118 * alphabetical order. 119 * 120 * @param items The map from string to frequency, from which the most 121 * frequent items will be selected. 122 * 123 * @param maxitems The maximum number of items to return. 
124 */ 125 static void 126 get_most_frequent_items(vector<StringAndFrequency> & result, 127 const map<string, doccount> & items, 128 size_t maxitems) 129 { 130 result.clear(); 131 result.reserve(maxitems); 132 StringAndFreqCmpByFreq cmpfn; 133 bool is_heap(false); 134 135 for (map<string, doccount>::const_iterator i = items.begin(); 136 i != items.end(); i++) { 137 Assert(result.size() <= maxitems); 138 result.push_back(StringAndFrequency(i->first, i->second)); 139 if (result.size() > maxitems) { 140 // Make the list back into a heap. 141 if (is_heap) { 142 // Only the new element isn't in the right place. 143 push_heap(result.begin(), result.end(), cmpfn); 144 } else { 145 // Need to build heap from scratch. 146 make_heap(result.begin(), result.end(), cmpfn); 147 is_heap = true; 148 } 149 pop_heap(result.begin(), result.end(), cmpfn); 150 result.pop_back(); 151 } 152 } 153 154 if (is_heap) { 155 sort_heap(result.begin(), result.end(), cmpfn); 156 } else { 157 sort(result.begin(), result.end(), cmpfn); 158 } 159 } 160 161 162 void 163 ValueCountMatchSpy::operator()(const Document &doc, weight) { 164 ++total; 165 string val(doc.get_value(slot)); 166 if (!val.empty()) ++values[val]; 167 } 168 169 void 170 ValueCountMatchSpy::get_top_values(vector<StringAndFrequency> & result, 171 size_t maxvalues) const 172 { 173 get_most_frequent_items(result, values, maxvalues); 174 } 175 176 MatchSpy * 177 ValueCountMatchSpy::clone() const { 178 return new ValueCountMatchSpy(slot); 179 } 180 181 string 182 ValueCountMatchSpy::name() const { 183 return "Xapian::ValueCountMatchSpy"; 184 } 185 186 string 187 ValueCountMatchSpy::serialise() const { 188 string result; 189 result += encode_length(slot); 190 return result; 191 } 192 193 MatchSpy * 194 ValueCountMatchSpy::unserialise(const string & s, 195 const SerialisationContext &) const{ 196 const char * p = s.data(); 197 const char * end = p + s.size(); 198 199 valueno new_slot = decode_length(&p, end, false); 200 if (p != end) { 201 
throw NetworkError("Junk at end of serialised ValueCountMatchSpy"); 202 } 203 204 return new ValueCountMatchSpy(new_slot); 205 } 206 207 string 208 ValueCountMatchSpy::serialise_results() const { 209 LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", ""); 210 string result; 211 result += encode_length(total); 212 result += encode_length(values.size()); 213 for (map<string, doccount>::const_iterator i = values.begin(); 214 i != values.end(); ++i) { 215 result += encode_length(i->first.size()); 216 result += i->first; 217 result += encode_length(i->second); 218 } 219 RETURN(result); 220 } 221 222 void 223 ValueCountMatchSpy::merge_results(const string & s) { 224 LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s); 225 const char * p = s.data(); 226 const char * end = p + s.size(); 227 228 total += decode_length(&p, end, false); 229 230 map<string, doccount>::size_type items = decode_length(&p, end, false); 231 while (p != end) { 232 while(items != 0) { 233 size_t vallen = decode_length(&p, end, true); 234 string val(p, vallen); 235 p += vallen; 236 doccount freq = decode_length(&p, end, false); 237 values[val] += freq; 238 --items; 239 } 240 } 241 } 242 243 string 244 ValueCountMatchSpy::get_description() const { 245 return "Xapian::ValueCountMatchSpy(" + str(total) + 246 " docs seen, looking in " + str(values.size()) + " slots)"; 247 } 248 249 250 inline double sqrd(double x) { return x * x; } 251 252 /** Calculate a score based on how evenly distributed the frequencies of a map 253 * of values are. 
254 */ 255 template<class T> double 256 do_score_evenness(const map<T, doccount> & values, 257 doccount total, 258 double desired_no_of_categories) 259 { 260 if (total == 0) return 0.0; 261 262 size_t total_unset = total; 263 double score = 0.0; 264 265 if (desired_no_of_categories <= 0.0) 266 desired_no_of_categories = values.size(); 267 268 double avg = double(total) / desired_no_of_categories; 269 270 typename map<T, doccount>::const_iterator i; 271 for (i = values.begin(); i != values.end(); ++i) { 272 size_t count = i->second; 273 total_unset -= count; 274 score += sqrd(count - avg); 275 } 276 if (total_unset) score += sqrd(total_unset - avg); 277 278 // Scale down so the total number of items doesn't make a difference. 279 score /= sqrd(total); 280 281 // Bias towards returning the number of categories requested. 282 score += 0.01 * sqrd(desired_no_of_categories - values.size()); 283 284 return score; 285 } 286 287 double score_evenness(const map<string, doccount> & values, 288 doccount total, 289 double desired_no_of_categories) { 290 return do_score_evenness(values, total, desired_no_of_categories); 291 } 292 293 double score_evenness(const map<NumericRange, doccount> & values, 294 doccount total, 295 double desired_no_of_categories) { 296 return do_score_evenness(values, total, desired_no_of_categories); 297 } 298 299 double score_evenness(const ValueCountMatchSpy & spy, 300 double desired_no_of_categories) { 301 return do_score_evenness(spy.get_values(), spy.get_total(), 302 desired_no_of_categories); 303 } 304 305 306 /** A bucket, used when building numeric ranges. 
307 */ 308 struct bucketval { 309 size_t count; 310 double min, max; 311 312 bucketval() : count(0), min(DBL_MAX), max(-DBL_MAX) { } 313 314 void update(size_t n, double value) { 315 count += n; 316 if (value < min) min = value; 317 if (value > max) max = value; 318 } 319 }; 320 321 doccount build_numeric_ranges(map<NumericRange, doccount> & result, 322 const map<string, doccount> & values, 323 size_t max_ranges) 324 { 325 double lo = DBL_MAX, hi = -DBL_MAX; 326 result.clear(); 327 328 map<double, doccount> histo; 329 doccount total_set = 0; 330 map<string, doccount>::const_iterator i; 331 for (i = values.begin(); i != values.end(); ++i) { 332 if (i->first.size() == 0) continue; 333 double v = sortable_unserialise(i->first.c_str()); 334 if (v < lo) lo = v; 335 if (v > hi) hi = v; 336 doccount count = i->second; 337 histo[v] = count; 338 total_set += count; 339 } 340 341 if (total_set == 0) { 342 // No set values. 343 return total_set; 344 } 345 if (lo == hi) { 346 // All set values are the same. 347 NumericRange range; 348 range.lower = lo; 349 range.upper = hi; 350 result[range] = total_set; 351 return total_set; 352 } 353 354 double sizeby = max(fabs(hi), fabs(lo)); 355 // E.g. if sizeby = 27.4 and max_ranges = 7, we want to split into units of 356 // width 1.0 which we may then coalesce if there are too many used buckets. 357 double unit = pow(10.0, floor(log10(sizeby / max_ranges) - 0.2)); 358 double start = floor(lo / unit) * unit; 359 // Can happen due to FP rounding (e.g. lo = 11.95, unit = 0.01). 
360 if (start > lo) start = lo; 361 size_t n_buckets = size_t(ceil(hi / unit) - floor(lo / unit)); 362 363 bool scaleby2 = true; 364 vector<bucketval> bucket(n_buckets + 1); 365 while (true) { 366 size_t n_used = 0; 367 map<double, doccount>::const_iterator j; 368 for (j = histo.begin(); j != histo.end(); ++j) { 369 double v = j->first; 370 size_t b = size_t(floor((v - start) / unit)); 371 if (b > n_buckets) b = n_buckets; // FIXME - Hacky workaround to ensure that b is in range. 372 if (bucket[b].count == 0) ++n_used; 373 bucket[b].update(j->second, v); 374 } 375 376 if (n_used <= max_ranges) break; 377 378 unit *= scaleby2 ? 2.0 : 2.5; 379 scaleby2 = !scaleby2; 380 start = floor(lo / unit) * unit; 381 // Can happen due to FP rounding (e.g. lo = 11.95, unit = 0.01). 382 if (start > lo) start = lo; 383 n_buckets = size_t(ceil(hi / unit) - floor(lo / unit)); 384 bucket.resize(0); 385 bucket.resize(n_buckets + 1); 386 } 387 388 map<string, doccount> discrete_categories; 389 for (size_t b = 0; b < bucket.size(); ++b) { 390 if (bucket[b].count == 0) continue; 391 NumericRange range; 392 range.lower = bucket[b].min; 393 range.upper = bucket[b].max; 394 result[range] = bucket[b].count; 395 } 396 397 return total_set; 398 } 399 400 } -
xapian-core/backends/remote/remote-database.cc
Property changes on: xapian-core/api/matchspy.cc ___________________________________________________________________ Added: svn:eol-style + native
43 43 #include <vector> 44 44 45 45 #include "xapian/error.h" 46 #include "xapian/matchspy.h" 46 47 47 48 using namespace std; 48 49 … … 536 537 bool sort_value_forward, 537 538 int percent_cutoff, Xapian::weight weight_cutoff, 538 539 const Xapian::Weight *wtscheme, 539 const Xapian::RSet &omrset) 540 const Xapian::RSet &omrset, 541 const vector<Xapian::MatchSpy *> & matchspies) 540 542 { 541 543 string tmp = query->serialise(); 542 544 string message = encode_length(tmp.size()); … … 561 563 message += encode_length(tmp.size()); 562 564 message += tmp; 563 565 564 message += serialise_rset(omrset); 566 tmp = serialise_rset(omrset); 567 message += encode_length(tmp.size()); 568 message += tmp; 565 569 570 message += encode_length(matchspies.size()); 571 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.begin(); 572 i != matchspies.end(); ++i) { 573 574 tmp = (*i)->name(); 575 if (tmp.size() == 0) { 576 throw Xapian::UnimplementedError("MatchSpy not suitable for use with remote searches - name() method returned empty string"); 577 } 578 message += encode_length(tmp.size()); 579 message += tmp; 580 581 tmp = (*i)->serialise(); 582 message += encode_length(tmp.size()); 583 message += tmp; 584 } 585 566 586 send_message(MSG_QUERY, message); 567 587 } 568 588 … … 592 612 } 593 613 594 614 void 595 RemoteDatabase::get_mset(Xapian::MSet &mset) 615 RemoteDatabase::get_mset(Xapian::MSet &mset, 616 const vector<Xapian::MatchSpy *> & matchspies) 596 617 { 597 618 string message; 598 619 get_message(message, REPLY_RESULTS); 599 mset = unserialise_mset(message); 620 const char * p = message.data(); 621 const char * p_end = p + message.size(); 622 mset = unserialise_mset(&p, p_end); 623 624 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.begin(); 625 i != matchspies.end(); ++i) { 626 if (p == p_end) 627 throw Xapian::NetworkError("Expected serialised matchspy"); 628 size_t len = decode_length(&p, p_end, true); 629 string spyresults = string(p, len); 
630 p += len; 631 (*i)->merge_results(spyresults); 632 } 633 if (p != p_end) 634 throw Xapian::NetworkError("Junk at end of mset"); 600 635 } 601 636 602 637 void -
xapian-bindings/csharp/Makefile.am
26 26 Flint.cs \ 27 27 InMemory.cs \ 28 28 MatchDecider.cs \ 29 MatchSpy.cs \ 29 30 MSet.cs \ 30 31 MSetIterator.cs \ 31 32 MultiValueSorter.cs \ … … 37 38 QueryParser.cs \ 38 39 Remote.cs \ 39 40 RSet.cs \ 41 SWIGTYPE_p_std__mapT_Xapian__NumericRange_unsigned_int_t.cs \ 42 SWIGTYPE_p_std__mapT_std__string_unsigned_int_t.cs \ 40 43 SWIGTYPE_p_std__string.cs \ 41 44 SWIGTYPE_p_std__vectorT_std__string_t.cs \ 42 45 SWIGTYPE_p_std__vectorT_Xapian__Query_t.cs \ 46 SWIGTYPE_p_std__vectorT_Xapian__StringAndFrequency_t.cs \ 43 47 SerialisationContext.cs \ 44 48 SimpleStopper.cs \ 45 49 Sorter.cs \ … … 49 53 TermGenerator.cs \ 50 54 TermIterator.cs \ 51 55 TradWeight.cs \ 56 ValueCountMatchSpy.cs \ 52 57 ValueIterator.cs \ 53 58 ValueRangeProcessor.cs \ 54 59 Version.cs \ -
xapian-bindings/python/pythontest2.py
1289 1289 enq.set_query(xapian.Query('foo')) 1290 1290 enq.get_mset(0, 10) 1291 1291 1292 def test_matchspy(): 1293 """Test use of matchspies. 1294 1295 """ 1296 db = setup_database() 1297 query = xapian.Query(xapian.Query.OP_OR, "was", "it") 1298 enq = xapian.Enquire(db) 1299 enq.set_query(query) 1300 1301 def set_matchspy_deref(enq): 1302 """Set a matchspy, and then drop the reference, to check that it 1303 doesn't get deleted too soon. 1304 """ 1305 spy = xapian.ValueCountMatchSpy(0) 1306 enq.add_matchspy(spy) 1307 del spy 1308 set_matchspy_deref(enq) 1309 mset = enq.get_mset(0, 10) 1310 expect(len(mset), 5) 1311 1312 spy = xapian.ValueCountMatchSpy(0) 1313 enq.add_matchspy(spy) 1314 mset = enq.get_mset(0, 10) 1315 expect(spy.get_values_as_dict(), {'zero': 1}) 1316 expect(spy.get_total(), 5) 1317 expect(spy.get_top_values(10), [('zero', 1)]) 1318 1292 1319 # Run all tests (ie, callables with names starting "test_"). 1293 1320 if not runtests(globals(), sys.argv[1:]): 1294 1321 sys.exit(1) -
xapian-bindings/python/pythontest3.py
1289 1289 enq.set_query(xapian.Query('foo')) 1290 1290 enq.get_mset(0, 10) 1291 1291 1292 def test_matchspy(): 1293 """Test use of matchspies. 1294 1295 """ 1296 db = setup_database() 1297 query = xapian.Query(xapian.Query.OP_OR, "was", "it") 1298 enq = xapian.Enquire(db) 1299 enq.set_query(query) 1300 1301 def set_matchspy_deref(enq): 1302 """Set a matchspy, and then drop the reference, to check that it 1303 doesn't get deleted too soon. 1304 """ 1305 spy = xapian.ValueCountMatchSpy(0) 1306 enq.add_matchspy(spy) 1307 del spy 1308 set_matchspy_deref(enq) 1309 mset = enq.get_mset(0, 10) 1310 expect(len(mset), 5) 1311 1312 spy = xapian.ValueCountMatchSpy(0) 1313 enq.add_matchspy(spy) 1314 mset = enq.get_mset(0, 10) 1315 expect(spy.get_values_as_dict(), {'zero': 1}) 1316 expect(spy.get_total(), 5) 1317 expect(spy.get_top_values(10), [('zero', 1)]) 1318 1292 1319 # Run all tests (ie, callables with names starting "test_"). 1293 1320 if not runtests(globals(), sys.argv[1:]): 1294 1321 sys.exit(1) -
xapian-bindings/python/util.i
172 172 } 173 173 } 174 174 175 %{ 176 /* Typemap for returning a map of ints keyed by strings: converts to a dict. 177 * This is used for @a ValueCountMatchSpy::get_values(). 178 * The GIL must be held when this is called. 179 */ 180 PyObject * 181 value_map_to_dict(const std::map<std::string, Xapian::doccount> & vals) 182 { 183 PyObject * result = PyDict_New(); 184 if (result == 0) { 185 return NULL; 186 } 187 188 std::map<std::string, Xapian::doccount>::const_iterator i; 189 for (i = vals.begin(); i != vals.end(); ++i) { 190 PyObject * str = PyString_FromStringAndSize((*i).first.data(), 191 (*i).first.size()); 192 if (str == 0) { 193 Py_DECREF(result); 194 result = NULL; 195 return NULL; 196 } 197 198 PyObject * l = PyInt_FromLong((*i).second); 199 if (l == 0) { 200 Py_DECREF(str); 201 Py_DECREF(result); 202 result = NULL; 203 return NULL; 204 } 205 206 if (PyDict_SetItem(result, str, l) == -1) { 207 Py_DECREF(result); 208 result = NULL; 209 return NULL; 210 } 211 Py_DECREF(str); 212 Py_DECREF(l); 213 } 214 return result; 215 } 216 %} 217 218 /** Typemap pair for getting the return value from @a ValueCountMatchSpy::get_top_values(). 
219 */ 220 %typemap(in, numinputs=0) std::vector<Xapian::StringAndFrequency> & result (std::vector<Xapian::StringAndFrequency> temp) { 221 $1 = &temp; 222 } 223 %typemap(argout) std::vector<Xapian::StringAndFrequency> & result { 224 Py_DECREF($result); 225 $result = PyList_New($1->size()); 226 size_t pos = 0; 227 for (std::vector<Xapian::StringAndFrequency>::const_iterator i = $1->begin(); 228 i != $1->end(); ++i) { 229 PyObject * str = PyString_FromStringAndSize((*i).str.data(), 230 (*i).str.size()); 231 if (str == 0) { 232 Py_DECREF($result); 233 $result = NULL; 234 SWIG_fail; 235 } 236 237 PyObject * l = PyInt_FromLong((*i).frequency); 238 if (l == 0) { 239 Py_DECREF($result); 240 Py_DECREF(str); 241 $result = NULL; 242 SWIG_fail; 243 } 244 245 PyObject *t = PyTuple_New(2); 246 if (t == 0) { 247 Py_DECREF($result); 248 Py_DECREF(str); 249 Py_DECREF(l); 250 $result = NULL; 251 SWIG_fail; 252 } 253 PyTuple_SetItem(t, 0, str); 254 PyTuple_SetItem(t, 1, l); 255 256 PyList_SetItem($result, pos++, t); 257 } 258 } 259 175 260 %typedef PyObject *LangSpecificListType; 176 261 177 262 %inline %{ -
xapian-bindings/python/extra.i
*/
%}

/* Add a get_values_as_dict() method to ValueCountMatchSpy, returning the
 * spy's value counts as a Python dict (see value_map_to_dict()).
 */
%extend ValueCountMatchSpy {
    %feature("nothread") get_values_as_dict;
    %exception get_values_as_dict {
        try {
            $action
        } catch (...) {
            Xapian::SetPythonException();
            SWIG_fail;
        }
    }
    PyObject * get_values_as_dict() {
        return value_map_to_dict($self->get_values());
    }
}

%pythoncode %{

# Set the documentation format - this is used by tools like "epydoc" to decide
# … …
__all__ = tuple(__all__)


# Fix up Enquire so that it keeps a python reference to the match spies
# supplied to it so that they won't be deleted before the Enquire object.  This
# hack can probably be removed once xapian bug #186 is fixed.
_enquire_add_matchspy_orig = Enquire.add_matchspy
def _enquire_match_spy_add(self, decider):
    # "decider" is the match spy being added.
    if not hasattr(self, '_deciders'):
        self._deciders = []
    self._deciders.append(decider)
    _enquire_add_matchspy_orig(self, decider)
_enquire_match_spy_add.__doc__ = Enquire.add_matchspy.__doc__
Enquire.add_matchspy = _enquire_match_spy_add

_enquire_clear_matchspies_orig = Enquire.clear_matchspies
def _enquire_match_spies_clear(self):
    # clear_matchspies() takes no argument besides self; this call used to
    # pass an undefined name "decider", raising NameError on every call.
    _enquire_clear_matchspies_orig(self)
    # Drop our references now that the Enquire object no longer uses them.
    if hasattr(self, '_deciders'):
        del self._deciders
_enquire_match_spies_clear.__doc__ = Enquire.clear_matchspies.__doc__
Enquire.clear_matchspies = _enquire_match_spies_clear



# Remove static methods which shouldn't be in the API.
del Document_unserialise
del Query_unserialise
xapian-bindings/xapian.i
356 356 #endif 357 357 358 358 class Database; 359 class MatchSpy; 359 360 class Query; 360 361 class Sorter; 361 362 … … 367 368 void set_query(const Query & query, termcount qlen = 0); 368 369 const Query& get_query(); 369 370 371 void add_matchspy(MatchSpy * spy); 372 void clear_matchspies(); 373 370 374 void set_weighting_scheme(const Weight& weight); 371 375 void set_collapse_key(Xapian::valueno collapse_key, 372 376 Xapian::doccount collapse_max = 1); … … 440 444 441 445 } 442 446 447 %ignore Xapian::SerialisationContext::operator=; 448 %include <xapian/serialisationcontext.h> 449 443 450 /* Generated code won't compile if directors are enabled. Disable for now 444 451 * while we investigate. 445 452 * … … 466 473 %warnfilter(842) Xapian::TradWeight::unserialise; 467 474 %include <xapian/weight.h> 468 475 476 %ignore Xapian::NumericRange::operator<; 477 %include <xapian/matchspy.h> 478 469 479 namespace Xapian { 470 480 471 481 // xapian/database.h … … 747 757 %include <xapian/replication.h> 748 758 %include <xapian/valuesetmatchdecider.h> 749 759 750 %ignore Xapian::SerialisationContext::operator=;751 %include <xapian/serialisationcontext.h>752 753 760 namespace Xapian { 754 761 755 762 #if defined SWIGPYTHON