Ticket #199: matchspy3.patch
File matchspy3.patch, 90.2 KB (added by , 15 years ago) |
---|
-
xapian-maintainer-tools/win32msvc/win32_api.mak
23 23 $(INTDIR)/errorhandler.obj \ 24 24 $(INTDIR)/expanddecider.obj \ 25 25 $(INTDIR)/leafpostlist.obj \ 26 $(INTDIR)/matchspy.obj \ 26 27 $(INTDIR)/omdatabase.obj \ 27 28 $(INTDIR)/omdocument.obj \ 28 29 $(INTDIR)/omenquire.obj \ … … 51 52 $(INTDIR)/errorhandler.cc\ 52 53 $(INTDIR)/expanddecider.cc\ 53 54 $(INTDIR)/leafpostlist.cc\ 55 $(INTDIR)/matchspy.cc \ 54 56 $(INTDIR)/omdatabase.cc\ 55 57 $(INTDIR)/omdocument.cc\ 56 58 $(INTDIR)/omenquire.cc\ -
xapian-core/matcher/multimatch.cc
5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 6 6 * Copyright 2003 Orange PCS Ltd 7 7 * Copyright 2003 Sam Liddicott 8 * Copyright 2007,2008 Lemur Consulting Ltd8 * Copyright 2007,2008,2009 Lemur Consulting Ltd 9 9 * 10 10 * This program is free software; you can redistribute it and/or 11 11 * modify it under the terms of the GNU General Public License as … … 47 47 #include "weightinternal.h" 48 48 49 49 #include <xapian/errorhandler.h> 50 #include <xapian/matchspy.h> 50 51 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND 51 52 52 53 #ifdef XAPIAN_HAS_REMOTE_BACKEND … … 174 175 } 175 176 } 176 177 178 /// Class which applies several match spies in turn. 179 class MultipleMatchSpy : public Xapian::MatchSpy { 180 private: 181 /// List of match spies to call, in order. 182 const std::vector<Xapian::MatchSpy *> & spies; 183 184 public: 185 MultipleMatchSpy(const std::vector<Xapian::MatchSpy *> & spies_) 186 : spies(spies_) {} 187 188 /** Implementation of virtual operator(). 189 * 190 * This implementation calls all the spies in turn. 
191 */ 192 void operator()(const Xapian::Document &doc, Xapian::weight wt); 193 }; 194 195 void 196 MultipleMatchSpy::operator()(const Xapian::Document &doc, Xapian::weight wt) { 197 LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc << ", " << wt); 198 vector<Xapian::MatchSpy *>::const_iterator i; 199 for (i = spies.begin(); i != spies.end(); ++i) { 200 (**i)(doc, wt); 201 } 202 } 203 177 204 //////////////////////////////////// 178 205 // Initialisation and cleaning up // 179 206 //////////////////////////////////// … … 191 218 const Xapian::Sorter * sorter_, 192 219 Xapian::ErrorHandler * errorhandler_, 193 220 Xapian::Weight::Internal & stats, 194 const Xapian::Weight * weight_) 221 const Xapian::Weight * weight_, 222 const vector<Xapian::MatchSpy *> & matchspies_) 195 223 : db(db_), query(query_), 196 224 collapse_max(collapse_max_), collapse_key(collapse_key_), 197 225 percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_), … … 199 227 sort_key(sort_key_), sort_by(sort_by_), 200 228 sort_value_forward(sort_value_forward_), sorter(sorter_), 201 229 errorhandler(errorhandler_), weight(weight_), 202 is_remote(db.internal.size()) 230 is_remote(db.internal.size()), 231 matchspies(matchspies_) 203 232 { 204 233 DEBUGCALL(MATCH, void, "MultiMatch", db_ << ", " << query_ << ", " << 205 234 qlen << ", " << (omrset ? 
*omrset : Xapian::RSet()) << ", " << … … 229 258 rem_db->set_query(query, qlen, collapse_max, collapse_key, 230 259 order, sort_key, sort_by, sort_value_forward, 231 260 percent_cutoff, weight_cutoff, weight, 232 subrsets[i] );261 subrsets[i], matchspies); 233 262 bool decreasing_relevance = 234 263 (sort_by == REL || sort_by == REL_VAL); 235 smatch = new RemoteSubMatch(rem_db, decreasing_relevance );264 smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies); 236 265 is_remote[i] = true; 237 266 } else { 238 267 #endif /* XAPIAN_HAS_REMOTE_BACKEND */ … … 277 306 Xapian::MSet & mset, 278 307 const Xapian::Weight::Internal & stats, 279 308 const Xapian::MatchDecider *mdecider, 280 const Xapian::MatchDecider *matchspy )309 const Xapian::MatchDecider *matchspy_legacy) 281 310 { 282 311 DEBUGCALL(MATCH, void, "MultiMatch::get_mset", first << ", " << maxitems 283 312 << ", " << check_at_least << ", ..."); … … 403 432 Xapian::doccount matches_lower_bound = 0; 404 433 Xapian::doccount matches_estimated = pl->get_termfreq_est(); 405 434 406 if (mdecider == NULL && matchspy == NULL) {435 if (mdecider == NULL && matchspy_legacy == NULL) { 407 436 // If we have a matcher decider or match spy, the lower bound must be 408 437 // set to 0 as we could discard all hits. Otherwise set it to the 409 438 // minimum number of entries which the postlist could return. 410 439 matches_lower_bound = pl->get_termfreq_min(); 411 440 } 412 441 442 // Prepare the matchspy 443 Xapian::MatchSpy *matchspy = NULL; 444 MultipleMatchSpy multispy(matchspies); 445 if (!matchspies.empty()) { 446 if (matchspies.size() == 1) { 447 matchspy = matchspies[0]; 448 } else { 449 matchspy = &multispy; 450 } 451 } 452 413 453 // Check if any results have been asked for (might just be wanting 414 454 // maxweight). 
415 455 if (check_at_least == 0) { … … 437 477 return; 438 478 } 439 479 440 // Number of documents considered by a decider or matchspy .480 // Number of documents considered by a decider or matchspy_legacy. 441 481 Xapian::doccount decider_considered = 0; 442 // Number of documents denied by the decider or matchspy .482 // Number of documents denied by the decider or matchspy_legacy. 443 483 Xapian::doccount decider_denied = 0; 444 484 445 485 // Set max number of results that we want - this is used to decide … … 545 585 // VAL, then new_item.wt won't yet be set, but that doesn't 546 586 // matter since it's not used by the sort function. 547 587 if (!mcmp(new_item, min_item)) { 548 if (m atchspy == NULL && mdecider == NULL && !collapser) {588 if (mdecider == NULL && !collapser && matchspy_legacy == NULL) { 549 589 // Document was definitely suitable for mset - no more 550 590 // processing needed. 551 591 LOGLINE(MATCH, "Making note of match item which sorts lower than min_item"); … … 563 603 continue; 564 604 } 565 605 // We can't drop the item, because we need to show it 566 // to the matchspy , test whether the mdecider would606 // to the matchspy_legacy, test whether the mdecider would 567 607 // accept it, and/or test whether it would be collapsed. 568 608 LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation"); 569 609 } 570 610 } 571 611 572 612 // Use the match spy and/or decision functors (if specified). 
573 if (matchspy != NULL || mdecider != NULL ) {613 if (matchspy != NULL || mdecider != NULL || matchspy_legacy != NULL) { 574 614 const unsigned int multiplier = db.internal.size(); 575 615 Assert(multiplier != 0); 576 616 Xapian::doccount n = (did - 1) % multiplier; // which actual database … … 584 624 Xapian::Document mydoc(doc.get()); 585 625 586 626 ++decider_considered; 587 if (matchspy && !matchspy->operator()(mydoc)) {627 if (matchspy_legacy && !matchspy_legacy->operator()(mydoc)) { 588 628 ++decider_denied; 589 629 continue; 590 630 } … … 592 632 ++decider_denied; 593 633 continue; 594 634 } 635 if (matchspy) { 636 if (!calculated_weight) { 637 wt = pl->get_weight(); 638 new_item.wt = wt; 639 calculated_weight = true; 640 } 641 matchspy->operator()(mydoc, wt); 642 } 595 643 } 596 644 } 597 645 … … 882 930 ", matches_upper_bound=" << matches_upper_bound); 883 931 } 884 932 885 if (m atchspy || mdecider) {933 if (mdecider || matchspy_legacy) { 886 934 if (!percent_cutoff) { 887 935 if (!collapser) { 888 936 // We're not collapsing or doing a percentage cutoff, so … … 946 994 matches_estimated = matches_lower_bound; 947 995 } 948 996 949 if (collapser || m atchspy || mdecider) {997 if (collapser || mdecider || matchspy_legacy) { 950 998 LOGLINE(MATCH, "Clamping estimate between bounds: " 951 999 "matches_lower_bound = " << matches_lower_bound << 952 1000 ", matches_estimated = " << matches_estimated << … … 962 1010 matches_estimated = docs_matched; 963 1011 } 964 1012 965 if (collapser && !m atchspy && !mdecider && !percent_cutoff) {1013 if (collapser && !mdecider && !percent_cutoff && !matchspy_legacy) { 966 1014 AssertRel(docs_matched,<=,uncollapsed_upper_bound); 967 1015 if (docs_matched > uncollapsed_lower_bound) 968 1016 uncollapsed_lower_bound = docs_matched; -
xapian-core/matcher/remotesubmatch.cc
27 27 #include "remote-database.h" 28 28 #include "weightinternal.h" 29 29 30 RemoteSubMatch::RemoteSubMatch(RemoteDatabase *db_, bool decreasing_relevance_) 31 : db(db_), decreasing_relevance(decreasing_relevance_) 30 RemoteSubMatch::RemoteSubMatch(RemoteDatabase *db_, 31 bool decreasing_relevance_, 32 const vector<Xapian::MatchSpy *> & matchspies_) 33 : db(db_), 34 decreasing_relevance(decreasing_relevance_), 35 matchspies(matchspies_) 32 36 { 33 37 DEBUGCALL(MATCH, void, "RemoteSubMatch", 34 db_ << ", " << decreasing_relevance_); 38 db_ << ", " << decreasing_relevance_ << ", " << 39 "matchspies"); 35 40 } 36 41 37 42 bool … … 64 69 DEBUGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info", 65 70 "[matcher], " << (void*)termfreqandwts << ", " << (void*)total_subqs_ptr); 66 71 Xapian::MSet mset; 67 db->get_mset(mset );72 db->get_mset(mset, matchspies); 68 73 percent_factor = mset.internal->percent_factor; 69 74 if (termfreqandwts) *termfreqandwts = mset.internal->termfreqandwts; 70 75 // For remote databases we report percent_factor rather than counting the -
xapian-core/matcher/remotesubmatch.h
26 26 #include "remote-database.h" 27 27 #include "xapian/weight.h" 28 28 29 class Xapian::MatchSpy; 30 29 31 /// Class for performing matching on a remote database. 30 32 class RemoteSubMatch : public SubMatch { 31 33 /// Don't allow assignment. … … 46 48 /// The factor to use to convert weights to percentages. 47 49 double percent_factor; 48 50 51 /// The matchspies to use. 52 const vector<Xapian::MatchSpy *> & matchspies; 53 49 54 public: 50 55 /// Constructor. 51 RemoteSubMatch(RemoteDatabase *db_, bool decreasing_relevance_); 56 RemoteSubMatch(RemoteDatabase *db_, 57 bool decreasing_relevance_, 58 const vector<Xapian::MatchSpy *> & matchspies); 52 59 53 60 /// Fetch and collate statistics. 54 61 bool prepare_match(bool nowait, Xapian::Weight::Internal & total_stats); … … 69 76 double get_percent_factor() const { return percent_factor; } 70 77 71 78 /// Short-cut for single remote match. 72 void get_mset(Xapian::MSet & mset) { db->get_mset(mset ); }79 void get_mset(Xapian::MSet & mset) { db->get_mset(mset, matchspies); } 73 80 }; 74 81 75 82 #endif /* XAPIAN_INCLUDED_REMOTESUBMATCH_H */ -
xapian-core/docs/categorisation.rst
1 1 2 2 .. Copyright (C) 2007 Olly Betts 3 .. Copyright (C) 2009 Lemur Consulting Ltd 3 4 4 5 ============================= 5 6 Xapian Categorisation Support … … 14 15 lists of category values which feature in matching documents. There are 15 16 numerous potential uses this can be put to, but a common one is to offer the 16 17 user the ability to narrow down their search by filtering it to only include 17 documents with a particular value of a particular category. 18 documents with a particular value of a particular category. This is often 19 referred to as ``faceted search``. 18 20 19 21 Some categories are numeric and can take many different values (examples 20 22 include price, width, and height). The number of different values will often … … 43 45 Searching 44 46 --------- 45 47 46 At search time, you need to pass a ``Xapian::MatchSpy`` object to 47 ``Xapian::Enquire::get_mset()``, like so:: 48 At search time, you need to pass a ``Xapian::ValueCountMatchSpy`` object for 49 each category you want to look at to ``Xapian::Enquire::add_matchspy()``, like 50 so:: 48 51 49 Xapian::MatchSpy spy; 52 Xapian::ValueCountMatchSpy spy0(0); 53 Xapian::ValueCountMatchSpy spy1(1); 54 Xapian::ValueCountMatchSpy spy3(3); 50 55 51 spy.add_category(0);52 spy.add_category(1);53 spy.add_category(3);54 55 56 Xapian::Enquire enq(db); 57 enq.add_matchspy(&spy0); 58 enq.add_matchspy(&spy1); 59 enq.add_matchspy(&spy3); 56 60 57 61 enq.set_query(query); 58 62 59 63 Xapian::MSet mset = enq.get_mset(0, 10, 10000); 60 64 61 The ``10000`` in the call to ``get_mset `` tells Xapian to check at least65 The ``10000`` in the call to ``get_mset()`` tells Xapian to check at least 62 66 10000 documents, so each spy object will be passed at least 10000 documents 63 to tally category information from (unless less than 10000 documents match64 the query, in which case it will see all of them). 
Setting this higher will 65 make the counts exact, but Xapian will have to do more work for most queries 66 s o searches will be slower.67 to tally category information from (unless fewer than 10000 documents match the 68 query, in which case it will see all of them). Setting this higher will make 69 the counts exact, but Xapian will have to do more work for most queries so 70 searches will be slower. 67 71 68 The ``spy`` object now contains the category information. You can find out 69 how many documents it looked at by calling ``spy.get_total()``. You can 70 read the values for category ``cat_no`` like this:: 72 The ``spy`` objects now contain the category information. You can find out how 73 many documents they looked at by calling ``spy0.get_total()``. (All the spies 74 will have looked at the same number of documents.) You can read the values 75 from, say, ``spy0`` like this:: 71 76 72 const map<string, size_t> & cat = spy .get_categories(cat_no);77 const map<string, size_t> & cat = spy0.get_values(); 73 78 map<string, size_t>::const_iterator i; 74 79 for (i = cat.begin(); i != cat.end(); ++i) { 75 80 cout << i->first << ": " << i->second << endl; 76 81 } 77 82 78 You calculate the score for category ``cat_no`` like so:: 83 You can calculate a score to indicate how evenly spread the values are using 84 the ``score_evenness`` function like so:: 79 85 80 double score = spy.score_categorisation(cat_num);86 double score = Xapian::score_evenness(spy0); 81 87 82 88 Or if you prefer categories with 4 or 5 values:: 83 89 84 double score = spy.score_categorisation(cat_num, 4.5);90 double score = Xapian::score_evenness(spy0, 4.5); 85 91 86 92 The smaller the score, the better - a perfectly even split with exactly the 87 93 number of entries asked (or with no preference given for the number of entries) … … 89 95 application, but to give you a rough idea, a suitable threshold is likely to be 90 96 less than one. 
91 97 92 The scoring uses a sum of squared differences (currently that is - this should98 The scoring uses a sum of squared differences (currently, that is - this should 93 99 probably be regarded as an implementation detail which could change in the 94 100 future if we find a better algorithm). 95 101 96 You would build ranges from numeric values for value ``cat_no``, asking for at97 most ``num_ranges`` ranges like so::102 You can build ranges from numeric values for the values returned from spy 103 ``spy0``, asking for at most ``num_ranges`` ranges like so:: 98 104 99 bool result = spy.build_numeric_ranges(cat_no, num_ranges); 105 std::map<Xapian::NumericRange, Xapian::doccount> result; 106 Xapian::doccount values_seen; 107 values_seen = build_numeric_ranges(result, spy0.get_values(), num_ranges); 100 108 101 If ranges could not be built (for example, because all documents have the 102 same value for ``cat_no``), ``false`` is returned. Otherwise ``true`` is 103 returned, and the spy object's category map for value ``cat_no`` is modified 104 to consist of ranges. Keys are now built of strings returned by 105 ``Xapian::sortable_serialise()`` - either a single string if there is only 106 one number in a particular range, or for a range a string padded to 9 bytes 107 with zero bytes, with a second string appended. 109 Here, ``result`` will be filled with a set of numeric ranges (holding at most 110 ``num_ranges`` ranges), and ``values_seen`` will be the count of the number of 111 values seen (note - this may be different from the number of documents seen by 112 the matchspy, since some may have no value stored in the slot). 108 113 114 If there are no values seen by the spy, ``result`` will be empty. If all the 115 values seen by the spy are the same, ``result`` will contain a single entry, 116 with a single range with the same start and end points. 
117 109 118 Restricting by category values 110 119 ------------------------------ 111 120 112 If you're using the categorisation to offer the user choices for narrowing 113 down their search results, you then need to be able to apply a suitable 114 filter. 121 If you're using the categorisation to offer the user choices for narrowing down 122 their search results, you then need to be able to apply a suitable filter. 115 123 116 For a range, the best way is to use ``Xapian::Query::OP_VALUE_RANGE`` to124 For a range, the easiest way is to use ``Xapian::Query::OP_VALUE_RANGE`` to 117 125 build a filter query, and then combine this with the user's query using 118 126 ``Xapian::Query::OP_FILTER``. 119 127 120 For a single value, you could use ``Xapian::Query::OP_VALUE_RANGE`` with 121 the same start and end, or ``Xapian::MatchDecider``, but it's probably 122 most efficient to also index the categories as suitably prefixed boolean 123 terms anduse those for filtering.128 For a single value, you could use ``Xapian::Query::OP_VALUE_RANGE`` with the 129 same start and end, or ``Xapian::MatchDecider``, but it's probably most 130 efficient to also index the categories as suitably prefixed boolean terms and 131 use those for filtering. 124 132 125 133 Current Limitations 126 134 =================== 127 135 128 It's not currently possible to build logarithmic ranges without writing 129 your own subclass. 130 131 It's not possible to try building different ranges because the original 132 data is overwritten. If it's actually useful to do this, the API needs 133 adjusting. 136 It's not currently possible to build logarithmic ranges with 137 ``build_numeric_ranges``. -
xapian-core/docs/Makefile.am
17 17 bm25.html code_structure.html queryparser.html \ 18 18 quickstartexpand.cc.html quickstartindex.cc.html quickstartsearch.cc.html 19 19 20 RSTDOCS = admin_notes.rst deprecation.rst glossary.rst \20 RSTDOCS = admin_notes.rst categorisation.rst deprecation.rst glossary.rst \ 21 21 postingsource.rst replication.rst replication_protocol.rst \ 22 22 sorting.rst serialisation.rst spelling.rst synonyms.rst \ 23 23 termgenerator.rst valueranges.rst -
xapian-core/tests/api_matchspy.cc
1 /** @file api_matchspy.cc 2 * @brief tests of MatchSpy usage 3 */ 4 /* Copyright 2007,2009 Lemur Consulting Ltd 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License as 8 * published by the Free Software Foundation; either version 2 of the 9 * License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 19 * USA 20 */ 21 22 #include <config.h> 23 24 #include "api_matchspy.h" 25 26 #include <xapian.h> 27 28 #include "str.h" 29 #include <cmath> 30 #include <map> 31 #include <vector> 32 33 #include "backendmanager.h" 34 #include "testsuite.h" 35 #include "testutils.h" 36 #include "apitest.h" 37 38 using namespace std; 39 40 // ####################################################################### 41 // # Tests start here 42 43 class SimpleMatchSpy : public Xapian::MatchSpy { 44 public: 45 // Vector which will be filled with all the document contents seen. 46 std::vector<std::string> seen; 47 48 void operator()(const Xapian::Document &doc, 49 Xapian::weight) { 50 // Note that this is not recommended usage of get_data() - you 51 // generally shouldn't call get_data() from inside a MatchSpy, because 52 // it is (likely to be) a slow operation resulting in considerable IO. 53 seen.push_back(doc.get_data()); 54 } 55 }; 56 57 // Basic test of a matchspy. 
58 DEFINE_TESTCASE(matchspy1, backend && !remote) { 59 Xapian::Database db(get_database("apitest_simpledata")); 60 Xapian::Enquire enquire(db); 61 enquire.set_query(Xapian::Query("this")); 62 63 SimpleMatchSpy myspy; 64 65 Xapian::MSet nospymset = enquire.get_mset(0, 100); 66 enquire.add_matchspy(&myspy); 67 Xapian::MSet spymset = enquire.get_mset(0, 100); 68 69 // Check that the match estimates aren't affected by the matchspy. 70 TEST_EQUAL(nospymset, spymset); 71 72 vector<bool> docid_checked(db.get_lastdocid()); 73 74 // Check that we get the expected number of matches, and that the stored 75 // document contents are right. 76 Xapian::MSetIterator i = spymset.begin(); 77 TEST(i != spymset.end()); 78 TEST_EQUAL(spymset.size(), 6); 79 TEST_EQUAL(myspy.seen.size(), spymset.size()); 80 81 std::sort(myspy.seen.begin(), myspy.seen.end()); 82 83 std::vector<std::string> seen2; 84 for ( ; i != spymset.end(); ++i) { 85 const Xapian::Document doc(i.get_document()); 86 seen2.push_back(doc.get_data()); 87 } 88 std::sort(seen2.begin(), seen2.end()); 89 90 TEST_EQUAL(myspy.seen.size(), seen2.size()); 91 std::vector<std::string>::const_iterator j = myspy.seen.begin(); 92 std::vector<std::string>::const_iterator j2 = seen2.begin(); 93 for (; j != myspy.seen.end(); ++j, ++j2) { 94 TEST_EQUAL(*j, *j2); 95 } 96 97 return true; 98 } 99 100 static string values_to_repr(const map<string, Xapian::doccount> & cat) { 101 string resultrepr("|"); 102 map<string, Xapian::doccount>::const_iterator i; 103 for (i = cat.begin(); i != cat.end(); ++i) { 104 resultrepr += i->first; 105 resultrepr += ':'; 106 resultrepr += str(i->second); 107 resultrepr += '|'; 108 } 109 return resultrepr; 110 } 111 112 DEFINE_TESTCASE(matchspy2, writable) 113 { 114 if (get_dbtype() == "remotetcp" || get_dbtype() == "remoteprog") { 115 SKIP_TEST("Test not supported for remote backend"); 116 } 117 118 Xapian::WritableDatabase db = get_writable_database(""); 119 for (int c = 1; c <= 25; ++c) { 120 Xapian::Document 
doc; 121 doc.set_data("Document " + str(c)); 122 int factors = 0; 123 for (int factor = 1; factor <= c; ++factor) { 124 doc.add_term("all"); 125 if (c % factor == 0) { 126 doc.add_term("XFACT" + str(factor)); 127 ++factors; 128 } 129 } 130 131 // Number of factors. 132 doc.add_value(0, str(factors)); 133 // Units digits. 134 doc.add_value(1, str(c % 10)); 135 // Constant. 136 doc.add_value(2, "fish"); 137 // Number of digits. 138 doc.add_value(3, str(str(c).size())); 139 140 db.add_document(doc); 141 } 142 143 Xapian::ValueCountMatchSpy spy0(0); 144 Xapian::ValueCountMatchSpy spy1(1); 145 Xapian::ValueCountMatchSpy spy3(3); 146 147 Xapian::Enquire enq(db); 148 149 enq.set_query(Xapian::Query("all")); 150 151 enq.add_matchspy(&spy0); 152 enq.add_matchspy(&spy1); 153 enq.add_matchspy(&spy3); 154 Xapian::MSet mset = enq.get_mset(0, 10); 155 156 TEST_EQUAL(spy0.get_total(), 25); 157 TEST_EQUAL(spy1.get_total(), 25); 158 TEST_EQUAL(spy3.get_total(), 25); 159 160 static const char * results[] = { 161 "|1:1|2:9|3:3|4:7|5:1|6:3|8:1|", 162 "|0:2|1:3|2:3|3:3|4:3|5:3|6:2|7:2|8:2|9:2|", 163 "|1:9|2:16|", 164 }; 165 TEST_STRINGS_EQUAL(values_to_repr(spy0.get_values()), results[0]); 166 TEST_STRINGS_EQUAL(values_to_repr(spy1.get_values()), results[1]); 167 TEST_STRINGS_EQUAL(values_to_repr(spy3.get_values()), results[2]); 168 169 { 170 // Test scoring evenness returns scores with the natural ordering. 171 double score0 = Xapian::score_evenness(spy0); 172 tout << "score0 = " << score0 << endl; 173 double score1 = Xapian::score_evenness(spy1); 174 tout << "score1 = " << score1 << endl; 175 double score3 = Xapian::score_evenness(spy3); 176 tout << "score3 = " << score3 << endl; 177 // 1 is obviously best, and 0 obviously worst. 178 TEST(score1 < score3); 179 TEST(score3 < score0); 180 181 // Check that the using the expanded form gives the same results. 
182 double score0_check = Xapian::score_evenness(spy0.get_values(), spy0.get_total()); 183 tout << "score0_check = " << score0_check << endl; 184 TEST_EQUAL(score0, score0_check); 185 } 186 187 { 188 // Test scoring evenness and about 7 categories returns scores with the 189 // natural ordering. 190 double score0 = Xapian::score_evenness(spy0, 7); 191 tout << "score0 = " << score0 << endl; 192 double score1 = Xapian::score_evenness(spy1, 7); 193 tout << "score1 = " << score1 << endl; 194 double score3 = Xapian::score_evenness(spy3, 7); 195 tout << "score3 = " << score3 << endl; 196 // 3 is clearly worst - 0 is arguably a little better than 1 (0 is the 197 // requested size, but 1 has a much more even split). 198 TEST(score0 < score1); 199 TEST(score1 < score3); 200 201 // Check that the using the expanded form gives the same results. 202 double score0_check = Xapian::score_evenness(spy0.get_values(), spy0.get_total()); 203 tout << "score0_check = " << score0_check << endl; 204 TEST_EQUAL(score0, score0_check); 205 } 206 207 return true; 208 } 209 210 DEFINE_TESTCASE(matchspy3, writable) 211 { 212 if (get_dbtype() == "remotetcp" || get_dbtype() == "remoteprog") { 213 SKIP_TEST("Test not supported for remote backend"); 214 } 215 216 Xapian::WritableDatabase db = get_writable_database(""); 217 for (int c = 1; c <= 25; ++c) { 218 Xapian::Document doc; 219 doc.set_data("Document " + str(c)); 220 int factors = 0; 221 for (int factor = 1; factor <= c; ++factor) { 222 doc.add_term("all"); 223 if (c % factor == 0) { 224 doc.add_term("XFACT" + str(factor)); 225 ++factors; 226 } 227 } 228 229 // Number of factors. 230 doc.add_value(0, Xapian::sortable_serialise(factors)); 231 // Units digits. 232 doc.add_value(1, Xapian::sortable_serialise(c % 10)); 233 // (x + 1/3)*(x + 1/3). 234 doc.add_value(2, Xapian::sortable_serialise((c + 1.0/3.0) * (c + 1.0/3.0))); 235 // Reciprocal. 
236 doc.add_value(3, Xapian::sortable_serialise(floor(100.0 / c))); 237 238 db.add_document(doc); 239 } 240 241 Xapian::ValueCountMatchSpy spy0(0); 242 Xapian::ValueCountMatchSpy spy1(1); 243 Xapian::ValueCountMatchSpy spy2(2); 244 Xapian::ValueCountMatchSpy spy3(3); 245 246 Xapian::Enquire enq(db); 247 248 enq.set_query(Xapian::Query("all")); 249 250 enq.add_matchspy(&spy0); 251 enq.add_matchspy(&spy1); 252 enq.add_matchspy(&spy2); 253 enq.add_matchspy(&spy3); 254 Xapian::MSet mset = enq.get_mset(0, 10); 255 256 TEST_EQUAL(spy0.get_total(), 25); 257 TEST_EQUAL(spy1.get_total(), 25); 258 TEST_EQUAL(spy2.get_total(), 25); 259 TEST_EQUAL(spy3.get_total(), 25); 260 261 static const string results[] = { 262 "|100:1|200:9|300:3|400:7|500:1|600:3|800:1|", 263 "|0..200:8|300..400:6|500..700:7|800..900:4|", 264 "|177..8711:9|10677..17777:4|20544..26677:3|30044..37377:3|41344..49877:3|54444..59211:2|64177:1|", 265 "|400..900:15|1000..1600:5|2000..2500:2|3300:1|5000:1|10000:1|", 266 "" 267 }; 268 std::vector<Xapian::ValueCountMatchSpy *> spies; 269 spies.push_back(&spy0); 270 spies.push_back(&spy1); 271 spies.push_back(&spy2); 272 spies.push_back(&spy3); 273 for (Xapian::valueno v = 0; !results[v].empty(); ++v) { 274 Xapian::doccount total_seen; 275 std::map<Xapian::NumericRange, Xapian::doccount> ranges; 276 total_seen = Xapian::build_numeric_ranges(ranges, spies[v]->get_values(), 7); 277 if (results[v] == "|") { 278 TEST_EQUAL(total_seen, 0); 279 continue; 280 } 281 TEST_NOT_EQUAL(total_seen, 0); 282 TEST(ranges.size() <= 7); 283 string resultrepr("|"); 284 map<Xapian::NumericRange, Xapian::doccount>::const_iterator i; 285 for (i = ranges.begin(); i != ranges.end(); ++i) { 286 if (i->first.get_lower() != i->first.get_upper()) { 287 resultrepr += str(floor(i->first.get_lower() * 100)); 288 resultrepr += ".."; 289 resultrepr += str(floor(i->first.get_upper() * 100)); 290 } else { 291 double start = floor(i->first.get_lower() * 100); 292 resultrepr += str(start); 293 } 294 
resultrepr += ':'; 295 resultrepr += str(i->second); 296 resultrepr += '|'; 297 } 298 tout << "value " << v << endl; 299 TEST_STRINGS_EQUAL(resultrepr, results[v]); 300 } 301 302 return true; 303 } 304 305 DEFINE_TESTCASE(matchspy4, writable) 306 { 307 if (get_dbtype() == "remotetcp" || get_dbtype() == "remoteprog") { 308 SKIP_TEST("Test not supported for remote backend"); 309 } 310 311 Xapian::WritableDatabase db = get_writable_database(""); 312 for (int c = 1; c <= 25; ++c) { 313 Xapian::Document doc; 314 doc.set_data("Document " + str(c)); 315 int factors = 0; 316 for (int factor = 1; factor <= c; ++factor) { 317 doc.add_term("all"); 318 if (c % factor == 0) { 319 doc.add_term("XFACT" + str(factor)); 320 ++factors; 321 } 322 } 323 324 // Number of factors. 325 doc.add_value(0, str(factors)); 326 // Units digits. 327 doc.add_value(1, str(c % 10)); 328 // Constant. 329 doc.add_value(2, "fish"); 330 // Number of digits. 331 doc.add_value(3, str(str(c).size())); 332 333 db.add_document(doc); 334 } 335 336 Xapian::ValueCountMatchSpy spy0(0); 337 Xapian::ValueCountMatchSpy spy1(1); 338 Xapian::ValueCountMatchSpy spy3(3); 339 340 Xapian::Enquire enq(db); 341 342 enq.set_query(Xapian::Query("all")); 343 344 enq.add_matchspy(&spy0); 345 enq.add_matchspy(&spy1); 346 enq.add_matchspy(&spy3); 347 Xapian::MSet mset = enq.get_mset(0, 10); 348 349 TEST_EQUAL(spy0.get_total(), 25); 350 TEST_EQUAL(spy1.get_total(), 25); 351 TEST_EQUAL(spy3.get_total(), 25); 352 353 static const char * results[] = { 354 "|2:9|4:7|3:3|6:3|1:1|5:1|8:1|", 355 "|1:3|2:3|3:3|4:3|5:3|0:2|6:2|7:2|8:2|9:2|", 356 "|", 357 "|2:16|1:9|", 358 NULL 359 }; 360 std::vector<Xapian::ValueCountMatchSpy *> spies; 361 spies.push_back(&spy0); 362 spies.push_back(&spy1); 363 spies.push_back(NULL); 364 spies.push_back(&spy3); 365 for (Xapian::valueno v = 0; results[v]; ++v) { 366 tout << "value " << v << endl; 367 std::vector<Xapian::StringAndFrequency> allvals; 368 369 Xapian::ValueCountMatchSpy * spy = spies[v]; 370 
if (spy != NULL) 371 spy->get_top_values(allvals, 100); 372 string allvals_str("|"); 373 for (size_t i = 0; i < allvals.size(); ++i) { 374 allvals_str += allvals[i].get_string(); 375 allvals_str += ':'; 376 allvals_str += str(allvals[i].get_frequency()); 377 allvals_str += '|'; 378 } 379 tout << allvals_str << endl; 380 TEST_STRINGS_EQUAL(allvals_str, results[v]); 381 382 std::vector<Xapian::StringAndFrequency> vals; 383 for (size_t i = 0; i < allvals.size(); ++i) { 384 tout << "i " << i << endl; 385 if (spy != NULL) 386 spy->get_top_values(vals, i); 387 for (size_t j = 0; j < vals.size(); j++) { 388 tout << "j " << j << endl; 389 TEST_EQUAL(vals[j].get_string(), allvals[j].get_string()); 390 TEST_EQUAL(vals[j].get_frequency(), allvals[j].get_frequency()); 391 } 392 } 393 } 394 395 return true; 396 } 397 398 // Test builtin match spies 399 DEFINE_TESTCASE(matchspy5, backend) 400 { 401 Xapian::Database db(get_database("apitest_simpledata")); 402 Xapian::Enquire enquire(db); 403 enquire.set_query(Xapian::Query("this")); 404 405 Xapian::ValueCountMatchSpy myspy1(1); 406 Xapian::ValueCountMatchSpy myspy2(1); 407 408 enquire.add_matchspy(&myspy1); 409 enquire.add_matchspy(&myspy2); 410 Xapian::MSet mymset = enquire.get_mset(0, 100); 411 TEST_EQUAL(mymset.size(), 6); 412 413 const std::map<std::string, Xapian::doccount> & vals1 = myspy1.get_values(); 414 const std::map<std::string, Xapian::doccount> & vals2 = myspy2.get_values(); 415 416 TEST_EQUAL(vals1.size(), 2); 417 TEST(vals1.find("h") != vals1.end()); 418 TEST(vals1.find("n") != vals1.end()); 419 TEST_EQUAL(vals1.find("h")->second, 5); 420 TEST_EQUAL(vals1.find("n")->second, 1); 421 422 TEST_EQUAL(vals2.size(), 2); 423 TEST(vals2.find("h") != vals2.end()); 424 TEST(vals2.find("n") != vals2.end()); 425 TEST_EQUAL(vals2.find("h")->second, 5); 426 TEST_EQUAL(vals2.find("n")->second, 1); 427 428 return true; 429 } 430 431 class MySpy : public Xapian::MatchSpy { 432 void operator()(const Xapian::Document &, 
Xapian::weight) { 433 } 434 }; 435 436 // Test exceptions from matchspy base class, and get_description method. 437 DEFINE_TESTCASE(matchspy6, !backend) 438 { 439 MySpy spy; 440 441 TEST_EXCEPTION(Xapian::UnimplementedError, spy.clone()); 442 TEST_EXCEPTION(Xapian::UnimplementedError, spy.name()); 443 TEST_EXCEPTION(Xapian::UnimplementedError, spy.serialise()); 444 TEST_EXCEPTION(Xapian::UnimplementedError, 445 spy.unserialise(std::string(), 446 Xapian::SerialisationContext())); 447 TEST_EXCEPTION(Xapian::UnimplementedError, spy.serialise_results()); 448 TEST_EXCEPTION(Xapian::UnimplementedError, 449 spy.merge_results(std::string())); 450 TEST_EQUAL(spy.get_description(), "Xapian::MatchSpy()"); 451 452 return true; 453 } 454 455 /// Test that NumericRange comparisons work correctly. 456 DEFINE_TESTCASE(numericrange1, !backend) 457 { 458 Xapian::NumericRange n1(0, 0); 459 Xapian::NumericRange n2(0, 1); 460 Xapian::NumericRange n3(1, 1); 461 Xapian::NumericRange n4(2, 1); 462 463 TEST(!(n1 < n1)); 464 TEST(n1 < n2); 465 TEST(!(n2 < n1)); 466 TEST(n2 < n3); 467 TEST(!(n3 < n2)); 468 TEST(n3 < n4); 469 TEST(!(n4 < n3)); 470 return true; 471 } -
xapian-core/tests/Makefile.am
Property changes on: xapian-core/tests/api_matchspy.cc ___________________________________________________________________ Added: svn:eol-style + native
115 115 api_collapse.cc \ 116 116 api_db.cc \ 117 117 api_generated.cc \ 118 api_matchspy.cc \ 118 119 api_metadata.cc \ 119 120 api_nodb.cc \ 120 121 api_opsynonym.cc \ -
xapian-core/include/xapian/enquire.h
4 4 /* Copyright 1999,2000,2001 BrightStation PLC 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 7 * Copyright 2009 Lemur Consulting Ltd 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … 39 40 class Document; 40 41 class ErrorHandler; 41 42 class ExpandDecider; 43 class MatchSpy; 42 44 class MSetIterator; 43 45 class Query; 44 46 class Weight; … … 689 691 */ 690 692 const Xapian::Query & get_query() const; 691 693 694 /** Add a matchspy. 695 * 696 * This matchspy will be called with some of the documents which match 697 * the query, during the match process. Exactly which of the matching 698 * documents are passed to it depends on exactly when certain 699 * optimisations occur during the match process, but it can be 700 * controlled to some extent by setting the @a checkatleast parameter 701 * to @a get_mset(). 702 * 703 * In particular, if there are enough matching documents, at least the 704 * number specified by @a checkatleast will be passed to the matchspy. 705 * This means that you can force the matchspy to be shown all matching 706 * documents by setting @a checkatleast to the number of documents in 707 * the database. 708 * 709 * @param spy The MatchSpy subclass to add. The caller must 710 * ensure that this remains valid while the Enquire 711 * object remains active, or until @a 712 * clear_matchspies() is called. 713 */ 714 void add_matchspy(MatchSpy * spy); 715 716 /** Remove all the matchspies. 717 */ 718 void clear_matchspies(); 719 692 720 /** Set the weighting scheme to use for queries. 693 721 * 694 722 * @param weight_ the new weighting scheme. If no weighting scheme -
xapian-core/include/xapian/serialisationcontext.h
31 31 // Forward declarations. 32 32 class Weight; 33 33 class PostingSource; 34 class MatchSpy; 34 35 35 36 /** A context for serialisation. 36 37 * … … 92 93 */ 93 94 const Xapian::PostingSource * 94 95 get_posting_source(const std::string & name) const; 96 97 /// Register a user-defined match spy class. 98 void register_match_spy(const Xapian::MatchSpy &spy); 99 100 /** Get a match spy given a name. 101 * 102 * The returned match spy is owned by the context object. 103 * 104 * Returns NULL if the match spy could not be found. 105 */ 106 const Xapian::MatchSpy * 107 get_match_spy(const std::string & name) const; 95 108 }; 96 109 97 110 } -
xapian-core/include/xapian/matchspy.h
1 /** @file matchspy.h 2 * @brief MatchSpy implementation. 3 */ 4 /* Copyright (C) 2007,2008 Olly Betts 5 * Copyright (C) 2007,2009 Lemur Consulting Ltd 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22 #ifndef XAPIAN_INCLUDED_MATCHSPY_H 23 #define XAPIAN_INCLUDED_MATCHSPY_H 24 25 #include <xapian/enquire.h> 26 #include <xapian/visibility.h> 27 28 #include <string> 29 #include <map> 30 #include <set> 31 #include <string> 32 #include <vector> 33 34 namespace Xapian { 35 36 class Document; 37 class SerialisationContext; 38 39 /** Abstract base class for match spies. 40 * 41 * The subclasses will generally accumulate information seen during the match, 42 * to calculate aggregate functions, or other profiles of the matching 43 * documents. 44 */ 45 class XAPIAN_VISIBILITY_DEFAULT MatchSpy { 46 private: 47 /// Don't allow assignment. 48 void operator=(const MatchSpy &); 49 50 /// Don't allow copying. 51 MatchSpy(const MatchSpy &); 52 53 protected: 54 /// Default constructor, needed by subclass constructors. 55 MatchSpy() {} 56 57 public: 58 /** Virtual destructor, because we have virtual methods. */ 59 virtual ~MatchSpy(); 60 61 /** Register a document with the match spy. 
62 * 63 * This is called by the matcher once with each document seen by the 64 * matcher during the match process. Note that the matcher will often not 65 * see all the documents which match the query, due to optimisations which 66 * allow low-weighted documents to be skipped, and allow the match process 67 * to be terminated early. 68 * 69 * @param doc The document seen by the match spy. 70 * @param wt The weight of the document. 71 */ 72 virtual void operator()(const Xapian::Document &doc, 73 Xapian::weight wt) = 0; 74 75 /** Clone the match spy. 76 * 77 * The clone should inherit the configuration of the parent, but need not 78 * inherit the state. ie, the clone does not need to be passed 79 * information about the results seen by the parent. 80 * 81 * If you don't want to support the remote backend in your match spy, you 82 * can use the default implementation which simply throws 83 * Xapian::UnimplementedError. 84 * 85 * Note that the returned object will be deallocated by Xapian after use 86 * with "delete". It must therefore have been allocated with "new". 87 */ 88 virtual MatchSpy * clone() const; 89 90 /** Return the name of this match spy. 91 * 92 * This name is used by the remote backend. It is passed with the 93 * serialised parameters to the remote server so that it knows which class 94 * to create. 95 * 96 * Return the full namespace-qualified name of your class here - if your 97 * class is called MyApp::FooMatchSpy, return "MyApp::FooMatchSpy" from 98 * this method. 99 * 100 * If you don't want to support the remote backend in your match spy, you 101 * can use the default implementation which simply throws 102 * Xapian::UnimplementedError. 103 */ 104 virtual std::string name() const; 105 106 /** Return this object's parameters serialised as a single string. 107 * 108 * If you don't want to support the remote backend in your match spy, you 109 * can use the default implementation which simply throws 110 * Xapian::UnimplementedError. 
111 */ 112 virtual std::string serialise() const; 113 114 /** Unserialise parameters. 115 * 116 * This method unserialises parameters serialised by the @a serialise() 117 * method and allocates and returns a new object initialised with them. 118 * 119 * If you don't want to support the remote backend in your match spy, you 120 * can use the default implementation which simply throws 121 * Xapian::UnimplementedError. 122 * 123 * Note that the returned object will be deallocated by Xapian after use 124 * with "delete". It must therefore have been allocated with "new". 125 */ 126 virtual MatchSpy * unserialise(const std::string & s, 127 const SerialisationContext & context) const; 128 129 /** Serialise the results of this match spy. 130 * 131 * If you don't want to support the remote backend in your match spy, you 132 * can use the default implementation which simply throws 133 * Xapian::UnimplementedError. 134 */ 135 virtual std::string serialise_results() const; 136 137 /** Unserialise some results, and merge them into this matchspy. 138 * 139 * The order in which results are merged should not be significant, since 140 * this order is not specified (and will vary depending on the speed of 141 * the search in each sub-database). 142 * 143 * If you don't want to support the remote backend in your match spy, you 144 * can use the default implementation which simply throws 145 * Xapian::UnimplementedError. 146 */ 147 virtual void merge_results(const std::string & s); 148 149 /** Return a string describing this object. 150 * 151 * This default implementation returns a generic answer, to avoid forcing 152 * those deriving their own MatchSpy subclasses from having to implement 153 * this (they may not care what get_description() gives for their 154 * subclass). 155 */ 156 virtual std::string get_description() const; 157 }; 158 159 160 /** A string with a corresponding frequency. 
161 */ 162 class XAPIAN_VISIBILITY_DEFAULT StringAndFrequency { 163 std::string str; 164 Xapian::doccount frequency; 165 public: 166 StringAndFrequency(std::string str_, Xapian::doccount frequency_) 167 : str(str_), frequency(frequency_) {} 168 169 std::string get_string() const { return str; } 170 Xapian::doccount get_frequency() const { return frequency; } 171 }; 172 173 174 /// Class for counting the frequencies of values in the matching documents. 175 class XAPIAN_VISIBILITY_DEFAULT ValueCountMatchSpy : public MatchSpy { 176 protected: 177 /// The slot to count. 178 Xapian::valueno slot; 179 180 /// Total number of documents seen by the match spy. 181 Xapian::doccount total; 182 183 /// The values seen so far, together with their frequency. 184 std::map<std::string, Xapian::doccount> values; 185 186 public: 187 /// Construct an empty ValueCountMatchSpy. 188 ValueCountMatchSpy() : slot(Xapian::BAD_VALUENO), total(0) {} 189 190 /** Construct a MatchSpy which counts the values in a particular slot. 191 * 192 * Only values in this one slot are counted. 193 */ 194 ValueCountMatchSpy(Xapian::valueno slot_) 195 : slot(slot_), total(0) { 196 } 197 198 /// Return the values seen in the slot. 199 const std::map<std::string, Xapian::doccount> & get_values() const { 200 return values; 201 } 202 203 /** Return the total number of documents tallied. */ 204 size_t get_total() const { 205 return total; 206 } 207 208 /** Get the most frequent values in the slot. 209 * 210 * @param result A vector which will be filled with the most frequent 211 * values, in descending order of frequency. Values with 212 * the same frequency will be sorted in ascending 213 * alphabetical order. 214 * 215 * @param maxvalues The maximum number of values to return. 216 */ 217 void get_top_values(std::vector<StringAndFrequency> & result, 218 size_t maxvalues) const; 219 220 /** Implementation of virtual operator(). 221 * 222 * This implementation tallies values for a matching document. 
223 */ 224 void operator()(const Xapian::Document &doc, Xapian::weight wt); 225 226 virtual MatchSpy * clone() const; 227 virtual std::string name() const; 228 virtual std::string serialise() const; 229 virtual MatchSpy * unserialise(const std::string & s, 230 const SerialisationContext & context) const; 231 virtual std::string serialise_results() const; 232 virtual void merge_results(const std::string & s); 233 virtual std::string get_description() const; 234 }; 235 236 237 /** A numeric range. 238 * 239 * This is used to represent ranges of values returned by the match spies. 240 */ 241 class XAPIAN_VISIBILITY_DEFAULT NumericRange { 242 /// The lower value in the range. 243 double lower; 244 245 /// The upper value in the range. 246 double upper; 247 248 public: 249 NumericRange(double lower_, double upper_) 250 : lower(lower_), upper(upper_) {} 251 252 double get_lower() const { return lower; } 253 double get_upper() const { return upper; } 254 255 bool operator<(const NumericRange & other) const { 256 if (lower < other.lower) return true; 257 if (lower > other.lower) return false; 258 return (upper < other.upper); 259 } 260 }; 261 262 263 /** Return a score reflecting how evenly divided a set of values is. 264 * 265 * If you don't want to show a poor categorisation, or have multiple 266 * categories and only space in your user interface to show a few, you want to 267 * be able to decide how "good" a categorisation is. One definition of "good" 268 * is that it offers a fairly even split of the available values, and 269 * (optionally) about a specified number of options. 270 * 271 * @param values The values making up the categorisation, together with their 272 * frequencies. 273 * 274 * @param total The total number of documents seen. 275 * 276 * @param desired_no_of_categories The desired number of categories - this is 277 * a floating point value, so you can ask for 5.5 if you'd like "about 5 or 6 278 * categories". 
The default is to desire the number of categories that there 279 * actually are, so the score then only reflects how even the split is. 280 281 * @return A score for the categorisation for the value - lower is better, 282 * with a perfectly even split across the right number of categories scoring 283 * 0. 284 */ 285 //@{ 286 double XAPIAN_VISIBILITY_DEFAULT score_evenness( 287 const std::map<std::string, Xapian::doccount> & values, 288 Xapian::doccount total, 289 double desired_no_of_categories = 0.0); 290 double XAPIAN_VISIBILITY_DEFAULT score_evenness( 291 const std::map<Xapian::NumericRange, Xapian::doccount> & values, 292 Xapian::doccount total, 293 double desired_no_of_categories = 0.0); 294 double XAPIAN_VISIBILITY_DEFAULT score_evenness( 295 const ValueCountMatchSpy & spy, 296 double desired_no_of_categories = 0.0); 297 //@} 298 299 300 /** Turn a category containing sort-encoded numeric values into a set of 301 * ranges. 302 * 303 * For "continuous" values (such as price, height, weight, etc), there will 304 * usually be too many different values to offer the user, and the user won't 305 * want to restrict to an exact value anyway. 306 * 307 * This method produces a set of NumericRange objects for a particular value 308 * number. 309 * 310 * @param result Used to return the resulting ranges. 311 * @param values The values representing the initial numbers. 312 * @param max_ranges Group into at most this many ranges. 313 * 314 * @return The number of values seen. 315 */ 316 doccount XAPIAN_VISIBILITY_DEFAULT build_numeric_ranges( 317 std::map<Xapian::NumericRange, Xapian::doccount> & result, 318 const std::map<std::string, Xapian::doccount> & values, 319 size_t max_ranges); 320 321 } 322 323 #endif // XAPIAN_INCLUDED_MATCHSPY_H -
xapian-core/include/Makefile.mk
Property changes on: xapian-core/include/xapian/matchspy.h ___________________________________________________________________ Added: svn:eol-style + native
19 19 include/xapian/enquire.h\ 20 20 include/xapian/errorhandler.h\ 21 21 include/xapian/expanddecider.h\ 22 include/xapian/matchspy.h\ 22 23 include/xapian/positioniterator.h\ 23 24 include/xapian/postingiterator.h\ 24 25 include/xapian/postingsource.h\ -
xapian-core/include/xapian.h
45 45 // Searching 46 46 #include <xapian/enquire.h> 47 47 #include <xapian/expanddecider.h> 48 #include <xapian/matchspy.h> 48 49 #include <xapian/postingsource.h> 49 50 #include <xapian/query.h> 50 51 #include <xapian/queryparser.h> -
xapian-core/net/serialise.cc
203 203 const map<string, Xapian::MSet::Internal::TermFreqAndWeight> &termfreqandwts 204 204 = mset.internal->termfreqandwts; 205 205 206 result += encode_length(termfreqandwts.size()); 206 207 map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator j; 207 208 for (j = termfreqandwts.begin(); j != termfreqandwts.end(); ++j) { 208 209 result += encode_length(j->first.size()); … … 215 216 } 216 217 217 218 Xapian::MSet 218 unserialise_mset(const string &s)219 unserialise_mset(const char ** p, const char * p_end) 219 220 { 220 const char * p = s.data(); 221 const char * p_end = p + s.size(); 221 Xapian::doccount firstitem = decode_length(p, p_end, false); 222 Xapian::doccount matches_lower_bound = decode_length(p, p_end, false); 223 Xapian::doccount matches_estimated = decode_length(p, p_end, false); 224 Xapian::doccount matches_upper_bound = decode_length(p, p_end, false); 225 Xapian::doccount uncollapsed_lower_bound = decode_length(p, p_end, false); 226 Xapian::doccount uncollapsed_estimated = decode_length(p, p_end, false); 227 Xapian::doccount uncollapsed_upper_bound = decode_length(p, p_end, false); 228 Xapian::weight max_possible = unserialise_double(p, p_end); 229 Xapian::weight max_attained = unserialise_double(p, p_end); 222 230 223 Xapian::doccount firstitem = decode_length(&p, p_end, false); 224 Xapian::doccount matches_lower_bound = decode_length(&p, p_end, false); 225 Xapian::doccount matches_estimated = decode_length(&p, p_end, false); 226 Xapian::doccount matches_upper_bound = decode_length(&p, p_end, false); 227 Xapian::doccount uncollapsed_lower_bound = decode_length(&p, p_end, false); 228 Xapian::doccount uncollapsed_estimated = decode_length(&p, p_end, false); 229 Xapian::doccount uncollapsed_upper_bound = decode_length(&p, p_end, false); 230 Xapian::weight max_possible = unserialise_double(&p, p_end); 231 Xapian::weight max_attained = unserialise_double(&p, p_end); 231 double percent_factor = unserialise_double(p, p_end); 232 232 233 
double percent_factor = unserialise_double(&p, p_end);234 235 233 vector<Xapian::Internal::MSetItem> items; 236 size_t msize = decode_length( &p, p_end, false);234 size_t msize = decode_length(p, p_end, false); 237 235 while (msize-- > 0) { 238 Xapian::weight wt = unserialise_double( &p, p_end);239 Xapian::docid did = decode_length( &p, p_end, false);240 size_t len = decode_length( &p, p_end, true);241 string key( p, len);242 p += len;236 Xapian::weight wt = unserialise_double(p, p_end); 237 Xapian::docid did = decode_length(p, p_end, false); 238 size_t len = decode_length(p, p_end, true); 239 string key(*p, len); 240 *p += len; 243 241 items.push_back(Xapian::Internal::MSetItem(wt, did, key, 244 decode_length( &p, p_end, false)));242 decode_length(p, p_end, false))); 245 243 } 246 244 245 size_t terminfosize = decode_length(p, p_end, false); 247 246 map<string, Xapian::MSet::Internal::TermFreqAndWeight> terminfo; 248 while ( p != p_end) {247 while (terminfosize-- > 0) { 249 248 Xapian::MSet::Internal::TermFreqAndWeight tfaw; 250 size_t len = decode_length( &p, p_end, true);251 string term( p, len);252 p += len;253 tfaw.termfreq = decode_length( &p, p_end, false);254 tfaw.termweight = unserialise_double( &p, p_end);249 size_t len = decode_length(p, p_end, true); 250 string term(*p, len); 251 *p += len; 252 tfaw.termfreq = decode_length(p, p_end, false); 253 tfaw.termweight = unserialise_double(p, p_end); 255 254 terminfo.insert(make_pair(term, tfaw)); 256 255 } 257 256 -
xapian-core/net/remoteserver.cc
25 25 #include "xapian/database.h" 26 26 #include "xapian/enquire.h" 27 27 #include "xapian/error.h" 28 #include "xapian/matchspy.h" 28 29 #include "xapian/valueiterator.h" 29 30 30 31 #include "safeerrno.h" … … 354 355 send_message(REPLY_UPDATE, message); 355 356 } 356 357 358 /** Structure holding a list of match spies. 359 * 360 * The main reason for the existence of this structure is to make it easy to 361 * ensure that the match spies are all deleted after use. 362 */ 363 struct MatchSpyList { 364 vector<Xapian::MatchSpy *> spies; 365 366 ~MatchSpyList() { 367 vector<Xapian::MatchSpy *>::const_iterator i; 368 for (i = spies.begin(); i != spies.end(); ++i) { 369 delete *i; 370 } 371 } 372 }; 373 357 374 void 358 375 RemoteServer::msg_query(const string &message_in) 359 376 { … … 405 422 406 423 // Unserialise the Weight object. 407 424 len = decode_length(&p, p_end, true); 408 const Xapian::Weight * wttype = ctx.get_weighting_scheme(string(p, len)); 425 string wtname(p, len); 426 p += len; 427 428 const Xapian::Weight * wttype = ctx.get_weighting_scheme(wtname); 409 429 if (wttype == NULL) { 410 430 // Note: user weighting schemes should be registered by adding them to 411 431 // a SerialisationContext, and setting the context using 412 432 // RemoteServer::set_context(). 413 433 throw Xapian::InvalidArgumentError("Weighting scheme " + 414 string(p, len)+ " not registered");434 wtname + " not registered"); 415 435 } 416 p += len;417 436 418 437 len = decode_length(&p, p_end, true); 419 438 AutoPtr<Xapian::Weight> wt(wttype->unserialise(string(p, len))); 420 439 p += len; 421 440 422 441 // Unserialise the RSet object. 423 Xapian::RSet rset = unserialise_rset(string(p, p_end - p)); 442 len = decode_length(&p, p_end, true); 443 Xapian::RSet rset = unserialise_rset(string(p, len)); 444 p += len; 424 445 446 // Unserialise the MatchSpy objects. 
447 vector<Xapian::MatchSpy *>::size_type spycount = decode_length(&p, p_end, false); 448 MatchSpyList matchspies; 449 while (spycount != 0) { 450 len = decode_length(&p, p_end, true); 451 string spytype(p, len); 452 const Xapian::MatchSpy * spyclass = ctx.get_match_spy(spytype); 453 if (spyclass == NULL) { 454 throw Xapian::InvalidArgumentError("Match spy " + spytype + 455 " not registered"); 456 } 457 p += len; 458 459 len = decode_length(&p, p_end, true); 460 matchspies.spies.push_back(spyclass->unserialise(string(p, len), ctx)); 461 p += len; 462 463 --spycount; 464 } 465 425 466 Xapian::Weight::Internal local_stats; 426 467 MultiMatch match(*db, query.get(), qlen, &rset, collapse_max, collapse_key, 427 468 percent_cutoff, weight_cutoff, order, 428 469 sort_key, sort_by, sort_value_forward, NULL, 429 NULL, local_stats, wt.get() );470 NULL, local_stats, wt.get(), matchspies.spies); 430 471 431 472 send_message(REPLY_STATS, serialise_stats(local_stats)); 432 473 … … 448 489 Xapian::MSet mset; 449 490 match.get_mset(first, maxitems, check_at_least, mset, total_stats, 0, 0); 450 491 451 send_message(REPLY_RESULTS, serialise_mset(mset)); 492 message = serialise_mset(mset); 493 494 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.spies.begin(); 495 i != matchspies.spies.end(); ++i) { 496 string spy_results = (*i)->serialise_results(); 497 message += encode_length(spy_results.size()); 498 message += spy_results; 499 } 500 send_message(REPLY_RESULTS, message); 452 501 } 453 502 454 503 void -
xapian-core/common/omenquireinternal.h
3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2001,2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 6 * Copyright 2009 Lemur Consulting Ltd 6 7 * 7 8 * This program is free software; you can redistribute it and/or 8 9 * modify it under the terms of the GNU General Public License as … … 159 160 */ 160 161 mutable Weight * weight; 161 162 163 vector<MatchSpy *> spies; 164 162 165 Internal(const Xapian::Database &databases, ErrorHandler * errorhandler_); 163 166 ~Internal(); 164 167 … … 174 177 const Query & get_query(); 175 178 MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems, 176 179 Xapian::doccount check_at_least, 177 const RSet *omrset, const MatchDecider *mdecider, 178 const MatchDecider *matchspy) const; 180 const RSet *omrset, 181 const MatchDecider *mdecider, 182 const MatchDecider *matchspy_legacy) const; 179 183 ESet get_eset(Xapian::termcount maxitems, const RSet & omrset, int flags, 180 184 double k, const ExpandDecider *edecider) const; 181 185 -
xapian-core/common/multimatch.h
2 2 * 3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2002,2003,2004,2005,2006,2007,2009 Olly Betts 5 * Copyright 2009 Lemur Consulting Ltd 5 6 * 6 7 * This program is free software; you can redistribute it and/or 7 8 * modify it under the terms of the GNU General Public License as … … 71 72 /** Is each sub-database remote? */ 72 73 vector<bool> is_remote; 73 74 75 /// The matchspies to use. 76 const vector<Xapian::MatchSpy *> & matchspies; 77 74 78 /** get the maxweight that the postlist pl may return, calling 75 79 * recalc_maxweight if recalculate_w_max is set, and unsetting it. 76 80 * Must only be called on the top of the postlist tree. … … 110 114 const Xapian::Sorter * sorter_, 111 115 Xapian::ErrorHandler * errorhandler, 112 116 Xapian::Weight::Internal & stats, 113 const Xapian::Weight *wtscheme); 117 const Xapian::Weight *wtscheme, 118 const vector<Xapian::MatchSpy *> & matchspies_); 114 119 115 120 void get_mset(Xapian::doccount first, 116 121 Xapian::doccount maxitems, … … 118 123 Xapian::MSet & mset, 119 124 const Xapian::Weight::Internal & stats, 120 125 const Xapian::MatchDecider * mdecider, 121 const Xapian::MatchDecider * matchspy );126 const Xapian::MatchDecider * matchspy_legacy); 122 127 123 128 /** Called by postlists to indicate that they've rearranged themselves 124 129 * and the maxweight now possible is smaller. -
xapian-core/common/remote-database.h
143 143 * @param weight_cutoff Weight cutoff. 144 144 * @param wtscheme Weighting scheme. 145 145 * @param omrset The rset. 146 * @param matchspies The matchspies to use. Empty if none. 146 147 */ 147 148 void set_query(const Xapian::Query::Internal *query, 148 149 Xapian::termcount qlen, … … 154 155 bool sort_value_forward, 155 156 int percent_cutoff, Xapian::weight weight_cutoff, 156 157 const Xapian::Weight *wtscheme, 157 const Xapian::RSet &omrset); 158 const Xapian::RSet &omrset, 159 const vector<Xapian::MatchSpy *> & matchspies); 158 160 159 161 /** Get the stats from the remote server. 160 162 * … … 169 171 const Xapian::Weight::Internal &stats); 170 172 171 173 /// Get the MSet from the remote server. 172 void get_mset(Xapian::MSet &mset); 174 void get_mset(Xapian::MSet &mset, 175 const vector<Xapian::MatchSpy *> & matchspies); 173 176 174 177 /// Get remote termlist. 175 178 TermList * open_term_list(Xapian::docid did) const; -
xapian-core/common/remoteprotocol.h
41 41 // 30.6: Support for OP_VALUE_GE and OP_VALUE_LE in query serialisation 42 42 // 31: Clean up for Xapian 1.1.0 43 43 // 32: Serialise termfreq and reltermfreqs together in serialise_stats. 44 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 32 44 // 33: Support for passing matchspies over the remote connection. 45 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 33 45 46 #define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 0 46 47 47 48 /** Message types (client -> server). -
xapian-core/common/serialisationcontextinternal.h
31 31 namespace Xapian { 32 32 class Weight; 33 33 class PostingSource; 34 class MatchSpy; 34 35 } 35 36 36 37 class Xapian::SerialisationContext::Internal … … 41 42 /// Registered external posting sources. 42 43 std::map<std::string, Xapian::PostingSource *> postingsources; 43 44 45 /// Registered match spies. 46 std::map<std::string, Xapian::MatchSpy *> matchspies; 47 44 48 /// Add the standard default weighting schemes and posting sources. 45 49 void add_defaults(); 46 50 … … 50 54 /// Clear all registered posting sources from the context. 51 55 void clear_posting_sources(); 52 56 57 /// Clear all registered match spies from the context. 58 void clear_match_spies(); 59 53 60 public: 54 61 Internal(); 55 62 ~Internal(); … … 77 84 */ 78 85 const Xapian::PostingSource * 79 86 get_posting_source(const std::string & name) const; 87 88 /// Register a user-defined match spy class. 89 void register_match_spy(const Xapian::MatchSpy &spy); 90 91 /** Get a match spy given a name. 92 * 93 * The returned match spy is owned by the context object. 94 * 95 * Returns NULL if the match spy could not be found. 96 */ 97 const Xapian::MatchSpy * 98 get_match_spy(const std::string & name) const; 99 80 100 }; 81 101 82 102 #endif // XAPIAN_INCLUDED_SERIALISATIONCONTEXTINTERNAL_H -
xapian-core/common/serialise.h
132 132 133 133 /** Unserialise a serialised Xapian::MSet object. 134 134 * 135 * @param s The string to unserialise. 135 * @param p Pointer to pointer to start of the string to unserialise. 136 * @param p_end Pointer to end of the string to unserialise. 136 137 * 137 138 * @return The unserialised Xapian::MSet object. 138 139 */ 139 Xapian::MSet unserialise_mset(const std::string &s);140 Xapian::MSet unserialise_mset(const char ** p, const char * p_end); 140 141 141 142 /** Serialise a Xapian::RSet object. 142 143 * -
xapian-core/api/Makefile.mk
16 16 api/errorhandler.cc\ 17 17 api/expanddecider.cc\ 18 18 api/leafpostlist.cc\ 19 api/matchspy.cc\ 19 20 api/omdatabase.cc\ 20 21 api/omdocument.cc\ 21 22 api/omenquire.cc\ -
xapian-core/api/omenquire.cc
3 3 * Copyright 1999,2000,2001 BrightStation PLC 4 4 * Copyright 2001,2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009 Olly Betts 6 * Copyright 2007 Lemur Consulting Ltd6 * Copyright 2007,2009 Lemur Consulting Ltd 7 7 * 8 8 * This program is free software; you can redistribute it and/or 9 9 * modify it under the terms of the GNU General Public License as … … 642 642 Enquire::Internal::get_mset(Xapian::doccount first, Xapian::doccount maxitems, 643 643 Xapian::doccount check_at_least, const RSet *rset, 644 644 const MatchDecider *mdecider, 645 const MatchDecider *matchspy ) const645 const MatchDecider *matchspy_legacy) const 646 646 { 647 647 DEBUGCALL(API, MSet, "Enquire::Internal::get_mset", first << ", " << 648 648 maxitems << ", " << check_at_least << ", " << rset << ", " << 649 mdecider << ", " << matchspy );649 mdecider << ", " << matchspy_legacy); 650 650 651 651 if (percent_cutoff && (sort_by == VAL || sort_by == VAL_REL)) { 652 652 throw Xapian::UnimplementedError("Use of a percentage cutoff while sorting primary by value isn't currently supported"); … … 661 661 collapse_max, collapse_key, 662 662 percent_cutoff, weight_cutoff, 663 663 order, sort_key, sort_by, sort_value_forward, sorter, 664 errorhandler, stats, weight );664 errorhandler, stats, weight, spies); 665 665 // Run query and put results into supplied Xapian::MSet object. 
666 666 MSet retval; 667 667 match.get_mset(first, maxitems, check_at_least, retval, 668 stats, mdecider, matchspy );668 stats, mdecider, matchspy_legacy); 669 669 670 670 Assert(weight->name() != "bool" || retval.get_max_possible() == 0); 671 671 … … 893 893 } 894 894 895 895 void 896 Enquire::add_matchspy(MatchSpy * spy) { 897 DEBUGAPICALL(void, "Xapian::Enquire::add_matchspy", spy); 898 internal->spies.push_back(spy); 899 } 900 901 void 902 Enquire::clear_matchspies() { 903 DEBUGAPICALL(void, "Xapian::Enquire::clear_matchspies", ""); 904 internal->spies.clear(); 905 } 906 907 void 896 908 Enquire::set_weighting_scheme(const Weight &weight_) 897 909 { 898 910 DEBUGAPICALL(void, "Xapian::Enquire::set_weighting_scheme", "[Weight]"); -
xapian-core/api/serialisationcontext.cc
24 24 #include "xapian/serialisationcontext.h" 25 25 26 26 #include "xapian/error.h" 27 #include "xapian/matchspy.h" 27 28 #include "xapian/postingsource.h" 28 29 #include "xapian/weight.h" 29 30 … … 78 79 RETURN(internal->get_weighting_scheme(name)); 79 80 } 80 81 81 82 82 void 83 83 SerialisationContext::register_posting_source(const Xapian::PostingSource &source) 84 84 { … … 93 93 RETURN(internal->get_posting_source(name)); 94 94 } 95 95 96 void 97 SerialisationContext::register_match_spy(const Xapian::MatchSpy &spy) 98 { 99 LOGCALL_VOID(API, "Xapian::SerialisationContext::register_match_spy", spy.name()); 100 internal->register_match_spy(spy); 101 } 96 102 103 const Xapian::MatchSpy * 104 SerialisationContext::get_match_spy(const string & name) const 105 { 106 LOGCALL(API, const Xapian::MatchSpy *, "Xapian::SerialisationContext::get_match_spy", name); 107 RETURN(internal->get_match_spy(name)); 108 } 109 110 97 111 SerialisationContext::Internal::Internal() 98 112 : Xapian::Internal::RefCntBase(), 99 113 wtschemes(), … … 106 120 { 107 121 clear_weighting_schemes(); 108 122 clear_posting_sources(); 123 clear_match_spies(); 109 124 } 110 125 111 126 void … … 128 143 postingsources[source->name()] = source; 129 144 source = new Xapian::FixedWeightPostingSource(0.0); 130 145 postingsources[source->name()] = source; 146 147 Xapian::MatchSpy * spy; 148 spy = new Xapian::ValueCountMatchSpy(); 149 matchspies[spy->name()] = spy; 131 150 } 132 151 133 152 void … … 149 168 } 150 169 151 170 void 171 SerialisationContext::Internal::clear_match_spies() 172 { 173 map<string, Xapian::MatchSpy *>::const_iterator i; 174 for (i = matchspies.begin(); i != matchspies.end(); ++i) { 175 delete i->second; 176 } 177 } 178 179 void 152 180 SerialisationContext::Internal::register_weighting_scheme(const Xapian::Weight &wt) 153 181 { 154 182 string wtname = wt.name(); … … 220 248 return i->second; 221 249 } 222 250 251 void 252 SerialisationContext::Internal::register_match_spy(const 
Xapian::MatchSpy &spy) 253 { 254 string spyname = spy.name(); 255 if (spyname.empty()) { 256 throw Xapian::InvalidOperationError("Unable to register match spy - name() method returns empty string."); 257 } 258 259 map<string, Xapian::MatchSpy *>::const_iterator i; 260 i = matchspies.find(spyname); 261 if (i != matchspies.end()) { 262 delete i->second; 263 } 264 265 Xapian::MatchSpy * spyclone = spy.clone(); 266 if (!spyclone) { 267 matchspies.erase(spyname); 268 throw Xapian::InvalidOperationError("Unable to register match spy - clone() method returns NULL."); 269 } 270 try { 271 matchspies[spyname] = spyclone; 272 } catch(...) { 273 delete spyclone; 274 matchspies.erase(spyname); 275 throw; 276 } 223 277 } 278 279 const Xapian::MatchSpy * 280 SerialisationContext::Internal::get_match_spy(const string & name) const 281 { 282 map<string, Xapian::MatchSpy *>::const_iterator i; 283 i = matchspies.find(name); 284 if (i == matchspies.end()) { 285 return NULL; 286 } 287 return i->second; 288 } 289 290 } -
xapian-core/api/matchspy.cc
1 /** @file matchspy.cc 2 * @brief MatchSpy implementation. 3 */ 4 /* Copyright (C) 2007,2008,2009 Olly Betts 5 * Copyright (C) 2007,2009 Lemur Consulting Ltd 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22 #include <config.h> 23 #include <xapian/matchspy.h> 24 25 #include <xapian/document.h> 26 #include <xapian/error.h> 27 #include <xapian/queryparser.h> 28 #include <xapian/serialisationcontext.h> 29 30 #include <map> 31 #include <string> 32 #include <vector> 33 34 #include "autoptr.h" 35 #include "debuglog.h" 36 #include "omassert.h" 37 #include "serialise.h" 38 #include "stringutils.h" 39 #include "str.h" 40 41 #include <float.h> 42 #include <math.h> 43 44 45 using namespace std; 46 47 namespace Xapian { 48 49 MatchSpy::~MatchSpy() {} 50 51 MatchSpy * 52 MatchSpy::clone() const { 53 throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented"); 54 } 55 56 string 57 MatchSpy::name() const { 58 throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented"); 59 } 60 61 string 62 MatchSpy::serialise() const { 63 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented"); 64 } 65 66 MatchSpy * 67 MatchSpy::unserialise(const string 
&, const SerialisationContext &) const { 68 throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented"); 69 } 70 71 string 72 MatchSpy::serialise_results() const { 73 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented"); 74 } 75 76 void 77 MatchSpy::merge_results(const string &) { 78 throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented"); 79 } 80 81 string 82 MatchSpy::get_description() const { 83 return "Xapian::MatchSpy()"; 84 } 85 86 87 /** Compare two StringAndFrequency objects. 88 * 89 * The comparison is firstly by frequency (higher is better), then by string 90 * (earlier lexicographic sort is better). 91 */ 92 class StringAndFreqCmpByFreq { 93 public: 94 /// Default constructor 95 StringAndFreqCmpByFreq() {} 96 97 /// Return true if a has a higher frequency than b. 98 /// If equal, compare by the str, to provide a stable sort order. 99 bool operator()(const StringAndFrequency &a, 100 const StringAndFrequency &b) const { 101 if (a.get_frequency() > b.get_frequency()) return true; 102 if (a.get_frequency() < b.get_frequency()) return false; 103 if (a.get_string() > b.get_string()) return false; 104 return true; 105 } 106 }; 107 108 /** Get the most frequent items from a map from string to frequency. 109 * 110 * This takes input such as that returned by @a 111 * ValueCountMatchSpy::get_values(), and returns a vector of the most 112 * frequent items in the input. 113 * 114 * @param result A vector which will be filled with the most frequent 115 * items, in descending order of frequency. Items with 116 * the same frequency will be sorted in ascending 117 * alphabetical order. 118 * 119 * @param items The map from string to frequency, from which the most 120 * frequent items will be selected. 121 * 122 * @param maxitems The maximum number of items to return. 
123 */ 124 static void 125 get_most_frequent_items(vector<StringAndFrequency> & result, 126 const map<string, doccount> & items, 127 size_t maxitems) 128 { 129 result.clear(); 130 result.reserve(maxitems); 131 StringAndFreqCmpByFreq cmpfn; 132 bool is_heap(false); 133 134 for (map<string, doccount>::const_iterator i = items.begin(); 135 i != items.end(); i++) { 136 Assert(result.size() <= maxitems); 137 result.push_back(StringAndFrequency(i->first, i->second)); 138 if (result.size() > maxitems) { 139 // Make the list back into a heap. 140 if (is_heap) { 141 // Only the new element isn't in the right place. 142 push_heap(result.begin(), result.end(), cmpfn); 143 } else { 144 // Need to build heap from scratch. 145 make_heap(result.begin(), result.end(), cmpfn); 146 is_heap = true; 147 } 148 pop_heap(result.begin(), result.end(), cmpfn); 149 result.pop_back(); 150 } 151 } 152 153 if (is_heap) { 154 sort_heap(result.begin(), result.end(), cmpfn); 155 } else { 156 sort(result.begin(), result.end(), cmpfn); 157 } 158 } 159 160 void 161 ValueCountMatchSpy::operator()(const Document &doc, weight) { 162 ++total; 163 string val(doc.get_value(slot)); 164 if (!val.empty()) ++values[val]; 165 } 166 167 void 168 ValueCountMatchSpy::get_top_values(vector<StringAndFrequency> & result, 169 size_t maxvalues) const 170 { 171 get_most_frequent_items(result, values, maxvalues); 172 } 173 174 MatchSpy * 175 ValueCountMatchSpy::clone() const { 176 return new ValueCountMatchSpy(slot); 177 } 178 179 string 180 ValueCountMatchSpy::name() const { 181 return "Xapian::ValueCountMatchSpy"; 182 } 183 184 string 185 ValueCountMatchSpy::serialise() const { 186 string result; 187 result += encode_length(slot); 188 return result; 189 } 190 191 MatchSpy * 192 ValueCountMatchSpy::unserialise(const string & s, 193 const SerialisationContext &) const{ 194 const char * p = s.data(); 195 const char * end = p + s.size(); 196 197 valueno new_slot = decode_length(&p, end, false); 198 if (p != end) { 199 
throw NetworkError("Junk at end of serialised ValueCountMatchSpy"); 200 } 201 202 return new ValueCountMatchSpy(new_slot); 203 } 204 205 string 206 ValueCountMatchSpy::serialise_results() const { 207 LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", ""); 208 string result; 209 result += encode_length(total); 210 result += encode_length(values.size()); 211 for (map<string, doccount>::const_iterator i = values.begin(); 212 i != values.end(); ++i) { 213 result += encode_length(i->first.size()); 214 result += i->first; 215 result += encode_length(i->second); 216 } 217 RETURN(result); 218 } 219 220 void 221 ValueCountMatchSpy::merge_results(const string & s) { 222 LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s); 223 const char * p = s.data(); 224 const char * end = p + s.size(); 225 226 total += decode_length(&p, end, false); 227 228 map<string, doccount>::size_type items = decode_length(&p, end, false); 229 while (p != end) { 230 while(items != 0) { 231 size_t vallen = decode_length(&p, end, true); 232 string val(p, vallen); 233 p += vallen; 234 doccount freq = decode_length(&p, end, false); 235 values[val] += freq; 236 --items; 237 } 238 } 239 } 240 241 string 242 ValueCountMatchSpy::get_description() const { 243 return "Xapian::ValueCountMatchSpy(" + str(total) + 244 " docs seen, looking in " + str(values.size()) + " slots)"; 245 } 246 247 248 inline double sqrd(double x) { return x * x; } 249 250 /** Calculate a score based on how evenly distributed the frequencies of a set 251 * of values are. 
252 */ 253 template<class T> double 254 do_score_evenness(const map<T, doccount> & values, 255 doccount total, 256 double desired_no_of_categories) 257 { 258 if (total == 0) return 0.0; 259 260 size_t total_unset = total; 261 double score = 0.0; 262 263 if (desired_no_of_categories <= 0.0) 264 desired_no_of_categories = values.size(); 265 266 double avg = double(total) / desired_no_of_categories; 267 268 typename map<T, doccount>::const_iterator i; 269 for (i = values.begin(); i != values.end(); ++i) { 270 size_t count = i->second; 271 total_unset -= count; 272 score += sqrd(count - avg); 273 } 274 if (total_unset) score += sqrd(total_unset - avg); 275 276 // Scale down so the total number of items doesn't make a difference. 277 score /= sqrd(total); 278 279 // Bias towards returning the number of categories requested. 280 score += 0.01 * sqrd(desired_no_of_categories - values.size()); 281 282 return score; 283 } 284 285 double score_evenness(const map<string, doccount> & values, 286 doccount total, 287 double desired_no_of_categories) { 288 return do_score_evenness(values, total, desired_no_of_categories); 289 } 290 291 double score_evenness(const map<NumericRange, doccount> & values, 292 doccount total, 293 double desired_no_of_categories) { 294 return do_score_evenness(values, total, desired_no_of_categories); 295 } 296 297 double score_evenness(const ValueCountMatchSpy & spy, 298 double desired_no_of_categories) { 299 return do_score_evenness(spy.get_values(), spy.get_total(), 300 desired_no_of_categories); 301 } 302 303 304 /** A bucket, used when building numeric ranges. 
305 */ 306 struct bucketval { 307 size_t count; 308 double min, max; 309 310 bucketval() : count(0), min(DBL_MAX), max(-DBL_MAX) { } 311 312 void update(size_t n, double value) { 313 count += n; 314 if (value < min) min = value; 315 if (value > max) max = value; 316 } 317 }; 318 319 doccount build_numeric_ranges(map<NumericRange, doccount> & result, 320 const map<string, doccount> & values, 321 size_t max_ranges) 322 { 323 double lo = DBL_MAX, hi = -DBL_MAX; 324 result.clear(); 325 326 map<double, doccount> histo; 327 doccount total_set = 0; 328 map<string, doccount>::const_iterator i; 329 for (i = values.begin(); i != values.end(); ++i) { 330 if (i->first.size() == 0) continue; 331 double v = sortable_unserialise(i->first.c_str()); 332 if (v < lo) lo = v; 333 if (v > hi) hi = v; 334 doccount count = i->second; 335 histo[v] = count; 336 total_set += count; 337 } 338 339 if (total_set == 0) { 340 // No set values. 341 return total_set; 342 } 343 if (lo == hi) { 344 // All set values are the same. 345 NumericRange range(lo, hi); 346 result[range] = total_set; 347 return total_set; 348 } 349 350 double sizeby = max(fabs(hi), fabs(lo)); 351 // E.g. if sizeby = 27.4 and max_ranges = 7, we want to split into units of 352 // width 1.0 which we may then coalesce if there are too many used buckets. 353 double unit = pow(10.0, floor(log10(sizeby / max_ranges) - 0.2)); 354 double start = floor(lo / unit) * unit; 355 // Can happen due to FP rounding (e.g. lo = 11.95, unit = 0.01). 356 if (start > lo) start = lo; 357 size_t n_buckets = size_t(ceil(hi / unit) - floor(lo / unit)); 358 359 bool scaleby2 = true; 360 vector<bucketval> bucket(n_buckets + 1); 361 while (true) { 362 size_t n_used = 0; 363 map<double, doccount>::const_iterator j; 364 for (j = histo.begin(); j != histo.end(); ++j) { 365 double v = j->first; 366 size_t b = size_t(floor((v - start) / unit)); 367 if (b > n_buckets) b = n_buckets; // FIXME - Hacky workaround to ensure that b is in range. 
368 if (bucket[b].count == 0) ++n_used; 369 bucket[b].update(j->second, v); 370 } 371 372 if (n_used <= max_ranges) break; 373 374 unit *= scaleby2 ? 2.0 : 2.5; 375 scaleby2 = !scaleby2; 376 start = floor(lo / unit) * unit; 377 // Can happen due to FP rounding (e.g. lo = 11.95, unit = 0.01). 378 if (start > lo) start = lo; 379 n_buckets = size_t(ceil(hi / unit) - floor(lo / unit)); 380 bucket.resize(0); 381 bucket.resize(n_buckets + 1); 382 } 383 384 map<string, doccount> discrete_categories; 385 for (size_t b = 0; b < bucket.size(); ++b) { 386 if (bucket[b].count == 0) continue; 387 NumericRange range(bucket[b].min, bucket[b].max); 388 result[range] = bucket[b].count; 389 } 390 391 return total_set; 392 } 393 394 } -
xapian-core/backends/remote/remote-database.cc
Property changes on: xapian-core/api/matchspy.cc ___________________________________________________________________ Added: svn:eol-style + native
43 43 #include <vector> 44 44 45 45 #include "xapian/error.h" 46 #include "xapian/matchspy.h" 46 47 47 48 using namespace std; 48 49 … … 536 537 bool sort_value_forward, 537 538 int percent_cutoff, Xapian::weight weight_cutoff, 538 539 const Xapian::Weight *wtscheme, 539 const Xapian::RSet &omrset) 540 const Xapian::RSet &omrset, 541 const vector<Xapian::MatchSpy *> & matchspies) 540 542 { 541 543 string tmp = query->serialise(); 542 544 string message = encode_length(tmp.size()); … … 561 563 message += encode_length(tmp.size()); 562 564 message += tmp; 563 565 564 message += serialise_rset(omrset); 566 tmp = serialise_rset(omrset); 567 message += encode_length(tmp.size()); 568 message += tmp; 565 569 570 message += encode_length(matchspies.size()); 571 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.begin(); 572 i != matchspies.end(); ++i) { 573 574 tmp = (*i)->name(); 575 if (tmp.size() == 0) { 576 throw Xapian::UnimplementedError("MatchSpy not suitable for use with remote searches - name() method returned empty string"); 577 } 578 message += encode_length(tmp.size()); 579 message += tmp; 580 581 tmp = (*i)->serialise(); 582 message += encode_length(tmp.size()); 583 message += tmp; 584 } 585 566 586 send_message(MSG_QUERY, message); 567 587 } 568 588 … … 592 612 } 593 613 594 614 void 595 RemoteDatabase::get_mset(Xapian::MSet &mset) 615 RemoteDatabase::get_mset(Xapian::MSet &mset, 616 const vector<Xapian::MatchSpy *> & matchspies) 596 617 { 597 618 string message; 598 619 get_message(message, REPLY_RESULTS); 599 mset = unserialise_mset(message); 620 const char * p = message.data(); 621 const char * p_end = p + message.size(); 622 mset = unserialise_mset(&p, p_end); 623 624 for (vector<Xapian::MatchSpy *>::const_iterator i = matchspies.begin(); 625 i != matchspies.end(); ++i) { 626 if (p == p_end) 627 throw Xapian::NetworkError("Expected serialised matchspy"); 628 size_t len = decode_length(&p, p_end, true); 629 string spyresults = string(p, len); 
630 p += len; 631 (*i)->merge_results(spyresults); 632 } 633 if (p != p_end) 634 throw Xapian::NetworkError("Junk at end of mset"); 600 635 } 601 636 602 637 void -
xapian-bindings/csharp/Makefile.am
26 26 Flint.cs \ 27 27 InMemory.cs \ 28 28 MatchDecider.cs \ 29 MatchSpy.cs \ 29 30 MSet.cs \ 30 31 MSetIterator.cs \ 31 32 MultiValueSorter.cs \ … … 37 38 QueryParser.cs \ 38 39 Remote.cs \ 39 40 RSet.cs \ 41 SWIGTYPE_p_std__mapT_Xapian__NumericRange_unsigned_int_t.cs \ 42 SWIGTYPE_p_std__mapT_std__string_unsigned_int_t.cs \ 40 43 SWIGTYPE_p_std__string.cs \ 41 44 SWIGTYPE_p_std__vectorT_std__string_t.cs \ 42 45 SWIGTYPE_p_std__vectorT_Xapian__Query_t.cs \ 46 SWIGTYPE_p_std__vectorT_Xapian__StringAndFrequency_t.cs \ 43 47 SerialisationContext.cs \ 44 48 SimpleStopper.cs \ 45 49 Sorter.cs \ … … 49 53 TermGenerator.cs \ 50 54 TermIterator.cs \ 51 55 TradWeight.cs \ 56 ValueCountMatchSpy.cs \ 52 57 ValueIterator.cs \ 53 58 ValueRangeProcessor.cs \ 54 59 Version.cs \ -
xapian-bindings/python/pythontest2.py
1289 1289 enq.set_query(xapian.Query('foo')) 1290 1290 enq.get_mset(0, 10) 1291 1291 1292 def test_matchspy(): 1293 """Test use of matchspies. 1294 1295 """ 1296 db = setup_database() 1297 query = xapian.Query(xapian.Query.OP_OR, "was", "it") 1298 enq = xapian.Enquire(db) 1299 enq.set_query(query) 1300 1301 def set_matchspy_deref(enq): 1302 """Set a matchspy, and then drop the reference, to check that it 1303 doesn't get deleted too soon. 1304 """ 1305 spy = xapian.ValueCountMatchSpy(0) 1306 enq.add_matchspy(spy) 1307 del spy 1308 set_matchspy_deref(enq) 1309 mset = enq.get_mset(0, 10) 1310 expect(len(mset), 5) 1311 1312 spy = xapian.ValueCountMatchSpy(0) 1313 enq.add_matchspy(spy) 1314 mset = enq.get_mset(0, 10) 1315 expect(spy.get_values_as_dict(), {'zero': 1}) 1316 expect(spy.get_total(), 5) 1317 expect(spy.get_top_values(10), [('zero', 1)]) 1318 1292 1319 # Run all tests (ie, callables with names starting "test_"). 1293 1320 if not runtests(globals(), sys.argv[1:]): 1294 1321 sys.exit(1) -
xapian-bindings/python/util.i
172 172 } 173 173 } 174 174 175 %{ 176 /* Typemap for returning a map of ints keyed by strings: converts to a dict. 177 * This is used for @a ValueCountMatchSpy::get_values(). 178 * The GIL must be held when this is called. 179 */ 180 PyObject * 181 value_map_to_dict(const std::map<std::string, Xapian::doccount> & vals) 182 { 183 PyObject * result = PyDict_New(); 184 if (result == 0) { 185 return NULL; 186 } 187 188 std::map<std::string, Xapian::doccount>::const_iterator i; 189 for (i = vals.begin(); i != vals.end(); ++i) { 190 PyObject * str = PyString_FromStringAndSize((*i).first.data(), 191 (*i).first.size()); 192 if (str == 0) { 193 Py_DECREF(result); 194 result = NULL; 195 return NULL; 196 } 197 198 PyObject * l = PyInt_FromLong((*i).second); 199 if (l == 0) { 200 Py_DECREF(str); 201 Py_DECREF(result); 202 result = NULL; 203 return NULL; 204 } 205 206 if (PyDict_SetItem(result, str, l) == -1) { 207 Py_DECREF(result); 208 result = NULL; 209 return NULL; 210 } 211 Py_DECREF(str); 212 Py_DECREF(l); 213 } 214 return result; 215 } 216 %} 217 218 /** Typemap pair for getting the return value from @a ValueCountMatchSpy::get_top_values(). 
219 */ 220 %typemap(in, numinputs=0) std::vector<Xapian::StringAndFrequency> & result (std::vector<Xapian::StringAndFrequency> temp) { 221 $1 = &temp; 222 } 223 %typemap(argout) std::vector<Xapian::StringAndFrequency> & result { 224 Py_DECREF($result); 225 $result = PyList_New($1->size()); 226 size_t pos = 0; 227 for (std::vector<Xapian::StringAndFrequency>::const_iterator i = $1->begin(); 228 i != $1->end(); ++i) { 229 PyObject * str = PyString_FromStringAndSize((*i).get_string().data(), 230 (*i).get_string().size()); 231 if (str == 0) { 232 Py_DECREF($result); 233 $result = NULL; 234 SWIG_fail; 235 } 236 237 PyObject * l = PyInt_FromLong((*i).get_frequency()); 238 if (l == 0) { 239 Py_DECREF($result); 240 Py_DECREF(str); 241 $result = NULL; 242 SWIG_fail; 243 } 244 245 PyObject *t = PyTuple_New(2); 246 if (t == 0) { 247 Py_DECREF($result); 248 Py_DECREF(str); 249 Py_DECREF(l); 250 $result = NULL; 251 SWIG_fail; 252 } 253 PyTuple_SetItem(t, 0, str); 254 PyTuple_SetItem(t, 1, l); 255 256 PyList_SetItem($result, pos++, t); 257 } 258 } 259 175 260 %typedef PyObject *LangSpecificListType; 176 261 177 262 %inline %{ -
xapian-bindings/python/extra.i
22 22 */ 23 23 %} 24 24 25 %extend ValueCountMatchSpy { 26 %feature("nothread") get_values_as_dict; 27 %exception get_values_as_dict { 28 try { 29 $action 30 } catch (...) { 31 Xapian::SetPythonException(); 32 SWIG_fail; 33 } 34 } 35 PyObject * get_values_as_dict() { 36 return value_map_to_dict($self->get_values()); 37 } 38 } 39 25 40 %pythoncode %{ 26 41 27 42 # Set the documentation format - this is used by tools like "epydoc" to decide … … 1076 1091 __all__ = tuple(__all__) 1077 1092 1078 1093 1094 # Fix up Enquire so that it keeps a python reference to the deciders supplied 1095 # to it so that they won't be deleted before the Enquire object. This hack can 1096 # probably be removed once xapian bug #186 is fixed. 1097 _enquire_add_matchspy_orig = Enquire.add_matchspy 1098 def _enquire_match_spy_add(self, decider): 1099 if not hasattr(self, '_deciders'): 1100 self._deciders = [] 1101 self._deciders.append(decider) 1102 _enquire_add_matchspy_orig(self, decider) 1103 _enquire_match_spy_add.__doc__ = Enquire.add_matchspy.__doc__ 1104 Enquire.add_matchspy = _enquire_match_spy_add 1105 1106 _enquire_clear_matchspies_orig = Enquire.clear_matchspies 1107 def _enquire_match_spies_clear(self): 1108 _enquire_clear_matchspies_orig(self, decider) 1109 if hasattr(self, '_deciders'): 1110 del self._deciders 1111 _enquire_match_spies_clear.__doc__ = Enquire.clear_matchspies.__doc__ 1112 Enquire.clear_matchspies = _enquire_match_spies_clear 1113 1114 1115 1079 1116 # Remove static methods which shouldn't be in the API. 1080 1117 del Document_unserialise 1081 1118 del Query_unserialise -
xapian-bindings/xapian.i
356 356 #endif 357 357 358 358 class Database; 359 class MatchSpy; 359 360 class Query; 360 361 class Sorter; 361 362 … … 367 368 void set_query(const Query & query, termcount qlen = 0); 368 369 const Query& get_query(); 369 370 371 void add_matchspy(MatchSpy * spy); 372 void clear_matchspies(); 373 370 374 void set_weighting_scheme(const Weight& weight); 371 375 void set_collapse_key(Xapian::valueno collapse_key, 372 376 Xapian::doccount collapse_max = 1); … … 440 444 441 445 } 442 446 447 %ignore Xapian::SerialisationContext::operator=; 448 %include <xapian/serialisationcontext.h> 449 443 450 /* Generated code won't compile if directors are enabled. Disable for now 444 451 * while we investigate. 445 452 * … … 466 473 %warnfilter(842) Xapian::TradWeight::unserialise; 467 474 %include <xapian/weight.h> 468 475 476 %ignore Xapian::NumericRange::operator<; 477 %include <xapian/matchspy.h> 478 469 479 namespace Xapian { 470 480 471 481 // xapian/database.h … … 747 757 %include <xapian/replication.h> 748 758 %include <xapian/valuesetmatchdecider.h> 749 759 750 %ignore Xapian::SerialisationContext::operator=;751 %include <xapian/serialisationcontext.h>752 753 760 namespace Xapian { 754 761 755 762 #if defined SWIGPYTHON