Ticket #537: eset_min_wt.diff
File eset_min_wt.diff, 11.9 KB (added 14 years ago)
-
xapian-core/api/omenquire.cc
diff --git a/xapian-core/api/omenquire.cc b/xapian-core/api/omenquire.cc index f2eb486..1cb8e84 100644
a b 4 4 * Copyright 2001,2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts 6 6 * Copyright 2007,2009 Lemur Consulting Ltd 7 * Copyright 2011 Dan Colish 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … Enquire::Internal::get_mset(Xapian::doccount first, Xapian::doccount maxitems, 690 691 ESet 691 692 Enquire::Internal::get_eset(Xapian::termcount maxitems, 692 693 const RSet & rset, int flags, double k, 693 const ExpandDecider * edecider ) const694 const ExpandDecider * edecider, Xapian::weight min_wt) const 694 695 { 695 LOGCALL(MATCH, ESet, "Enquire::Internal::get_eset", maxitems | rset | flags | k | edecider );696 LOGCALL(MATCH, ESet, "Enquire::Internal::get_eset", maxitems | rset | flags | k | edecider | min_wt); 696 697 697 698 if (maxitems == 0 || rset.empty()) { 698 699 // Either we were asked for no results, or wouldn't produce any … … Enquire::Internal::get_eset(Xapian::termcount maxitems, 728 729 ExpandWeight eweight(db, rset.size(), use_exact_termfreq, k); 729 730 730 731 Xapian::ESet eset; 731 eset.internal->expand(maxitems, db, rset, edecider, eweight );732 eset.internal->expand(maxitems, db, rset, edecider, eweight, min_wt); 732 733 RETURN(eset); 733 734 } 734 735 … … Enquire::get_eset(Xapian::termcount maxitems, const RSet & rset, int flags, 1043 1044 LOGCALL(API, Xapian::ESet, "Xapian::Enquire::get_eset", maxitems | rset | flags | k | edecider); 1044 1045 1045 1046 try { 1046 RETURN(internal->get_eset(maxitems, rset, flags, k, edecider)); 1047 RETURN(internal->get_eset(maxitems, rset, flags, k, edecider, 0)); 1048 } catch (Error & e) { 1049 if (internal->errorhandler) (*internal->errorhandler)(e); 1050 throw; 1051 } 1052 } 1053 1054 ESet 1055 Enquire::get_eset(Xapian::termcount maxitems, const RSet & rset, int flags, 1056 double k, const ExpandDecider * edecider, Xapian::weight min_wt) const 1057 { 1058 LOGCALL(API, 
Xapian::ESet, "Xapian::Enquire::get_eset", maxitems | rset | flags | k | edecider | min_wt); 1059 1060 try { 1061 RETURN(internal->get_eset(maxitems, rset, flags, k, edecider, min_wt)); 1047 1062 } catch (Error & e) { 1048 1063 if (internal->errorhandler) (*internal->errorhandler)(e); 1049 1064 throw; -
xapian-core/common/esetinternal.h
diff --git a/xapian-core/common/esetinternal.h b/xapian-core/common/esetinternal.h index 0807462..7304f64 100644
a b 2 2 * @brief Xapian::ESet::Internal class 3 3 */ 4 4 /* Copyright (C) 2008,2010 Olly Betts 5 * Copyright (C) 2011 Dan Colish 5 6 * 6 7 * This program is free software; you can redistribute it and/or modify 7 8 * it under the terms of the GNU General Public License as published by … … class ESet::Internal : public Xapian::Internal::RefCntBase { 101 102 const Xapian::Database & db, 102 103 const Xapian::RSet & rset, 103 104 const Xapian::ExpandDecider * edecider, 104 const Xapian::Internal::ExpandWeight & eweight); 105 const Xapian::Internal::ExpandWeight & eweight, 106 Xapian::weight min_wt); 105 107 106 108 /// Return a string describing this object. 107 109 std::string get_description() const; -
xapian-core/common/omenquireinternal.h
diff --git a/xapian-core/common/omenquireinternal.h b/xapian-core/common/omenquireinternal.h index 9c3d825..6cd68fc 100644
a b 4 4 * Copyright 2001,2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts 6 6 * Copyright 2009 Lemur Consulting Ltd 7 * Copyright 2011 Dan Colish 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … class Enquire::Internal : public Xapian::Internal::RefCntBase { 179 180 const RSet *omrset, 180 181 const MatchDecider *mdecider, 181 182 const MatchDecider *matchspy_legacy) const; 183 182 184 ESet get_eset(Xapian::termcount maxitems, const RSet & omrset, int flags, 183 double k, const ExpandDecider *edecider ) const;185 double k, const ExpandDecider *edecider, Xapian::weight min_wt) const; 184 186 185 187 TermIterator get_matching_terms(Xapian::docid did) const; 186 188 TermIterator get_matching_terms(const Xapian::MSetIterator &it) const; -
xapian-core/expand/esetinternal.cc
diff --git a/xapian-core/expand/esetinternal.cc b/xapian-core/expand/esetinternal.cc index d8abaf6..5f692f2 100644
a b 2 2 * @brief Xapian::ESet::Internal class 3 3 */ 4 4 /* Copyright (C) 2008,2010 Olly Betts 5 * Copyright (C) 2011 Dan Colish 5 6 * 6 7 * This program is free software; you can redistribute it and/or modify 7 8 * it under the terms of the GNU General Public License as published by … … ESet::Internal::expand(Xapian::termcount max_esize, 138 139 const Xapian::Database & db, 139 140 const RSet & rset, 140 141 const Xapian::ExpandDecider * edecider, 141 const Xapian::Internal::ExpandWeight & eweight) 142 const Xapian::Internal::ExpandWeight & eweight, 143 Xapian::weight min_wt) 142 144 { 143 145 LOGCALL_VOID(EXPAND, "ESet::Internal::expand", max_esize | db | rset | edecider | eweight); 144 146 // These two cases are handled by our caller. … … ESet::Internal::expand(Xapian::termcount max_esize, 149 151 Assert(ebound == 0); 150 152 Assert(items.empty()); 151 153 152 Xapian::weight min_wt = 0;153 154 154 AutoPtr<TermList> tree(build_termlist_tree(db, rset)); 155 155 Assert(tree.get()); 156 156 -
xapian-core/expand/expandweight.cc
diff --git a/xapian-core/expand/expandweight.cc b/xapian-core/expand/expandweight.cc index f8b8567..6202b7e 100644
a b 2 2 * @brief Calculate term weights for the ESet. 3 3 */ 4 4 /* Copyright (C) 2007,2008 Olly Betts 5 * 5 * Copyright (C) 2011 Dan Colish 6 * 6 7 * This program is free software; you can redistribute it and/or 7 8 * modify it under the terms of the GNU General Public License as 8 9 * published by the Free Software Foundation; either version 2 of the … … ExpandWeight::get_weight(TermList * merger, const string & term) const 103 104 denom = (termfreq - rtermfreq + 0.5) * (reldocs_without_term + 0.5); 104 105 AssertRel(denom,>,0); 105 106 106 // If the returned termweight would be negative, clamp it to 0.0.107 if (rare(num <= denom)) RETURN(0.0);108 109 107 Xapian::weight tw = log(num / denom); 110 108 LOGVALUE(EXPAND, tw); 111 109 LOGVALUE(EXPAND, stats.multiplier); -
xapian-core/include/xapian/enquire.h
diff --git a/xapian-core/include/xapian/enquire.h b/xapian-core/include/xapian/enquire.h index e7ec821..d9cd439 100644
a b 5 5 * Copyright 2001,2002 Ananova Ltd 6 6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2011 Olly Betts 7 7 * Copyright 2009 Lemur Consulting Ltd 8 * Copyright 2011 Dan Colish 8 9 * 9 10 * This program is free software; you can redistribute it and/or 10 11 * modify it under the terms of the GNU General Public License as … … class XAPIAN_VISIBILITY_DEFAULT Enquire { 1005 1006 * @param edecider a decision functor to use to decide whether a 1006 1007 * given term should be put in the ESet 1007 1008 * 1009 * @param min_wt the minimum weight for included terms 1010 * 1008 1011 * @return An ESet object containing the results of the 1009 1012 * expand. 1010 1013 * … … class XAPIAN_VISIBILITY_DEFAULT Enquire { 1015 1018 int flags = 0, 1016 1019 double k = 1.0, 1017 1020 const Xapian::ExpandDecider * edecider = 0) const; 1018 1019 1021 /** Get the expand set for the given rset. 1020 1022 * 1021 1023 * @param maxitems the maximum number of items to return. … … class XAPIAN_VISIBILITY_DEFAULT Enquire { 1034 1036 return get_eset(maxitems, omrset, 0, 1.0, edecider); 1035 1037 } 1036 1038 1039 /** Get the expand set for the given rset. 1040 * 1041 * @param maxitems the maximum number of items to return. 1042 * @param omrset the relevance set to use when performing 1043 * the expand operation. 1044 * @param flags zero or more of these values |-ed together: 1045 * - Xapian::Enquire::INCLUDE_QUERY_TERMS query 1046 * terms may be returned from expand 1047 * - Xapian::Enquire::USE_EXACT_TERMFREQ for multi 1048 * dbs, calculate the exact termfreq; otherwise an 1049 * approximation is used which can greatly improve 1050 * efficiency, but still returns good results. 
1051 * @param k the parameter k in the query expansion algorithm 1052 * (default is 1.0) 1053 * @param edecider a decision functor to use to decide whether a 1054 * given term should be put in the ESet 1055 * 1056 * @param min_wt the minimum weight for included terms 1057 * 1058 * @return An ESet object containing the results of the 1059 * expand. 1060 * 1061 * @exception Xapian::InvalidArgumentError See class documentation. 1062 */ 1063 ESet get_eset(Xapian::termcount maxitems, 1064 const RSet & omrset, 1065 int flags, 1066 double k, 1067 const Xapian::ExpandDecider * edecider, 1068 Xapian::weight min_wt) const; 1069 1037 1070 /** Get terms which match a given document, by document id. 1038 1071 * 1039 1072 * This method returns the terms in the current query which match -
xapian-core/tests/api_anydb.cc
diff --git a/xapian-core/tests/api_anydb.cc b/xapian-core/tests/api_anydb.cc index 4295e0a..bf5280b 100644
a b 4 4 * Copyright 2002 Ananova Ltd 5 5 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts 6 6 * Copyright 2006,2008 Lemur Consulting Ltd 7 * Copyright 2011 Dan Colish 7 8 * 8 9 * This program is free software; you can redistribute it and/or 9 10 * modify it under the terms of the GNU General Public License as … … DEFINE_TESTCASE(expandweights2, backend) { 421 422 return true; 422 423 } 423 424 425 // tests the returned weights are as expected (regression test for remote 426 // backend which was using the average weight rather than the actual document 427 // weight for computing weights - fixed in 1.0.0). 428 DEFINE_TESTCASE(expandweights3, backend) { 429 Xapian::Enquire enquire(get_database("apitest_simpledata")); 430 enquire.set_query(Xapian::Query("this")); 431 432 Xapian::MSet mymset = enquire.get_mset(0, 10); 433 434 Xapian::RSet myrset; 435 Xapian::MSetIterator i = mymset.begin(); 436 myrset.add_document(*i); 437 myrset.add_document(*(++i)); 438 439 // Set min_wt to 0 440 Xapian::ESet eset = enquire.get_eset(50, myrset, 0, 1.0, 0, 0); 441 if (!startswith(get_dbtype(), "multi")) { 442 // For a single database, the weights should be the same with or 443 // without USE_EXACT_TERMFREQ. 444 TEST_EQUAL_DOUBLE(eset[0].get_weight(), 6.08904001099445); 445 TEST_EQUAL_DOUBLE(eset[1].get_weight(), 6.08904001099445); 446 TEST_EQUAL_DOUBLE(eset[2].get_weight(), 4.73383620844021); 447 TEST_EQUAL(eset[49].get_weight() < 0, false); 448 } else { 449 // For multiple databases, we expect that using USE_EXACT_TERMFREQ 450 // will result in different weights in some cases. 
451 TEST_NOT_EQUAL_DOUBLE(eset[0].get_weight(), 6.08904001099445); 452 TEST_EQUAL_DOUBLE(eset[1].get_weight(), 6.08904001099445); 453 TEST_NOT_EQUAL_DOUBLE(eset[2].get_weight(), 4.73383620844021); 454 TEST_EQUAL(eset[49].get_weight() < 0, false); 455 } 456 457 return true; 458 } 459 460 461 // tests that negative weights are returned 462 DEFINE_TESTCASE(expandweights4, backend) { 463 Xapian::Enquire enquire(get_database("apitest_simpledata")); 464 enquire.set_query(Xapian::Query("paragraph")); 465 466 Xapian::MSet mymset = enquire.get_mset(0, 20); 467 468 Xapian::RSet myrset; 469 Xapian::MSetIterator i = mymset.begin(); 470 myrset.add_document(*i); 471 myrset.add_document(*(++i)); 472 473 Xapian::ESet eset = enquire.get_eset(37, myrset, 0, 1.0, 0, -100); 474 // Now include negative weights 475 // Multi backends need bigger sets 476 if (!startswith(get_dbtype(), "multi")) { 477 TEST_EQUAL(eset.size(), 37); 478 TEST_EQUAL(eset[36].get_weight() < 0, true); 479 } else { 480 TEST_EQUAL(eset.size(), 37); 481 TEST_EQUAL(eset[36].get_weight() < 0, true); 482 } 483 return true; 484 } 485 486 424 487 // tests that when specifying maxitems to get_eset, no more than 425 488 // that are returned. 426 489 DEFINE_TESTCASE(expandmaxitems1, backend) {