Ticket #537: eset_min_wt.diff

File eset_min_wt.diff, 11.9 KB (added by Dan, 14 years ago)
  • xapian-core/api/omenquire.cc

    diff --git a/xapian-core/api/omenquire.cc b/xapian-core/api/omenquire.cc
    index f2eb486..1cb8e84 100644
    a b  
    44 * Copyright 2001,2002 Ananova Ltd
    55 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
    66 * Copyright 2007,2009 Lemur Consulting Ltd
     7 * Copyright 2011 Dan Colish
    78 *
    89 * This program is free software; you can redistribute it and/or
    910 * modify it under the terms of the GNU General Public License as
    Enquire::Internal::get_mset(Xapian::doccount first, Xapian::doccount maxitems,  
    690691ESet
    691692Enquire::Internal::get_eset(Xapian::termcount maxitems,
    692693                    const RSet & rset, int flags, double k,
    693                     const ExpandDecider * edecider) const
     694                    const ExpandDecider * edecider, Xapian::weight min_wt) const
    694695{
    695     LOGCALL(MATCH, ESet, "Enquire::Internal::get_eset", maxitems | rset | flags | k | edecider);
     696    LOGCALL(MATCH, ESet, "Enquire::Internal::get_eset", maxitems | rset | flags | k | edecider | min_wt);
    696697
    697698    if (maxitems == 0 || rset.empty()) {
    698699        // Either we were asked for no results, or wouldn't produce any
    Enquire::Internal::get_eset(Xapian::termcount maxitems,  
    728729    ExpandWeight eweight(db, rset.size(), use_exact_termfreq, k);
    729730
    730731    Xapian::ESet eset;
    731     eset.internal->expand(maxitems, db, rset, edecider, eweight);
     732    eset.internal->expand(maxitems, db, rset, edecider, eweight, min_wt);
    732733    RETURN(eset);
    733734}
    734735
    Enquire::get_eset(Xapian::termcount maxitems, const RSet & rset, int flags,  
    10431044    LOGCALL(API, Xapian::ESet, "Xapian::Enquire::get_eset", maxitems | rset | flags | k | edecider);
    10441045
    10451046    try {
    1046         RETURN(internal->get_eset(maxitems, rset, flags, k, edecider));
     1047        RETURN(internal->get_eset(maxitems, rset, flags, k, edecider, 0));
     1048    } catch (Error & e) {
     1049        if (internal->errorhandler) (*internal->errorhandler)(e);
     1050        throw;
     1051    }
     1052}
     1053
     1054ESet
     1055Enquire::get_eset(Xapian::termcount maxitems, const RSet & rset, int flags,
     1056                  double k, const ExpandDecider * edecider, Xapian::weight min_wt) const
     1057{
     1058    LOGCALL(API, Xapian::ESet, "Xapian::Enquire::get_eset", maxitems | rset | flags | k | edecider | min_wt);
     1059
     1060    try {
     1061        RETURN(internal->get_eset(maxitems, rset, flags, k, edecider, min_wt));
    10471062    } catch (Error & e) {
    10481063        if (internal->errorhandler) (*internal->errorhandler)(e);
    10491064        throw;
  • xapian-core/common/esetinternal.h

    diff --git a/xapian-core/common/esetinternal.h b/xapian-core/common/esetinternal.h
    index 0807462..7304f64 100644
    a b  
    22 * @brief Xapian::ESet::Internal class
    33 */
    44/* Copyright (C) 2008,2010 Olly Betts
     5 * Copyright (C) 2011 Dan Colish
    56 *
    67 * This program is free software; you can redistribute it and/or modify
    78 * it under the terms of the GNU General Public License as published by
    class ESet::Internal : public Xapian::Internal::RefCntBase {  
    101102                const Xapian::Database & db,
    102103                const Xapian::RSet & rset,
    103104                const Xapian::ExpandDecider * edecider,
    104                 const Xapian::Internal::ExpandWeight & eweight);
     105                const Xapian::Internal::ExpandWeight & eweight,
     106                Xapian::weight min_wt);
    105107
    106108    /// Return a string describing this object.
    107109    std::string get_description() const;
  • xapian-core/common/omenquireinternal.h

    diff --git a/xapian-core/common/omenquireinternal.h b/xapian-core/common/omenquireinternal.h
    index 9c3d825..6cd68fc 100644
    a b  
    44 * Copyright 2001,2002 Ananova Ltd
    55 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
    66 * Copyright 2009 Lemur Consulting Ltd
     7 * Copyright 2011 Dan Colish
    78 *
    89 * This program is free software; you can redistribute it and/or
    910 * modify it under the terms of the GNU General Public License as
    class Enquire::Internal : public Xapian::Internal::RefCntBase {  
    179180                      const RSet *omrset,
    180181                      const MatchDecider *mdecider,
    181182                      const MatchDecider *matchspy_legacy) const;
     183
    182184        ESet get_eset(Xapian::termcount maxitems, const RSet & omrset, int flags,
    183                       double k, const ExpandDecider *edecider) const;
     185                      double k, const ExpandDecider *edecider, Xapian::weight min_wt) const;
    184186
    185187        TermIterator get_matching_terms(Xapian::docid did) const;
    186188        TermIterator get_matching_terms(const Xapian::MSetIterator &it) const;
  • xapian-core/expand/esetinternal.cc

    diff --git a/xapian-core/expand/esetinternal.cc b/xapian-core/expand/esetinternal.cc
    index d8abaf6..5f692f2 100644
    a b  
    22 * @brief Xapian::ESet::Internal class
    33 */
    44/* Copyright (C) 2008,2010 Olly Betts
     5 * Copyright (C) 2011 Dan Colish
    56 *
    67 * This program is free software; you can redistribute it and/or modify
    78 * it under the terms of the GNU General Public License as published by
    ESet::Internal::expand(Xapian::termcount max_esize,  
    138139                       const Xapian::Database & db,
    139140                       const RSet & rset,
    140141                       const Xapian::ExpandDecider * edecider,
    141                        const Xapian::Internal::ExpandWeight & eweight)
     142                       const Xapian::Internal::ExpandWeight & eweight,
     143                       Xapian::weight min_wt)
    142144{
    143145    LOGCALL_VOID(EXPAND, "ESet::Internal::expand", max_esize | db | rset | edecider | eweight);
    144146    // These two cases are handled by our caller.
    ESet::Internal::expand(Xapian::termcount max_esize,  
    149151    Assert(ebound == 0);
    150152    Assert(items.empty());
    151153
    152     Xapian::weight min_wt = 0;
    153 
    154154    AutoPtr<TermList> tree(build_termlist_tree(db, rset));
    155155    Assert(tree.get());
    156156
  • xapian-core/expand/expandweight.cc

    diff --git a/xapian-core/expand/expandweight.cc b/xapian-core/expand/expandweight.cc
    index f8b8567..6202b7e 100644
    a b  
    22 * @brief Calculate term weights for the ESet.
    33 */
    44/* Copyright (C) 2007,2008 Olly Betts
    5  *
     5 * Copyright (C) 2011 Dan Colish
     6 *
    67 * This program is free software; you can redistribute it and/or
    78 * modify it under the terms of the GNU General Public License as
    89 * published by the Free Software Foundation; either version 2 of the
    ExpandWeight::get_weight(TermList * merger, const string & term) const  
    103104    denom = (termfreq - rtermfreq + 0.5) * (reldocs_without_term + 0.5);
    104105    AssertRel(denom,>,0);
    105106
    106     // If the returned termweight would be negative, clamp it to 0.0.
    107     if (rare(num <= denom)) RETURN(0.0);
    108 
    109107    Xapian::weight tw = log(num / denom);
    110108    LOGVALUE(EXPAND, tw);
    111109    LOGVALUE(EXPAND, stats.multiplier);
  • xapian-core/include/xapian/enquire.h

    diff --git a/xapian-core/include/xapian/enquire.h b/xapian-core/include/xapian/enquire.h
    index e7ec821..d9cd439 100644
    a b  
    55 * Copyright 2001,2002 Ananova Ltd
    66 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2011 Olly Betts
    77 * Copyright 2009 Lemur Consulting Ltd
     8 * Copyright 2011 Dan Colish
    89 *
    910 * This program is free software; you can redistribute it and/or
    1011 * modify it under the terms of the GNU General Public License as
    class XAPIAN_VISIBILITY_DEFAULT Enquire {  
    10051006         *  @param edecider  a decision functor to use to decide whether a
    10061007         *                   given term should be put in the ESet
    10071008         *
     1009         *  @param min_wt    the minimum weight for included terms
     1010         *
    10081011         *  @return          An ESet object containing the results of the
    10091012         *                   expand.
    10101013         *
    class XAPIAN_VISIBILITY_DEFAULT Enquire {  
    10151018                        int flags = 0,
    10161019                        double k = 1.0,
    10171020                        const Xapian::ExpandDecider * edecider = 0) const;
    1018 
    10191021        /** Get the expand set for the given rset.
    10201022         *
    10211023         *  @param maxitems  the maximum number of items to return.
    class XAPIAN_VISIBILITY_DEFAULT Enquire {  
    10341036            return get_eset(maxitems, omrset, 0, 1.0, edecider);
    10351037        }
    10361038
     1039        /** Get the expand set for the given rset.
     1040         *
     1041         *  @param maxitems  the maximum number of items to return.
     1042         *  @param omrset    the relevance set to use when performing
     1043         *                   the expand operation.
     1044         *  @param flags     zero or more of these values |-ed together:
     1045         *                    - Xapian::Enquire::INCLUDE_QUERY_TERMS query
     1046         *                      terms may be returned from expand
     1047         *                    - Xapian::Enquire::USE_EXACT_TERMFREQ for multi
     1048         *                      dbs, calculate the exact termfreq; otherwise an
     1049         *                      approximation is used which can greatly improve
     1050         *                      efficiency, but still returns good results.
     1051         *  @param k         the parameter k in the query expansion algorithm
     1052         *                   (default is 1.0)
     1053         *  @param edecider  a decision functor to use to decide whether a
     1054         *                   given term should be put in the ESet
     1055         *
     1056         *  @param min_wt    the minimum weight for included terms
     1057         *
     1058         *  @return          An ESet object containing the results of the
     1059         *                   expand.
     1060         *
     1061         *  @exception Xapian::InvalidArgumentError  See class documentation.
     1062         */
     1063        ESet get_eset(Xapian::termcount maxitems,
     1064                        const RSet & omrset,
     1065                        int flags,
     1066                        double k,
     1067                        const Xapian::ExpandDecider * edecider,
     1068                        Xapian::weight min_wt) const;
     1069
    10371070        /** Get terms which match a given document, by document id.
    10381071         *
    10391072         *  This method returns the terms in the current query which match
  • xapian-core/tests/api_anydb.cc

    diff --git a/xapian-core/tests/api_anydb.cc b/xapian-core/tests/api_anydb.cc
    index 4295e0a..bf5280b 100644
    a b  
    44 * Copyright 2002 Ananova Ltd
    55 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
    66 * Copyright 2006,2008 Lemur Consulting Ltd
     7 * Copyright 2011 Dan Colish
    78 *
    89 * This program is free software; you can redistribute it and/or
    910 * modify it under the terms of the GNU General Public License as
    DEFINE_TESTCASE(expandweights2, backend) {  
    421422    return true;
    422423}
    423424
     425// tests that get_eset() with an explicit min_wt of 0 returns the expected
     426// top weights and that the last requested entry, eset[49], does not have a
     427// negative weight.
     428DEFINE_TESTCASE(expandweights3, backend) {
     429    Xapian::Enquire enquire(get_database("apitest_simpledata"));
     430    enquire.set_query(Xapian::Query("this"));
     431
     432    Xapian::MSet mymset = enquire.get_mset(0, 10);
     433
     434    Xapian::RSet myrset;
     435    Xapian::MSetIterator i = mymset.begin();
     436    myrset.add_document(*i);
     437    myrset.add_document(*(++i));
     438
     439    // Set min_wt to 0
     440    Xapian::ESet eset = enquire.get_eset(50, myrset, 0, 1.0, 0, 0);
     441    if (!startswith(get_dbtype(), "multi")) {
     442        // For a single database, the weights should be the same with or
     443        // without USE_EXACT_TERMFREQ.
     444        TEST_EQUAL_DOUBLE(eset[0].get_weight(), 6.08904001099445);
     445        TEST_EQUAL_DOUBLE(eset[1].get_weight(), 6.08904001099445);
     446        TEST_EQUAL_DOUBLE(eset[2].get_weight(), 4.73383620844021);
     447        TEST_EQUAL(eset[49].get_weight() < 0, false);
     448    } else {
     449        // For multiple databases, USE_EXACT_TERMFREQ isn't set here, so we
     450        // expect the approximated weights to differ in some cases.
     451        TEST_NOT_EQUAL_DOUBLE(eset[0].get_weight(), 6.08904001099445);
     452        TEST_EQUAL_DOUBLE(eset[1].get_weight(), 6.08904001099445);
     453        TEST_NOT_EQUAL_DOUBLE(eset[2].get_weight(), 4.73383620844021);
     454        TEST_EQUAL(eset[49].get_weight() < 0, false);
     455    }
     456
     457    return true;
     458}
     459
     460
     461// tests that terms with negative weights are returned when min_wt is negative
     462DEFINE_TESTCASE(expandweights4, backend) {
     463    Xapian::Enquire enquire(get_database("apitest_simpledata"));
     464    enquire.set_query(Xapian::Query("paragraph"));
     465
     466    Xapian::MSet mymset = enquire.get_mset(0, 20);
     467
     468    Xapian::RSet myrset;
     469    Xapian::MSetIterator i = mymset.begin();
     470    myrset.add_document(*i);
     471    myrset.add_document(*(++i));
     472
     473    Xapian::ESet eset = enquire.get_eset(37, myrset, 0, 1.0, 0, -100);
     474    // Now include negative weights
     475    // Multi backends need bigger sets
     476    if (!startswith(get_dbtype(), "multi")) {
     477        TEST_EQUAL(eset.size(), 37);
     478        TEST_EQUAL(eset[36].get_weight() < 0, true);
     479    } else {
     480        TEST_EQUAL(eset.size(), 37);
     481        TEST_EQUAL(eset[36].get_weight() < 0, true);
     482    }
     483    return true;
     484}
     485
     486
    424487// tests that when specifying maxitems to get_eset, no more than
    425488// that are returned.
    426489DEFINE_TESTCASE(expandmaxitems1, backend) {