Ticket #363: xapian-percent-scaling-without-termlists.patch

File xapian-percent-scaling-without-termlists.patch, 23.4 KB (added by Olly Betts, 15 years ago)

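Patch which implements my idea: work out the percentage scaling from the number of leaf subqueries matching the highest-weighted document (using a new PostList::count_matching_subqs() method and a total subquery count from the QueryOptimiser), instead of reading that document's termlist.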

  • api/leafpostlist.cc

     
    9797    Assert(i != stats.termfreqs.end());
    9898    RETURN(i->second);
    9999}
     100
     101Xapian::termcount
     102LeafPostList::count_matching_subqs() const
     103{
     104    return 1;
     105}
  • api/postlist.cc

     
    11/** @file postlist.cc
    22 * @brief Abstract base class for postlists.
    33 */
    4 /* Copyright (C) 2007 Olly Betts
     4/* Copyright (C) 2007,2009 Olly Betts
    55 *
    66 * This program is free software; you can redistribute it and/or
    77 * modify it under the terms of the GNU General Public License as
     
    2020
    2121#include <config.h>
    2222
     23#include "postlist.h"
     24
    2325#include <xapian/error.h>
    2426
    25 #include "postlist.h"
     27#include "omassert.h"
    2628
    2729using namespace std;
    2830
     
    6971    return skip_to(did, w_min);
    7072}
    7173
     74Xapian::termcount
     75PostList::count_matching_subqs() const
     76{
     77    Assert(false);
     78    return 0;
     79}
     80
    7281}
  • common/leafpostlist.h

     
    11/** @file leafpostlist.h
    22 * @brief Abstract base class for leaf postlists.
    33 */
    4 /* Copyright (C) 2007 Olly Betts
     4/* Copyright (C) 2007,2009 Olly Betts
    55 * Copyright (C) 2009 Lemur Consulting Ltd
    66 *
    77 * This program is free software; you can redistribute it and/or
     
    8585
    8686    TermFreqs get_termfreq_est_using_stats(
    8787        const Xapian::Weight::Internal & stats) const;
     88
     89    Xapian::termcount count_matching_subqs() const;
    8890};
    8991
    9092#endif // XAPIAN_INCLUDED_LEAFPOSTLIST_H
  • common/postlist.h

     
    192192     */
    193193    Internal * skip_to(Xapian::docid did) { return skip_to(did, 0.0); }
    194194
     195    /// Count the number of leaf subqueries which match at the current position.
     196    virtual Xapian::termcount count_matching_subqs() const;
     197
    195198    /// Return a string description of this object.
    196199    virtual std::string get_description() const = 0;
    197200};
  • common/submatch.h

     
    7575    /// Get PostList and term info.
    7676    virtual PostList * get_postlist_and_term_info(MultiMatch *matcher,
    7777        std::map<std::string,
    78                  Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts)
     78                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
     79        Xapian::termcount * total_subqs_ptr)
    7980        = 0;
    8081};
    8182
  • matcher/extraweightpostlist.h

     
    113113            delete pl;
    114114            delete wt;
    115115        }
     116
     117        Xapian::termcount count_matching_subqs() const {
     118            return pl->count_matching_subqs();
     119        }
    116120};
    117121
    118122#endif /* OM_HGUARD_EXTRAWEIGHTPOSTLIST_H */
  • matcher/multimatch.cc

     
    326326    map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
    327327    termfreqandwts_ptr = &termfreqandwts;
    328328
     329    Xapian::termcount total_subqs = 0;
    329330    // Keep a count of matches which we know exist, but we won't see.  This
    330331    // occurs when a submatch is remote, and returns a lower bound on the
    331332    // number of matching documents which is higher than the number of
     
    335336        PostList *pl;
    336337        try {
    337338            pl = leaves[i]->get_postlist_and_term_info(this,
    338                                                        termfreqandwts_ptr);
     339                                                       termfreqandwts_ptr,
     340                                                       &total_subqs);
    339341            if (termfreqandwts_ptr && !termfreqandwts.empty())
    340342                termfreqandwts_ptr = NULL;
    341343            if (is_remote[i]) {
     
    384386    // Empty result set
    385387    Xapian::doccount docs_matched = 0;
    386388    Xapian::weight greatest_wt = 0;
     389    Xapian::termcount greatest_wt_subqs_matched = 0;
    387390    vector<Xapian::Internal::MSetItem> items;
    388391
    389392    // maximum weight a document could possibly have
     
    715718        if (wt > greatest_wt) {
    716719new_greatest_weight:
    717720            greatest_wt = wt;
     721            greatest_wt_subqs_matched = pl->count_matching_subqs();
    718722            if (percent_cutoff) {
    719723                Xapian::weight w = wt * percent_cutoff_factor;
    720724                if (w > min_weight) {
     
    764768            percent_scale = rem_match->get_percent_factor();
    765769        } else
    766770#endif
    767         if (termfreqandwts.size() > 1) {
    768             Xapian::termcount matching_terms = 0;
    769             map<string,
    770                 Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator i;
    771 
    772             // Special case for MatchAll queries.
    773             i = termfreqandwts.find(string());
    774             if (i != termfreqandwts.end()) {
    775                 percent_scale += i->second.termweight;
    776                 ++matching_terms;
    777             }
    778 
    779             Xapian::TermIterator docterms = db.termlist_begin(best->did);
    780             Xapian::TermIterator docterms_end = db.termlist_end(best->did);
    781             while (docterms != docterms_end) {
    782                 i = termfreqandwts.find(*docterms);
    783                 if (i != termfreqandwts.end()) {
    784                     LOGLINE(MATCH, "adding " << i->second.termweight <<
    785                             " to percent_scale for term '" <<
    786                             *docterms << "'");
    787                     percent_scale += i->second.termweight;
    788                     ++matching_terms;
    789                     if (matching_terms == termfreqandwts.size()) break;
    790                 }
    791                 ++docterms;
    792             }
    793 
    794             if (matching_terms < termfreqandwts.size()) {
    795                 if (percent_scale == 0.0) {
    796                     // This happens if the only matching terms are synonyms.
    797                     percent_scale = 1.0;
    798                 } else {
    799                     // OK, work out weight corresponding to 100%
    800                     double denom = 0;
    801                     for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
    802                         denom += i->second.termweight;
    803 
    804                     LOGVALUE(MATCH, denom);
    805                     LOGVALUE(MATCH, percent_scale);
    806                     AssertRel(percent_scale,<=,denom);
    807                     if (denom == 0) {
    808                         // This happens if the top-level operator is OP_SYNONYM.
    809                         percent_scale = 1.0 / greatest_wt;
    810                     } else {
    811                         denom *= greatest_wt;
    812                         AssertRel(denom,>,0);
    813                         percent_scale /= denom;
    814                     }
    815                 }
    816             } else {
    817                 // If all the terms match, the 2 sums of weights cancel.
    818                 percent_scale = 1.0 / greatest_wt;
    819             }
    820         } else {
    821             // If there's only a single term in the query, the top document
    822             // must score 100%.
    823             percent_scale = 1.0 / greatest_wt;
     771        {
     772            percent_scale = greatest_wt_subqs_matched / double(total_subqs);
     773            percent_scale /= greatest_wt;
    824774        }
    825775        Assert(percent_scale > 0);
    826776        if (percent_cutoff) {
  • matcher/localmatch.cc

     
    8888
    8989PostList *
    9090LocalSubMatch::get_postlist_and_term_info(MultiMatch * matcher,
    91         map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts)
     91        map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts,
     92        Xapian::termcount * total_subqs_ptr)
    9293{
    9394    DEBUGCALL(MATCH, PostList *, "LocalSubMatch::get_postlist_and_term_info",
    94               matcher << ", [termfreqandwts]");
     95              matcher << ", [termfreqandwts], [total_subqs_ptr]");
    9596    term_info = termfreqandwts;
    9697
    9798    // Build the postlist tree for the query.  This calls
     
    99100    // which builds term_info as a side effect.
    100101    QueryOptimiser opt(*db, *this, matcher);
    101102    PostList * pl = opt.optimise_query(&orig_query);
     103    *total_subqs_ptr = opt.get_total_subqueries();
    102104
    103105    // We only need an ExtraWeightPostList if there's an extra weight
    104106    // contribution.
  • matcher/localmatch.h

     
    8080
    8181    /// Get PostList and term info.
    8282    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    83         std::map<string, Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts);
     83        std::map<string, Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
     84        Xapian::termcount * total_subqs_ptr);
    8485
    8586    /** Convert a postlist into a synonym postlist.
    8687     */
  • matcher/xorpostlist.h

     
    7777         *  which is at the current document.
    7878         */
    7979        Xapian::termcount get_wdf() const;
     80
     81        Xapian::termcount count_matching_subqs() const;
    8082};
    8183
    8284#endif /* OM_HGUARD_XORPOSTLIST_H */
  • matcher/synonympostlist.h

     
    22 * @brief Combine subqueries, weighting as if they are synonyms
    33 */
    44/* Copyright 2007,2009 Lemur Consulting Ltd
     5 * Copyright 2009 Olly Betts
    56 *
    67 * This program is free software; you can redistribute it and/or modify
    78 * it under the terms of the GNU General Public License as published by
     
    8990    Xapian::termcount get_doclength() const;
    9091    bool at_end() const;
    9192
     93    Xapian::termcount count_matching_subqs() const;
     94
    9295    std::string get_description() const;
    9396};
    9497
  • matcher/remotesubmatch.cc

     
    5858
    5959PostList *
    6060RemoteSubMatch::get_postlist_and_term_info(MultiMatch *,
    61         map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts)
     61        map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts,
     62        Xapian::termcount * total_subqs_ptr)
    6263{
    6364    DEBUGCALL(MATCH, PostList *, "RemoteSubMatch::get_postlist_and_term_info",
    64               "[matcher], " << (void*)termfreqandwts);
     65              "[matcher], " << (void*)termfreqandwts << ", " << (void*)total_subqs_ptr);
    6566    Xapian::MSet mset;
    6667    db->get_mset(mset);
    6768    percent_factor = mset.internal->percent_factor;
    6869    if (termfreqandwts) *termfreqandwts = mset.internal->termfreqandwts;
     70    (void)total_subqs_ptr; // FIXME hmm...
    6971    return new MSetPostList(mset, decreasing_relevance);
    7072}
  • matcher/selectpostlist.h

     
    5656        PositionList * open_position_list() const { return source->open_position_list(); }
    5757        bool at_end() const { return source->at_end(); }
    5858
     59        Xapian::termcount count_matching_subqs() const {
     60            return source->count_matching_subqs();
     61        }
     62
    5963        std::string get_description() const;   
    6064   
    6165        SelectPostList(PostList *source_) : source(source_) { }
  • matcher/externalpostlist.cc

     
    188188    RETURN(source == NULL);
    189189}
    190190
     191Xapian::termcount
     192ExternalPostList::count_matching_subqs() const
     193{
     194    return 1;
     195}
     196
    191197string
    192198ExternalPostList::get_description() const
    193199{
  • matcher/orpostlist.h

     
    7676         *  when the OR is part of a synonym.
    7777         */
    7878        Xapian::termcount get_wdf() const;
     79
     80        Xapian::termcount count_matching_subqs() const;
    7981};
    8082
    8183#endif /* OM_HGUARD_ORPOSTLIST_H */
  • matcher/andmaybepostlist.h

     
    113113         *  when the ANDMAYBE is part of a synonym.
    114114         */
    115115        Xapian::termcount get_wdf() const;
     116
     117        Xapian::termcount count_matching_subqs() const;
    116118};
    117119
    118120#endif /* OM_HGUARD_ANDMAYBEPOSTLIST_H */
  • matcher/andnotpostlist.cc

     
    214214    DEBUGCALL(MATCH, Xapian::termcount, "AndNotPostList::get_wdf", "");
    215215    RETURN(l->get_wdf());
    216216}
     217
     218Xapian::termcount
     219AndNotPostList::count_matching_subqs() const
     220{
     221    DEBUGCALL(MATCH, Xapian::termcount, "AndNotPostList::count_matching_subqs", "");
     222    RETURN(l->count_matching_subqs());
     223}
  • matcher/andnotpostlist.h

     
    7777         * side.
    7878         */
    7979        Xapian::termcount get_wdf() const;
     80
     81        Xapian::termcount count_matching_subqs() const;
    8082};
    8183
    8284#endif /* OM_HGUARD_ANDNOTPOSTLIST_H */
  • matcher/valuerangepostlist.cc

     
    166166    return (db == NULL);
    167167}
    168168
     169Xapian::termcount
     170ValueRangePostList::count_matching_subqs() const
     171{
     172    return 1;
     173}
     174
    169175string
    170176ValueRangePostList::get_description() const
    171177{
  • matcher/valuerangepostlist.h

     
    8585
    8686    bool at_end() const;
    8787
     88    Xapian::termcount count_matching_subqs() const;
     89
    8890    string get_description() const;
    8991};
    9092
  • matcher/queryoptimiser.cc

     
    5959
    6060    switch (query->op) {
    6161        case Xapian::Query::Internal::OP_LEAF:
     62            ++total_subqs;
    6263            if (query->tname.empty()) factor = 0.0;
    6364            RETURN(localsubmatch.postlist_from_op_leaf_query(query, factor));
    6465
    6566        case Xapian::Query::Internal::OP_EXTERNAL_SOURCE: {
     67            ++total_subqs;
    6668            Assert(query->external_source);
    6769            Xapian::Database wrappeddb(new ConstDatabaseWrapper(&db));
    6870            RETURN(new ExternalPostList(wrappeddb,
     
    8183        case Xapian::Query::OP_ELITE_SET:
    8284            RETURN(do_or_like(query, factor));
    8385
    84         case Xapian::Query::OP_SYNONYM:
    85             RETURN(do_synonym(query, factor));
     86        case Xapian::Query::OP_SYNONYM: {
     87            // Save and restore total_subqs so we only add one for the whole
     88            // OP_SYNONYM subquery.
     89            Xapian::termcount save_total_subqs = total_subqs;
     90            PostList * pl = do_synonym(query, factor);
     91            total_subqs = save_total_subqs + 1;
     92            RETURN(pl);
     93        }
    8694
    8795        case Xapian::Query::OP_AND_NOT: {
    8896            AssertEq(query->subqs.size(), 2);
    8997            PostList * l = do_subquery(query->subqs[0], factor);
     98            Xapian::termcount save_total_subqs = total_subqs;
    9099            PostList * r = do_subquery(query->subqs[1], 0.0);
     100            total_subqs = save_total_subqs;
    91101            RETURN(new AndNotPostList(l, r, matcher, db_size));
    92102        }
    93103
     
    99109        }
    100110
    101111        case Xapian::Query::OP_VALUE_RANGE: {
     112            ++total_subqs;
    102113            Xapian::valueno valno(query->parameter);
    103114            const string & range_begin = query->tname;
    104115            const string & range_end = query->str_parameter;
     
    106117        }
    107118
    108119        case Xapian::Query::OP_VALUE_GE: {
     120            ++total_subqs;
    109121            Xapian::valueno valno(query->parameter);
    110122            const string & range_begin = query->tname;
    111123            RETURN(new ValueGePostList(&db, valno, range_begin));
    112124        }
    113125
    114126        case Xapian::Query::OP_VALUE_LE: {
     127            ++total_subqs;
    115128            Xapian::valueno valno(query->parameter);
    116129            const string & range_end = query->tname;
    117130            RETURN(new ValueRangePostList(&db, valno, "", range_end));
     
    393406    if (factor == 0.0) {
    394407        // If we have a factor of 0, we don't care about the weights, so
    395408        // we're just like a normal OR query.
     409        // FIXME: what about count_matching_subqs()?
    396410        RETURN(do_or_like(query, 0.0));
    397411    }
    398412
  • matcher/mergepostlist.cc

     
    227227    Assert(current != -1);
    228228    return plists[current]->get_doclength();
    229229}
     230
     231Xapian::termcount
     232MergePostList::count_matching_subqs() const
     233{
     234    DEBUGCALL(MATCH, Xapian::termcount, "MergePostList::count_matching_subqs", "");
     235    RETURN(plists[current]->count_matching_subqs());
     236}
  • matcher/queryoptimiser.h

     
    4444
    4545    MultiMatch * matcher;
    4646
     47    /** How many leaf subqueries there are.
     48     *
     49     *  Used for scaling percentages when the highest weighted document doesn't
     50     *  "match all terms".
     51     */
     52    Xapian::termcount total_subqs;
     53
    4754    /** Optimise a Xapian::Query::Internal subtree into a PostList subtree.
    4855     *
    4956     *  @param query    The subtree to optimise.
     
    102109                   LocalSubMatch & localsubmatch_,
    103110                   MultiMatch * matcher_)
    104111        : db(db_), db_size(db.get_doccount()), localsubmatch(localsubmatch_),
    105           matcher(matcher_) { }
     112          matcher(matcher_), total_subqs(0) { }
    106113
    107114    PostList * optimise_query(Xapian::Query::Internal * query) {
    108115        return do_subquery(query, 1.0);
    109116    }
     117
     118    Xapian::termcount get_total_subqueries() const { return total_subqs; }
    110119};
    111120
    112121#endif // XAPIAN_INCLUDED_QUERYOPTIMISER_H
  • matcher/mergepostlist.h

     
    7373         */
    7474        virtual Xapian::termcount get_doclength() const;
    7575
     76        Xapian::termcount count_matching_subqs() const;
     77
    7678        MergePostList(vector<PostList *> plists_,
    7779                      MultiMatch *matcher,
    7880                      Xapian::ErrorHandler * errorhandler_);
  • matcher/xorpostlist.cc

     
    335335    if (lhead < rhead) RETURN(l->get_wdf());
    336336    RETURN(r->get_wdf());
    337337}
     338
     339Xapian::termcount
     340XorPostList::count_matching_subqs() const
     341{
     342    DEBUGCALL(MATCH, Xapian::termcount, "XorPostList::count_matching_subqs", "");
     343    if (lhead < rhead) RETURN(l->count_matching_subqs());
     344    RETURN(r->count_matching_subqs());
     345}
  • matcher/synonympostlist.cc

     
    22 * @brief Combine subqueries, weighting as if they are synonyms
    33 */
    44/* Copyright 2007,2009 Lemur Consulting Ltd
     5 * Copyright 2009 Olly Betts
    56 *
    67 * This program is free software; you can redistribute it and/or
    78 * modify it under the terms of the GNU General Public License as
     
    147148    RETURN(subtree->at_end());
    148149}
    149150
     151Xapian::termcount
     152SynonymPostList::count_matching_subqs() const
     153{
     154    return 1;
     155}
     156
    150157std::string
    151158SynonymPostList::get_description() const
    152159{
  • matcher/multiandpostlist.cc

     
    250250    }
    251251    return totwdf;
    252252}
     253
     254Xapian::termcount
     255MultiAndPostList::count_matching_subqs() const
     256{
     257    Xapian::termcount total = 0;
     258    for (size_t i = 0; i < n_kids; ++i) {
     259        total += plist[i]->count_matching_subqs();
     260    }
     261    return total;
     262}
  • matcher/multiandpostlist.h

     
    194194     *  that in general.
    195195     */
    196196    Xapian::termcount get_wdf() const;
     197
     198    Xapian::termcount count_matching_subqs() const;
    197199};
    198200
    199201#endif // XAPIAN_INCLUDED_MULTIANDPOSTLIST_H
  • matcher/orpostlist.cc

     
    299299    if (lhead > rhead) RETURN(r->get_wdf());
    300300    RETURN(l->get_wdf() + r->get_wdf());
    301301}
     302
     303Xapian::termcount
     304OrPostList::count_matching_subqs() const
     305{
     306    DEBUGCALL(MATCH, Xapian::termcount, "OrPostList::count_matching_subqs", "");
     307    if (lhead < rhead) RETURN(l->count_matching_subqs());
     308    if (lhead > rhead) RETURN(r->count_matching_subqs());
     309    RETURN(l->count_matching_subqs() + r->count_matching_subqs());
     310}
  • matcher/andmaybepostlist.cc

     
    193193    if (lhead == rhead) RETURN(l->get_wdf() + r->get_wdf());
    194194    RETURN(l->get_wdf());
    195195}
     196
     197Xapian::termcount
     198AndMaybePostList::count_matching_subqs() const
     199{
     200    if (lhead == rhead)
     201        RETURN(l->count_matching_subqs() + r->count_matching_subqs());
     202    RETURN(l->count_matching_subqs());
     203}
  • matcher/remotesubmatch.h

     
    6262    /// Get PostList and term info.
    6363    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    6464        std::map<std::string,
    65                  Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts);
     65                 Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts,
     66        Xapian::termcount * total_subqs_ptr);
    6667
    6768    /// Get percentage factor - only valid after get_postlist_and_term_info().
    6869    double get_percent_factor() const { return percent_factor; }
  • matcher/externalpostlist.h

     
    8686
    8787    bool at_end() const;
    8888
     89    Xapian::termcount count_matching_subqs() const;
     90
    8991    string get_description() const;
    9092};
    9193
  • tests/api_opsynonym.cc

     
    2424
    2525#include "api_opsynonym.h"
    2626
     27#include<iostream>
    2728#include <map>
    2829#include <set>
    2930#include <vector>
     
    387388        Xapian::Query query2(*i, or_query, date_query);
    388389
    389390        enquire.set_query(query1);
     391        cout << query1.get_description() << endl;
    390392        tout << "query1:" << query1 << '\n';
    391393        Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
    392394        tout << "mset1:" << mset1 << '\n';
    393395        enquire.set_query(query2);
     396        cout << query2.get_description() << endl;
    394397        tout << "query2:" << query2 << '\n';
    395398        Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
    396399        tout << "mset2:" << mset2 << '\n';
    397400
    398401        TEST_NOT_EQUAL(mset1.size(), 0);
    399         TEST_EQUAL(mset1[0].get_percent(), 100.0);
     402        tout << mset2[0].get_percent() << endl;
     403        if (*i != Xapian::Query::OP_XOR) {
     404            TEST_EQUAL(mset1[0].get_percent(), 100.0);
     405        }
    400406        check_msets_contain_same_docs(mset1, mset2);
    401407    }
    402408
  • tests/api_anydb.cc

     
    539539    TEST(i != mymset.end());
    540540    pct = mymset.convert_to_percent(i);
    541541    TEST_REL(pct,>,60);
    542     TEST_REL(pct,<,75);
     542    TEST_REL(pct,<,76);
    543543
    544544    ++i;
    545545