Ticket #50: patch

File patch, 18.0 KB (added by Richard Boulton, 17 years ago)

Updated patch resolving the cosmetic issues

  • matcher/Makefile.mk

     
    1515        matcher/phrasepostlist.h\
    1616        matcher/remotesubmatch.h\
    1717        matcher/selectpostlist.h\
     18        matcher/synonympostlist.h\
    1819        matcher/valuerangepostlist.h\
    1920        matcher/xorpostlist.h
    2021
     
    4748        matcher/phrasepostlist.cc\
    4849        matcher/rset.cc\
    4950        matcher/selectpostlist.cc\
     51        matcher/synonympostlist.cc\
    5052        matcher/stats.cc\
    5153        matcher/tradweight.cc\
    5254        matcher/valuerangepostlist.cc\
  • matcher/localmatch.cc

     
    33 * Copyright 1999,2000,2001 BrightStation PLC
    44 * Copyright 2002 Ananova Ltd
    55 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
     6 * Copyright 2007 Lemur Consulting Ltd
    67 *
    78 * This program is free software; you can redistribute it and/or
    89 * modify it under the terms of the GNU General Public License as
     
    3839#include "mergepostlist.h"
    3940#include "extraweightpostlist.h"
    4041#include "valuerangepostlist.h"
     42#include "synonympostlist.h"
    4143
    4244#include "omqueryinternal.h"
    4345
     
    262264    }
    263265}
    264266
     267// Convert a list of subqueries into a vector of postlists.
     268void
     269LocalSubMatch::postlists_from_queries(std::vector<PostList *> &result,
     270                                      const Xapian::Query::Internal::subquery_list &queries,
     271                                      MultiMatch * matcher, bool is_bool)
     272{
     273    Assert(queries.size() >= 2);
     274
     275    // Open a postlist for each query, and store these postlists in a vector.
     276    result.reserve(queries.size());
     277
     278    Xapian::Query::Internal::subquery_list::const_iterator q;
     279    for (q = queries.begin(); q != queries.end(); q++) {
     280        result.push_back(postlist_from_query(*q, matcher, is_bool));
     281        DEBUGLINE(MATCH, "Made postlist for " << (*q)->get_description() <<
     282                  ": termfreq is: (min, est, max) = (" <<
     283                  result.back()->get_termfreq_min() << ", " <<
     284                  result.back()->get_termfreq_est() << ", " <<
     285                  result.back()->get_termfreq_max() << ")");
     286    }
     287}
     288
    265289// Make a postlist from the subqueries of a query object.
    266290// Operation must be either AND, OR, XOR, PHRASE, NEAR, or ELITE_SET.
    267291// Optimise query by building tree carefully.
     
    270294        const Xapian::Query::Internal *query, MultiMatch *matcher, bool is_bool)
    271295{
    272296    DEBUGCALL(MATCH, PostList *, "LocalSubMatch::postlist_from_queries", op << ", " << query << ", " << matcher << ", " << is_bool);
    273     Assert(op == Xapian::Query::OP_OR || op == Xapian::Query::OP_AND ||
     297    Assert(op == Xapian::Query::OP_OR ||
     298           op == Xapian::Query::OP_AND ||
    274299           op == Xapian::Query::OP_XOR ||
    275            op == Xapian::Query::OP_NEAR || op == Xapian::Query::OP_PHRASE ||
     300           op == Xapian::Query::OP_NEAR ||
     301           op == Xapian::Query::OP_PHRASE ||
    276302           op == Xapian::Query::OP_ELITE_SET);
    277     const Xapian::Query::Internal::subquery_list &queries = query->subqs;
    278     Assert(queries.size() >= 2);
    279303
    280304    // Open a postlist for each query, and store these postlists in a vector.
    281305    std::vector<PostList *> postlists;
    282     postlists.reserve(queries.size());
     306    postlists_from_queries(postlists, query->subqs, matcher, is_bool);
    283307
    284     Xapian::Query::Internal::subquery_list::const_iterator q;
    285     for (q = queries.begin(); q != queries.end(); q++) {
    286         postlists.push_back(postlist_from_query(*q, matcher, is_bool));
    287         DEBUGLINE(MATCH, "Made postlist for " << (*q)->get_description() <<
    288                   ": termfreq is: (min, est, max) = (" <<
    289                   postlists.back()->get_termfreq_min() << ", " <<
    290                   postlists.back()->get_termfreq_est() << ", " <<
    291                   postlists.back()->get_termfreq_max() << ")");
    292     }
    293 
    294308    // Build tree
    295309    switch (op) {
    296310        case Xapian::Query::OP_XOR:
     
    427441            pl->set_termweight(wt);
    428442            RETURN(pl);
    429443        }
     444        case Xapian::Query::OP_SYNONYM:
     445        {
     446            if (is_bool) {
     447                // An OR postlist returns the same documents as a synonym
     448                // postlist, and doesn't have the overhead of calculating the
     449                // term frequency, so it is more efficient than a synonym
     450                // postlist if we don't care about the weights.
     451                RETURN(postlist_from_queries(Xapian::Query::OP_OR, query, matcher, is_bool));
     452            } else {
     453                AutoPtr<Xapian::Weight> wt;
     454                // Use a wqf of 1, since we don't have a specific value.
     455                // Set the term name to "", since we don't have one of them, either.
     456                wt = wt_factory->create(&statssource, qlen, 1, "");
     457
     458                std::vector<PostList *> postlists;
     459                postlists_from_queries(postlists, query->subqs, matcher, is_bool);
     460
     461                // build_or_tree empties "postlists", but we need to have them
     462                // available to get statistics, so we need to keep a copy.
     463                // FIXME: there must be a cleaner way for this to work...
     464                std::vector<PostList *> postlists_orig = postlists;
     465                PostList *res = build_or_tree(postlists, matcher);
     466                RETURN(new SynonymPostList(res, postlists_orig, matcher, wt.release()));
     467            }
     468        }
    430469        case Xapian::Query::OP_PHRASE:
    431470        case Xapian::Query::OP_NEAR:
    432471            // If no positional information in this sub-database, change the
  • matcher/localmatch.h

     
    7676    PostList * build_xor_tree(std::vector<PostList *> &postlists,
    7777                              MultiMatch *matcher);
    7878
     79
     80    /** Convert a list of subqueries into a vector of postlists.
     81     *
     82     *  FIXME - expand documentation comment.
     83     */
     84    void postlists_from_queries(std::vector<PostList *> &result,
     85                                const Xapian::Query::Internal::subquery_list &queries,
     86                                MultiMatch *matcher,
     87                                bool is_bool);
     88
    7989    /** Convert the sub-queries of a Query into an optimised PostList tree.
    8090     *
    8191     *  We take the sub-queries from @a query, but use @a op instead of
  • matcher/synonympostlist.h

     
     1/* synonympostlist.h: Combine subqueries, weighting as if they are synonyms
     2 *
     3 * Copyright 2007 Lemur Consulting Ltd
     4 *
     5 * This program is free software; you can redistribute it and/or modify
     6 * it under the terms of the GNU General Public License as published by
     7 * the Free Software Foundation; either version 2 of the License, or
     8 * (at your option) any later version.
     9 *
     10 * This program is distributed in the hope that it will be useful,
     11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 * GNU General Public License for more details.
     14 *
     15 * You should have received a copy of the GNU General Public License
     16 * along with this program; if not, write to the Free Software
     17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     18 */
     19
     20#ifndef XAPIAN_INCLUDED_SYNONYMPOSTLIST_H
     21#define XAPIAN_INCLUDED_SYNONYMPOSTLIST_H
     22
     23#include "multimatch.h"
     24#include "postlist.h"
     25#include <vector>
     26
     27/** A postlist comprising several postlists SYNONYMed together.
     28 *
     29 *  This postlist returns all postings in the OR of the sub postlists, but
     30 *  returns weights as if they represented a single term.  The term frequency
     31 *  portion of the weight is approximated.
     32 */
     33class SynonymPostList : public PostList {
     34    private:
     35        PostList * subtree;
     36        std::vector<PostList *> terms;
     37
     38        /** The object which is using this postlist to perform
     39         *  a match.  This object needs to be notified when the
     40         *  tree changes such that the maximum weights need to be
     41         *  recalculated.
     42         */
     43        MultiMatch *matcher;
     44
     45        const Xapian::Weight * wt;
     46        bool want_doclength;
     47
     48    public:
     49        SynonymPostList(PostList *subtree_,
     50                        const std::vector<PostList *> & terms_,
     51                        MultiMatch * matcher_,
     52                        const Xapian::Weight * wt_)
     53                : subtree(subtree_),
     54                  terms(terms_),
     55                  matcher(matcher_),
     56                  wt(wt_),
     57                  want_doclength(wt_->get_sumpart_needs_doclength())
     58        {
     59        }
     60
     61        PostList *next(Xapian::weight w_min);
     62        PostList *skip_to(Xapian::docid did, Xapian::weight w_min);
     63
     64        Xapian::weight get_weight() const;
     65        Xapian::weight get_maxweight() const;
     66        Xapian::weight recalc_maxweight();
     67        Xapian::termcount get_wdf() const;
     68
     69        // The following methods just call through to the subtree.
     70        Xapian::doccount get_termfreq_min() const {
     71            return subtree->get_termfreq_min();
     72        }
     73        Xapian::doccount get_termfreq_est() const {
     74            return subtree->get_termfreq_est();
     75        }
     76        Xapian::doccount get_termfreq_max() const {
     77            return subtree->get_termfreq_max();
     78        }
     79        Xapian::docid get_docid() const {
     80            return subtree->get_docid();
     81        }
     82        Xapian::doclength get_doclength() const {
     83            return subtree->get_doclength();
     84        }
     85        PositionList * read_position_list() {
     86            return subtree->read_position_list();
     87        }
     88        PositionList * open_position_list() const {
     89            return subtree->open_position_list();
     90        }
     91        bool at_end() const {
     92            return subtree->at_end();
     93        }
     94
     95        std::string get_description() const {
     96            return "(Synonym " + subtree->get_description() + ")";
     97        }
     98};
     99
     100#endif /* XAPIAN_INCLUDED_SYNONYMPOSTLIST_H */
  • matcher/synonympostlist.cc

    Property changes on: matcher/synonympostlist.h
    ___________________________________________________________________
    Name: svn:eol-style
       + native
    
     
     1/* synonympostlist.cc: Combine subqueries, weighting as if they are synonyms
     2 *
     3 * Copyright 2007 Lemur Consulting Ltd
     4 *
     5 * This program is free software; you can redistribute it and/or
     6 * modify it under the terms of the GNU General Public License as
     7 * published by the Free Software Foundation; either version 2 of the
     8 * License, or (at your option) any later version.
     9 *
     10 * This program is distributed in the hope that it will be useful,
     11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 * GNU General Public License for more details.
     14 *
     15 * You should have received a copy of the GNU General Public License
     16 * along with this program; if not, write to the Free Software
     17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
     18 * USA
     19 */
     20
     21#include <config.h>
     22
     23#include "synonympostlist.h"
     24#include "branchpostlist.h"
     25#include "omassert.h"
     26#include "omdebug.h"
     27
     28PostList *
     29SynonymPostList::next(Xapian::weight w_min)
     30{
     31    DEBUGCALL(MATCH, PostList *, "SynonymPostList::next", w_min);
     32    next_handling_prune(subtree, w_min, matcher);
     33    RETURN(NULL);
     34}
     35
     36PostList *
     37SynonymPostList::skip_to(Xapian::docid did, Xapian::weight w_min)
     38{
     39    DEBUGCALL(MATCH, PostList *, "SynonymPostList::skip_to", did << ", " << w_min);
     40    skip_to_handling_prune(subtree, did, w_min, matcher);
     41    RETURN(NULL);
     42}
     43
     44Xapian::weight
     45SynonymPostList::get_weight() const
     46{
     47    return wt->get_sumpart(get_wdf(), want_doclength ? get_doclength() : 0);
     48}
     49
     50Xapian::weight
     51SynonymPostList::get_maxweight() const
     52{
     53    return wt->get_maxpart();
     54}
     55
     56Xapian::weight
     57SynonymPostList::recalc_maxweight()
     58{
     59    return SynonymPostList::get_maxweight();
     60}
     61
     62Xapian::termcount
     63SynonymPostList::get_wdf() const {
     64    std::vector<PostList *>::const_iterator i;
     65    Xapian::termcount wdf = 0;
     66    Xapian::docid did = get_docid();
     67    for (i = terms.begin(); i != terms.end(); ++i) {
     68        if ((*i)->get_docid() == did)
     69            wdf += (*i)->get_wdf();
     70    }
     71    return wdf;
     72}
     73
  • tests/api_db.cc

    Property changes on: matcher/synonympostlist.cc
    ___________________________________________________________________
    Name: svn:eol-style
       + native
    
     
    11291129    return true;
    11301130}
    11311131
     1132// Check a synonym search
     1133static bool test_synonym1()
     1134{
     1135    Xapian::Database db(get_database("etext"));
     1136    Xapian::Enquire enquire(db);
     1137    enquire.set_query(Xapian::Query(Xapian::Query::OP_OR,
     1138                                    Xapian::Query("date"),
     1139                                    Xapian::Query("sky")));
     1140    Xapian::doccount lots = 214;
     1141    Xapian::MSet ormset = enquire.get_mset(0, lots);
     1142
     1143    enquire.set_query(Xapian::Query(Xapian::Query::OP_SYNONYM,
     1144                                    Xapian::Query("date"),
     1145                                    Xapian::Query("sky")));
     1146    Xapian::MSet mset = enquire.get_mset(0, lots);
     1147
     1148    TEST_NOT_EQUAL(mset.size(), 0);
     1149    TEST_EQUAL(mset.size(), ormset.size());
     1150    for (Xapian::doccount i = 0; i < mset.size(); ++i) {
     1151        printf("%d,\t%f,\t%d,\t%f\n",
     1152               *mset[i], mset[i].get_weight(),
     1153               *ormset[i], ormset[i].get_weight());
     1154        //TEST_EQUAL(*mset[i], *ormset[i]);
     1155        //TEST_EQUAL_DOUBLE(mset[i].get_weight(), ormset[i].get_weight());
     1156    }
     1157    return true;
     1158}
     1159
    11321160// tests that specifying a nonexistent input file throws an exception.
    11331161static bool test_quartzdatabaseopeningerror1()
    11341162{
     
    17071735    // with that, and testing it there doesn't actually improve the test
    17081736    // coverage really.
    17091737    {"consistency1",       test_consistency1},
     1738    {"synonym1",           test_synonym1},
    17101739    // Would work with remote if we registered the weighting scheme.
    17111740    // FIXME: do this so we also test that functionality...
    17121741    {"userweight1",        test_userweight1},
     
    17311760    {"keepalive1",         test_keepalive1},
    17321761    {"termstats",          test_termstats},
    17331762    {"sortvalue1",         test_sortvalue1},
     1763    {"synonym1",           test_synonym1},
    17341764    {"sortrel1",           test_sortrel1},
    17351765    {"netstats1",          test_netstats1},
    17361766    {0, 0}
  • include/xapian/query.h

     
    44/* Copyright 1999,2000,2001 BrightStation PLC
    55 * Copyright 2002 Ananova Ltd
    66 * Copyright 2003,2004,2005,2006,2007 Olly Betts
    7  * Copyright 2006 Lemur Consulting Ltd
     7 * Copyright 2006,2007 Lemur Consulting Ltd
    88 *
    99 * This program is free software; you can redistribute it and/or
    1010 * modify it under the terms of the GNU General Public License as
     
    9696            /** Filter by a range test on a document value. */
    9797            OP_VALUE_RANGE,
    9898
     99            /** Treat a set of queries as synonyms.
     100             *
      101             *  This returns all results which match at least one of the
      102             *  queries, but weights them as if all the sub-queries are
      103             *  instances of the same term: so multiple matching terms for a
      104             *  document increase the wdf value used, and the term frequency is
      105             *  based on the number of documents which would match an OR of all
      106             *  the subqueries.
     107             *
     108             *  The term frequency used will usually be an approximation,
     109             *  because calculating the precise combined term frequency would
     110             *  be overly expensive.
     111             *
     112             *  Identical to OP_OR, except for the weightings returned.
     113             */
     114            OP_SYNONYM,
     115
    99116            /** Select an elite set from the subqueries, and perform
    100117             *  a query with these combined as an OR query.
    101118             */
  • api/omqueryinternal.cc

     
    33 * Copyright 1999,2000,2001 BrightStation PLC
    44 * Copyright 2002 Ananova Ltd
    55 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
    6  * Copyright 2006 Lemur Consulting Ltd
     6 * Copyright 2006,2007 Lemur Consulting Ltd
    77 *
    88 * This program is free software; you can redistribute it and/or
    99 * modify it under the terms of the GNU General Public License as
     
    5757        case Xapian::Query::OP_PHRASE:
    5858        case Xapian::Query::OP_ELITE_SET:
    5959        case Xapian::Query::OP_VALUE_RANGE:
     60        case Xapian::Query::OP_SYNONYM:
    6061            return 0;
    6162        case Xapian::Query::OP_FILTER:
    6263        case Xapian::Query::OP_AND_MAYBE:
     
    8586        case Xapian::Query::OP_NEAR:
    8687        case Xapian::Query::OP_PHRASE:
    8788        case Xapian::Query::OP_ELITE_SET:
     89        case Xapian::Query::OP_SYNONYM:
    8890            return UINT_MAX;
    8991        default:
    9092            Assert(false);
     
    177179                result += str_parameter;
    178180                result += om_tostring(parameter);
    179181                break;
     182            case Xapian::Query::OP_SYNONYM:
     183                result += "=";
     184                break;
    180185        }
    181186    }
    182187    return result;
     
    202207        case Xapian::Query::OP_PHRASE:          name = "PHRASE"; break;
    203208        case Xapian::Query::OP_ELITE_SET:       name = "ELITE_SET"; break;
    204209        case Xapian::Query::OP_VALUE_RANGE:     name = "VALUE_RANGE"; break;
     210        case Xapian::Query::OP_SYNONYM:         name = "SYNONYM"; break;
    205211    }
    206212    return name;
    207213}
     
    451457                    return new Xapian::Query::Internal(Xapian::Query::OP_VALUE_RANGE, valno,
    452458                                                       start, stop);
    453459                }
     460                case '=':
     461                    return qint_from_vector(Xapian::Query::OP_SYNONYM, subqs);
    454462                default:
    455463                    DEBUGLINE(UNKNOWN, "Can't parse remainder `" << p - 1 << "'");
    456464                    throw Xapian::InvalidArgumentError("Invalid query string");
     
    617625        case OP_ELITE_SET:
    618626        case OP_OR:
    619627        case OP_XOR:
     628        case OP_SYNONYM:
    620629            // Doing an "OR" type operation - if we've got any MatchNothing
    621630            // subnodes, drop them; except that we mustn't become an empty
    622631            // node due to this, so we never drop a MatchNothing subnode
     
    690699                }
    691700            }
    692701            break;
    693         case OP_OR: case OP_AND: case OP_XOR:
     702        case OP_OR: case OP_AND: case OP_XOR: case OP_SYNONYM:
    694703            // Remove duplicates if we can.
    695704            if (subqs.size() > 1) collapse_subqs();
    696705            break;
     
    734743void
    735744Xapian::Query::Internal::collapse_subqs()
    736745{
    737     Assert(op == OP_OR || op == OP_AND || op == OP_XOR);
     746    Assert(op == OP_OR || op == OP_AND || op == OP_XOR || op == OP_SYNONYM);
    738747    typedef set<Xapian::Query::Internal *, SortPosName> subqtable;
    739748    subqtable sqtab;
    740749
     
    809818    Assert(!is_leaf(op));
    810819    if (subq == 0) {
    811820        subqs.push_back(0);
    812     } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR)) {
     821    } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR || op == OP_SYNONYM)) {
    813822        // Distribute the subquery.
    814823        for (subquery_list::const_iterator i = subq->subqs.begin();
    815824             i != subq->subqs.end(); i++) {