Ticket #50: patch

File patch, 18.0 kB (added by richard, 19 months ago)

Updated patch resolving the cosmetic issues

  • matcher/Makefile.mk

     
    1515        matcher/phrasepostlist.h\ 
    1616        matcher/remotesubmatch.h\ 
    1717        matcher/selectpostlist.h\ 
     18        matcher/synonympostlist.h\ 
    1819        matcher/valuerangepostlist.h\ 
    1920        matcher/xorpostlist.h 
    2021 
     
    4748        matcher/phrasepostlist.cc\ 
    4849        matcher/rset.cc\ 
    4950        matcher/selectpostlist.cc\ 
     51        matcher/synonympostlist.cc\ 
    5052        matcher/stats.cc\ 
    5153        matcher/tradweight.cc\ 
    5254        matcher/valuerangepostlist.cc\ 
  • matcher/localmatch.cc

     
    33 * Copyright 1999,2000,2001 BrightStation PLC 
    44 * Copyright 2002 Ananova Ltd 
    55 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts 
     6 * Copyright 2007 Lemur Consulting Ltd 
    67 * 
    78 * This program is free software; you can redistribute it and/or 
    89 * modify it under the terms of the GNU General Public License as 
     
    3839#include "mergepostlist.h" 
    3940#include "extraweightpostlist.h" 
    4041#include "valuerangepostlist.h" 
     42#include "synonympostlist.h" 
    4143 
    4244#include "omqueryinternal.h" 
    4345 
     
    262264    } 
    263265} 
    264266 
     267// Convert a list of subqueries into a vector of postlists. 
     268void 
     269LocalSubMatch::postlists_from_queries(std::vector<PostList *> &result, 
     270                                      const Xapian::Query::Internal::subquery_list &queries, 
     271                                      MultiMatch * matcher, bool is_bool) 
     272{ 
     273    Assert(queries.size() >= 2); 
     274 
     275    // Open a postlist for each query, and store these postlists in a vector. 
     276    result.reserve(queries.size()); 
     277 
     278    Xapian::Query::Internal::subquery_list::const_iterator q; 
     279    for (q = queries.begin(); q != queries.end(); q++) { 
     280        result.push_back(postlist_from_query(*q, matcher, is_bool)); 
     281        DEBUGLINE(MATCH, "Made postlist for " << (*q)->get_description() << 
     282                  ": termfreq is: (min, est, max) = (" << 
     283                  result.back()->get_termfreq_min() << ", " << 
     284                  result.back()->get_termfreq_est() << ", " << 
     285                  result.back()->get_termfreq_max() << ")"); 
     286    } 
     287} 
     288 
    265289// Make a postlist from the subqueries of a query object. 
    266290// Operation must be either AND, OR, XOR, PHRASE, NEAR, or ELITE_SET. 
    267291// Optimise query by building tree carefully. 
     
    270294        const Xapian::Query::Internal *query, MultiMatch *matcher, bool is_bool) 
    271295{ 
    272296    DEBUGCALL(MATCH, PostList *, "LocalSubMatch::postlist_from_queries", op << ", " << query << ", " << matcher << ", " << is_bool); 
    273     Assert(op == Xapian::Query::OP_OR || op == Xapian::Query::OP_AND || 
     297    Assert(op == Xapian::Query::OP_OR || 
     298           op == Xapian::Query::OP_AND || 
    274299           op == Xapian::Query::OP_XOR || 
    275            op == Xapian::Query::OP_NEAR || op == Xapian::Query::OP_PHRASE || 
     300           op == Xapian::Query::OP_NEAR || 
     301           op == Xapian::Query::OP_PHRASE || 
    276302           op == Xapian::Query::OP_ELITE_SET); 
    277     const Xapian::Query::Internal::subquery_list &queries = query->subqs; 
    278     Assert(queries.size() >= 2); 
    279303 
    280304    // Open a postlist for each query, and store these postlists in a vector. 
    281305    std::vector<PostList *> postlists; 
    282     postlists.reserve(queries.size()); 
     306    postlists_from_queries(postlists, query->subqs, matcher, is_bool); 
    283307 
    284     Xapian::Query::Internal::subquery_list::const_iterator q; 
    285     for (q = queries.begin(); q != queries.end(); q++) { 
    286         postlists.push_back(postlist_from_query(*q, matcher, is_bool)); 
    287         DEBUGLINE(MATCH, "Made postlist for " << (*q)->get_description() << 
    288                   ": termfreq is: (min, est, max) = (" << 
    289                   postlists.back()->get_termfreq_min() << ", " << 
    290                   postlists.back()->get_termfreq_est() << ", " << 
    291                   postlists.back()->get_termfreq_max() << ")"); 
    292     } 
    293  
    294308    // Build tree 
    295309    switch (op) { 
    296310        case Xapian::Query::OP_XOR: 
     
    427441            pl->set_termweight(wt); 
    428442            RETURN(pl); 
    429443        } 
     444        case Xapian::Query::OP_SYNONYM: 
     445        { 
     446            if (is_bool) { 
     447                // An or postlist returns the same documents as a synonym 
     448                // postlist, and doesn't have the overhead of calculating the 
     449                // term frequency, so is more efficient than a synonym postlist 
     450                // if we don't care about the weights. 
     451                RETURN(postlist_from_queries(Xapian::Query::OP_OR, query, matcher, is_bool)); 
     452            } else { 
     453                AutoPtr<Xapian::Weight> wt; 
     454                // Use a wqf of 1, since we don't have a specific value. 
     455                // Set the term name to "", since we don't have one of them, either. 
     456                wt = wt_factory->create(&statssource, qlen, 1, ""); 
     457 
     458                std::vector<PostList *> postlists; 
     459                postlists_from_queries(postlists, query->subqs, matcher, is_bool); 
     460 
     461                // build_or_tree empties "postlists", but we need to have them 
     462                // available to get statistics, so we need to keep a copy 
     463                // FIXME: there must be a cleaner way for this to work... 
     464                std::vector<PostList *> postlists_orig = postlists; 
     465                PostList *res = build_or_tree(postlists, matcher); 
     466                RETURN(new SynonymPostList(res, postlists_orig, matcher, wt.release())); 
     467            } 
     468        } 
    430469        case Xapian::Query::OP_PHRASE: 
    431470        case Xapian::Query::OP_NEAR: 
    432471            // If no positional information in this sub-database, change the 
  • matcher/localmatch.h

     
    7676    PostList * build_xor_tree(std::vector<PostList *> &postlists, 
    7777                              MultiMatch *matcher); 
    7878 
     79 
     80    /** Convert a list of subqueries into a vector of postlists. 
     81     * 
     82     *  One postlist is built for each entry of @a queries by calling
            *  postlist_from_query(), and the resulting pointers are appended to
            *  @a result in the same order as the subqueries.
            *
            *  The vector stores raw pointers; this method does not delete them.
            *
            *  @param result   Vector to append the new postlists to.  Capacity
            *                  for queries.size() entries is reserved first.
            *  @param queries  The subqueries to convert.  There must be at least
            *                  two of them (checked with Assert()).
            *  @param matcher  Passed unchanged to postlist_from_query().
            *  @param is_bool  Passed unchanged to postlist_from_query().
     83     */ 
     84    void postlists_from_queries(std::vector<PostList *> &result, 
     85                                const Xapian::Query::Internal::subquery_list &queries, 
     86                                MultiMatch *matcher, 
     87                                bool is_bool); 
     88 
    7989    /** Convert the sub-queries of a Query into an optimised PostList tree. 
    8090     * 
    8191     *  We take the sub-queries from @a query, but use @a op instead of 
  • matcher/synonympostlist.h

     
     1/* synonympostlist.h: Combine subqueries, weighting as if they are synonyms 
     2 * 
     3 * Copyright 2007 Lemur Consulting Ltd 
     4 * 
     5 * This program is free software; you can redistribute it and/or modify 
     6 * it under the terms of the GNU General Public License as published by 
     7 * the Free Software Foundation; either version 2 of the License, or 
     8 * (at your option) any later version. 
     9 * 
     10 * This program is distributed in the hope that it will be useful, 
     11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 
     12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
     13 * GNU General Public License for more details. 
     14 * 
     15 * You should have received a copy of the GNU General Public License 
     16 * along with this program; if not, write to the Free Software 
     17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA 
     18 */ 
     19 
     20#ifndef XAPIAN_INCLUDED_SYNONYMPOSTLIST_H 
     21#define XAPIAN_INCLUDED_SYNONYMPOSTLIST_H 
     22 
     23#include "multimatch.h" 
     24#include "postlist.h" 
     25#include <vector> 
     26 
     27/** A postlist comprising several postlists SYNONYMed together. 
     28 * 
     29 *  This postlist returns all postings in the OR of the sub postlists, but 
     30 *  returns weights as if they represented a single term.  The term frequency 
     31 *  portion of the weight is approximated. 
     32 */ 
     33class SynonymPostList : public PostList { 
     34    private: 
     35        PostList * subtree; 
     36        std::vector<PostList *> terms; 
     37 
     38        /** The object which is using this postlist to perform 
     39         *  a match.  This object needs to be notified when the 
     40         *  tree changes such that the maximum weights need to be 
     41         *  recalculated. 
     42         */ 
     43        MultiMatch *matcher; 
     44 
     45        const Xapian::Weight * wt; 
     46        bool want_doclength; 
     47 
     48    public: 
     49        SynonymPostList(PostList *subtree_, 
     50                        const std::vector<PostList *> & terms_, 
     51                        MultiMatch * matcher_, 
     52                        const Xapian::Weight * wt_) 
     53                : subtree(subtree_), 
     54                  terms(terms_), 
     55                  matcher(matcher_), 
     56                  wt(wt_), 
     57                  want_doclength(wt_->get_sumpart_needs_doclength()) 
     58        { 
     59        } 
     60 
     61        PostList *next(Xapian::weight w_min); 
     62        PostList *skip_to(Xapian::docid did, Xapian::weight w_min); 
     63 
     64        Xapian::weight get_weight() const; 
     65        Xapian::weight get_maxweight() const; 
     66        Xapian::weight recalc_maxweight(); 
     67        Xapian::termcount get_wdf() const; 
     68 
     69        // The following methods just call through to the subtree. 
     70        Xapian::doccount get_termfreq_min() const { 
     71            return subtree->get_termfreq_min(); 
     72        } 
     73        Xapian::doccount get_termfreq_est() const { 
     74            return subtree->get_termfreq_est(); 
     75        } 
     76        Xapian::doccount get_termfreq_max() const { 
     77            return subtree->get_termfreq_max(); 
     78        } 
     79        Xapian::docid get_docid() const { 
     80            return subtree->get_docid(); 
     81        } 
     82        Xapian::doclength get_doclength() const { 
     83            return subtree->get_doclength(); 
     84        } 
     85        PositionList * read_position_list() { 
     86            return subtree->read_position_list(); 
     87        } 
     88        PositionList * open_position_list() const { 
     89            return subtree->open_position_list(); 
     90        } 
     91        bool at_end() const { 
     92            return subtree->at_end(); 
     93        } 
     94 
     95        std::string get_description() const { 
     96            return "(Synonym " + subtree->get_description() + ")"; 
     97        } 
     98}; 
     99 
     100#endif /* XAPIAN_INCLUDED_SYNONYMPOSTLIST_H */ 
  • matcher/synonympostlist.cc

    Property changes on: matcher/synonympostlist.h
    ___________________________________________________________________
    Name: svn:eol-style
       + native
    
     
     1/* synonympostlist.cc: Combine subqueries, weighting as if they are synonyms 
     2 * 
     3 * Copyright 2007 Lemur Consulting Ltd 
     4 * 
     5 * This program is free software; you can redistribute it and/or 
     6 * modify it under the terms of the GNU General Public License as 
     7 * published by the Free Software Foundation; either version 2 of the 
     8 * License, or (at your option) any later version. 
     9 * 
     10 * This program is distributed in the hope that it will be useful, 
     11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 
     12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
     13 * GNU General Public License for more details. 
     14 * 
     15 * You should have received a copy of the GNU General Public License 
     16 * along with this program; if not, write to the Free Software 
     17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 
     18 * USA 
     19 */ 
     20 
     21#include <config.h> 
     22 
     23#include "synonympostlist.h" 
     24#include "branchpostlist.h" 
     25#include "omassert.h" 
     26#include "omdebug.h" 
     27 
     28PostList * 
     29SynonymPostList::next(Xapian::weight w_min) 
     30{ 
     31    DEBUGCALL(MATCH, PostList *, "SynonymPostList::next", w_min); 
     32    next_handling_prune(subtree, w_min, matcher); 
     33    RETURN(NULL); 
     34} 
     35 
     36PostList * 
     37SynonymPostList::skip_to(Xapian::docid did, Xapian::weight w_min) 
     38{ 
     39    DEBUGCALL(MATCH, PostList *, "SynonymPostList::skip_to", did << ", " << w_min); 
     40    skip_to_handling_prune(subtree, did, w_min, matcher); 
     41    RETURN(NULL); 
     42} 
     43 
     44Xapian::weight 
     45SynonymPostList::get_weight() const 
     46{ 
     47    return wt->get_sumpart(get_wdf(), want_doclength ? get_doclength() : 0); 
     48} 
     49 
     50Xapian::weight 
     51SynonymPostList::get_maxweight() const 
     52{ 
     53    return wt->get_maxpart(); 
     54} 
     55 
     56Xapian::weight 
     57SynonymPostList::recalc_maxweight() 
     58{ 
     59    return SynonymPostList::get_maxweight(); 
     60} 
     61 
     62Xapian::termcount 
     63SynonymPostList::get_wdf() const { 
     64    std::vector<PostList *>::const_iterator i; 
     65    Xapian::termcount wdf = 0; 
     66    Xapian::docid did = get_docid(); 
     67    for (i = terms.begin(); i != terms.end(); ++i) { 
     68        if ((*i)->get_docid() == did) 
     69            wdf += (*i)->get_wdf(); 
     70    } 
     71    return wdf; 
     72} 
     73 
  • tests/api_db.cc

    Property changes on: matcher/synonympostlist.cc
    ___________________________________________________________________
    Name: svn:eol-style
       + native
    
     
    11291129    return true; 
    11301130} 
    11311131 
     1132// Check a synonym search 
     1133static bool test_synonym1() 
     1134{ 
     1135    Xapian::Database db(get_database("etext")); 
     1136    Xapian::Enquire enquire(db); 
     1137    enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, 
     1138                                    Xapian::Query("date"), 
     1139                                    Xapian::Query("sky"))); 
     1140    Xapian::doccount lots = 214; 
     1141    Xapian::MSet ormset = enquire.get_mset(0, lots); 
     1142 
     1143    enquire.set_query(Xapian::Query(Xapian::Query::OP_SYNONYM, 
     1144                                    Xapian::Query("date"), 
     1145                                    Xapian::Query("sky"))); 
     1146    Xapian::MSet mset = enquire.get_mset(0, lots); 
     1147 
     1148    TEST_NOT_EQUAL(mset.size(), 0); 
     1149    TEST_EQUAL(mset.size(), ormset.size()); 
     1150    for (Xapian::doccount i = 0; i < mset.size(); ++i) { 
     1151        printf("%d,\t%f,\t%d,\t%f\n", 
     1152               *mset[i], mset[i].get_weight(), 
     1153               *ormset[i], ormset[i].get_weight()); 
     1154        //TEST_EQUAL(*mset[i], *ormset[i]); 
     1155        //TEST_EQUAL_DOUBLE(mset[i].get_weight(), ormset[i].get_weight()); 
     1156    } 
     1157    return true; 
     1158} 
     1159 
    11321160// tests that specifying a nonexistent input file throws an exception. 
    11331161static bool test_quartzdatabaseopeningerror1() 
    11341162{ 
     
    17071735    // with that, and testing it there doesn't actually improve the test 
    17081736    // coverage really. 
    17091737    {"consistency1",       test_consistency1}, 
     1738    {"synonym1",           test_synonym1}, 
    17101739    // Would work with remote if we registered the weighting scheme. 
    17111740    // FIXME: do this so we also test that functionality... 
    17121741    {"userweight1",        test_userweight1}, 
     
    17311760    {"keepalive1",         test_keepalive1}, 
    17321761    {"termstats",          test_termstats}, 
    17331762    {"sortvalue1",         test_sortvalue1}, 
     1763    {"synonym1",           test_synonym1}, 
    17341764    {"sortrel1",           test_sortrel1}, 
    17351765    {"netstats1",          test_netstats1}, 
    17361766    {0, 0} 
  • include/xapian/query.h

     
    44/* Copyright 1999,2000,2001 BrightStation PLC 
    55 * Copyright 2002 Ananova Ltd 
    66 * Copyright 2003,2004,2005,2006,2007 Olly Betts 
    7  * Copyright 2006 Lemur Consulting Ltd 
     7 * Copyright 2006,2007 Lemur Consulting Ltd 
    88 * 
    99 * This program is free software; you can redistribute it and/or 
    1010 * modify it under the terms of the GNU General Public License as 
     
    9696            /** Filter by a range test on a document value. */ 
    9797            OP_VALUE_RANGE, 
    9898 
     99            /** Treat a set of queries as synonyms. 
     100             * 
     101             *  This returns all results which match at least one of the 
     102             *  queries, but weighting as if all the sub-queries are instances 
     103             *  of the same term: so multiple matching terms for a document 
     104             *  increase the wdf value used, and the term frequency is based on 
     105             *  the number of documents which would match an OR of all the 
     106             *  subqueries. 
     107             * 
     108             *  The term frequency used will usually be an approximation, 
     109             *  because calculating the precise combined term frequency would 
     110             *  be overly expensive. 
     111             * 
     112             *  Identical to OP_OR, except for the weightings returned. 
     113             */ 
     114            OP_SYNONYM, 
     115 
    99116            /** Select an elite set from the subqueries, and perform 
    100117             *  a query with these combined as an OR query. 
    101118             */ 
  • api/omqueryinternal.cc

     
    33 * Copyright 1999,2000,2001 BrightStation PLC 
    44 * Copyright 2002 Ananova Ltd 
    55 * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts 
    6  * Copyright 2006 Lemur Consulting Ltd 
     6 * Copyright 2006,2007 Lemur Consulting Ltd 
    77 * 
    88 * This program is free software; you can redistribute it and/or 
    99 * modify it under the terms of the GNU General Public License as 
     
    5757        case Xapian::Query::OP_PHRASE: 
    5858        case Xapian::Query::OP_ELITE_SET: 
    5959        case Xapian::Query::OP_VALUE_RANGE: 
     60        case Xapian::Query::OP_SYNONYM: 
    6061            return 0; 
    6162        case Xapian::Query::OP_FILTER: 
    6263        case Xapian::Query::OP_AND_MAYBE: 
     
    8586        case Xapian::Query::OP_NEAR: 
    8687        case Xapian::Query::OP_PHRASE: 
    8788        case Xapian::Query::OP_ELITE_SET: 
     89        case Xapian::Query::OP_SYNONYM: 
    8890            return UINT_MAX; 
    8991        default: 
    9092            Assert(false); 
     
    177179                result += str_parameter; 
    178180                result += om_tostring(parameter); 
    179181                break; 
     182            case Xapian::Query::OP_SYNONYM: 
     183                result += "="; 
     184                break; 
    180185        } 
    181186    } 
    182187    return result; 
     
    202207        case Xapian::Query::OP_PHRASE:          name = "PHRASE"; break; 
    203208        case Xapian::Query::OP_ELITE_SET:       name = "ELITE_SET"; break; 
    204209        case Xapian::Query::OP_VALUE_RANGE:     name = "VALUE_RANGE"; break; 
     210        case Xapian::Query::OP_SYNONYM:         name = "SYNONYM"; break; 
    205211    } 
    206212    return name; 
    207213} 
     
    451457                    return new Xapian::Query::Internal(Xapian::Query::OP_VALUE_RANGE, valno, 
    452458                                                       start, stop); 
    453459                } 
     460                case '=': 
     461                    return qint_from_vector(Xapian::Query::OP_SYNONYM, subqs); 
    454462                default: 
    455463                    DEBUGLINE(UNKNOWN, "Can't parse remainder `" << p - 1 << "'"); 
    456464                    throw Xapian::InvalidArgumentError("Invalid query string"); 
     
    617625        case OP_ELITE_SET: 
    618626        case OP_OR: 
    619627        case OP_XOR: 
     628        case OP_SYNONYM: 
    620629            // Doing an "OR" type operation - if we've got any MatchNothing 
    621630            // subnodes, drop them; except that we mustn't become an empty 
    622631            // node due to this, so we never drop a MatchNothing subnode 
     
    690699                } 
    691700            } 
    692701            break; 
    693         case OP_OR: case OP_AND: case OP_XOR: 
     702        case OP_OR: case OP_AND: case OP_XOR: case OP_SYNONYM: 
    694703            // Remove duplicates if we can. 
    695704            if (subqs.size() > 1) collapse_subqs(); 
    696705            break; 
     
    734743void 
    735744Xapian::Query::Internal::collapse_subqs() 
    736745{ 
    737     Assert(op == OP_OR || op == OP_AND || op == OP_XOR); 
     746    Assert(op == OP_OR || op == OP_AND || op == OP_XOR || op == OP_SYNONYM); 
    738747    typedef set<Xapian::Query::Internal *, SortPosName> subqtable; 
    739748    subqtable sqtab; 
    740749 
     
    809818    Assert(!is_leaf(op)); 
    810819    if (subq == 0) { 
    811820        subqs.push_back(0); 
    812     } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR)) { 
     821    } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR || op == OP_SYNONYM)) { 
    813822        // Distribute the subquery. 
    814823        for (subquery_list::const_iterator i = subq->subqs.begin(); 
    815824             i != subq->subqs.end(); i++) {