Ticket #50: opsynonym_changes_12434_12435.patch

File opsynonym_changes_12434_12435.patch, 39.7 KB (added by Richard Boulton, 16 years ago)

Recent version of patch

  • xapian-maintainer-tools/win32msvc/win32_matcher.mak

     
    3434    $(INTDIR)\queryoptimiser.obj\
    3535    $(INTDIR)\rset.obj\
    3636    $(INTDIR)\selectpostlist.obj\
     37    $(INTDIR)\synonympostlist.obj\
    3738    $(INTDIR)\valuerangepostlist.obj\
    3839    $(INTDIR)\valuegepostlist.obj\
    3940    $(INTDIR)\xorpostlist.obj\
     
    6061    $(INTDIR)\queryoptimiser.cc\
    6162    $(INTDIR)\rset.cc\
    6263    $(INTDIR)\selectpostlist.cc\
     64    $(INTDIR)\synonympostlist.cc\
    6365    $(INTDIR)\valuerangepostlist.cc\
    6466    $(INTDIR)\valuegepostlist.cc\
    6567    $(INTDIR)\xorpostlist.cc\
  • xapian-core/queryparser/queryparser.lemony

     
    279279            end = db.synonyms_end(term);
    280280        }
    281281        while (syn != end) {
    282             q = Query(Query::OP_OR, q, Query(*syn, 1, pos));
     282            q = Query(Query::OP_SYNONYM, q, Query(*syn, 1, pos));
    283283            ++syn;
    284284        }
    285285    }
     
    345345        }
    346346    }
    347347    delete this;
    348     return new Query(Query::OP_OR, subqs.begin(), subqs.end());
     348    return new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
    349349}
    350350
    351351Query *
     
    357357    for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
    358358        string root = *piter;
    359359        root += name;
     360        vector<Query> subqs2;
    360361        TermIterator t = db.allterms_begin(root);
    361362        while (t != db.allterms_end(root)) {
    362             subqs.push_back(Query(*t, 1, pos));
     363            subqs2.push_back(Query(*t, 1, pos));
    363364            ++t;
    364365        }
     366        subqs.push_back(Query(Query::OP_SYNONYM, subqs2.begin(), subqs2.end()));
    365367        // Add the term, as it would normally be handled, as an alternative.
    366368        subqs.push_back(Query(make_term(*piter), 1, pos));
    367369    }
     
    11681170                subqs2.push_back(Query(*syn, 1, pos));
    11691171                ++syn;
    11701172            }
    1171             Query q_synonym_terms(Query::OP_OR, subqs2.begin(), subqs2.end());
     1173            Query q_synonym_terms(Query::OP_SYNONYM, subqs2.begin(), subqs2.end());
    11721174            subqs2.clear();
    1173             subqs.push_back(Query(Query::OP_OR,
     1175            subqs.push_back(Query(Query::OP_SYNONYM,
    11741176                                  q_original_terms, q_synonym_terms));
    11751177        }
    11761178    } else {
  • xapian-core/matcher/Makefile.mk

     
    1818        matcher/queryoptimiser.h\
    1919        matcher/remotesubmatch.h\
    2020        matcher/selectpostlist.h\
     21        matcher/synonympostlist.h\
    2122        matcher/valuegepostlist.h\
    2223        matcher/valuerangepostlist.h\
    2324        matcher/xorpostlist.h
     
    5455        matcher/queryoptimiser.cc\
    5556        matcher/rset.cc\
    5657        matcher/selectpostlist.cc\
     58        matcher/synonympostlist.cc\
    5759        matcher/valuegepostlist.cc\
    5860        matcher/valuerangepostlist.cc\
    5961        matcher/xorpostlist.cc
  • xapian-core/matcher/multimatch.cc

     
    791791                LOGVALUE(MATCH, denom);
    792792                LOGVALUE(MATCH, percent_scale);
    793793                Assert(percent_scale <= denom);
    794                 denom *= greatest_wt;
    795                 Assert(denom > 0);
    796                 percent_scale /= denom;
     794                if (denom == 0) {
     795                    percent_scale = 1.0 / greatest_wt;
     796                } else {
     797                    denom *= greatest_wt;
     798                    Assert(denom > 0);
     799                    percent_scale /= denom;
     800                }
    797801            } else {
    798802                // If all the terms match, the 2 sums of weights cancel
    799803                percent_scale = 1.0 / greatest_wt;
  • xapian-core/matcher/localmatch.cc

     
    3131#include "omdebug.h"
    3232#include "omqueryinternal.h"
    3333#include "queryoptimiser.h"
     34#include "synonympostlist.h"
    3435#include "weightinternal.h"
    3536
    3637#include <cfloat>
     
    111112}
    112113
    113114PostList *
     115LocalSubMatch::make_synonym_postlist(PostList * or_pl, MultiMatch * matcher,
     116                                     double factor)
     117{
            // Wrap or_pl in a SynonymPostList and attach a freshly cloned weight
            // object to it, so that the wrapper computes the weights itself
            // (SynonymPostList::get_weight() uses its own weight object).
     118    DEBUGCALL(MATCH, PostList *, "LocalSubMatch::make_synonym_postlist",
     119              "[or_pl], [matcher], " << factor);
     120    LOGVALUE(MATCH, or_pl->get_termfreq_est());
            // Both objects are held by AutoPtr until they are handed over below.
     121    AutoPtr<SynonymPostList> res(new SynonymPostList(or_pl, matcher));
     122    AutoPtr<Xapian::Weight> wt(wt_factory->clone_());
     123
     124    // FIXME - calculate the reltermfreq to use and pass it in?
            // The estimated term frequency of the wrapped postlist is used as the
            // term frequency of the synonym as a whole.
     125    wt->init_(*stats, qlen, factor, or_pl->get_termfreq_est());
     126
            // set_weight() takes ownership of the weight object; the
            // SynonymPostList itself is released to the caller.
     127    res->set_weight(wt.release());
     128    RETURN(res.release());
     129}
     130
     131PostList *
    114132LocalSubMatch::postlist_from_op_leaf_query(const Xapian::Query::Internal *query,
    115133                                           double factor)
    116134{
     
    132150        Xapian::doccount tf = stats->get_termfreq(query->tname);
    133151        Xapian::weight weight = boolean ? 0 : wt->get_maxpart();
    134152        Xapian::MSet::Internal::TermFreqAndWeight info(tf, weight);
     153        LOGLINE(MATCH, "Setting term_info[" << query->tname << "] to (" << tf << ", " << weight << ")");
    135154        term_info.insert(make_pair(query->tname, info));
    136155    } else if (!boolean) {
    137156        i->second.termweight += wt->get_maxpart();
     157        AssertEq(stats->get_termfreq(query->tname), i->second.termfreq);
     158        LOGLINE(MATCH, "Increasing term_info[" << query->tname << "] to (" << i->second.termfreq << ", " << i->second.termweight << ")");
    138159    }
    139160
    140161    LeafPostList * pl = db->open_post_list(query->tname);
  • xapian-core/matcher/localmatch.h

     
    8282    PostList * get_postlist_and_term_info(MultiMatch *matcher,
    8383        std::map<string, Xapian::MSet::Internal::TermFreqAndWeight> *termfreqandwts);
    8484
     85    /** Convert a postlist into a synonym postlist.
            *
            *  The postlist returned wraps @a or_pl in a SynonymPostList which
            *  computes weights using a newly cloned weighting object.
            *
            *  @param or_pl    The postlist to wrap (QueryOptimiser::do_synonym()
            *                  passes an OR-like tree built with a factor of 0).
            *  @param matcher  The matcher, passed on to the SynonymPostList.
            *  @param factor   Scale factor used to initialise the weighting
            *                  object.
            *
            *  @return         The new SynonymPostList.
     86     */
     87    PostList * make_synonym_postlist(PostList * or_pl, MultiMatch * matcher,
     88                                     double factor);
     89
    8590    /** Convert an OP_LEAF query to a PostList.
    8691     *
    8792     *  This is called by QueryOptimiser when it reaches an OP_LEAF query.
  • xapian-core/matcher/synonympostlist.h

     
     1/** @file synonympostlist.h
     2 * @brief Combine subqueries, weighting as if they are synonyms
     3 */
     4/* Copyright 2007 Lemur Consulting Ltd
     5 *
     6 * This program is free software; you can redistribute it and/or modify
     7 * it under the terms of the GNU General Public License as published by
     8 * the Free Software Foundation; either version 2 of the License, or
     9 * (at your option) any later version.
     10 *
     11 * This program is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 * GNU General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU General Public License
     17 * along with this program; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
     19 */
     20
     21#ifndef XAPIAN_INCLUDED_SYNONYMPOSTLIST_H
     22#define XAPIAN_INCLUDED_SYNONYMPOSTLIST_H
     23
     24#include "multimatch.h"
     25#include "postlist.h"
     26
     27/** A postlist comprising several postlists SYNONYMed together.
     28 *
     29 *  This postlist returns all postings in the OR of the sub postlists, but
     30 *  returns weights as if they represented a single term.  The term frequency
     31 *  portion of the weight is approximated.
        *
        *  The subtree and the weighting object are held through raw pointers and
        *  are both deleted by the destructor.
        *
        *  NOTE(review): no copy constructor or assignment operator is declared
        *  here - confirm that PostList already prevents copying.
     32 */
     33class SynonymPostList : public PostList {
     34    /** The subtree, which starts as an OR of all the sub-postlists being
     35     *  joined with Synonym, but may decay into something else.
     36     */
     37    PostList * subtree;
     38
     39    /** The object which is using this postlist to perform a match.
     40     *
     41     *  This object needs to be notified when the tree changes such that the
     42     *  maximum weights need to be recalculated.
     43     */
     44    MultiMatch *matcher;
     45
     46    /** Weighting object used for calculating the synonym weights.
            *
            *  NULL until set_weight() has been called.
     47     */
     48    const Xapian::Weight * wt;
     49
     50    /** Flag indicating whether the weighting object needs the doclength.
            *
            *  Updated by set_weight(); false until then.
     51     */
     52    bool want_doclength;
     53
     54  public:
           /** Wrap @a subtree_, which is deleted by the destructor.
            *
            *  No weighting object is set yet - call set_weight() before asking
            *  for weights.
            */
     55    SynonymPostList(PostList *subtree_, MultiMatch * matcher_);
     56
           /// Deletes the weighting object and the subtree.
     57    ~SynonymPostList();
     58
     59    /** Set the weight object to be used for the synonym postlist.
     60     *
     61     *  Ownership of the weight object passes to the synonym postlist - the
     62     *  caller must not delete it after use.
     63     */
     64    void set_weight(const Xapian::Weight * wt_);
     65
           // Both of these advance the subtree and always return NULL.
     66    PostList *next(Xapian::weight w_min);
     67    PostList *skip_to(Xapian::docid did, Xapian::weight w_min);
     68
           // Weights are calculated by the weighting object set with set_weight(),
           // not taken from the subtree.
     69    Xapian::weight get_weight() const;
     70    Xapian::weight get_maxweight() const;
     71    Xapian::weight recalc_maxweight();
     72
     73    // The following methods just call through to the subtree.
     74    Xapian::termcount get_wdf() const;
     75    Xapian::doccount get_termfreq_min() const;
     76    Xapian::doccount get_termfreq_est() const;
     77    Xapian::doccount get_termfreq_max() const;
     78    Xapian::docid get_docid() const;
     79    Xapian::termcount get_doclength() const;
     80    PositionList * read_position_list();
     81    PositionList * open_position_list() const;
     82    bool at_end() const;
     83
           /// Returns the subtree's description wrapped as "(Synonym ...)".
     84    std::string get_description() const;
     85};
     86
     87#endif /* XAPIAN_INCLUDED_SYNONYMPOSTLIST_H */
  • xapian-core/matcher/queryoptimiser.cc

    Property changes on: xapian-core/matcher/synonympostlist.h
    ___________________________________________________________________
    Added: svn:eol-style
       + native
    
     
    122122            RETURN(do_subquery(query->subqs[0], sub_factor));
    123123        }
    124124
     125        case Xapian::Query::OP_SYNONYM: {
     126            RETURN(do_synonym(query, factor));
     127        }
     128
    125129        default:
    126130            Assert(false);
    127131            RETURN(NULL);
     
    304308    // for AND-like operations.
    305309    Xapian::Query::Internal::op_t op = query->op;
    306310    Assert(op == Xapian::Query::OP_ELITE_SET || op == Xapian::Query::OP_OR ||
    307            op == Xapian::Query::OP_XOR);
     311           op == Xapian::Query::OP_XOR || op == Xapian::Query::OP_SYNONYM);
     312
     313    // We build an OR tree for OP_SYNONYM.  (The resulting tree will then be
     314    // passed into a SynonymPostList, from which the weightings will come.)
     315    if (op == Xapian::Query::OP_SYNONYM) {
     316        op = Xapian::Query::OP_OR;
     317    }
    308318
    309319    const Xapian::Query::Internal::subquery_list &queries = query->subqs;
    310320    AssertRel(queries.size(), >=, 2);
     
    382392                  ComparePostListTermFreqAscending());
    383393    }
    384394}
     395
     396PostList *
     397QueryOptimiser::do_synonym(const Xapian::Query::Internal *query, double factor)
     398{
            // Turn an OP_SYNONYM subtree into a PostList: an OR-like tree of the
            // subqueries, wrapped in a synonym postlist unless weights are unwanted.
     399    DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_synonym",
     400              query << ", " << factor);
     401
     402    if (factor == 0.0) {
     403        // If we have a factor of 0, we don't care about the weights, so
     404        // we're just like a normal OR query.
     405        RETURN(do_or_like(query, 0.0));
     406    }
     407
     408    AssertEq(query->wqf, 0); // FIXME - should we be doing something with the wqf?
     409
            // The OR tree is built with a factor of 0.0; the requested factor is
            // passed to make_synonym_postlist() instead, which uses it to
            // initialise the weight object of the wrapping postlist.
     410    RETURN(localsubmatch.make_synonym_postlist(do_or_like(query, 0.0),
     411                                               matcher, factor));
     412}
  • xapian-core/matcher/queryoptimiser.h

     
    8888     */
    8989    PostList * do_or_like(const Xapian::Query::Internal *query, double factor);
    9090
     91    /** Optimise a synonym Xapian::Query::Internal subtree into a PostList.
     92     *
            *  If @a factor is 0 the result is the same as for an OR-like query.
            *  Otherwise the OR-like postlist is built with a factor of 0 and
            *  wrapped in a synonym postlist, which is given @a factor.
            *
     93     *  @param query    The subtree to optimise.
     94     *  @param factor   How much to scale weights for this subtree by.
     95     *
     96     *  @return         A PostList subtree.
     97     */
     98    PostList * do_synonym(const Xapian::Query::Internal *query, double factor);
     99
    91100  public:
    92101    QueryOptimiser(const Xapian::Database::Internal & db_,
    93102                   LocalSubMatch & localsubmatch_,
  • xapian-core/matcher/synonympostlist.cc

     
     1/* synonympostlist.cc: Combine subqueries, weighting as if they are synonyms
     2 *
     3 * Copyright 2007 Lemur Consulting Ltd
     4 *
     5 * This program is free software; you can redistribute it and/or
     6 * modify it under the terms of the GNU General Public License as
     7 * published by the Free Software Foundation; either version 2 of the
     8 * License, or (at your option) any later version.
     9 *
     10 * This program is distributed in the hope that it will be useful,
     11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 * GNU General Public License for more details.
     14 *
     15 * You should have received a copy of the GNU General Public License
     16 * along with this program; if not, write to the Free Software
     17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
     18 * USA
     19 */
     20
     21#include <config.h>
     22
     23#include "synonympostlist.h"
     24
     25#include "branchpostlist.h"
     26#include "debuglog.h"
     27
     28SynonymPostList::SynonymPostList(PostList *subtree_,
     29                                 MultiMatch * matcher_)
     30        : subtree(subtree_),
     31          matcher(matcher_),
     32          wt(NULL),
     33          want_doclength(false)
     34{
           // Only stores the subtree and matcher.  The weight object stays NULL
           // (and want_doclength false) until set_weight() is called.
     35}
     36
     37SynonymPostList::~SynonymPostList()
     38{
           // This postlist owns both the weight object passed to set_weight() and
           // the subtree it wraps.
     39    delete wt;
     40    delete subtree;
     41}
     42
     43void
     44SynonymPostList::set_weight(const Xapian::Weight * wt_)
     45{
           // Replace the weight object, deleting any previous one.
           // NOTE(review): wt_ is dereferenced below, so it must not be NULL, and
           // passing the object already held would delete it before use - confirm
           // no caller does either.
     46    delete wt;
     47    wt = wt_;
           // Remember whether the document length is needed, so that get_weight()
           // only fetches it when it is.
     48    want_doclength = wt_->get_sumpart_needs_doclength_();
     49}
     50
     51PostList *
     52SynonymPostList::next(Xapian::weight w_min)
     53{
     54    LOGCALL(MATCH, PostList *, "SynonymPostList::next", w_min);
     55    next_handling_prune(subtree, w_min, matcher);
     56    RETURN(NULL);
     57}
     58
     59PostList *
     60SynonymPostList::skip_to(Xapian::docid did, Xapian::weight w_min)
     61{
     62    LOGCALL(MATCH, PostList *, "SynonymPostList::skip_to", did << ", " << w_min);
     63    skip_to_handling_prune(subtree, did, w_min, matcher);
     64    RETURN(NULL);
     65}
     66
     67Xapian::weight
     68SynonymPostList::get_weight() const
     69{
           // The weight is calculated by the synonym's own weight object from the
           // subtree's wdf; the document length is only fetched if that object
           // said it needs it (0 is passed otherwise).
           // NOTE(review): wt is dereferenced unconditionally, so set_weight()
           // must have been called first.
     70    return wt->get_sumpart(get_wdf(), want_doclength ? get_doclength() : 0);
     71}
     72
     73Xapian::weight
     74SynonymPostList::get_maxweight() const
     75{
           // The maximum comes from the weight object alone; the subtree is not
           // consulted.  Requires set_weight() to have been called (wt non-NULL).
     76    return wt->get_maxpart();
     77}
     78
     79Xapian::weight
     80SynonymPostList::recalc_maxweight()
     81{
           // No maximum is cached by this class, so recalculating is the same as
           // asking for it.  The call is explicitly qualified with the class name.
     82    return SynonymPostList::get_maxweight();
     83}
     84
     85Xapian::termcount
     86SynonymPostList::get_wdf() const {
           // Forwarded to the subtree; get_weight() uses this value as the wdf.
     87    return subtree->get_wdf();
     88}
     89
     90Xapian::doccount
     91SynonymPostList::get_termfreq_min() const {
           // Forwarded to the subtree unchanged.
     92    return subtree->get_termfreq_min();
     93}
     94
     95Xapian::doccount
     96SynonymPostList::get_termfreq_est() const {
           // Forwarded to the subtree unchanged.
     97    return subtree->get_termfreq_est();
     98}
     99
     100Xapian::doccount
     101SynonymPostList::get_termfreq_max() const {
            // Forwarded to the subtree unchanged.
     102    return subtree->get_termfreq_max();
     103}
     104
     105Xapian::docid
     106SynonymPostList::get_docid() const {
            // The current document is whatever the subtree is positioned on.
     107    return subtree->get_docid();
     108}
     109
     110Xapian::termcount
     111SynonymPostList::get_doclength() const {
            // Forwarded to the subtree; get_weight() calls this only when the
            // weight object needs the document length.
     112    return subtree->get_doclength();
     113}
     114
     115PositionList *
     116SynonymPostList::read_position_list() {
            // Forwarded to the subtree unchanged.
     117    return subtree->read_position_list();
     118}
     119
     120PositionList *
     121SynonymPostList::open_position_list() const {
            // Forwarded to the subtree unchanged.
     122    return subtree->open_position_list();
     123}
     124
     125bool
     126SynonymPostList::at_end() const {
            // This postlist is at its end exactly when the subtree is.
     127    return subtree->at_end();
     128}
     129
     130std::string
     131SynonymPostList::get_description() const
     132{
            // The subtree's own description, wrapped as "(Synonym ...)".
     133    return "(Synonym " + subtree->get_description() + ")";
     134}
  • xapian-core/weight/weight.cc

    Property changes on: xapian-core/matcher/synonympostlist.cc
    ___________________________________________________________________
    Added: svn:eol-style
       + native
    
     
    7373    init(factor);
    7474}
    7575
     76void
     77Weight::init_(const Internal & stats, Xapian::termcount query_length,
     78              double factor, Xapian::doccount termfreq)
     79{
     80    // Synonym case: the caller supplies the term frequency to use.
           // The collection-wide statistics are copied from stats.
     81    collection_size_ = stats.collection_size;
     82    rset_size_ = stats.rset_size;
     83    average_length_ = stats.get_average_length();
     84    doclength_upper_bound_ = stats.db.get_doclength_upper_bound();
     85    doclength_lower_bound_ = stats.db.get_doclength_lower_bound();
     86    // For a synonym, the doclength is an upper bound on the wdf.
     87    // FIXME: foo OP_SYNONYM foo could exceed this, but we probably need to
     88    // handle repeated terms better somehow.
     89    wdf_upper_bound_ = stats.db.get_doclength_upper_bound();
     90    termfreq_ = termfreq;
           // No term frequency within the relevance set is calculated for a
           // synonym (see the FIXME in LocalSubMatch::make_synonym_postlist()).
     91    reltermfreq_ = 0;
     92    query_length_ = query_length;
           // The synonym is given a within-query frequency of 1.
     93    wqf_ = 1;
           // Hand over to init(factor) now that the statistics are in place.
     94    init(factor);
     95}
     96
    7697Weight::~Weight() { }
    7798
    7899}
  • xapian-core/tests/api_db.cc

     
    14571457    return true;
    14581458}
    14591459
     1460// Check a synonym search
         // For each set of subqueries, OP_SYNONYM must return the same documents as
         // OP_OR.  With a single subquery the weights must be identical; otherwise
         // most of the weights must differ.
     1461DEFINE_TESTCASE(synonym1, backend) {
     1462    Xapian::Database db(get_database("etext"));
             // Maximum number of results requested from each search.
     1463    Xapian::doccount lots = 214;
     1464    vector<vector<Xapian::Query> > subqueries_list;
     1465
             // Case 1: a single term.
     1466    vector<Xapian::Query> subqueries;
     1467    subqueries.push_back(Xapian::Query("date"));
     1468    subqueries_list.push_back(subqueries);
     1469
             // Case 2: two terms.
     1470    subqueries.clear();
     1471    subqueries.push_back(Xapian::Query("sky"));
     1472    subqueries.push_back(Xapian::Query("date"));
     1473    subqueries_list.push_back(subqueries);
     1474
             // Case 3: a term and an OP_OR subquery.
     1475    subqueries.clear();
     1476    subqueries.push_back(Xapian::Query("date"));
     1477    subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
     1478                                       Xapian::Query("sky"),
     1479                                       Xapian::Query("glove")));
     1480    subqueries_list.push_back(subqueries);
     1481
             // Case 4: four terms.
     1482    subqueries.clear();
     1483    subqueries.push_back(Xapian::Query("sky"));
     1484    subqueries.push_back(Xapian::Query("date"));
     1485    subqueries.push_back(Xapian::Query("stein"));
     1486    subqueries.push_back(Xapian::Query("ally"));
     1487    subqueries_list.push_back(subqueries);
     1488
             // Case 5: a term and an OP_PHRASE subquery.
     1489    subqueries.clear();
     1490    subqueries.push_back(Xapian::Query("sky"));
     1491    subqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
     1492                                       Xapian::Query("date"),
     1493                                       Xapian::Query("stein")));
     1494    subqueries_list.push_back(subqueries);
     1495
     1496    for (vector<vector<Xapian::Query> >::const_iterator
     1497         qlist = subqueries_list.begin();
     1498         qlist != subqueries_list.end(); ++qlist)
     1499    {
     1500        // Run two queries, one joining the subqueries with OR and one joining them
     1501        // with SYNONYM.
     1502        Xapian::Enquire enquire(db);
     1503        enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, qlist->begin(), qlist->end()));
     1504        Xapian::MSet ormset = enquire.get_mset(0, lots);
     1505        Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist->begin(), qlist->end());
     1506        tout << synquery << "\n";
     1507        enquire.set_query(synquery);
     1508        Xapian::MSet mset = enquire.get_mset(0, lots);
     1509
     1510        // Check that the queries return some results.
     1511        TEST_NOT_EQUAL(mset.size(), 0);
     1512        // Check that the queries return the same number of results.
     1513        TEST_EQUAL(mset.size(), ormset.size());
                 // Collect the weight of each document from both result sets.  The
                 // sizes were checked to be equal just above, so i is a valid index
                 // into ormset as well as mset.
     1514        map<Xapian::docid, Xapian::weight> values_or;
     1515        map<Xapian::docid, Xapian::weight> values_synonym;
     1516        for (Xapian::doccount i = 0; i < mset.size(); ++i) {
     1517            values_or[*ormset[i]] = ormset[i].get_weight();
     1518            values_synonym[*mset[i]] = mset[i].get_weight();
     1519        }
     1520        TEST_EQUAL(values_or.size(), values_synonym.size());
     1521
     1522        /* Check that most of the weights for items in the "or" mset are
     1523         * different from those in the "synonym" mset. */
     1524        int same_weight = 0;
     1525        int different_weight = 0;
     1526        for (map<Xapian::docid, Xapian::weight>::const_iterator
     1527             j = values_or.begin();
     1528             j != values_or.end(); ++j)
     1529        {
     1530            Xapian::docid did = j->first;
     1531            // Check that all the results in the or tree make it to the synonym tree.
     1532            TEST(values_synonym.find(did) != values_synonym.end());
     1533            if (values_or[did] == values_synonym[did]) {
     1534                same_weight += 1;
     1535            } else {
     1536                different_weight += 1;
     1537            }
     1538        }
     1539        if (qlist->size() == 1) {
     1540            // Had a single term - check that all the weights were the same.
     1541            TEST_EQUAL(different_weight, 0);
     1542            TEST_NOT_EQUAL(same_weight, 0);
     1543        } else {
     1544            // Check that most of the weights differ.
     1545            TEST_NOT_EQUAL(different_weight, 0);
     1546            TEST_REL(same_weight, <, different_weight);
     1547        }
     1548    }
     1549    return true;
     1550}
     1551
     1552// Regression test - test a synonym search with a MultiAndPostlist.
     1553DEFINE_TESTCASE(synonym2, backend) {
             // Build (file AND the AND next AND reader) SYNONYM gutenberg.
     1554    Xapian::Query query;
     1555    vector<Xapian::Query> subqueries;
     1556    subqueries.push_back(Xapian::Query("file"));
     1557    subqueries.push_back(Xapian::Query("the"));
     1558    subqueries.push_back(Xapian::Query("next"));
     1559    subqueries.push_back(Xapian::Query("reader"));
     1560    query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
     1561    subqueries.clear();
     1562    subqueries.push_back(query);
     1563    subqueries.push_back(Xapian::Query("gutenberg"));
     1564    query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
     1565
     1566    tout << query.get_description() << endl;
     1567
             // Nothing is asserted about this first search; its results are only
             // written to the test output.
     1568    Xapian::Database db(get_database("etext"));
     1569    Xapian::Enquire enquire(db);
     1570    enquire.set_query(query);
     1571    Xapian::MSet mset = enquire.get_mset(0, 10);
     1572    tout << mset.get_description() << endl;
     1573
     1574    // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
             // Scaling the query by 10 must scale the maximum possible weight by 10.
     1575    double maxposs = mset.get_max_possible();
     1576    query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
     1577    enquire.set_query(query);
     1578    mset = enquire.get_mset(0, 10);
     1579    double maxposs2 = mset.get_max_possible();
     1580
     1581    TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
     1582
     1583    return true;
     1584}
     1585
    14601586// tests that specifying a nonexistent input file throws an exception.
    14611587DEFINE_TESTCASE(flintdatabaseopeningerror1, flint) {
    14621588    mkdir(".flint", 0755);
  • xapian-core/tests/queryparsertest.cc

     
    786786    Xapian::Query qobj = qp.parse_query("ab*", Xapian::QueryParser::FLAG_WILDCARD);
    787787    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(abc:(pos=1))");
    788788    qobj = qp.parse_query("muscle*", Xapian::QueryParser::FLAG_WILDCARD);
    789     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((muscle:(pos=1) OR musclebound:(pos=1)))");
     789    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((muscle:(pos=1) SYNONYM musclebound:(pos=1)))");
    790790    qobj = qp.parse_query("meat*", Xapian::QueryParser::FLAG_WILDCARD);
    791791    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query()");
    792792    qobj = qp.parse_query("musc*", Xapian::QueryParser::FLAG_WILDCARD);
    793     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((muscat:(pos=1) OR muscle:(pos=1) OR musclebound:(pos=1) OR muscular:(pos=1)))");
     793    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((muscat:(pos=1) SYNONYM muscle:(pos=1) SYNONYM musclebound:(pos=1) SYNONYM muscular:(pos=1)))");
    794794    qobj = qp.parse_query("mutt*", Xapian::QueryParser::FLAG_WILDCARD);
    795795    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(mutton:(pos=1))");
    796796    // Regression test (we weren't lowercasing terms before checking if they
     
    879879    qp.add_prefix("author", "A");
    880880    Xapian::Query qobj;
    881881    qobj = qp.parse_query("author:h*", Xapian::QueryParser::FLAG_WILDCARD);
    882     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((Aheinlein:(pos=1) OR Ahuxley:(pos=1)))");
     882    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((Aheinlein:(pos=1) SYNONYM Ahuxley:(pos=1)))");
    883883    qobj = qp.parse_query("author:h* test", Xapian::QueryParser::FLAG_WILDCARD);
    884     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((Aheinlein:(pos=1) OR Ahuxley:(pos=1) OR test:(pos=2)))");
     884    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((Aheinlein:(pos=1) SYNONYM Ahuxley:(pos=1)) OR test:(pos=2)))");
    885885    return true;
    886886}
    887887
     
    922922    qobj = qp.parse_query("ab", Xapian::QueryParser::FLAG_PARTIAL);
    923923    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((abc:(pos=1) OR Zab:(pos=1)))");
    924924    qobj = qp.parse_query("muscle", Xapian::QueryParser::FLAG_PARTIAL);
    925     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((muscle:(pos=1) OR musclebound:(pos=1) OR Zmuscl:(pos=1)))");
     925    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((muscle:(pos=1) SYNONYM musclebound:(pos=1)) OR Zmuscl:(pos=1)))");
    926926    qobj = qp.parse_query("meat", Xapian::QueryParser::FLAG_PARTIAL);
    927927    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(Zmeat:(pos=1))");
    928928    qobj = qp.parse_query("musc", Xapian::QueryParser::FLAG_PARTIAL);
    929     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((muscat:(pos=1) OR muscle:(pos=1) OR musclebound:(pos=1) OR muscular:(pos=1) OR Zmusc:(pos=1)))");
     929    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((muscat:(pos=1) SYNONYM muscle:(pos=1) SYNONYM musclebound:(pos=1) SYNONYM muscular:(pos=1)) OR Zmusc:(pos=1)))");
    930930    qobj = qp.parse_query("mutt", Xapian::QueryParser::FLAG_PARTIAL);
    931931    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((mutton:(pos=1) OR Zmutt:(pos=1)))");
    932932    qobj = qp.parse_query("abc musc", Xapian::QueryParser::FLAG_PARTIAL);
    933     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((Zabc:(pos=1) OR muscat:(pos=2) OR muscle:(pos=2) OR musclebound:(pos=2) OR muscular:(pos=2) OR Zmusc:(pos=2)))");
     933    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((Zabc:(pos=1) OR (muscat:(pos=2) SYNONYM muscle:(pos=2) SYNONYM musclebound:(pos=2) SYNONYM muscular:(pos=2)) OR Zmusc:(pos=2)))");
    934934    qobj = qp.parse_query("a* mutt", Xapian::QueryParser::FLAG_PARTIAL | Xapian::QueryParser::FLAG_WILDCARD);
    935935    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((abc:(pos=1) OR mutton:(pos=2) OR Zmutt:(pos=2)))");
    936936
    937937    // Check behaviour with stemmed terms, and stem strategy STEM_SOME.
    938938    qobj = qp.parse_query("o", Xapian::QueryParser::FLAG_PARTIAL);
    939     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((out:(pos=1) OR outside:(pos=1) OR Zo:(pos=1)))");
     939    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((out:(pos=1) SYNONYM outside:(pos=1)) OR Zo:(pos=1)))");
    940940    qobj = qp.parse_query("ou", Xapian::QueryParser::FLAG_PARTIAL);
    941     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((out:(pos=1) OR outside:(pos=1) OR Zou:(pos=1)))");
     941    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((out:(pos=1) SYNONYM outside:(pos=1)) OR Zou:(pos=1)))");
    942942    qobj = qp.parse_query("out", Xapian::QueryParser::FLAG_PARTIAL);
    943     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((out:(pos=1) OR outside:(pos=1) OR Zout:(pos=1)))");
     943    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((out:(pos=1) SYNONYM outside:(pos=1)) OR Zout:(pos=1)))");
    944944    qobj = qp.parse_query("outs", Xapian::QueryParser::FLAG_PARTIAL);
    945945    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((outside:(pos=1) OR Zout:(pos=1)))");
    946946    qobj = qp.parse_query("outsi", Xapian::QueryParser::FLAG_PARTIAL);
     
    952952
    953953    // Check behaviour with capitalised terms, and stem strategy STEM_SOME.
    954954    qobj = qp.parse_query("Out", Xapian::QueryParser::FLAG_PARTIAL);
    955     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((out:(pos=1,wqf=2) OR outside:(pos=1)))");
     955    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((out:(pos=1) SYNONYM outside:(pos=1)) OR out:(pos=1)))");
    956956    qobj = qp.parse_query("Outs", Xapian::QueryParser::FLAG_PARTIAL);
    957957    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((outside:(pos=1) OR outs:(pos=1)))");
    958958    qobj = qp.parse_query("Outside", Xapian::QueryParser::FLAG_PARTIAL);
     
    961961    // And now with stemming strategy STEM_ALL.
    962962    qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
    963963    qobj = qp.parse_query("Out", Xapian::QueryParser::FLAG_PARTIAL);
    964     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((out:(pos=1,wqf=2) OR outside:(pos=1)))");
     964    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((out:(pos=1) SYNONYM outside:(pos=1)) OR out:(pos=1)))");
    965965    qobj = qp.parse_query("Outs", Xapian::QueryParser::FLAG_PARTIAL);
    966966    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((outside:(pos=1) OR out:(pos=1)))");
    967967    qobj = qp.parse_query("Outside", Xapian::QueryParser::FLAG_PARTIAL);
     
    970970    // Check handling of a case with a prefix.
    971971    qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
    972972    qobj = qp.parse_query("title:cow", Xapian::QueryParser::FLAG_PARTIAL);
    973     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((XTcowl:(pos=1) OR XTcows:(pos=1) OR ZXTcow:(pos=1)))");
     973    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((XTcowl:(pos=1) SYNONYM XTcows:(pos=1)) OR ZXTcow:(pos=1)))");
    974974    qobj = qp.parse_query("title:cows", Xapian::QueryParser::FLAG_PARTIAL);
    975975    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((XTcows:(pos=1) OR ZXTcow:(pos=1)))");
    976976    qobj = qp.parse_query("title:Cow", Xapian::QueryParser::FLAG_PARTIAL);
    977     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((XTcowl:(pos=1) OR XTcows:(pos=1) OR XTcow:(pos=1)))");
     977    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((XTcowl:(pos=1) SYNONYM XTcows:(pos=1)) OR XTcow:(pos=1)))");
    978978    qobj = qp.parse_query("title:Cows", Xapian::QueryParser::FLAG_PARTIAL);
    979979    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(XTcows:(pos=1,wqf=2))");
    980980
     
    15471547}
    15481548
    15491549static test test_synonym_queries[] = {
    1550     { "searching", "(Zsearch:(pos=1) OR Zfind:(pos=1) OR Zlocate:(pos=1))" },
    1551     { "search", "(Zsearch:(pos=1) OR find:(pos=1))" },
    1552     { "Search", "(search:(pos=1) OR find:(pos=1))" },
     1550    { "searching", "(Zsearch:(pos=1) SYNONYM Zfind:(pos=1) SYNONYM Zlocate:(pos=1))" },
     1551    { "search", "(Zsearch:(pos=1) SYNONYM find:(pos=1))" },
     1552    { "Search", "(search:(pos=1) SYNONYM find:(pos=1))" },
    15531553    { "Searching", "searching:(pos=1)" },
    1554     { "searching OR terms", "(Zsearch:(pos=1) OR Zfind:(pos=1) OR Zlocate:(pos=1) OR Zterm:(pos=2))" },
    1555     { "search OR terms", "(Zsearch:(pos=1) OR find:(pos=1) OR Zterm:(pos=2))" },
    1556     { "search +terms", "(Zterm:(pos=2) AND_MAYBE (Zsearch:(pos=1) OR find:(pos=1)))" },
    1557     { "search -terms", "((Zsearch:(pos=1) OR find:(pos=1)) AND_NOT Zterm:(pos=2))" },
    1558     { "+search terms", "((Zsearch:(pos=1) OR find:(pos=1)) AND_MAYBE Zterm:(pos=2))" },
    1559     { "-search terms", "(Zterm:(pos=2) AND_NOT (Zsearch:(pos=1) OR find:(pos=1)))" },
    1560     { "search terms", "(Zsearch:(pos=1) OR find:(pos=1) OR Zterm:(pos=2))" },
     1554    { "searching OR terms", "((Zsearch:(pos=1) SYNONYM Zfind:(pos=1) SYNONYM Zlocate:(pos=1)) OR Zterm:(pos=2))" },
     1555    { "search OR terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) OR Zterm:(pos=2))" },
     1556    { "search +terms", "(Zterm:(pos=2) AND_MAYBE (Zsearch:(pos=1) SYNONYM find:(pos=1)))" },
     1557    { "search -terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) AND_NOT Zterm:(pos=2))" },
     1558    { "+search terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) AND_MAYBE Zterm:(pos=2))" },
     1559    { "-search terms", "(Zterm:(pos=2) AND_NOT (Zsearch:(pos=1) SYNONYM find:(pos=1)))" },
     1560    { "search terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) OR Zterm:(pos=2))" },
    15611561    // Shouldn't trigger synonyms:
    15621562    { "\"search terms\"", "(search:(pos=1) PHRASE 2 terms:(pos=2))" },
    15631563    { NULL, NULL }
     
    15971597
    15981598static test test_multi_synonym_queries[] = {
    15991599    { "sun OR tan OR cream", "(Zsun:(pos=1) OR Ztan:(pos=2) OR Zcream:(pos=3))" },
    1600     { "sun tan", "(Zsun:(pos=1) OR Ztan:(pos=2) OR bathe:(pos=1))" },
    1601     { "sun tan cream", "(Zsun:(pos=1) OR Ztan:(pos=2) OR Zcream:(pos=3) OR lotion:(pos=1))" },
    1602     { "beach sun tan holiday", "(Zbeach:(pos=1) OR Zsun:(pos=2) OR Ztan:(pos=3) OR bathe:(pos=2) OR Zholiday:(pos=4))" },
    1603     { "sun tan sun tan cream", "(Zsun:(pos=1) OR Ztan:(pos=2) OR bathe:(pos=1) OR Zsun:(pos=3) OR Ztan:(pos=4) OR Zcream:(pos=5) OR lotion:(pos=3))" },
    1604     { "single", "(Zsingl:(pos=1) OR record:(pos=1))" },
     1600    { "sun tan", "((Zsun:(pos=1) OR Ztan:(pos=2)) SYNONYM bathe:(pos=1))" },
     1601    { "sun tan cream", "((Zsun:(pos=1) OR Ztan:(pos=2) OR Zcream:(pos=3)) SYNONYM lotion:(pos=1))" },
     1602    { "beach sun tan holiday", "(Zbeach:(pos=1) OR ((Zsun:(pos=2) OR Ztan:(pos=3)) SYNONYM bathe:(pos=2)) OR Zholiday:(pos=4))" },
     1603    { "sun tan sun tan cream", "(((Zsun:(pos=1) OR Ztan:(pos=2)) SYNONYM bathe:(pos=1)) OR ((Zsun:(pos=3) OR Ztan:(pos=4) OR Zcream:(pos=5)) SYNONYM lotion:(pos=3)))" },
     1604    { "single", "(Zsingl:(pos=1) SYNONYM record:(pos=1))" },
    16051605    { NULL, NULL }
    16061606};
    16071607
     
    16401640
    16411641static test test_synonym_op_queries[] = {
    16421642    { "searching", "Zsearch:(pos=1)" },
    1643     { "~searching", "(Zsearch:(pos=1) OR Zfind:(pos=1) OR Zlocate:(pos=1))" },
    1644     { "~search", "(Zsearch:(pos=1) OR find:(pos=1))" },
    1645     { "~Search", "(search:(pos=1) OR find:(pos=1))" },
     1643    { "~searching", "(Zsearch:(pos=1) SYNONYM Zfind:(pos=1) SYNONYM Zlocate:(pos=1))" },
     1644    { "~search", "(Zsearch:(pos=1) SYNONYM find:(pos=1))" },
     1645    { "~Search", "(search:(pos=1) SYNONYM find:(pos=1))" },
    16461646    { "~Searching", "searching:(pos=1)" },
    1647     { "~searching OR terms", "(Zsearch:(pos=1) OR Zfind:(pos=1) OR Zlocate:(pos=1) OR Zterm:(pos=2))" },
    1648     { "~search OR terms", "(Zsearch:(pos=1) OR find:(pos=1) OR Zterm:(pos=2))" },
    1649     { "~search +terms", "(Zterm:(pos=2) AND_MAYBE (Zsearch:(pos=1) OR find:(pos=1)))" },
    1650     { "~search -terms", "((Zsearch:(pos=1) OR find:(pos=1)) AND_NOT Zterm:(pos=2))" },
    1651     { "+~search terms", "((Zsearch:(pos=1) OR find:(pos=1)) AND_MAYBE Zterm:(pos=2))" },
    1652     { "-~search terms", "(Zterm:(pos=2) AND_NOT (Zsearch:(pos=1) OR find:(pos=1)))" },
    1653     { "~search terms", "(Zsearch:(pos=1) OR find:(pos=1) OR Zterm:(pos=2))" },
     1647    { "~searching OR terms", "((Zsearch:(pos=1) SYNONYM Zfind:(pos=1) SYNONYM Zlocate:(pos=1)) OR Zterm:(pos=2))" },
     1648    { "~search OR terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) OR Zterm:(pos=2))" },
     1649    { "~search +terms", "(Zterm:(pos=2) AND_MAYBE (Zsearch:(pos=1) SYNONYM find:(pos=1)))" },
     1650    { "~search -terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) AND_NOT Zterm:(pos=2))" },
     1651    { "+~search terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) AND_MAYBE Zterm:(pos=2))" },
     1652    { "-~search terms", "(Zterm:(pos=2) AND_NOT (Zsearch:(pos=1) SYNONYM find:(pos=1)))" },
     1653    { "~search terms", "((Zsearch:(pos=1) SYNONYM find:(pos=1)) OR Zterm:(pos=2))" },
    16541654    // FIXME: should look for multi-term synonym...
    16551655    { "~\"search terms\"", "(search:(pos=1) PHRASE 2 terms:(pos=2))" },
    16561656    { NULL, NULL }
  • xapian-core/include/xapian/query.h

     
    119119            OP_VALUE_GE,
    120120
    121121            /** Filter by a less-than-or-equal test on a document value. */
    122             OP_VALUE_LE
     122            OP_VALUE_LE,
     123
     124            /** Treat a set of queries as synonyms.
     125             *
     126             *  This returns all results which match at least one of the
     127             *  queries, but weighted as if all the subqueries are instances
     128             *  of the same term: so multiple matching terms for a document
     129             *  increase the wdf value used, and the term frequency is based on
     130             *  the number of documents which would match an OR of all the
     131             *  subqueries.
     132             *
     133             *  The term frequency used will usually be an approximation,
     134             *  because calculating the precise combined term frequency would
     135             *  be overly expensive.
     136             *
     137             *  Identical to OP_OR, except for the weightings returned.
     138             */
     139            OP_SYNONYM
    123140        } op;
    124141
    125142        /** Copy constructor. */
  • xapian-core/include/xapian/weight.h

     
    212212               const std::string & term, Xapian::termcount wqf_,
    213213               double factor);
    214214
     215    /** @private @internal Initialise this object to calculate weights for a
     216     *  synonym.
     217     *
     218     *  @param stats       Source of statistics.
     219     *  @param query_len_  Query length.
     220     *  @param factor      Any scaling factor (e.g. from OP_SCALE_WEIGHT).
     221     *  @param termfreq    The termfreq to use.
     222     */
     223    void init_(const Internal & stats, Xapian::termcount query_len_,
     224               double factor, Xapian::doccount termfreq);
     225
    215226    /** @private @internal Initialise this object to calculate the extra weight
    216227     *  component.
    217228     *
  • xapian-core/api/omqueryinternal.cc

     
    6565        case Xapian::Query::OP_VALUE_RANGE:
    6666        case Xapian::Query::OP_VALUE_GE:
    6767        case Xapian::Query::OP_VALUE_LE:
     68        case Xapian::Query::OP_SYNONYM:
    6869            return 0;
    6970        case Xapian::Query::OP_SCALE_WEIGHT:
    7071            return 1;
     
    100101        case Xapian::Query::OP_NEAR:
    101102        case Xapian::Query::OP_PHRASE:
    102103        case Xapian::Query::OP_ELITE_SET:
     104        case Xapian::Query::OP_SYNONYM:
    103105            return UINT_MAX;
    104106        default:
    105107            Assert(false);
     
    221223                result += ".";
    222224                result += str_parameter; // serialise_double(get_dbl_parameter());
    223225                break;
     226            case Xapian::Query::OP_SYNONYM:
     227                result += "=";
     228                break;
    224229        }
    225230    }
    226231    return result;
     
    251256        case Xapian::Query::OP_VALUE_GE:        name = "VALUE_GE"; break;
    252257        case Xapian::Query::OP_VALUE_LE:        name = "VALUE_LE"; break;
    253258        case Xapian::Query::OP_SCALE_WEIGHT:    name = "SCALE_WEIGHT"; break;
     259        case Xapian::Query::OP_SYNONYM:         name = "SYNONYM"; break;
    254260    }
    255261    return name;
    256262}
     
    584590                    return qint_from_vector(Xapian::Query::OP_SCALE_WEIGHT,
    585591                                            subqs, 0, param);
    586592                }
    587                 default:
     593                case '=': {
     594                    return qint_from_vector(Xapian::Query::OP_SYNONYM, subqs);
     595                }
     596                default:
    588597                    LOGLINE(UNKNOWN, "Can't parse remainder `" << p - 1 << "'");
    589598                    throw Xapian::InvalidArgumentError("Invalid query string");
    590599            }
     
    809818        case OP_ELITE_SET:
    810819        case OP_OR:
    811820        case OP_XOR:
     821        case OP_SYNONYM:
    812822            // Doing an "OR" type operation - if we've got any MatchNothing
    813823            // subnodes, drop them; except that we mustn't become an empty
    814824            // node due to this, so we never drop a MatchNothing subnode
     
    900910                }
    901911            }
    902912            break;
    903         case OP_OR: case OP_AND: case OP_XOR:
     913        case OP_OR: case OP_AND: case OP_XOR: case OP_SYNONYM:
    904914            // Remove duplicates if we can.
    905915            if (subqs.size() > 1) collapse_subqs();
    906916            break;
     
    944954void
    945955Xapian::Query::Internal::collapse_subqs()
    946956{
    947     Assert(op == OP_OR || op == OP_AND || op == OP_XOR);
     957    Assert(op == OP_OR || op == OP_AND || op == OP_XOR || op == OP_SYNONYM);
    948958    typedef set<Xapian::Query::Internal *, SortPosName> subqtable;
    949959    subqtable sqtab;
    950960
     
    10381048    Assert(!is_leaf(op));
    10391049    if (subq == 0) {
    10401050        subqs.push_back(0);
    1041     } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR)) {
     1051    } else if (op == subq->op && (op == OP_AND || op == OP_OR || op == OP_XOR || op == OP_SYNONYM)) {
    10421052        // Distribute the subquery.
    10431053        for (subquery_list::const_iterator i = subq->subqs.begin();
    10441054             i != subq->subqs.end(); i++) {