Ticket #153: xapian-allprefixedterms-wildcard.patch

File xapian-allprefixedterms-wildcard.patch, 24.3 KB (added by Richard Boulton, 17 years ago)

Fix, by adding a prefix parameter to allterms

  • xapian-core/queryparser/queryparser.lemony

     
    295295Term::as_wildcarded_query(State * state_) const
    296296{
    297297    Database db = state_->get_database();
    298     TermIterator t = db.allterms_begin();
    299298    string root = prefix;
    300299    root += name;
    301     t.skip_to(root);
     300    TermIterator t = db.allterms_begin(root);
    302301    QpQuery q;
    303     while (t != db.allterms_end() && (*t).substr(0, root.size()) == root) {
     302    while (t != db.allterms_end(root)) {
    304303        add_to_query(q, Query::OP_OR, QpQuery(*t, 1, pos));
    305304        ++t;
    306305    }
     
    316315Term::as_partial_query(State * state_) const
    317316{
    318317    Database db = state_->get_database();
    319     TermIterator t = db.allterms_begin();
    320318    string root = prefix;
    321319    root += name;
    322     t.skip_to(root);
     320    TermIterator t = db.allterms_begin(root);
    323321    QpQuery q;
    324     while (t != db.allterms_end() && (*t).substr(0, root.size()) == root) {
     322    while (t != db.allterms_end(root)) {
    325323        add_to_query(q, Query::OP_OR, QpQuery(*t, 1, pos));
    326324        ++t;
    327325    }
  • xapian-core/tests/api_db.cc

     
    707707    return true;
    708708}
    709709
     710// test allterms iterators with prefixes
     711static bool test_allterms6()
     712{
     713    Xapian::Database db;
     714    db.add_database(get_database("apitest_allterms"));
     715    db.add_database(get_database("apitest_allterms2"));
     716
     717    Xapian::TermIterator ati = db.allterms_begin("three");
     718    TEST(ati != db.allterms_end("three"));
     719    TEST_EQUAL(*ati, "three");
     720    ati.skip_to("three");
     721    TEST(ati != db.allterms_end("three"));
     722    TEST_EQUAL(*ati, "three");
     723    ati++;
     724    TEST(ati == db.allterms_end("three"));
     725
     726    ati = db.allterms_begin("thre");
     727    TEST(ati != db.allterms_end("thre"));
     728    TEST_EQUAL(*ati, "three");
     729    ati.skip_to("three");
     730    TEST(ati != db.allterms_end("thre"));
     731    TEST_EQUAL(*ati, "three");
     732    ati++;
     733    TEST(ati == db.allterms_end("thre"));
     734
     735    ati = db.allterms_begin("f");
     736    TEST(ati != db.allterms_end("f"));
     737    TEST_EQUAL(*ati, "five");
     738    TEST(ati != db.allterms_end("f"));
     739    ati.skip_to("three");
     740    TEST(ati == db.allterms_end("f"));
     741
     742    ati = db.allterms_begin("f");
     743    TEST(ati != db.allterms_end("f"));
     744    TEST_EQUAL(*ati, "five");
     745    ati++;
     746    TEST(ati != db.allterms_end("f"));
     747    TEST_EQUAL(*ati, "four");
     748    ati++;
     749    TEST(ati == db.allterms_end("f"));
     750
     751    return true;
     752}
     753
    710754// test that searching for a term with a special characters in it works
    711755static bool test_specialterms1()
    712756{
     
    16321676    {"allterms3",          test_allterms3},
    16331677    {"allterms4",          test_allterms4},
    16341678    {"allterms5",          test_allterms5},
     1679    {"allterms6",          test_allterms6},
    16351680    {"specialterms2",      test_specialterms2},
    16361681    {0, 0}
    16371682};
  • xapian-core/include/xapian/database.h

     
    163163            return TermIterator(NULL);
    164164        }
    165165
     166        /** An iterator which runs across all terms with a given prefix.
     167         *
     168         *  This is functionally similar to getting an iterator with
     169         *  allterms_begin() and then calling skip_to(prefix) on that iterator
     170         *  to move to the start of the prefix, but is more convenient (because
     171         *  it detects the end of the prefixed terms), and may be more
     172         *  efficient than simply calling skip_to() after opening the iterator,
     173         *  particularly for network databases.
     174         *
     175         *  @param prefix The prefix to restrict the returned terms to.
     176         */
     177        TermIterator allterms_begin(const std::string & prefix) const;
     178
     179        /** Corresponding end iterator to allterms_begin(prefix).
     180         */
     181        TermIterator allterms_end(const std::string &) const {
     182            return TermIterator(NULL);
     183        }
     184
    166185        /// Get the number of documents in the database.
    167186        Xapian::doccount get_doccount() const;
    168187
  • xapian-core/net/remoteserver.cc

     
    237237}
    238238
    239239void
    240 RemoteServer::msg_allterms(const string &)
     240RemoteServer::msg_allterms(const string &message)
    241241{
    242     const Xapian::TermIterator end = db->allterms_end();
    243     for (Xapian::TermIterator t = db->allterms_begin(); t != end; ++t) {
     242    const char *p = message.data();
     243    const char *p_end = p + message.size();
     244    string prefix(p, p_end - p);
     245
     246    const Xapian::TermIterator end = db->allterms_end(prefix);
     247    for (Xapian::TermIterator t = db->allterms_begin(prefix); t != end; ++t) {
    244248        string item = encode_length(t.get_termfreq());
    245249        item += *t;
    246250        send_message(REPLY_ALLTERMS, item);
  • xapian-core/common/remote-database.h

     
    150150    TermList * open_term_list(Xapian::docid did) const;
    151151
    152152    /// Iterate all terms.
    153     TermList * open_allterms() const;
     153    TermList * open_allterms(const string & prefix) const;
    154154
    155155    bool has_positions() const;
    156156
  • xapian-core/common/remoteprotocol.h

     
    3232// 27: Support for postlists (always passes the whole list across)
    3333// 28: Pass document length in reply to MSG_TERMLIST
    3434// 29: Serialisation of Xapian::Error includes error_string.
    35 #define XAPIAN_REMOTE_PROTOCOL_VERSION 29
     35// 30: Pass the prefix parameter for MSG_ALLTERMS, and use it.
     36#define XAPIAN_REMOTE_PROTOCOL_VERSION 30
    3637
    3738/// Message types (client -> server).
    3839enum message_type {
  • xapian-core/common/database.h

     
    197197         *
    198198         *  This is a list of all the terms in the database
    199199         *
     200         *  @param prefix The prefix to restrict the terms to.
    200201         *  @return       A pointer to the newly created allterms list.
    201202         *                This object must be deleted by the caller after
    202203         *                use.
    203204         */
    204         virtual TermList * open_allterms() const = 0;
     205        virtual TermList * open_allterms(const string & prefix) const = 0;
    205206
    206207        /** Open a position list for the given term in the given document.
    207208         *
  • xapian-core/api/omdatabase.cc

     
    151151TermIterator
    152152Database::allterms_begin() const
    153153{
     154    return allterms_begin("");
     155}
     156
     157TermIterator
     158Database::allterms_begin(const std::string & prefix) const
     159{
    154160    DEBUGAPICALL(TermIterator, "Database::allterms_begin", "");
    155161    if (internal.empty()) RETURN(TermIterator(NULL));
    156162
    157163    if (internal.size() == 1)
    158         RETURN(TermIterator(internal[0]->open_allterms()));
     164        RETURN(TermIterator(internal[0]->open_allterms(prefix)));
    159165 
    160166    vector<TermList *> lists;
    161167
    162168    vector<Xapian::Internal::RefCntPtr<Database::Internal> >::const_iterator i;
    163169    for (i = internal.begin(); i != internal.end(); ++i) {
    164         lists.push_back((*i)->open_allterms());
     170        lists.push_back((*i)->open_allterms(prefix));
    165171    }
    166172
    167173    RETURN(TermIterator(new MultiAllTermsList(lists)));
  • xapian-core/backends/inmemory/inmemory_alltermslist.cc

     
    2525#include "inmemory_alltermslist.h"
    2626
    2727InMemoryAllTermsList::InMemoryAllTermsList(const std::map<string, InMemoryTerm> *tmap_,
    28                                            Xapian::Internal::RefCntPtr<const InMemoryDatabase> database_)
    29         : tmap(tmap_), it(tmap->begin()), database(database_), started(false)
     28                                           Xapian::Internal::RefCntPtr<const InMemoryDatabase> database_,
     29                                           const string & prefix_)
     30        : tmap(tmap_), it(tmap->begin()), database(database_), started(false), prefix(prefix_)
    3031{
    31     while (it != tmap->end() && it->second.term_freq == 0) ++it;
     32    while (it != tmap->end() &&
     33           (it->second.term_freq == 0 ||
     34            it->first.substr(0, prefix.size()) != prefix))
     35        ++it;
    3236}
    3337
    3438InMemoryAllTermsList::~InMemoryAllTermsList()
     
    7377    // FIXME: might skip backwards - is this a problem?
    7478    it = tmap->lower_bound(tname);
    7579    while (it != tmap->end() && it->second.term_freq == 0) ++it;
     80    if (it != tmap->end() && it->first.substr(0, prefix.size()) != prefix)
     81        it = tmap->end();
    7682    return NULL;
    7783}
    7884
     
    8692        ++it;
    8793        while (it != tmap->end() && it->second.term_freq == 0) ++it;
    8894    }
     95    if (it != tmap->end() && it->first.substr(0, prefix.size()) != prefix)
     96        it = tmap->end();
    8997    return NULL;
    9098}
    9199
  • xapian-core/backends/inmemory/inmemory_alltermslist.h

     
    4444        Xapian::Internal::RefCntPtr<const InMemoryDatabase> database;
    4545
    4646        bool started;
     47
     48        string prefix;
    4749    public:
    4850        /// Standard constructor for base class.
    4951        InMemoryAllTermsList(const std::map<string, InMemoryTerm> *tmap_,
    50                              Xapian::Internal::RefCntPtr<const InMemoryDatabase> database_);
     52                             Xapian::Internal::RefCntPtr<const InMemoryDatabase> database_,
     53                             const string & prefix);
    5154
    5255        /// Standard destructor for base class.
    5356        ~InMemoryAllTermsList();
  • xapian-core/backends/inmemory/inmemory_database.cc

     
    707707}
    708708
    709709TermList *
    710 InMemoryDatabase::open_allterms() const
     710InMemoryDatabase::open_allterms(const string & prefix) const
    711711{
    712712    return new InMemoryAllTermsList(&postlists,
    713                                     Xapian::Internal::RefCntPtr<const InMemoryDatabase>(this));
     713                                    Xapian::Internal::RefCntPtr<const InMemoryDatabase>(this),
     714                                    prefix);
    714715}
  • xapian-core/backends/inmemory/inmemory_database.h

     
    306306        Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy = false) const;
    307307        PositionList * open_position_list(Xapian::docid did,
    308308                                          const string & tname) const;
    309         TermList * open_allterms() const;
     309        TermList * open_allterms(const string & prefix) const;
    310310};
    311311
    312312#endif /* OM_HGUARD_INMEMORY_DATABASE_H */
  • xapian-core/backends/quartz/quartz_database.h

     
    220220        Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy = false) const;
    221221        PositionList * open_position_list(Xapian::docid did,
    222222                                          const string & tname) const;
    223         TermList * open_allterms() const;
     223        TermList * open_allterms(const string & prefix) const;
    224224        //@}
    225225};
    226226
     
    317317        Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy = false) const;
    318318        PositionList * open_position_list(Xapian::docid did,
    319319                                          const string & tname) const;
    320         TermList * open_allterms() const;
     320        TermList * open_allterms(const string & prefix) const;
    321321        //@}
    322322};
    323323
  • xapian-core/backends/quartz/quartz_alltermslist.cc

     
    2929
    3030QuartzAllTermsList::QuartzAllTermsList(Xapian::Internal::RefCntPtr<const Xapian::Database::Internal> database_,
    3131                                       AutoPtr<Bcursor> pl_cursor_,
    32                                        quartz_tablesize_t size_)
     32                                       quartz_tablesize_t size_,
     33                                       const string & prefix_)
    3334        : database(database_), pl_cursor(pl_cursor_), size(size_),
    34           started(false)
     35          started(false), prefix(prefix_)
    3536{
    3637    DEBUGCALL(DB, void, "QuartzAllTermsList", "[database_], [pl_cursor_]");
    37     /* Seek to the first term */
    38     pl_cursor->find_entry(string());
     38    if (prefix.empty()) {
     39        /* Seek to the first term */
     40        pl_cursor->find_entry(string());
    3941
    40     if (pl_cursor->current_key.empty()) {
    41         pl_cursor->next();
     42        if (pl_cursor->current_key.empty()) {
     43            pl_cursor->next();
     44        }
     45    } else {
     46        // Seek to the first key before one with the desired prefix.
     47        if (!pl_cursor->find_entry(pack_string_preserving_sort(prefix))) {
     48            // Found a key which is before the prefix - advance to the
     49            // first one following the prefix.
     50            pl_cursor->next();
     51        }
    4252    }
    4353
    4454    is_at_end = pl_cursor->after_end();
     
    5060        }
    5161    }
    5262
     63    if (current_term.substr(0, prefix.size()) != prefix)
     64        is_at_end = true;
     65
    5366    have_stats = false;
    5467}
    5568
     
    128141            is_at_end = true;
    129142        } else {
    130143            next();
     144            // next() checks the prefix, so we don't need to do that here.
    131145        }
    132146    } else {
    133147        // This assertion isn't true if key contains zero bytes.
    134148        // Assert(key == pl_cursor->current_key);
    135149        current_term = tname;
     150
     151        // Check that we haven't gone past the prefix.
     152        if (current_term.substr(0, prefix.size()) != prefix)
     153            is_at_end = true;
    136154    }
    137155    RETURN(NULL);
    138156}
     
    156174            if (!unpack_string_preserving_sort(&start, end, current_term)) {
    157175                throw Xapian::DatabaseCorruptError("Failed to read the key field from a Bcursor's key");
    158176            }
     177            if (current_term.substr(0, prefix.size()) != prefix) {
     178                is_at_end = true;
     179                break;
     180            }
    159181            // Check if this is the first chunk of a postlist, skip otherwise
    160182            if (start == end) break;
    161183        }
  • xapian-core/backends/quartz/quartz_alltermslist.h

     
    5555        /// Cached termname
    5656        string current_term;
    5757
     58        /// The prefix to restrict the terms to.
     59        string prefix;
     60
    5861        /// Cached statistics
    5962        mutable bool have_stats;
    6063        mutable Xapian::termcount termfreq;
     
    6568        /// Standard constructor for base class.
    6669        QuartzAllTermsList(Xapian::Internal::RefCntPtr<const Xapian::Database::Internal> database_,
    6770                           AutoPtr<Bcursor> pl_cursor_,
    68                            quartz_tablesize_t size_);
     71                           quartz_tablesize_t size_,
     72                           const string & prefix_);
    6973
    7074        /// Standard destructor for base class.
    7175        ~QuartzAllTermsList();
  • xapian-core/backends/quartz/quartz_database.cc

     
    663663}
    664664
    665665TermList *
    666 QuartzDatabase::open_allterms() const
     666QuartzDatabase::open_allterms(const string & prefix) const
    667667{
    668668    DEBUGCALL(DB, TermList *, "QuartzDatabase::open_allterms", "");
    669669    AutoPtr<Bcursor> pl_cursor(postlist_table.cursor_get());
    670670    RETURN(new QuartzAllTermsList(Xapian::Internal::RefCntPtr<const QuartzDatabase>(this),
    671                                   pl_cursor, postlist_table.get_entry_count()));
     671                                  pl_cursor, postlist_table.get_entry_count(), prefix));
    672672}
    673673
    674674size_t QuartzWritableDatabase::flush_threshold = 0;
     
    11951195}
    11961196
    11971197TermList *
    1198 QuartzWritableDatabase::open_allterms() const
     1198QuartzWritableDatabase::open_allterms(const string & prefix) const
    11991199{
    12001200    DEBUGCALL(DB, TermList *, "QuartzWritableDatabase::open_allterms", "");
    12011201    if (transaction_active())
     
    12051205    QuartzPostListTable *t = &database_ro.postlist_table;
    12061206    AutoPtr<Bcursor> pl_cursor(t->cursor_get());
    12071207    RETURN(new QuartzAllTermsList(Xapian::Internal::RefCntPtr<const QuartzWritableDatabase>(this),
    1208                                   pl_cursor, t->get_entry_count()));
     1208                                  pl_cursor, t->get_entry_count(), prefix));
    12091209}
    12101210
    12111211void
  • xapian-core/backends/remote/remote-database.cc

     
    152152}
    153153
    154154TermList *
    155 RemoteDatabase::open_allterms() const {
     155RemoteDatabase::open_allterms(const string & prefix) const {
    156156    // Ensure that avlength and doccount are up-to-date.
    157157    if (!cached_stats_valid) update_stats();
    158158
    159     send_message(MSG_ALLTERMS, "");
     159    send_message(MSG_ALLTERMS, prefix);
    160160
    161161    AutoPtr<NetworkTermList> tlist;
    162162    tlist = new NetworkTermList(0.0, doccount,
  • xapian-core/backends/flint/flint_database.h

     
    213213        Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy = false) const;
    214214        PositionList * open_position_list(Xapian::docid did,
    215215                                          const string & tname) const;
    216         TermList * open_allterms() const;
     216        TermList * open_allterms(const string & prefix) const;
    217217        //@}
    218218};
    219219
     
    310310        Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy = false) const;
    311311        PositionList * open_position_list(Xapian::docid did,
    312312                                          const string & tname) const;
    313         TermList * open_allterms() const;
     313        TermList * open_allterms(const string & prefix) const;
    314314        //@}
    315315};
    316316
  • xapian-core/backends/flint/flint_alltermslist.cc

     
    103103            throw Xapian::DatabaseCorruptError("PostList table key has unexpected format");
    104104        }
    105105
     106        if (current_term.substr(0, prefix.size()) != prefix) {
     107            // We've reached the end of the prefixed terms.
     108            cursor->to_end();
     109            current_term = "";
     110            break;
     111        }
     112
    106113        // If this key is for the first chunk of a postlist, we're done.  Otherwise we need
    107114        // to skip past continuation chunks until we find the first chunk of the next postlist.
    108115        if (p == pend) break;
     
    114121FlintAllTermsList::skip_to(const string &tname)
    115122{
    116123    DEBUGCALL(DB, TermList *, "FlintAllTermsList::skip_to", tname);
     124    if (at_end()) abort();
    117125    Assert(!at_end());
    118126    // Set termfreq to 0 to indicate no value has been read for the current term.
    119127    termfreq = 0;
  • xapian-core/backends/flint/flint_cursor.h

     
    199199         */
    200200        bool find_entry(const string &key);
    201201
     202        /** Set the cursor to be off the end of the table.
     203         */
     204        void to_end() { is_after_end = true; }
     205
    202206        /** Determine whether cursor is off the end of table.
    203207         *
    204208         *  @return true if the cursor has been moved off the end of the
  • xapian-core/backends/flint/flint_alltermslist.h

     
    4848    /// The termname at the current position.
    4949    string current_term;
    5050
     51    /// The prefix to restrict the terms to.
     52    string prefix;
     53
    5154    /** The term frequency of the term at the current position.
    5255     *
    5356     *  If this value is zero, then we haven't read the term frequency or
     
    6467
    6568  public:
    6669    FlintAllTermsList(Xapian::Internal::RefCntPtr<const Xapian::Database::Internal> database_,
    67                       const FlintPostListTable * pltab) : database(database_), termfreq(0) {
     70                      const FlintPostListTable * pltab,
     71                      const string & prefix_)
     72            : database(database_), prefix(prefix_), termfreq(0) {
    6873        // The number of entries in the postlist table will be the number of
    6974        // terms, probably plus some extra entries for chunked posting lists,
    7075        // plus 1 for the metainfo key (unless the table is completely empty).
     
    7782        approx_size = pltab->get_entry_count();
    7883        if (approx_size) --approx_size;
    7984
    80         // Seek to the metainfo key, so the first next will advance us to the
    81         // first real key.
    8285        cursor = pltab->cursor_get();
    83         cursor->find_entry(string("", 1));
     86        if (prefix.empty()) {
     87            // Seek to the metainfo key, so the first next() will advance us to the
     88            // first real key.
     89            cursor->find_entry(string("", 1));
     90        } else {
     91            // Seek to the first key before one with the desired prefix.
     92            if (cursor->find_entry(pack_string_preserving_sort(prefix))) {
     93                // Found a key which is exactly the prefix - move back, so that
     94                // next() moves to it.
     95                cursor->prev();
     96            }
     97        }
    8498    }
    8599
    86100    /// Destructor.
  • xapian-core/backends/flint/flint_database.cc

     
    516516}
    517517
    518518TermList *
    519 FlintDatabase::open_allterms() const
     519FlintDatabase::open_allterms(const string & prefix) const
    520520{
    521521    DEBUGCALL(DB, TermList *, "FlintDatabase::open_allterms", "");
    522522    RETURN(new FlintAllTermsList(Xapian::Internal::RefCntPtr<const FlintDatabase>(this),
    523                                  &postlist_table));
     523                                 &postlist_table, prefix));
    524524}
    525525
    526526size_t FlintWritableDatabase::flush_threshold = 0;
     
    10531053}
    10541054
    10551055TermList *
    1056 FlintWritableDatabase::open_allterms() const
     1056FlintWritableDatabase::open_allterms(const string & prefix) const
    10571057{
    10581058    DEBUGCALL(DB, TermList *, "FlintWritableDatabase::open_allterms", "");
    10591059    if (transaction_active())
     
    10621062    // need to flush.
    10631063    if (changes_made) do_flush_const();
    10641064    RETURN(new FlintAllTermsList(Xapian::Internal::RefCntPtr<const FlintWritableDatabase>(this),
    1065                                  &database_ro.postlist_table));
     1065                                 &database_ro.postlist_table, prefix));
    10661066}
    10671067
    10681068void
  • xapian-bindings/python/extra.i

     
    536536Query.__iter__ = _query_gen_iter
    537537
    538538# Modify Database to add an "__iter__()" method and an "allterms()" method.
    539 def _database_gen_allterms_iter(self):
     539def _database_gen_allterms_iter(self, prefix=None):
    540540    """Get an iterator over all the terms in the database.
    541541
    542542    The iterator will return TermListItem objects, but these will not support
     
    545545    Access to term frequency information is only available until the iterator
    546546    has moved on.
    547547
     548    If prefix is supplied, only terms which start with that prefix will be
     549    returned.
     550
    548551    """
    549     return TermIter(self.allterms_begin(), self.allterms_end(),
    550                     has_termfreq=TermIter.LAZY)
     552    if prefix is None:
     553        return TermIter(self.allterms_begin(), self.allterms_end(),
     554                        has_termfreq=TermIter.LAZY)
     555    else:
     556        return TermIter(self.allterms_begin(prefix), self.allterms_end(prefix),
     557                        has_termfreq=TermIter.LAZY)
    551558Database.__iter__ = _database_gen_allterms_iter
    552559Database.allterms = _database_gen_allterms_iter
    553560
  • xapian-bindings/xapian.i

     
    657657        PositionIterator positionlist_end(docid did, const std::string& tname) const;
    658658        TermIterator allterms_begin() const;
    659659        TermIterator allterms_end() const;
     660        TermIterator allterms_begin(const std::string &prefix) const;
     661        TermIterator allterms_end(const std::string &prefix) const;
    660662
    661663        doccount get_doccount() const;
    662664        docid get_lastdocid() const;