Changeset 10716

Show
Ignore:
Timestamp:
2008-06-23 00:47:29 (7 months ago)
Author:
richard
Message:

backends/chert/: Change ChertAllDocsPostList to inherit from
ChertPostList and use the doclen list from the posting table
rather than using the termlist table. This helps towards making
the termlist table optional, reduces the amount of data read in
the process of iterating through an alldocs postlist, and can
make a massive difference in performance: I've measured the time
to iterate through all the documents in a 1000000 document
database, and the patch speeds this operation up by a factor of
6.
Add keep_reference parameter to the ChertPostList constructor,
instead of not keeping a reference if the term is empty.
ChertAllDocsPostList uses this to keep a reference to the
database while using an empty term.
Add ChertAllDocsModifiedPostList class, inspired by
ChertModifiedPostList, (with corresponding new source files) to
handle alldocs postlist with modifications: this wasn't needed
before since the termlist is updated immediately after changes.

Location:
trunk/xapian-core
Files:
2 added
8 modified

Legend:

Unmodified
Added
Removed
  • trunk/xapian-core/ChangeLog

    r10715 r10716  
     1Sun Jun 22 23:44:46 GMT 2008  Richard Boulton <richard@lemurconsulting.com> 
     2 
     3        * backends/chert/: Change ChertAllDocsPostList to inherit from 
     4          ChertPostList and use the doclen list from the posting table 
     5          rather than using the termlist table.  This helps towards making 
     6          the termlist table optional, reduces the amount of data read in 
     7          the process of iterating through an alldocs postlist, and can 
     8          make a massive difference in performance: I've measured the time 
     9          to iterate through all the documents in a 1000000 document 
     10          database, and the patch speeds this operation up by a factor of 
     11          6. 
     12 
     13          Add `keep_reference` parameter to the ChertPostList constructor, 
     14          instead of not keeping a reference if the term is empty. 
     15          ChertAllDocsPostList uses this to keep a reference to the 
     16          database while using an empty term. 
     17 
     18          Add ChertAllDocsModifiedPostList class, inspired by 
     19          ChertModifiedPostList, (with corresponding new source files) to 
     20          handle alldocs postlist with modifications: this wasn't needed 
     21          before since the termlist is updated immediately after changes. 
     22 
    123Sun Jun 22 21:37:00 GMT 2008  Richard Boulton <richard@lemurconsulting.com> 
    224 
  • trunk/xapian-core/backends/chert/Makefile.mk

    r10346 r10716  
    55if BUILD_BACKEND_CHERT 
    66noinst_HEADERS +=\ 
     7        backends/chert/chert_alldocsmodifiedpostlist.h\ 
    78        backends/chert/chert_alldocspostlist.h\ 
    89        backends/chert/chert_alltermslist.h\ 
     
    3132 
    3233libxapian_la_SOURCES +=\ 
     34        backends/chert/chert_alldocsmodifiedpostlist.cc\ 
    3335        backends/chert/chert_alldocspostlist.cc\ 
    3436        backends/chert/chert_alltermslist.cc\ 
  • trunk/xapian-core/backends/chert/chert_alldocspostlist.cc

    r10346 r10716  
    33 */ 
    44/* Copyright (C) 2006,2007,2008 Olly Betts 
     5 * Copyright (C) 2008 Lemur Consulting Ltd 
    56 * 
    67 * This program is free software; you can redistribute it and/or modify 
     
    2324#include <string> 
    2425 
     26#include "chert_alldocspostlist.h" 
    2527#include "chert_database.h" 
    26 #include "chert_alldocspostlist.h" 
    2728 
    2829#include "utils.h" 
     
    3031using namespace std; 
    3132 
     33ChertAllDocsPostList::ChertAllDocsPostList(Xapian::Internal::RefCntPtr<const ChertDatabase> db_, 
     34                                           Xapian::doccount doccount_) 
     35        : ChertPostList(db_, string(), true), 
     36          doccount(doccount_) 
     37{ 
     38    DEBUGCALL(DB, void, "ChertAllDocsPostList::ChertAllDocsPostList", db_.get() << ", " << doccount_); 
     39} 
     40 
    3241Xapian::doccount 
    3342ChertAllDocsPostList::get_termfreq() const 
    3443{ 
    35     return doccount; 
    36 } 
    37  
    38 Xapian::docid 
    39 ChertAllDocsPostList::get_docid() const 
    40 { 
    41     return current_did; 
     44    DEBUGCALL(DB, Xapian::doccount, "ChertAllDocsPostList::get_termfreq", ""); 
     45    RETURN(doccount); 
    4246} 
    4347 
     
    4650{ 
    4751    DEBUGCALL(DB, Xapian::doclength, "ChertAllDocsPostList::get_doclength", ""); 
    48     Assert(current_did); 
    4952 
    50     cursor->read_tag(); 
    51  
    52     if (cursor->current_tag.empty()) RETURN(0); 
    53  
    54     const char * pos = cursor->current_tag.data(); 
    55     const char * end = pos + cursor->current_tag.size(); 
    56  
    57     chert_doclen_t doclen; 
    58     if (!unpack_uint(&pos, end, &doclen)) { 
    59         const char *msg; 
    60         if (pos == 0) { 
    61             msg = "Too little data for doclen in termlist"; 
    62         } else { 
    63             msg = "Overflowed value for doclen in termlist"; 
    64         } 
    65         throw Xapian::DatabaseCorruptError(msg); 
    66     } 
    67  
    68     RETURN(doclen); 
     53    RETURN(ChertPostList::get_wdf()); 
    6954} 
    7055 
     
    7358{ 
    7459    DEBUGCALL(DB, Xapian::termcount, "ChertAllDocsPostList::get_wdf", ""); 
    75     Assert(current_did); 
     60    AssertParanoid(!at_end()); 
    7661    RETURN(1); 
    7762} 
    7863 
    79 PostList * 
    80 ChertAllDocsPostList::read_did_from_current_key() 
     64PositionList * 
     65ChertAllDocsPostList::read_position_list() 
    8166{ 
    82     DEBUGCALL(DB, PostList *, "ChertAllDocsPostList::read_did_from_current_key", 
    83               ""); 
    84     const string & key = cursor->current_key; 
    85     const char * pos = key.data(); 
    86     const char * end = pos + key.size(); 
    87     if (!unpack_uint_preserving_sort(&pos, end, &current_did)) { 
    88         const char *msg; 
    89         if (pos == 0) { 
    90             msg = "Too little data in termlist key"; 
    91         } else { 
    92             msg = "Overflowed value in termlist key"; 
    93         } 
    94         throw Xapian::DatabaseCorruptError(msg); 
    95     } 
    96  
    97     // Return NULL to help the compiler tail-call optimise our callers. 
    98     RETURN(NULL); 
     67    DEBUGCALL(DB, Xapian::termcount, "ChertAllDocsPostList::read_position_list", ""); 
     68    throw Xapian::InvalidOperationError("ChertAllDocsPostList::read_position_list() not meaningful"); 
    9969} 
    10070 
    101 PostList * 
    102 ChertAllDocsPostList::next(Xapian::weight /*w_min*/) 
     71PositionList * 
     72ChertAllDocsPostList::open_position_list() const 
    10373{ 
    104     DEBUGCALL(DB, PostList *, "ChertAllDocsPostList::next", "/*w_min*/"); 
    105     Assert(!at_end()); 
    106     if (!cursor->next()) RETURN(NULL); 
    107     RETURN(read_did_from_current_key()); 
    108 } 
    109  
    110 PostList * 
    111 ChertAllDocsPostList::skip_to(Xapian::docid did, Xapian::weight /*w_min*/) 
    112 { 
    113     DEBUGCALL(DB, PostList *, "ChertAllDocsPostList::skip_to", 
    114               did << ", /*w_min*/"); 
    115  
    116     if (did <= current_did || at_end()) RETURN(NULL); 
    117  
    118     if (cursor->find_entry_ge(pack_uint_preserving_sort(did))) { 
    119         // The exact docid that was asked for exists. 
    120         current_did = did; 
    121         RETURN(NULL); 
    122     } 
    123     if (cursor->after_end()) RETURN(NULL); 
    124  
    125     RETURN(read_did_from_current_key()); 
    126 } 
    127  
    128 bool 
    129 ChertAllDocsPostList::at_end() const { 
    130     DEBUGCALL(DB, bool, "ChertAllDocsPostList::at_end", ""); 
    131     RETURN(cursor->after_end()); 
     74    DEBUGCALL(DB, Xapian::termcount, "ChertAllDocsPostList::open_position_list", ""); 
     75    throw Xapian::InvalidOperationError("ChertAllDocsPostList::open_position_list() not meaningful"); 
    13276} 
    13377 
     
    13680{ 
    13781    string desc = "ChertAllDocsPostList(did="; 
    138     desc += om_tostring(current_did); 
     82    desc += om_tostring(get_docid()); 
    13983    desc += ",doccount="; 
    14084    desc += om_tostring(doccount); 
  • trunk/xapian-core/backends/chert/chert_alldocspostlist.h

    r10346 r10716  
    33 */ 
    44/* Copyright (C) 2006,2007,2008 Olly Betts 
     5 * Copyright (C) 2008 Lemur Consulting Ltd 
    56 * 
    67 * This program is free software; you can redistribute it and/or modify 
     
    2526 
    2627#include "leafpostlist.h" 
     28#include "chert_postlist.h" 
    2729 
    28 class ChertAllDocsPostList : public LeafPostList { 
     30class ChertAllDocsPostList : public ChertPostList { 
    2931    /// Don't allow assignment. 
    3032    void operator=(const ChertAllDocsPostList &); 
     
    3335    ChertAllDocsPostList(const ChertAllDocsPostList &); 
    3436 
    35     /// Set @a current_did from @a cursor->current_key. 
    36     PostList * read_did_from_current_key(); 
    37  
    38     /// The database we're iterating over. 
    39     Xapian::Internal::RefCntPtr<const ChertDatabase> db; 
    40  
    4137    /// The number of documents in the database. 
    4238    Xapian::doccount doccount; 
    4339 
    44     /// Cursor running over termlist table keys. 
    45     AutoPtr<ChertCursor> cursor; 
    46  
    47     /// The current document id. 
    48     Xapian::docid current_did; 
    49  
    5040  public: 
    5141    ChertAllDocsPostList(Xapian::Internal::RefCntPtr<const ChertDatabase> db_, 
    52                          Xapian::doccount doccount_) 
    53       : db(db_), doccount(doccount_), cursor(db->termlist_table.cursor_get()), 
    54         current_did(0) 
    55     { 
    56         cursor->find_entry(""); 
    57     } 
     42                         Xapian::doccount doccount_); 
    5843 
    5944    Xapian::doccount get_termfreq() const; 
    60  
    61     Xapian::docid get_docid() const; 
    6245 
    6346    Xapian::doclength get_doclength() const; 
     
    6548    Xapian::termcount get_wdf() const; 
    6649 
    67     PostList * next(Xapian::weight w_min); 
     50    PositionList *read_position_list(); 
    6851 
    69     PostList * skip_to(Xapian::docid desired_did, Xapian::weight w_min); 
    70  
    71     bool at_end() const; 
     52    PositionList *open_position_list() const; 
    7253 
    7354    std::string get_description() const; 
  • trunk/xapian-core/backends/chert/chert_database.cc

    r10675 r10716  
    3232 
    3333#include "contiguousalldocspostlist.h" 
     34#include "chert_alldocsmodifiedpostlist.h" 
    3435#include "chert_alldocspostlist.h" 
    3536#include "chert_alltermslist.h" 
     
    915916    } 
    916917 
    917     RETURN(new ChertPostList(ptrtothis, term)); 
     918    RETURN(new ChertPostList(ptrtothis, term, true)); 
    918919} 
    919920 
     
    18041805            RETURN(new ContiguousAllDocsPostList(ptrtothis, doccount)); 
    18051806        } 
    1806         RETURN(new ChertAllDocsPostList(ptrtothis, doccount)); 
     1807        if (doclens.empty()) { 
     1808            RETURN(new ChertAllDocsPostList(ptrtothis, doccount)); 
     1809        } else { 
     1810            RETURN(new ChertAllDocsModifiedPostList(ptrtothis, doccount, doclens)); 
     1811        } 
    18071812    } 
    18081813 
     
    18151820    } 
    18161821 
    1817     RETURN(new ChertPostList(ptrtothis, tname)); 
     1822    RETURN(new ChertPostList(ptrtothis, tname, true)); 
    18181823} 
    18191824 
  • trunk/xapian-core/backends/chert/chert_modifiedpostlist.h

    r10346 r10716  
    4747                          const string & tname_, 
    4848                          const map<Xapian::docid, pair<char, Xapian::termcount> > & mods_) 
    49         : ChertPostList(this_db_, tname_), 
     49        : ChertPostList(this_db_, tname_, true), 
    5050          mods(mods_), it(mods.begin()), poslist(0) 
    5151    { } 
  • trunk/xapian-core/backends/chert/chert_postlist.cc

    r10428 r10716  
    33 * Copyright 1999,2000,2001 BrightStation PLC 
    44 * Copyright 2002,2003,2004,2005,2007,2008 Olly Betts 
    5  * Copyright 2007 Lemur Consulting Ltd 
     5 * Copyright 2007,2008 Lemur Consulting Ltd 
    66 * 
    77 * This program is free software; you can redistribute it and/or 
     
    6262                                  Xapian::Internal::RefCntPtr<const ChertDatabase> db) const { 
    6363    if (!doclen_pl.get()) { 
    64         doclen_pl.reset(new ChertPostList(db, string())); 
     64        // Don't keep a reference back to the database, since this 
     65        // would make a reference loop. 
     66        doclen_pl.reset(new ChertPostList(db, string(), false)); 
    6567    } 
    6668    if (!doclen_pl->jump_to(did)) 
     
    656658 */ 
    657659ChertPostList::ChertPostList(Xapian::Internal::RefCntPtr<const ChertDatabase> this_db_, 
    658                              const string & tname_) 
    659         : this_db(tname_.empty() ? NULL : this_db_), // Don't keep a reference if this is a "doclen postlist". 
     660                             const string & tname_, 
     661                             bool keep_reference) 
     662        : this_db(keep_reference ? this_db_ : NULL), 
    660663          tname(tname_), 
    661664          have_started(false), 
     
    664667{ 
    665668    DEBUGCALL(DB, void, "ChertPostList::ChertPostList", 
    666               this_db_.get() << ", " << tname_); 
     669              this_db_.get() << ", " << tname_ << ", " << keep_reference); 
    667670    string key = ChertPostListTable::make_key(tname); 
    668671    int found = cursor->find_entry(key); 
    669672    if (!found) { 
     673        DEBUGLINE(DB, "postlist for term not found"); 
    670674        number_of_entries = 0; 
    671675        is_at_end = true; 
     
    685689                                            &is_last_chunk); 
    686690    read_wdf(&pos, end, &wdf); 
     691    DEBUGLINE(DB, "Initial docid " << did); 
    687692} 
    688693 
  • trunk/xapian-core/backends/chert/chert_postlist.h

    r10346 r10716  
    224224        /// Default constructor. 
    225225        ChertPostList(Xapian::Internal::RefCntPtr<const ChertDatabase> this_db_, 
    226                       const string & tname); 
     226                      const string & tname, 
     227                      bool keep_reference); 
    227228 
    228229        /// Destructor.