root / tags / 1.0.8 / xapian-core / backends / flint / flint_termlisttable.cc

Revision 9463, 4.8 kB (checked in by olly, 15 months ago)

common/omassert.h: Rewritten from scratch. The new version only
includes headers if assertions are enabled, which should help
to speed up non-assertion builds by reducing unnecessary header
inclusion. Also, float.h and math.h are never now pulled in -
instead we use the new within_DBL_EPSILON() function. AssertNe?()
and AssertNeParanoid?() are never actually used, so replace them with
AssertRel?() and AssertRelParanoid? which allow the user to assert any
binary relation, not just inequality. Also, we now use rare() to
give branch prediction hints for assertion tests (since the failure
branch should never be taken).
common/omdebug.h,common/stringutils.h,tests/harness/testsuite.h:
Replace several definitions of the STRINGIZE macro with a single
version in common/stringutils.h.
backends/flint/,backends/inmemory/inmemory_database.cc,
backends/multi/multi_postlist.cc,backends/quartz/,
backends/remote/remote-database.cc,bin/quartzcheck.cc,
bin/xapian-compact.cc,common/stringutils.h,expand/expandweight.cc,
expand/ortermlist.cc,matcher/phrasepostlist.cc,
matcher/scaleweightpostlist.cc,net/remoteconnection.cc,
net/tcpserver.cc: Explicitly include headers which were previously
being pulled in implicitly by omassert.h.
HACKING: Update the documentation for assertion calls, and document
CompileTimeAssert?() (which previously wasn't documented here).

Line 
1/** @file flint_termlisttable.cc
2 * @brief Subclass of FlintTable which holds termlists.
3 */
4/* Copyright (C) 2007 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19 */
20
21#include <config.h>
22
23#include <xapian/document.h>
24#include <xapian/error.h>
25#include <xapian/termiterator.h>
26
27#include "flint_termlisttable.h"
28#include "flint_utils.h"
29#include "omassert.h"
30#include "omdebug.h"
31#include "stringutils.h"
32#include "utils.h"
33
34#include <string>
35
36using namespace std;
37
38void
39FlintTermListTable::set_termlist(Xapian::docid did,
40                                 const Xapian::Document & doc,
41                                 flint_doclen_t doclen)
42{
43    DEBUGCALL(DB, void, "FlintTermListTable::set_termlist",
44              did << ", " << doc << ", " << doclen);
45
46    string tag = pack_uint(doclen);
47
48    Xapian::doccount termlist_size = doc.termlist_count();
49    if (termlist_size == 0) {
50        // doclen is sum(wdf) so should be zero if there are no terms.
51        Assert(doclen == 0);
52        Assert(doc.termlist_begin() == doc.termlist_end());
53        add(flint_docid_to_key(did), string());
54        return;
55    }
56
57    Xapian::TermIterator t = doc.termlist_begin();
58    if (t != doc.termlist_end()) {
59        tag += pack_uint(termlist_size);
60        string prev_term = *t;
61
62        // Previous database versions encoded a boolean here, which was
63        // always false (and pack_bool() encodes false as a '0').  We can
64        // just omit this and successfully read old and new termlists
65        // except in the case where the next byte is a '0' - in this case
66        // we need keep the '0' so that the decoder can just skip any '0'
67        // it sees in this position (this shouldn't be a common case - 48
68        // character terms aren't very common, and the first term
69        // alphabetically is likely to be shorter than average).
70        // FIXME: If we have an incompatible database version bump we should
71        // drop this completely.
72        if (prev_term.size() == '0') tag += '0';
73
74        tag += prev_term.size();
75        tag += prev_term;
76        tag += pack_uint(t.get_wdf());
77        --termlist_size;
78
79        while (++t != doc.termlist_end()) {
80            const string & term = *t;
81            // If there's a shared prefix with the previous term, we don't
82            // store it explicitly, but just store the length of the shared
83            // prefix.  In general, this is a big win.
84            size_t reuse = common_prefix_length(prev_term, term);
85
86            // reuse must be <= prev_term.size(), and we know that value while
87            // decoding.  So if the wdf is small enough that we can multiply it
88            // by (prev_term.size() + 1), add reuse and fit the result in a
89            // byte, then we can pack reuse and the wdf into a single byte and
90            // save ourselves a byte.  We actually need to add one to the wdf
91            // before multiplying so that a wdf of 0 can be detected by the
92            // decoder.
93            size_t packed = 0;
94            Xapian::termcount wdf = t.get_wdf();
95            // If wdf >= 128, then we aren't going to be able to pack it in so
96            // don't even try to avoid the calculation overflowing and making
97            // us think we can.
98            if (wdf < 127)
99                packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
100
101            if (packed && packed < 256) {
102                // We can pack the wdf into the same byte.
103                tag += char(packed);
104                tag += char(term.size() - reuse);
105                tag.append(term.data() + reuse, term.size() - reuse);
106            } else {
107                tag += char(reuse);
108                tag += char(term.size() - reuse);
109                tag.append(term.data() + reuse, term.size() - reuse);
110                // FIXME: pack wdf after reuse next time we rejig the format
111                // incompatibly.
112                tag += pack_uint(wdf);
113            }
114
115            prev_term = *t;
116            --termlist_size;
117        }
118    }
119    Assert(termlist_size == 0);
120    add(flint_docid_to_key(did), tag);
121}
122
123flint_doclen_t
124FlintTermListTable::get_doclength(Xapian::docid did) const
125{
126    DEBUGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did);
127
128    string tag;
129    if (!get_exact_entry(flint_docid_to_key(did), tag))
130        throw Xapian::DocNotFoundError("No termlist found for document " +
131                                       om_tostring(did));
132
133    if (tag.empty()) RETURN(0);
134
135    const char * pos = tag.data();
136    const char * end = pos + tag.size();
137
138    flint_doclen_t doclen;
139    if (!unpack_uint(&pos, end, &doclen)) {
140        const char *msg;
141        if (pos == 0) {
142            msg = "Too little data for doclen in termlist";
143        } else {
144            msg = "Overflowed value for doclen in termlist";
145        }
146        throw Xapian::DatabaseCorruptError(msg);
147    }
148
149    RETURN(doclen);
150}
Note: See TracBrowser for help on using the browser.