root / tags / 1.0.8 / xapian-core / backends / flint / flint_positionlist.cc

Revision 9256, 10.0 kB (checked in by olly, 16 months ago)

common/msvc_posix_wrapper.cc,common/msvc_posix_wrapper.h: Add
msvc_posix_rename() which can rename a file on top of another file.
common/stringutils.h: Add common_prefix_length() function.
backends/flint/: Clean up FlintWritableDatabase? - it now just
inherits from FlintDatabase? which allows several virtual methods
which just forwarded to FlintDatabase? to be dropped. Also, we
now no longer need to pass FlintTable objects to other classes
- they can just find the tables they want via the database pointer.
The never-used "store_termfreqs" flag has been dropped from the
termlist table entries - existing 1.0.x flint databases will be
automatically upgraded to the new version. Opening a database
now calls stat() less, so should be slightly more efficient.
And TermIterator::positionlist_count() now works for the flint
backend.
tests/Makefile.am,tests/api_db.cc,tests/testdata/flint-1.0.2/: New
test flintbackwardcompat2 which tests that we can open a flint
database from 1.0.2.
tests/api_wrdb.cc: New test adddoc4 which checks that termlists
handle an initial term of any valid length correctly.
tests/testdata/flint-1.0.1/postlist.DB: Mark as a binary file in
SVN.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1/* flint_positionlist.cc: A position list in a flint database.
2 *
3 * Copyright (C) 2004,2005,2006 Olly Betts
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 * USA
19 */
20
21#include <config.h>
22
23#include <xapian/types.h>
24
25#include "flint_positionlist.h"
26#include "flint_utils.h"
27#include "omdebug.h"
28
29#include <vector>
30#include <string>
31#include <cmath>
32
33using namespace std;
34
35static const unsigned char flstab[256] = {
36    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
37    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
38    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
39    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
40    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
41    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
42    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
43    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
44    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
45    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
46    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
47    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
48    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
49    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
50    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
51    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
52};
53
54// Highly optimised fls() implementation.
55inline int my_fls(unsigned mask)
56{
57    int result = 0;
58    if (mask >= 0x10000u) {
59        mask >>= 16;
60        result = 16;
61    }
62    if (mask >= 0x100u) {
63        mask >>= 8;
64        result += 8;
65    }
66    return result + flstab[mask];
67}
68
69class BitWriter {
70    private:
71        string buf;
72        int n_bits;
73        unsigned int acc;
74    public:
75        BitWriter() : n_bits(0), acc(0) { }
76        BitWriter(const string & seed) : buf(seed), n_bits(0), acc(0) { }
77        void encode(size_t value, size_t outof) {
78            Assert(value < outof);
79            size_t bits = my_fls(outof - 1);
80            const size_t spare = (1 << bits) - outof;
81            if (spare) {
82                const size_t mid_start = (outof - spare) / 2;
83                if (value < mid_start) {
84                    write_bits(value, bits);
85                } else if (value >= mid_start + spare) {
86                    write_bits((value - (mid_start + spare)) | (1 << (bits - 1)), bits);
87                } else {
88                    --bits;
89                    write_bits(value, bits);
90                }
91            } else {
92                write_bits(value, bits);
93            }
94        }
95        void write_bits(int data, int count) {
96            if (count + n_bits > 32) {
97                // We need to write more bits than there's empty room for in
98                // the accumulator.  So we arrange to shift out 8 bits, then
99                // adjust things so we're adding 8 fewer bits.
100                Assert(count <= 32);
101                acc |= (data << n_bits);
102                buf += char(acc);
103                acc >>= 8;
104                data >>= 8;
105                count -= 8;
106            }
107            acc |= (data << n_bits);
108            n_bits += count;
109            while (n_bits >= 8) {
110                buf += char(acc);
111                acc >>= 8;
112                n_bits -= 8;
113            }
114        }
115        string & freeze() {
116            if (n_bits) {
117                buf += char(acc);
118                n_bits = 0;
119                acc = 0;
120            }
121            return buf;
122        }
123};
124
125static void
126encode_interpolative(BitWriter & wr, const vector<Xapian::termpos> &pos, int j, int k)
127{
128    while (j + 1 < k) {
129        const size_t mid = (j + k) / 2;
130        // Encode one out of (pos[k] - pos[j] + 1) values
131        // (less some at either end because we must be able to fit
132        // all the intervening pos in)
133        const size_t outof = pos[k] - pos[j] + j - k + 1;
134        const size_t lowest = pos[j] + mid - j;
135        wr.encode(pos[mid] - lowest, outof);
136        encode_interpolative(wr, pos, j, mid);
137        j = mid;
138    }
139}
140
141namespace Xapian {
142
143class BitReader {
144    private:
145        string buf;
146        size_t idx;
147        int n_bits;
148        unsigned int acc;
149    public:
150        BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { }
151        Xapian::termpos decode(Xapian::termpos outof) {
152            size_t bits = my_fls(outof - 1);
153            const size_t spare = (1 << bits) - outof;
154            const size_t mid_start = (outof - spare) / 2;
155            Xapian::termpos p;
156            if (spare) {
157                p = read_bits(bits - 1);
158                if (p < mid_start) {
159                    if (read_bits(1)) p += mid_start + spare;
160                }
161            } else {
162                p = read_bits(bits);
163            }
164            Assert(p < outof);
165            return p;
166        }
167        unsigned int read_bits(int count) {
168            unsigned int result;
169            if (count > 25) {
170                // If we need more than 25 bits, read in two goes to ensure
171                // that we don't overflow acc.  This is a little more
172                // conservative than it needs to be, but such large values will
173                // inevitably be rare (because you can't fit very many of them
174                // into 2^32!)
175                Assert(count <= 32);
176                result = read_bits(16);
177                return result | (read_bits(count - 16) << 16);
178            }
179            while (n_bits < count) {
180                Assert(idx < buf.size());
181                acc |= static_cast<unsigned char>(buf[idx++]) << n_bits;
182                n_bits += 8;
183            }
184            result = acc & ((1u << count) - 1);
185            acc >>= count;
186            n_bits -= count;
187            return result;
188        }
189        // Check all the data has been read.  Because it'll be zero padded
190        // to fill a byte, the best we can actually do is check that
191        // there's less than a byte left and that all remaining bits are
192        // zero.
193        bool check_all_gone() const {
194            return (idx == buf.size() && n_bits < 7 && acc == 0);
195        }
196        void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k);
197};
198
199void
200BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k)
201{
202    while (j + 1 < k) {
203        const size_t mid = (j + k) / 2;
204        // Decode one out of (pos[k] - pos[j] + 1) values
205        // (less some at either end because we must be able to fit
206        // all the intervening pos in)
207        const size_t outof = pos[k] - pos[j] + j - k + 1;
208        pos[mid] = decode(outof) + (pos[j] + mid - j);
209        decode_interpolative(pos, j, mid);
210        j = mid;
211    }
212}
213
214}
215
216using Xapian::BitReader;
217
218void
219FlintPositionListTable::set_positionlist(Xapian::docid did,
220                                         const string & tname,
221                                         Xapian::PositionIterator pos,
222                                         const Xapian::PositionIterator &pos_end)
223{
224    DEBUGCALL(DB, void, "FlintPositionList::set_positionlist",
225              did << ", " << tname << ", " << pos << ", " << pos_end);
226    Assert(pos != pos_end);
227
228    // FIXME: avoid the need for this copy!
229    vector<Xapian::termpos> poscopy(pos, pos_end);
230
231    string key = make_key(did, tname);
232
233    if (poscopy.size() == 1) {
234        // Special case for single entry position list.
235        add(key, pack_uint(poscopy[0]));
236        return;
237    }
238
239    BitWriter wr(pack_uint(poscopy.back()));
240
241    wr.encode(poscopy[0], poscopy.back());
242    wr.encode(poscopy.size() - 2, poscopy.back() - poscopy[0]);
243    encode_interpolative(wr, poscopy, 0, poscopy.size() - 1);
244    add(key, wr.freeze());
245}
246
247Xapian::termcount
248FlintPositionListTable::positionlist_count(Xapian::docid did,
249                                           const string & term) const
250{
251    DEBUGCALL(DB, void, "FlintPositionListTable::positionlist_count",
252              did << ", " << term);
253
254    string data;
255    if (!get_exact_entry(pack_uint_preserving_sort(did) + term, data)) {
256        // There's no positional information for this term.
257        return 0;
258    }
259
260    const char * pos = data.data();
261    const char * end = pos + data.size();
262    Xapian::termpos pos_last;
263    if (!unpack_uint(&pos, end, &pos_last)) {
264        throw Xapian::DatabaseCorruptError("Position list data corrupt");
265    }
266    if (pos == end) {
267        // Special case for single entry position list.
268        return 1;
269    }
270
271    BitReader rd(data);
272    // Skip the header we just read.
273    (void)rd.read_bits(8 * (pos - data.data()));
274    Xapian::termpos pos_first = rd.decode(pos_last);
275    Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
276    return pos_size;
277}
278
279///////////////////////////////////////////////////////////////////////////
280
281bool
282FlintPositionList::read_data(const FlintTable * table, Xapian::docid did,
283                             const string & tname)
284{
285    DEBUGCALL(DB, void, "FlintPositionList::read_data",
286              table << ", " << did << ", " << tname);
287
288    have_started = false;
289    positions.clear();
290
291    string data;
292    if (!table->get_exact_entry(pack_uint_preserving_sort(did) + tname, data)) {
293        // There's no positional information for this term.
294        current_pos = positions.begin();
295        return false;
296    }
297
298    const char * pos = data.data();
299    const char * end = pos + data.size();
300    Xapian::termpos pos_last;
301    if (!unpack_uint(&pos, end, &pos_last)) {
302        throw Xapian::DatabaseCorruptError("Position list data corrupt");
303    }
304    if (pos == end) {
305        // Special case for single entry position list.
306        positions.push_back(pos_last);
307        current_pos = positions.begin();
308        return true;
309    }
310    BitReader rd(data);
311    // Skip the header we just read.
312    (void)rd.read_bits(8 * (pos - data.data()));
313    Xapian::termpos pos_first = rd.decode(pos_last);
314    Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
315    positions.resize(pos_size);
316    positions[0] = pos_first;
317    positions.back() = pos_last;
318    rd.decode_interpolative(positions, 0, pos_size - 1);
319
320    current_pos = positions.begin();
321    return true;
322}
323
324Xapian::termcount
325FlintPositionList::get_size() const
326{
327    DEBUGCALL(DB, Xapian::termcount, "FlintPositionList::get_size", "");
328    RETURN(positions.size());
329}
330
331Xapian::termpos
332FlintPositionList::get_position() const
333{
334    DEBUGCALL(DB, Xapian::termpos, "FlintPositionList::get_position", "");
335    Assert(have_started);
336    RETURN(*current_pos);
337}
338
339void
340FlintPositionList::next()
341{
342    DEBUGCALL(DB, void, "FlintPositionList::next", "");
343
344    if (!have_started) {
345        have_started = true;
346    } else {
347        Assert(!at_end());
348        ++current_pos;
349    }
350}
351
352void
353FlintPositionList::skip_to(Xapian::termpos termpos)
354{
355    DEBUGCALL(DB, void, "FlintPositionList::skip_to", termpos);
356    if (!have_started) {
357        have_started = true;
358    }
359    while (!at_end() && *current_pos < termpos) ++current_pos;
360}
361
362bool
363FlintPositionList::at_end() const
364{
365    DEBUGCALL(DB, bool, "FlintPositionList::at_end", "");
366    RETURN(current_pos == positions.end());
367}
Note: See TracBrowser for help on using the browser.