root / tags / 1.0.8 / xapian-core / backends / flint / flint_utils.h

Revision 10730, 10.9 kB (checked in by olly, 7 months ago)

Backport change from trunk:
api/omqueryinternal.cc,backends/flint/flint_btreebase.cc,
backends/flint/flint_utils.h,
backends/inmemory/inmemory_positionlist.cc,matcher/stats.cc:
Assorted formatting tweaks.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1/* flint_utils.h: Generic functions for flint
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2006 Olly Betts
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
20 * USA
21 */
22
23#ifndef OM_HGUARD_FLINT_UTILS_H
24#define OM_HGUARD_FLINT_UTILS_H
25
26#include <xapian/types.h>
27
28#include <string>
29
30using namespace std;
31
32/// Compile time assert a condition.
33#define CASSERT(a) {char assert[(a) ? 1 : -1];(void)assert;}
34
35/// Compile time assert that type T is unsigned.
36#define CASSERT_TYPE_UNSIGNED(T) CASSERT(static_cast<T>(-1) > 0)
37
38typedef unsigned char       om_byte;
39typedef unsigned int        om_uint32;
40typedef int                 om_int32;
41
42/** FIXME: the pack and unpack int methods store in low-byte-first order
43 *  - it might be easier to implement efficient specialisations with
44 *  high-byte-first order.
45 */
46
47/** Reads an unsigned integer from a string starting at a given position.
48 *
49 *  @param src       A pointer to a pointer to the data to read.  The
50 *                   character pointer will be updated to point to the
51 *                   next character to read, or 0 if the method ran out of
52 *                   data.  (It is only set to 0 in case of an error).
53 *  @param src_end   A pointer to the byte after the end of the data to
54 *                   read the integer from.
55 *  @param resultptr A pointer to a place to store the result.  If an
56 *                   error occurs, the value stored in this location is
57 *                   undefined.  If this pointer is 0, the result is not
58 *                   stored, and the method simply skips over the result.
59 *
60 *  @result True if an integer was successfully read.  False if the read
61 *          failed.  Failure may either be due to the data running out (in
62 *          which case *src will equal 0), or due to the value read
63 *          overflowing the size of result (in which case *src will point
64 *          to wherever the value ends, despite the overflow).
65 */
66template<class T>
67bool
68unpack_uint(const char ** src,
69            const char * src_end,
70            T * resultptr)
71{
72    // Check unsigned
73    CASSERT_TYPE_UNSIGNED(T);
74
75    // Check byte is what it's meant to be
76    CASSERT(sizeof(om_byte) == 1);
77
78    unsigned int shift = 0;
79    T result = 0;
80
81    while (true) {
82        if ((*src) == src_end) {
83            *src = 0;
84            return false;
85        }
86
87        om_byte part = static_cast<om_byte>(**src);
88        (*src)++;
89
90        // if new byte might cause overflow, and it does
91        if (((shift > (sizeof(T) - 1) * 8 + 1) &&
92             ((part & 0x7f) << (shift % 8)) >= 0x100) ||
93            (shift >= sizeof(T) * 8)) {
94            // Overflowed - move to end of this integer
95            while (true) {
96                if ((part & 0x80) == 0) return false;
97                if ((*src) == src_end) {
98                    *src = 0;
99                    return false;
100                }
101                part = static_cast<om_byte>(**src);
102                (*src)++;
103            }
104        }
105
106        result += T(part & 0x7f) << shift;
107        shift += 7;
108
109        if ((part & 0x80) == 0) {
110            if (resultptr) *resultptr = result;
111            return true;
112        }
113    }
114}
115
116
117/** Generates a packed representation of an integer.
118 *
119 *  @param value  The integer to represent.
120 *
121 *  @result       A string containing the representation of the integer.
122 */
123template<class T>
124string
125pack_uint(T value)
126{
127    // Check unsigned
128    CASSERT_TYPE_UNSIGNED(T);
129
130    if (value == 0) return string("", 1u);
131    string result;
132
133    while (value != 0) {
134        om_byte part = static_cast<om_byte>(value & 0x7f);
135        value = value >> 7;
136        if (value) part |= 0x80;
137        result.append(1u, char(part));
138    }
139
140    return result;
141}
142
143/** Generates a packed representation of a bool.
144 *
145 *  This is a specialisation of the template above.
146 *
147 *  @param value  The bool to represent.
148 *
149 *  @result       A string containing the representation of the bool.
150 */
151template<>
152inline string
153pack_uint<bool>(bool value)
154{
155    return string(1, static_cast<char>(value));
156}
157
158/** Reads an unsigned integer from a string starting at a given position.
159 *  This encoding requires that we know the encoded length from out-of-band
160 *  information (so is suitable when only one integer is encoded, or for
161 *  the last integer encoded).
162 *
163 *  @param src       A pointer to a pointer to the data to read.
164 *  @param src_end   A pointer to the byte after the end of the data to
165 *                   read the integer from.
166 *  @param resultptr A pointer to a place to store the result.  If an
167 *                   error occurs, the value stored in this location is
168 *                   undefined.  If this pointer is 0, the result is not
169 *                   stored, and the method simply skips over the result.
170 *
171 *  @result True if an integer was successfully read.  False if the read
172 *          failed.  Failure can hapen if the value read overflows
173 *          the size of result.
174 */
175template<class T>
176bool
177unpack_uint_last(const char ** src, const char * src_end, T * resultptr)
178{
179    // Check unsigned
180    CASSERT_TYPE_UNSIGNED(T);
181    // Check byte is what it's meant to be
182    CASSERT(sizeof(om_byte) == 1);
183
184    if (src_end - *src > int(sizeof(T))) {
185        // Would overflow
186        *src = src_end;
187        return false;
188    }
189
190    T result = 0;
191    int shift = 0;
192    while (*src != src_end) {
193        result |= static_cast<T>(static_cast<om_byte>(**src)) << shift;
194        ++(*src);
195        shift += 8;
196    }
197    *resultptr = result;
198    return true;
199}
200
201/** Generates a packed representation of an integer.
202 *  This encoding requires that we know the encoded length from out-of-band
203 *  information (so is suitable when only one integer is encoded, or for
204 *  the last integer encoded).
205 *
206 *  @param value  The integer to represent.
207 *
208 *  @result       A string containing the representation of the integer.
209 */
210template<class T>
211string
212pack_uint_last(T value)
213{
214    // Check unsigned
215    CASSERT_TYPE_UNSIGNED(T);
216
217    string result;
218    while (value) {
219        result += char(value);
220        value >>= 8;
221    }
222    return result;
223}
224
225/** Generate a packed representation of an integer, preserving sort order.
226 *
227 *  This representation is less compact than the usual one, and has a limit
228 *  of 256 bytes on the length of the integer.  However, this is unlikely to
229 *  ever be a problem.
230 *
231 *  @param value  The integer to represent.
232 *
233 *  @result       A string containing the representation of the integer.
234 */
235template<class T>
236string
237pack_uint_preserving_sort(T value)
238{
239    // Check unsigned
240    CASSERT_TYPE_UNSIGNED(T);
241
242    string result;
243    while (value != 0) {
244        om_byte part = static_cast<om_byte>(value & 0xff);
245        value = value >> 8;
246        result.insert(string::size_type(0), 1u, char(part));
247    }
248    result.insert(string::size_type(0), 1u, char(result.size()));
249    return result;
250}
251
252/** Unpack a unsigned integer, store in sort preserving order.
253 *
254 *  @param src       A pointer to a pointer to the data to read.  The
255 *                   character pointer will be updated to point to the
256 *                   next character to read, or 0 if the method ran out of
257 *                   data.  (It is only set to 0 in case of an error).
258 *  @param src_end   A pointer to the byte after the end of the data to
259 *                   read the integer from.
260 *  @param resultptr A pointer to a place to store the result.  If an
261 *                   error occurs, the value stored in this location is
262 *                   undefined.  If this pointer is 0, the result is not
263 *                   stored, and the method simply skips over the result.
264 *
265 *  @result True if an integer was successfully read.  False if the read
266 *          failed.  Failure may either be due to the data running out (in
267 *          which case *src will equal 0), or due to the value read
268 *          overflowing the size of result (in which case *src will point
269 *          to wherever the value ends, despite the overflow).
270 */
271template<class T>
272bool
273unpack_uint_preserving_sort(const char ** src,
274                            const char * src_end,
275                            T * resultptr)
276{
277    if (*src == src_end) {
278        *src = 0;
279        return false;
280    }
281
282    unsigned int length = static_cast<om_byte>(**src);
283    (*src)++;
284
285    if (length > sizeof(T)) {
286        *src += length;
287        if (*src > src_end) {
288            *src = 0;
289        }
290        return false;
291    }
292
293    // Can't be overflow now.
294    T result = 0;
295    while (length > 0) {
296        result = result << 8;
297        result += static_cast<om_byte>(**src);
298        (*src)++;
299        length--;
300    }
301    *resultptr = result;
302
303    return true;
304}
305
306inline bool
307unpack_string(const char ** src,
308              const char * src_end,
309              string & result)
310{
311    string::size_type length;
312    if (!unpack_uint(src, src_end, &length)) {
313        return false;
314    }
315
316    if (src_end - *src < 0 ||
317        string::size_type(src_end - *src) < length) {
318        src = 0;
319        return false;
320    }
321
322    result = string(*src, length);
323    *src += length;
324    return true;
325}
326
327inline string
328pack_string(string value)
329{
330    return pack_uint(value.size()) + value;
331}
332
333/** Pack a string into a representation which preserves sort order.
334 *  We do this by replacing zero bytes in the string with a zero byte
335 *  followed by byte value 0xff, and then appending two zero bytes to
336 *  the end.
337 */
338inline string
339pack_string_preserving_sort(string value)
340{
341    string::size_type i = 0, j;
342    while ((j = value.find('\0', i)) != string::npos) {
343        value.replace(j, 1, "\0\xff", 2);
344        i = j + 2;
345    }
346    value += '\0'; // FIXME temp...
347    return value + '\0'; // Note - next byte mustn't be '\xff'...
348}
349
350inline bool
351unpack_string_preserving_sort(const char ** src,
352                              const char * src_end,
353                              string & result)
354{
355    result = "";
356    while (*src < src_end) {
357        const char *begin = *src;
358        while (**src) {
359            ++(*src);
360            if (*src == src_end) return false;
361        }
362        result += string(begin, *src - begin);
363        ++(*src);
364        if (*src == src_end) return false;
365        if (**src != '\xff') {
366            ++(*src); // FIXME temp
367            return true;
368        }
369        result += '\0';
370        ++(*src);
371    }
372    return false;
373}
374
375inline bool
376unpack_bool(const char ** src,
377            const char * src_end,
378            bool * resultptr)
379{
380    if (*src == src_end) {
381        *src = 0;
382        return false;
383    }
384    switch (*((*src)++)) {
385        case '0':
386            if (resultptr) *resultptr = false;
387            return true;
388        case '1':
389            if (resultptr) *resultptr = true;
390            return true;
391    }
392    *src = 0;
393    return false;
394}
395
396inline string
397pack_bool(bool value)
398{
399    return value ? "1" : "0";
400}
401
402/** Convert a document id to a key (suitable when the docid is the only
403 *  component of the key).
404 */
405inline string
406flint_docid_to_key(Xapian::docid did)
407{
408    return pack_uint_preserving_sort(did);
409}
410
411#endif /* OM_HGUARD_FLINT_UTILS_H */
Note: See TracBrowser for help on using the browser.