root / tags / 1.0.8 / xapian-core / backends / flint / flint_termlisttable.cc
| Revision 9463, 4.8 kB (checked in by olly, 15 months ago) |
|---|
| Line | |
|---|---|
| 1 | /** @file flint_termlisttable.cc |
| 2 | * @brief Subclass of FlintTable which holds termlists. |
| 3 | */ |
| 4 | /* Copyright (C) 2007 Olly Betts |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License as published by |
| 8 | * the Free Software Foundation; either version 2 of the License, or |
| 9 | * (at your option) any later version. |
| 10 | * |
| 11 | * This program is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | * GNU General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License |
| 17 | * along with this program; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | */ |
| 20 | |
| 21 | #include <config.h> |
| 22 | |
| 23 | #include <xapian/document.h> |
| 24 | #include <xapian/error.h> |
| 25 | #include <xapian/termiterator.h> |
| 26 | |
| 27 | #include "flint_termlisttable.h" |
| 28 | #include "flint_utils.h" |
| 29 | #include "omassert.h" |
| 30 | #include "omdebug.h" |
| 31 | #include "stringutils.h" |
| 32 | #include "utils.h" |
| 33 | |
| 34 | #include <string> |
| 35 | |
| 36 | using namespace std; |
| 37 | |
| 38 | void |
| 39 | FlintTermListTable::set_termlist(Xapian::docid did, |
| 40 | const Xapian::Document & doc, |
| 41 | flint_doclen_t doclen) |
| 42 | { |
| 43 | DEBUGCALL(DB, void, "FlintTermListTable::set_termlist", |
| 44 | did << ", " << doc << ", " << doclen); |
| 45 | |
| 46 | string tag = pack_uint(doclen); |
| 47 | |
| 48 | Xapian::doccount termlist_size = doc.termlist_count(); |
| 49 | if (termlist_size == 0) { |
| 50 | // doclen is sum(wdf) so should be zero if there are no terms. |
| 51 | Assert(doclen == 0); |
| 52 | Assert(doc.termlist_begin() == doc.termlist_end()); |
| 53 | add(flint_docid_to_key(did), string()); |
| 54 | return; |
| 55 | } |
| 56 | |
| 57 | Xapian::TermIterator t = doc.termlist_begin(); |
| 58 | if (t != doc.termlist_end()) { |
| 59 | tag += pack_uint(termlist_size); |
| 60 | string prev_term = *t; |
| 61 | |
| 62 | // Previous database versions encoded a boolean here, which was |
| 63 | // always false (and pack_bool() encodes false as a '0'). We can |
| 64 | // just omit this and successfully read old and new termlists |
| 65 | // except in the case where the next byte is a '0' - in this case |
| 66 | // we need keep the '0' so that the decoder can just skip any '0' |
| 67 | // it sees in this position (this shouldn't be a common case - 48 |
| 68 | // character terms aren't very common, and the first term |
| 69 | // alphabetically is likely to be shorter than average). |
| 70 | // FIXME: If we have an incompatible database version bump we should |
| 71 | // drop this completely. |
| 72 | if (prev_term.size() == '0') tag += '0'; |
| 73 | |
| 74 | tag += prev_term.size(); |
| 75 | tag += prev_term; |
| 76 | tag += pack_uint(t.get_wdf()); |
| 77 | --termlist_size; |
| 78 | |
| 79 | while (++t != doc.termlist_end()) { |
| 80 | const string & term = *t; |
| 81 | // If there's a shared prefix with the previous term, we don't |
| 82 | // store it explicitly, but just store the length of the shared |
| 83 | // prefix. In general, this is a big win. |
| 84 | size_t reuse = common_prefix_length(prev_term, term); |
| 85 | |
| 86 | // reuse must be <= prev_term.size(), and we know that value while |
| 87 | // decoding. So if the wdf is small enough that we can multiply it |
| 88 | // by (prev_term.size() + 1), add reuse and fit the result in a |
| 89 | // byte, then we can pack reuse and the wdf into a single byte and |
| 90 | // save ourselves a byte. We actually need to add one to the wdf |
| 91 | // before multiplying so that a wdf of 0 can be detected by the |
| 92 | // decoder. |
| 93 | size_t packed = 0; |
| 94 | Xapian::termcount wdf = t.get_wdf(); |
| 95 | // If wdf >= 128, then we aren't going to be able to pack it in so |
| 96 | // don't even try to avoid the calculation overflowing and making |
| 97 | // us think we can. |
| 98 | if (wdf < 127) |
| 99 | packed = (wdf + 1) * (prev_term.size() + 1) + reuse; |
| 100 | |
| 101 | if (packed && packed < 256) { |
| 102 | // We can pack the wdf into the same byte. |
| 103 | tag += char(packed); |
| 104 | tag += char(term.size() - reuse); |
| 105 | tag.append(term.data() + reuse, term.size() - reuse); |
| 106 | } else { |
| 107 | tag += char(reuse); |
| 108 | tag += char(term.size() - reuse); |
| 109 | tag.append(term.data() + reuse, term.size() - reuse); |
| 110 | // FIXME: pack wdf after reuse next time we rejig the format |
| 111 | // incompatibly. |
| 112 | tag += pack_uint(wdf); |
| 113 | } |
| 114 | |
| 115 | prev_term = *t; |
| 116 | --termlist_size; |
| 117 | } |
| 118 | } |
| 119 | Assert(termlist_size == 0); |
| 120 | add(flint_docid_to_key(did), tag); |
| 121 | } |
| 122 | |
| 123 | flint_doclen_t |
| 124 | FlintTermListTable::get_doclength(Xapian::docid did) const |
| 125 | { |
| 126 | DEBUGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did); |
| 127 | |
| 128 | string tag; |
| 129 | if (!get_exact_entry(flint_docid_to_key(did), tag)) |
| 130 | throw Xapian::DocNotFoundError("No termlist found for document " + |
| 131 | om_tostring(did)); |
| 132 | |
| 133 | if (tag.empty()) RETURN(0); |
| 134 | |
| 135 | const char * pos = tag.data(); |
| 136 | const char * end = pos + tag.size(); |
| 137 | |
| 138 | flint_doclen_t doclen; |
| 139 | if (!unpack_uint(&pos, end, &doclen)) { |
| 140 | const char *msg; |
| 141 | if (pos == 0) { |
| 142 | msg = "Too little data for doclen in termlist"; |
| 143 | } else { |
| 144 | msg = "Overflowed value for doclen in termlist"; |
| 145 | } |
| 146 | throw Xapian::DatabaseCorruptError(msg); |
| 147 | } |
| 148 | |
| 149 | RETURN(doclen); |
| 150 | } |
Note: See TracBrowser
for help on using the browser.
