| 1 | |
|---|
| 2 | |
|---|
| 3 | |
|---|
| 4 | |
|---|
| 5 | |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | |
|---|
| 9 | |
|---|
| 10 | |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | |
|---|
| 20 | |
|---|
| 21 | #include <config.h> |
|---|
| 22 | |
|---|
| 23 | #include <xapian/types.h> |
|---|
| 24 | |
|---|
| 25 | #include "flint_positionlist.h" |
|---|
| 26 | #include "flint_utils.h" |
|---|
| 27 | #include "omdebug.h" |
|---|
| 28 | |
|---|
| 29 | #include <vector> |
|---|
| 30 | #include <string> |
|---|
| 31 | #include <cmath> |
|---|
| 32 | |
|---|
| 33 | using namespace std; |
|---|
| 34 | |
|---|
| 35 | static const unsigned char flstab[256] = { |
|---|
| 36 | 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, |
|---|
| 37 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, |
|---|
| 38 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, |
|---|
| 39 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, |
|---|
| 40 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, |
|---|
| 41 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, |
|---|
| 42 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, |
|---|
| 43 | 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, |
|---|
| 44 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 45 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 46 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 47 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 48 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 49 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 50 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
|---|
| 51 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 |
|---|
| 52 | }; |
|---|
| 53 | |
|---|
| 54 | |
|---|
| 55 | inline int my_fls(unsigned mask) |
|---|
| 56 | { |
|---|
| 57 | int result = 0; |
|---|
| 58 | if (mask >= 0x10000u) { |
|---|
| 59 | mask >>= 16; |
|---|
| 60 | result = 16; |
|---|
| 61 | } |
|---|
| 62 | if (mask >= 0x100u) { |
|---|
| 63 | mask >>= 8; |
|---|
| 64 | result += 8; |
|---|
| 65 | } |
|---|
| 66 | return result + flstab[mask]; |
|---|
| 67 | } |
|---|
| 68 | |
|---|
| 69 | class BitWriter { |
|---|
| 70 | private: |
|---|
| 71 | string buf; |
|---|
| 72 | int n_bits; |
|---|
| 73 | unsigned int acc; |
|---|
| 74 | public: |
|---|
| 75 | BitWriter() : n_bits(0), acc(0) { } |
|---|
| 76 | BitWriter(const string & seed) : buf(seed), n_bits(0), acc(0) { } |
|---|
| 77 | void encode(size_t value, size_t outof) { |
|---|
| 78 | Assert(value < outof); |
|---|
| 79 | size_t bits = my_fls(outof - 1); |
|---|
| 80 | const size_t spare = (1 << bits) - outof; |
|---|
| 81 | if (spare) { |
|---|
| 82 | const size_t mid_start = (outof - spare) / 2; |
|---|
| 83 | if (value < mid_start) { |
|---|
| 84 | write_bits(value, bits); |
|---|
| 85 | } else if (value >= mid_start + spare) { |
|---|
| 86 | write_bits((value - (mid_start + spare)) | (1 << (bits - 1)), bits); |
|---|
| 87 | } else { |
|---|
| 88 | --bits; |
|---|
| 89 | write_bits(value, bits); |
|---|
| 90 | } |
|---|
| 91 | } else { |
|---|
| 92 | write_bits(value, bits); |
|---|
| 93 | } |
|---|
| 94 | } |
|---|
| 95 | void write_bits(int data, int count) { |
|---|
| 96 | if (count + n_bits > 32) { |
|---|
| 97 | |
|---|
| 98 | |
|---|
| 99 | |
|---|
| 100 | Assert(count <= 32); |
|---|
| 101 | acc |= (data << n_bits); |
|---|
| 102 | buf += char(acc); |
|---|
| 103 | acc >>= 8; |
|---|
| 104 | data >>= 8; |
|---|
| 105 | count -= 8; |
|---|
| 106 | } |
|---|
| 107 | acc |= (data << n_bits); |
|---|
| 108 | n_bits += count; |
|---|
| 109 | while (n_bits >= 8) { |
|---|
| 110 | buf += char(acc); |
|---|
| 111 | acc >>= 8; |
|---|
| 112 | n_bits -= 8; |
|---|
| 113 | } |
|---|
| 114 | } |
|---|
| 115 | string & freeze() { |
|---|
| 116 | if (n_bits) { |
|---|
| 117 | buf += char(acc); |
|---|
| 118 | n_bits = 0; |
|---|
| 119 | acc = 0; |
|---|
| 120 | } |
|---|
| 121 | return buf; |
|---|
| 122 | } |
|---|
| 123 | }; |
|---|
| 124 | |
|---|
| 125 | static void |
|---|
| 126 | encode_interpolative(BitWriter & wr, const vector<Xapian::termpos> &pos, int j, int k) |
|---|
| 127 | { |
|---|
| 128 | while (j + 1 < k) { |
|---|
| 129 | const size_t mid = (j + k) / 2; |
|---|
| 130 | |
|---|
| 131 | |
|---|
| 132 | |
|---|
| 133 | const size_t outof = pos[k] - pos[j] + j - k + 1; |
|---|
| 134 | const size_t lowest = pos[j] + mid - j; |
|---|
| 135 | wr.encode(pos[mid] - lowest, outof); |
|---|
| 136 | encode_interpolative(wr, pos, j, mid); |
|---|
| 137 | j = mid; |
|---|
| 138 | } |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | namespace Xapian { |
|---|
| 142 | |
|---|
| 143 | class BitReader { |
|---|
| 144 | private: |
|---|
| 145 | string buf; |
|---|
| 146 | size_t idx; |
|---|
| 147 | int n_bits; |
|---|
| 148 | unsigned int acc; |
|---|
| 149 | public: |
|---|
| 150 | BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { } |
|---|
| 151 | Xapian::termpos decode(Xapian::termpos outof) { |
|---|
| 152 | size_t bits = my_fls(outof - 1); |
|---|
| 153 | const size_t spare = (1 << bits) - outof; |
|---|
| 154 | const size_t mid_start = (outof - spare) / 2; |
|---|
| 155 | Xapian::termpos p; |
|---|
| 156 | if (spare) { |
|---|
| 157 | p = read_bits(bits - 1); |
|---|
| 158 | if (p < mid_start) { |
|---|
| 159 | if (read_bits(1)) p += mid_start + spare; |
|---|
| 160 | } |
|---|
| 161 | } else { |
|---|
| 162 | p = read_bits(bits); |
|---|
| 163 | } |
|---|
| 164 | Assert(p < outof); |
|---|
| 165 | return p; |
|---|
| 166 | } |
|---|
| 167 | unsigned int read_bits(int count) { |
|---|
| 168 | unsigned int result; |
|---|
| 169 | if (count > 25) { |
|---|
| 170 | |
|---|
| 171 | |
|---|
| 172 | |
|---|
| 173 | |
|---|
| 174 | |
|---|
| 175 | Assert(count <= 32); |
|---|
| 176 | result = read_bits(16); |
|---|
| 177 | return result | (read_bits(count - 16) << 16); |
|---|
| 178 | } |
|---|
| 179 | while (n_bits < count) { |
|---|
| 180 | Assert(idx < buf.size()); |
|---|
| 181 | acc |= static_cast<unsigned char>(buf[idx++]) << n_bits; |
|---|
| 182 | n_bits += 8; |
|---|
| 183 | } |
|---|
| 184 | result = acc & ((1u << count) - 1); |
|---|
| 185 | acc >>= count; |
|---|
| 186 | n_bits -= count; |
|---|
| 187 | return result; |
|---|
| 188 | } |
|---|
| 189 | |
|---|
| 190 | |
|---|
| 191 | |
|---|
| 192 | |
|---|
| 193 | bool check_all_gone() const { |
|---|
| 194 | return (idx == buf.size() && n_bits < 7 && acc == 0); |
|---|
| 195 | } |
|---|
| 196 | void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k); |
|---|
| 197 | }; |
|---|
| 198 | |
|---|
| 199 | void |
|---|
| 200 | BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k) |
|---|
| 201 | { |
|---|
| 202 | while (j + 1 < k) { |
|---|
| 203 | const size_t mid = (j + k) / 2; |
|---|
| 204 | |
|---|
| 205 | |
|---|
| 206 | |
|---|
| 207 | const size_t outof = pos[k] - pos[j] + j - k + 1; |
|---|
| 208 | pos[mid] = decode(outof) + (pos[j] + mid - j); |
|---|
| 209 | decode_interpolative(pos, j, mid); |
|---|
| 210 | j = mid; |
|---|
| 211 | } |
|---|
| 212 | } |
|---|
| 213 | |
|---|
| 214 | } |
|---|
| 215 | |
|---|
| 216 | using Xapian::BitReader; |
|---|
| 217 | |
|---|
| 218 | void |
|---|
| 219 | FlintPositionListTable::set_positionlist(Xapian::docid did, |
|---|
| 220 | const string & tname, |
|---|
| 221 | Xapian::PositionIterator pos, |
|---|
| 222 | const Xapian::PositionIterator &pos_end) |
|---|
| 223 | { |
|---|
| 224 | DEBUGCALL(DB, void, "FlintPositionList::set_positionlist", |
|---|
| 225 | did << ", " << tname << ", " << pos << ", " << pos_end); |
|---|
| 226 | Assert(pos != pos_end); |
|---|
| 227 | |
|---|
| 228 | |
|---|
| 229 | vector<Xapian::termpos> poscopy(pos, pos_end); |
|---|
| 230 | |
|---|
| 231 | string key = make_key(did, tname); |
|---|
| 232 | |
|---|
| 233 | if (poscopy.size() == 1) { |
|---|
| 234 | |
|---|
| 235 | add(key, pack_uint(poscopy[0])); |
|---|
| 236 | return; |
|---|
| 237 | } |
|---|
| 238 | |
|---|
| 239 | BitWriter wr(pack_uint(poscopy.back())); |
|---|
| 240 | |
|---|
| 241 | wr.encode(poscopy[0], poscopy.back()); |
|---|
| 242 | wr.encode(poscopy.size() - 2, poscopy.back() - poscopy[0]); |
|---|
| 243 | encode_interpolative(wr, poscopy, 0, poscopy.size() - 1); |
|---|
| 244 | add(key, wr.freeze()); |
|---|
| 245 | } |
|---|
| 246 | |
|---|
| 247 | Xapian::termcount |
|---|
| 248 | FlintPositionListTable::positionlist_count(Xapian::docid did, |
|---|
| 249 | const string & term) const |
|---|
| 250 | { |
|---|
| 251 | DEBUGCALL(DB, void, "FlintPositionListTable::positionlist_count", |
|---|
| 252 | did << ", " << term); |
|---|
| 253 | |
|---|
| 254 | string data; |
|---|
| 255 | if (!get_exact_entry(pack_uint_preserving_sort(did) + term, data)) { |
|---|
| 256 | |
|---|
| 257 | return 0; |
|---|
| 258 | } |
|---|
| 259 | |
|---|
| 260 | const char * pos = data.data(); |
|---|
| 261 | const char * end = pos + data.size(); |
|---|
| 262 | Xapian::termpos pos_last; |
|---|
| 263 | if (!unpack_uint(&pos, end, &pos_last)) { |
|---|
| 264 | throw Xapian::DatabaseCorruptError("Position list data corrupt"); |
|---|
| 265 | } |
|---|
| 266 | if (pos == end) { |
|---|
| 267 | |
|---|
| 268 | return 1; |
|---|
| 269 | } |
|---|
| 270 | |
|---|
| 271 | BitReader rd(data); |
|---|
| 272 | |
|---|
| 273 | (void)rd.read_bits(8 * (pos - data.data())); |
|---|
| 274 | Xapian::termpos pos_first = rd.decode(pos_last); |
|---|
| 275 | Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2; |
|---|
| 276 | return pos_size; |
|---|
| 277 | } |
|---|
| 278 | |
|---|
| 279 | |
|---|
| 280 | |
|---|
| 281 | bool |
|---|
| 282 | FlintPositionList::read_data(const FlintTable * table, Xapian::docid did, |
|---|
| 283 | const string & tname) |
|---|
| 284 | { |
|---|
| 285 | DEBUGCALL(DB, void, "FlintPositionList::read_data", |
|---|
| 286 | table << ", " << did << ", " << tname); |
|---|
| 287 | |
|---|
| 288 | have_started = false; |
|---|
| 289 | positions.clear(); |
|---|
| 290 | |
|---|
| 291 | string data; |
|---|
| 292 | if (!table->get_exact_entry(pack_uint_preserving_sort(did) + tname, data)) { |
|---|
| 293 | |
|---|
| 294 | current_pos = positions.begin(); |
|---|
| 295 | return false; |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | const char * pos = data.data(); |
|---|
| 299 | const char * end = pos + data.size(); |
|---|
| 300 | Xapian::termpos pos_last; |
|---|
| 301 | if (!unpack_uint(&pos, end, &pos_last)) { |
|---|
| 302 | throw Xapian::DatabaseCorruptError("Position list data corrupt"); |
|---|
| 303 | } |
|---|
| 304 | if (pos == end) { |
|---|
| 305 | |
|---|
| 306 | positions.push_back(pos_last); |
|---|
| 307 | current_pos = positions.begin(); |
|---|
| 308 | return true; |
|---|
| 309 | } |
|---|
| 310 | BitReader rd(data); |
|---|
| 311 | |
|---|
| 312 | (void)rd.read_bits(8 * (pos - data.data())); |
|---|
| 313 | Xapian::termpos pos_first = rd.decode(pos_last); |
|---|
| 314 | Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2; |
|---|
| 315 | positions.resize(pos_size); |
|---|
| 316 | positions[0] = pos_first; |
|---|
| 317 | positions.back() = pos_last; |
|---|
| 318 | rd.decode_interpolative(positions, 0, pos_size - 1); |
|---|
| 319 | |
|---|
| 320 | current_pos = positions.begin(); |
|---|
| 321 | return true; |
|---|
| 322 | } |
|---|
| 323 | |
|---|
| 324 | Xapian::termcount |
|---|
| 325 | FlintPositionList::get_size() const |
|---|
| 326 | { |
|---|
| 327 | DEBUGCALL(DB, Xapian::termcount, "FlintPositionList::get_size", ""); |
|---|
| 328 | RETURN(positions.size()); |
|---|
| 329 | } |
|---|
| 330 | |
|---|
| 331 | Xapian::termpos |
|---|
| 332 | FlintPositionList::get_position() const |
|---|
| 333 | { |
|---|
| 334 | DEBUGCALL(DB, Xapian::termpos, "FlintPositionList::get_position", ""); |
|---|
| 335 | Assert(have_started); |
|---|
| 336 | RETURN(*current_pos); |
|---|
| 337 | } |
|---|
| 338 | |
|---|
| 339 | void |
|---|
| 340 | FlintPositionList::next() |
|---|
| 341 | { |
|---|
| 342 | DEBUGCALL(DB, void, "FlintPositionList::next", ""); |
|---|
| 343 | |
|---|
| 344 | if (!have_started) { |
|---|
| 345 | have_started = true; |
|---|
| 346 | } else { |
|---|
| 347 | Assert(!at_end()); |
|---|
| 348 | ++current_pos; |
|---|
| 349 | } |
|---|
| 350 | } |
|---|
| 351 | |
|---|
| 352 | void |
|---|
| 353 | FlintPositionList::skip_to(Xapian::termpos termpos) |
|---|
| 354 | { |
|---|
| 355 | DEBUGCALL(DB, void, "FlintPositionList::skip_to", termpos); |
|---|
| 356 | if (!have_started) { |
|---|
| 357 | have_started = true; |
|---|
| 358 | } |
|---|
| 359 | while (!at_end() && *current_pos < termpos) ++current_pos; |
|---|
| 360 | } |
|---|
| 361 | |
|---|
| 362 | bool |
|---|
| 363 | FlintPositionList::at_end() const |
|---|
| 364 | { |
|---|
| 365 | DEBUGCALL(DB, bool, "FlintPositionList::at_end", ""); |
|---|
| 366 | RETURN(current_pos == positions.end()); |
|---|
| 367 | } |
|---|