Ticket #326: chunktypes.patch
File chunktypes.patch, 14.7 KB (added by , 16 years ago) |
---|
-
chert_postlist.h
122 122 /** A postlist in a chert database. 123 123 */ 124 124 class ChertPostList : public LeafPostList { 125 public: 126 /// Types of chunks. 127 typedef enum { 128 /// Items just consist of wdfs - all docids in range are present. 129 DENSE, 130 131 /// Items consist of docid increments and wdfs. 132 SPARSE 133 } chunk_type; 134 125 135 protected: // ChertModifiedPostList needs to access these. 126 136 /** The database we are searching. This pointer is held so that the 127 137 * database doesn't get deleted before us, and also to give us access … … 157 167 /// Pointer to byte after end of current chunk. 158 168 const char * end; 159 169 170 /// The type of the current chunk. 171 chunk_type current_type; 172 160 173 /// Document id we're currently at. 161 174 Xapian::docid did; 162 175 … … 179 192 * If already at the end of the chunk, returns false. 180 193 */ 181 194 bool next_in_chunk(); 195 bool next_in_chunk_dense(); 196 bool next_in_chunk_sparse(); 182 197 183 198 /** Move to the next chunk. 184 199 * -
chert_postlist.cc
96 96 97 97 /// Append a block of raw entries to this chunk. 98 98 void raw_append(Xapian::docid first_did_, Xapian::docid current_did_, 99 const string & s ) {99 const string & s, ChertPostList::chunk_type type_) { 100 100 Assert(!started); 101 101 first_did = first_did_; 102 102 current_did = current_did_; 103 103 if (!s.empty()) { 104 104 chunk.append(s); 105 105 started = true; 106 current_type = type_; 106 107 } 107 108 } 108 109 … … 121 122 122 123 Xapian::docid first_did; 123 124 Xapian::docid current_did; 125 ChertPostList::chunk_type current_type; 124 126 125 127 string chunk; 128 void start_new_chunk(ChertTable * table, 129 Xapian::docid new_first_did); 126 130 }; 127 131 128 132 // Static functions … … 222 226 read_start_of_chunk(const char ** posptr, 223 227 const char * end, 224 228 Xapian::docid first_did_in_chunk, 225 bool * is_last_chunk_ptr) 229 bool * is_last_chunk_ptr, 230 ChertPostList::chunk_type * new_type) 226 231 { 227 232 DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_chunk", 228 233 reinterpret_cast<const void*>(posptr) << ", " << 229 234 reinterpret_cast<const void*>(end) << ", " << 230 235 first_did_in_chunk << ", " << 231 reinterpret_cast<const void*>(is_last_chunk_ptr)); 236 reinterpret_cast<const void*>(is_last_chunk_ptr) << 237 reinterpret_cast<const void*>(new_type)); 232 238 233 239 // Read whether this is the last chunk 234 240 if (!unpack_bool(posptr, end, is_last_chunk_ptr)) … … 241 247 if (!unpack_uint(posptr, end, &increase_to_last)) 242 248 report_read_error(*posptr); 243 249 Xapian::docid last_did_in_chunk = first_did_in_chunk + increase_to_last; 250 251 // Read the chunk format. 
252 unsigned tmp; 253 if (!unpack_uint(posptr, end, &tmp)) 254 report_read_error(*posptr); 255 *new_type = ChertPostList::chunk_type(tmp); 256 244 257 LOGVALUE(DB, last_did_in_chunk); 245 258 RETURN(last_did_in_chunk); 246 259 } … … 260 273 Xapian::docid did; 261 274 Xapian::termcount wdf; 262 275 276 ChertPostList::chunk_type current_type; 277 263 278 public: 264 279 /** Initialise the postlist chunk reader. 265 280 * 266 281 * @param first_did First document id in this chunk. 267 282 * @param data The tag string with the header removed. 268 283 */ 269 PostlistChunkReader(Xapian::docid first_did, const string & data_) 270 : data(data_), pos(data.data()), end(pos + data.length()), at_end(data.empty()), did(first_did) 284 PostlistChunkReader(Xapian::docid first_did, const string & data_, 285 ChertPostList::chunk_type current_type_) 286 : data(data_), pos(data.data()), end(pos + data.length()), 287 at_end(data.empty()), did(first_did), current_type(current_type_) 271 288 { 272 289 if (!at_end) read_wdf(&pos, end, &wdf); 273 290 } … … 294 311 if (pos == end) { 295 312 at_end = true; 296 313 } else { 297 read_did_increase(&pos, end, &did); 314 switch (current_type) { 315 case ChertPostList::DENSE: 316 ++did; 317 break; 318 case ChertPostList::SPARSE: 319 read_did_increase(&pos, end, &did); 320 break; 321 } 298 322 read_wdf(&pos, end, &wdf); 299 323 } 300 324 } … … 306 330 : orig_key(orig_key_), 307 331 tname(tname_), is_first_chunk(is_first_chunk_), 308 332 is_last_chunk(is_last_chunk_), 309 started(false) 333 started(false), 334 current_type(ChertPostList::DENSE) 310 335 { 311 336 DEBUGCALL(DB, void, "PostlistChunkWriter::PostlistChunkWriter", 312 337 orig_key_ << ", " << is_first_chunk_ << ", " << tname_ << ", " << … … 314 339 } 315 340 316 341 void 342 PostlistChunkWriter::start_new_chunk(ChertTable * table, 343 Xapian::docid new_first_did) 344 { 345 bool save_is_last_chunk = is_last_chunk; 346 is_last_chunk = false; 347 flush(table); 348 is_last_chunk = 
save_is_last_chunk; 349 is_first_chunk = false; 350 first_did = new_first_did; 351 chunk.resize(0); 352 orig_key = ChertPostListTable::make_key(tname, first_did); 353 current_type = ChertPostList::DENSE; 354 } 355 356 void 317 357 PostlistChunkWriter::append(ChertTable * table, Xapian::docid did, 318 358 Xapian::termcount wdf) 319 359 { … … 324 364 Assert(did > current_did); 325 365 // Start a new chunk if this one has grown to the threshold. 326 366 if (chunk.size() >= CHUNKSIZE) { 327 bool save_is_last_chunk = is_last_chunk; 328 is_last_chunk = false; 329 flush(table); 330 is_last_chunk = save_is_last_chunk; 331 is_first_chunk = false; 332 first_did = did; 333 chunk.resize(0); 334 orig_key = ChertPostListTable::make_key(tname, first_did); 367 start_new_chunk(table, did); 368 goto append_packed_wdf; 335 369 } else { 336 chunk.append(pack_uint(did - current_did - 1)); 370 if (current_type == ChertPostList::DENSE) { 371 if (current_did + 1 != did) { 372 if (current_did == first_did) { 373 // Only have one entry - can just change type. 
374 current_type = ChertPostList::SPARSE; 375 } else { 376 start_new_chunk(table, did); 377 goto append_packed_wdf; 378 } 379 } 380 } 381 if (current_type == ChertPostList::SPARSE) { 382 chunk.append(pack_uint(did - current_did - 1)); 383 } 337 384 } 338 385 } 386 append_packed_wdf: 339 387 current_did = did; 340 388 chunk.append(pack_uint(wdf)); 341 389 } … … 355 403 static inline string 356 404 make_start_of_chunk(bool new_is_last_chunk, 357 405 Xapian::docid new_first_did, 358 Xapian::docid new_final_did) 406 Xapian::docid new_final_did, 407 ChertPostList::chunk_type new_type) 359 408 { 360 409 Assert(new_final_did >= new_first_did); 361 410 return pack_bool(new_is_last_chunk) + 362 pack_uint(new_final_did - new_first_did); 411 pack_uint(new_final_did - new_first_did) + 412 pack_uint(static_cast<unsigned>(new_type)); 363 413 } 364 414 365 415 static void … … 368 418 unsigned int end_of_chunk_header, 369 419 bool is_last_chunk, 370 420 Xapian::docid first_did_in_chunk, 371 Xapian::docid last_did_in_chunk) 421 Xapian::docid last_did_in_chunk, 422 ChertPostList::chunk_type new_type) 372 423 { 373 424 Assert((size_t)(end_of_chunk_header - start_of_chunk_header) <= chunk.size()); 374 425 375 426 chunk.replace(start_of_chunk_header, 376 427 end_of_chunk_header - start_of_chunk_header, 377 428 make_start_of_chunk(is_last_chunk, first_did_in_chunk, 378 last_did_in_chunk ));429 last_did_in_chunk, new_type)); 379 430 } 380 431 381 432 void … … 465 516 466 517 // Read the chunk header 467 518 bool new_is_last_chunk; 519 ChertPostList::chunk_type new_type; 468 520 Xapian::docid new_last_did_in_chunk = 469 521 read_start_of_chunk(&tagpos, tagend, new_first_did, 470 &new_is_last_chunk );522 &new_is_last_chunk, &new_type); 471 523 472 524 string chunk_data(tagpos, tagend); 473 525 … … 478 530 string tag; 479 531 tag = make_start_of_first_chunk(num_ent, coll_freq, new_first_did); 480 532 tag += make_start_of_chunk(new_is_last_chunk, 481 new_first_did, 482 
new_last_did_in_chunk); 533 new_first_did, 534 new_last_did_in_chunk, 535 new_type); 483 536 tag += chunk_data; 484 537 table->add(orig_key, tag); 485 538 return; … … 527 580 report_read_error(keypos); 528 581 } 529 582 bool wrong_is_last_chunk; 583 ChertPostList::chunk_type new_type; 530 584 string::size_type start_of_chunk_header = tagpos - tag.data(); 531 585 Xapian::docid last_did_in_chunk = 532 586 read_start_of_chunk(&tagpos, tagend, first_did_in_chunk, 533 &wrong_is_last_chunk );587 &wrong_is_last_chunk, &new_type); 534 588 string::size_type end_of_chunk_header = tagpos - tag.data(); 535 589 536 590 // write new is_last flag … … 539 593 end_of_chunk_header, 540 594 true, // is_last_chunk 541 595 first_did_in_chunk, 542 last_did_in_chunk); 596 last_did_in_chunk, 597 new_type); 543 598 table->add(cursor->current_key, tag); 544 599 } 545 600 } else { … … 576 631 577 632 tag = make_start_of_first_chunk(num_ent, coll_freq, first_did); 578 633 579 tag += make_start_of_chunk(is_last_chunk, first_did, current_did );634 tag += make_start_of_chunk(is_last_chunk, first_did, current_did, current_type); 580 635 tag += chunk; 581 636 table->add(key, tag); 582 637 return; … … 616 671 } 617 672 618 673 // ...and write the start of this chunk. 
619 tag = make_start_of_chunk(is_last_chunk, first_did, current_did );674 tag = make_start_of_chunk(is_last_chunk, first_did, current_did, current_type); 620 675 621 676 tag += chunk; 622 677 table->add(new_key, tag); … … 687 742 did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL); 688 743 first_did_in_chunk = did; 689 744 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, 690 &is_last_chunk );745 &is_last_chunk, &current_type); 691 746 read_wdf(&pos, end, &wdf); 692 747 LOGLINE(DB, "Initial docid " << did); 693 748 } … … 712 767 DEBUGCALL(DB, bool, "ChertPostList::next_in_chunk", ""); 713 768 if (pos == end) RETURN(false); 714 769 770 switch (current_type) { 771 case ChertPostList::DENSE: 772 ++did; 773 break; 774 case ChertPostList::SPARSE: 775 read_did_increase(&pos, end, &did); 776 break; 777 } 778 read_wdf(&pos, end, &wdf); 779 780 // Either not at last doc in chunk, or pos == end, but not both. 781 Assert(did <= last_did_in_chunk); 782 Assert(did < last_did_in_chunk || pos == end); 783 Assert(pos != end || did == last_did_in_chunk); 784 785 RETURN(true); 786 } 787 788 bool 789 ChertPostList::next_in_chunk_dense() 790 { 791 DEBUGCALL(DB, bool, "ChertPostList::next_in_chunk_dense", ""); 792 if (pos == end) RETURN(false); 793 Assert(current_type == ChertPostList::DENSE); 794 795 ++did; 796 read_wdf(&pos, end, &wdf); 797 798 // Either not at last doc in chunk, or pos == end, but not both. 
799 Assert(did <= last_did_in_chunk); 800 Assert(did < last_did_in_chunk || pos == end); 801 Assert(pos != end || did == last_did_in_chunk); 802 803 RETURN(true); 804 } 805 806 bool 807 ChertPostList::next_in_chunk_sparse() 808 { 809 DEBUGCALL(DB, bool, "ChertPostList::next_in_chunk_sparse", ""); 810 if (pos == end) RETURN(false); 811 Assert(current_type == ChertPostList::SPARSE); 812 715 813 read_did_increase(&pos, end, &did); 716 814 read_wdf(&pos, end, &wdf); 717 815 … … 765 863 766 864 first_did_in_chunk = did; 767 865 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, 768 &is_last_chunk );866 &is_last_chunk, &current_type); 769 867 read_wdf(&pos, end, &wdf); 770 868 } 771 869 … … 859 957 860 958 first_did_in_chunk = did; 861 959 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, 862 &is_last_chunk );960 &is_last_chunk, &current_type); 863 961 read_wdf(&pos, end, &wdf); 864 962 865 963 // Possible, since desired_did might be after end of this chunk and before … … 872 970 { 873 971 DEBUGCALL(DB, bool, 874 972 "ChertPostList::move_forward_in_chunk_to_at_least", desired_did); 973 Assert(did <= desired_did); 875 974 if (desired_did > last_did_in_chunk) { 876 975 pos = end; 877 976 RETURN(false); 878 977 } 879 while (did < desired_did) { 880 bool at_end_of_chunk = !next_in_chunk(); 881 // If we hit the end of the chunk then last_did_in_chunk must be 882 // wrong. 
883 Assert(!at_end_of_chunk); 884 if (at_end_of_chunk) RETURN(false); 978 switch (current_type) { 979 case ChertPostList::DENSE: { 980 unsigned count = desired_did - did; 981 if (count) { 982 if (count > 1) { 983 count -= 1; 984 while (pos != end) { 985 om_byte part = static_cast<om_byte>(*pos); 986 ++pos; 987 if ((part & 0x80) == 0) { 988 if ((--count) == 0) 989 break; 990 } 991 } 992 if (pos == end) 993 throw Xapian::DatabaseCorruptError("Insufficient entries in posting list chunk"); 994 } 995 read_wdf(&pos, end, &wdf); 996 did = desired_did; 997 } 998 break; 999 } 1000 case ChertPostList::SPARSE: 1001 while (did < desired_did) { 1002 bool at_end_of_chunk = !next_in_chunk_sparse(); 1003 // If we hit the end of the chunk then last_did_in_chunk must be 1004 // wrong. 1005 Assert(!at_end_of_chunk); 1006 if (at_end_of_chunk) RETURN(false); 1007 } 1008 break; 885 1009 } 886 1010 RETURN(true); 887 1011 } … … 1004 1128 } 1005 1129 1006 1130 bool is_last_chunk; 1131 ChertPostList::chunk_type current_type; 1007 1132 Xapian::docid last_did_in_chunk; 1008 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk );1133 last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk, &current_type); 1009 1134 *to = new PostlistChunkWriter(cursor->current_key, is_first_chunk, tname, 1010 1135 is_last_chunk); 1011 1136 if (did > last_did_in_chunk) { … … 1014 1139 // (FIXME) 1015 1140 *from = NULL; 1016 1141 (*to)->raw_append(first_did_in_chunk, last_did_in_chunk, 1017 string(pos, end) );1142 string(pos, end), current_type); 1018 1143 } else { 1019 *from = new PostlistChunkReader(first_did_in_chunk, string(pos, end)); 1144 *from = new PostlistChunkReader(first_did_in_chunk, string(pos, end), 1145 current_type); 1020 1146 } 1021 1147 if (is_last_chunk) RETURN(Xapian::docid(-1)); 1022 1148 … … 1054 1180 if (!key_exists(current_key)) { 1055 1181 LOGLINE(DB, "Adding dummy first chunk"); 1056 1182 string newtag = 
make_start_of_first_chunk(0, 0, 0); 1057 newtag += make_start_of_chunk(true, 0, 0 );1183 newtag += make_start_of_chunk(true, 0, 0, ChertPostList::DENSE); 1058 1184 add(current_key, newtag); 1059 1185 } 1060 1186 … … 1126 1252 Xapian::termcount termfreq, collfreq; 1127 1253 Xapian::docid firstdid, lastdid; 1128 1254 bool islast; 1255 ChertPostList::chunk_type current_type; 1129 1256 if (pos == end) { 1130 1257 termfreq = 0; 1131 1258 collfreq = 0; … … 1136 1263 firstdid = read_start_of_first_chunk(&pos, end, 1137 1264 &termfreq, &collfreq); 1138 1265 // Handle the generic start of chunk header. 1139 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast );1266 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast, &current_type); 1140 1267 } 1141 1268 1142 1269 termfreq += deltas->second.first; … … 1163 1290 1164 1291 // Rewrite start of first chunk to update termfreq and collfreq. 1165 1292 string newhdr = make_start_of_first_chunk(termfreq, collfreq, firstdid); 1166 newhdr += make_start_of_chunk(islast, firstdid, lastdid );1293 newhdr += make_start_of_chunk(islast, firstdid, lastdid, current_type); 1167 1294 if (pos == end) { 1168 1295 add(current_key, newhdr); 1169 1296 } else {