| 234 | |
/// A structure holding the result of performing a compaction.
struct CompactionResult {
    /// Sometimes stat can fail for benign reasons (e.g. >= 2GB file on certain
    /// systems).  This keeps track of whether this has happened.
    bool bad_stat;

    /// Size of the input DB file (in Kb).
    off_t in_size;

    /// Size of the output DB file (in Kb).
    off_t out_size;

    /// Start with no stat failure recorded and both sizes zero.
    CompactionResult()
        : bad_stat(false), in_size(0), out_size(0)
    {}

    /** Calculate (by calling stat) the size of the output.
     *
     *  Does nothing if a stat failure has already been recorded.  Otherwise
     *  stats the file named by @a dest with "DB" appended and stores its
     *  size in Kb in out_size.  If stat fails with anything other than
     *  ENOENT, bad_stat is set; on ENOENT out_size is left unchanged.
     *
     *  @param dest  Path prefix of the output table ("DB" is appended).
     */
    void read_out_size(const std::string & dest)
    {
        if (bad_stat) return;

        struct stat sb;
        if (stat((dest + "DB").c_str(), &sb) == 0) {
            out_size = sb.st_size / 1024;
        } else if (errno != ENOENT) {
            bad_stat = true;
        }
    }

    /** Get the percentage decrease in the size.
     *
     *  @return  100 * (in_size - out_size) / in_size, or 0 if in_size is
     *           zero (there is nothing to compare against, and dividing
     *           would produce NaN or infinity).
     */
    double percent_decrease() const
    {
        if (in_size == 0) return 0.0;
        return 100 * double(in_size - out_size) / double(in_size);
    }
};
| 270 | |
/// Per-table settings: one entry describes one table.
struct table_list {
    /// The "base name" of the table.
    const char * name;

    /// zlib compression strategy to use on tags.
    int compress_strategy;

    /// Create tables after position lazily.
    bool lazy;
};
| 279 | |
| 280 | static const table_list tables[] = { |
| 281 | // name compress_strategy lazy |
| 282 | { "postlist", DONT_COMPRESS, false }, |
| 283 | { "record", Z_DEFAULT_STRATEGY, false }, |
| 284 | { "termlist", Z_DEFAULT_STRATEGY, false }, |
| 285 | { "position", DONT_COMPRESS, true }, |
| 286 | { "value", DONT_COMPRESS, true }, |
| 287 | { "spelling", Z_DEFAULT_STRATEGY, true }, |
| 288 | { "synonyms", Z_DEFAULT_STRATEGY, true } |
| 289 | }; |
| 290 | |
| 291 | static CompactionResult |
| 292 | compact_postlists(FlintTable * out, vector<string> sources, |
| 293 | const table_list * table, |
| 294 | const vector<Xapian::docid> & offset, |
| 295 | bool multipass, |
| 296 | string destdir, |
| 297 | size_t block_size, |
| 298 | Xapian::docid tot_off) |
| 299 | { |
| 300 | CompactionResult result; |
| 301 | vector<string> tmp; |
| 302 | tmp.reserve(sources.size()); |
| 303 | for (vector<string>::const_iterator src = sources.begin(); |
| 304 | src != sources.end(); ++src) { |
| 305 | string s(*src); |
| 306 | s += table->name; |
| 307 | s += '.'; |
| 308 | tmp.push_back(s); |
| 309 | |
| 310 | struct stat sb; |
| 311 | if (stat(s + "DB", &sb) == 0) { |
| 312 | result.in_size += sb.st_size / 1024; |
| 313 | } else { |
| 314 | result.bad_stat = (errno != ENOENT); |
| 315 | } |
| 316 | } |
| 317 | vector<Xapian::docid> off(offset); |
| 318 | unsigned int c = 0; |
| 319 | while (multipass && tmp.size() > 3) { |
| 320 | vector<string> tmpout; |
| 321 | tmpout.reserve(tmp.size() / 2); |
| 322 | vector<Xapian::docid> newoff; |
| 323 | newoff.resize(tmp.size() / 2); |
| 324 | for (unsigned int i = 0, j; i < tmp.size(); i = j) { |
| 325 | j = i + 2; |
| 326 | if (j == tmp.size() - 1) ++j; |
| 327 | |
| 328 | string dest = destdir; |
| 329 | char buf[64]; |
| 330 | sprintf(buf, "/tmp%u_%u.", c, i / 2); |
| 331 | dest += buf; |
| 332 | |
| 333 | // Don't compress temporary tables, even if the |
| 334 | // final table would be. |
| 335 | FlintTable tmptab(dest, false); |
| 336 | tmptab.create_and_open(block_size); |
| 337 | |
| 338 | merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0); |
| 339 | if (c > 0) { |
| 340 | for (unsigned int k = i; k < j; ++k) { |
| 341 | unlink((tmp[k] + "DB").c_str()); |
| 342 | unlink((tmp[k] + "baseA").c_str()); |
| 343 | unlink((tmp[k] + "baseB").c_str()); |
| 344 | } |
| 345 | } |
| 346 | tmpout.push_back(dest); |
| 347 | tmptab.commit(1); |
| 348 | } |
| 349 | swap(tmp, tmpout); |
| 350 | swap(off, newoff); |
| 351 | ++c; |
| 352 | } |
| 353 | merge_postlists(out, off.begin(), tmp.begin(), tmp.end(), tot_off); |
| 354 | if (c > 0) { |
| 355 | for (size_t k = 0; k < tmp.size(); ++k) { |
| 356 | unlink((tmp[k] + "DB").c_str()); |
| 357 | unlink((tmp[k] + "baseA").c_str()); |
| 358 | unlink((tmp[k] + "baseB").c_str()); |
| 359 | } |
| 360 | } |
| 361 | return result; |
| 362 | } |
| 363 | |
| 364 | static CompactionResult |
| 365 | compact_table(FlintTable * out, vector<string> sources, |
| 366 | const table_list * table, |
| 367 | const vector<Xapian::docid> & offset) |
| 368 | { |
| 369 | CompactionResult result; |
| 370 | // Position, Record, Termlist, Value |
| 371 | bool is_position_table = strcmp(table->name, "position") == 0; |
| 372 | for (size_t i = 0; i < sources.size(); ++i) { |
| 373 | Xapian::docid off = offset[i]; |
| 374 | string src(sources[i]); |
| 375 | src += table->name; |
| 376 | src += '.'; |
| 377 | |
| 378 | struct stat sb; |
| 379 | if (stat(src + "DB", &sb) == 0) { |
| 380 | if (sb.st_size == 0) continue; |
| 381 | result.in_size += sb.st_size / 1024; |
| 382 | } else { |
| 383 | result.bad_stat = (errno != ENOENT); |
| 384 | } |
| 385 | |
| 386 | FlintTable in(src, true, table->compress_strategy, table->lazy); |
| 387 | in.open(); |
| 388 | if (in.get_entry_count() == 0) continue; |
| 389 | |
| 390 | FlintCursor cur(&in); |
| 391 | cur.find_entry(""); |
| 392 | |
| 393 | string key; |
| 394 | while (cur.next()) { |
| 395 | // Adjust the key if this isn't the first database. |
| 396 | if (off) { |
| 397 | Xapian::docid did; |
| 398 | const char * d = cur.current_key.data(); |
| 399 | const char * e = d + cur.current_key.size(); |
| 400 | if (!unpack_uint_preserving_sort(&d, e, &did)) { |
| 401 | string msg = "Bad "; |
| 402 | msg += table->name; |
| 403 | msg += " key"; |
| 404 | throw Xapian::DatabaseCorruptError(msg); |
| 405 | } |
| 406 | did += off; |
| 407 | key = pack_uint_preserving_sort(did); |
| 408 | if (is_position_table) { |
| 409 | // Copy over the termname too. |
| 410 | size_t tnameidx = d - cur.current_key.data(); |
| 411 | key += cur.current_key.substr(tnameidx); |
| 412 | } else if (d != e) { |
| 413 | string msg = "Bad "; |
| 414 | msg += table->name; |
| 415 | msg += " key"; |
| 416 | throw Xapian::DatabaseCorruptError(msg); |
| 417 | } |
| 418 | } else { |
| 419 | key = cur.current_key; |
| 420 | } |
| 421 | bool compressed = cur.read_tag(true); |
| 422 | out->add(key, cur.current_tag, compressed); |
| 423 | } |
| 424 | } |
| 425 | return result; |
| 426 | } |
| 427 | |
415 | | vector<string> tmp; |
416 | | tmp.reserve(sources.size()); |
417 | | for (vector<string>::const_iterator src = sources.begin(); |
418 | | src != sources.end(); ++src) { |
419 | | string s(*src); |
420 | | s += t->name; |
421 | | s += '.'; |
422 | | tmp.push_back(s); |
423 | | |
424 | | struct stat sb; |
425 | | if (stat(s + "DB", &sb) == 0) { |
426 | | in_size += sb.st_size / 1024; |
427 | | } else { |
428 | | bad_stat = (errno != ENOENT); |
429 | | } |
430 | | } |
431 | | vector<Xapian::docid> off(offset); |
432 | | unsigned int c = 0; |
433 | | while (multipass && tmp.size() > 3) { |
434 | | vector<string> tmpout; |
435 | | tmpout.reserve(tmp.size() / 2); |
436 | | vector<Xapian::docid> newoff; |
437 | | newoff.resize(tmp.size() / 2); |
438 | | for (unsigned int i = 0, j; i < tmp.size(); i = j) { |
439 | | j = i + 2; |
440 | | if (j == tmp.size() - 1) ++j; |
441 | | |
442 | | string dest = destdir; |
443 | | char buf[64]; |
444 | | sprintf(buf, "/tmp%u_%u.", c, i / 2); |
445 | | dest += buf; |
446 | | |
447 | | // Don't compress temporary tables, even if the |
448 | | // final table would be. |
449 | | FlintTable tmptab(dest, false); |
450 | | tmptab.create_and_open(block_size); |
451 | | |
452 | | merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0); |
453 | | if (c > 0) { |
454 | | for (unsigned int k = i; k < j; ++k) { |
455 | | unlink((tmp[k] + "DB").c_str()); |
456 | | unlink((tmp[k] + "baseA").c_str()); |
457 | | unlink((tmp[k] + "baseB").c_str()); |
458 | | } |
459 | | } |
460 | | tmpout.push_back(dest); |
461 | | tmptab.commit(1); |
462 | | } |
463 | | swap(tmp, tmpout); |
464 | | swap(off, newoff); |
465 | | ++c; |
466 | | } |
467 | | merge_postlists(&out, off.begin(), tmp.begin(), tmp.end(), tot_off); |
468 | | if (c > 0) { |
469 | | for (size_t k = 0; k < tmp.size(); ++k) { |
470 | | unlink((tmp[k] + "DB").c_str()); |
471 | | unlink((tmp[k] + "baseA").c_str()); |
472 | | unlink((tmp[k] + "baseB").c_str()); |
473 | | } |
474 | | } |
| 586 | result = compact_postlists(&out, sources, t, offset, multipass, destdir, block_size, tot_off); |
476 | | // Position, Record, Termlist, Value |
477 | | bool is_position_table = strcmp(t->name, "position") == 0; |
478 | | for (size_t i = 0; i < sources.size(); ++i) { |
479 | | Xapian::docid off = offset[i]; |
480 | | string src(sources[i]); |
481 | | src += t->name; |
482 | | src += '.'; |
483 | | |
484 | | struct stat sb; |
485 | | if (stat(src + "DB", &sb) == 0) { |
486 | | if (sb.st_size == 0) continue; |
487 | | in_size += sb.st_size / 1024; |
488 | | } else { |
489 | | bad_stat = (errno != ENOENT); |
490 | | } |
491 | | |
492 | | FlintTable in(src, true, t->compress_strategy, t->lazy); |
493 | | in.open(); |
494 | | if (in.get_entry_count() == 0) continue; |
495 | | |
496 | | FlintCursor cur(&in); |
497 | | cur.find_entry(""); |
498 | | |
499 | | string key; |
500 | | while (cur.next()) { |
501 | | // Adjust the key if this isn't the first database. |
502 | | if (off) { |
503 | | Xapian::docid did; |
504 | | const char * d = cur.current_key.data(); |
505 | | const char * e = d + cur.current_key.size(); |
506 | | if (!unpack_uint_preserving_sort(&d, e, &did)) { |
507 | | string msg = "Bad "; |
508 | | msg += t->name; |
509 | | msg += " key"; |
510 | | throw Xapian::DatabaseCorruptError(msg); |
511 | | } |
512 | | did += off; |
513 | | key = pack_uint_preserving_sort(did); |
514 | | if (is_position_table) { |
515 | | // Copy over the termname too. |
516 | | size_t tnameidx = d - cur.current_key.data(); |
517 | | key += cur.current_key.substr(tnameidx); |
518 | | } else if (d != e) { |
519 | | string msg = "Bad "; |
520 | | msg += t->name; |
521 | | msg += " key"; |
522 | | throw Xapian::DatabaseCorruptError(msg); |
523 | | } |
524 | | } else { |
525 | | key = cur.current_key; |
526 | | } |
527 | | bool compressed = cur.read_tag(true); |
528 | | out.add(key, cur.current_tag, compressed); |
529 | | } |
530 | | } |
| 588 | result = compact_table(&out, sources, t, offset); |