| | 234 | |
/// A structure holding the result of performing a compaction.
///
/// Sizes are stored in units of 1024 bytes (computed as `st_size / 1024`).
struct CompactionResult {
    /// Sometimes stat can fail for benign reasons (e.g. >= 2GB file on certain
    /// systems).  This keeps track of whether this has happened.
    bool bad_stat;

    /// Size of the input DB file (in Kb).
    off_t in_size;

    /// Size of the output DB file (in Kb).
    off_t out_size;

    /// Start with no stat failure recorded and both sizes zero.
    CompactionResult()
        : bad_stat(false), in_size(0), out_size(0)
    {}

    /** Calculate (by calling stat) the size of the output.
     *
     *  Does nothing if a stat failure has already been recorded, since the
     *  sizes can't be meaningfully compared in that case.
     *
     *  @param dest  Path prefix of the output table; "DB" is appended to it
     *               to form the name of the file which is stat-ed.
     *
     *  On success out_size is set to the file size in Kb.  On failure,
     *  bad_stat is set unless the file simply doesn't exist (ENOENT).
     */
    void read_out_size(const std::string & dest)
    {
        if (bad_stat) return;

        struct stat sb;
        // Call stat() on a C string so this doesn't depend on any
        // std::string overload of stat being in scope.
        if (stat((dest + "DB").c_str(), &sb) == 0) {
            out_size = sb.st_size / 1024;
        } else {
            bad_stat = (errno != ENOENT);
        }
    }

    /** Get the percentage decrease in the size.
     *
     *  @return 100 * (in_size - out_size) / in_size, which is negative if
     *          the output is larger than the input.  If in_size is zero the
     *          percentage is undefined, so 0 is returned rather than
     *          dividing by zero.
     */
    double percent_decrease() const
    {
        if (in_size == 0) return 0.0;
        return 100 * double(in_size - out_size) / in_size;
    }
};
| | 270 | |
// Per-table settings: the table's base name plus the options used when
// opening it as a FlintTable.
struct table_list {
    // The "base name" of the table.  This is appended to each source path,
    // followed by '.', to form the table's path prefix.
    const char * name;
    // zlib compression strategy to use on tags.  Passed to the FlintTable
    // constructor when opening an input table.
    int compress_strategy;
    // Create tables after position lazily.  Passed to the FlintTable
    // constructor when opening an input table.
    bool lazy;
};
| | 279 | |
// Settings for each known table, one entry per table base name.
// NOTE(review): presumably the caller iterates over this array and hands
// each entry to compact_postlists() or compact_table() - confirm against
// the driver code.
static const table_list tables[] = {
    // name compress_strategy lazy
    { "postlist", DONT_COMPRESS, false },
    { "record", Z_DEFAULT_STRATEGY, false },
    { "termlist", Z_DEFAULT_STRATEGY, false },
    // The entries from "position" onwards are all flagged as lazy.
    { "position", DONT_COMPRESS, true },
    { "value", DONT_COMPRESS, true },
    { "spelling", Z_DEFAULT_STRATEGY, true },
    { "synonyms", Z_DEFAULT_STRATEGY, true }
};
| | 290 | |
| | 291 | static CompactionResult |
| | 292 | compact_postlists(FlintTable * out, vector<string> sources, |
| | 293 | const table_list * table, |
| | 294 | const vector<Xapian::docid> & offset, |
| | 295 | bool multipass, |
| | 296 | string destdir, |
| | 297 | size_t block_size, |
| | 298 | Xapian::docid tot_off) |
| | 299 | { |
| | 300 | CompactionResult result; |
| | 301 | vector<string> tmp; |
| | 302 | tmp.reserve(sources.size()); |
| | 303 | for (vector<string>::const_iterator src = sources.begin(); |
| | 304 | src != sources.end(); ++src) { |
| | 305 | string s(*src); |
| | 306 | s += table->name; |
| | 307 | s += '.'; |
| | 308 | tmp.push_back(s); |
| | 309 | |
| | 310 | struct stat sb; |
| | 311 | if (stat(s + "DB", &sb) == 0) { |
| | 312 | result.in_size += sb.st_size / 1024; |
| | 313 | } else { |
| | 314 | result.bad_stat = (errno != ENOENT); |
| | 315 | } |
| | 316 | } |
| | 317 | vector<Xapian::docid> off(offset); |
| | 318 | unsigned int c = 0; |
| | 319 | while (multipass && tmp.size() > 3) { |
| | 320 | vector<string> tmpout; |
| | 321 | tmpout.reserve(tmp.size() / 2); |
| | 322 | vector<Xapian::docid> newoff; |
| | 323 | newoff.resize(tmp.size() / 2); |
| | 324 | for (unsigned int i = 0, j; i < tmp.size(); i = j) { |
| | 325 | j = i + 2; |
| | 326 | if (j == tmp.size() - 1) ++j; |
| | 327 | |
| | 328 | string dest = destdir; |
| | 329 | char buf[64]; |
| | 330 | sprintf(buf, "/tmp%u_%u.", c, i / 2); |
| | 331 | dest += buf; |
| | 332 | |
| | 333 | // Don't compress temporary tables, even if the |
| | 334 | // final table would be. |
| | 335 | FlintTable tmptab(dest, false); |
| | 336 | tmptab.create_and_open(block_size); |
| | 337 | |
| | 338 | merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0); |
| | 339 | if (c > 0) { |
| | 340 | for (unsigned int k = i; k < j; ++k) { |
| | 341 | unlink((tmp[k] + "DB").c_str()); |
| | 342 | unlink((tmp[k] + "baseA").c_str()); |
| | 343 | unlink((tmp[k] + "baseB").c_str()); |
| | 344 | } |
| | 345 | } |
| | 346 | tmpout.push_back(dest); |
| | 347 | tmptab.commit(1); |
| | 348 | } |
| | 349 | swap(tmp, tmpout); |
| | 350 | swap(off, newoff); |
| | 351 | ++c; |
| | 352 | } |
| | 353 | merge_postlists(out, off.begin(), tmp.begin(), tmp.end(), tot_off); |
| | 354 | if (c > 0) { |
| | 355 | for (size_t k = 0; k < tmp.size(); ++k) { |
| | 356 | unlink((tmp[k] + "DB").c_str()); |
| | 357 | unlink((tmp[k] + "baseA").c_str()); |
| | 358 | unlink((tmp[k] + "baseB").c_str()); |
| | 359 | } |
| | 360 | } |
| | 361 | return result; |
| | 362 | } |
| | 363 | |
| | 364 | static CompactionResult |
| | 365 | compact_table(FlintTable * out, vector<string> sources, |
| | 366 | const table_list * table, |
| | 367 | const vector<Xapian::docid> & offset) |
| | 368 | { |
| | 369 | CompactionResult result; |
| | 370 | // Position, Record, Termlist, Value |
| | 371 | bool is_position_table = strcmp(table->name, "position") == 0; |
| | 372 | for (size_t i = 0; i < sources.size(); ++i) { |
| | 373 | Xapian::docid off = offset[i]; |
| | 374 | string src(sources[i]); |
| | 375 | src += table->name; |
| | 376 | src += '.'; |
| | 377 | |
| | 378 | struct stat sb; |
| | 379 | if (stat(src + "DB", &sb) == 0) { |
| | 380 | if (sb.st_size == 0) continue; |
| | 381 | result.in_size += sb.st_size / 1024; |
| | 382 | } else { |
| | 383 | result.bad_stat = (errno != ENOENT); |
| | 384 | } |
| | 385 | |
| | 386 | FlintTable in(src, true, table->compress_strategy, table->lazy); |
| | 387 | in.open(); |
| | 388 | if (in.get_entry_count() == 0) continue; |
| | 389 | |
| | 390 | FlintCursor cur(&in); |
| | 391 | cur.find_entry(""); |
| | 392 | |
| | 393 | string key; |
| | 394 | while (cur.next()) { |
| | 395 | // Adjust the key if this isn't the first database. |
| | 396 | if (off) { |
| | 397 | Xapian::docid did; |
| | 398 | const char * d = cur.current_key.data(); |
| | 399 | const char * e = d + cur.current_key.size(); |
| | 400 | if (!unpack_uint_preserving_sort(&d, e, &did)) { |
| | 401 | string msg = "Bad "; |
| | 402 | msg += table->name; |
| | 403 | msg += " key"; |
| | 404 | throw Xapian::DatabaseCorruptError(msg); |
| | 405 | } |
| | 406 | did += off; |
| | 407 | key = pack_uint_preserving_sort(did); |
| | 408 | if (is_position_table) { |
| | 409 | // Copy over the termname too. |
| | 410 | size_t tnameidx = d - cur.current_key.data(); |
| | 411 | key += cur.current_key.substr(tnameidx); |
| | 412 | } else if (d != e) { |
| | 413 | string msg = "Bad "; |
| | 414 | msg += table->name; |
| | 415 | msg += " key"; |
| | 416 | throw Xapian::DatabaseCorruptError(msg); |
| | 417 | } |
| | 418 | } else { |
| | 419 | key = cur.current_key; |
| | 420 | } |
| | 421 | bool compressed = cur.read_tag(true); |
| | 422 | out->add(key, cur.current_tag, compressed); |
| | 423 | } |
| | 424 | } |
| | 425 | return result; |
| | 426 | } |
| | 427 | |
| 415 | | vector<string> tmp; |
| 416 | | tmp.reserve(sources.size()); |
| 417 | | for (vector<string>::const_iterator src = sources.begin(); |
| 418 | | src != sources.end(); ++src) { |
| 419 | | string s(*src); |
| 420 | | s += t->name; |
| 421 | | s += '.'; |
| 422 | | tmp.push_back(s); |
| 423 | | |
| 424 | | struct stat sb; |
| 425 | | if (stat(s + "DB", &sb) == 0) { |
| 426 | | in_size += sb.st_size / 1024; |
| 427 | | } else { |
| 428 | | bad_stat = (errno != ENOENT); |
| 429 | | } |
| 430 | | } |
| 431 | | vector<Xapian::docid> off(offset); |
| 432 | | unsigned int c = 0; |
| 433 | | while (multipass && tmp.size() > 3) { |
| 434 | | vector<string> tmpout; |
| 435 | | tmpout.reserve(tmp.size() / 2); |
| 436 | | vector<Xapian::docid> newoff; |
| 437 | | newoff.resize(tmp.size() / 2); |
| 438 | | for (unsigned int i = 0, j; i < tmp.size(); i = j) { |
| 439 | | j = i + 2; |
| 440 | | if (j == tmp.size() - 1) ++j; |
| 441 | | |
| 442 | | string dest = destdir; |
| 443 | | char buf[64]; |
| 444 | | sprintf(buf, "/tmp%u_%u.", c, i / 2); |
| 445 | | dest += buf; |
| 446 | | |
| 447 | | // Don't compress temporary tables, even if the |
| 448 | | // final table would be. |
| 449 | | FlintTable tmptab(dest, false); |
| 450 | | tmptab.create_and_open(block_size); |
| 451 | | |
| 452 | | merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0); |
| 453 | | if (c > 0) { |
| 454 | | for (unsigned int k = i; k < j; ++k) { |
| 455 | | unlink((tmp[k] + "DB").c_str()); |
| 456 | | unlink((tmp[k] + "baseA").c_str()); |
| 457 | | unlink((tmp[k] + "baseB").c_str()); |
| 458 | | } |
| 459 | | } |
| 460 | | tmpout.push_back(dest); |
| 461 | | tmptab.commit(1); |
| 462 | | } |
| 463 | | swap(tmp, tmpout); |
| 464 | | swap(off, newoff); |
| 465 | | ++c; |
| 466 | | } |
| 467 | | merge_postlists(&out, off.begin(), tmp.begin(), tmp.end(), tot_off); |
| 468 | | if (c > 0) { |
| 469 | | for (size_t k = 0; k < tmp.size(); ++k) { |
| 470 | | unlink((tmp[k] + "DB").c_str()); |
| 471 | | unlink((tmp[k] + "baseA").c_str()); |
| 472 | | unlink((tmp[k] + "baseB").c_str()); |
| 473 | | } |
| 474 | | } |
| | 586 | result = compact_postlists(&out, sources, t, offset, multipass, destdir, block_size, tot_off); |
| 476 | | // Position, Record, Termlist, Value |
| 477 | | bool is_position_table = strcmp(t->name, "position") == 0; |
| 478 | | for (size_t i = 0; i < sources.size(); ++i) { |
| 479 | | Xapian::docid off = offset[i]; |
| 480 | | string src(sources[i]); |
| 481 | | src += t->name; |
| 482 | | src += '.'; |
| 483 | | |
| 484 | | struct stat sb; |
| 485 | | if (stat(src + "DB", &sb) == 0) { |
| 486 | | if (sb.st_size == 0) continue; |
| 487 | | in_size += sb.st_size / 1024; |
| 488 | | } else { |
| 489 | | bad_stat = (errno != ENOENT); |
| 490 | | } |
| 491 | | |
| 492 | | FlintTable in(src, true, t->compress_strategy, t->lazy); |
| 493 | | in.open(); |
| 494 | | if (in.get_entry_count() == 0) continue; |
| 495 | | |
| 496 | | FlintCursor cur(&in); |
| 497 | | cur.find_entry(""); |
| 498 | | |
| 499 | | string key; |
| 500 | | while (cur.next()) { |
| 501 | | // Adjust the key if this isn't the first database. |
| 502 | | if (off) { |
| 503 | | Xapian::docid did; |
| 504 | | const char * d = cur.current_key.data(); |
| 505 | | const char * e = d + cur.current_key.size(); |
| 506 | | if (!unpack_uint_preserving_sort(&d, e, &did)) { |
| 507 | | string msg = "Bad "; |
| 508 | | msg += t->name; |
| 509 | | msg += " key"; |
| 510 | | throw Xapian::DatabaseCorruptError(msg); |
| 511 | | } |
| 512 | | did += off; |
| 513 | | key = pack_uint_preserving_sort(did); |
| 514 | | if (is_position_table) { |
| 515 | | // Copy over the termname too. |
| 516 | | size_t tnameidx = d - cur.current_key.data(); |
| 517 | | key += cur.current_key.substr(tnameidx); |
| 518 | | } else if (d != e) { |
| 519 | | string msg = "Bad "; |
| 520 | | msg += t->name; |
| 521 | | msg += " key"; |
| 522 | | throw Xapian::DatabaseCorruptError(msg); |
| 523 | | } |
| 524 | | } else { |
| 525 | | key = cur.current_key; |
| 526 | | } |
| 527 | | bool compressed = cur.read_tag(true); |
| 528 | | out.add(key, cur.current_tag, compressed); |
| 529 | | } |
| 530 | | } |
| | 588 | result = compact_table(&out, sources, t, offset); |