diff -uNrEb xapian-core-1.0.5/api/omdatabase.cc xapian-core-1.0.5-philip/api/omdatabase.cc
|
old
|
new
|
|
| 43 | 43 | #include "noreturn.h" |
| 44 | 44 | |
| 45 | 45 | #include <stdlib.h> // For abs(). |
| | 46 | #include <math.h> // For pow(). |
| 46 | 47 | |
| 47 | 48 | #include <vector> |
| 48 | 49 | |
| … |
… |
|
| 384 | 385 | string result; |
| 385 | 386 | int edist_best = max_edit_distance; |
| 386 | 387 | Xapian::doccount freq_best = 0; |
| | 388 | Xapian::doccount freq_best_edist = 0; |
| 387 | 389 | while (true) { |
| 388 | 390 | TermList *ret = merger->next(); |
| 389 | 391 | if (ret) merger = ret; |
| … |
… |
|
| 403 | 405 | |
| 404 | 406 | // First check the length of the encoded UTF-8 version of term. |
| 405 | 407 | // Each UTF-32 character is 1-4 bytes in UTF-8. |
| 406 | | if (abs((long)term.size() - (long)word.size()) > edist_best * 4) { |
| | 408 | if (abs((long)term.size() - (long)word.size()) > max_edit_distance* 4) { |
| 407 | 409 | DEBUGLINE(SPELLING, "Lengths much too different"); |
| 408 | 410 | continue; |
| 409 | 411 | } |
| … |
… |
|
| 413 | 415 | utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); |
| 414 | 416 | |
| 415 | 417 | if (abs((long)utf32_term.size() - (long)utf32_word.size()) |
| 416 | | > edist_best) { |
| | 418 | > max_edit_distance) { |
| 417 | 419 | DEBUGLINE(SPELLING, "Lengths too different"); |
| 418 | 420 | continue; |
| 419 | 421 | } |
| 420 | 422 | |
| | 423 | if ((long)utf32_term.size() <= TRIGRAM_SCORE_THRESHOLD + 1) { |
| | 424 | DEBUGLINE(SPELLING, "Term too short"); |
| | 425 | continue; |
| | 426 | } |
| | 427 | |
| 421 | 428 | int edist = edit_distance_unsigned(&utf32_term[0], |
| 422 | 429 | utf32_term.size(), |
| 423 | 430 | &utf32_word[0], |
| 424 | 431 | utf32_word.size()); |
| 425 | 432 | DEBUGLINE(SPELLING, "Edit distance " << edist); |
| 426 | | // If we have an exact match, return an empty string since there's |
| 427 | | // no correction required. |
| 428 | | if (edist == 0) return string(); |
| 429 | 433 | |
| 430 | | if (edist <= edist_best) { |
| 431 | 434 | Xapian::doccount freq = 0; |
| 432 | 435 | for (size_t j = 0; j < internal.size(); ++j) |
| 433 | 436 | freq += internal[j]->get_spelling_frequency(term); |
| 434 | 437 | |
| 435 | 438 | DEBUGLINE(SPELLING, "Freq " << freq << " best " << freq_best); |
| 436 | | if (edist < edist_best || freq > freq_best) { |
| | 439 | if (edist <= edist_best && (freq > (max_edit_distance - edist_best)*freq_best_edist)) { |
| 437 | 440 | DEBUGLINE(SPELLING, "Best so far: \"" << term << |
| 438 | 441 | "\" edist " << edist << " freq " << |
| 439 | 442 | freq); |
| 440 | 443 | result = term; |
| 441 | 444 | edist_best = edist; |
| 442 | 445 | freq_best = freq; |
| | 446 | freq_best_edist = freq; |
| 443 | 447 | } |
| | 448 | else if ((freq > pow(freq_best_edist, edist+2)) && |
| | 449 | ((edist - 2) < edist_best) && |
| | 450 | (freq > freq_best)) { |
| | 451 | DEBUGLINE(SPELLING, "Best so far: \"" << term << |
| | 452 | " freq " << |
| | 453 | freq); |
| | 454 | result = term; |
| | 455 | freq_best = freq; |
| 444 | 456 | } |
| 445 | 457 | } |
| 446 | 458 | } |