diff -uNrEb xapian-core-1.0.5/api/omdatabase.cc xapian-core-1.0.5-philip/api/omdatabase.cc
old
|
new
|
|
43 | 43 | #include "noreturn.h" |
44 | 44 | |
45 | 45 | #include <stdlib.h> // For abs(). |
| 46 | #include <math.h> // For pow(). |
46 | 47 | |
47 | 48 | #include <vector> |
48 | 49 | |
… |
… |
|
384 | 385 | string result; |
385 | 386 | int edist_best = max_edit_distance; |
386 | 387 | Xapian::doccount freq_best = 0; |
| 388 | Xapian::doccount freq_best_edist = 0; |
387 | 389 | while (true) { |
388 | 390 | TermList *ret = merger->next(); |
389 | 391 | if (ret) merger = ret; |
… |
… |
|
403 | 405 | |
404 | 406 | // First check the length of the encoded UTF-8 version of term. |
405 | 407 | // Each UTF-32 character is 1-4 bytes in UTF-8. |
406 | | if (abs((long)term.size() - (long)word.size()) > edist_best * 4) { |
| 408 | if (abs((long)term.size() - (long)word.size()) > max_edit_distance* 4) { |
407 | 409 | DEBUGLINE(SPELLING, "Lengths much too different"); |
408 | 410 | continue; |
409 | 411 | } |
… |
… |
|
413 | 415 | utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); |
414 | 416 | |
415 | 417 | if (abs((long)utf32_term.size() - (long)utf32_word.size()) |
416 | | > edist_best) { |
| 418 | > max_edit_distance) { |
417 | 419 | DEBUGLINE(SPELLING, "Lengths too different"); |
418 | 420 | continue; |
419 | 421 | } |
420 | 422 | |
| 423 | if ((long)utf32_term.size() <= TRIGRAM_SCORE_THRESHOLD + 1) { |
| 424 | DEBUGLINE(SPELLING, "Term too short"); |
| 425 | continue; |
| 426 | } |
| 427 | |
421 | 428 | int edist = edit_distance_unsigned(&utf32_term[0], |
422 | 429 | utf32_term.size(), |
423 | 430 | &utf32_word[0], |
424 | 431 | utf32_word.size()); |
425 | 432 | DEBUGLINE(SPELLING, "Edit distance " << edist); |
426 | | // If we have an exact match, return an empty string since there's |
427 | | // no correction required. |
428 | | if (edist == 0) return string(); |
429 | 433 | |
430 | | if (edist <= edist_best) { |
431 | 434 | Xapian::doccount freq = 0; |
432 | 435 | for (size_t j = 0; j < internal.size(); ++j) |
433 | 436 | freq += internal[j]->get_spelling_frequency(term); |
434 | 437 | |
435 | 438 | DEBUGLINE(SPELLING, "Freq " << freq << " best " << freq_best); |
436 | | if (edist < edist_best || freq > freq_best) { |
| 439 | if (edist <= edist_best && (freq > (max_edit_distance - edist_best)*freq_best_edist)) { |
437 | 440 | DEBUGLINE(SPELLING, "Best so far: \"" << term << |
438 | 441 | "\" edist " << edist << " freq " << |
439 | 442 | freq); |
440 | 443 | result = term; |
441 | 444 | edist_best = edist; |
442 | 445 | freq_best = freq; |
| 446 | freq_best_edist = freq; |
443 | 447 | } |
| 448 | else if ((freq > pow(freq_best_edist, edist+2)) && |
| 449 | ((edist - 2) < edist_best) && |
| 450 | (freq > freq_best)) { |
| 451 | DEBUGLINE(SPELLING, "Best so far: \"" << term << |
| 452 | " freq " << |
| 453 | freq); |
| 454 | result = term; |
| 455 | freq_best = freq; |
444 | 456 | } |
445 | 457 | } |
446 | 458 | } |