Ticket #225: spelling_frequency_updated.patch
File spelling_frequency_updated.patch, 2.6 KB (added 14 years ago; uploader name not preserved) |
---|
- api/omdatabase.cc
43 43 #include "ortermlist.h" 44 44 #include "noreturn.h" 45 45 46 #include <cmath> // For pow(). 46 47 #include <cstdlib> // For abs(). 47 48 48 49 #include <cstring> … … 574 575 string result; 575 576 int edist_best = max_edit_distance; 576 577 Xapian::doccount freq_best = 0; 578 Xapian::doccount freq_best_edist = 0; 577 579 while (true) { 578 580 TermList *ret = merger->next(); 579 581 if (ret) merger.reset(ret); … … 593 595 594 596 // First check the length of the encoded UTF-8 version of term. 595 597 // Each UTF-32 character is 1-4 bytes in UTF-8. 596 if (abs(long(term.size()) - long(word.size())) > edist_best* 4) {598 if (abs(long(term.size()) - long(word.size())) > max_edit_distance * 4) { 597 599 LOGLINE(SPELLING, "Lengths much too different"); 598 600 continue; 599 601 } … … 603 605 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 604 606 605 607 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 606 > edist_best) {608 > max_edit_distance) { 607 609 LOGLINE(SPELLING, "Lengths too different"); 608 610 continue; 609 611 } … … 613 615 continue; 614 616 } 615 617 618 if ((long)utf32_term.size() <= TRIGRAM_SCORE_THRESHOLD + 1) { 619 LOGLINE(SPELLING, "Term too short"); 620 continue; 621 } 622 616 623 int edist = edit_distance_unsigned(&utf32_term[0], 617 624 int(utf32_term.size()), 618 625 &utf32_word[0], 619 626 int(utf32_word.size()), 620 627 edist_best); 621 628 LOGLINE(SPELLING, "Edit distance " << edist); 622 // If we have an exact match, return an empty string since there's623 // no correction required.624 if (edist == 0) RETURN(string());625 629 626 if (edist <= edist_best){630 { 627 631 Xapian::doccount freq = 0; 628 632 for (size_t j = 0; j < internal.size(); ++j) 629 633 freq += internal[j]->get_spelling_frequency(term); 630 634 631 635 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 632 if (edist < edist_best || freq > freq_best) {636 if (edist <= edist_best && (freq > (max_edit_distance - edist_best)*freq_best_edist)) { 
633 637 LOGLINE(SPELLING, "Best so far: \"" << term << 634 638 "\" edist " << edist << " freq " << freq); 635 639 result = term; 636 640 edist_best = edist; 637 641 freq_best = freq; 642 freq_best_edist = freq; 643 } 644 else if ((freq > pow(freq_best_edist, edist+2)) && 645 ((edist - 2) < edist_best) && 646 (freq > freq_best)) { 647 LOGLINE(SPELLING, "Best so far: \"" << term << 648 " freq " << 649 freq); 650 result = term; 651 freq_best = freq; 638 652 } 639 653 } 640 654 }