Ticket #178: remote-spell-remainder-updated.patch
File remote-spell-remainder-updated.patch, 21.7 KB (added by , 13 years ago) |
---|
-
xapian-core/api/editdistance.cc
diff --git a/xapian-core/api/editdistance.cc b/xapian-core/api/editdistance.cc index 2365877..ca35b98 100644
a b 32 32 #include "editdistance.h" 33 33 34 34 #include "omassert.h" 35 #include "debuglog.h" 36 37 #include <xapian/unicode.h> 35 38 36 39 #include <algorithm> 37 40 #include <cstdlib> 38 41 #include <cstring> 39 42 using namespace std; 40 43 41 44 template<class CHR> … … edit_distance_unsigned(const unsigned * ptr1, int len1, 213 216 { 214 217 return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance); 215 218 } 219 220 using namespace Xapian; 221 222 EditDistance::EditDistance(const string& word_) 223 #if ! defined __SUNPRO_CC || __SUNPRO_CC - 0 >= 0x580 224 // Extra brackets needed to avoid this being misparsed as a function 225 // prototype. 226 :word(word_), utf32_word(Utf8Iterator(word_), Utf8Iterator()) {} 227 # else 228 { 229 // Older versions of Sun's C++ compiler need this workaround, but 5.8 230 // doesn't. Unsure of the exact version it was fixed in. 231 for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) { 232 utf32_word.push_back(*sunpro_it); 233 } 234 } 235 #endif 236 237 238 // We sum the character frequency histogram absolute differences to compute a 239 // lower bound on the edit distance. Rather than counting each Unicode code 240 // point uniquely, we use an array with VEC_SIZE elements and tally code points 241 // modulo VEC_SIZE which can only reduce the bound we calculate. 242 // 243 // There will be a trade-off between how good the bound is and how large and 244 // array is used (a larger array takes more time to clear and sum over). The 245 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite 246 // but that may not reflect real world performance. FIXME: profile and tune. 247 248 #define VEC_SIZE 64 249 250 static int 251 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b) 252 { 253 int vec[VEC_SIZE]; 254 memset(vec, 0, sizeof(vec)); 255 vector<unsigned>::const_iterator i; 256 for (i = a.begin(); i != a.end(); ++i) { 257 ++vec[(*i) % VEC_SIZE]; 258 } 259 for (i = b.begin(); i != b.end(); ++i) { 260 --vec[(*i) % VEC_SIZE]; 261 } 262 unsigned int total = 0; 263 for (size_t j = 0; j < VEC_SIZE; ++j) { 264 total += abs(vec[j]); 265 } 266 // Each insertion or deletion adds at most 1 to total. Each transposition 267 // doesn't change it at all. But each substitution can change it by 2 so 268 // we need to divide it by 2. Rounding up is OK, since the odd change must 269 // be due to an actual edit. 270 return (total + 1) / 2; 271 } 272 273 int 274 EditDistance::distance(const string& term,int limit) { 275 vector<unsigned> utf32_term; 276 // we use this to signal that the distance is clearly large and that we're 277 // not going to work it out exactly 278 const int big_distance = 1000; 279 280 // First check the length of the encoded UTF-8 version of term. 281 // Each UTF-32 character is 1-4 bytes in UTF-8. 282 if (abs(long(term.size()) - long(word.size())) > limit * 4) { 283 LOGLINE(SPELLING, "Lengths much too different"); 284 return big_distance; 285 } 286 287 // Now convert to UTF-32, and compare the true lengths more 288 // strictly. 289 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 290 291 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 292 > limit) { 293 LOGLINE(SPELLING, "Lengths too different"); 294 return big_distance; 295 } 296 297 if (freq_edit_lower_bound(utf32_term, utf32_word) > limit) { 298 LOGLINE(SPELLING, "Rejected by character frequency test"); 299 return big_distance; 300 } 301 302 return edit_distance_unsigned(&utf32_term[0], 303 int(utf32_term.size()), 304 &utf32_word[0], 305 int(utf32_word.size()), 306 limit); 307 } -
xapian-core/api/editdistance.h
diff --git a/xapian-core/api/editdistance.h b/xapian-core/api/editdistance.h index 4ab7824..553e564 100644
a b 22 22 #ifndef XAPIAN_INCLUDED_EDITDISTANCE_H 23 23 #define XAPIAN_INCLUDED_EDITDISTANCE_H 24 24 25 #include<string> 26 #include<vector> 25 27 /** Calculate the edit distance between two sequences. 26 28 * 27 29 * Edit distance is defined as the minimum number of edit operations … … int edit_distance_unsigned(const unsigned* ptr1, int len1, 49 51 const unsigned* ptr2, int len2, 50 52 int max_distance); 51 53 54 55 class EditDistance { 56 const std::string word; 57 const std::vector<unsigned> utf32_word; 58 59 public: 60 EditDistance(const std::string& word_) ; 61 int distance(const std::string& other, int limit); 62 }; 52 63 #endif // XAPIAN_INCLUDED_EDITDISTANCE_H -
xapian-core/api/omdatabase.cc
diff --git a/xapian-core/api/omdatabase.cc b/xapian-core/api/omdatabase.cc index ed80043..d51a074 100644
a b 42 42 #include "editdistance.h" 43 43 #include "ortermlist.h" 44 44 #include "noreturn.h" 45 #include "remote-database.h" 45 46 46 47 #include <cstdlib> // For abs(). 47 48 … … Database::get_description() const 497 498 return "Database()"; 498 499 } 499 500 500 // We sum the character frequency histogram absolute differences to compute a501 // lower bound on the edit distance. Rather than counting each Unicode code502 // point uniquely, we use an array with VEC_SIZE elements and tally code points503 // modulo VEC_SIZE which can only reduce the bound we calculate.504 //505 // There will be a trade-off between how good the bound is and how large and506 // array is used (a larger array takes more time to clear and sum over). The507 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite508 // but that may not reflect real world performance. FIXME: profile and tune.509 510 #define VEC_SIZE 64511 512 static int513 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)514 {515 int vec[VEC_SIZE];516 memset(vec, 0, sizeof(vec));517 vector<unsigned>::const_iterator i;518 for (i = a.begin(); i != a.end(); ++i) {519 ++vec[(*i) % VEC_SIZE];520 }521 for (i = b.begin(); i != b.end(); ++i) {522 --vec[(*i) % VEC_SIZE];523 }524 unsigned int total = 0;525 for (size_t j = 0; j < VEC_SIZE; ++j) {526 total += abs(vec[j]);527 }528 // Each insertion or deletion adds at most 1 to total. Each transposition529 // doesn't change it at all. But each substitution can change it by 2 so530 // we need to divide it by 2. Rounding up is OK, since the odd change must531 // be due to an actual edit.532 return (total + 1) / 2;533 }534 535 501 // Word must have a trigram score at least this close to the best score seen 536 502 // so far. 537 503 #define TRIGRAM_SCORE_THRESHOLD 2 … … Database::get_spelling_suggestion(const string &word, 543 509 LOGCALL(API, string, "Database::get_spelling_suggestion", word | max_edit_distance); 544 510 if (word.size() <= 1) return string(); 545 511 AutoPtr<TermList> merger; 512 bool got_remote = false; 546 513 for (size_t i = 0; i < internal.size(); ++i) { 547 TermList * tl = internal[i]->open_spelling_termlist(word); 548 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl); 549 if (tl) { 550 if (merger.get()) { 551 merger.reset(new OrTermList(merger.release(), tl)); 552 } else { 553 merger.reset(tl); 554 } 555 } 556 } 557 if (!merger.get()) RETURN(string()); 558 559 // Convert word to UTF-32. 560 #if ! defined __SUNPRO_CC || __SUNPRO_CC - 0 >= 0x580 561 // Extra brackets needed to avoid this being misparsed as a function 562 // prototype. 563 vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator()); 564 #else 565 // Older versions of Sun's C++ compiler need this workaround, but 5.8 566 // doesn't. Unsure of the exact version it was fixed in. 567 vector<unsigned> utf32_word; 568 for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) { 569 utf32_word.push_back(*sunpro_it); 570 } 571 #endif 572 573 vector<unsigned> utf32_term; 514 RemoteDatabase *rdb = internal[i].get()->as_remotedatabase(); 515 if (!rdb) { 516 TermList * tl = internal[i]->open_spelling_termlist(word); 517 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl); 518 if (tl) { 519 if (merger.get()) { 520 merger.reset(new OrTermList(merger.release(), tl)); 521 } else { 522 merger.reset(tl); 523 } 524 } 525 } else { 526 got_remote = true; 527 } 528 } 529 530 if (! (merger.get() || got_remote)) { 531 RETURN(string()); 532 } 533 534 EditDistance edcomp(word); 574 535 575 536 Xapian::termcount best = 1; 576 537 string result; 577 538 int edist_best = max_edit_distance; 578 539 Xapian::doccount freq_best = 0; 579 540 Xapian::doccount freq_exact = 0; 580 while (true) { 581 TermList *ret = merger->next(); 582 if (ret) merger.reset(ret); 583 584 if (merger->at_end()) break; 585 586 string term = merger->get_termname(); 587 Xapian::termcount score = merger->get_wdf(); 588 589 LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score); 590 if (score + TRIGRAM_SCORE_THRESHOLD >= best) { 591 if (score > best) best = score; 592 593 // There's no point considering a word where the difference 594 // in length is greater than the smallest number of edits we've 595 // found so far. 596 597 // First check the length of the encoded UTF-8 version of term. 598 // Each UTF-32 character is 1-4 bytes in UTF-8. 599 if (abs(long(term.size()) - long(word.size())) > edist_best * 4) { 600 LOGLINE(SPELLING, "Lengths much too different"); 601 continue; 602 } 603 604 // Now convert to UTF-32, and compare the true lengths more 605 // strictly. 606 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 607 608 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 609 > edist_best) { 610 LOGLINE(SPELLING, "Lengths too different"); 611 continue; 612 } 613 614 if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) { 615 LOGLINE(SPELLING, "Rejected by character frequency test"); 616 continue; 617 } 618 619 int edist = edit_distance_unsigned(&utf32_term[0], 620 int(utf32_term.size()), 621 &utf32_word[0], 622 int(utf32_word.size()), 623 edist_best); 624 LOGLINE(SPELLING, "Edit distance " << edist); 625 626 if (edist <= edist_best) { 627 Xapian::doccount freq = 0; 628 for (size_t j = 0; j < internal.size(); ++j) 629 freq += internal[j]->get_spelling_frequency(term); 630 631 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 632 // Even if we have an exact match, there may be a much more 633 // frequent potential correction which will still be 634 // interesting. 635 if (edist == 0) { 636 freq_exact = freq; 637 continue; 638 } 639 640 if (edist < edist_best || freq > freq_best) { 641 LOGLINE(SPELLING, "Best so far: \"" << term << 642 "\" edist " << edist << " freq " << freq); 643 result = term; 644 edist_best = edist; 645 freq_best = freq; 646 } 647 } 541 if (merger.get()) { 542 while (true) { 543 TermList *ret = merger->next(); 544 if (ret) merger.reset(ret); 545 546 if (merger->at_end()) break; 547 548 string term = merger->get_termname(); 549 Xapian::termcount score = merger->get_wdf(); 550 551 LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score); 552 if (score + TRIGRAM_SCORE_THRESHOLD >= best) { 553 if (score > best) best = score; 554 555 // There's no point considering a word where the difference 556 // in length is greater than the smallest number of edits we've 557 // found so far. 558 559 int edist = edcomp.distance(term, edist_best); 560 561 if (edist > edist_best) continue; 562 563 LOGLINE(SPELLING, "Edit distance " << edist); 564 // If we have an exact match, return an empty string since there's 565 // no correction required. 566 if (edist == 0) RETURN(string()); 567 if (edist <= edist_best) { 568 Xapian::doccount freq = 0; 569 for (size_t j = 0; j < internal.size(); ++j) { 570 // this includes calls on get_spelling_frequency 571 // for remote databases 572 freq += internal[j]->get_spelling_frequency(term); 573 } 574 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 575 // Even if we have an exact match, there may be a much more 576 // frequent potential correction which will still be 577 // interesting. 578 if (edist == 0) { 579 freq_exact = freq; 580 continue; 581 } 582 583 if (edist < edist_best || freq > freq_best) { 584 LOGLINE(SPELLING, "Best so far: \"" << term << 585 "\" edist " << edist << " freq " << freq); 586 result = term; 587 edist_best = edist; 588 freq_best = freq; 589 } 590 } 591 } 648 592 } 649 593 } 594 595 // we've got the best we can locally which - it there's no 596 // correction need means we won't need to check remote 597 // databases. If we get this far we try corrections from remote 598 // databases. 599 for (size_t i = 0; i < internal.size(); ++i) { 600 RemoteDatabase *rdb = internal[i].get()->as_remotedatabase(); 601 if (rdb) { 602 string rterm = rdb->get_spelling_suggestion(word, edist_best); 603 if (rterm != "") { 604 // FIXME: repetitious of stuff in the loop above - should be factored out. 605 int edist = edcomp.distance(rterm, edist_best); 606 if (edist == 0) RETURN(string()); 607 if (edist <= edist_best) { 608 Xapian::doccount freq = 0; 609 for (size_t j = 0; j < internal.size(); ++j) 610 freq += internal[j]->get_spelling_frequency(rterm); 611 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 612 if (edist < edist_best || freq > freq_best) { 613 LOGLINE(SPELLING, "Best so far: \"" << term << 614 "\" edist " << edist << " freq " << freq); 615 result = rterm; 616 edist_best = edist; 617 freq_best = freq; 618 } 619 } 620 } 621 } 622 } 623 650 624 if (freq_best < freq_exact) 651 625 RETURN(string()); 652 626 RETURN(result); -
xapian-core/backends/remote/remote-database.cc
diff --git a/xapian-core/backends/remote/remote-database.cc b/xapian-core/backends/remote/remote-database.cc index 0aa3a52..15fd584 100644
a b RemoteDatabase::remove_spelling(const string & word, 785 785 data += word; 786 786 send_message(MSG_REMOVESPELLING, data); 787 787 } 788 789 string 790 RemoteDatabase::get_spelling_suggestion(const string & word, 791 unsigned max_edit_distance) 792 { 793 string data = encode_length(max_edit_distance); 794 data += word; 795 send_message(MSG_GETSPELLINGSUGGESTION, data); 796 797 string message; 798 get_message(message, REPLY_SPELLINGSUGGESTION); 799 return message; 800 } 801 802 Xapian::doccount 803 RemoteDatabase::get_spelling_frequency(const string & word) const 804 { 805 send_message(MSG_GETSPELLINGFREQ, word); 806 string message; 807 get_message(message, REPLY_SPELLINGFREQ); 808 const char * p = message.data(); 809 const char * p_end = p + message.size(); 810 return decode_length(&p, p_end, false); 811 } -
xapian-core/common/remote-database.h
diff --git a/xapian-core/common/remote-database.h b/xapian-core/common/remote-database.h index fc698ce..c9231f9 100644
a b class RemoteDatabase : public Xapian::Database::Internal { 251 251 void add_spelling(const std::string&, Xapian::termcount) const; 252 252 253 253 void remove_spelling(const std::string&, Xapian::termcount freqdec) const; 254 255 std::string get_spelling_suggestion(const std::string &word, 256 unsigned max_edit_distance); 257 258 Xapian::doccount get_spelling_frequency(const std::string & word) const; 254 259 }; 255 260 256 261 #endif // XAPIAN_INCLUDED_REMOTE_DATABASE_H -
xapian-core/common/remoteprotocol.h
diff --git a/xapian-core/common/remoteprotocol.h b/xapian-core/common/remoteprotocol.h index 843761f..d3f46e5 100644
a b 45 45 // 34: 1.1.4 Support for metadata over with remote databases. 46 46 // 35: 1.1.5 Support for add_spelling() and remove_spelling(). 47 47 // 35.1: 1.2.4 Support for metadata_keys_begin(). 48 // 35.2: 1.2.6 Support for spelling suggestions. 48 49 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 35 49 #define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 150 #define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 2 50 51 51 52 /** Message types (client -> server). 52 53 * … … enum message_type { 83 84 MSG_GETMSET, // Get MSet 84 85 MSG_SHUTDOWN, // Shutdown 85 86 MSG_METADATAKEYLIST, // Iterator for metadata keys 87 MSG_GETSPELLINGSUGGESTION, // Get a spelling suggestion 88 MSG_GETSPELLINGFREQ, // Get spelling frequency 86 89 MSG_MAX 87 90 }; 88 91 … … enum reply_type { 110 113 REPLY_RESULTS, // Results (MSet) 111 114 REPLY_METADATA, // Metadata 112 115 REPLY_METADATAKEYLIST, // Iterator for metadata keys 116 REPLY_SPELLINGSUGGESTION, // Get Spelling Suggestion 117 REPLY_SPELLINGFREQ, // Spelling frequency 113 118 REPLY_MAX 114 119 }; 115 120 -
xapian-core/common/remoteserver.h
diff --git a/xapian-core/common/remoteserver.h b/xapian-core/common/remoteserver.h index bff341f..25007c1 100644
a b class XAPIAN_VISIBILITY_DEFAULT RemoteServer : private RemoteConnection { 164 164 // remove a spelling 165 165 void msg_removespelling(const std::string & message); 166 166 167 // get spellings 168 void msg_getspellingsuggestion(const std::string & message); 169 170 // get spelling frequency 171 void msg_getspellingfreq(const std::string & message); 172 167 173 public: 168 174 /** Construct a RemoteServer. 169 175 * -
xapian-core/net/remoteserver.cc
diff --git a/xapian-core/net/remoteserver.cc b/xapian-core/net/remoteserver.cc index 3ee79f9..3ae3a86 100644
a b 40 40 #include "serialise-double.h" 41 41 #include "str.h" 42 42 #include "weightinternal.h" 43 #include "database.h" // so we can use internal in msg_openspellingtermlist 43 44 44 45 /// Class to throw when we receive the connection closing message. 45 46 struct ConnectionClosed { }; … … RemoteServer::run() 186 187 0, // MSG_GETMSET - used during a conversation. 187 188 0, // MSG_SHUTDOWN - handled by get_message(). 188 189 &RemoteServer::msg_openmetadatakeylist, 190 &RemoteServer::msg_getspellingsuggestion, 191 &RemoteServer::msg_getspellingfreq 189 192 }; 190 193 191 194 string message; … … RemoteServer::msg_removespelling(const string & message) 707 710 Xapian::termcount freqdec = decode_length(&p, p_end, false); 708 711 wdb->remove_spelling(string(p, p_end - p), freqdec); 709 712 } 713 714 void 715 RemoteServer::msg_getspellingsuggestion(const string & message) 716 { 717 const char *p = message.data(); 718 const char *p_end = p + message.size(); 719 size_t dist = decode_length(&p, p_end, false); 720 string word(p, p_end - p); 721 send_message(REPLY_SPELLINGSUGGESTION, db->get_spelling_suggestion(word, dist)); 722 } 723 724 void 725 RemoteServer::msg_getspellingfreq(const string & message) 726 { 727 Xapian::doccount freq = 0; 728 for (size_t j = 0; j < db->internal.size(); ++j) 729 freq += db->internal[j]->get_spelling_frequency(message); 730 send_message(REPLY_SPELLINGFREQ, encode_length(freq)); 731 } -
xapian-core/tests/api_spelling.cc
diff --git a/xapian-core/tests/api_spelling.cc b/xapian-core/tests/api_spelling.cc index c0865d8..5c2af73 100644
a b DEFINE_TESTCASE(spell2, spelling) { 182 182 } 183 183 184 184 // Test spelling correction with multi databases 185 DEFINE_TESTCASE(spell3, spelling) { 185 DEFINE_TESTCASE(spell3, spelling && !remote) { 186 // Spelling iterator not implemented for remote databases. 186 187 Xapian::WritableDatabase db1 = get_writable_database(); 187 188 // We can't just call get_writable_database() since it would delete db1 188 189 // which doesn't work at all under __WIN32__ and will go wrong elsewhere if -
xapian-core/tests/harness/testrunner.cc
diff --git a/xapian-core/tests/harness/testrunner.cc b/xapian-core/tests/harness/testrunner.cc index 946178f..0befd5a 100644
a b static BackendProperties backend_properties[] = { 62 62 { "multi_brass", "backend,positional,valuestats,multi" }, 63 63 { "multi_chert", "backend,positional,valuestats,multi" }, 64 64 { "multi_flint", "backend,positional,multi" }, 65 { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata " },66 { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata " },67 { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata " },68 { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata " },69 { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata " },70 { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata " },65 { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 66 { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 67 { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 68 { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 69 { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata,spelling" }, 70 { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata,spelling" }, 71 71 { NULL, NULL } 72 72 }; 73 73