Ticket #178: remote-spell-remainder.patch
File remote-spell-remainder.patch, 20.4 KB (added by , 15 years ago) |
---|
-
xapian-core/tests/api_spelling.cc
182 182 } 183 183 184 184 // Test spelling correction with multi databases 185 DEFINE_TESTCASE(spell3, spelling) { 185 DEFINE_TESTCASE(spell3, spelling && !remote) { 186 // Spelling iterator not implemented for remote databases. 186 187 Xapian::WritableDatabase db1 = get_writable_database(); 187 188 // We can't just call get_writable_database() since it would delete db1 188 189 // which doesn't work at all under __WIN32__ and will go wrong elsewhere if -
xapian-core/tests/harness/testrunner.cc
62 62 { "multi_brass", "backend,positional,valuestats,multi" }, 63 63 { "multi_chert", "backend,positional,valuestats,multi" }, 64 64 { "multi_flint", "backend,positional,multi" }, 65 { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata " },66 { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata " },67 { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata " },68 { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata " },69 { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata " },70 { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata " },65 { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 66 { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 67 { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 68 { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 69 { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata,spelling" }, 70 { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata,spelling" }, 71 71 { NULL, NULL } 72 72 }; 73 73 -
xapian-core/net/remoteserver.cc
40 40 #include "serialise-double.h" 41 41 #include "utils.h" 42 42 #include "weightinternal.h" 43 #include "database.h" // so we can use internal in msg_getspellingfreq 43 44 44 45 /// Class to throw when we receive the connection closing message. 45 46 struct ConnectionClosed { }; … … 189 190 &RemoteServer::msg_setmetadata, 190 191 &RemoteServer::msg_addspelling, 191 192 &RemoteServer::msg_removespelling, 193 &RemoteServer::msg_getspellingsuggestion, 194 &RemoteServer::msg_getspellingfreq 192 195 // MSG_GETMSET - used during a conversation. 193 196 // MSG_SHUTDOWN - handled by get_message(). 194 197 }; … … 704 707 Xapian::termcount freqdec = decode_length(&p, p_end, false); 705 708 wdb->remove_spelling(string(p, p_end - p), freqdec); 706 709 } 710 711 void 712 RemoteServer::msg_getspellingsuggestion(const string & message) 713 { 714 const char *p = message.data(); 715 const char *p_end = p + message.size(); 716 size_t dist = decode_length(&p, p_end, false); 717 string word(p, p_end - p); 718 send_message(REPLY_SPELLINGSUGGESTION, db->get_spelling_suggestion(word, dist)); 719 } 720 721 void 722 RemoteServer::msg_getspellingfreq(const string & message) 723 { 724 Xapian::doccount freq = 0; 725 for (size_t j = 0; j < db->internal.size(); ++j) 726 freq += db->internal[j]->get_spelling_frequency(message); 727 send_message(REPLY_SPELLINGFREQ, encode_length(freq)); 728 } -
xapian-core/common/remote-database.h
249 249 void add_spelling(const std::string&, Xapian::termcount) const; 250 250 251 251 void remove_spelling(const std::string&, Xapian::termcount freqdec) const; 252 253 std::string get_spelling_suggestion(const std::string &word, 254 unsigned max_edit_distance); 255 256 Xapian::doccount get_spelling_frequency(const std::string & word) const; 252 257 }; 253 258 254 259 #endif // XAPIAN_INCLUDED_REMOTE_DATABASE_H -
xapian-core/common/remoteprotocol.h
44 44 // 33: 1.1.3 Support for passing matchspies over the remote connection. 45 45 // 34: 1.1.4 Support for metadata over with remote databases. 46 46 // 35: 1.1.5 Support for add_spelling() and remove_spelling(). 47 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 35 47 // 36: ????? Full remote spelling support. 48 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 36 48 49 #define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 0 49 50 50 51 /** Message types (client -> server). … … 79 80 MSG_SETMETADATA, // Set metadata 80 81 MSG_ADDSPELLING, // Add a spelling 81 82 MSG_REMOVESPELLING, // Remove a spelling 83 MSG_GETSPELLINGSUGGESTION, // Get a spelling suggestion 84 MSG_GETSPELLINGFREQ, // Get spelling frequency 82 85 MSG_GETMSET, // Get MSet 83 86 MSG_SHUTDOWN, // Shutdown 84 87 MSG_MAX … … 107 110 REPLY_ADDDOCUMENT, // Add Document 108 111 REPLY_RESULTS, // Results (MSet) 109 112 REPLY_METADATA, // Metadata 113 REPLY_SPELLINGSUGGESTION, // Get Spelling Suggestion 114 REPLY_SPELLINGFREQ, // Spelling frequency 110 115 REPLY_MAX 111 116 }; 112 117 -
xapian-core/common/remoteserver.h
154 154 // remove a spelling 155 155 void msg_removespelling(const std::string & message); 156 156 157 // get a spelling suggestion 158 void msg_getspellingsuggestion(const std::string & message); 159 160 // get spelling frequency 161 void msg_getspellingfreq(const std::string & message); 162 157 163 public: 158 164 /** Construct a RemoteServer. 159 165 * -
xapian-core/backends/remote/remote-database.cc
759 759 data += word; 760 760 send_message(MSG_REMOVESPELLING, data); 761 761 } 762 763 string 764 RemoteDatabase::get_spelling_suggestion(const string & word, 765 unsigned max_edit_distance) 766 { 767 string data = encode_length(max_edit_distance); 768 data += word; 769 send_message(MSG_GETSPELLINGSUGGESTION, data); 770 771 string message; 772 get_message(message, REPLY_SPELLINGSUGGESTION); 773 return message; 774 } 775 776 Xapian::doccount 777 RemoteDatabase::get_spelling_frequency(const string & word) const 778 { 779 send_message(MSG_GETSPELLINGFREQ, word); 780 string message; 781 get_message(message, REPLY_SPELLINGFREQ); 782 const char * p = message.data(); 783 const char * p_end = p + message.size(); 784 return decode_length(&p, p_end, false); 785 } -
xapian-core/api/omdatabase.cc
42 42 #include "editdistance.h" 43 43 #include "ortermlist.h" 44 44 #include "noreturn.h" 45 #include "remote-database.h" 45 46 46 47 #include <cstdlib> // For abs(). 47 48 … … 498 499 return "Database()"; 499 500 } 500 501 501 // We sum the character frequency histogram absolute differences to compute a502 // lower bound on the edit distance. Rather than counting each Unicode code503 // point uniquely, we use an array with VEC_SIZE elements and tally code points504 // modulo VEC_SIZE which can only reduce the bound we calculate.505 //506 // There will be a trade-off between how good the bound is and how large and507 // array is used (a larger array takes more time to clear and sum over). The508 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite509 // but that may not reflect real world performance. FIXME: profile and tune.510 511 #define VEC_SIZE 64512 513 static int514 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)515 {516 int vec[VEC_SIZE];517 memset(vec, 0, sizeof(vec));518 vector<unsigned>::const_iterator i;519 for (i = a.begin(); i != a.end(); ++i) {520 ++vec[(*i) % VEC_SIZE];521 }522 for (i = b.begin(); i != b.end(); ++i) {523 --vec[(*i) % VEC_SIZE];524 }525 unsigned int total = 0;526 for (size_t j = 0; j < VEC_SIZE; ++j) {527 total += abs(vec[j]);528 }529 // Each insertion or deletion adds at most 1 to total. Each transposition530 // doesn't change it at all. But each substitution can change it by 2 so531 // we need to divide it by 2. Rounding up is OK, since the odd change must532 // be due to an actual edit.533 return (total + 1) / 2;534 }535 536 502 // Word must have a trigram score at least this close to the best score seen 537 503 // so far. 
538 504 #define TRIGRAM_SCORE_THRESHOLD 2 … … 545 511 word << ", " << max_edit_distance); 546 512 if (word.size() <= 1) return string(); 547 513 AutoPtr<TermList> merger; 514 bool got_remote = false; 548 515 for (size_t i = 0; i < internal.size(); ++i) { 549 TermList * tl = internal[i]->open_spelling_termlist(word); 550 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl); 551 if (tl) { 552 if (merger.get()) { 553 merger.reset(new OrTermList(merger.release(), tl)); 554 } else { 555 merger.reset(tl); 556 } 557 } 516 RemoteDatabase *rdb = internal[i].get()->as_remotedatabase(); 517 if (!rdb) { 518 TermList * tl = internal[i]->open_spelling_termlist(word); 519 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl); 520 if (tl) { 521 if (merger.get()) { 522 merger.reset(new OrTermList(merger.release(), tl)); 523 } else { 524 merger.reset(tl); 525 } 526 } 527 } else 528 got_remote = true; 558 529 } 559 if (!merger.get()) RETURN(string());560 530 561 // Convert word to UTF-32. 562 #ifdef __SUNPRO_CC 563 vector<unsigned> utf32_word; 564 for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) { 565 utf32_word.push_back(*sunpro_it); 531 if (! (merger.get() or got_remote)) { 532 RETURN(string()); 566 533 } 567 #else 568 // Extra brackets needed to avoid this being misparsed as a function 569 // prototype. 
570 vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator()); 571 #endif 534 535 EditDistance edcomp(word); 572 536 573 vector<unsigned> utf32_term;574 575 537 Xapian::termcount best = 1; 576 538 string result; 577 539 int edist_best = max_edit_distance; 578 540 Xapian::doccount freq_best = 0; 579 while (true) { 580 TermList *ret = merger->next(); 581 if (ret) merger.reset(ret); 541 if (merger.get()) { 542 while (true) { 543 TermList *ret = merger->next(); 544 if (ret) merger.reset(ret); 582 545 583 546 if (merger->at_end()) break; 584 547 585 586 548 string term = merger->get_termname(); 549 Xapian::termcount score = merger->get_wdf(); 587 550 588 589 590 551 LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score); 552 if (score + TRIGRAM_SCORE_THRESHOLD >= best) { 553 if (score > best) best = score; 591 554 592 // There's no point considering a word where the difference 593 // in length is greater than the smallest number of edits we've 594 // found so far. 555 // There's no point considering a word where the difference 556 // in length is greater than the smallest number of edits we've 557 // found so far. 558 559 int edist = edcomp.distance(term, edist_best); 595 560 596 // First check the length of the encoded UTF-8 version of term. 597 // Each UTF-32 character is 1-4 bytes in UTF-8. 598 if (abs(long(term.size()) - long(word.size())) > edist_best * 4) { 599 LOGLINE(SPELLING, "Lengths much too different"); 600 continue; 601 } 561 if (edist > edist_best) continue; 602 562 603 // Now convert to UTF-32, and compare the true lengths more 604 // strictly. 605 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 606 607 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 608 > edist_best) { 609 LOGLINE(SPELLING, "Lengths too different"); 610 continue; 563 LOGLINE(SPELLING, "Edit distance " << edist); 564 // If we have an exact match, return an empty string since there's 565 // no correction required. 
566 if (edist == 0) RETURN(string()); 567 if (edist <= edist_best) { 568 Xapian::doccount freq = 0; 569 for (size_t j = 0; j < internal.size(); ++j) 570 // this includes calls on get_spelling_frequency 571 // for remote databases 572 freq += internal[j]->get_spelling_frequency(term); 573 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 574 if (edist < edist_best || freq > freq_best) { 575 LOGLINE(SPELLING, "Best so far: \"" << term << 576 "\" edist " << edist << " freq " << freq); 577 result = term; 578 edist_best = edist; 579 freq_best = freq; 580 } 581 } 611 582 } 583 } 584 } 612 585 613 if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) { 614 LOGLINE(SPELLING, "Rejected by character frequency test"); 615 continue; 616 } 617 618 int edist = edit_distance_unsigned(&utf32_term[0], 619 int(utf32_term.size()), 620 &utf32_word[0], 621 int(utf32_word.size()), 622 edist_best); 623 LOGLINE(SPELLING, "Edit distance " << edist); 624 // If we have an exact match, return an empty string since there's 625 // no correction required. 626 if (edist == 0) RETURN(string()); 627 628 if (edist <= edist_best) { 629 Xapian::doccount freq = 0; 630 for (size_t j = 0; j < internal.size(); ++j) 631 freq += internal[j]->get_spelling_frequency(term); 632 633 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 634 if (edist < edist_best || freq > freq_best) { 635 LOGLINE(SPELLING, "Best so far: \"" << term << 636 "\" edist " << edist << " freq " << freq); 637 result = term; 638 edist_best = edist; 639 freq_best = freq; 586 // we've got the best we can locally which - if there's no 587 // correction needed - means we won't need to check remote 588 // databases. If we get this far we try corrections from remote 589 // databases. 
590 for (size_t i = 0; i < internal.size(); ++i) { 591 RemoteDatabase *rdb = internal[i].get()->as_remotedatabase(); 592 if (rdb) { 593 string rterm = rdb->get_spelling_suggestion(word, edist_best); 594 if (rterm != "") { 595 // FIXME: repetitious of stuff in the loop above - should be factored out. 596 int edist = edcomp.distance(rterm, edist_best); 597 if (edist == 0) RETURN(string()); 598 if (edist <= edist_best) { 599 Xapian::doccount freq = 0; 600 for (size_t j = 0; j < internal.size(); ++j) 601 freq += internal[j]->get_spelling_frequency(rterm); 602 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 603 if (edist < edist_best || freq > freq_best) { 604 LOGLINE(SPELLING, "Best so far: \"" << term << 605 "\" edist " << edist << " freq " << freq); 606 result = rterm; 607 edist_best = edist; 608 freq_best = freq; 609 } 640 610 } 641 611 } 642 612 } 643 613 } 644 614 RETURN(result); 645 615 } -
xapian-core/api/editdistance.cc
32 32 #include "editdistance.h" 33 33 34 34 #include "omassert.h" 35 #include "debuglog.h" 35 36 37 #include <xapian/unicode.h> 38 36 39 #include <algorithm> 37 40 #include <cstdlib> 38 41 #include <cstring> 39 42 using namespace std; 40 43 41 44 template<class CHR> … … 213 216 { 214 217 return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance); 215 218 } 219 220 using namespace Xapian; 221 222 EditDistance::EditDistance(const string& word_) 223 #ifdef __SUNPRO_CC 224 { 225 for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) { 226 utf32_word.push_back(*sunpro_it); 227 } 228 } 229 #else 230 231 :word(word_), utf32_word(Utf8Iterator(word_), Utf8Iterator()) {} 232 233 #endif 234 235 236 // We sum the character frequency histogram absolute differences to compute a 237 // lower bound on the edit distance. Rather than counting each Unicode code 238 // point uniquely, we use an array with VEC_SIZE elements and tally code points 239 // modulo VEC_SIZE which can only reduce the bound we calculate. 240 // 241 // There will be a trade-off between how good the bound is and how large an 242 // array is used (a larger array takes more time to clear and sum over). The 243 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite 244 // but that may not reflect real world performance. FIXME: profile and tune. 245 246 #define VEC_SIZE 64 247 248 static int 249 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b) 250 { 251 int vec[VEC_SIZE]; 252 memset(vec, 0, sizeof(vec)); 253 vector<unsigned>::const_iterator i; 254 for (i = a.begin(); i != a.end(); ++i) { 255 ++vec[(*i) % VEC_SIZE]; 256 } 257 for (i = b.begin(); i != b.end(); ++i) { 258 --vec[(*i) % VEC_SIZE]; 259 } 260 unsigned int total = 0; 261 for (size_t j = 0; j < VEC_SIZE; ++j) { 262 total += abs(vec[j]); 263 } 264 // Each insertion or deletion adds at most 1 to total. Each transposition 265 // doesn't change it at all. 
But each substitution can change it by 2 so 266 // we need to divide it by 2. Rounding up is OK, since the odd change must 267 // be due to an actual edit. 268 return (total + 1) / 2; 269 } 270 271 int 272 EditDistance::distance(const string& term,int limit) { 273 vector<unsigned> utf32_term; 274 // we use this to signal that the distance is clearly large and that we're 275 // not going to work it out exactly 276 const int big_distance = 1000; 277 278 // First check the length of the encoded UTF-8 version of term. 279 // Each UTF-32 character is 1-4 bytes in UTF-8. 280 if (abs(long(term.size()) - long(word.size())) > limit * 4) { 281 LOGLINE(SPELLING, "Lengths much too different"); 282 return big_distance; 283 } 284 285 // Now convert to UTF-32, and compare the true lengths more 286 // strictly. 287 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 288 289 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 290 > limit) { 291 LOGLINE(SPELLING, "Lengths too different"); 292 return big_distance; 293 } 294 295 if (freq_edit_lower_bound(utf32_term, utf32_word) > limit) { 296 LOGLINE(SPELLING, "Rejected by character frequency test"); 297 return big_distance; 298 } 299 300 return edit_distance_unsigned(&utf32_term[0], 301 int(utf32_term.size()), 302 &utf32_word[0], 303 int(utf32_word.size()), 304 limit); 305 } -
xapian-core/api/editdistance.h
22 22 #ifndef XAPIAN_INCLUDED_EDITDISTANCE_H 23 23 #define XAPIAN_INCLUDED_EDITDISTANCE_H 24 24 25 #include<string> 26 #include<vector> 25 27 /** Calculate the edit distance between two sequences. 26 28 * 27 29 * Edit distance is defined as the minimum number of edit operations … … 49 51 const unsigned* ptr2, int len2, 50 52 int max_distance); 51 53 54 55 class EditDistance { 56 const std::string word; 57 const std::vector<unsigned> utf32_word; 58 59 public: 60 EditDistance(const std::string& word_) ; 61 int distance(const std::string& other, int limit); 62 }; 52 63 #endif // XAPIAN_INCLUDED_EDITDISTANCE_H