Ticket #178: remote-spell.patch
File remote-spell.patch, 22.7 KB (added by , 15 years ago) |
---|
-
xapian-core/tests/api_spelling.cc
158 158 } 159 159 160 160 // Test spelling correction with multi databases 161 DEFINE_TESTCASE(spell3, spelling) { 161 // spelling iterator not implemented for remotes. 162 DEFINE_TESTCASE(spell3, spelling & !remote) { 162 163 Xapian::WritableDatabase db1 = get_writable_database(); 163 164 // We can't just call get_writable_database() since it would delete db1 164 165 // which doesn't work at all under __WIN32__ and will go wrong elsewhere if -
xapian-core/tests/harness/testrunner.cc
62 62 { "multi_brass", "backend,positional,valuestats,multi" }, 63 63 { "multi_chert", "backend,positional,valuestats,multi" }, 64 64 { "multi_flint", "backend,positional,multi" }, 65 { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata " },66 { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata " },67 { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata " },68 { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata " },69 { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata " },70 { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata " },65 { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 66 { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 67 { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 68 { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" }, 69 { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata,spelling" }, 70 { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata,spelling" }, 71 71 { NULL, NULL } 72 72 }; 73 73 -
xapian-core/net/remoteserver.cc
40 40 #include "serialise-double.h" 41 41 #include "utils.h" 42 42 #include "weightinternal.h" 43 #include "database.h" // so we can use internal in msg_openspellingtermlist 43 44 44 45 /// Class to throw when we receive the connection closing message. 45 46 struct ConnectionClosed { }; … … 185 186 &RemoteServer::msg_replacedocumentterm, 186 187 &RemoteServer::msg_deletedocument, 187 188 &RemoteServer::msg_writeaccess, 188 &RemoteServer::msg_getmetadata, 189 &RemoteServer::msg_setmetadata, 189 &RemoteServer::msg_getmetadata, 190 &RemoteServer::msg_setmetadata, 191 &RemoteServer::msg_addspelling, 192 &RemoteServer::msg_removespelling, 193 &RemoteServer::msg_getspellingsuggestion, 194 &RemoteServer::msg_getspellingfreq 190 195 // MSG_GETMSET - used during a conversation. 191 196 // MSG_SHUTDOWN - handled by get_message(). 192 197 }; … … 680 685 string val(p, p_end - p); 681 686 wdb->set_metadata(key, val); 682 687 } 688 689 void 690 RemoteServer::msg_addspelling(const string & message) 691 { 692 if (!wdb) 693 throw Xapian::InvalidOperationError("Server is read-only"); 694 const char *p = message.data(); 695 const char *p_end = p + message.size(); 696 Xapian::termcount freq = decode_length(&p, p_end, false); 697 string term = string(p, p_end - p); 698 wdb->add_spelling(term, freq); 699 } 700 701 void 702 RemoteServer::msg_removespelling(const string & message) 703 { 704 if (!wdb) 705 throw Xapian::InvalidOperationError("Server is read-only"); 706 const char *p = message.data(); 707 const char *p_end = p + message.size(); 708 Xapian::termcount freqdec = decode_length(&p, p_end, false); 709 wdb->remove_spelling(string(p, p_end - p), freqdec); 710 } 711 712 void 713 RemoteServer::msg_getspellingsuggestion(const string & message) 714 { 715 const char *p = message.data(); 716 const char *p_end = p + message.size(); 717 size_t dist = decode_length(&p, p_end, false); 718 string word(p, p_end - p); 719 send_message(REPLY_SPELLINGSUGGESTION, db->get_spelling_suggestion(word, 
dist)); 720 } 721 722 void 723 RemoteServer::msg_getspellingfreq(const string & message) 724 { 725 Xapian::doccount freq = 0; 726 for (size_t j = 0; j < db->internal.size(); ++j) 727 freq += db->internal[j]->get_spelling_frequency(message); 728 send_message(REPLY_SPELLINGFREQ, encode_length(freq)); 729 } -
xapian-core/common/remote-database.h
89 89 90 90 void update_stats(message_type msg_code = MSG_UPDATE) const; 91 91 92 // get spelling word or termlist; 93 TermList * receive_spelling_list(bool) const; 94 92 95 protected: 93 96 /** Constructor. The constructor is protected so that raw instances 94 97 * can't be created - a derived class must be instantiated which … … 245 248 string get_metadata(const string & key) const; 246 249 247 250 void set_metadata(const string & key, const string & value); 251 252 string get_spelling_suggestion(const string &word, 253 unsigned max_edit_distance); 254 void add_spelling(const std::string&, Xapian::termcount) const; 255 void remove_spelling(const std::string&, Xapian::termcount freqdec) const; 256 Xapian::doccount get_spelling_frequency(const string & word) const; 257 258 248 259 }; 249 250 260 #endif // XAPIAN_INCLUDED_REMOTE_DATABASE_H -
xapian-core/common/remoteprotocol.h
74 74 MSG_REPLACEDOCUMENTTERM, // Replace Document by term 75 75 MSG_DELETEDOCUMENT, // Delete Document 76 76 MSG_WRITEACCESS, // Upgrade to WritableDatabase 77 MSG_GETMETADATA, // Get metadata 78 MSG_SETMETADATA, // Set metadata 77 MSG_GETMETADATA, // get metadata 78 MSG_SETMETADATA, // set metadata 79 MSG_ADDSPELLING, // add a spelling 80 MSG_REMOVESPELLING, // remove a spelling 81 MSG_GETSPELLINGSUGGESTION, // get a spelling suggestion 82 MSG_GETSPELLINGFREQ, // get spelling frequency 79 83 MSG_GETMSET, // Get MSet 80 84 MSG_SHUTDOWN, // Shutdown 81 85 MSG_MAX … … 103 107 REPLY_VALUE, // Document Value 104 108 REPLY_ADDDOCUMENT, // Add Document 105 109 REPLY_RESULTS, // Results (MSet) 106 REPLY_METADATA, // Metadata 110 REPLY_METADATA, // Metadata 111 REPLY_SPELLINGSUGGESTION, // Get Spelling 112 REPLY_SPELLINGFREQ, // Spelling frequency 107 113 REPLY_MAX 108 114 }; 109 115 -
xapian-core/common/remoteserver.h
148 148 // set metadata 149 149 void msg_setmetadata(const std::string & message); 150 150 151 // add a spelling 152 void msg_addspelling(const std::string & message); 153 154 // remove a spelling 155 void msg_removespelling(const std::string & message); 156 157 // get a spelling suggestion 158 void msg_getspellingsuggestion(const std::string & message); 159 160 // get spelling frequency 161 void msg_getspellingfreq(const std::string & message); 162 151 163 public: 152 164 /** Construct a RemoteServer. 153 165 * -
xapian-core/api/omdatabase.cc
42 42 #include "editdistance.h" 43 43 #include "ortermlist.h" 44 44 #include "noreturn.h" 45 #include "remote-database.h" 45 46 46 47 #include <cstdlib> // For abs(). 47 48 … … 498 499 return "Database()"; 499 500 } 500 501 501 // We sum the character frequency histogram absolute differences to compute a502 // lower bound on the edit distance. Rather than counting each Unicode code503 // point uniquely, we use an array with VEC_SIZE elements and tally code points504 // modulo VEC_SIZE which can only reduce the bound we calculate.505 //506 // There will be a trade-off between how good the bound is and how large and507 // array is used (a larger array takes more time to clear and sum over). The508 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite509 // but that may not reflect real world performance. FIXME: profile and tune.510 511 #define VEC_SIZE 64512 513 static int514 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)515 {516 int vec[VEC_SIZE];517 memset(vec, 0, sizeof(vec));518 vector<unsigned>::const_iterator i;519 for (i = a.begin(); i != a.end(); ++i) {520 ++vec[(*i) % VEC_SIZE];521 }522 for (i = b.begin(); i != b.end(); ++i) {523 --vec[(*i) % VEC_SIZE];524 }525 unsigned int total = 0;526 for (size_t j = 0; j < VEC_SIZE; ++j) {527 total += abs(vec[j]);528 }529 // Each insertion or deletion adds at most 1 to total. Each transposition530 // doesn't change it at all. But each substitution can change it by 2 so531 // we need to divide it by 2. Rounding up is OK, since the odd change must532 // be due to an actual edit.533 return (total + 1) / 2;534 }535 536 502 // Word must have a trigram score at least this close to the best score seen 537 503 // so far. 
538 504 #define TRIGRAM_SCORE_THRESHOLD 2 … … 545 511 word << ", " << max_edit_distance); 546 512 if (word.size() <= 1) return string(); 547 513 AutoPtr<TermList> merger; 514 bool got_remote = false; 548 515 for (size_t i = 0; i < internal.size(); ++i) { 549 TermList * tl = internal[i]->open_spelling_termlist(word); 550 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl); 551 if (tl) { 552 if (merger.get()) { 553 merger.reset(new OrTermList(merger.release(), tl)); 554 } else { 555 merger.reset(tl); 556 } 557 } 516 RemoteDatabase *rdb = internal[i].get()->as_remotedatabase(); 517 if (!rdb) { 518 TermList * tl = internal[i]->open_spelling_termlist(word); 519 LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl); 520 if (tl) { 521 if (merger.get()) { 522 merger.reset(new OrTermList(merger.release(), tl)); 523 } else { 524 merger.reset(tl); 525 } 526 } 527 } else 528 got_remote = true; 558 529 } 559 if (!merger.get()) RETURN(string());560 530 561 // Convert word to UTF-32. 562 #ifdef __SUNPRO_CC 563 vector<unsigned> utf32_word; 564 for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) { 565 utf32_word.push_back(*sunpro_it); 531 if (! (merger.get() or got_remote)) { 532 RETURN(string()); 566 533 } 567 #else 568 // Extra brackets needed to avoid this being misparsed as a function 569 // prototype. 
570 vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator()); 571 #endif 534 535 EditDistance edcomp(word); 572 536 573 vector<unsigned> utf32_term;574 575 537 Xapian::termcount best = 1; 576 538 string result; 577 539 int edist_best = max_edit_distance; 578 540 Xapian::doccount freq_best = 0; 579 while (true) { 580 TermList *ret = merger->next(); 581 if (ret) merger.reset(ret); 541 if (merger.get()) { 542 while (true) { 543 TermList *ret = merger->next(); 544 if (ret) merger.reset(ret); 582 545 583 546 if (merger->at_end()) break; 584 547 585 586 548 string term = merger->get_termname(); 549 Xapian::termcount score = merger->get_wdf(); 587 550 588 589 590 551 LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score); 552 if (score + TRIGRAM_SCORE_THRESHOLD >= best) { 553 if (score > best) best = score; 591 554 592 // There's no point considering a word where the difference 593 // in length is greater than the smallest number of edits we've 594 // found so far. 555 // There's no point considering a word where the difference 556 // in length is greater than the smallest number of edits we've 557 // found so far. 558 559 int edist = edcomp.distance(term, edist_best); 595 560 596 // First check the length of the encoded UTF-8 version of term. 597 // Each UTF-32 character is 1-4 bytes in UTF-8. 598 if (abs(long(term.size()) - long(word.size())) > edist_best * 4) { 599 LOGLINE(SPELLING, "Lengths much too different"); 600 continue; 601 } 561 if (edist > edist_best) continue; 602 562 603 // Now convert to UTF-32, and compare the true lengths more 604 // strictly. 605 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 606 607 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 608 > edist_best) { 609 LOGLINE(SPELLING, "Lengths too different"); 610 continue; 563 LOGLINE(SPELLING, "Edit distance " << edist); 564 // If we have an exact match, return an empty string since there's 565 // no correction required. 
566 if (edist == 0) RETURN(string()); 567 if (edist <= edist_best) { 568 Xapian::doccount freq = 0; 569 for (size_t j = 0; j < internal.size(); ++j) 570 // this includes calls on get_spelling_frequency 571 // for remote databases 572 freq += internal[j]->get_spelling_frequency(term); 573 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 574 if (edist < edist_best || freq > freq_best) { 575 LOGLINE(SPELLING, "Best so far: \"" << term << 576 "\" edist " << edist << " freq " << freq); 577 result = term; 578 edist_best = edist; 579 freq_best = freq; 580 } 581 } 611 582 } 583 } 584 } 612 585 613 if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) { 614 LOGLINE(SPELLING, "Rejected by character frequency test"); 615 continue; 616 } 617 618 int edist = edit_distance_unsigned(&utf32_term[0], 619 int(utf32_term.size()), 620 &utf32_word[0], 621 int(utf32_word.size()), 622 edist_best); 623 LOGLINE(SPELLING, "Edit distance " << edist); 624 // If we have an exact match, return an empty string since there's 625 // no correction required. 626 if (edist == 0) RETURN(string()); 627 628 if (edist <= edist_best) { 629 Xapian::doccount freq = 0; 630 for (size_t j = 0; j < internal.size(); ++j) 631 freq += internal[j]->get_spelling_frequency(term); 632 633 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 634 if (edist < edist_best || freq > freq_best) { 635 LOGLINE(SPELLING, "Best so far: \"" << term << 636 "\" edist " << edist << " freq " << freq); 637 result = term; 638 edist_best = edist; 639 freq_best = freq; 586 // We've got the best we can locally which - if there's no 587 // correction needed - means we won't need to check remote 588 // databases. If we get this far we try corrections from remote 589 // databases.
590 for (size_t i = 0; i < internal.size(); ++i) { 591 RemoteDatabase *rdb = internal[i].get()->as_remotedatabase(); 592 if (rdb) { 593 string rterm = rdb->get_spelling_suggestion(word, edist_best); 594 if (rterm != "") { 595 // FIXME: repetitious of stuff in the loop above - should be factored out. 596 int edist = edcomp.distance(rterm, edist_best); 597 if (edist == 0) RETURN(string()); 598 if (edist <= edist_best) { 599 Xapian::doccount freq = 0; 600 for (size_t j = 0; j < internal.size(); ++j) 601 freq += internal[j]->get_spelling_frequency(rterm); 602 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best); 603 if (edist < edist_best || freq > freq_best) { 604 LOGLINE(SPELLING, "Best so far: \"" << term << 605 "\" edist " << edist << " freq " << freq); 606 result = rterm; 607 edist_best = edist; 608 freq_best = freq; 609 } 640 610 } 641 611 } 642 612 } 643 613 } 644 614 RETURN(result); 645 615 } -
xapian-core/api/editdistance.cc
32 32 #include "editdistance.h" 33 33 34 34 #include "omassert.h" 35 #include "debuglog.h" 35 36 37 #include <xapian/unicode.h> 38 36 39 #include <algorithm> 37 40 #include <cstdlib> 38 41 #include <cstring> 39 42 using namespace std; 40 43 41 44 template<class CHR> … … 213 216 { 214 217 return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance); 215 218 } 219 220 using namespace Xapian; 221 222 EditDistance::EditDistance(const string& word_) 223 #ifdef __SUNPRO_CC 224 { 225 for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) { 226 utf32_word.push_back(*sunpro_it); 227 } 228 } 229 #else 230 231 :word(word_), utf32_word(Utf8Iterator(word_), Utf8Iterator()) {} 232 233 #endif 234 235 236 // We sum the character frequency histogram absolute differences to compute a 237 // lower bound on the edit distance. Rather than counting each Unicode code 238 // point uniquely, we use an array with VEC_SIZE elements and tally code points 239 // modulo VEC_SIZE which can only reduce the bound we calculate. 240 // 241 // There will be a trade-off between how good the bound is and how large and 242 // array is used (a larger array takes more time to clear and sum over). The 243 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite 244 // but that may not reflect real world performance. FIXME: profile and tune. 245 246 #define VEC_SIZE 64 247 248 static int 249 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b) 250 { 251 int vec[VEC_SIZE]; 252 memset(vec, 0, sizeof(vec)); 253 vector<unsigned>::const_iterator i; 254 for (i = a.begin(); i != a.end(); ++i) { 255 ++vec[(*i) % VEC_SIZE]; 256 } 257 for (i = b.begin(); i != b.end(); ++i) { 258 --vec[(*i) % VEC_SIZE]; 259 } 260 unsigned int total = 0; 261 for (size_t j = 0; j < VEC_SIZE; ++j) { 262 total += abs(vec[j]); 263 } 264 // Each insertion or deletion adds at most 1 to total. Each transposition 265 // doesn't change it at all. 
But each substitution can change it by 2 so 266 // we need to divide it by 2. Rounding up is OK, since the odd change must 267 // be due to an actual edit. 268 return (total + 1) / 2; 269 } 270 271 int 272 EditDistance::distance(const string& term,int limit) { 273 vector<unsigned> utf32_term; 274 // we use this to signal that the distance is clearly large and that we're 275 // not going to work it out exactly 276 const int big_distance = 1000; 277 278 // First check the length of the encoded UTF-8 version of term. 279 // Each UTF-32 character is 1-4 bytes in UTF-8. 280 if (abs(long(term.size()) - long(word.size())) > limit * 4) { 281 LOGLINE(SPELLING, "Lengths much too different"); 282 return big_distance; 283 } 284 285 // Now convert to UTF-32, and compare the true lengths more 286 // strictly. 287 utf32_term.assign(Utf8Iterator(term), Utf8Iterator()); 288 289 if (abs(long(utf32_term.size()) - long(utf32_word.size())) 290 > limit) { 291 LOGLINE(SPELLING, "Lengths too different"); 292 return big_distance; 293 } 294 295 if (freq_edit_lower_bound(utf32_term, utf32_word) > limit) { 296 LOGLINE(SPELLING, "Rejected by character frequency test"); 297 return big_distance; 298 } 299 300 return edit_distance_unsigned(&utf32_term[0], 301 int(utf32_term.size()), 302 &utf32_word[0], 303 int(utf32_word.size()), 304 limit); 305 } -
xapian-core/api/editdistance.h
22 22 #ifndef XAPIAN_INCLUDED_EDITDISTANCE_H 23 23 #define XAPIAN_INCLUDED_EDITDISTANCE_H 24 24 25 #include<string> 26 #include<vector> 25 27 /** Calculate the edit distance between two sequences. 26 28 * 27 29 * Edit distance is defined as the minimum number of edit operations … … 49 51 const unsigned* ptr2, int len2, 50 52 int max_distance); 51 53 54 55 class EditDistance { 56 const std::string word; 57 const std::vector<unsigned> utf32_word; 58 59 public: 60 EditDistance(const std::string& word_) ; 61 int distance(const std::string& other, int limit); 62 }; 52 63 #endif // XAPIAN_INCLUDED_EDITDISTANCE_H -
xapian-core/api/registry.cc
194 194 Xapian::MatchSpy * spy; 195 195 spy = new Xapian::ValueCountMatchSpy(); 196 196 matchspies[spy->name()] = spy; 197 197 198 } 198 199 199 200 void -
xapian-core/backends/remote/remote-database.cc
741 741 data += value; 742 742 send_message(MSG_SETMETADATA, data); 743 743 } 744 745 void 746 RemoteDatabase::add_spelling(const std::string & word, 747 Xapian::termcount freqinc) const 748 { 749 string data = encode_length(freqinc); 750 data += word; 751 send_message(MSG_ADDSPELLING, data); 752 } 753 754 void 755 RemoteDatabase::remove_spelling(const std::string & word, 756 Xapian::termcount freqdec) const 757 { 758 string data = encode_length(freqdec); 759 data += word; 760 send_message(MSG_REMOVESPELLING, data); 761 } 762 763 string 764 RemoteDatabase::get_spelling_suggestion(const string & word, 765 unsigned max_edit_distance) 766 { 767 string data = encode_length(max_edit_distance); 768 data += word; 769 send_message(MSG_GETSPELLINGSUGGESTION, data); 770 771 string message; 772 get_message(message, REPLY_SPELLINGSUGGESTION); 773 return message; 774 } 775 776 Xapian::doccount 777 RemoteDatabase::get_spelling_frequency(const string & word) const 778 { 779 send_message(MSG_GETSPELLINGFREQ, word); 780 string message; 781 get_message(message, REPLY_SPELLINGFREQ); 782 const char * p = message.data(); 783 const char * p_end = p + message.size(); 784 return decode_length(&p, p_end, false); 785 }