Ticket #178: remote-spell-remainder-updated.patch

File remote-spell-remainder-updated.patch, 21.7 KB (added by Bruno Deferrari, 13 years ago)

Updated remote spelling suggestions patch.

  • xapian-core/api/editdistance.cc

    diff --git a/xapian-core/api/editdistance.cc b/xapian-core/api/editdistance.cc
    index 2365877..ca35b98 100644
    a b  
    3232#include "editdistance.h"
    3333
    3434#include "omassert.h"
     35#include "debuglog.h"
     36
     37#include <xapian/unicode.h>
    3538
    3639#include <algorithm>
    3740#include <cstdlib>
    38 
     41#include <cstring>
    3942using namespace std;
    4043
    4144template<class CHR>
    edit_distance_unsigned(const unsigned * ptr1, int len1,  
    213216{
    214217    return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance);
    215218}
     219
     220using namespace Xapian;
     221
     222EditDistance::EditDistance(const string& word_)
     223#if ! defined __SUNPRO_CC || __SUNPRO_CC - 0 >= 0x580
     224  // Extra brackets needed to avoid this being misparsed as a function
     225  // prototype.
     226  :word(word_), utf32_word(Utf8Iterator(word_), Utf8Iterator()) {}
     227# else
     228 {
     229     // Older versions of Sun's C++ compiler need this workaround, but 5.8
     230     // doesn't.  Unsure of the exact version it was fixed in.
     231     for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) {
     232         utf32_word.push_back(*sunpro_it);
     233     }
     234 }
     235#endif
     236
     237
     238// We sum the character frequency histogram absolute differences to compute a
     239// lower bound on the edit distance.  Rather than counting each Unicode code
     240// point uniquely, we use an array with VEC_SIZE elements and tally code points
     241// modulo VEC_SIZE which can only reduce the bound we calculate.
     242//
     243// There will be a trade-off between how good the bound is and how large and
     244// array is used (a larger array takes more time to clear and sum over).  The
     245// value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
     246// but that may not reflect real world performance.  FIXME: profile and tune.
     247
     248#define VEC_SIZE 64
     249
     250static int
     251freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)
     252{
     253    int vec[VEC_SIZE];
     254    memset(vec, 0, sizeof(vec));
     255    vector<unsigned>::const_iterator i;
     256    for (i = a.begin(); i != a.end(); ++i) {
     257        ++vec[(*i) % VEC_SIZE];
     258    }
     259    for (i = b.begin(); i != b.end(); ++i) {
     260        --vec[(*i) % VEC_SIZE];
     261    }
     262    unsigned int total = 0;
     263    for (size_t j = 0; j < VEC_SIZE; ++j) {
     264        total += abs(vec[j]);
     265    }
     266    // Each insertion or deletion adds at most 1 to total.  Each transposition
     267    // doesn't change it at all.  But each substitution can change it by 2 so
     268    // we need to divide it by 2.  Rounding up is OK, since the odd change must
     269    // be due to an actual edit.
     270    return (total + 1) / 2;
     271}
     272
     273int
     274EditDistance::distance(const string& term,int limit) {
     275    vector<unsigned> utf32_term;
     276    // we use this to signal that the distance is clearly large and that we're
     277    // not going to work it out exactly
     278    const int big_distance = 1000;
     279
     280    // First check the length of the encoded UTF-8 version of term.
     281    // Each UTF-32 character is 1-4 bytes in UTF-8.
     282    if (abs(long(term.size()) - long(word.size())) > limit * 4) {
     283        LOGLINE(SPELLING, "Lengths much too different");
     284        return big_distance;
     285    }
     286
     287    // Now convert to UTF-32, and compare the true lengths more
     288    // strictly.
     289    utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
     290
     291    if (abs(long(utf32_term.size()) - long(utf32_word.size()))
     292        > limit) {
     293        LOGLINE(SPELLING, "Lengths too different");
     294        return big_distance;
     295    }
     296
     297    if (freq_edit_lower_bound(utf32_term, utf32_word) > limit) {
     298        LOGLINE(SPELLING, "Rejected by character frequency test");
     299        return big_distance;
     300    }
     301
     302    return edit_distance_unsigned(&utf32_term[0],
     303                                  int(utf32_term.size()),
     304                                  &utf32_word[0],
     305                                  int(utf32_word.size()),
     306                                  limit);
     307}
  • xapian-core/api/editdistance.h

    diff --git a/xapian-core/api/editdistance.h b/xapian-core/api/editdistance.h
    index 4ab7824..553e564 100644
    a b  
    2222#ifndef XAPIAN_INCLUDED_EDITDISTANCE_H
    2323#define XAPIAN_INCLUDED_EDITDISTANCE_H
    2424
     25#include<string>
     26#include<vector>
    2527/** Calculate the edit distance between two sequences.
    2628 *
    2729 *  Edit distance is defined as the minimum number of edit operations
    int edit_distance_unsigned(const unsigned* ptr1, int len1,  
    4951                           const unsigned* ptr2, int len2,
    5052                           int max_distance);
    5153
     54
     55class EditDistance {
     56    const std::string word;
     57    const std::vector<unsigned> utf32_word;
     58
     59 public:
     60    EditDistance(const std::string& word_) ;
     61    int distance(const std::string& other, int limit);
     62};
    5263#endif // XAPIAN_INCLUDED_EDITDISTANCE_H
  • xapian-core/api/omdatabase.cc

    diff --git a/xapian-core/api/omdatabase.cc b/xapian-core/api/omdatabase.cc
    index ed80043..d51a074 100644
    a b  
    4242#include "editdistance.h"
    4343#include "ortermlist.h"
    4444#include "noreturn.h"
     45#include "remote-database.h"
    4546
    4647#include <cstdlib> // For abs().
    4748
    Database::get_description() const  
    497498    return "Database()";
    498499}
    499500
    500 // We sum the character frequency histogram absolute differences to compute a
    501 // lower bound on the edit distance.  Rather than counting each Unicode code
    502 // point uniquely, we use an array with VEC_SIZE elements and tally code points
    503 // modulo VEC_SIZE which can only reduce the bound we calculate.
    504 //
    505 // There will be a trade-off between how good the bound is and how large and
    506 // array is used (a larger array takes more time to clear and sum over).  The
    507 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
    508 // but that may not reflect real world performance.  FIXME: profile and tune.
    509 
    510 #define VEC_SIZE 64
    511 
    512 static int
    513 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)
    514 {
    515     int vec[VEC_SIZE];
    516     memset(vec, 0, sizeof(vec));
    517     vector<unsigned>::const_iterator i;
    518     for (i = a.begin(); i != a.end(); ++i) {
    519         ++vec[(*i) % VEC_SIZE];
    520     }
    521     for (i = b.begin(); i != b.end(); ++i) {
    522         --vec[(*i) % VEC_SIZE];
    523     }
    524     unsigned int total = 0;
    525     for (size_t j = 0; j < VEC_SIZE; ++j) {
    526         total += abs(vec[j]);
    527     }
    528     // Each insertion or deletion adds at most 1 to total.  Each transposition
    529     // doesn't change it at all.  But each substitution can change it by 2 so
    530     // we need to divide it by 2.  Rounding up is OK, since the odd change must
    531     // be due to an actual edit.
    532     return (total + 1) / 2;
    533 }
    534 
    535501// Word must have a trigram score at least this close to the best score seen
    536502// so far.
    537503#define TRIGRAM_SCORE_THRESHOLD 2
    Database::get_spelling_suggestion(const string &word,  
    543509    LOGCALL(API, string, "Database::get_spelling_suggestion", word | max_edit_distance);
    544510    if (word.size() <= 1) return string();
    545511    AutoPtr<TermList> merger;
     512    bool got_remote = false;
    546513    for (size_t i = 0; i < internal.size(); ++i) {
    547         TermList * tl = internal[i]->open_spelling_termlist(word);
    548         LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl);
    549         if (tl) {
    550             if (merger.get()) {
    551                 merger.reset(new OrTermList(merger.release(), tl));
    552             } else {
    553                 merger.reset(tl);
    554             }
    555         }
    556     }
    557     if (!merger.get()) RETURN(string());
    558 
    559     // Convert word to UTF-32.
    560 #if ! defined __SUNPRO_CC || __SUNPRO_CC - 0 >= 0x580
    561     // Extra brackets needed to avoid this being misparsed as a function
    562     // prototype.
    563     vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator());
    564 #else
    565     // Older versions of Sun's C++ compiler need this workaround, but 5.8
    566     // doesn't.  Unsure of the exact version it was fixed in.
    567     vector<unsigned> utf32_word;
    568     for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) {
    569         utf32_word.push_back(*sunpro_it);
    570     }
    571 #endif
    572 
    573     vector<unsigned> utf32_term;
     514        RemoteDatabase *rdb = internal[i].get()->as_remotedatabase();
     515        if (!rdb) {
     516            TermList * tl = internal[i]->open_spelling_termlist(word);
     517            LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl);
     518            if (tl) {
     519                if (merger.get()) {
     520                    merger.reset(new OrTermList(merger.release(), tl));
     521                } else {
     522                    merger.reset(tl);
     523                }
     524            }
     525        } else {
     526            got_remote = true;
     527        }
     528    }
     529
     530    if (! (merger.get() || got_remote)) {
     531        RETURN(string());
     532    }
     533     
     534    EditDistance edcomp(word);
    574535
    575536    Xapian::termcount best = 1;
    576537    string result;
    577538    int edist_best = max_edit_distance;
    578539    Xapian::doccount freq_best = 0;
    579540    Xapian::doccount freq_exact = 0;
    580     while (true) {
    581         TermList *ret = merger->next();
    582         if (ret) merger.reset(ret);
    583 
    584         if (merger->at_end()) break;
    585 
    586         string term = merger->get_termname();
    587         Xapian::termcount score = merger->get_wdf();
    588 
    589         LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score);
    590         if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
    591             if (score > best) best = score;
    592 
    593             // There's no point considering a word where the difference
    594             // in length is greater than the smallest number of edits we've
    595             // found so far.
    596 
    597             // First check the length of the encoded UTF-8 version of term.
    598             // Each UTF-32 character is 1-4 bytes in UTF-8.
    599             if (abs(long(term.size()) - long(word.size())) > edist_best * 4) {
    600                 LOGLINE(SPELLING, "Lengths much too different");
    601                 continue;
    602             }
    603 
    604             // Now convert to UTF-32, and compare the true lengths more
    605             // strictly.
    606             utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
    607 
    608             if (abs(long(utf32_term.size()) - long(utf32_word.size()))
    609                     > edist_best) {
    610                 LOGLINE(SPELLING, "Lengths too different");
    611                 continue;
    612             }
    613 
    614             if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) {
    615                 LOGLINE(SPELLING, "Rejected by character frequency test");
    616                 continue;
    617             }
    618 
    619             int edist = edit_distance_unsigned(&utf32_term[0],
    620                                                int(utf32_term.size()),
    621                                                &utf32_word[0],
    622                                                int(utf32_word.size()),
    623                                                edist_best);
    624             LOGLINE(SPELLING, "Edit distance " << edist);
    625 
    626             if (edist <= edist_best) {
    627                 Xapian::doccount freq = 0;
    628                 for (size_t j = 0; j < internal.size(); ++j)
    629                     freq += internal[j]->get_spelling_frequency(term);
    630 
    631                 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
    632                 // Even if we have an exact match, there may be a much more
    633                 // frequent potential correction which will still be
    634                 // interesting.
    635                 if (edist == 0) {
    636                     freq_exact = freq;
    637                     continue;
    638                 }
    639 
    640                 if (edist < edist_best || freq > freq_best) {
    641                     LOGLINE(SPELLING, "Best so far: \"" << term <<
    642                                       "\" edist " << edist << " freq " << freq);
    643                     result = term;
    644                     edist_best = edist;
    645                     freq_best = freq;
    646                 }
    647             }
     541    if (merger.get()) {
     542        while (true) {
     543            TermList *ret = merger->next();
     544            if (ret) merger.reset(ret);
     545
     546            if (merger->at_end()) break;
     547
     548             string term = merger->get_termname();
     549             Xapian::termcount score = merger->get_wdf();
     550
     551             LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score);
     552             if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
     553                 if (score > best) best = score;
     554
     555                 // There's no point considering a word where the difference
     556                 // in length is greater than the smallest number of edits we've
     557                 // found so far.
     558
     559                 int edist = edcomp.distance(term, edist_best);
     560   
     561                 if (edist > edist_best) continue;
     562
     563                 LOGLINE(SPELLING, "Edit distance " << edist);
     564                 // If we have an exact match, return an empty string since there's
     565                 // no correction required.
     566                 if (edist == 0) RETURN(string());
     567                 if (edist <= edist_best) {
     568                     Xapian::doccount freq = 0;
     569                     for (size_t j = 0; j < internal.size(); ++j) {
     570                         // this includes calls on get_spelling_frequency
     571                         // for remote databases
     572                         freq += internal[j]->get_spelling_frequency(term);
     573                     }
     574                     LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
     575                     // Even if we have an exact match, there may be a much more
     576                     // frequent potential correction which will still be
     577                     // interesting.
     578                     if (edist == 0) {
     579                         freq_exact = freq;
     580                         continue;
     581                     }
     582
     583                     if (edist < edist_best || freq > freq_best) {
     584                         LOGLINE(SPELLING, "Best so far: \"" << term <<
     585                                 "\" edist " << edist << " freq " << freq);
     586                         result = term;
     587                         edist_best = edist;
     588                         freq_best = freq;
     589                     }
     590                 }
     591             }
    648592        }
    649593    }
     594
     595    // we've got the best we can locally which - it there's no
     596    // correction need means we won't need to check remote
     597    // databases. If we get this far we try corrections from remote
     598    // databases.
     599    for (size_t i = 0; i < internal.size(); ++i) {
     600        RemoteDatabase *rdb = internal[i].get()->as_remotedatabase();
     601        if (rdb) {
     602            string rterm = rdb->get_spelling_suggestion(word, edist_best);
     603            if (rterm != "") {
     604                // FIXME: repetitious of stuff in the loop above - should be factored out.
     605                int edist = edcomp.distance(rterm, edist_best);
     606                if (edist == 0) RETURN(string());
     607                if (edist <= edist_best) {
     608                    Xapian::doccount freq = 0;
     609                    for (size_t j = 0; j < internal.size(); ++j)
     610                        freq += internal[j]->get_spelling_frequency(rterm);
     611                    LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
     612                    if (edist < edist_best || freq > freq_best) {
     613                        LOGLINE(SPELLING, "Best so far: \"" << term <<
     614                                "\" edist " << edist << " freq " << freq);
     615                        result = rterm;
     616                        edist_best = edist;
     617                        freq_best = freq;
     618                    }
     619                }
     620            }
     621        }
     622    }
     623
    650624    if (freq_best < freq_exact)
    651625        RETURN(string());
    652626    RETURN(result);
  • xapian-core/backends/remote/remote-database.cc

    diff --git a/xapian-core/backends/remote/remote-database.cc b/xapian-core/backends/remote/remote-database.cc
    index 0aa3a52..15fd584 100644
    a b RemoteDatabase::remove_spelling(const string & word,  
    785785    data += word;
    786786    send_message(MSG_REMOVESPELLING, data);
    787787}
     788
     789string
     790RemoteDatabase::get_spelling_suggestion(const string & word,
     791                                        unsigned max_edit_distance)
     792{
     793    string data = encode_length(max_edit_distance);
     794    data += word;
     795    send_message(MSG_GETSPELLINGSUGGESTION, data);
     796
     797    string message;
     798    get_message(message, REPLY_SPELLINGSUGGESTION);
     799    return message;
     800}
     801
     802Xapian::doccount
     803RemoteDatabase::get_spelling_frequency(const string & word) const
     804{
     805    send_message(MSG_GETSPELLINGFREQ, word);
     806    string message;
     807    get_message(message, REPLY_SPELLINGFREQ);
     808    const char * p = message.data();
     809    const char * p_end = p + message.size();
     810    return decode_length(&p, p_end, false);
     811}
  • xapian-core/common/remote-database.h

    diff --git a/xapian-core/common/remote-database.h b/xapian-core/common/remote-database.h
    index fc698ce..c9231f9 100644
    a b class RemoteDatabase : public Xapian::Database::Internal {  
    251251    void add_spelling(const std::string&, Xapian::termcount) const;
    252252
    253253    void remove_spelling(const std::string&,  Xapian::termcount freqdec) const;
     254
     255    std::string get_spelling_suggestion(const std::string &word,
     256                                        unsigned max_edit_distance);
     257
     258    Xapian::doccount get_spelling_frequency(const std::string & word) const;
    254259};
    255260
    256261#endif // XAPIAN_INCLUDED_REMOTE_DATABASE_H
  • xapian-core/common/remoteprotocol.h

    diff --git a/xapian-core/common/remoteprotocol.h b/xapian-core/common/remoteprotocol.h
    index 843761f..d3f46e5 100644
    a b  
    4545// 34: 1.1.4 Support for metadata over with remote databases.
    4646// 35: 1.1.5 Support for add_spelling() and remove_spelling().
    4747// 35.1: 1.2.4 Support for metadata_keys_begin().
     48// 35.2: 1.2.6 Support for spelling suggestions.
    4849#define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 35
    49 #define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 1
     50#define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 2
    5051
    5152/** Message types (client -> server).
    5253 *
    enum message_type {  
    8384    MSG_GETMSET,                // Get MSet
    8485    MSG_SHUTDOWN,               // Shutdown
    8586    MSG_METADATAKEYLIST,        // Iterator for metadata keys
     87    MSG_GETSPELLINGSUGGESTION,  // Get a spelling suggestion
     88    MSG_GETSPELLINGFREQ,        // Get spelling frequency
    8689    MSG_MAX
    8790};
    8891
    enum reply_type {  
    110113    REPLY_RESULTS,              // Results (MSet)
    111114    REPLY_METADATA,             // Metadata
    112115    REPLY_METADATAKEYLIST,      // Iterator for metadata keys
     116    REPLY_SPELLINGSUGGESTION,   // Get Spelling Suggestion
     117    REPLY_SPELLINGFREQ,         // Spelling frequency
    113118    REPLY_MAX
    114119};
    115120
  • xapian-core/common/remoteserver.h

    diff --git a/xapian-core/common/remoteserver.h b/xapian-core/common/remoteserver.h
    index bff341f..25007c1 100644
    a b class XAPIAN_VISIBILITY_DEFAULT RemoteServer : private RemoteConnection {  
    164164    // remove a spelling
    165165    void msg_removespelling(const std::string & message);
    166166
     167    // get spellings
     168    void msg_getspellingsuggestion(const std::string & message);
     169
     170    // get spelling frequency
     171    void msg_getspellingfreq(const std::string & message);
     172
    167173  public:
    168174    /** Construct a RemoteServer.
    169175     *
  • xapian-core/net/remoteserver.cc

    diff --git a/xapian-core/net/remoteserver.cc b/xapian-core/net/remoteserver.cc
    index 3ee79f9..3ae3a86 100644
    a b  
    4040#include "serialise-double.h"
    4141#include "str.h"
    4242#include "weightinternal.h"
     43#include "database.h" // so we can use db->internal in msg_getspellingfreq
    4344
    4445/// Class to throw when we receive the connection closing message.
    4546struct ConnectionClosed { };
    RemoteServer::run()  
    186187                0, // MSG_GETMSET - used during a conversation.
    187188                0, // MSG_SHUTDOWN - handled by get_message().
    188189                &RemoteServer::msg_openmetadatakeylist,
     190                &RemoteServer::msg_getspellingsuggestion,
     191                &RemoteServer::msg_getspellingfreq
    189192            };
    190193
    191194            string message;
    RemoteServer::msg_removespelling(const string & message)  
    707710    Xapian::termcount freqdec = decode_length(&p, p_end, false);
    708711    wdb->remove_spelling(string(p, p_end - p), freqdec);
    709712}
     713
     714void
     715RemoteServer::msg_getspellingsuggestion(const string & message)
     716{
     717    const char *p = message.data();
     718    const char *p_end = p + message.size();
     719    size_t dist = decode_length(&p, p_end, false);
     720    string word(p, p_end - p);
     721    send_message(REPLY_SPELLINGSUGGESTION, db->get_spelling_suggestion(word, dist));
     722}
     723
     724void
     725RemoteServer::msg_getspellingfreq(const string & message)
     726{
     727    Xapian::doccount freq = 0;
     728    for (size_t j = 0; j < db->internal.size(); ++j)
     729        freq += db->internal[j]->get_spelling_frequency(message);
     730    send_message(REPLY_SPELLINGFREQ, encode_length(freq));
     731}
  • xapian-core/tests/api_spelling.cc

    diff --git a/xapian-core/tests/api_spelling.cc b/xapian-core/tests/api_spelling.cc
    index c0865d8..5c2af73 100644
    a b DEFINE_TESTCASE(spell2, spelling) {  
    182182}
    183183
    184184// Test spelling correction with multi databases
    185 DEFINE_TESTCASE(spell3, spelling) {
     185DEFINE_TESTCASE(spell3, spelling && !remote) {
     186    // Spelling iterator not implemented for remote databases.
    186187    Xapian::WritableDatabase db1 = get_writable_database();
    187188    // We can't just call get_writable_database() since it would delete db1
    188189    // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
  • xapian-core/tests/harness/testrunner.cc

    diff --git a/xapian-core/tests/harness/testrunner.cc b/xapian-core/tests/harness/testrunner.cc
    index 946178f..0befd5a 100644
    a b static BackendProperties backend_properties[] = {  
    6262    { "multi_brass", "backend,positional,valuestats,multi" },
    6363    { "multi_chert", "backend,positional,valuestats,multi" },
    6464    { "multi_flint", "backend,positional,multi" },
    65     { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    66     { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    67     { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    68     { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    69     { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata" },
    70     { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata" },
     65    { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     66    { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     67    { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     68    { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     69    { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata,spelling" },
     70    { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata,spelling" },
    7171    { NULL, NULL }
    7272};
    7373