Ticket #178: remote-spell-remainder.patch

File remote-spell-remainder.patch, 20.4 KB (added by Olly Betts, 14 years ago)

Updated patch with unapplied changes

  • xapian-core/tests/api_spelling.cc

     
    182182}
    183183
    184184// Test spelling correction with multi databases
    185 DEFINE_TESTCASE(spell3, spelling) {
     185DEFINE_TESTCASE(spell3, spelling && !remote) {
     186    // Spelling iterator not implemented for remote databases.
    186187    Xapian::WritableDatabase db1 = get_writable_database();
    187188    // We can't just call get_writable_database() since it would delete db1
    188189    // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
  • xapian-core/tests/harness/testrunner.cc

     
    6262    { "multi_brass", "backend,positional,valuestats,multi" },
    6363    { "multi_chert", "backend,positional,valuestats,multi" },
    6464    { "multi_flint", "backend,positional,multi" },
    65     { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    66     { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    67     { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    68     { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata" },
    69     { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata" },
    70     { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata" },
     65    { "remoteprog_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     66    { "remotetcp_brass", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     67    { "remoteprog_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     68    { "remotetcp_chert", "backend,remote,transactions,positional,valuestats,writable,metadata,spelling" },
     69    { "remoteprog_flint", "backend,remote,transactions,positional,writable,metadata,spelling" },
     70    { "remotetcp_flint", "backend,remote,transactions,positional,writable,metadata,spelling" },
    7171    { NULL, NULL }
    7272};
    7373
  • xapian-core/net/remoteserver.cc

     
    4040#include "serialise-double.h"
    4141#include "utils.h"
    4242#include "weightinternal.h"
     43#include "database.h" // so we can use internal in msg_openspellingtermlist
    4344
    4445/// Class to throw when we receive the connection closing message.
    4546struct ConnectionClosed { };
     
    189190                &RemoteServer::msg_setmetadata,
    190191                &RemoteServer::msg_addspelling,
    191192                &RemoteServer::msg_removespelling,
     193                &RemoteServer::msg_getspellingsuggestion,
     194                &RemoteServer::msg_getspellingfreq
    192195                // MSG_GETMSET - used during a conversation.
    193196                // MSG_SHUTDOWN - handled by get_message().
    194197            };
     
    704707    Xapian::termcount freqdec = decode_length(&p, p_end, false);
    705708    wdb->remove_spelling(string(p, p_end - p), freqdec);
    706709}
     710
     711void
     712RemoteServer::msg_getspellingsuggestion(const string & message)
     713{
     714    const char *p = message.data();
     715    const char *p_end = p + message.size();
     716    size_t dist = decode_length(&p, p_end, false);
     717    string word(p, p_end - p);
     718    send_message(REPLY_SPELLINGSUGGESTION, db->get_spelling_suggestion(word, dist));
     719}
     720
     721void
     722RemoteServer::msg_getspellingfreq(const string & message)
     723{
     724    Xapian::doccount freq = 0;
     725    for (size_t j = 0; j < db->internal.size(); ++j)
     726        freq += db->internal[j]->get_spelling_frequency(message);
     727    send_message(REPLY_SPELLINGFREQ, encode_length(freq));
     728}
  • xapian-core/common/remote-database.h

     
    249249    void add_spelling(const std::string&, Xapian::termcount) const;
    250250
    251251    void remove_spelling(const std::string&,  Xapian::termcount freqdec) const;
     252
     253    std::string get_spelling_suggestion(const std::string &word,
     254                                        unsigned max_edit_distance);
     255
     256    Xapian::doccount get_spelling_frequency(const std::string & word) const;
    252257};
    253258
    254259#endif // XAPIAN_INCLUDED_REMOTE_DATABASE_H
  • xapian-core/common/remoteprotocol.h

     
    4444// 33: 1.1.3 Support for passing matchspies over the remote connection.
    4545// 34: 1.1.4 Support for metadata over with remote databases.
    4646// 35: 1.1.5 Support for add_spelling() and remove_spelling().
    47 #define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 35
     47// 36: ????? Full remote spelling support.
     48#define XAPIAN_REMOTE_PROTOCOL_MAJOR_VERSION 36
    4849#define XAPIAN_REMOTE_PROTOCOL_MINOR_VERSION 0
    4950
    5051/** Message types (client -> server).
     
    7980    MSG_SETMETADATA,            // Set metadata
    8081    MSG_ADDSPELLING,            // Add a spelling
    8182    MSG_REMOVESPELLING,         // Remove a spelling
     83    MSG_GETSPELLINGSUGGESTION,  // Get a spelling suggestion
     84    MSG_GETSPELLINGFREQ,        // Get spelling frequency
    8285    MSG_GETMSET,                // Get MSet
    8386    MSG_SHUTDOWN,               // Shutdown
    8487    MSG_MAX
     
    107110    REPLY_ADDDOCUMENT,          // Add Document
    108111    REPLY_RESULTS,              // Results (MSet)
    109112    REPLY_METADATA,             // Metadata
     113    REPLY_SPELLINGSUGGESTION,   // Get Spelling Suggestion
     114    REPLY_SPELLINGFREQ,         // Spelling frequency
    110115    REPLY_MAX
    111116};
    112117
  • xapian-core/common/remoteserver.h

     
    154154    // remove a spelling
    155155    void msg_removespelling(const std::string & message);
    156156
     157    // get a spelling suggestion
     158    void msg_getspellingsuggestion(const std::string & message);
     159
     160    // get spelling frequency
     161    void msg_getspellingfreq(const std::string & message);
     162
    157163  public:
    158164    /** Construct a RemoteServer.
    159165     *
  • xapian-core/backends/remote/remote-database.cc

     
    759759    data += word;
    760760    send_message(MSG_REMOVESPELLING, data);
    761761}
     762
     763string
     764RemoteDatabase::get_spelling_suggestion(const string & word,
     765                                        unsigned max_edit_distance)
     766{
     767    string data = encode_length(max_edit_distance);
     768    data += word;
     769    send_message(MSG_GETSPELLINGSUGGESTION, data);
     770
     771    string message;
     772    get_message(message, REPLY_SPELLINGSUGGESTION);
     773    return message;
     774}
     775
     776Xapian::doccount
     777RemoteDatabase::get_spelling_frequency(const string & word) const
     778{
     779    send_message(MSG_GETSPELLINGFREQ, word);
     780    string message;
     781    get_message(message, REPLY_SPELLINGFREQ);
     782    const char * p = message.data();
     783    const char * p_end = p + message.size();
     784    return decode_length(&p, p_end, false);
     785}
  • xapian-core/api/omdatabase.cc

     
    4242#include "editdistance.h"
    4343#include "ortermlist.h"
    4444#include "noreturn.h"
     45#include "remote-database.h"
    4546
    4647#include <cstdlib> // For abs().
    4748
     
    498499    return "Database()";
    499500}
    500501
    501 // We sum the character frequency histogram absolute differences to compute a
    502 // lower bound on the edit distance.  Rather than counting each Unicode code
    503 // point uniquely, we use an array with VEC_SIZE elements and tally code points
    504 // modulo VEC_SIZE which can only reduce the bound we calculate.
    505 //
    506 // There will be a trade-off between how good the bound is and how large and
    507 // array is used (a larger array takes more time to clear and sum over).  The
    508 // value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
    509 // but that may not reflect real world performance.  FIXME: profile and tune.
    510 
    511 #define VEC_SIZE 64
    512 
    513 static int
    514 freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)
    515 {
    516     int vec[VEC_SIZE];
    517     memset(vec, 0, sizeof(vec));
    518     vector<unsigned>::const_iterator i;
    519     for (i = a.begin(); i != a.end(); ++i) {
    520         ++vec[(*i) % VEC_SIZE];
    521     }
    522     for (i = b.begin(); i != b.end(); ++i) {
    523         --vec[(*i) % VEC_SIZE];
    524     }
    525     unsigned int total = 0;
    526     for (size_t j = 0; j < VEC_SIZE; ++j) {
    527         total += abs(vec[j]);
    528     }
    529     // Each insertion or deletion adds at most 1 to total.  Each transposition
    530     // doesn't change it at all.  But each substitution can change it by 2 so
    531     // we need to divide it by 2.  Rounding up is OK, since the odd change must
    532     // be due to an actual edit.
    533     return (total + 1) / 2;
    534 }
    535 
    536502// Word must have a trigram score at least this close to the best score seen
    537503// so far.
    538504#define TRIGRAM_SCORE_THRESHOLD 2
     
    545511                 word << ", " << max_edit_distance);
    546512    if (word.size() <= 1) return string();
    547513    AutoPtr<TermList> merger;
     514    bool got_remote = false;
    548515    for (size_t i = 0; i < internal.size(); ++i) {
    549         TermList * tl = internal[i]->open_spelling_termlist(word);
    550         LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl);
    551         if (tl) {
    552             if (merger.get()) {
    553                 merger.reset(new OrTermList(merger.release(), tl));
    554             } else {
    555                 merger.reset(tl);
    556             }
    557         }
     516        RemoteDatabase *rdb = internal[i].get()->as_remotedatabase();
     517        if (!rdb) {
     518            TermList * tl = internal[i]->open_spelling_termlist(word);
     519            LOGLINE(SPELLING, "Sub db " << i << " tl = " << (void*)tl);
     520            if (tl) {
     521                if (merger.get()) {
     522                    merger.reset(new OrTermList(merger.release(), tl));
     523                } else {
     524                    merger.reset(tl);
     525                }
     526            }
     527        } else
     528            got_remote = true;
    558529    }
    559     if (!merger.get()) RETURN(string());
    560530
    561     // Convert word to UTF-32.
    562 #ifdef __SUNPRO_CC
    563     vector<unsigned> utf32_word;
    564     for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) {
    565         utf32_word.push_back(*sunpro_it);
     531    if (! (merger.get() or got_remote)) {
     532      RETURN(string());
    566533    }
    567 #else
    568     // Extra brackets needed to avoid this being misparsed as a function
    569     // prototype.
    570     vector<unsigned> utf32_word((Utf8Iterator(word)), Utf8Iterator());
    571 #endif
     534   
     535    EditDistance edcomp(word);
    572536
    573     vector<unsigned> utf32_term;
    574 
    575537    Xapian::termcount best = 1;
    576538    string result;
    577539    int edist_best = max_edit_distance;
    578540    Xapian::doccount freq_best = 0;
    579     while (true) {
    580         TermList *ret = merger->next();
    581         if (ret) merger.reset(ret);
     541    if (merger.get()) {
     542        while (true) {
     543            TermList *ret = merger->next();
     544            if (ret) merger.reset(ret);
    582545
    583         if (merger->at_end()) break;
     546            if (merger->at_end()) break;
    584547
    585         string term = merger->get_termname();
    586         Xapian::termcount score = merger->get_wdf();
     548            string term = merger->get_termname();
     549            Xapian::termcount score = merger->get_wdf();
    587550
    588         LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score);
    589         if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
    590             if (score > best) best = score;
     551            LOGLINE(SPELLING, "Term \"" << term << "\" ngram score " << score);
     552            if (score + TRIGRAM_SCORE_THRESHOLD >= best) {
     553                if (score > best) best = score;
    591554
    592             // There's no point considering a word where the difference
    593             // in length is greater than the smallest number of edits we've
    594             // found so far.
     555                // There's no point considering a word where the difference
     556                // in length is greater than the smallest number of edits we've
     557                // found so far.
     558           
     559                int edist = edcomp.distance(term, edist_best);
    595560
    596             // First check the length of the encoded UTF-8 version of term.
    597             // Each UTF-32 character is 1-4 bytes in UTF-8.
    598             if (abs(long(term.size()) - long(word.size())) > edist_best * 4) {
    599                 LOGLINE(SPELLING, "Lengths much too different");
    600                 continue;
    601             }
     561                if (edist > edist_best) continue;
    602562
    603             // Now convert to UTF-32, and compare the true lengths more
    604             // strictly.
    605             utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
    606 
    607             if (abs(long(utf32_term.size()) - long(utf32_word.size()))
    608                     > edist_best) {
    609                 LOGLINE(SPELLING, "Lengths too different");
    610                 continue;
     563                LOGLINE(SPELLING, "Edit distance " << edist);
     564                // If we have an exact match, return an empty string since there's
     565                // no correction required.
     566                if (edist == 0) RETURN(string());
     567                if (edist <= edist_best) {
     568                    Xapian::doccount freq = 0;
     569                    for (size_t j = 0; j < internal.size(); ++j)
     570                        // this includes calls on get_spelling_frequency
     571                        // for remote databases
     572                        freq += internal[j]->get_spelling_frequency(term);
     573                    LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
     574                    if (edist < edist_best || freq > freq_best) {
     575                        LOGLINE(SPELLING, "Best so far: \"" << term <<
     576                                "\" edist " << edist << " freq " << freq);
     577                        result = term;
     578                        edist_best = edist;
     579                        freq_best = freq;
     580                    }
     581                }
    611582            }
     583        }
     584    }
    612585
    613             if (freq_edit_lower_bound(utf32_term, utf32_word) > edist_best) {
    614                 LOGLINE(SPELLING, "Rejected by character frequency test");
    615                 continue;
    616             }
    617 
    618             int edist = edit_distance_unsigned(&utf32_term[0],
    619                                                int(utf32_term.size()),
    620                                                &utf32_word[0],
    621                                                int(utf32_word.size()),
    622                                                edist_best);
    623             LOGLINE(SPELLING, "Edit distance " << edist);
    624             // If we have an exact match, return an empty string since there's
    625             // no correction required.
    626             if (edist == 0) RETURN(string());
    627 
    628             if (edist <= edist_best) {
    629                 Xapian::doccount freq = 0;
    630                 for (size_t j = 0; j < internal.size(); ++j)
    631                     freq += internal[j]->get_spelling_frequency(term);
    632 
    633                 LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
    634                 if (edist < edist_best || freq > freq_best) {
    635                     LOGLINE(SPELLING, "Best so far: \"" << term <<
    636                                       "\" edist " << edist << " freq " << freq);
    637                     result = term;
    638                     edist_best = edist;
    639                     freq_best = freq;
     586    // We now have the best suggestion available locally.  If a local
     587    // database held an exact match we have already returned (no correction
     588    // is needed), so the remote databases were never consulted.  Having got
     589    // this far, ask each remote database for its own suggestion.
     590    for (size_t i = 0; i < internal.size(); ++i) {
     591        RemoteDatabase *rdb = internal[i].get()->as_remotedatabase();
     592        if (rdb) {
     593            string rterm = rdb->get_spelling_suggestion(word, edist_best);
     594            if (rterm != "") {
     595                // FIXME: repetitious of stuff in the loop above - should be factored out.
     596                int edist = edcomp.distance(rterm, edist_best);
     597                if (edist == 0) RETURN(string());
     598                if (edist <= edist_best) {
     599                    Xapian::doccount freq = 0;
     600                    for (size_t j = 0; j < internal.size(); ++j)
     601                        freq += internal[j]->get_spelling_frequency(rterm);
     602                    LOGLINE(SPELLING, "Freq " << freq << " best " << freq_best);
     603                    if (edist < edist_best || freq > freq_best) {
     604                        LOGLINE(SPELLING, "Best so far: \"" << term <<
     605                                "\" edist " << edist << " freq " << freq);
     606                        result = rterm;
     607                        edist_best = edist;
     608                        freq_best = freq;
     609                    }
    640610                }
    641611            }
    642612        }
    643613    }
    644614    RETURN(result);
    645615}
  • xapian-core/api/editdistance.cc

     
    3232#include "editdistance.h"
    3333
    3434#include "omassert.h"
     35#include "debuglog.h"
    3536
     37#include <xapian/unicode.h>
     38
    3639#include <algorithm>
    3740#include <cstdlib>
    38 
     41#include <cstring>
    3942using namespace std;
    4043
    4144template<class CHR>
     
    213216{
    214217    return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance);
    215218}
     219
     220using namespace Xapian;
     221
     222EditDistance::EditDistance(const string& word_)
     223#ifdef __SUNPRO_CC
     224 {
     225     for (Utf8Iterator sunpro_it(word); sunpro_it != Utf8Iterator(); ++sunpro_it) {
     226         utf32_word.push_back(*sunpro_it);
     227     }
     228 }
     229#else
     230 
     231  :word(word_), utf32_word(Utf8Iterator(word_), Utf8Iterator()) {}
     232
     233#endif
     234
     235
     236// We sum the character frequency histogram absolute differences to compute a
     237// lower bound on the edit distance.  Rather than counting each Unicode code
     238// point uniquely, we use an array with VEC_SIZE elements and tally code points
     239// modulo VEC_SIZE which can only reduce the bound we calculate.
     240//
     241// There will be a trade-off between how good the bound is and how large an
     242// array is used (a larger array takes more time to clear and sum over).  The
     243// value 64 is somewhat arbitrary - it works as well as 128 for the testsuite
     244// but that may not reflect real world performance.  FIXME: profile and tune.
     245
     246#define VEC_SIZE 64
     247
     248static int
     249freq_edit_lower_bound(const vector<unsigned> & a, const vector<unsigned> & b)
     250{
     251    int vec[VEC_SIZE];
     252    memset(vec, 0, sizeof(vec));
     253    vector<unsigned>::const_iterator i;
     254    for (i = a.begin(); i != a.end(); ++i) {
     255        ++vec[(*i) % VEC_SIZE];
     256    }
     257    for (i = b.begin(); i != b.end(); ++i) {
     258        --vec[(*i) % VEC_SIZE];
     259    }
     260    unsigned int total = 0;
     261    for (size_t j = 0; j < VEC_SIZE; ++j) {
     262        total += abs(vec[j]);
     263    }
     264    // Each insertion or deletion adds at most 1 to total.  Each transposition
     265    // doesn't change it at all.  But each substitution can change it by 2 so
     266    // we need to divide it by 2.  Rounding up is OK, since the odd change must
     267    // be due to an actual edit.
     268    return (total + 1) / 2;
     269}
     270
     271int
     272EditDistance::distance(const string& term,int limit) {
     273    vector<unsigned> utf32_term;
     274    // we use this to signal that the distance is clearly large and that we're
     275    // not going to work it out exactly
     276    const int big_distance = 1000;
     277
     278    // First check the length of the encoded UTF-8 version of term.
     279    // Each UTF-32 character is 1-4 bytes in UTF-8.
     280    if (abs(long(term.size()) - long(word.size())) > limit * 4) {
     281        LOGLINE(SPELLING, "Lengths much too different");
     282        return big_distance;
     283    }
     284
     285    // Now convert to UTF-32, and compare the true lengths more
     286    // strictly.
     287    utf32_term.assign(Utf8Iterator(term), Utf8Iterator());
     288
     289    if (abs(long(utf32_term.size()) - long(utf32_word.size()))
     290        > limit) {
     291        LOGLINE(SPELLING, "Lengths too different");
     292        return big_distance;
     293    }
     294
     295    if (freq_edit_lower_bound(utf32_term, utf32_word) > limit) {
     296        LOGLINE(SPELLING, "Rejected by character frequency test");
     297        return big_distance;
     298    }
     299
     300    return edit_distance_unsigned(&utf32_term[0],
     301                                  int(utf32_term.size()),
     302                                  &utf32_word[0],
     303                                  int(utf32_word.size()),
     304                                  limit);
     305}
  • xapian-core/api/editdistance.h

     
    2222#ifndef XAPIAN_INCLUDED_EDITDISTANCE_H
    2323#define XAPIAN_INCLUDED_EDITDISTANCE_H
    2424
     25#include<string>
     26#include<vector>
    2527/** Calculate the edit distance between two sequences.
    2628 *
    2729 *  Edit distance is defined as the minimum number of edit operations
     
    4951                           const unsigned* ptr2, int len2,
    5052                           int max_distance);
    5153
     54
     55class EditDistance {
     56    const std::string word;
     57    const std::vector<unsigned> utf32_word;
     58
     59 public:
     60    EditDistance(const std::string& word_) ;
     61    int distance(const std::string& other, int limit);
     62};
    5263#endif // XAPIAN_INCLUDED_EDITDISTANCE_H