root / tags / 1.0.8 / xapian-core / backends / flint / flint_spelling.cc

Revision 9689, 14.5 kB (checked in by olly, 14 months ago)

api/maptermlist.h,api/termlist.cc,backends/alltermslist.cc,
backends/flint/flint_spelling.cc,backends/flint/flint_spelling.h,
common/alltermslist.h,common/termlist.h,common/vectortermlist.h:
Provide a default implementation of accumulate_stats() in the
virtual base class TermIterator::Internal instead of repeating it
in each subclass which doesn't get used for generating an ESet, and
don't call abort() in the default implementation - an Assert(false)
is sufficient, and more consistent with how we handle other similar
cases.

Line 
1/** @file flint_spelling.cc
2 * @brief Spelling correction data for a flint database.
3 */
4/* Copyright (C) 2004,2005,2006,2007 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19 */
20
21#include <config.h>
22
23#include <xapian/error.h>
24#include <xapian/types.h>
25
26#include "expandweight.h"
27#include "flint_spelling.h"
28#include "flint_utils.h"
29#include "omassert.h"
30#include "ortermlist.h"
31
32#include <algorithm>
33#include <map>
34#include <queue>
35#include <vector>
36#include <set>
37#include <string>
38
39using namespace std;
40
41// We XOR the length values with this so that they are more likely to coincide
42// with lower case ASCII letters, which are likely to be common.  This means
43// that zlib should do a better job of compressing tag values - in tests, this
44// gave 5% better compression.
45#define MAGIC_XOR_VALUE 96
46
47class PrefixCompressedStringItor {
48    const unsigned char * p;
49    size_t left;
50    string current;
51
52    PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
53                               const string &current_)
54        : p(p_), left(left_), current(current_) { }
55
56  public:
57    PrefixCompressedStringItor(const std::string & s)
58        : p(reinterpret_cast<const unsigned char *>(s.data())),
59          left(s.size()) {
60        if (left) {
61            operator++();
62        } else {
63            p = NULL;
64        }
65    }
66
67    const string & operator*() const {
68        return current;
69    }
70
71    PrefixCompressedStringItor operator++(int) {
72        const unsigned char * old_p = p;
73        size_t old_left = left;
74        string old_current = current;
75        operator++();
76        return PrefixCompressedStringItor(old_p, old_left, old_current);
77    }
78
79    PrefixCompressedStringItor & operator++() {
80        if (left == 0) {
81            p = NULL;
82        } else {
83            if (!current.empty()) {
84                current.resize(*p++ ^ MAGIC_XOR_VALUE);
85                --left;
86            }
87            size_t add;
88            if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
89                throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
90            current.append(reinterpret_cast<const char *>(p + 1), add);
91            p += add + 1;
92            left -= add + 1;
93        }
94        return *this;
95    }
96
97    bool at_end() const {
98        return p == NULL;
99    }
100};
101
102class PrefixCompressedStringWriter {
103    string current;
104    string & out;
105
106  public:
107    PrefixCompressedStringWriter(string & out_) : out(out_) { }
108
109    void append(const string & word) {
110        // If this isn't the first entry, see how much of the previous one
111        // we can reuse.
112        if (!current.empty()) {
113            size_t len = min(current.size(), word.size());
114            size_t i;
115            for (i = 0; i < len; ++i) {
116                if (current[i] != word[i]) break;
117            }
118            out += char(i ^ MAGIC_XOR_VALUE);
119            out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
120            out.append(word.data() + i, word.size() - i);
121        } else {
122            out += char(word.size() ^ MAGIC_XOR_VALUE);
123            out += word;
124        }
125        current = word;
126    }
127};
128
129void
130FlintSpellingTable::merge_changes()
131{
132    map<fragment, set<string> >::const_iterator i;
133    for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
134        string key = i->first;
135        const set<string> & changes = i->second;
136
137        set<string>::const_iterator d = changes.begin();
138        Assert(d != changes.end());
139
140        string updated;
141        string current;
142        PrefixCompressedStringWriter out(updated);
143        if (get_exact_entry(key, current)) {
144            PrefixCompressedStringItor in(current);
145            updated.reserve(current.size()); // FIXME plus some?
146            while (!in.at_end()) {
147                const string & word = *in;
148                if (word < *d) {
149                    out.append(word);
150                    ++in;
151                } else if (word > *d) {
152                    out.append(*d++);
153                    if (d == changes.end()) break;
154                    continue;
155                } else {
156                    // If an existing entry is in the changes list, that means
157                    // we should remove it.
158                    ++in;
159                    ++d;
160                }
161            }
162            if (!in.at_end()) {
163                // FIXME : easy to optimise this to a fix-up and substring copy.
164                while (!in.at_end()) {
165                    out.append(*in++);
166                }
167            }
168        }
169        while (d != changes.end()) {
170            out.append(*d++);
171        }
172        if (!updated.empty()) {
173            add(key, updated);
174        } else {
175            del(key);
176        }
177    }
178    termlist_deltas.clear();
179
180    map<string, Xapian::termcount>::const_iterator j;
181    for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
182        string key = "W" + j->first;
183        if (j->second) {
184            add(key, pack_uint_last(j->second));
185        } else {
186            del(key);
187        }
188    }
189    wordfreq_changes.clear();
190}
191
192void
193FlintSpellingTable::add_fragment(fragment frag, const string & word)
194{
195    map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
196    if (i == termlist_deltas.end()) {
197        i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
198    }
199    i->second.insert(word);
200}
201
202void
203FlintSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
204{
205    if (word.size() <= 1) return;
206
207    map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
208    if (i != wordfreq_changes.end()) {
209        // Word "word" already exists and has been modified.
210        if (i->second) {
211            i->second += freqinc;
212            return;
213        }
214    } else {
215        string key = "W" + word;
216        string data;
217        if (get_exact_entry(key, data)) {
218            // Word "word" already exists, so increment its count.
219            Xapian::termcount freq;
220            const char * p = data.data();
221            if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
222                throw Xapian::DatabaseCorruptError("Bad spelling word freq");
223            }
224            wordfreq_changes[word] = freq + freqinc;
225            return;
226        }
227    }
228
229    // New word - need to create trigrams for it.
230    if (i != wordfreq_changes.end()) {
231        i->second = freqinc;
232    } else {
233        wordfreq_changes[word] = freqinc;
234    }
235
236    fragment buf;
237    // Head:
238    buf[0] = 'H';
239    buf[1] = word[0];
240    buf[2] = word[1];
241    buf[3] = '\0';
242    add_fragment(buf, word);
243
244    // Tail:
245    buf[0] = 'T';
246    buf[1] = word[word.size() - 2];
247    buf[2] = word[word.size() - 1];
248    buf[3] = '\0';
249    add_fragment(buf, word);
250
251    if (word.size() <= 4) {
252        // We also generate 'bookends' for two, three, and four character
253        // terms so we can handle transposition of the middle two characters
254        // of a four character word, substitution or deletion of the middle
255        // character of a three character word, or insertion in the middle of a
256        // two character word.
257        // 'Bookends':
258        buf[0] = 'B';
259        buf[1] = word[0];
260        buf[3] = '\0';
261        add_fragment(buf, word);
262    }
263    if (word.size() > 2) {
264        // Middles:
265        buf[0] = 'M';
266        for (size_t start = 0; start <= word.size() - 3; ++start) {
267            memcpy(buf.data + 1, word.data() + start, 3);
268            add_fragment(buf, word);
269        }
270    }
271}
272
273void
274FlintSpellingTable::remove_fragment(fragment frag, const string & word)
275{
276    map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
277    if (i != termlist_deltas.end()) {
278        i->second.erase(word);
279    }
280}
281
282void
283FlintSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
284{
285    map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
286    if (i != wordfreq_changes.end()) {
287        if (i->second == 0) {
288            // Word has already been deleted.
289            return;
290        }
291        // Word "word" exists and has been modified.
292        if (freqdec < i->second) {
293            i->second -= freqdec;
294            return;
295        }
296    }
297
298    {
299        string key = "W" + word;
300        string data;
301        if (!get_exact_entry(key, data)) {
302            // This word doesn't exist.
303            return;
304        }
305
306        Xapian::termcount freq;
307        const char *p = data.data();
308        if (!unpack_uint_last(&p, p + data.size(), &freq)) {
309            throw Xapian::DatabaseCorruptError("Bad spelling word freq");
310        }
311        if (freqdec < freq) {
312            wordfreq_changes[word] = freq - freqdec;
313            return;
314        }
315    }
316
317    // Mark word as deleted, and remove its fragment entries.
318    wordfreq_changes[word] = 0;
319
320    fragment buf;
321    // Head:
322    buf[0] = 'H';
323    buf[1] = word[0];
324    buf[2] = word[1];
325    buf[3] = '\0';
326    remove_fragment(buf, word);
327
328    // Tail:
329    buf[0] = 'T';
330    buf[1] = word[word.size() - 2];
331    buf[2] = word[word.size() - 1];
332    buf[3] = '\0';
333    remove_fragment(buf, word);
334
335    if (word.size() <= 4) {
336        // 'Bookends':
337        buf[0] = 'B';
338        buf[1] = word[0];
339        buf[3] = '\0';
340        remove_fragment(buf, word);
341    }
342    if (word.size() > 2) {
343        // Middles:
344        buf[0] = 'M';
345        for (size_t start = 0; start <= word.size() - 3; ++start) {
346            memcpy(buf.data + 1, word.data() + start, 3);
347            remove_fragment(buf, word);
348        }
349    }
350}
351
352struct TermListGreaterApproxSize {
353    bool operator()(const TermList *a, const TermList *b) {
354        return a->get_approx_size() > b->get_approx_size();
355    }
356};
357
358TermList *
359FlintSpellingTable::open_termlist(const string & word)
360{
361    if (word.size() <= 1) return NULL;
362
363    // Merge any pending changes to disk, but don't call commit() so they
364    // won't be switched live.
365    if (!wordfreq_changes.empty()) merge_changes();
366
367    // Build a priority queue of TermList objects which returns those of
368    // greatest approximate size first.
369    priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
370    try {
371        string data;
372        fragment buf;
373
374        // Head:
375        buf[0] = 'H';
376        buf[1] = word[0];
377        buf[2] = word[1];
378        if (get_exact_entry(string(buf), data))
379            pq.push(new FlintSpellingTermList(data));
380
381        // Tail:
382        buf[0] = 'T';
383        buf[1] = word[word.size() - 2];
384        buf[2] = word[word.size() - 1];
385        if (get_exact_entry(string(buf), data))
386            pq.push(new FlintSpellingTermList(data));
387
388        if (word.size() <= 4) {
389            // We also generate 'bookends' for two, three, and four character
390            // terms so we can handle transposition of the middle two
391            // characters of a four character word, substitution or deletion of
392            // the middle character of a three character word, or insertion in
393            // the middle of a two character word.
394            buf[0] = 'B';
395            buf[1] = word[0];
396            buf[3] = '\0';
397            if (get_exact_entry(string(buf), data))
398                pq.push(new FlintSpellingTermList(data));
399        }
400        if (word.size() > 2) {
401            // Middles:
402            buf[0] = 'M';
403            for (size_t start = 0; start <= word.size() - 3; ++start) {
404                memcpy(buf.data + 1, word.data() + start, 3);
405                if (get_exact_entry(string(buf), data))
406                    pq.push(new FlintSpellingTermList(data));
407            }
408
409            if (word.size() == 3) {
410                // For three letter words, we generate the two "single
411                // transposition" forms too, so that we can produce good
412                // spelling suggestions.
413                // ABC -> BAC
414                buf[1] = word[1];
415                buf[2] = word[0];
416                if (get_exact_entry(string(buf), data))
417                    pq.push(new FlintSpellingTermList(data));
418                // ABC -> ACB
419                buf[1] = word[0];
420                buf[2] = word[2];
421                buf[3] = word[1];
422                if (get_exact_entry(string(buf), data))
423                    pq.push(new FlintSpellingTermList(data));
424            }
425        } else {
426            Assert(word.size() == 2);
427            // For two letter words, we generate H and T terms for the
428            // transposed form so that we can produce good spelling
429            // suggestions.
430            // AB -> BA
431            buf[0] = 'H';
432            buf[1] = word[1];
433            buf[2] = word[0];
434            if (get_exact_entry(string(buf), data))
435                pq.push(new FlintSpellingTermList(data));
436            buf[0] = 'T';
437            if (get_exact_entry(string(buf), data))
438                pq.push(new FlintSpellingTermList(data));
439        }
440
441        if (pq.empty()) return NULL;
442
443        // Build up an OrTermList tree by combine leaves and/or branches in
444        // pairs.  The tree is balanced by the approximated sizes of the leaf
445        // FlintSpellingTermList objects - the way the tree is built are very
446        // similar to how an optimal Huffman code is often constructed.
447        //
448        // Balancing the tree like this should tend to minimise the amount of
449        // work done.
450        while (pq.size() > 1) {
451            // Build the tree such that left is always >= right so that
452            // OrTermList can rely on this when trying to minimise work.
453            TermList * termlist = pq.top();
454            pq.pop();
455
456            termlist = new OrTermList(pq.top(), termlist);
457            pq.pop();
458            pq.push(termlist);
459        }
460
461        return pq.top();
462    } catch (...) {
463        // Make sure we delete all the TermList objects to avoid leaking
464        // memory.
465        while (!pq.empty()) {
466            delete pq.top();
467            pq.pop();
468        }
469        throw;
470    }
471}
472
473Xapian::doccount
474FlintSpellingTable::get_word_frequency(const string & word) const
475{
476    map<string, Xapian::termcount>::const_iterator i;
477    i = wordfreq_changes.find(word);
478    if (i != wordfreq_changes.end()) {
479        // Modified frequency for word:
480        return i->second;
481    }
482
483    string key = "W" + word;
484    string data;
485    if (get_exact_entry(key, data)) {
486        // Word "word" already exists.
487        Xapian::termcount freq;
488        const char *p = data.data();
489        if (!unpack_uint_last(&p, p + data.size(), &freq)) {
490            throw Xapian::DatabaseCorruptError("Bad spelling word freq");
491        }
492        return freq;
493    }
494
495    return 0;
496}
497
498///////////////////////////////////////////////////////////////////////////
499
500FlintSpellingTermList::~FlintSpellingTermList() { }
501
502Xapian::termcount
503FlintSpellingTermList::get_approx_size() const
504{
505    // This is only used to decide how to build a OR-tree of TermList objects
506    // so we just need to return "sizes" which are ordered roughly correctly.
507    return data.size();
508}
509
510std::string
511FlintSpellingTermList::get_termname() const
512{
513    return current_term;
514}
515
516Xapian::termcount
517FlintSpellingTermList::get_wdf() const
518{
519    return 1;
520}
521
522Xapian::doccount
523FlintSpellingTermList::get_termfreq() const
524{
525    return 1;
526}
527
528Xapian::termcount
529FlintSpellingTermList::get_collection_freq() const
530{
531    return 1;
532}
533
534TermList *
535FlintSpellingTermList::next()
536{
537    if (p == data.size()) {
538        p = 0;
539        data.resize(0);
540        return NULL;
541    }
542    if (!current_term.empty()) {
543        if (p == data.size())
544            throw Xapian::DatabaseCorruptError("Bad spelling termlist");
545        current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
546    }
547    size_t add;
548    if (p == data.size() ||
549        (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
550        throw Xapian::DatabaseCorruptError("Bad spelling termlist");
551    current_term.append(data.data() + p + 1, add);
552    p += add + 1;
553    return NULL;
554}
555
556bool
557FlintSpellingTermList::at_end() const
558{
559    return data.empty();
560}
561
562Xapian::termcount
563FlintSpellingTermList::positionlist_count() const
564{
565    throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_count() not implemented");
566}
567
568Xapian::PositionIterator
569FlintSpellingTermList::positionlist_begin() const
570{
571    throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_begin() not implemented");
572}
Note: See TracBrowser for help on using the browser.