cjk-ngram-applied-to-1.2-branch.patch on Ticket #180 – Attachment – Xapian

xapian-core/queryparser/Makefile.mk

 endif
 noinst_HEADERS +=\
+        queryparser/cjk-tokenizer.h\
         queryparser/queryparser_internal.h\
         queryparser/queryparser_token.h\
         queryparser/termgenerator_internal.h
 …
 endif
 lib_src +=\
+        queryparser/cjk-tokenizer.cc\
         queryparser/queryparser.cc\
         queryparser/queryparser_internal.cc\
         queryparser/termgenerator.cc\

xapian-core/queryparser/cjk-tokenizer.cc

+/** @file cjk-tokenizer.cc
+ * @brief Tokenise CJK text as n-grams
+ */
+/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
+ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
+ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
+ * Copyright (c) 2011 Olly Betts
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <config.h>
+#include "cjk-tokenizer.h"
+#include "omassert.h"
+#include "xapian/unicode.h"
+#include <cstdlib>
+#include <string>
+using namespace std;
+// Maximum number of characters in a generated n-gram; compared against the
+// current token length in CJKTokenIterator::operator++().  Declared const
+// because nothing in this file modifies it.
+static const unsigned NGRAM_SIZE = 2;
+/// Return true if XAPIAN_CJK_NGRAM is present in the environment and non-empty.
+static bool
+cjk_ngram_requested()
+{
+    const char * value = getenv("XAPIAN_CJK_NGRAM");
+    return value != NULL && *value;
+}
+/** Report whether CJK n-gram handling has been requested.
+ *
+ *  The environment is consulted when the function-local static is initialised
+ *  (i.e. on the first call); later calls return the stored answer.
+ */
+bool
+CJK::is_cjk_enabled()
+{
+    static bool enabled = cjk_ngram_requested();
+    return enabled;
+}
+// 2E80..2EFF; CJK Radicals Supplement
+// 3000..303F; CJK Symbols and Punctuation
+// 3040..309F; Hiragana
+// 30A0..30FF; Katakana
+// 3100..312F; Bopomofo
+// 3130..318F; Hangul Compatibility Jamo
+// 3190..319F; Kanbun
+// 31A0..31BF; Bopomofo Extended
+// 31C0..31EF; CJK Strokes
+// 31F0..31FF; Katakana Phonetic Extensions
+// 3200..32FF; Enclosed CJK Letters and Months
+// 3300..33FF; CJK Compatibility
+// 3400..4DBF; CJK Unified Ideographs Extension A
+// 4DC0..4DFF; Yijing Hexagram Symbols
+// 4E00..9FFF; CJK Unified Ideographs
+// A700..A71F; Modifier Tone Letters
+// AC00..D7AF; Hangul Syllables
+// F900..FAFF; CJK Compatibility Ideographs
+// FE30..FE4F; CJK Compatibility Forms
+// FF00..FFEF; Halfwidth and Fullwidth Forms
+// 20000..2A6DF; CJK Unified Ideographs Extension B
+// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
+/** Test whether code point @a p falls in one of the ranges listed above.
+ *
+ *  The adjacent blocks from 3000 to 9FFF are handled as one merged range.
+ */
+bool
+CJK::codepoint_is_cjk(unsigned p)
+{
+    // Inclusive [first, last] pairs in ascending order with no overlaps.
+    static const unsigned cjk_ranges[][2] = {
+        { 0x2E80, 0x2EFF },
+        { 0x3000, 0x9FFF },
+        { 0xA700, 0xA71F },
+        { 0xAC00, 0xD7AF },
+        { 0xF900, 0xFAFF },
+        { 0xFE30, 0xFE4F },
+        { 0xFF00, 0xFFEF },
+        { 0x20000, 0x2A6DF },
+        { 0x2F800, 0x2FA1F }
+    };
+    const unsigned n_ranges = sizeof(cjk_ranges) / sizeof(cjk_ranges[0]);
+    for (unsigned i = 0; i != n_ranges; ++i) {
+        // The table is sorted, so a value below the start of this range lies
+        // in a gap (or before the first range) and can't match a later one.
+        if (p < cjk_ranges[i][0]) return false;
+        if (p <= cjk_ranges[i][1]) return true;
+    }
+    // Beyond the end of the last range.
+    return false;
+}
+/** Collect the run of CJK characters starting at @a it.
+ *
+ *  @param it  Iterator over the text; on return it points at the first
+ *             character which is not CJK, or is at the end.
+ *  @return    The consumed characters re-encoded as UTF-8 (empty if the
+ *             first character isn't CJK or @a it was already at the end).
+ */
+string
+CJK::get_cjk(Xapian::Utf8Iterator &it)
+{
+    const Xapian::Utf8Iterator end;
+    string run;
+    for (; it != end; ++it) {
+        const unsigned ch = *it;
+        if (!codepoint_is_cjk(ch)) break;
+        Xapian::Unicode::append_utf8(run, ch);
+    }
+    return run;
+}
+/** Return the current token, building it on demand.
+ *
+ *  An empty current_token means the iterator has just been constructed or has
+ *  just moved on to a new starting character; in that case the token is
+ *  (re)started as the single character at @a it, with @a p left pointing at
+ *  the character after it.
+ */
+const string &
+CJKTokenIterator::operator*() const
+{
+    // Token already built - nothing to do.
+    if (!current_token.empty()) return current_token;
+
+    Assert(it != Xapian::Utf8Iterator());
+    Xapian::Unicode::append_utf8(current_token, *it);
+    // p tracks the next character which could extend the token.
+    p = it;
+    ++p;
+    len = 1;
+    return current_token;
+}
+/** Advance to the next token.
+ *
+ *  While the current token is shorter than NGRAM_SIZE and there is more text,
+ *  the token is lengthened by one character.  Otherwise the starting position
+ *  moves on one character and the token is emptied so that operator*() will
+ *  rebuild it.
+ */
+CJKTokenIterator &
+CJKTokenIterator::operator++()
+{
+    const bool can_extend = (len < NGRAM_SIZE && p != Xapian::Utf8Iterator());
+    if (!can_extend) {
+        // Finished with this starting character; move on to the next one.
+        Assert(it != Xapian::Utf8Iterator());
+        ++it;
+        current_token.resize(0);
+        return *this;
+    }
+
+    // Grow the current token by the character at p.
+    Xapian::Unicode::append_utf8(current_token, *p);
+    ++p;
+    ++len;
+    return *this;
+}

xapian-core/queryparser/queryparser.lemony

 // Include the list of token values lemon generates.
 #include "queryparser_token.h"
+#include "cjk-tokenizer.h"
 #include <algorithm>
 #include <limits>
 #include <list>
 …
+    }
 };
+class Terms;
 /** Class used to pass information about a token from lexer to parser.
+ *
  *  Generally an instance of this class carries term information, but it can be
 …
      */
     Query * as_partial_query(State * state_) const;
+    /** Build a query for a string of CJK characters. */
+    Query * as_cjk_query() const;
+    /** Handle a CJK character string in a positional context. */
+    void as_positional_cjk_term(Terms * terms) const;
     /// Value range query.
     Query as_value_range_query() const;
 …
     return q;
+}
+// Build an OP_AND query over every (n-gram token, prefix) combination for
+// this term's text.  The Term object deletes itself before returning, so the
+// caller must not use it afterwards; the returned Query is allocated with new.
+Query *
+Term::as_cjk_query() const
+{
+    typedef list<string>::const_iterator prefix_iterator;
+    const list<string> & prefix_list = prefix_info->prefixes;
+    const CJKTokenIterator no_more_tokens;
+    vector<Query> subqueries;
+    for (CJKTokenIterator token(name); token != no_more_tokens; ++token) {
+        // Emit the current token once for each prefix.
+        prefix_iterator pfx = prefix_list.begin();
+        while (pfx != prefix_list.end()) {
+            subqueries.push_back(Query(*pfx + *token, 1, pos));
+            ++pfx;
+        }
+    }
+    Query * combined = new Query(Query::OP_AND,
+                                 subqueries.begin(), subqueries.end());
+    delete this;
+    return combined;
+}
 Query
 Term::as_value_range_query() const
+{
 …
 string
 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
+                                  bool cjk_ngram, bool & is_cjk_term,
                                   bool &was_acronym)
+{
     string term;
 …
+    }
     was_acronym = !term.empty();
+    if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
+        term = CJK::get_cjk(it);
+        is_cjk_term = true;
+    }
     if (term.empty()) {
         unsigned prevch = *it;
         Unicode::append_utf8(term, prevch);
         while (++it != end) {
+            if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
             unsigned ch = *it;
             if (!is_wordchar(ch)) {
                 // Treat a single embedded '&' or "'" or similar as a word
 …
 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
                                    const string &default_prefix)
+{
+    bool cjk_ngram = CJK::is_cjk_enabled();
     // Set value_ranges if we may have to handle value ranges in the query.
     bool value_ranges;
     value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos);
 …
 phrased_term:
         bool was_acronym;
-        string term = parse_term(it, end, was_acronym);
+        bool is_cjk_term = false;
+        string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
         // Boolean operators.
         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
 …
             Term * term_obj = new Term(&state, term, prefix_info,
                                        unstemmed_term, stem_term, term_pos++);
+            if (is_cjk_term) {
+                Parse(pParser, CJKTERM, term_obj, &state);
+                if (it == end) break;
+                continue;
+            }
             if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
                 if (it != end) {
                     if ((flags & FLAG_WILDCARD) && *it == '*') {
 …
+    }
 };
+// Add this term's text to a phrase one character at a time: each character
+// becomes its own Term (sharing this term's prefix, stemming settings and
+// position) appended to @a terms.  The Term object deletes itself at the end,
+// so the caller must not use it afterwards.
+void
+Term::as_positional_cjk_term(Terms * terms) const
+{
+    for (Utf8Iterator ch(name); ch != Utf8Iterator(); ++ch) {
+        // UTF-8 encoding of just this one character.
+        string single;
+        Unicode::append_utf8(single, *ch);
+        Term * char_term = new Term(state, single, prefix_info, unstemmed,
+                                    stem, pos);
+        terms->add_positional_term(char_term);
+    }
+    // FIXME: we want to add the n-grams as filters too for efficiency.
+    delete this;
+}
 // Helper macro for converting a boolean operation into a Xapian::Query.
 #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
     do {\
 …
     delete U;
+}
+// A run of CJK characters outside a phrase becomes the query built by
+// Term::as_cjk_query(), which also deletes the Term object U.
+compound_term(T) ::= CJKTERM(U). {
+    { T = U->as_cjk_query(); }
+}
 // phrase - The "inside the quotes" part of a double-quoted phrase.
 %type phrase {Terms *}
 …
     P->add_positional_term(T);
+}
+// A phrase which starts with a run of CJK characters: create the Terms object
+// and let Term::as_positional_cjk_term() add the characters to it (that call
+// also deletes the Term object T).
+phrase(P) ::= CJKTERM(T). {
+    P = new Terms;
+    T->as_positional_cjk_term(P);
+}
 phrase(P) ::= phrase(Q) TERM(T). {
     P = Q;
     P->add_positional_term(T);
+}
+// A run of CJK characters continuing an existing phrase: reuse the Terms
+// object Q and let Term::as_positional_cjk_term() append the characters to it
+// (that call also deletes the Term object T).
+phrase(P) ::= phrase(Q) CJKTERM(T). {
+    P = Q;
+    T->as_positional_cjk_term(P);
+}
 // phrased_term - A phrased term works like a single term, but is actually
 // 2 or more terms linked together into a phrase by punctuation.  There must be
 // at least 2 terms in order to be able to have punctuation between the terms!

xapian-core/queryparser/queryparser_internal.h

 /* queryparser_internal.h: The non-lemon-generated parts of the QueryParser
  * class.
+ *
- * Copyright (C) 2005,2006,2007,2010 Olly Betts
+ * Copyright (C) 2005,2006,2007,2010,2011 Olly Betts
  * Copyright (C) 2010 Adam Sjøgren
+ *
  * This program is free software; you can redistribute it and/or
 …
                     filter_type type);
     std::string parse_term(Utf8Iterator &it, const Utf8Iterator &end,
+                           bool cjk_ngram, bool &is_cjk_term,
                            bool &was_acronym);
   public:

xapian-core/queryparser/cjk-tokenizer.h

+/** @file cjk-tokenizer.h
+ * @brief Tokenise CJK text as n-grams
+ */
+/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
+ * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
+ * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
+ * Copyright (c) 2011 Olly Betts
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H
+#define XAPIAN_INCLUDED_CJK_TOKENIZER_H
+#include "xapian/unicode.h"
+#include <string>
+namespace CJK {
+/** Should we use the CJK n-gram code?
+ *
+ *  The first time this is called it reads the environment variable
+ *  XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value.
+ *  The result is cached, so subsequent calls return the same value.
+ */
+bool is_cjk_enabled();
+/** Test whether a Unicode code point is in one of the ranges which the
+ *  n-gram code treats as CJK.
+ */
+bool codepoint_is_cjk(unsigned codepoint);
+/** Extract the run of consecutive CJK characters starting at @a it.
+ *
+ *  @a it is advanced past the characters extracted.  The result is empty if
+ *  @a it is at the end or doesn't point at a CJK character.
+ */
+std::string get_cjk(Xapian::Utf8Iterator &it);
+}
+/** Iterator over the n-gram tokens of a string of CJK characters.
+ *
+ *  For each starting character this yields the character on its own and then
+ *  progressively longer tokens starting there (up to the n-gram size set in
+ *  cjk-tokenizer.cc), before moving to the next starting character.  A
+ *  default-constructed object acts as the end iterator.
+ *
+ *  The token is built lazily by operator*(), which is why the token state is
+ *  mutable.
+ *
+ *  NOTE(review): presumably the string or iterator passed in must stay valid
+ *  while this object is in use - confirm against Xapian::Utf8Iterator.
+ */
+class CJKTokenIterator {
+    /// Position of the first character of the current token.
+    Xapian::Utf8Iterator it;
+    /// Position of the next character which could extend the current token.
+    mutable Xapian::Utf8Iterator p;
+    /** Length of the current token in Unicode characters.
+     *
+     *  Initialised to 0 by every constructor so that get_length() and
+     *  operator++() never read an indeterminate value if they are called
+     *  before the first operator*().
+     */
+    mutable unsigned len;
+    /// The current token; empty means operator*() needs to (re)build it.
+    mutable std::string current_token;
+  public:
+    /// Iterate over the tokens of the UTF-8 string @a s.
+    CJKTokenIterator(const std::string & s)
+        : it(s), len(0) { }
+    /// Iterate over the tokens of the text starting at @a it_.
+    CJKTokenIterator(const Xapian::Utf8Iterator & it_)
+        : it(it_), len(0) { }
+    /// Create an end iterator.
+    CJKTokenIterator()
+        : it(), len(0) { }
+    /// Get the current token (must not be called on an end iterator).
+    const std::string & operator*() const;
+    /// Advance to the next token.
+    CJKTokenIterator & operator++();
+    /// Get the length of the current token in Unicode characters.
+    unsigned get_length() const { return len; }
+    friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &);
+};
+/// Compare two CJKTokenIterator objects by their underlying text position
+/// only (the token-building state is not considered).
+inline bool
+operator==(const CJKTokenIterator & a, const CJKTokenIterator & b)
+{
+    // We only really care about comparisons where one or other is an end
+    // iterator.
+    return a.it == b.it;
+}
+/// Inequality for CJKTokenIterator, defined as the negation of operator==.
+inline bool
+operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b)
+{
+    return !(a == b);
+}
+#endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H

xapian-core/queryparser/termgenerator_internal.cc

 /** @file termgenerator_internal.cc
  * @brief TermGenerator class internals
  */
-/* Copyright (C) 2007,2010 Olly Betts
+/* Copyright (C) 2007,2010,2011 Olly Betts
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
 …
 #include <limits>
 #include <string>
+#include "cjk-tokenizer.h"
 using namespace std;
 namespace Xapian {
 …
 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
                                     const string & prefix, bool with_positions)
+{
+    bool cjk_ngram = CJK::is_cjk_enabled();
     int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
     if (!stopper) stop_mode = STOPWORDS_NONE;
 …
+        }
         while (true) {
+            if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) {
+                const string & cjk = CJK::get_cjk(itor);
+                for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
+                    const string & cjk_token = *tk;
+                    if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
+                    if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token))
+                        continue;
+                    if (with_positions && tk.get_length() == 1) {
+                        doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc);
+                    } else {
+                        doc.add_term(prefix + cjk_token, wdf_inc);
+                    }
+                    if ((flags & FLAG_SPELLING) && prefix.empty())
+                        db.add_spelling(cjk_token);
+                    if (!stemmer.internal.get()) continue;
+                    if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY &&
+                        (*stopper)(cjk_token))
+                        continue;
+                    // Note, this uses the lowercased term, but that's OK as we
+                    // only want to avoid stemming terms starting with a digit.
+                    if (!should_stem(cjk_token)) continue;
+                    // Add stemmed form without positional information.
+                    string stem("Z");
+                    stem += prefix;
+                    stem += stemmer(cjk_token);
+                    doc.add_term(stem, wdf_inc);
+                }
+                while (true) {
+                    if (itor == Utf8Iterator()) return;
+                    ch = check_wordchar(*itor);
+                    if (ch) break;
+                    ++itor;
+                }
+            }
             unsigned prevch;
             do {
                 Unicode::append_utf8(term, ch);
                 prevch = ch;
-                if (++itor == Utf8Iterator()) goto endofterm;
+                if (++itor == Utf8Iterator() ||
+                    (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
+                    goto endofterm;
                 ch = check_wordchar(*itor);
             } while (ch);

xapian-core/tests/termgentest.cc

 #include "testutils.h"
 #include "utils.h"
+#include <stdlib.h> // For setenv() or putenv()
 using namespace std;
 #define TESTCASE(S) {#S, test_##S}
 …
       "Z\xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80:1 \xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80[1]" },
     { "", "fish+chips", "Zchip:1 Zfish:1 chips[2] fish[1]" },
+    // Basic CJK tests:
+    { "stem=", "久有归天", "久[1] 久有:1 天[4] 归[3] 归天:1 有[2] 有归:1" },
+    { "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" },
+    { "", "ウルス アップ", "ア[4] ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" },
+    // CJK with prefix:
+    { "prefix=XA", "发送从", "XA从[3] XA发[1] XA发送:1 XA送[2] XA送从:1" },
+    { "prefix=XA", "点卡思考", "XA卡[2] XA卡思:1 XA思[3] XA思考:1 XA点[1] XA点卡:1 XA考[4]" },
+    // CJK mixed with non-CJK:
+    { "prefix=", "インtestタ", "test[3] イ[1] イン:1 タ[4] ン[2]" },
+    { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" },
     // All following tests are for things which we probably don't really want to
     // behave as they currently do, but we haven't found a sufficiently general
     // way to implement them yet.
     // Test number like things
     { "", "11:59", "11[1] 59[2]" },
+    { "stem=en", "11:59", "11[1] 59[2]" },
     { "", "11:59am", "11[1] 59am[2]" },
     { NULL, NULL, NULL }
 …
 int main(int argc, char **argv)
 try {
+    // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
+#ifdef __WIN32__
+    _putenv_s("XAPIAN_CJK_NGRAM", "1");
+#elif defined HAVE_SETENV
+    setenv("XAPIAN_CJK_NGRAM", "1", 1);
+#else
+    putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
+#endif
     test_driver::parse_command_line(argc, argv);
     return test_driver::run(tests);
 } catch (const char * e) {

xapian-core/tests/queryparsertest.cc

 #include <string>
 #include <vector>
+#include <stdlib.h> // For setenv() or putenv()
 using namespace std;
 #define TESTCASE(S) {#S, test_##S}
 …
     { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"},
     { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
     { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
+    // Some CJK tests.
+    { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" },
+    { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" },
+    { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" },
+    { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" },
+    { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" },
+    { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" },
+    // FIXME: These should really filter by bigrams to accelerate:
+    { "\"久有归\"", "(久:(pos=1) PHRASE 3 有:(pos=1) PHRASE 3 归:(pos=1))" },
+    { "\"久有test归\"", "(久:(pos=1) PHRASE 4 有:(pos=1) PHRASE 4 test:(pos=2) PHRASE 4 归:(pos=3))" },
+    // FIXME: this should work: { "久 NEAR 有", "(久:(pos=1) NEAR 11 有:(pos=2))" },
     { NULL, NULL }
 };
 …
     // Add coverage for other cases similar to the above.
     { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
     { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
+    // Some CJK tests.
+    { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" },
+    { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" },
     { NULL, NULL }
 };
 …
     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))");
     qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A");
     TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))");
+    qobj = qp.parse_query("英国 title:文森hello", 0, "A");
+    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))");
     return true;
+}
 …
 int main(int argc, char **argv)
 try {
+    // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
+#ifdef __WIN32__
+    _putenv_s("XAPIAN_CJK_NGRAM", "1");
+#elif defined HAVE_SETENV
+    setenv("XAPIAN_CJK_NGRAM", "1", 1);
+#else
+    putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
+#endif
     test_driver::parse_command_line(argc, argv);
     return test_driver::run(tests);
 } catch (const char * e) {

xapian-core/ChangeLog

+Wed Aug 24 14:25:21 GMT 2011  Olly Betts <olly@survex.com>
+        * Backport change from trunk:
+        * queryparser/queryparser.lemony: Fix memory leak (caught by existing
+          testcase queryparser1 when run under valgrind).
+Wed Aug 24 14:13:24 GMT 2011  Olly Betts <olly@survex.com>
+        * Backport change from trunk:
+        * queryparser/,tests/queryparsertest.cc,tests/termgentest.cc: Add
+          support for indexing and searching CJK text using n-grams.  Currently
+          this is only enabled if environmental variable XAPIAN_CJK_NGRAM is
+          set to a non-empty value.
 Wed Aug 10 06:09:39 GMT 2011  Olly Betts <olly@survex.com>
         * NEWS: Finalise 1.2.7.

Context Navigation

Ticket #180: cjk-ngram-applied-to-1.2-branch.patch

xapian-core/queryparser/Makefile.mk

xapian-core/queryparser/cjk-tokenizer.cc

xapian-core/queryparser/queryparser.lemony

xapian-core/queryparser/queryparser_internal.h

xapian-core/queryparser/cjk-tokenizer.h

xapian-core/queryparser/termgenerator_internal.cc

xapian-core/tests/termgentest.cc

xapian-core/tests/queryparsertest.cc

xapian-core/ChangeLog

Download in other formats: