Ticket #180: cjk-ngram-applied-to-1.2-branch.patch

File cjk-ngram-applied-to-1.2-branch.patch, 23.0 KB (added by Olly Betts, 13 years ago)

Combined patch applied to 1.2 branch

  • xapian-core/queryparser/Makefile.mk

     
    55endif
    66
    77noinst_HEADERS +=\
     8        queryparser/cjk-tokenizer.h\
    89        queryparser/queryparser_internal.h\
    910        queryparser/queryparser_token.h\
    1011        queryparser/termgenerator_internal.h
     
    5758endif
    5859
    5960lib_src +=\
     61        queryparser/cjk-tokenizer.cc\
    6062        queryparser/queryparser.cc\
    6163        queryparser/queryparser_internal.cc\
    6264        queryparser/termgenerator.cc\
  • xapian-core/queryparser/cjk-tokenizer.cc

     
     1/** @file cjk-tokenizer.cc
     2 * @brief Tokenise CJK text as n-grams
     3 */
     4/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
     5 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
     6 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
     7 * Copyright (c) 2011 Olly Betts
     8 *
     9 * Permission is hereby granted, free of charge, to any person obtaining a copy
     10 * of this software and associated documentation files (the "Software"), to deal
     11 * in the Software without restriction, including without limitation the
     12 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
     13 * sell copies of the Software, and to permit persons to whom the Software is
     14 * furnished to do so, subject to the following conditions:
     15 *
     16 * The above copyright notice and this permission notice shall be included in
     17 * all copies or substantial portions of the Software.
     18 *
     19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     25 * IN THE SOFTWARE.
     26 */
     27
     28#include <config.h>
     29
     30#include "cjk-tokenizer.h"
     31
     32#include "omassert.h"
     33#include "xapian/unicode.h"
     34
     35#include <cstdlib>
     36#include <string>
     37
     38using namespace std;
     39
     40static unsigned NGRAM_SIZE = 2;
     41
     42bool
     43CJK::is_cjk_enabled()
     44{
     45    const char * p;
     46    static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p);
     47    return result;
     48}
     49
     50// 2E80..2EFF; CJK Radicals Supplement
     51// 3000..303F; CJK Symbols and Punctuation
     52// 3040..309F; Hiragana
     53// 30A0..30FF; Katakana
     54// 3100..312F; Bopomofo
     55// 3130..318F; Hangul Compatibility Jamo
     56// 3190..319F; Kanbun
     57// 31A0..31BF; Bopomofo Extended
     58// 31C0..31EF; CJK Strokes
     59// 31F0..31FF; Katakana Phonetic Extensions
     60// 3200..32FF; Enclosed CJK Letters and Months
     61// 3300..33FF; CJK Compatibility
     62// 3400..4DBF; CJK Unified Ideographs Extension A
     63// 4DC0..4DFF; Yijing Hexagram Symbols
     64// 4E00..9FFF; CJK Unified Ideographs
     65// A700..A71F; Modifier Tone Letters
     66// AC00..D7AF; Hangul Syllables
     67// F900..FAFF; CJK Compatibility Ideographs
     68// FE30..FE4F; CJK Compatibility Forms
     69// FF00..FFEF; Halfwidth and Fullwidth Forms
     70// 20000..2A6DF; CJK Unified Ideographs Extension B
     71// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
     72bool
     73CJK::codepoint_is_cjk(unsigned p)
     74{
     75    if (p < 0x2E80) return false;
     76    return ((p >= 0x2E80 && p <= 0x2EFF) ||
     77            (p >= 0x3000 && p <= 0x9FFF) ||
     78            (p >= 0xA700 && p <= 0xA71F) ||
     79            (p >= 0xAC00 && p <= 0xD7AF) ||
     80            (p >= 0xF900 && p <= 0xFAFF) ||
     81            (p >= 0xFE30 && p <= 0xFE4F) ||
     82            (p >= 0xFF00 && p <= 0xFFEF) ||
     83            (p >= 0x20000 && p <= 0x2A6DF) ||
     84            (p >= 0x2F800 && p <= 0x2FA1F));
     85}
     86
     87string
     88CJK::get_cjk(Xapian::Utf8Iterator &it)
     89{
     90    string str;
     91    while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) {
     92        Xapian::Unicode::append_utf8(str, *it);
     93        ++it;
     94    }
     95    return str;
     96}
     97
     98const string &
     99CJKTokenIterator::operator*() const
     100{
     101    if (current_token.empty()) {
     102        Assert(it != Xapian::Utf8Iterator());
     103        p = it;
     104        Xapian::Unicode::append_utf8(current_token, *p);
     105        ++p;
     106        len = 1;
     107    }
     108    return current_token;
     109}
     110
     111CJKTokenIterator &
     112CJKTokenIterator::operator++()
     113{
     114    if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) {
     115        Xapian::Unicode::append_utf8(current_token, *p);
     116        ++p;
     117        ++len;
     118    } else {
     119        Assert(it != Xapian::Utf8Iterator());
     120        ++it;
     121        current_token.resize(0);
     122    }
     123    return *this;
     124}
  • xapian-core/queryparser/queryparser.lemony

     
    3333// Include the list of token values lemon generates.
    3434#include "queryparser_token.h"
    3535
     36#include "cjk-tokenizer.h"
     37
    3638#include <algorithm>
    3739#include <limits>
    3840#include <list>
     
    136138    }
    137139};
    138140
     141class Terms;
     142
    139143/** Class used to pass information about a token from lexer to parser.
    140144 *
    141145 *  Generally an instance of this class carries term information, but it can be
     
    192196     */
    193197    Query * as_partial_query(State * state_) const;
    194198
     199    /** Build a query for a string of CJK characters. */
     200    Query * as_cjk_query() const;
     201
     202    /** Handle a CJK character string in a positional context. */
     203    void as_positional_cjk_term(Terms * terms) const;
     204
    195205    /// Value range query.
    196206    Query as_value_range_query() const;
    197207
     
    430440    return q;
    431441}
    432442
     443Query *
     444Term::as_cjk_query() const
     445{
     446    vector<Query> prefix_cjk;
     447    const list<string> & prefixes = prefix_info->prefixes;
     448    list<string>::const_iterator piter;
     449    for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
     450        for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
     451            string cjk = *piter;
     452            cjk += *tk;
     453            prefix_cjk.push_back(Query(cjk, 1, pos));
     454        }
     455    }
     456    Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
     457    delete this;
     458    return q;
     459}
     460
    433461Query
    434462Term::as_value_range_query() const
    435463{
     
    537565
    538566string
    539567QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
     568                                  bool cjk_ngram, bool & is_cjk_term,
    540569                                  bool &was_acronym)
    541570{
    542571    string term;
     
    562591    }
    563592    was_acronym = !term.empty();
    564593
     594    if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
     595        term = CJK::get_cjk(it);
     596        is_cjk_term = true;
     597    }
     598
    565599    if (term.empty()) {
    566600        unsigned prevch = *it;
    567601        Unicode::append_utf8(term, prevch);
    568602        while (++it != end) {
     603            if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
    569604            unsigned ch = *it;
    570605            if (!is_wordchar(ch)) {
    571606                // Treat a single embedded '&' or "'" or similar as a word
     
    634669QueryParser::Internal::parse_query(const string &qs, unsigned flags,
    635670                                   const string &default_prefix)
    636671{
     672    bool cjk_ngram = CJK::is_cjk_enabled();
     673
    637674    // Set value_ranges if we may have to handle value ranges in the query.
    638675    bool value_ranges;
    639676    value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos);
     
    9751012
    9761013phrased_term:
    9771014        bool was_acronym;
    978         string term = parse_term(it, end, was_acronym);
     1015        bool is_cjk_term = false;
     1016        string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
    9791017
    9801018        // Boolean operators.
    9811019        if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
     
    10751113            Term * term_obj = new Term(&state, term, prefix_info,
    10761114                                       unstemmed_term, stem_term, term_pos++);
    10771115
     1116            if (is_cjk_term) {
     1117                Parse(pParser, CJKTERM, term_obj, &state);
     1118                if (it == end) break;
     1119                continue;
     1120            }
     1121
    10781122            if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
    10791123                if (it != end) {
    10801124                    if ((flags & FLAG_WILDCARD) && *it == '*') {
     
    15521596    }
    15531597};
    15541598
     1599void
     1600Term::as_positional_cjk_term(Terms * terms) const
     1601{
     1602    // Add each individual CJK character to the phrase.
     1603    string t;
     1604    for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
     1605        Unicode::append_utf8(t, *it);
     1606        Term * c = new Term(state, t, prefix_info, unstemmed, stem, pos);
     1607        terms->add_positional_term(c);
     1608        t.resize(0);
     1609    }
     1610
     1611    // FIXME: we want to add the n-grams as filters too for efficiency.
     1612
     1613    delete this;
     1614}
     1615
    15551616// Helper macro for converting a boolean operation into a Xapian::Query.
    15561617#define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
    15571618    do {\
     
    19351996    delete U;
    19361997}
    19371998
     1999compound_term(T) ::= CJKTERM(U). {
     2000    { T = U->as_cjk_query(); }
     2001}
     2002
    19382003// phrase - The "inside the quotes" part of a double-quoted phrase.
    19392004
    19402005%type phrase {Terms *}
     
    19462011    P->add_positional_term(T);
    19472012}
    19482013
     2014phrase(P) ::= CJKTERM(T). {
     2015    P = new Terms;
     2016    T->as_positional_cjk_term(P);
     2017}
     2018
    19492019phrase(P) ::= phrase(Q) TERM(T). {
    19502020    P = Q;
    19512021    P->add_positional_term(T);
    19522022}
    19532023
     2024phrase(P) ::= phrase(Q) CJKTERM(T). {
     2025    P = Q;
     2026    T->as_positional_cjk_term(P);
     2027}
     2028
    19542029// phrased_term - A phrased term works like a single term, but is actually
    19552030// 2 or more terms linked together into a phrase by punctuation.  There must be
    19562031// at least 2 terms in order to be able to have punctuation between the terms!
  • xapian-core/queryparser/queryparser_internal.h

     
    11/* queryparser_internal.h: The non-lemon-generated parts of the QueryParser
    22 * class.
    33 *
    4  * Copyright (C) 2005,2006,2007,2010 Olly Betts
     4 * Copyright (C) 2005,2006,2007,2010,2011 Olly Betts
    55 * Copyright (C) 2010 Adam Sjøgren
    66 *
    77 * This program is free software; you can redistribute it and/or
     
    8383                    filter_type type);
    8484
    8585    std::string parse_term(Utf8Iterator &it, const Utf8Iterator &end,
     86                           bool cjk_ngram, bool &is_cjk_term,
    8687                           bool &was_acronym);
    8788
    8889  public:
  • xapian-core/queryparser/cjk-tokenizer.h

     
     1/** @file cjk-tokenizer.h
     2 * @brief Tokenise CJK text as n-grams
     3 */
     4/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
     5 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
     6 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
     7 * Copyright (c) 2011 Olly Betts
     8 *
     9 * Permission is hereby granted, free of charge, to any person obtaining a copy
     10 * of this software and associated documentation files (the "Software"), to deal
     11 * in the Software without restriction, including without limitation the
     12 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
     13 * sell copies of the Software, and to permit persons to whom the Software is
     14 * furnished to do so, subject to the following conditions:
     15 *
     16 * The above copyright notice and this permission notice shall be included in
     17 * all copies or substantial portions of the Software.
     18 *
     19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     25 * IN THE SOFTWARE.
     26 */
     27
     28#ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H
     29#define XAPIAN_INCLUDED_CJK_TOKENIZER_H
     30
     31#include "xapian/unicode.h"
     32
     33#include <string>
     34
     35namespace CJK {
     36
     37/** Should we use the CJK n-gram code?
     38 *
     39 *  The first time this is called it reads the environment variable
     40 *  XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value.
     41 *  The result is cached, so subsequent calls return the same value.
     42 */
     43bool is_cjk_enabled();
     44
     45bool codepoint_is_cjk(unsigned codepoint);
     46
     47std::string get_cjk(Xapian::Utf8Iterator &it);
     48
     49}
     50
     51class CJKTokenIterator {
     52    Xapian::Utf8Iterator it;
     53
     54    mutable Xapian::Utf8Iterator p;
     55
     56    mutable unsigned len;
     57
     58    mutable std::string current_token;
     59
     60  public:
     61    CJKTokenIterator(const std::string & s)
     62        : it(s) { }
     63
     64    CJKTokenIterator(const Xapian::Utf8Iterator & it_)
     65        : it(it_) { }
     66
     67    CJKTokenIterator()
     68        : it() { }
     69
     70    const std::string & operator*() const;
     71
     72    CJKTokenIterator & operator++();
     73
     74    /// Get the length of the current token in Unicode characters.
     75    unsigned get_length() const { return len; }
     76
     77    friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &);
     78};
     79
     80inline bool
     81operator==(const CJKTokenIterator & a, const CJKTokenIterator & b)
     82{
     83    // We only really care about comparisons where one or other is an end
     84    // iterator.
     85    return a.it == b.it;
     86}
     87
     88inline bool
     89operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b)
     90{
     91    return !(a == b);
     92}
     93
     94#endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H
  • xapian-core/queryparser/termgenerator_internal.cc

     
    11/** @file termgenerator_internal.cc
    22 * @brief TermGenerator class internals
    33 */
    4 /* Copyright (C) 2007,2010 Olly Betts
     4/* Copyright (C) 2007,2010,2011 Olly Betts
    55 *
    66 * This program is free software; you can redistribute it and/or modify
    77 * it under the terms of the GNU General Public License as published by
     
    3131#include <limits>
    3232#include <string>
    3333
     34#include "cjk-tokenizer.h"
     35
    3436using namespace std;
    3537
    3638namespace Xapian {
     
    127129TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
    128130                                    const string & prefix, bool with_positions)
    129131{
     132    bool cjk_ngram = CJK::is_cjk_enabled();
     133
    130134    int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
    131135
    132136    if (!stopper) stop_mode = STOPWORDS_NONE;
     
    164168        }
    165169
    166170        while (true) {
     171            if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) {
     172                const string & cjk = CJK::get_cjk(itor);
     173                for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
     174                    const string & cjk_token = *tk;
     175                    if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
     176
     177                    if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token))
     178                        continue;
     179
     180                    if (with_positions && tk.get_length() == 1) {
     181                        doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc);
     182                    } else {
     183                        doc.add_term(prefix + cjk_token, wdf_inc);
     184                    }
     185                    if ((flags & FLAG_SPELLING) && prefix.empty())
     186                        db.add_spelling(cjk_token);
     187
     188                    if (!stemmer.internal.get()) continue;
     189
     190                    if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY &&
     191                        (*stopper)(cjk_token))
     192                        continue;
     193
     194                    // Note, this uses the lowercased term, but that's OK as we
     195                    // only want to avoid stemming terms starting with a digit.
     196                    if (!should_stem(cjk_token)) continue;
     197
     198                    // Add stemmed form without positional information.
     199                    string stem("Z");
     200                    stem += prefix;
     201                    stem += stemmer(cjk_token);
     202                    doc.add_term(stem, wdf_inc);
     203                }
     204                while (true) {
     205                    if (itor == Utf8Iterator()) return;
     206                    ch = check_wordchar(*itor);
     207                    if (ch) break;
     208                    ++itor;
     209                }
     210            }
    167211            unsigned prevch;
    168212            do {
    169213                Unicode::append_utf8(term, ch);
    170214                prevch = ch;
    171                 if (++itor == Utf8Iterator()) goto endofterm;
     215                if (++itor == Utf8Iterator() ||
     216                    (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
     217                    goto endofterm;
    172218                ch = check_wordchar(*itor);
    173219            } while (ch);
    174220
  • xapian-core/tests/termgentest.cc

     
    3131#include "testutils.h"
    3232#include "utils.h"
    3333
     34#include <stdlib.h> // For setenv() or putenv()
     35
    3436using namespace std;
    3537
    3638#define TESTCASE(S) {#S, test_##S}
     
    106108      "Z\xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80:1 \xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80[1]" },
    107109
    108110    { "", "fish+chips", "Zchip:1 Zfish:1 chips[2] fish[1]" },
     111
     112    // Basic CJK tests:
     113    { "stem=", "久有归天", "久[1] 久有:1 天[4] 归[3] 归天:1 有[2] 有归:1" },
     114    { "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" },
     115    { "", "ウルス アップ", "ア[4] ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" },
     116
     117    // CJK with prefix:
     118    { "prefix=XA", "发送从", "XA从[3] XA发[1] XA发送:1 XA送[2] XA送从:1" },
     119    { "prefix=XA", "点卡思考", "XA卡[2] XA卡思:1 XA思[3] XA思考:1 XA点[1] XA点卡:1 XA考[4]" },
     120
     121    // CJK mixed with non-CJK:
     122    { "prefix=", "インtestタ", "test[3] イ[1] イン:1 タ[4] ン[2]" },
     123    { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" },
     124
    109125    // All following tests are for things which we probably don't really want to
    110126    // behave as they currently do, but we haven't found a sufficiently general
    111127    // way to implement them yet.
    112128
    113129    // Test number like things
    114     { "", "11:59", "11[1] 59[2]" },
     130    { "stem=en", "11:59", "11[1] 59[2]" },
    115131    { "", "11:59am", "11[1] 59am[2]" },
    116132
    117133    { NULL, NULL, NULL }
     
    770786
    771787int main(int argc, char **argv)
    772788try {
     789    // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
     790#ifdef __WIN32__
     791    _putenv_s("XAPIAN_CJK_NGRAM", "1");
     792#elif defined HAVE_SETENV
     793    setenv("XAPIAN_CJK_NGRAM", "1", 1);
     794#else
     795    putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
     796#endif
    773797    test_driver::parse_command_line(argc, argv);
    774798    return test_driver::run(tests);
    775799} catch (const char * e) {
  • xapian-core/tests/queryparsertest.cc

     
    3333#include <string>
    3434#include <vector>
    3535
     36#include <stdlib.h> // For setenv() or putenv()
     37
    3638using namespace std;
    3739
    3840#define TESTCASE(S) {#S, test_##S}
     
    639641    { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"},
    640642    { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
    641643    { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
     644    // Some CJK tests.
     645    { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" },
     646    { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" },
     647    { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" },
     648    { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" },
     649    { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" },
     650    { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" },
     651    // FIXME: These should really filter by bigrams to accelerate:
     652    { "\"久有归\"", "(久:(pos=1) PHRASE 3 有:(pos=1) PHRASE 3 归:(pos=1))" },
     653    { "\"久有test归\"", "(久:(pos=1) PHRASE 4 有:(pos=1) PHRASE 4 test:(pos=2) PHRASE 4 归:(pos=3))" },
     654    // FIXME: this should work: { "久 NEAR 有", "(久:(pos=1) NEAR 11 有:(pos=2))" },
    642655    { NULL, NULL }
    643656};
    644657
     
    709722    // Add coverage for other cases similar to the above.
    710723    { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
    711724    { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
     725    // Some CJK tests.
     726    { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" },
     727    { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" },
    712728    { NULL, NULL }
    713729};
    714730
     
    761777    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))");
    762778    qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A");
    763779    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))");
     780    qobj = qp.parse_query("英国 title:文森hello", 0, "A");
     781    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))");
    764782    return true;
    765783}
    766784
     
    25072525
    25082526int main(int argc, char **argv)
    25092527try {
     2528    // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set.
     2529#ifdef __WIN32__
     2530    _putenv_s("XAPIAN_CJK_NGRAM", "1");
     2531#elif defined HAVE_SETENV
     2532    setenv("XAPIAN_CJK_NGRAM", "1", 1);
     2533#else
     2534    putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1"));
     2535#endif
    25102536    test_driver::parse_command_line(argc, argv);
    25112537    return test_driver::run(tests);
    25122538} catch (const char * e) {
  • xapian-core/ChangeLog

     
     1Wed Aug 24 14:25:21 GMT 2011  Olly Betts <olly@survex.com>
     2
     3        * Backport change from trunk:
     4        * queryparser/queryparser.lemony: Fix memory leak (caught by existing
     5          testcase queryparser1 when run under valgrind).
     6
     7Wed Aug 24 14:13:24 GMT 2011  Olly Betts <olly@survex.com>
     8
     9        * Backport change from trunk:
     10        * queryparser/,tests/queryparsertest.cc,tests/termgentest.cc: Add
     11          support for indexing and searching CJK text using n-grams.  Currently
     12          this is only enabled if the environment variable XAPIAN_CJK_NGRAM is
     13          set to a non-empty value.
     14
    115Wed Aug 10 06:09:39 GMT 2011  Olly Betts <olly@survex.com>
    216
    317        * NEWS: Finalise 1.2.7.