Ticket #180: cjk-ngram-applied-to-1.2-branch.patch
File cjk-ngram-applied-to-1.2-branch.patch, 23.0 KB (added by , 13 years ago) |
---|
-
xapian-core/queryparser/Makefile.mk
5 5 endif 6 6 7 7 noinst_HEADERS +=\ 8 queryparser/cjk-tokenizer.h\ 8 9 queryparser/queryparser_internal.h\ 9 10 queryparser/queryparser_token.h\ 10 11 queryparser/termgenerator_internal.h … … 57 58 endif 58 59 59 60 lib_src +=\ 61 queryparser/cjk-tokenizer.cc\ 60 62 queryparser/queryparser.cc\ 61 63 queryparser/queryparser_internal.cc\ 62 64 queryparser/termgenerator.cc\ -
xapian-core/queryparser/cjk-tokenizer.cc
1 /** @file cjk-tokenizer.cc 2 * @brief Tokenise CJK text as n-grams 3 */ 4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) 5 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) 6 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) 7 * Copyright (c) 2011 Olly Betts 8 * 9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * of this software and associated documentation files (the "Software"), to deal 11 * deal in the Software without restriction, including without limitation the 12 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 13 * sell copies of the Software, and to permit persons to whom the Software is 14 * furnished to do so, subject to the following conditions: 15 * 16 * The above copyright notice and this permission notice shall be included in 17 * all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 26 */ 27 28 #include <config.h> 29 30 #include "cjk-tokenizer.h" 31 32 #include "omassert.h" 33 #include "xapian/unicode.h" 34 35 #include <cstdlib> 36 #include <string> 37 38 using namespace std; 39 40 static unsigned NGRAM_SIZE = 2; 41 42 bool 43 CJK::is_cjk_enabled() 44 { 45 const char * p; 46 static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p); 47 return result; 48 } 49 50 // 2E80..2EFF; CJK Radicals Supplement 51 // 3000..303F; CJK Symbols and Punctuation 52 // 3040..309F; Hiragana 53 // 30A0..30FF; Katakana 54 // 3100..312F; Bopomofo 55 // 3130..318F; Hangul Compatibility Jamo 56 // 3190..319F; Kanbun 57 // 31A0..31BF; Bopomofo Extended 58 // 31C0..31EF; CJK Strokes 59 // 31F0..31FF; Katakana Phonetic Extensions 60 // 3200..32FF; Enclosed CJK Letters and Months 61 // 3300..33FF; CJK Compatibility 62 // 3400..4DBF; CJK Unified Ideographs Extension A 63 // 4DC0..4DFF; Yijing Hexagram Symbols 64 // 4E00..9FFF; CJK Unified Ideographs 65 // A700..A71F; Modifier Tone Letters 66 // AC00..D7AF; Hangul Syllables 67 // F900..FAFF; CJK Compatibility Ideographs 68 // FE30..FE4F; CJK Compatibility Forms 69 // FF00..FFEF; Halfwidth and Fullwidth Forms 70 // 20000..2A6DF; CJK Unified Ideographs Extension B 71 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement 72 bool 73 CJK::codepoint_is_cjk(unsigned p) 74 { 75 if (p < 0x2E80) return false; 76 return ((p >= 0x2E80 && p <= 0x2EFF) || 77 (p >= 0x3000 && p <= 0x9FFF) || 78 (p >= 0xA700 && p <= 0xA71F) || 79 (p >= 0xAC00 && p <= 0xD7AF) || 80 (p >= 0xF900 && p <= 0xFAFF) || 81 (p >= 0xFE30 && p <= 0xFE4F) || 82 (p >= 0xFF00 && p <= 0xFFEF) || 83 (p >= 0x20000 && p <= 0x2A6DF) || 84 (p >= 0x2F800 && p <= 0x2FA1F)); 85 } 86 87 string 88 CJK::get_cjk(Xapian::Utf8Iterator &it) 89 { 90 string str; 91 while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) { 92 Xapian::Unicode::append_utf8(str, *it); 93 ++it; 94 } 95 return str; 96 } 97 98 const string & 99 CJKTokenIterator::operator*() const 100 { 101 if (current_token.empty()) { 102 Assert(it != Xapian::Utf8Iterator()); 103 p = it; 104 Xapian::Unicode::append_utf8(current_token, *p); 105 ++p; 106 len = 1; 107 } 108 return current_token; 109 } 110 111 CJKTokenIterator & 112 CJKTokenIterator::operator++() 113 { 114 if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) { 115 Xapian::Unicode::append_utf8(current_token, *p); 116 ++p; 117 ++len; 118 } else { 119 Assert(it != Xapian::Utf8Iterator()); 120 ++it; 121 current_token.resize(0); 122 } 123 return *this; 124 } -
xapian-core/queryparser/queryparser.lemony
33 33 // Include the list of token values lemon generates. 34 34 #include "queryparser_token.h" 35 35 36 #include "cjk-tokenizer.h" 37 36 38 #include <algorithm> 37 39 #include <limits> 38 40 #include <list> … … 136 138 } 137 139 }; 138 140 141 class Terms; 142 139 143 /** Class used to pass information about a token from lexer to parser. 140 144 * 141 145 * Generally an instance of this class carries term information, but it can be … … 192 196 */ 193 197 Query * as_partial_query(State * state_) const; 194 198 199 /** Build a query for a string of CJK characters. */ 200 Query * as_cjk_query() const; 201 202 /** Handle a CJK character string in a positional context. */ 203 void as_positional_cjk_term(Terms * terms) const; 204 195 205 /// Value range query. 196 206 Query as_value_range_query() const; 197 207 … … 430 440 return q; 431 441 } 432 442 443 Query * 444 Term::as_cjk_query() const 445 { 446 vector<Query> prefix_cjk; 447 const list<string> & prefixes = prefix_info->prefixes; 448 list<string>::const_iterator piter; 449 for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) { 450 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { 451 string cjk = *piter; 452 cjk += *tk; 453 prefix_cjk.push_back(Query(cjk, 1, pos)); 454 } 455 } 456 Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end()); 457 delete this; 458 return q; 459 } 460 433 461 Query 434 462 Term::as_value_range_query() const 435 463 { … … 537 565 538 566 string 539 567 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end, 568 bool cjk_ngram, bool & is_cjk_term, 540 569 bool &was_acronym) 541 570 { 542 571 string term; … … 562 591 } 563 592 was_acronym = !term.empty(); 564 593 594 if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) { 595 term = CJK::get_cjk(it); 596 is_cjk_term = true; 597 } 598 565 599 if (term.empty()) { 566 600 unsigned prevch = *it; 567 601 Unicode::append_utf8(term, prevch); 568 602 while (++it != end) { 603 if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break; 569 604 unsigned ch = *it; 570 605 if (!is_wordchar(ch)) { 571 606 // Treat a single embedded '&' or "'" or similar as a word … … 634 669 QueryParser::Internal::parse_query(const string &qs, unsigned flags, 635 670 const string &default_prefix) 636 671 { 672 bool cjk_ngram = CJK::is_cjk_enabled(); 673 637 674 // Set value_ranges if we may have to handle value ranges in the query. 638 675 bool value_ranges; 639 676 value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos); … … 975 1012 976 1013 phrased_term: 977 1014 bool was_acronym; 978 string term = parse_term(it, end, was_acronym); 1015 bool is_cjk_term = false; 1016 string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym); 979 1017 980 1018 // Boolean operators. 981 1019 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) && … … 1075 1113 Term * term_obj = new Term(&state, term, prefix_info, 1076 1114 unstemmed_term, stem_term, term_pos++); 1077 1115 1116 if (is_cjk_term) { 1117 Parse(pParser, CJKTERM, term_obj, &state); 1118 if (it == end) break; 1119 continue; 1120 } 1121 1078 1122 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) { 1079 1123 if (it != end) { 1080 1124 if ((flags & FLAG_WILDCARD) && *it == '*') { … … 1552 1596 } 1553 1597 }; 1554 1598 1599 void 1600 Term::as_positional_cjk_term(Terms * terms) const 1601 { 1602 // Add each individual CJK character to the phrase. 1603 string t; 1604 for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) { 1605 Unicode::append_utf8(t, *it); 1606 Term * c = new Term(state, t, prefix_info, unstemmed, stem, pos); 1607 terms->add_positional_term(c); 1608 t.resize(0); 1609 } 1610 1611 // FIXME: we want to add the n-grams as filters too for efficiency. 1612 1613 delete this; 1614 } 1615 1555 1616 // Helper macro for converting a boolean operation into a Xapian::Query. 1556 1617 #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \ 1557 1618 do {\ … … 1935 1996 delete U; 1936 1997 } 1937 1998 1999 compound_term(T) ::= CJKTERM(U). { 2000 { T = U->as_cjk_query(); } 2001 } 2002 1938 2003 // phrase - The "inside the quotes" part of a double-quoted phrase. 1939 2004 1940 2005 %type phrase {Terms *} … … 1946 2011 P->add_positional_term(T); 1947 2012 } 1948 2013 2014 phrase(P) ::= CJKTERM(T). { 2015 P = new Terms; 2016 T->as_positional_cjk_term(P); 2017 } 2018 1949 2019 phrase(P) ::= phrase(Q) TERM(T). { 1950 2020 P = Q; 1951 2021 P->add_positional_term(T); 1952 2022 } 1953 2023 2024 phrase(P) ::= phrase(Q) CJKTERM(T). { 2025 P = Q; 2026 T->as_positional_cjk_term(P); 2027 } 2028 1954 2029 // phrased_term - A phrased term works like a single term, but is actually 1955 2030 // 2 or more terms linked together into a phrase by punctuation. There must be 1956 2031 // at least 2 terms in order to be able to have punctuation between the terms! -
xapian-core/queryparser/queryparser_internal.h
1 1 /* queryparser_internal.h: The non-lemon-generated parts of the QueryParser 2 2 * class. 3 3 * 4 * Copyright (C) 2005,2006,2007,2010 Olly Betts4 * Copyright (C) 2005,2006,2007,2010,2011 Olly Betts 5 5 * Copyright (C) 2010 Adam Sjøgren 6 6 * 7 7 * This program is free software; you can redistribute it and/or … … 83 83 filter_type type); 84 84 85 85 std::string parse_term(Utf8Iterator &it, const Utf8Iterator &end, 86 bool cjk_ngram, bool &is_cjk_term, 86 87 bool &was_acronym); 87 88 88 89 public: -
xapian-core/queryparser/cjk-tokenizer.h
1 /** @file cjk-tokenizer.h 2 * @brief Tokenise CJK text as n-grams 3 */ 4 /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) 5 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) 6 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) 7 * Copyright (c) 2011 Olly Betts 8 * 9 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * of this software and associated documentation files (the "Software"), to deal 11 * deal in the Software without restriction, including without limitation the 12 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 13 * sell copies of the Software, and to permit persons to whom the Software is 14 * furnished to do so, subject to the following conditions: 15 * 16 * The above copyright notice and this permission notice shall be included in 17 * all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 25 * IN THE SOFTWARE. 26 */ 27 28 #ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H 29 #define XAPIAN_INCLUDED_CJK_TOKENIZER_H 30 31 #include "xapian/unicode.h" 32 33 #include <string> 34 35 namespace CJK { 36 37 /** Should we use the CJK n-gram code? 38 * 39 * The first time this is called it reads the environmental variable 40 * XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value. 41 * Subsequent calls cache and return the same value. 42 */ 43 bool is_cjk_enabled(); 44 45 bool codepoint_is_cjk(unsigned codepoint); 46 47 std::string get_cjk(Xapian::Utf8Iterator &it); 48 49 } 50 51 class CJKTokenIterator { 52 Xapian::Utf8Iterator it; 53 54 mutable Xapian::Utf8Iterator p; 55 56 mutable unsigned len; 57 58 mutable std::string current_token; 59 60 public: 61 CJKTokenIterator(const std::string & s) 62 : it(s) { } 63 64 CJKTokenIterator(const Xapian::Utf8Iterator & it_) 65 : it(it_) { } 66 67 CJKTokenIterator() 68 : it() { } 69 70 const std::string & operator*() const; 71 72 CJKTokenIterator & operator++(); 73 74 /// Get the length of the current token in Unicode characters. 75 unsigned get_length() const { return len; } 76 77 friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &); 78 }; 79 80 inline bool 81 operator==(const CJKTokenIterator & a, const CJKTokenIterator & b) 82 { 83 // We only really care about comparisons where one or other is an end 84 // iterator. 85 return a.it == b.it; 86 } 87 88 inline bool 89 operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b) 90 { 91 return !(a == b); 92 } 93 94 #endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H -
xapian-core/queryparser/termgenerator_internal.cc
1 1 /** @file termgenerator_internal.cc 2 2 * @brief TermGenerator class internals 3 3 */ 4 /* Copyright (C) 2007,2010 Olly Betts4 /* Copyright (C) 2007,2010,2011 Olly Betts 5 5 * 6 6 * This program is free software; you can redistribute it and/or modify 7 7 * it under the terms of the GNU General Public License as published by … … 31 31 #include <limits> 32 32 #include <string> 33 33 34 #include "cjk-tokenizer.h" 35 34 36 using namespace std; 35 37 36 38 namespace Xapian { … … 127 129 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, 128 130 const string & prefix, bool with_positions) 129 131 { 132 bool cjk_ngram = CJK::is_cjk_enabled(); 133 130 134 int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY; 131 135 132 136 if (!stopper) stop_mode = STOPWORDS_NONE; … … 164 168 } 165 169 166 170 while (true) { 171 if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) { 172 const string & cjk = CJK::get_cjk(itor); 173 for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) { 174 const string & cjk_token = *tk; 175 if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue; 176 177 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token)) 178 continue; 179 180 if (with_positions && tk.get_length() == 1) { 181 doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc); 182 } else { 183 doc.add_term(prefix + cjk_token, wdf_inc); 184 } 185 if ((flags & FLAG_SPELLING) && prefix.empty()) 186 db.add_spelling(cjk_token); 187 188 if (!stemmer.internal.get()) continue; 189 190 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && 191 (*stopper)(cjk_token)) 192 continue; 193 194 // Note, this uses the lowercased term, but that's OK as we 195 // only want to avoid stemming terms starting with a digit. 196 if (!should_stem(cjk_token)) continue; 197 198 // Add stemmed form without positional information. 199 string stem("Z"); 200 stem += prefix; 201 stem += stemmer(cjk_token); 202 doc.add_term(stem, wdf_inc); 203 } 204 while (true) { 205 if (itor == Utf8Iterator()) return; 206 ch = check_wordchar(*itor); 207 if (ch) break; 208 ++itor; 209 } 210 } 167 211 unsigned prevch; 168 212 do { 169 213 Unicode::append_utf8(term, ch); 170 214 prevch = ch; 171 if (++itor == Utf8Iterator()) goto endofterm; 215 if (++itor == Utf8Iterator() || 216 (cjk_ngram && CJK::codepoint_is_cjk(*itor))) 217 goto endofterm; 172 218 ch = check_wordchar(*itor); 173 219 } while (ch); 174 220 -
xapian-core/tests/termgentest.cc
31 31 #include "testutils.h" 32 32 #include "utils.h" 33 33 34 #include <stdlib.h> // For setenv() or putenv() 35 34 36 using namespace std; 35 37 36 38 #define TESTCASE(S) {#S, test_##S} … … 106 108 "Z\xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80:1 \xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80[1]" }, 107 109 108 110 { "", "fish+chips", "Zchip:1 Zfish:1 chips[2] fish[1]" }, 111 112 // Basic CJK tests: 113 { "stem=", "久有归天", "久[1] 久有:1 天[4] 归[3] 归天:1 有[2] 有归:1" }, 114 { "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" }, 115 { "", "ウルス アップ", "ア[4] ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" }, 116 117 // CJK with prefix: 118 { "prefix=XA", "发送从", "XA从[3] XA发[1] XA发送:1 XA送[2] XA送从:1" }, 119 { "prefix=XA", "点卡思考", "XA卡[2] XA卡思:1 XA思[3] XA思考:1 XA点[1] XA点卡:1 XA考[4]" }, 120 121 // CJK mixed with non-CJK: 122 { "prefix=", "インtestタ", "test[3] イ[1] イン:1 タ[4] ン[2]" }, 123 { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" }, 124 109 125 // All following tests are for things which we probably don't really want to 110 126 // behave as they currently do, but we haven't found a sufficiently general 111 127 // way to implement them yet. 112 128 113 129 // Test number like things 114 { " ", "11:59", "11[1] 59[2]" },130 { "stem=en", "11:59", "11[1] 59[2]" }, 115 131 { "", "11:59am", "11[1] 59am[2]" }, 116 132 117 133 { NULL, NULL, NULL } … … 770 786 771 787 int main(int argc, char **argv) 772 788 try { 789 // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set. 790 #ifdef __WIN32__ 791 _putenv_s("XAPIAN_CJK_NGRAM", "1"); 792 #elif defined HAVE_SETENV 793 setenv("XAPIAN_CJK_NGRAM", "1", 1); 794 #else 795 putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1")); 796 #endif 773 797 test_driver::parse_command_line(argc, argv); 774 798 return test_driver::run(tests); 775 799 } catch (const char * e) { -
xapian-core/tests/queryparsertest.cc
33 33 #include <string> 34 34 #include <vector> 35 35 36 #include <stdlib.h> // For setenv() or putenv() 37 36 38 using namespace std; 37 39 38 40 #define TESTCASE(S) {#S, test_##S} … … 639 641 { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"}, 640 642 { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, 641 643 { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, 644 // Some CJK tests. 645 { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" }, 646 { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" }, 647 { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" }, 648 { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" }, 649 { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" }, 650 { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" }, 651 // FIXME: These should really filter by bigrams to accelerate: 652 { "\"久有归\"", "(久:(pos=1) PHRASE 3 有:(pos=1) PHRASE 3 归:(pos=1))" }, 653 { "\"久有test归\"", "(久:(pos=1) PHRASE 4 有:(pos=1) PHRASE 4 test:(pos=2) PHRASE 4 归:(pos=3))" }, 654 // FIXME: this should work: { "久 NEAR 有", "(久:(pos=1) NEAR 11 有:(pos=2))" }, 642 655 { NULL, NULL } 643 656 }; 644 657 … … 709 722 // Add coverage for other cases similar to the above. 710 723 { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, 711 724 { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, 725 // Some CJK tests. 726 { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" }, 727 { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" }, 712 728 { NULL, NULL } 713 729 }; 714 730 … … 761 777 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))"); 762 778 qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A"); 763 779 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))"); 780 qobj = qp.parse_query("英国 title:文森hello", 0, "A"); 781 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))"); 764 782 return true; 765 783 } 766 784 … … 2507 2525 2508 2526 int main(int argc, char **argv) 2509 2527 try { 2528 // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set. 2529 #ifdef __WIN32__ 2530 _putenv_s("XAPIAN_CJK_NGRAM", "1"); 2531 #elif defined HAVE_SETENV 2532 setenv("XAPIAN_CJK_NGRAM", "1", 1); 2533 #else 2534 putenv(const_cast<char*>("XAPIAN_CJK_NGRAM=1")); 2535 #endif 2510 2536 test_driver::parse_command_line(argc, argv); 2511 2537 return test_driver::run(tests); 2512 2538 } catch (const char * e) { -
xapian-core/ChangeLog
1 Wed Aug 24 14:25:21 GMT 2011 Olly Betts <olly@survex.com> 2 3 * Backport change from trunk: 4 * queryparser/queryparser.lemony: Fix memory leak (caught by existing 5 testcase queryparser1 when run under valgrind). 6 7 Wed Aug 24 14:13:24 GMT 2011 Olly Betts <olly@survex.com> 8 9 * Backport change from trunk: 10 * queryparser/,tests/queryparsertest.cc,tests/termgentest.cc: Add 11 support for indexing and searching CJK text using n-grams. Currently 12 this is only enabled if environmental variable XAPIAN_CJK_NGRAM is 13 set to a non-empty value. 14 1 15 Wed Aug 10 06:09:39 GMT 2011 Olly Betts <olly@survex.com> 2 16 3 17 * NEWS: Finalise 1.2.7.