Ticket #180: cjk.patch
File cjk.patch, 16.9 KB (added by , 13 years ago) |
---|
-
xapian-core/queryparser/cjk/cjk-tokenizer.cc
diff -puNr xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.cc cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.cc
old new 1 /* 2 * Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) 3 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) 4 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include <xapian.h> 25 #include <cstring> 26 #include "cjk-tokenizer.h" 27 28 using namespace std; 29 using namespace Xapian::CJK; 30 31 // 2E80..2EFF; CJK Radicals Supplement 32 // 3000..303F; CJK Symbols and Punctuation 33 // 3040..309F; Hiragana 34 // 30A0..30FF; Katakana 35 // 3100..312F; Bopomofo 36 // 3130..318F; Hangul Compatibility Jamo 37 // 3190..319F; Kanbun 38 // 31A0..31BF; Bopomofo Extended 39 // 31C0..31EF; CJK Strokes 40 // 31F0..31FF; Katakana Phonetic Extensions 41 // 3200..32FF; Enclosed CJK Letters and Months 42 // 3300..33FF; CJK Compatibility 43 // 3400..4DBF; CJK Unified Ideographs Extension A 44 // 4DC0..4DFF; Yijing Hexagram Symbols 45 // 4E00..9FFF; CJK Unified Ideographs 46 // A700..A71F; Modifier Tone Letters 47 // AC00..D7AF; Hangul Syllables 48 // F900..FAFF; CJK Compatibility Ideographs 49 // FE30..FE4F; CJK Compatibility Forms 50 // FF00..FFEF; Halfwidth and Fullwidth Forms 51 // 20000..2A6DF; CJK Unified Ideographs Extension B 52 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement 53 bool 54 Xapian::CJK::codepoint_is_cjk(unsigned p) { 55 if (p < 0x2E80) return false; 56 return (((p) >= 0x2E80 && (p) <= 0x2EFF) 57 || ((p) >= 0x3000 && (p) <= 0x9FFF) 58 || ((p) >= 0xA700 && (p) <= 0xA71F) 59 || ((p) >= 0xAC00 && (p) <= 0xD7AF) 60 || ((p) >= 0xF900 && (p) <= 0xFAFF) 61 || ((p) >= 0xFE30 && (p) <= 0xFE4F) 62 || ((p) >= 0xFF00 && (p) <= 0xFFEF) 63 || ((p) >= 0x20000 && (p) <= 0x2A6DF) 64 || ((p) >= 0x2F800 && (p) <= 0x2FA1F)); 65 } 66 67 std::string 68 Xapian::CJK::get_cjk(Xapian::Utf8Iterator &it) { 69 string str; 70 while (codepoint_is_cjk(*it) && it != Xapian::Utf8Iterator()) { 71 Xapian::Unicode::append_utf8(str, *it); 72 ++it; 73 } 74 return str; 75 } 76 77 tokenizer::tokenizer() : ngram_size(2), 78 max_token_count(0) { 79 } 80 81 void 82 tokenizer::tokenize(const string &str, vector<string> &token_list) { 83 Xapian::Utf8Iterator it(str), p, end; 84 string new_token; 85 while (it != Xapian::Utf8Iterator()) { 86 if (max_token_count > 0 87 && token_list.size() >= max_token_count) { 88 break; 89 } 90 new_token.resize(0); 91 p = it; 92 for (unsigned i = 0; i < ngram_size; ++i) { 93 if (max_token_count > 0 94 && token_list.size() >= max_token_count) { 95 break; 96 } 97 if (p == end) break; 98 if (codepoint_is_cjk(*p)) { 99 Xapian::Unicode::append_utf8(new_token, *p); 100 token_list.push_back(new_token); 101 } 102 ++p; 103 } 104 ++it; 105 } 106 } -
xapian-core/queryparser/cjk/cjk-tokenizer.h
diff -puNr xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.h cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.h
old new 1 /* 2 * Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) 3 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) 4 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #ifndef __CJK_TOKENIZER_H__ 25 #define __CJK_TOKENIZER_H__ 26 27 #include <string> 28 #include <vector> 29 #include <xapian.h> 30 31 namespace Xapian { 32 namespace CJK { 33 bool codepoint_is_cjk(unsigned codepoint); 34 std::string get_cjk(Xapian::Utf8Iterator &it); 35 36 class tokenizer { 37 unsigned int ngram_size; 38 unsigned int max_token_count; 39 40 public: 41 tokenizer(); 42 void tokenize(const std::string &str, 43 std::vector<std::string> &token_list); 44 }; 45 }; /* namespace CJK end */ 46 }; /* namespace Xapian end */ 47 48 #endif -
xapian-core/queryparser/Makefile.mk
diff -puNr xapian.vanilla/xapian-core/queryparser/Makefile.mk cjk-support-patch/xapian-core/queryparser/Makefile.mk
old new endif 7 7 noinst_HEADERS +=\ 8 8 queryparser/queryparser_internal.h\ 9 9 queryparser/queryparser_token.h\ 10 queryparser/termgenerator_internal.h 10 queryparser/termgenerator_internal.h\ 11 queryparser/cjk/cjk-tokenizer.h 11 12 12 13 lemon_built_sources =\ 13 14 queryparser/queryparser_internal.cc\ … … lib_src +=\ 60 61 queryparser/queryparser.cc\ 61 62 queryparser/queryparser_internal.cc\ 62 63 queryparser/termgenerator.cc\ 63 queryparser/termgenerator_internal.cc 64 queryparser/termgenerator_internal.cc\ 65 queryparser/cjk/cjk-tokenizer.cc -
xapian-core/queryparser/queryparser.lemony
diff -puNr xapian.vanilla/xapian-core/queryparser/queryparser.lemony cjk-support-patch/xapian-core/queryparser/queryparser.lemony
old new 40 40 41 41 #include <string.h> 42 42 43 #include "cjk/cjk-tokenizer.h" 44 43 45 using namespace std; 44 46 45 47 using namespace Xapian; … … class Term { 192 194 */ 193 195 Query * as_partial_query(State * state_) const; 194 196 197 Query * as_cjk_query() const; 198 195 199 /// Value range query. 196 200 Query as_value_range_query() const; 197 201 … … Term::as_partial_query(State * state_) c 430 434 return q; 431 435 } 432 436 437 Query * 438 Term::as_cjk_query() const 439 { 440 vector<Query> prefix_cjk; 441 vector<string> cjk_tokens; 442 vector<string>::iterator cjk_it; 443 const list<string> & prefixes = prefix_info->prefixes; 444 list<string>::const_iterator piter; 445 CJK::tokenizer tk; 446 tk.tokenize(name, cjk_tokens); 447 for (cjk_it = cjk_tokens.begin(); cjk_it != cjk_tokens.end(); ++cjk_it) { 448 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { 449 string cjk = *piter; 450 cjk += *cjk_it; 451 prefix_cjk.push_back(Query(cjk, 1, pos)); 452 } 453 } 454 Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end()); 455 delete this; 456 return q; 457 } 458 433 459 Query 434 460 Term::as_value_range_query() const 435 461 { … … QueryParser::Internal::parse_term(Utf8It 562 588 } 563 589 was_acronym = !term.empty(); 564 590 591 if (term.empty() && CJK::codepoint_is_cjk(*it)) { 592 term = CJK::get_cjk(it); 593 } 594 565 595 if (term.empty()) { 566 596 unsigned prevch = *it; 567 597 Unicode::append_utf8(term, prevch); 568 598 while (++it != end) { 599 if (CJK::codepoint_is_cjk(*it)) break; 569 600 unsigned ch = *it; 570 601 if (!is_wordchar(ch)) { 571 602 // Treat a single embedded '&' or "'" or similar as a word … … phrased_term: 1075 1106 Term * term_obj = new Term(&state, term, prefix_info, 1076 1107 unstemmed_term, stem_term, term_pos++); 1077 1108 1109 Utf8Iterator tmp_it(term); 1110 if (CJK::codepoint_is_cjk(*tmp_it)) { 1111 Parse(pParser, CJKTERM, term_obj, &state); 1112 if (it == end) break; 1113 continue; 1114 } 1115 1078 1116 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) { 1079 1117 if (it != end) { 1080 1118 if ((flags & FLAG_WILDCARD) && *it == '*') { … … compound_term(T) ::= SYNONYM TERM(U). { 1935 1973 delete U; 1936 1974 } 1937 1975 1976 compound_term(T) ::= CJKTERM(U). { 1977 { T = U->as_cjk_query(); } 1978 } 1979 1938 1980 // phrase - The "inside the quotes" part of a double-quoted phrase. 1939 1981 1940 1982 %type phrase {Terms *} -
xapian-core/queryparser/termgenerator_internal.cc
diff -puNr xapian.vanilla/xapian-core/queryparser/termgenerator_internal.cc cjk-support-patch/xapian-core/queryparser/termgenerator_internal.cc
old new 31 31 #include <limits> 32 32 #include <string> 33 33 34 #include "cjk/cjk-tokenizer.h" 35 34 36 using namespace std; 35 37 36 38 namespace Xapian { … … TermGenerator::Internal::index_text(Utf8 164 166 } 165 167 166 168 while (true) { 167 unsigned prevch; 169 if (CJK::codepoint_is_cjk(*itor)) { 170 CJK::tokenizer tk; 171 vector<string> cjk_tokens; 172 vector<string>::iterator cjk_it; 173 tk.tokenize(CJK::get_cjk(itor), cjk_tokens); 174 for (cjk_it = cjk_tokens.begin(); cjk_it != cjk_tokens.end(); ++cjk_it) { 175 if ((*cjk_it).size() > MAX_PROB_TERM_LENGTH) continue; 176 177 if (stop_mode == STOPWORDS_IGNORE && (*stopper)((*cjk_it))) continue; 178 179 if (with_positions) { 180 doc.add_posting(prefix + (*cjk_it), ++termpos, weight); 181 } else { 182 doc.add_term(prefix + (*cjk_it), weight); 183 } 184 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(*cjk_it); 185 186 if (!stemmer.internal.get()) continue; 187 188 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(*cjk_it)) 189 continue; 190 191 // Note, this uses the lowercased term, but that's OK as we only 192 // want to avoid stemming terms starting with a digit. 193 if (!should_stem(*cjk_it)) continue; 194 195 // Add stemmed form without positional information. 196 string stem("Z"); 197 stem += prefix; 198 stem += stemmer(*cjk_it); 199 doc.add_term(stem, weight); 200 } 201 while (true) { 202 if (itor == Utf8Iterator()) return; 203 ch = check_wordchar(*itor); 204 if (ch) break; 205 ++itor; 206 } 207 } 208 unsigned prevch; 168 209 do { 169 210 Unicode::append_utf8(term, ch); 170 211 prevch = ch; 171 if (++itor == Utf8Iterator() ) goto endofterm;212 if (++itor == Utf8Iterator() || CJK::codepoint_is_cjk(*itor)) goto endofterm; 172 213 ch = check_wordchar(*itor); 173 214 } while (ch); 174 215 -
xapian-core/tests/queryparsertest.cc
diff -puNr xapian.vanilla/xapian-core/tests/queryparsertest.cc cjk-support-patch/xapian-core/tests/queryparsertest.cc
old new static const test test_or_queries[] = { 639 639 { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"}, 640 640 { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, 641 641 { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, 642 // Some CJK Test added 643 { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" }, 644 { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" }, 645 { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" }, 646 { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" }, 647 { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" }, 648 { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" }, 642 649 { NULL, NULL } 650 643 651 }; 644 652 645 653 static bool test_queryparser1() … … static const test test_and_queries[] = { 709 717 // Add coverage for other cases similar to the above. 710 718 { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, 711 719 { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, 720 // Some CJK Test added 721 { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" }, 722 { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" }, 712 723 { NULL, NULL } 713 724 }; 714 725 … … static bool test_qp_default_prefix1() 761 772 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))"); 762 773 qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A"); 763 774 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))"); 775 qobj = qp.parse_query("英国 title:文森hello", 0, "A"); 776 TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))"); 764 777 return true; 765 778 } 766 779 -
xapian-core/tests/termgentest.cc
diff -puNr xapian.vanilla/xapian-core/tests/termgentest.cc cjk-support-patch/xapian-core/tests/termgentest.cc
old new struct test { 61 61 }; 62 62 63 63 static const test test_simple[] = { 64 65 // Basic CJK Test 66 {"", "久有归天", "久[1] 久有[2] 天[7] 归[5] 归天[6] 有[3] 有归[4]" }, 67 {"", "극지라", "극[1] 극지[2] 라[5] 지[3] 지라[4]" }, 68 {"", "ウルス アップ", "ア[6] ウ[1] ウル[2] ス[5] ッ[7] ップ[8] プ[9] ル[3] ルス[4]" }, 69 70 // CJK With Prefix 71 {"prefix=XA", "发送从", "XA从[5] XA发[1] XA发送[2] XA送[3] XA送从[4]"}, 72 {"prefix=XA", "点卡思考", "XA卡[3] XA卡思[4] XA思[5] XA思考[6] XA点[1] XA点卡[2] XA考[7]"}, 73 74 // CJK Mix 75 {"prefix=", "インtestタ", "test[4] イ[1] イン[2] タ[5] ン[3]"}, 76 {"", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]"}, 77 64 78 // A basic test with a hyphen 65 79 { "", "simple-example", "example[2] simple[1]" }, 66 80 { "cont,weight=2",