Ticket #180: cjk.patch

File cjk.patch, 16.9 KB (added by Brandon Schaefer, 13 years ago)

Replaces the old patch; this version uses the parser instead of the lexer.

  • xapian-core/queryparser/cjk/cjk-tokenizer.cc

    diff -puNr xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.cc cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.cc
    old new  
     1/*
     2 * Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
     3 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
     4 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
     5 *
     6 * Permission is hereby granted, free of charge, to any person obtaining a copy
     7 * of this software and associated documentation files (the "Software"), to deal
     8 * in the Software without restriction, including without limitation the rights
     9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24#include <xapian.h>
     25#include <cstring>
     26#include "cjk-tokenizer.h"
     27
     28using namespace std;
     29using namespace Xapian::CJK;
     30
     31// 2E80..2EFF; CJK Radicals Supplement
     32// 3000..303F; CJK Symbols and Punctuation
     33// 3040..309F; Hiragana
     34// 30A0..30FF; Katakana
     35// 3100..312F; Bopomofo
     36// 3130..318F; Hangul Compatibility Jamo
     37// 3190..319F; Kanbun
     38// 31A0..31BF; Bopomofo Extended
     39// 31C0..31EF; CJK Strokes
     40// 31F0..31FF; Katakana Phonetic Extensions
     41// 3200..32FF; Enclosed CJK Letters and Months
     42// 3300..33FF; CJK Compatibility
     43// 3400..4DBF; CJK Unified Ideographs Extension A
     44// 4DC0..4DFF; Yijing Hexagram Symbols
     45// 4E00..9FFF; CJK Unified Ideographs
     46// A700..A71F; Modifier Tone Letters
     47// AC00..D7AF; Hangul Syllables
     48// F900..FAFF; CJK Compatibility Ideographs
     49// FE30..FE4F; CJK Compatibility Forms
     50// FF00..FFEF; Halfwidth and Fullwidth Forms
     51// 20000..2A6DF; CJK Unified Ideographs Extension B
     52// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
     53bool
     54Xapian::CJK::codepoint_is_cjk(unsigned p) {
     55    if (p < 0x2E80) return false;
     56    return (((p) >= 0x2E80 && (p) <= 0x2EFF)
     57        || ((p) >= 0x3000 && (p) <= 0x9FFF)
     58        || ((p) >= 0xA700 && (p) <= 0xA71F)
     59        || ((p) >= 0xAC00 && (p) <= 0xD7AF)
     60        || ((p) >= 0xF900 && (p) <= 0xFAFF)
     61        || ((p) >= 0xFE30 && (p) <= 0xFE4F)
     62        || ((p) >= 0xFF00 && (p) <= 0xFFEF)
     63        || ((p) >= 0x20000 && (p) <= 0x2A6DF)
     64        || ((p) >= 0x2F800 && (p) <= 0x2FA1F));
     65}
     66
     67std::string
     68Xapian::CJK::get_cjk(Xapian::Utf8Iterator &it) {
     69    string str;
     70    while (codepoint_is_cjk(*it) && it != Xapian::Utf8Iterator()) {
     71        Xapian::Unicode::append_utf8(str, *it);
     72        ++it;
     73    }
     74    return str;
     75}
     76
     77tokenizer::tokenizer() : ngram_size(2),
     78                         max_token_count(0) {
     79}
     80
     81void
     82tokenizer::tokenize(const string &str, vector<string> &token_list) {
     83    Xapian::Utf8Iterator it(str), p, end; 
     84    string new_token;
     85    while (it != Xapian::Utf8Iterator()) {
     86        if (max_token_count > 0
     87            && token_list.size() >= max_token_count) {
     88            break;
     89        }
     90        new_token.resize(0);
     91        p = it;
     92        for (unsigned i = 0; i < ngram_size; ++i) {
     93            if (max_token_count > 0
     94                && token_list.size() >= max_token_count) {
     95                break;
     96            }   
     97            if (p == end) break;
     98            if (codepoint_is_cjk(*p)) {
     99                Xapian::Unicode::append_utf8(new_token, *p);
     100                token_list.push_back(new_token);
     101            }
     102            ++p;
     103        }
     104        ++it;
     105    }
     106}
  • xapian-core/queryparser/cjk/cjk-tokenizer.h

    diff -puNr xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.h cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.h
    old new  
     1/*
     2 * Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
     3 * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
     4 * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
     5 *
     6 * Permission is hereby granted, free of charge, to any person obtaining a copy
     7 * of this software and associated documentation files (the "Software"), to deal
     8 * in the Software without restriction, including without limitation the rights
     9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10 * copies of the Software, and to permit persons to whom the Software is
     11 * furnished to do so, subject to the following conditions:
     12 *
     13 * The above copyright notice and this permission notice shall be included in
     14 * all copies or substantial portions of the Software.
     15 *
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22 * THE SOFTWARE.
     23 */
     24#ifndef __CJK_TOKENIZER_H__
     25#define __CJK_TOKENIZER_H__
     26
     27#include <string>
     28#include <vector>
     29#include <xapian.h>
     30
     31namespace Xapian {
     32namespace CJK {
     33    bool codepoint_is_cjk(unsigned codepoint);
     34    std::string get_cjk(Xapian::Utf8Iterator &it);
     35
     36    class tokenizer {
     37        unsigned int ngram_size;
     38        unsigned int max_token_count;
     39
     40    public:
     41        tokenizer();
     42        void tokenize(const std::string &str,
     43                        std::vector<std::string> &token_list);
     44    };
     45}; /* namespace CJK end */
     46}; /* namespace Xapian end */
     47
     48#endif
  • xapian-core/queryparser/Makefile.mk

    diff -puNr xapian.vanilla/xapian-core/queryparser/Makefile.mk cjk-support-patch/xapian-core/queryparser/Makefile.mk
    old new endif  
    77noinst_HEADERS +=\
    88        queryparser/queryparser_internal.h\
    99        queryparser/queryparser_token.h\
    10         queryparser/termgenerator_internal.h
     10        queryparser/termgenerator_internal.h\
     11        queryparser/cjk/cjk-tokenizer.h
    1112
    1213lemon_built_sources =\
    1314        queryparser/queryparser_internal.cc\
    lib_src +=\  
    6061        queryparser/queryparser.cc\
    6162        queryparser/queryparser_internal.cc\
    6263        queryparser/termgenerator.cc\
    63         queryparser/termgenerator_internal.cc
     64        queryparser/termgenerator_internal.cc\
     65        queryparser/cjk/cjk-tokenizer.cc
  • xapian-core/queryparser/queryparser.lemony

    diff -puNr xapian.vanilla/xapian-core/queryparser/queryparser.lemony cjk-support-patch/xapian-core/queryparser/queryparser.lemony
    old new  
    4040
    4141#include <string.h>
    4242
     43#include "cjk/cjk-tokenizer.h"
     44
    4345using namespace std;
    4446
    4547using namespace Xapian;
    class Term {  
    192194     */
    193195    Query * as_partial_query(State * state_) const;
    194196
     197    Query * as_cjk_query() const;
     198
    195199    /// Value range query.
    196200    Query as_value_range_query() const;
    197201
    Term::as_partial_query(State * state_) c  
    430434    return q;
    431435}
    432436
     437Query *
     438Term::as_cjk_query() const
     439{
     440    vector<Query> prefix_cjk;
     441    vector<string> cjk_tokens;
     442    vector<string>::iterator cjk_it;
     443    const list<string> & prefixes = prefix_info->prefixes;
     444    list<string>::const_iterator piter;
     445    CJK::tokenizer tk;
     446    tk.tokenize(name, cjk_tokens);
     447    for (cjk_it = cjk_tokens.begin(); cjk_it != cjk_tokens.end(); ++cjk_it) {
     448        for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
     449            string cjk = *piter;
     450            cjk += *cjk_it;
     451            prefix_cjk.push_back(Query(cjk, 1, pos));
     452        }
     453    }
     454    Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
     455    delete this;
     456    return q;
     457}
     458
    433459Query
    434460Term::as_value_range_query() const
    435461{
    QueryParser::Internal::parse_term(Utf8It  
    562588    }
    563589    was_acronym = !term.empty();
    564590
     591    if (term.empty() && CJK::codepoint_is_cjk(*it)) {
     592        term = CJK::get_cjk(it);
     593    }
     594
    565595    if (term.empty()) {
    566596        unsigned prevch = *it;
    567597        Unicode::append_utf8(term, prevch);
    568598        while (++it != end) {
     599            if (CJK::codepoint_is_cjk(*it)) break;
    569600            unsigned ch = *it;
    570601            if (!is_wordchar(ch)) {
    571602                // Treat a single embedded '&' or "'" or similar as a word
    phrased_term:  
    10751106            Term * term_obj = new Term(&state, term, prefix_info,
    10761107                                       unstemmed_term, stem_term, term_pos++);
    10771108
     1109            Utf8Iterator tmp_it(term);   
     1110            if (CJK::codepoint_is_cjk(*tmp_it)) {
     1111                Parse(pParser, CJKTERM, term_obj, &state);
     1112                if (it == end) break;
     1113                continue;
     1114            }
     1115       
    10781116            if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
    10791117                if (it != end) {
    10801118                    if ((flags & FLAG_WILDCARD) && *it == '*') {
    compound_term(T) ::= SYNONYM TERM(U). {  
    19351973    delete U;
    19361974}
    19371975
     // A lone CJKTERM token is a compound_term: the Term is converted to a
     // Query by as_cjk_query(), which also deletes the Term.
     1976compound_term(T) ::= CJKTERM(U). {
     1977    { T = U->as_cjk_query(); }
     1978}
     1979
    19381980// phrase - The "inside the quotes" part of a double-quoted phrase.
    19391981
    19401982%type phrase {Terms *}
  • xapian-core/queryparser/termgenerator_internal.cc

    diff -puNr xapian.vanilla/xapian-core/queryparser/termgenerator_internal.cc cjk-support-patch/xapian-core/queryparser/termgenerator_internal.cc
    old new  
    3131#include <limits>
    3232#include <string>
    3333
     34#include "cjk/cjk-tokenizer.h"
     35
    3436using namespace std;
    3537
    3638namespace Xapian {
    TermGenerator::Internal::index_text(Utf8  
    164166        }
    165167
    166168        while (true) {
    167             unsigned prevch;
     169            if (CJK::codepoint_is_cjk(*itor)) {
     170                CJK::tokenizer tk;
     171                vector<string> cjk_tokens;
     172                vector<string>::iterator cjk_it;
     173                tk.tokenize(CJK::get_cjk(itor), cjk_tokens);
     174                for (cjk_it = cjk_tokens.begin(); cjk_it != cjk_tokens.end(); ++cjk_it) {
     175                    if ((*cjk_it).size() > MAX_PROB_TERM_LENGTH) continue;
     176
     177                    if (stop_mode == STOPWORDS_IGNORE && (*stopper)((*cjk_it))) continue;
     178
     179                    if (with_positions) {
     180                        doc.add_posting(prefix + (*cjk_it), ++termpos, weight);
     181                    } else {
     182                        doc.add_term(prefix + (*cjk_it), weight);
     183                    }
     184                    if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(*cjk_it);
     185
     186                    if (!stemmer.internal.get()) continue;
     187
     188                    if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(*cjk_it))
     189                        continue;
     190
     191                    // Note, this uses the lowercased term, but that's OK as we only
     192                    // want to avoid stemming terms starting with a digit.
     193                    if (!should_stem(*cjk_it)) continue;
     194
     195                    // Add stemmed form without positional information.
     196                    string stem("Z");
     197                    stem += prefix;
     198                    stem += stemmer(*cjk_it);
     199                    doc.add_term(stem, weight);
     200                }
     201                while (true) {
     202                    if (itor == Utf8Iterator()) return;
     203                    ch = check_wordchar(*itor);
     204                    if (ch) break;
     205                    ++itor;
     206                }
     207            }   
     208            unsigned prevch;
    168209            do {
    169210                Unicode::append_utf8(term, ch);
    170211                prevch = ch;
    171                 if (++itor == Utf8Iterator()) goto endofterm;
     212                if (++itor == Utf8Iterator() || CJK::codepoint_is_cjk(*itor)) goto endofterm;
    172213                ch = check_wordchar(*itor);
    173214            } while (ch);
    174215
  • xapian-core/tests/queryparsertest.cc

    diff -puNr xapian.vanilla/xapian-core/tests/queryparsertest.cc cjk-support-patch/xapian-core/tests/queryparsertest.cc
    old new static const test test_or_queries[] = {  
    639639    { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"},
    640640    { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
    641641    { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"},
     642    // Some CJK Test added
     643    { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" },
     644    { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" },
     645    { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" },
     646    { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" },
     647    { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" },
     648    { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" },
    642649    { NULL, NULL }
     650
    643651};
    644652
    645653static bool test_queryparser1()
    static const test test_and_queries[] = {  
    709717    // Add coverage for other cases similar to the above.
    710718    { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
    711719    { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" },
     720    // Some CJK Test added
     721    { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" },
     722    { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" },
    712723    { NULL, NULL }
    713724};
    714725
    static bool test_qp_default_prefix1()  
    761772    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))");
    762773    qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A");
    763774    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))");
     775    qobj = qp.parse_query("英国 title:文森hello", 0, "A");
     776    TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))");
    764777    return true;
    765778}
    766779
  • xapian-core/tests/termgentest.cc

    diff -puNr xapian.vanilla/xapian-core/tests/termgentest.cc cjk-support-patch/xapian-core/tests/termgentest.cc
    old new struct test {  
    6161};
    6262
    6363static const test test_simple[] = {
     64   
     65    // Basic CJK Test
     66    {"", "久有归天", "久[1] 久有[2] 天[7] 归[5] 归天[6] 有[3] 有归[4]" },
     67    {"", "극지라", "극[1] 극지[2] 라[5] 지[3] 지라[4]" },
     68    {"", "ウルス アップ", "ア[6] ウ[1] ウル[2] ス[5] ッ[7] ップ[8] プ[9] ル[3] ルス[4]" },
     69   
     70    // CJK With Prefix
     71    {"prefix=XA", "发送从", "XA从[5] XA发[1] XA发送[2] XA送[3] XA送从[4]"},
     72    {"prefix=XA", "点卡思考", "XA卡[3] XA卡思[4] XA思[5] XA思考[6] XA点[1] XA点卡[2] XA考[7]"},
     73
     74    // CJK Mix
     75    {"prefix=", "インtestタ", "test[4] イ[1] イン[2] タ[5] ン[3]"},
     76    {"", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]"},
     77
    6478    // A basic test with a hyphen
    6579    { "", "simple-example", "example[2] simple[1]" },
    6680    { "cont,weight=2",