Context Navigation

Back to Ticket #180

Ticket #180: cjkv.patch

File cjkv.patch, 36.7 KB (added by Pavel Strashkin, 15 years ago)
Patch adding CJKV tokenizer support

xapian-core-1.0.16

diff -bBrupN xapian-core-1.0.16.old/Makefile.am xapian-core-1.0.16/Makefile.am

-              old
+               lib_LTLIBRARIES = libxapian.la
 libxapian_la_SOURCES =
-libxapian_la_LDFLAGS = $(XAPIAN_LDFLAGS) -no-undefined -version-info $(LIBRARY_VERSION_INFO)
+libxapian_la_LDFLAGS = $(XAPIAN_LDFLAGS) -no-undefined -version-info $(LIBRARY_VERSION_INFO) $(GLIB2_LIBS)
 if !MAINTAINER_NO_DOCS
 dist_man_MANS = xapian-config.1

xapian-core-1.0.16

diff -bBrupN xapian-core-1.0.16.old/acinclude.m4 xapian-core-1.0.16/acinclude.m4

old	new
1	1	dnl acinclude.m4
2	2	m4_include(m4/rjb_find_stlport.m4)
3	3	m4_include(m4/type_socklen_t.m4)
	4	m4_include(m4/pkg.m4)

xapian-core-1.0.16

diff -bBrupN xapian-core-1.0.16.old/configure.ac xapian-core-1.0.16/configure.ac

-              old
+               if test yes = "$use_stlport"; then
 fi
 AC_SUBST(STLPORT_CXXFLAGS)
+dnl Check for glib-2.0, which is needed to compile the CJKV tokenizer.
+PKG_CHECK_MODULES([GLIB2], [glib-2.0])
+AM_CXXFLAGS="$GLIB2_CFLAGS $AM_CXXFLAGS"
+LIBS="$GLIB2_LIBS $LIBS"
 AC_SUBST(AM_CXXFLAGS)
 dnl Restore CXXFLAGS to those the user specified or autoconf defaulted to.
-…
+               dnl There are no files generated by AC_O
 dnl and we need to ensure they exist so that the rest of configure or make
 dnl won't fail because they don't exist when srcdir != builddir.
 if test yes = "$vpath_build" ; then
-  for dir in include include/xapian languages queryparser ; do
+  for dir in include include/xapian include/xapian/cjkv languages queryparser ; do
     test -d "$dir" || mkdir "$dir"
   done
 fi

include/Makefile.mk

diff -bBrupN xapian-core-1.0.16.old/include/Makefile.mk xapian-core-1.0.16/include/Makefile.mk

-              old
+               xapianinclude_HEADERS =\
         include/xapian/types.h\
         include/xapian/unicode.h\
         include/xapian/valueiterator.h\
-        include/xapian/visibility.h
+        include/xapian/visibility.h\
+        include/xapian/cjkv/CJKVTokenizer.h
 nodist_xapianinclude_HEADERS =\
         include/xapian/version.h

include/xapian/cjkv/CJKVTokenizer.h

diff -bBrupN xapian-core-1.0.16.old/include/xapian/cjkv/CJKVTokenizer.h xapian-core-1.0.16/include/xapian/cjkv/CJKVTokenizer.h

-              old
+              new
+/*
+ *  Copyright 2007-2008 林永忠 Yung-Chung Lin
+ *  Copyright 2008-2009 Fabrice Colin
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef _DIJON_CJKVTOKENIZER_H
+#define _DIJON_CJKVTOKENIZER_H
+#include <string>
+#include <vector>
+#ifdef HAVE_UNICODE_H
+#include <unicode.h>
+#else
+#include <glib/gunicode.h>
+#define unicode_char_t gunichar
+#endif
+#ifndef DIJON_CJKV_EXPORT
+#if defined __GNUC__ && (__GNUC__ >= 4)
+  #define DIJON_CJKV_EXPORT __attribute__ ((visibility("default")))
+#else
+  #define DIJON_CJKV_EXPORT
+#endif
+#endif
+namespace Dijon
+{
+        // Splits UTF-8 text into tokens: runs of CJKV characters are emitted
+        // as n-grams, everything else as ordinary word-like tokens.
+        class DIJON_CJKV_EXPORT CJKVTokenizer
+        {
+                public:
+                        // Defaults set by the constructor: n-gram size 2, no
+                        // token limit (0), text limit of 5242880 characters.
+                        CJKVTokenizer();
+                        ~CJKVTokenizer();
+                        // Callback interface that receives tokens from tokenize().
+                        class TokensHandler
+                        {
+                                public:
+                                        TokensHandler() {}
+                                        virtual ~TokensHandler() {}
+                                        // Called once per token; is_cjkv tells whether
+                                        // the token was built from CJKV characters.
+                                        // tokenize() only counts the token towards the
+                                        // maximum token count when this returns true.
+                                        virtual bool handle_token(const std::string &tok, bool is_cjkv) = 0;
+                        };
+                        // Maximum number of CJKV characters combined into one token.
+                        void set_ngram_size(unsigned int ngram_size);
+                        unsigned int get_ngram_size(void) const;
+                        // Maximum number of accepted tokens per tokenize() call;
+                        // 0 disables the limit.
+                        void set_max_token_count(unsigned int max_token_count);
+                        unsigned int get_max_token_count(void) const;
+                        // Maximum number of characters split() will decode.
+                        void set_max_text_size(unsigned int max_text_size);
+                        unsigned int get_max_text_size(void) const;
+                        // Tokenizes str and appends every token to token_list.
+                        void tokenize(const std::string &str,
+                                std::vector<std::string> &token_list);
+                        // Tokenizes str and passes each token to handler.
+                        // break_ascii_only_on_space selects whether non-CJKV runs
+                        // are broken only on ASCII whitespace (true) or on any
+                        // non-alphanumeric ASCII character (false). The test is
+                        // applied to the per-character strings produced by
+                        // split(), in which whitespace and punctuation have
+                        // already been replaced by a single space.
+                        void tokenize(const std::string &str,
+                                TokensHandler &handler,
+                                bool break_ascii_only_on_space = false);
+                        // Appends one UTF-8 string per character of str to
+                        // token_list; whitespace and punctuation characters are
+                        // stored as " ".
+                        void split(const std::string &str,
+                                std::vector<std::string> &token_list);
+                        // Appends the decoded code point of each character of str
+                        // to token_list.
+                        void split(const std::string &str,
+                                std::vector<unicode_char_t> &token_list);
+                        // Replaces the content of token_segment with the
+                        // whitespace-separated pieces of str.
+                        void segment(const std::string &str,
+                                std::vector<std::string> &token_segment);
+                        // True if str contains at least one CJKV character.
+                        bool has_cjkv(const std::string &str);
+                        // True if str contains nothing but CJKV characters,
+                        // whitespace and punctuation.
+                        bool has_cjkv_only(const std::string &str);
+                protected:
+                        unsigned int m_nGramSize;
+                        unsigned int m_maxTokenCount;
+                        unsigned int m_maxTextSize;
+        };
+};
+#endif // _DIJON_CJKVTOKENIZER_H

xapian-core-1.0.16

diff -bBrupN xapian-core-1.0.16.old/m4/pkg.m4 xapian-core-1.0.16/m4/pkg.m4

-              old
+              new
+# pkg.m4 - Macros to locate and utilise pkg-config.            -*- Autoconf -*-
+#
+# Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+# PKG_PROG_PKG_CONFIG([MIN-VERSION])
+# ----------------------------------
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+        AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+        _pkg_min_version=m4_default([$1], [0.9.0])
+        AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+        if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+                AC_MSG_RESULT([yes])
+        else
+                AC_MSG_RESULT([no])
+                PKG_CONFIG=""
+        fi
+fi[]dnl
+])# PKG_PROG_PKG_CONFIG
+# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Check to see whether a particular set of modules exists.  Similar
+# to PKG_CHECK_MODULES(), but does not set variables or print errors.
+#
+#
+# Similar to PKG_CHECK_MODULES, make sure that the first instance of
+# this or PKG_CHECK_MODULES is called, or make sure to call
+# PKG_CHECK_EXISTS manually
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+    AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+  m4_ifval([$2], [$2], [:])
+m4_ifvaln([$3], [else
+  $3])dnl
+fi])
+# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+# ---------------------------------------------
+m4_define([_PKG_CONFIG],
+[if test -n "$PKG_CONFIG"; then
+    if test -n "$$1"; then
+        pkg_cv_[]$1="$$1"
+    else
+        PKG_CHECK_EXISTS([$3],
+                         [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
+                         [pkg_failed=yes])
+    fi
+else
+        pkg_failed=untried
+fi[]dnl
+])# _PKG_CONFIG
+# _PKG_SHORT_ERRORS_SUPPORTED
+# -----------------------------
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi[]dnl
+])# _PKG_SHORT_ERRORS_SUPPORTED
+# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+# [ACTION-IF-NOT-FOUND])
+#
+#
+# Note that if there is a possibility the first call to
+# PKG_CHECK_MODULES might not happen, you should be sure to include an
+# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+#
+#
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_MODULES],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+_PKG_CONFIG([$1][_LIBS], [libs], [$2])
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+if test $pkg_failed = yes; then
+        _PKG_SHORT_ERRORS_SUPPORTED
+        if test $_pkg_short_errors_supported = yes; then
+                $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"`
+        else
+                $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"`
+        fi
+        # Put the nasty error message in config.log where it belongs
+        echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+        ifelse([$4], , [AC_MSG_ERROR(dnl
+[Package requirements ($2) were not met:
+$$1_PKG_ERRORS
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+_PKG_TEXT
+])],
+                [$4])
+elif test $pkg_failed = untried; then
+        ifelse([$4], , [AC_MSG_FAILURE(dnl
+[The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+_PKG_TEXT
+To get pkg-config, see <http://www.freedesktop.org/software/pkgconfig>.])],
+                [$4])
+else
+        $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+        $1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+        AC_MSG_RESULT([yes])
+        ifelse([$3], , :, [$3])
+fi[]dnl
+])# PKG_CHECK_MODULES

queryparser/CJKVTokenizer.cc

diff -bBrupN xapian-core-1.0.16.old/queryparser/CJKVTokenizer.cc xapian-core-1.0.16/queryparser/CJKVTokenizer.cc

-              old
+              new
+/*
+ *  Copyright 2007-2008 林永忠 Yung-Chung Lin
+ *  Copyright 2008-2009 Fabrice Colin
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <ctype.h>
+#include <string.h>
+#include <iostream>
+#include <xapian/cjkv/CJKVTokenizer.h>
+#ifndef HAVE_UNICODE_H
+// Stand-in used when <unicode.h> is not available (HAVE_UNICODE_H undefined):
+// the glib-based replacements below need no initialisation, so this does
+// nothing.
+static void unicode_init(void)
+{
+}
+// Decodes the UTF-8 character at p (a nul-terminated string) into *result
+// and returns a pointer to the character that follows, or NULL when p does
+// not start a valid UTF-8 sequence.
+// g_utf8_get_char() has undefined results on malformed input and never
+// reports an error, so the validating decoder is used instead: it yields
+// (gunichar)-1 for an invalid sequence and (gunichar)-2 for a truncated one.
+static char *unicode_get_utf8(const char *p, unicode_char_t *result)
+{
+        *result = g_utf8_get_char_validated(p, -1);
+        if ((*result == (unicode_char_t)-1) ||
+                (*result == (unicode_char_t)-2))
+        {
+                return NULL;
+        }
+        return g_utf8_next_char(p);
+}
+// Returns what g_utf8_strlen() reports for p when given max as its byte
+// limit, narrowed to int.
+static int unicode_strlen(const char *p, int max)
+{
+        return (int)g_utf8_strlen(p, (gssize)max);
+}
+// Replacement for unicode_ispunct() when <unicode.h> is unavailable:
+// 1 if glib classifies c as punctuation, 0 otherwise.
+static int unicode_ispunct(unicode_char_t c)
+{
+        return g_unichar_ispunct(c) ? 1 : 0;
+}
+// Replacement for unicode_isspace() when <unicode.h> is unavailable:
+// 1 if glib classifies c as whitespace, 0 otherwise.
+static int unicode_isspace(unicode_char_t c)
+{
+        return g_unichar_isspace(c) ? 1 : 0;
+}
+#endif
+// 2E80..2EFF; CJK Radicals Supplement
+// 3000..303F; CJK Symbols and Punctuation
+// 3040..309F; Hiragana
+// 30A0..30FF; Katakana
+// 3100..312F; Bopomofo
+// 3130..318F; Hangul Compatibility Jamo
+// 3190..319F; Kanbun
+// 31A0..31BF; Bopomofo Extended
+// 31C0..31EF; CJK Strokes
+// 31F0..31FF; Katakana Phonetic Extensions
+// 3200..32FF; Enclosed CJK Letters and Months
+// 3300..33FF; CJK Compatibility
+// 3400..4DBF; CJK Unified Ideographs Extension A
+// 4DC0..4DFF; Yijing Hexagram Symbols
+// 4E00..9FFF; CJK Unified Ideographs
+// A700..A71F; Modifier Tone Letters
+// AC00..D7AF; Hangul Syllables
+// F900..FAFF; CJK Compatibility Ideographs
+// FE30..FE4F; CJK Compatibility Forms
+// FF00..FFEF; Halfwidth and Fullwidth Forms
+// 20000..2A6DF; CJK Unified Ideographs Extension B
+// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
+// True when p falls in one of the Unicode blocks listed above.
+// Despite the name, p is a decoded code point, not UTF-8 encoded bytes.
+// p is evaluated many times, so it must not have side effects.
+#define UTF8_IS_CJKV(p)                                                  \
+    (((p) >= 0x2E80 && (p) <= 0x2EFF)                                   \
+     || ((p) >= 0x3000 && (p) <= 0x303F)                                \
+     || ((p) >= 0x3040 && (p) <= 0x309F)                                \
+     || ((p) >= 0x30A0 && (p) <= 0x30FF)                                \
+     || ((p) >= 0x3100 && (p) <= 0x312F)                                \
+     || ((p) >= 0x3130 && (p) <= 0x318F)                                \
+     || ((p) >= 0x3190 && (p) <= 0x319F)                                \
+     || ((p) >= 0x31A0 && (p) <= 0x31BF)                                \
+     || ((p) >= 0x31C0 && (p) <= 0x31EF)                                \
+     || ((p) >= 0x31F0 && (p) <= 0x31FF)                                \
+     || ((p) >= 0x3200 && (p) <= 0x32FF)                                \
+     || ((p) >= 0x3300 && (p) <= 0x33FF)                                \
+     || ((p) >= 0x3400 && (p) <= 0x4DBF)                                \
+     || ((p) >= 0x4DC0 && (p) <= 0x4DFF)                                \
+     || ((p) >= 0x4E00 && (p) <= 0x9FFF)                                \
+     || ((p) >= 0xA700 && (p) <= 0xA71F)                                \
+     || ((p) >= 0xAC00 && (p) <= 0xD7AF)                                \
+     || ((p) >= 0xF900 && (p) <= 0xFAFF)                                \
+     || ((p) >= 0xFE30 && (p) <= 0xFE4F)                                \
+     || ((p) >= 0xFF00 && (p) <= 0xFFEF)                                \
+     || ((p) >= 0x20000 && (p) <= 0x2A6DF)                              \
+     || ((p) >= 0x2F800 && (p) <= 0x2FA1F))
+using namespace std;
+using namespace Dijon;
+// Splits str at every character contained in delim and stores the non-empty
+// pieces in list, which is emptied first.
+// The input is scanned once with a moving start offset; re-copying the
+// remaining tail after each delimiter would make long inputs quadratic.
+static void _split_string(string str, const string &delim,
+        vector<string> &list)
+{
+        list.clear();
+        string::size_type start = 0;
+        string::size_type cut_at = 0;
+        while ((cut_at = str.find_first_of(delim, start)) != string::npos)
+        {
+                // Skip empty pieces produced by adjacent delimiters
+                if (cut_at > start)
+                {
+                        list.push_back(str.substr(start, cut_at - start));
+                }
+                start = cut_at + 1;
+        }
+        // Whatever follows the last delimiter
+        if (start < str.length())
+        {
+                list.push_back(str.substr(start));
+        }
+}
+// Writes uchar into p as a nul-terminated UTF-8 sequence and returns p.
+// p must have room for sizeof(unicode_char_t) + 1 bytes; NULL is returned
+// for a NULL p. Whitespace and punctuation characters are written as a
+// single space. Values of 0x200000 and above produce an empty string.
+static inline unsigned char *_unicode_to_char(unicode_char_t &uchar,
+        unsigned char *p)
+{
+        if (!p)
+        {
+                return NULL;
+        }
+        // Zero everything first so the result is always terminated
+        memset(p, 0, sizeof(unicode_char_t) + 1);
+        if (unicode_isspace(uchar) || unicode_ispunct(uchar))
+        {
+                p[0] = ' ';
+                return p;
+        }
+        if (uchar < 0x80)
+        {
+                // Plain ASCII
+                p[0] = uchar;
+                return p;
+        }
+        if (uchar < 0x800)
+        {
+                // Two bytes
+                p[0] = 0xC0 | (uchar >> 6);
+                p[1] = 0x80 | (uchar & 0x3F);
+                return p;
+        }
+        if (uchar < 0x10000)
+        {
+                // Three bytes
+                p[0] = 0xE0 | (uchar >> 12);
+                p[1] = 0x80 | ((uchar >> 6) & 0x3F);
+                p[2] = 0x80 | (uchar & 0x3F);
+                return p;
+        }
+        if (uchar < 0x200000)
+        {
+                // Four bytes
+                p[0] = 0xF0 | (uchar >> 18);
+                p[1] = 0x80 | ((uchar >> 12) & 0x3F);
+                p[2] = 0x80 | ((uchar >> 6) & 0x3F);
+                p[3] = 0x80 | (uchar & 0x3F);
+        }
+        return p;
+}
+// TokensHandler that appends every token it receives, in order, to a
+// vector owned by the caller.
+class VectorTokensHandler : public CJKVTokenizer::TokensHandler
+{
+        public:
+                VectorTokensHandler(vector<string> &token_list) :
+                        CJKVTokenizer::TokensHandler(),
+                        m_tokens(token_list)
+                {
+                }
+                virtual ~VectorTokensHandler()
+                {
+                }
+                // Stores tok whether or not it is CJKV, and always reports
+                // success.
+                virtual bool handle_token(const string &tok, bool /* is_cjkv */)
+                {
+                        m_tokens.push_back(tok);
+                        return true;
+                }
+        protected:
+                vector<string> &m_tokens;
+};
+// Defaults: bigrams, no limit on the number of tokens (0), and at most
+// 5242880 characters decoded by split().
+CJKVTokenizer::CJKVTokenizer() :
+        m_nGramSize(2),
+        m_maxTokenCount(0),
+        m_maxTextSize(5242880)
+{
+        unicode_init();
+}
+CJKVTokenizer::~CJKVTokenizer()
+{
+}
+// Sets the n-gram size used by tokenize() for CJKV characters.
+void CJKVTokenizer::set_ngram_size(unsigned int ngram_size)
+{
+        m_nGramSize = ngram_size;
+}
+unsigned int CJKVTokenizer::get_ngram_size(void) const
+{
+        return m_nGramSize;
+}
+// Sets the maximum number of accepted tokens; tokenize() treats 0 as
+// "no limit".
+void CJKVTokenizer::set_max_token_count(unsigned int max_token_count)
+{
+        m_maxTokenCount = max_token_count;
+}
+unsigned int CJKVTokenizer::get_max_token_count(void) const
+{
+        return m_maxTokenCount;
+}
+// Sets the number of characters after which split() stops decoding.
+void CJKVTokenizer::set_max_text_size(unsigned int max_text_size)
+{
+        m_maxTextSize = max_text_size;
+}
+unsigned int CJKVTokenizer::get_max_text_size(void) const
+{
+        return m_maxTextSize;
+}
+// Convenience overload: tokenizes str with the default ASCII breaking rule
+// and appends every token to token_list.
+void CJKVTokenizer::tokenize(const string &str, vector<string> &token_list)
+{
+        VectorTokensHandler handler(token_list);
+        tokenize(str, handler);
+}
+// Tokenizes str and hands each token to handler.
+// For every CJKV character, the tokens made of that character followed by
+// up to m_nGramSize - 1 further CJKV characters are emitted, shortest first.
+// Other characters are accumulated into one token until a breaking ASCII
+// character or a CJKV character is met. Processing stops once
+// m_maxTokenCount tokens have been accepted by the handler (0 = no limit).
+void CJKVTokenizer::tokenize(const string &str, TokensHandler &handler,
+        bool break_ascii_only_on_space)
+{
+        string token_str;
+        vector<string> temp_token_list;
+        vector<unicode_char_t> temp_uchar_list;
+        unsigned int tokens_count = 0;
+        // Two parallel per-character views of the text: UTF-8 strings (with
+        // whitespace and punctuation turned into " ") and raw code points
+        split(str, temp_token_list);
+        split(str, temp_uchar_list);
+        for (unsigned int i = 0; i < temp_token_list.size();)
+        {
+                if ((m_maxTokenCount > 0) &&
+                        (tokens_count >= m_maxTokenCount))
+                {
+                        break;
+                }
+                token_str.resize(0);
+                if (UTF8_IS_CJKV(temp_uchar_list[i]))
+                {
+                        // Grow the token one character at a time over the
+                        // n-gram window and emit it after each CJKV character
+                        for (unsigned int j = i; j < i + m_nGramSize; j++)
+                        {
+                                if ((m_maxTokenCount > 0) &&
+                                        (tokens_count >= m_maxTokenCount))
+                                {
+                                        break;
+                                }
+                                if (j == temp_token_list.size())
+                                {
+                                        break;
+                                }
+                                // Non-CJKV characters inside the window are
+                                // skipped; they do not end the window
+                                if (UTF8_IS_CJKV(temp_uchar_list[j]))
+                                {
+                                        token_str += temp_token_list[j];
+                                        if (handler.handle_token(token_str, true) == true)
+                                        {
+                                                ++tokens_count;
+                                        }
+                                }
+                        }
+                        // The next window starts at the following character
+                        i++;
+                }
+                else
+                {
+                        unsigned int j = i;
+                        while (j < temp_token_list.size())
+                        {
+                                unsigned char *p = (unsigned char*) temp_token_list[j].c_str();
+                                bool break_ascii = false;
+                                // Only the first byte is examined
+                                if (isascii((int)p[0]) != 0)
+                                {
+                                        if (break_ascii_only_on_space == true)
+                                        {
+                                                if (isspace((int)p[0]) != 0)
+                                                {
+                                                        break_ascii = true;
+                                                }
+                                        }
+                                        else if (isalnum((int)p[0]) == 0)
+                                        {
+                                                break_ascii = true;
+                                        }
+                                }
+                                if (break_ascii == true)
+                                {
+                                        // The breaking character is consumed
+                                        // without being added to the token
+                                        j++;
+                                        break;
+                                }
+                                else if (UTF8_IS_CJKV(temp_uchar_list[j]))
+                                {
+                                        // Left in place for the CJKV branch
+                                        break;
+                                }
+                                token_str += temp_token_list[j];
+                                j++;
+                        }
+                        i = j;
+                        if ((m_maxTokenCount > 0) &&
+                                (tokens_count >= m_maxTokenCount))
+                        {
+                                break;
+                        }
+                        if (token_str.length() > 0)
+                        {
+                                if (handler.handle_token(token_str, false) == true)
+                                {
+                                        ++tokens_count;
+                                }
+                        }
+                }
+        }
+}
+// Appends one nul-terminated UTF-8 string per character of str to
+// token_list, as produced by _unicode_to_char(). Stops at the first
+// character that cannot be decoded or once m_maxTextSize characters have
+// been stored. token_list is not cleared first.
+void CJKVTokenizer::split(const string &str, vector<string> &token_list)
+{
+        const char *cursor = str.c_str();
+        const int char_count = unicode_strlen(cursor, str.length());
+        unsigned char utf8[sizeof(unicode_char_t) + 1];
+        unicode_char_t code_point;
+        int index = 0;
+        while (index < char_count)
+        {
+                cursor = unicode_get_utf8(cursor, &code_point);
+                if ((cursor == NULL) || (index >= m_maxTextSize))
+                {
+                        break;
+                }
+                token_list.push_back((const char*)_unicode_to_char(code_point, utf8));
+                ++index;
+        }
+}
+// Appends the code point of each character of str to token_list. Stops at
+// the first character that cannot be decoded or once m_maxTextSize
+// characters have been stored. token_list is not cleared first.
+void CJKVTokenizer::split(const string &str, vector<unicode_char_t> &token_list)
+{
+        const char *cursor = str.c_str();
+        const int char_count = unicode_strlen(cursor, str.length());
+        unicode_char_t code_point;
+        int index = 0;
+        while (index < char_count)
+        {
+                cursor = unicode_get_utf8(cursor, &code_point);
+                if ((cursor == NULL) || (index >= m_maxTextSize))
+                {
+                        break;
+                }
+                token_list.push_back(code_point);
+                ++index;
+        }
+}
+// Replaces the content of token_segment with the pieces of str that are
+// separated by whitespace; empty pieces are dropped.
+void CJKVTokenizer::segment(const string &str, vector<string> &token_segment)
+{
+        string onlySpacesStr(str);
+        // Turn every kind of whitespace into a plain space first
+        for (string::iterator it = onlySpacesStr.begin(); it != onlySpacesStr.end(); ++it)
+        {
+                // isspace() is only defined for values representable as
+                // unsigned char (or EOF); bytes >= 0x80 of a UTF-8 sequence
+                // are negative where char is signed, so convert first
+                if (isspace((unsigned char)*it) != 0)
+                {
+                        *it = ' ';
+                }
+        }
+        _split_string(onlySpacesStr, " ", token_segment);
+}
+// Returns true as soon as one character of str lies in a CJKV block.
+bool CJKVTokenizer::has_cjkv(const string &str)
+{
+        vector<unicode_char_t> code_points;
+        split(str, code_points);
+        for (vector<unicode_char_t>::const_iterator it = code_points.begin();
+                it != code_points.end(); ++it)
+        {
+                if (UTF8_IS_CJKV(*it))
+                {
+                        return true;
+                }
+        }
+        return false;
+}
+// Returns false as soon as str contains a character that is neither CJKV
+// nor rendered as a space by _unicode_to_char() (which is what it does for
+// whitespace and punctuation); returns true otherwise.
+bool CJKVTokenizer::has_cjkv_only(const string &str)
+{
+        vector<unicode_char_t> code_points;
+        split(str, code_points);
+        unsigned char utf8[sizeof(unicode_char_t) + 1];
+        for (vector<unicode_char_t>::iterator it = code_points.begin();
+                it != code_points.end(); ++it)
+        {
+                if (UTF8_IS_CJKV(*it))
+                {
+                        continue;
+                }
+                _unicode_to_char(*it, utf8);
+                if (isspace((int)utf8[0]) == 0)
+                {
+                        return false;
+                }
+        }
+        return true;
+}

queryparser/Makefile.mk

diff -bBrupN xapian-core-1.0.16.old/queryparser/Makefile.mk xapian-core-1.0.16/queryparser/Makefile.mk

old	new	libxapian_la_SOURCES +=\
60	60	queryparser/queryparser.cc\
61	61	queryparser/queryparser_internal.cc\
62	62	queryparser/termgenerator.cc\
63		queryparser/termgenerator_internal.cc
	63	queryparser/termgenerator_internal.cc\
	64	queryparser/CJKVTokenizer.cc

queryparser/queryparser_internal.cc

diff -bBrupN xapian-core-1.0.16.old/queryparser/queryparser_internal.cc xapian-core-1.0.16/queryparser/queryparser_internal.cc

-              old
+              new
 #include "queryparser_internal.h"
 #include <xapian/error.h>
 #include <xapian/unicode.h>
+#include <xapian/cjkv/CJKVTokenizer.h>
 #include "stringutils.h"
 // Include the list of token values lemon generates.
-…
+               QueryParser::Internal::parse_term(Utf8It
     return term;
+}
+// Token handler that rebuilds a query string while the CJKV tokenizer walks
+// over it: each run of CJKV tokens is written out space-separated and, by
+// default, enclosed in brackets; the non-CJKV text between runs is copied
+// from the original query.
+class QueryModifier: public Dijon::CJKVTokenizer::TokensHandler
+{
+public:
+    // How a run of CJKV tokens is enclosed in the modified query.
+    typedef enum {
+        NONE     = 0,
+        BRACKETS = 1
+    } CJKVWrap;
+    // query is the original query string. nGramSize is stored but not read
+    // anywhere in this class.
+    QueryModifier(const string &query, unsigned int nGramSize):
+        m_query(query),
+        m_pos(0),
+        m_wrap(BRACKETS),
+        m_wrapped(false),
+        m_nGramCount(0),
+        m_nGramSize(nGramSize),
+        m_tokensCount(0),
+        m_hasCJKV(false),
+        m_hasNonCJKV(false)
+    {
+    }
+    virtual ~QueryModifier()
+    {
+    }
+    // Receives the next token. Returns false for an empty token, or when a
+    // non-CJKV token or the first n-gram of a run cannot be located in the
+    // original query at or after the current position.
+    virtual bool handle_token(const string &tok, bool is_cjkv)
+    {
+        if (tok.empty()) {
+            return false;
+        }
+        // Where is this token in the original query ?
+        string::size_type tokPos = m_query.find(tok, m_pos);
+        // Counted even if the token is rejected below
+        ++m_tokensCount;
+        // Is this CJKV ?
+        if (!is_cjkv) {
+            char lastChar = tok[tok.length() - 1];
+            if (tokPos == string::npos) {
+                // This should have been found
+                return false;
+            }
+            // A non-CJKV token ends the current run of CJKV tokens, if any
+            if (m_nGramCount > 0) {
+                wrapClose();
+                m_nGramCount = 0;
+                m_pos = tokPos;
+            }
+            m_currentFilter.clear();
+            // The token's last character decides how the CJKV run that
+            // follows is wrapped
+            if (lastChar == '"') {
+                // It's a quoted string
+                m_wrap = NONE;
+            } else if (lastChar == ':') {
+                // It's a filter
+                m_wrap = NONE;
+                m_currentFilter = tok;
+            } else {
+                m_wrap = BRACKETS;
+            }
+            if (m_currentFilter.empty()) {
+                m_hasNonCJKV = true;
+            }
+            // Return right away
+            // The token text is not appended here; it is copied from the
+            // original query later, together with the text preceding the
+            // next CJKV run or the remainder of the query
+            return true;
+        }
+        // First n-gram ?
+        if (m_nGramCount == 0) {
+            if (tokPos == string::npos) {
+                // That's definitely not right
+                return false;
+            }
+            // Append non-CJKV text that precedes and start wrapping CJKV tokens
+            if (tokPos > m_pos) {
+                m_modifiedQuery += " " + m_query.substr(m_pos, tokPos - m_pos);
+            }
+            // NOTE(review): this value is overwritten below, since tokPos is
+            // known to be valid in this branch
+            m_pos += tok.length();
+            wrapOpen();
+        } else {
+            m_modifiedQuery += " ";
+            // Repeat the filter prefix in front of each further n-gram
+            if (!m_currentFilter.empty()) {
+                m_modifiedQuery += m_currentFilter;
+            }
+        }
+        m_modifiedQuery += tok;
+        if (tokPos != string::npos) {
+            m_pos = tokPos + tok.length();
+        }
+        ++m_nGramCount;
+        m_hasCJKV = true;
+        return true;
+    }
+    // Number of non-empty tokens passed to handle_token().
+    unsigned int get_tokens_count(void) const
+    {
+        return m_tokensCount;
+    }
+    // Appends what is left of the original query, closes any open wrap and
+    // returns the rebuilt query. pureCJKV is set to true when CJKV tokens
+    // were seen and every non-CJKV token was a filter prefix.
+    string get_modified_query(bool &pureCJKV)
+    {
+        // Anything left ?
+        // NOTE(review): with "length() - 1" a single trailing character is
+        // not appended, and the subtraction wraps around for an empty
+        // query - confirm this is intended
+        if (m_pos < m_query.length() - 1) {
+            m_modifiedQuery += " " + m_query.substr(m_pos);
+        }
+        wrapClose();
+        if (m_hasCJKV && !m_hasNonCJKV) {
+            pureCJKV = true;
+        } else {
+            pureCJKV = false;
+        }
+        return m_modifiedQuery;
+    }
+protected:
+    // Original query and the query being rebuilt
+    string m_query;
+    string m_modifiedQuery;
+    // Offset in m_query up to which text has been consumed
+    string::size_type m_pos;
+    // Wrapping style for the current run, and whether a wrap is open
+    CJKVWrap m_wrap;
+    bool m_wrapped;
+    // Last filter prefix seen (a token ending in ':'), empty if none
+    string m_currentFilter;
+    // Number of CJKV tokens in the current run
+    unsigned int m_nGramCount;
+    unsigned int m_nGramSize;
+    unsigned int m_tokensCount;
+    bool m_hasCJKV;
+    bool m_hasNonCJKV;
+    // Starts a run of CJKV tokens, emitting " (" when wrapping in brackets.
+    void wrapOpen(void)
+    {
+        switch (m_wrap) {
+            case BRACKETS:
+                m_modifiedQuery += " (";
+                break;
+            case NONE:
+            default:
+                break;
+        }
+        m_wrapped = true;
+    }
+    // Ends the current run, emitting ')' when wrapping in brackets; does
+    // nothing if no run is open.
+    void wrapClose(void)
+    {
+        if (!m_wrapped) {
+            return;
+        }
+        // Finish wrapping CJKV tokens
+        switch (m_wrap) {
+            case BRACKETS:
+                m_modifiedQuery += ')';
+                break;
+            case NONE:
+            default:
+                break;
+        }
+        m_wrapped = false;
+    }
+}; // class QueryModifier: public Dijon::CJKVTokenizer::TokensHandler
 Query
-QueryParser::Internal::parse_query(const string &qs, unsigned flags,
+QueryParser::Internal::parse_query(const string &iqs, unsigned flags,
                                    const string &default_prefix)
+{
+    Dijon::CJKVTokenizer tokenizer;
+    string qs(iqs);
+    // Modifying the query is necessary if it's CJKV
+    if (tokenizer.has_cjkv(qs)) {
+        QueryModifier handler(qs, tokenizer.get_ngram_size());
+        tokenizer.tokenize(qs, handler, true);
+        // We can disable stemming and spelling correction for pure CJKV queries
+        bool pureCJKV = false;
+        qs = handler.get_modified_query(pureCJKV);
+    }
     yyParser * pParser = ParseAlloc();
     // Set value_ranges if we may have to handle value ranges in the query.

queryparser/termgenerator_internal.cc

diff -bBrupN xapian-core-1.0.16.old/queryparser/termgenerator_internal.cc xapian-core-1.0.16/queryparser/termgenerator_internal.cc

-              old
+              new
 #include <xapian/document.h>
 #include <xapian/queryparser.h>
 #include <xapian/unicode.h>
+#include <xapian/cjkv/CJKVTokenizer.h>
 #include "stringutils.h"
-…
+               using namespace std;
 namespace Xapian {
+// FIXME: add API for this:
+#define STOPWORDS_NONE 0
+#define STOPWORDS_IGNORE 1
+#define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
 // Put a limit on the size of terms to help prevent the index being bloated
 // by useless junk terms.
 static const unsigned int MAX_PROB_TERM_LENGTH = 64;
-…
+               should_stem(const std::string & term)
     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
+}
+// Tell whether a term may be indexed at all: it must be no longer than
+// MAX_PROB_TERM_LENGTH bytes and, in STOPWORDS_IGNORE mode with a stopper
+// available, must not be reported as a stop word by that stopper.
+inline bool
+should_index(const std::string &term, int stop_mode, const Stopper *stopper)
+{
+    // The length test comes first so the stopper is never consulted for
+    // over-long terms.
+    return term.size() <= MAX_PROB_TERM_LENGTH &&
+           !(stop_mode == STOPWORDS_IGNORE && stopper && (*stopper)(term));
+}
 inline unsigned check_infix(unsigned ch) {
     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
         // Unicode includes all these except '&' in it's word boundary rules,
-…
+               inline unsigned check_suffix(unsigned ch
     return 0;
+}
+// FIXME: add API for this:
+#define STOPWORDS_NONE 0
+#define STOPWORDS_IGNORE 1
+#define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
+// Receives the tokens produced by Dijon::CJKVTokenizer and adds each of them
+// to the document through TermGenerator::Internal::index_term().
+//
+// A non-CJKV token always advances the term position by one. A CJKV token
+// advances it only when it is the m_gram_size-th token of the current run of
+// consecutive CJKV tokens; the other tokens of the run leave the position
+// unchanged.
+class TokensIndexer: public Dijon::CJKVTokenizer::TokensHandler
+{
+public:
+    // generator      - term generator whose index_term() receives every token
+    // prefix         - term prefix; held by reference, must outlive this object
+    // weight         - weight passed to index_term(); held by reference, must
+    //                  outlive this object
+    // with_positions - forwarded unchanged to index_term()
+    // stop_mode      - one of the STOPWORDS_* modes, forwarded to index_term()
+    // gram_size      - n-gram size of the tokenizer; 0 is treated as 1 so the
+    //                  modulo in handle_token() can never divide by zero
+    TokensIndexer(
+        TermGenerator::Internal &generator,
+        const string &prefix,
+        termcount &weight,
+        bool with_positions,
+        int stop_mode,
+        unsigned int gram_size
+    ):
+        m_generator(generator),
+        m_prefix(prefix),
+        m_weight(weight),
+        m_with_positions(with_positions),
+        m_stop_mode(stop_mode),
+        m_gram_size(gram_size ? gram_size : 1),
+        m_gram_count(0),
+        m_hasCJKV(false)
+    {
+    }
+    virtual ~TokensIndexer()
+    {
+    }
+    // Index one token. Returns false when index_term() refused the term,
+    // true once the term has been handed to the document.
+    virtual bool handle_token(const string &token, bool is_cjkv)
+    {
+        termcount termpos_inc = 0;
+        if (is_cjkv) {
+            if ((m_gram_count + 1) % m_gram_size == 0) {
+                termpos_inc = 1;
+            }
+        } else {
+            termpos_inc = 1;
+        }
+        string term(to_lower(token));
+        // NOTE(review): if the tokenizer treats a false return as "stop
+        // tokenizing", one rejected term (e.g. an over-long one) ends indexing
+        // of the rest of the text - confirm against CJKVTokenizer::tokenize().
+        if (m_generator.index_term(m_prefix, term, m_weight, m_stop_mode, m_with_positions, termpos_inc) == false) {
+            return false;
+        }
+        if (is_cjkv) {
+            m_gram_count++;
+            m_hasCJKV = true;
+        } else {
+            // A non-CJKV token ends the current run of CJKV n-grams.
+            m_gram_count = 0;
+        }
+        return true;
+    }
+protected:
+    // Return a lowercased copy of str. The string is decoded and re-encoded
+    // one Unicode code point at a time, so multi-byte (non-ASCII) upper-case
+    // letters are folded as well instead of being processed byte by byte.
+    string to_lower(const string &str)
+    {
+        string out;
+        out.reserve(str.size());
+        for (Utf8Iterator i(str); i != Utf8Iterator(); ++i) {
+            Unicode::append_utf8(out, Unicode::tolower(*i));
+        }
+        return out;
+    }
+    TermGenerator::Internal &m_generator;
+    const string &m_prefix;
+    termcount &m_weight;
+    bool m_with_positions;
+    int m_stop_mode;
+    unsigned int m_gram_size;  // never 0, see constructor
+    unsigned int m_gram_count; // consecutive CJKV tokens indexed so far
+    bool m_hasCJKV;            // set once any CJKV token has been indexed
+};
 void
 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
                                     const string & prefix, bool with_positions)
+{
+    int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
+    if (!stopper) stop_mode = STOPWORDS_NONE;
+    // set default stop words mode depend on
+    // stopper handler availability
+    int stop_mode = stopper ? STOPWORDS_INDEX_UNSTEMMED_ONLY : STOPWORDS_NONE;
+    const char *raw = itor.raw();
+    if (raw) {
+        Dijon::CJKVTokenizer tokenizer;
+        string text(raw);
+        // is there CJKV chars? then we should use CJKVTokenizer
+        // instead of builtin TermGenerator
+        if (tokenizer.has_cjkv(text)) {
+            TokensIndexer handler(*this, prefix, weight, with_positions, stop_mode, tokenizer.get_ngram_size());
+            tokenizer.tokenize(text, handler);
+            return;
+        }
+    }
     while (true) {
         // Advance to the start of the next term.
         unsigned ch;
         while (true) {
+            if (itor == Utf8Iterator()) return;
+            if (itor == Utf8Iterator()) {
+                return;
+            }
             ch = check_wordchar(*itor);
+            if (ch) break;
+            if (ch) {
+                break;
+            }
             ++itor;
+        }
-…
+               TermGenerator::Internal::index_text(Utf8
             do {
                 Unicode::append_utf8(term, ch);
                 prevch = ch;
+                if (++itor == Utf8Iterator()) goto endofterm;
+                if (++itor == Utf8Iterator()) {
+                    goto endofterm;
+                }
                 ch = check_wordchar(*itor);
             } while (ch);
             Utf8Iterator next(itor);
             ++next;
+            if (next == Utf8Iterator()) break;
+            if (next == Utf8Iterator()) {
+                break;
+            }
             unsigned nextch = check_wordchar(*next);
+            if (!nextch) break;
+            if (!nextch) {
+                break;
+            }
             unsigned infix_ch = *itor;
             if (is_digit(prevch) && is_digit(*next)) {
                 infix_ch = check_infix_digit(infix_ch);
-…
+               TermGenerator::Internal::index_text(Utf8
                 // Handle things like '&' in AT&T, apostrophes, etc.
                 infix_ch = check_infix(infix_ch);
+            }
+            if (!infix_ch) break;
+            if (!infix_ch) {
+                break;
+            }
             Unicode::append_utf8(term, infix_ch);
             ch = nextch;
             itor = next;
-…
+               TermGenerator::Internal::index_text(Utf8
                     break;
+                }
                 Unicode::append_utf8(term, ch);
+                if (++itor == Utf8Iterator()) goto endofterm;
+                if (++itor == Utf8Iterator()) {
+                    goto endofterm;
+                }
+            }
+        }
 endofterm:
+        if (term.size() > MAX_PROB_TERM_LENGTH) continue;
+        index_term(prefix, term, weight, stop_mode, with_positions);
+    } // while(true)
+}
+        if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
+// Add a single term to the document.
+//
+// Returns false, adding nothing, when should_index() rejects the term;
+// returns true in every other case.
+//
+// With with_positions set, termpos is advanced by termpos_inc and the term is
+// posted at the resulting position; otherwise it is added without position
+// information. The term is also added to the spelling dictionary when
+// FLAG_SPELLING is set and the prefix is empty. A stemmed form ("Z" + prefix +
+// stem) is added as well unless no stemmer is set, the term is a stop word in
+// STOPWORDS_INDEX_UNSTEMMED_ONLY mode, or should_stem() says not to stem it.
+bool
+TermGenerator::Internal::index_term(const string &prefix, const string &term, termcount &weight, int stop_mode, bool with_positions, termcount termpos_inc)
+{
+    if (should_index(term, stop_mode, stopper) == false) {
+        return false;
+    }
         if (with_positions) {
+            doc.add_posting(prefix + term, ++termpos, weight);
+        termpos += termpos_inc;
+        doc.add_posting(prefix + term, termpos, weight);
         } else {
             doc.add_term(prefix + term, weight);
+        }
-        if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
+        if (!stemmer.internal.get()) continue;
+    if ((flags & FLAG_SPELLING) && prefix.empty()) {
+        db.add_spelling(term);
+    }
+        if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
+            continue;
+    if (!stemmer.internal.get()) {
+        return true;
+    }
+    if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && stopper && (*stopper)(term)) {
+        return true;
+    }
         // Note, this uses the lowercased term, but that's OK as we only
         // want to avoid stemming terms starting with a digit.
+        if (!should_stem(term)) continue;
+    if (should_stem(term) == false) {
+        return true;
+    }
         // Add stemmed form without positional information.
         string stem("Z");
         stem += prefix;
         stem += stemmer(term);
         doc.add_term(stem, weight);
+    }
+}
+    return true;
+}
+} // namespace Xapian

queryparser/termgenerator_internal.h

diff -bBrupN xapian-core-1.0.16.old/queryparser/termgenerator_internal.h xapian-core-1.0.16/queryparser/termgenerator_internal.h

-              old
+              new
 #ifndef XAPIAN_INCLUDED_TERMGENERATOR_INTERNAL_H
 #define XAPIAN_INCLUDED_TERMGENERATOR_INTERNAL_H
+#include <string>
 #include <xapian/base.h>
 #include <xapian/database.h>
 #include <xapian/document.h>
 …
 namespace Xapian {
 class Stopper;
+using std::string;
 class TermGenerator::Internal : public Xapian::Internal::RefCntBase {
     friend class TermGenerator;
-…
+               class TermGenerator::Internal : public X
   public:
     Internal() : stopper(NULL), termpos(0),
         flags(TermGenerator::flags(0)) { }
+    void index_text(Utf8Iterator itor,
+                    termcount weight,
+                    const std::string & prefix,
+                    bool with_positions);
+    void index_text(Utf8Iterator itor, termcount weight, const std::string & prefix, bool with_positions);
+    bool index_term(const string &prefix, const string &term, termcount &weight, int stop_mode, bool with_positions, termcount termpos_inc=1);
 };
+}

xapian-core-1.0.16

diff -bBrupN xapian-core-1.0.16.old/xapian-config.in xapian-core-1.0.16/xapian-config.in

-              old
+               while [ 0 != "$#" ] ; do
         cxxflags=
         [ -n "@ANSI_CXXFLAGS@" ] && cxxflags="@ANSI_CXXFLAGS@ "
         [ -n "@STLPORT_CXXFLAGS@" ] && cxxflags="${cxxflags}@STLPORT_CXXFLAGS@ "
+        [ -n "@GLIB2_CFLAGS@" ] && cxxflags="${cxxflags}@GLIB2_CFLAGS@ "
         echo "$cxxflags$I"
         ;;

Download in other formats:

Original Format