Ticket #180: cjkv.patch
File cjkv.patch, 36.7 KB (added by , 15 years ago) |
---|
-
xapian-core-1.0.16
diff -bBrupN xapian-core-1.0.16.old/Makefile.am xapian-core-1.0.16/Makefile.am
old new lib_LTLIBRARIES = libxapian.la 104 104 105 105 libxapian_la_SOURCES = 106 106 107 libxapian_la_LDFLAGS = $(XAPIAN_LDFLAGS) -no-undefined -version-info $(LIBRARY_VERSION_INFO) 107 libxapian_la_LDFLAGS = $(XAPIAN_LDFLAGS) -no-undefined -version-info $(LIBRARY_VERSION_INFO) $(GLIB2_LIBS) 108 108 109 109 if !MAINTAINER_NO_DOCS 110 110 dist_man_MANS = xapian-config.1 -
xapian-core-1.0.16
diff -bBrupN xapian-core-1.0.16.old/acinclude.m4 xapian-core-1.0.16/acinclude.m4
old new 1 1 dnl acinclude.m4 2 2 m4_include(m4/rjb_find_stlport.m4) 3 3 m4_include(m4/type_socklen_t.m4) 4 m4_include(m4/pkg.m4) -
xapian-core-1.0.16
diff -bBrupN xapian-core-1.0.16.old/configure.ac xapian-core-1.0.16/configure.ac
old new if test yes = "$use_stlport"; then 1079 1079 fi 1080 1080 AC_SUBST(STLPORT_CXXFLAGS) 1081 1081 1082 dnl Check for glib-2.0, which is needed to compile the CJKV tokenizer 1083 PKG_CHECK_MODULES([GLIB2], [glib-2.0]) 1084 AM_CXXFLAGS="$GLIB2_CFLAGS $AM_CXXFLAGS" 1085 LIBS="$GLIB2_LIBS $LIBS" 1086 1082 1087 AC_SUBST(AM_CXXFLAGS) 1083 1088 1084 1089 dnl Restore CXXFLAGS to those the user specified or autoconf defaulted to. … … dnl There are no files generated by AC_O 1119 1124 dnl and we need to ensure they exist so that the rest of configure or make 1120 1125 dnl won't fail because they don't exist when srcdir != builddir. 1121 1126 if test yes = "$vpath_build" ; then 1122 for dir in include include/xapian languages queryparser ; do 1127 for dir in include include/xapian include/xapian/cjkv languages queryparser ; do 1123 1128 test -d "$dir" || mkdir "$dir" 1124 1129 done 1125 1130 fi -
include/Makefile.mk
diff -bBrupN xapian-core-1.0.16.old/include/Makefile.mk xapian-core-1.0.16/include/Makefile.mk
old new xapianinclude_HEADERS =\ 29 29 include/xapian/types.h\ 30 30 include/xapian/unicode.h\ 31 31 include/xapian/valueiterator.h\ 32 include/xapian/visibility.h 32 include/xapian/visibility.h\ 33 include/xapian/cjkv/CJKVTokenizer.h 33 34 34 35 nodist_xapianinclude_HEADERS =\ 35 36 include/xapian/version.h -
include/xapian/cjkv/CJKVTokenizer.h
diff -bBrupN xapian-core-1.0.16.old/include/xapian/cjkv/CJKVTokenizer.h xapian-core-1.0.16/include/xapian/cjkv/CJKVTokenizer.h
old new 1 /* 2 * Copyright 2007-2008 林永忠 Yung-Chung Lin 3 * Copyright 2008-2009 Fabrice Colin 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Lesser General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public 16 * License along with this library; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18 */ 19 20 #ifndef _DIJON_CJKVTOKENIZER_H 21 #define _DIJON_CJKVTOKENIZER_H 22 23 #include <string> 24 #include <vector> 25 #ifdef HAVE_UNICODE_H 26 #include <unicode.h> 27 #else 28 #include <glib/gunicode.h> 29 #define unicode_char_t gunichar 30 #endif 31 32 #ifndef DIJON_CJKV_EXPORT 33 #if defined __GNUC__ && (__GNUC__ >= 4) 34 #define DIJON_CJKV_EXPORT __attribute__ ((visibility("default"))) 35 #else 36 #define DIJON_CJKV_EXPORT 37 #endif 38 #endif 39 40 namespace Dijon 41 { 42 class DIJON_CJKV_EXPORT CJKVTokenizer 43 { 44 public: 45 CJKVTokenizer(); 46 ~CJKVTokenizer(); 47 48 class TokensHandler 49 { 50 public: 51 TokensHandler() {} 52 virtual ~TokensHandler() {} 53 54 virtual bool handle_token(const std::string &tok, bool is_cjkv) = 0; 55 }; 56 57 void set_ngram_size(unsigned int ngram_size); 58 59 unsigned int get_ngram_size(void) const; 60 61 void set_max_token_count(unsigned int max_token_count); 62 63 unsigned int get_max_token_count(void) const; 64 65 void set_max_text_size(unsigned int max_text_size); 66 67 unsigned int get_max_text_size(void) const; 68 69 void tokenize(const std::string &str, 70 
std::vector<std::string> &token_list); 71 72 void tokenize(const std::string &str, 73 TokensHandler &handler, 74 bool break_ascii_only_on_space = false); 75 76 void split(const std::string &str, 77 std::vector<std::string> &token_list); 78 79 void split(const std::string &str, 80 std::vector<unicode_char_t> &token_list); 81 82 void segment(const std::string &str, 83 std::vector<std::string> &token_segment); 84 85 bool has_cjkv(const std::string &str); 86 87 bool has_cjkv_only(const std::string &str); 88 89 protected: 90 unsigned int m_nGramSize; 91 unsigned int m_maxTokenCount; 92 unsigned int m_maxTextSize; 93 94 }; 95 }; 96 97 #endif // _DIJON_CJKVTOKENIZER_H -
xapian-core-1.0.16
diff -bBrupN xapian-core-1.0.16.old/m4/pkg.m4 xapian-core-1.0.16/m4/pkg.m4
old new 1 # pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- 2 # 3 # Copyright © 2004 Scott James Remnant <scott@netsplit.com>. 4 # 5 # This program is free software; you can redistribute it and/or modify 6 # it under the terms of the GNU General Public License as published by 7 # the Free Software Foundation; either version 2 of the License, or 8 # (at your option) any later version. 9 # 10 # This program is distributed in the hope that it will be useful, but 11 # WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 # General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with this program; if not, write to the Free Software 17 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 # 19 # As a special exception to the GNU General Public License, if you 20 # distribute this file as part of a program that contains a 21 # configuration script generated by Autoconf, you may include it under 22 # the same distribution terms that you use for the rest of that program. 
23 24 # PKG_PROG_PKG_CONFIG([MIN-VERSION]) 25 # ---------------------------------- 26 AC_DEFUN([PKG_PROG_PKG_CONFIG], 27 [m4_pattern_forbid([^_?PKG_[A-Z_]+$]) 28 m4_pattern_allow([^PKG_CONFIG(_PATH)?$]) 29 AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl 30 if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then 31 AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) 32 fi 33 if test -n "$PKG_CONFIG"; then 34 _pkg_min_version=m4_default([$1], [0.9.0]) 35 AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) 36 if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then 37 AC_MSG_RESULT([yes]) 38 else 39 AC_MSG_RESULT([no]) 40 PKG_CONFIG="" 41 fi 42 43 fi[]dnl 44 ])# PKG_PROG_PKG_CONFIG 45 46 # PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) 47 # 48 # Check to see whether a particular set of modules exists. Similar 49 # to PKG_CHECK_MODULES(), but does not set variables or print errors. 50 # 51 # 52 # Similar to PKG_CHECK_MODULES, make sure that the first instance of 53 # this or PKG_CHECK_MODULES is called, or make sure to call 54 # PKG_CHECK_EXISTS manually 55 # -------------------------------------------------------------- 56 AC_DEFUN([PKG_CHECK_EXISTS], 57 [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl 58 if test -n "$PKG_CONFIG" && \ 59 AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then 60 m4_ifval([$2], [$2], [:]) 61 m4_ifvaln([$3], [else 62 $3])dnl 63 fi]) 64 65 66 # _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) 67 # --------------------------------------------- 68 m4_define([_PKG_CONFIG], 69 [if test -n "$PKG_CONFIG"; then 70 if test -n "$$1"; then 71 pkg_cv_[]$1="$$1" 72 else 73 PKG_CHECK_EXISTS([$3], 74 [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`], 75 [pkg_failed=yes]) 76 fi 77 else 78 pkg_failed=untried 79 fi[]dnl 80 ])# _PKG_CONFIG 81 82 # _PKG_SHORT_ERRORS_SUPPORTED 83 # ----------------------------- 84 AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], 85 [AC_REQUIRE([PKG_PROG_PKG_CONFIG]) 86 if $PKG_CONFIG 
--atleast-pkgconfig-version 0.20; then 87 _pkg_short_errors_supported=yes 88 else 89 _pkg_short_errors_supported=no 90 fi[]dnl 91 ])# _PKG_SHORT_ERRORS_SUPPORTED 92 93 94 # PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], 95 # [ACTION-IF-NOT-FOUND]) 96 # 97 # 98 # Note that if there is a possibility the first call to 99 # PKG_CHECK_MODULES might not happen, you should be sure to include an 100 # explicit call to PKG_PROG_PKG_CONFIG in your configure.ac 101 # 102 # 103 # -------------------------------------------------------------- 104 AC_DEFUN([PKG_CHECK_MODULES], 105 [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl 106 AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl 107 AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl 108 109 pkg_failed=no 110 AC_MSG_CHECKING([for $1]) 111 112 _PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) 113 _PKG_CONFIG([$1][_LIBS], [libs], [$2]) 114 115 m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS 116 and $1[]_LIBS to avoid the need to call pkg-config. 117 See the pkg-config man page for more details.]) 118 119 if test $pkg_failed = yes; then 120 _PKG_SHORT_ERRORS_SUPPORTED 121 if test $_pkg_short_errors_supported = yes; then 122 $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"` 123 else 124 $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"` 125 fi 126 # Put the nasty error message in config.log where it belongs 127 echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD 128 129 ifelse([$4], , [AC_MSG_ERROR(dnl 130 [Package requirements ($2) were not met: 131 132 $$1_PKG_ERRORS 133 134 Consider adjusting the PKG_CONFIG_PATH environment variable if you 135 installed software in a non-standard prefix. 136 137 _PKG_TEXT 138 ])], 139 [$4]) 140 elif test $pkg_failed = untried; then 141 ifelse([$4], , [AC_MSG_FAILURE(dnl 142 [The pkg-config script could not be found or is too old. 
Make sure it 143 is in your PATH or set the PKG_CONFIG environment variable to the full 144 path to pkg-config. 145 146 _PKG_TEXT 147 148 To get pkg-config, see <http://www.freedesktop.org/software/pkgconfig>.])], 149 [$4]) 150 else 151 $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS 152 $1[]_LIBS=$pkg_cv_[]$1[]_LIBS 153 AC_MSG_RESULT([yes]) 154 ifelse([$3], , :, [$3]) 155 fi[]dnl 156 ])# PKG_CHECK_MODULES -
queryparser/CJKVTokenizer.cc
diff -bBrupN xapian-core-1.0.16.old/queryparser/CJKVTokenizer.cc xapian-core-1.0.16/queryparser/CJKVTokenizer.cc
old new 1 /* 2 * Copyright 2007-2008 林永忠 Yung-Chung Lin 3 * Copyright 2008-2009 Fabrice Colin 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Lesser General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public 16 * License along with this library; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18 */ 19 20 #include <ctype.h> 21 #include <string.h> 22 #include <iostream> 23 24 #include <xapian/cjkv/CJKVTokenizer.h> 25 26 #ifndef HAVE_UNICODE_H 27 static void unicode_init(void) 28 { 29 } 30 31 static char *unicode_get_utf8(const char *p, unicode_char_t *result) 32 { 33 *result = g_utf8_get_char(p); 34 35 return (*result == (unicode_char_t)-1) ? 
NULL : g_utf8_next_char(p); 36 } 37 38 static int unicode_strlen(const char *p, int max) 39 { 40 return (int)g_utf8_strlen(p, (gssize)max); 41 } 42 43 static int unicode_ispunct(unicode_char_t c) 44 { 45 if (g_unichar_ispunct(c)) 46 { 47 return 1; 48 } 49 50 return 0; 51 } 52 53 static int unicode_isspace(unicode_char_t c) 54 { 55 if (g_unichar_isspace(c)) 56 { 57 return 1; 58 } 59 60 return 0; 61 } 62 #endif 63 64 // 2E80..2EFF; CJK Radicals Supplement 65 // 3000..303F; CJK Symbols and Punctuation 66 // 3040..309F; Hiragana 67 // 30A0..30FF; Katakana 68 // 3100..312F; Bopomofo 69 // 3130..318F; Hangul Compatibility Jamo 70 // 3190..319F; Kanbun 71 // 31A0..31BF; Bopomofo Extended 72 // 31C0..31EF; CJK Strokes 73 // 31F0..31FF; Katakana Phonetic Extensions 74 // 3200..32FF; Enclosed CJK Letters and Months 75 // 3300..33FF; CJK Compatibility 76 // 3400..4DBF; CJK Unified Ideographs Extension A 77 // 4DC0..4DFF; Yijing Hexagram Symbols 78 // 4E00..9FFF; CJK Unified Ideographs 79 // A700..A71F; Modifier Tone Letters 80 // AC00..D7AF; Hangul Syllables 81 // F900..FAFF; CJK Compatibility Ideographs 82 // FE30..FE4F; CJK Compatibility Forms 83 // FF00..FFEF; Halfwidth and Fullwidth Forms 84 // 20000..2A6DF; CJK Unified Ideographs Extension B 85 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement 86 #define UTF8_IS_CJKV(p) \ 87 (((p) >= 0x2E80 && (p) <= 0x2EFF) \ 88 || ((p) >= 0x3000 && (p) <= 0x303F) \ 89 || ((p) >= 0x3040 && (p) <= 0x309F) \ 90 || ((p) >= 0x30A0 && (p) <= 0x30FF) \ 91 || ((p) >= 0x3100 && (p) <= 0x312F) \ 92 || ((p) >= 0x3130 && (p) <= 0x318F) \ 93 || ((p) >= 0x3190 && (p) <= 0x319F) \ 94 || ((p) >= 0x31A0 && (p) <= 0x31BF) \ 95 || ((p) >= 0x31C0 && (p) <= 0x31EF) \ 96 || ((p) >= 0x31F0 && (p) <= 0x31FF) \ 97 || ((p) >= 0x3200 && (p) <= 0x32FF) \ 98 || ((p) >= 0x3300 && (p) <= 0x33FF) \ 99 || ((p) >= 0x3400 && (p) <= 0x4DBF) \ 100 || ((p) >= 0x4DC0 && (p) <= 0x4DFF) \ 101 || ((p) >= 0x4E00 && (p) <= 0x9FFF) \ 102 || ((p) >= 0xA700 && (p) <= 0xA71F) 
\ 103 || ((p) >= 0xAC00 && (p) <= 0xD7AF) \ 104 || ((p) >= 0xF900 && (p) <= 0xFAFF) \ 105 || ((p) >= 0xFE30 && (p) <= 0xFE4F) \ 106 || ((p) >= 0xFF00 && (p) <= 0xFFEF) \ 107 || ((p) >= 0x20000 && (p) <= 0x2A6DF) \ 108 || ((p) >= 0x2F800 && (p) <= 0x2FA1F) \ 109 || ((p) >= 0x2F800 && (p) <= 0x2FA1F)) 110 111 using namespace std; 112 using namespace Dijon; 113 114 static void _split_string(string str, const string &delim, 115 vector<string> &list) 116 { 117 list.clear(); 118 119 string::size_type cut_at = 0; 120 while ((cut_at = str.find_first_of(delim)) != str.npos) 121 { 122 if (cut_at > 0) 123 { 124 list.push_back(str.substr(0,cut_at)); 125 } 126 str = str.substr(cut_at+1); 127 } 128 129 if (str.length() > 0) 130 { 131 list.push_back(str); 132 } 133 } 134 135 static inline unsigned char *_unicode_to_char(unicode_char_t &uchar, 136 unsigned char *p) 137 { 138 if (p == NULL) 139 { 140 return NULL; 141 } 142 143 memset(p, 0, sizeof(unicode_char_t) + 1); 144 if (unicode_isspace(uchar) || 145 unicode_ispunct(uchar)) 146 { 147 p[0] = ' '; 148 } 149 else if (uchar < 0x80) 150 { 151 p[0] = uchar; 152 } 153 else if (uchar < 0x800) 154 { 155 p[0] = (0xC0 | uchar >> 6); 156 p[1] = (0x80 | uchar & 0x3F); 157 } 158 else if (uchar < 0x10000) 159 { 160 p[0] = (0xE0 | uchar >> 12); 161 p[1] = (0x80 | uchar >> 6 & 0x3F); 162 p[2] = (0x80 | uchar & 0x3F); 163 } 164 else if (uchar < 0x200000) 165 { 166 p[0] = (0xF0 | uchar >> 18); 167 p[1] = (0x80 | uchar >> 12 & 0x3F); 168 p[2] = (0x80 | uchar >> 6 & 0x3F); 169 p[3] = (0x80 | uchar & 0x3F); 170 } 171 172 return p; 173 } 174 175 class VectorTokensHandler : public CJKVTokenizer::TokensHandler 176 { 177 public: 178 VectorTokensHandler(vector<string> &token_list) : 179 CJKVTokenizer::TokensHandler(), 180 m_token_list(token_list) 181 { 182 } 183 184 virtual ~VectorTokensHandler() 185 { 186 } 187 188 virtual bool handle_token(const string &tok, bool is_cjkv) 189 { 190 m_token_list.push_back(tok); 191 return true; 192 } 193 194 protected: 
195 vector<string> &m_token_list; 196 197 }; 198 199 CJKVTokenizer::CJKVTokenizer() : 200 m_nGramSize(2), 201 m_maxTokenCount(0), 202 m_maxTextSize(5242880) 203 { 204 unicode_init(); 205 } 206 207 CJKVTokenizer::~CJKVTokenizer() 208 { 209 } 210 211 void CJKVTokenizer::set_ngram_size(unsigned int ngram_size) 212 { 213 m_nGramSize = ngram_size; 214 } 215 216 unsigned int CJKVTokenizer::get_ngram_size(void) const 217 { 218 return m_nGramSize; 219 } 220 221 void CJKVTokenizer::set_max_token_count(unsigned int max_token_count) 222 { 223 m_maxTokenCount = max_token_count; 224 } 225 226 unsigned int CJKVTokenizer::get_max_token_count(void) const 227 { 228 return m_maxTokenCount; 229 } 230 231 void CJKVTokenizer::set_max_text_size(unsigned int max_text_size) 232 { 233 m_maxTextSize = max_text_size; 234 } 235 236 unsigned int CJKVTokenizer::get_max_text_size(void) const 237 { 238 return m_maxTextSize; 239 } 240 241 void CJKVTokenizer::tokenize(const string &str, vector<string> &token_list) 242 { 243 VectorTokensHandler handler(token_list); 244 245 tokenize(str, handler); 246 } 247 248 void CJKVTokenizer::tokenize(const string &str, TokensHandler &handler, 249 bool break_ascii_only_on_space) 250 { 251 string token_str; 252 vector<string> temp_token_list; 253 vector<unicode_char_t> temp_uchar_list; 254 unsigned int tokens_count = 0; 255 256 split(str, temp_token_list); 257 split(str, temp_uchar_list); 258 259 for (unsigned int i = 0; i < temp_token_list.size();) 260 { 261 if ((m_maxTokenCount > 0) && 262 (tokens_count >= m_maxTokenCount)) 263 { 264 break; 265 } 266 token_str.resize(0); 267 if (UTF8_IS_CJKV(temp_uchar_list[i])) 268 { 269 for (unsigned int j = i; j < i + m_nGramSize; j++) 270 { 271 if ((m_maxTokenCount > 0) && 272 (tokens_count >= m_maxTokenCount)) 273 { 274 break; 275 } 276 if (j == temp_token_list.size()) 277 { 278 break; 279 } 280 if (UTF8_IS_CJKV(temp_uchar_list[j])) 281 { 282 token_str += temp_token_list[j]; 283 if (handler.handle_token(token_str, true) == 
true) 284 { 285 ++tokens_count; 286 } 287 } 288 } 289 i++; 290 } 291 else 292 { 293 unsigned int j = i; 294 295 while (j < temp_token_list.size()) 296 { 297 unsigned char *p = (unsigned char*) temp_token_list[j].c_str(); 298 bool break_ascii = false; 299 300 if (isascii((int)p[0]) != 0) 301 { 302 if (break_ascii_only_on_space == true) 303 { 304 if (isspace((int)p[0]) != 0) 305 { 306 break_ascii = true; 307 } 308 } 309 else if (isalnum((int)p[0]) == 0) 310 { 311 break_ascii = true; 312 } 313 } 314 315 if (break_ascii == true) 316 { 317 j++; 318 break; 319 } 320 else if (UTF8_IS_CJKV(temp_uchar_list[j])) 321 { 322 break; 323 } 324 325 token_str += temp_token_list[j]; 326 j++; 327 } 328 i = j; 329 if ((m_maxTokenCount > 0) && 330 (tokens_count >= m_maxTokenCount)) 331 { 332 break; 333 } 334 if (token_str.length() > 0) 335 { 336 if (handler.handle_token(token_str, false) == true) 337 { 338 ++tokens_count; 339 } 340 } 341 } 342 } 343 } 344 345 void CJKVTokenizer::split(const string &str, vector<string> &token_list) 346 { 347 unicode_char_t uchar; 348 const char *str_ptr = str.c_str(); 349 int str_utf8_len = unicode_strlen(str_ptr, str.length()); 350 unsigned char p[sizeof(unicode_char_t) + 1]; 351 352 for (int i = 0; i < str_utf8_len; i++) 353 { 354 str_ptr = unicode_get_utf8(str_ptr, &uchar); 355 if (str_ptr == NULL) 356 { 357 break; 358 } 359 360 if (i >= m_maxTextSize) 361 { 362 break; 363 } 364 365 token_list.push_back((const char*)_unicode_to_char(uchar, p)); 366 } 367 } 368 369 void CJKVTokenizer::split(const string &str, vector<unicode_char_t> &token_list) 370 { 371 unicode_char_t uchar; 372 const char *str_ptr = str.c_str(); 373 int str_utf8_len = unicode_strlen(str_ptr, str.length()); 374 375 for (int i = 0; i < str_utf8_len; i++) 376 { 377 str_ptr = unicode_get_utf8(str_ptr, &uchar); 378 if (str_ptr == NULL) 379 { 380 break; 381 } 382 383 if (i >= m_maxTextSize) 384 { 385 break; 386 } 387 388 token_list.push_back(uchar); 389 } 390 } 391 392 void 
CJKVTokenizer::segment(const string &str, vector<string> &token_segment) 393 { 394 vector<string> token_list; 395 string onlySpacesStr(str); 396 397 for (string::iterator it = onlySpacesStr.begin(); it != onlySpacesStr.end(); ++it) 398 { 399 if (isspace((int)*it) != 0) 400 { 401 *it = ' '; 402 } 403 } 404 405 _split_string(onlySpacesStr, " ", token_segment); 406 } 407 408 bool CJKVTokenizer::has_cjkv(const string &str) 409 { 410 vector<unicode_char_t> temp_uchar_list; 411 412 split(str, temp_uchar_list); 413 414 for (unsigned int i = 0; i < temp_uchar_list.size(); i++) 415 { 416 if (UTF8_IS_CJKV(temp_uchar_list[i])) 417 { 418 return true; 419 } 420 } 421 return false; 422 } 423 424 bool CJKVTokenizer::has_cjkv_only(const string &str) 425 { 426 vector<unicode_char_t> temp_uchar_list; 427 428 split(str, temp_uchar_list); 429 430 for (unsigned int i = 0; i < temp_uchar_list.size(); i++) 431 { 432 if (!(UTF8_IS_CJKV(temp_uchar_list[i]))) 433 { 434 unsigned char p[sizeof(unicode_char_t) + 1]; 435 436 _unicode_to_char(temp_uchar_list[i], p); 437 if (isspace((int)p[0]) == 0) 438 { 439 return false; 440 } 441 } 442 } 443 return true; 444 } 445 -
queryparser/Makefile.mk
diff -bBrupN xapian-core-1.0.16.old/queryparser/Makefile.mk xapian-core-1.0.16/queryparser/Makefile.mk
old new libxapian_la_SOURCES +=\ 60 60 queryparser/queryparser.cc\ 61 61 queryparser/queryparser_internal.cc\ 62 62 queryparser/termgenerator.cc\ 63 queryparser/termgenerator_internal.cc 63 queryparser/termgenerator_internal.cc\ 64 queryparser/CJKVTokenizer.cc -
queryparser/queryparser_internal.cc
diff -bBrupN xapian-core-1.0.16.old/queryparser/queryparser_internal.cc xapian-core-1.0.16/queryparser/queryparser_internal.cc
old new 31 31 #include "queryparser_internal.h" 32 32 #include <xapian/error.h> 33 33 #include <xapian/unicode.h> 34 #include <xapian/cjkv/CJKVTokenizer.h> 34 35 #include "stringutils.h" 35 36 36 37 // Include the list of token values lemon generates. … … QueryParser::Internal::parse_term(Utf8It 554 555 return term; 555 556 } 556 557 558 class QueryModifier: public Dijon::CJKVTokenizer::TokensHandler 559 { 560 public: 561 typedef enum { 562 NONE = 0, 563 BRACKETS = 1 564 } CJKVWrap; 565 566 QueryModifier(const string &query, unsigned int nGramSize): 567 m_query(query), 568 m_pos(0), 569 m_wrap(BRACKETS), 570 m_wrapped(false), 571 m_nGramCount(0), 572 m_nGramSize(nGramSize), 573 m_tokensCount(0), 574 m_hasCJKV(false), 575 m_hasNonCJKV(false) 576 { 577 } 578 579 virtual ~QueryModifier() 580 { 581 } 582 583 virtual bool handle_token(const string &tok, bool is_cjkv) 584 { 585 if (tok.empty()) { 586 return false; 587 } 588 589 // Where is this token in the original query ? 590 string::size_type tokPos = m_query.find(tok, m_pos); 591 ++m_tokensCount; 592 593 // Is this CJKV ? 594 if (!is_cjkv) { 595 char lastChar = tok[tok.length() - 1]; 596 597 if (tokPos == string::npos) { 598 // This should have been found 599 return false; 600 } 601 602 if (m_nGramCount > 0) { 603 wrapClose(); 604 605 m_nGramCount = 0; 606 m_pos = tokPos; 607 } 608 609 m_currentFilter.clear(); 610 if (lastChar == '"') { 611 // It's a quoted string 612 m_wrap = NONE; 613 } else if (lastChar == ':') { 614 // It's a filter 615 m_wrap = NONE; 616 m_currentFilter = tok; 617 } else { 618 m_wrap = BRACKETS; 619 } 620 621 if (m_currentFilter.empty()) { 622 m_hasNonCJKV = true; 623 } 624 625 // Return right away 626 return true; 627 } 628 629 // First n-gram ? 
630 if (m_nGramCount == 0) { 631 if (tokPos == string::npos) { 632 // That's definitely not right 633 return false; 634 } 635 636 // Append non-CJKV text that precedes and start wrapping CJKV tokens 637 if (tokPos > m_pos) { 638 m_modifiedQuery += " " + m_query.substr(m_pos, tokPos - m_pos); 639 } 640 m_pos += tok.length(); 641 642 wrapOpen(); 643 } else { 644 m_modifiedQuery += " "; 645 if (!m_currentFilter.empty()) { 646 m_modifiedQuery += m_currentFilter; 647 } 648 } 649 m_modifiedQuery += tok; 650 651 if (tokPos != string::npos) { 652 m_pos = tokPos + tok.length(); 653 } 654 655 ++m_nGramCount; 656 m_hasCJKV = true; 657 658 return true; 659 } 660 661 unsigned int get_tokens_count(void) const 662 { 663 return m_tokensCount; 664 } 665 666 string get_modified_query(bool &pureCJKV) 667 { 668 // Anything left ? 669 if (m_pos < m_query.length() - 1) { 670 m_modifiedQuery += " " + m_query.substr(m_pos); 671 } 672 wrapClose(); 673 674 if (m_hasCJKV && !m_hasNonCJKV) { 675 pureCJKV = true; 676 } else { 677 pureCJKV = false; 678 } 679 680 return m_modifiedQuery; 681 } 682 683 protected: 684 string m_query; 685 string m_modifiedQuery; 686 string::size_type m_pos; 687 CJKVWrap m_wrap; 688 bool m_wrapped; 689 string m_currentFilter; 690 unsigned int m_nGramCount; 691 unsigned int m_nGramSize; 692 unsigned int m_tokensCount; 693 bool m_hasCJKV; 694 bool m_hasNonCJKV; 695 696 void wrapOpen(void) 697 { 698 switch (m_wrap) { 699 case BRACKETS: 700 m_modifiedQuery += " ("; 701 break; 702 case NONE: 703 default: 704 break; 705 } 706 m_wrapped = true; 707 } 708 709 void wrapClose(void) 710 { 711 if (!m_wrapped) { 712 return; 713 } 714 715 // Finish wrapping CJKV tokens 716 switch (m_wrap) { 717 case BRACKETS: 718 m_modifiedQuery += ')'; 719 break; 720 case NONE: 721 default: 722 break; 723 } 724 m_wrapped = false; 725 } 726 }; // class QueryModifier: public Dijon::CJKVTokenizer::TokensHandler 727 557 728 Query 558 QueryParser::Internal::parse_query(const string & qs, unsigned 
flags,729 QueryParser::Internal::parse_query(const string &iqs, unsigned flags, 559 730 const string &default_prefix) 560 731 { 732 Dijon::CJKVTokenizer tokenizer; 733 string qs(iqs); 734 735 // Modifying the query is necessary if it's CJKV 736 if (tokenizer.has_cjkv(qs)) { 737 QueryModifier handler(qs, tokenizer.get_ngram_size()); 738 tokenizer.tokenize(qs, handler, true); 739 740 // We can disable stemming and spelling correction for pure CJKV queries 741 bool pureCJKV = false; 742 qs = handler.get_modified_query(pureCJKV); 743 } 744 561 745 yyParser * pParser = ParseAlloc(); 562 746 563 747 // Set value_ranges if we may have to handle value ranges in the query. -
queryparser/termgenerator_internal.cc
diff -bBrupN xapian-core-1.0.16.old/queryparser/termgenerator_internal.cc xapian-core-1.0.16/queryparser/termgenerator_internal.cc
old new 25 25 #include <xapian/document.h> 26 26 #include <xapian/queryparser.h> 27 27 #include <xapian/unicode.h> 28 #include <xapian/cjkv/CJKVTokenizer.h> 28 29 29 30 #include "stringutils.h" 30 31 … … using namespace std; 34 35 35 36 namespace Xapian { 36 37 38 // FIXME: add API for this: 39 #define STOPWORDS_NONE 0 40 #define STOPWORDS_IGNORE 1 41 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2 42 37 43 // Put a limit on the size of terms to help prevent the index being bloated 38 44 // by useless junk terms. 39 45 static const unsigned int MAX_PROB_TERM_LENGTH = 64; … … should_stem(const std::string & term) 64 70 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1); 65 71 } 66 72 73 inline bool 74 should_index(const std::string &term, int stop_mode, const Stopper *stopper) 75 { 76 if (term.size() > MAX_PROB_TERM_LENGTH) { 77 return false; 78 } 79 80 if (stop_mode == STOPWORDS_IGNORE && stopper && (*stopper)(term)) { 81 return false; 82 } 83 84 return true; 85 } 86 67 87 inline unsigned check_infix(unsigned ch) { 68 88 if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) { 69 89 // Unicode includes all these except '&' in it's word boundary rules, … … inline unsigned check_suffix(unsigned ch 108 128 return 0; 109 129 } 110 130 111 // FIXME: add API for this: 112 #define STOPWORDS_NONE 0 113 #define STOPWORDS_IGNORE 1 114 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2 131 class TokensIndexer: public Dijon::CJKVTokenizer::TokensHandler 132 { 133 public: 134 TokensIndexer( 135 TermGenerator::Internal &generator, 136 const string &prefix, 137 termcount &weight, 138 bool with_positions, 139 int stop_mode, 140 unsigned int gram_size 141 ): 142 m_generator(generator), 143 m_prefix(prefix), 144 m_weight(weight), 145 m_with_positions(with_positions), 146 m_stop_mode(stop_mode), 147 m_gram_size(gram_size), 148 m_gram_count(0), 149 m_hasCJKV(false) 150 { 151 } 152 153 virtual ~TokensIndexer() 154 { 155 } 156 157 virtual bool handle_token(const string 
&token, bool is_cjkv) 158 { 159 termcount termpos_inc = 0; 160 if (is_cjkv) { 161 if ((m_gram_count + 1) % m_gram_size == 0) { 162 termpos_inc = 1; 163 } 164 } else { 165 termpos_inc = 1; 166 } 167 168 string term(to_lower(token)); 169 if (m_generator.index_term(m_prefix, term, m_weight, m_stop_mode, m_with_positions, termpos_inc) == false) { 170 return false; 171 } 172 173 if (is_cjkv) { 174 m_gram_count++; 175 m_hasCJKV = true; 176 } else { 177 m_gram_count = 0; 178 } 179 180 return true; 181 } 182 183 protected: 184 string to_lower(const string &str) 185 { 186 string out(str); 187 for (string::iterator i = out.begin(); i != out.end(); ++i) { 188 if (U_isupper(*i)) { 189 *i = Unicode::tolower(*i); 190 } 191 } 192 return out; 193 } 194 195 TermGenerator::Internal &m_generator; 196 const string &m_prefix; 197 termcount &m_weight; 198 bool m_with_positions; 199 int m_stop_mode; 200 unsigned int m_gram_size; 201 unsigned int m_gram_count; 202 bool m_hasCJKV; 203 }; 115 204 116 205 void 117 206 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight, 118 207 const string & prefix, bool with_positions) 119 208 { 120 int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY; 121 122 if (!stopper) stop_mode = STOPWORDS_NONE; 209 // set default stop words mode depend on 210 // stopper handler availability 211 int stop_mode = stopper ? STOPWORDS_INDEX_UNSTEMMED_ONLY : STOPWORDS_NONE; 212 213 const char *raw = itor.raw(); 214 if (raw) { 215 Dijon::CJKVTokenizer tokenizer; 216 string text(raw); 217 218 // is there CJKV chars? then we should use CJKVTokenizer 219 // instead of builtin TermGenerator 220 if (tokenizer.has_cjkv(text)) { 221 TokensIndexer handler(*this, prefix, weight, with_positions, stop_mode, tokenizer.get_ngram_size()); 222 tokenizer.tokenize(text, handler); 223 return; 224 } 225 } 123 226 124 227 while (true) { 125 228 // Advance to the start of the next term. 
126 229 unsigned ch; 127 230 while (true) { 128 if (itor == Utf8Iterator()) return; 231 if (itor == Utf8Iterator()) { 232 return; 233 } 234 129 235 ch = check_wordchar(*itor); 130 if (ch) break; 236 if (ch) { 237 break; 238 } 239 131 240 ++itor; 132 241 } 133 242 … … TermGenerator::Internal::index_text(Utf8 158 269 do { 159 270 Unicode::append_utf8(term, ch); 160 271 prevch = ch; 161 if (++itor == Utf8Iterator()) goto endofterm; 272 if (++itor == Utf8Iterator()) { 273 goto endofterm; 274 } 162 275 ch = check_wordchar(*itor); 163 276 } while (ch); 164 277 165 278 Utf8Iterator next(itor); 166 279 ++next; 167 if (next == Utf8Iterator()) break; 280 if (next == Utf8Iterator()) { 281 break; 282 } 283 168 284 unsigned nextch = check_wordchar(*next); 169 if (!nextch) break; 285 if (!nextch) { 286 break; 287 } 288 170 289 unsigned infix_ch = *itor; 171 290 if (is_digit(prevch) && is_digit(*next)) { 172 291 infix_ch = check_infix_digit(infix_ch); … … TermGenerator::Internal::index_text(Utf8 174 293 // Handle things like '&' in AT&T, apostrophes, etc. 
175 294 infix_ch = check_infix(infix_ch); 176 295 } 177 if (!infix_ch) break; 296 297 if (!infix_ch) { 298 break; 299 } 300 178 301 Unicode::append_utf8(term, infix_ch); 179 302 ch = nextch; 180 303 itor = next; … … TermGenerator::Internal::index_text(Utf8 189 312 break; 190 313 } 191 314 Unicode::append_utf8(term, ch); 192 if (++itor == Utf8Iterator()) goto endofterm; 315 if (++itor == Utf8Iterator()) { 316 goto endofterm; 317 } 193 318 } 194 319 } 195 320 196 321 endofterm: 197 if (term.size() > MAX_PROB_TERM_LENGTH) continue; 322 index_term(prefix, term, weight, stop_mode, with_positions); 323 } // while(true) 324 } 198 325 199 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue; 326 bool 327 TermGenerator::Internal::index_term(const string &prefix, const string &term, termcount &weight, int stop_mode, bool with_positions, termcount termpos_inc) 328 { 329 if (should_index(term, stop_mode, stopper) == false) { 330 return false; 331 } 200 332 201 333 if (with_positions) { 202 doc.add_posting(prefix + term, ++termpos, weight); 334 termpos += termpos_inc; 335 doc.add_posting(prefix + term, termpos, weight); 203 336 } else { 204 337 doc.add_term(prefix + term, weight); 205 338 } 206 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);207 339 208 if (!stemmer.internal.get()) continue; 340 if ((flags & FLAG_SPELLING) && prefix.empty()) { 341 db.add_spelling(term); 342 } 209 343 210 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term)) 211 continue; 344 if (!stemmer.internal.get()) { 345 return true; 346 } 347 348 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && stopper && (*stopper)(term)) { 349 return true; 350 } 212 351 213 352 // Note, this uses the lowercased term, but that's OK as we only 214 353 // want to avoid stemming terms starting with a digit. 215 if (!should_stem(term)) continue; 354 if (should_stem(term) == false) { 355 return true; 356 } 216 357 217 358 // Add stemmed form without positional information. 
218 359 string stem("Z"); 219 360 stem += prefix; 220 361 stem += stemmer(term); 221 362 doc.add_term(stem, weight); 222 }223 }224 363 364 return true; 225 365 } 366 367 } // namespace Xapian -
queryparser/termgenerator_internal.h
diff -bBrupN xapian-core-1.0.16.old/queryparser/termgenerator_internal.h xapian-core-1.0.16/queryparser/termgenerator_internal.h
old new 21 21 #ifndef XAPIAN_INCLUDED_TERMGENERATOR_INTERNAL_H 22 22 #define XAPIAN_INCLUDED_TERMGENERATOR_INTERNAL_H 23 23 24 #include <string> 25 24 26 #include <xapian/base.h> 25 27 #include <xapian/database.h> 26 28 #include <xapian/document.h> … … 30 32 namespace Xapian { 31 33 32 34 class Stopper; 35 using std::string; 33 36 34 37 class TermGenerator::Internal : public Xapian::Internal::RefCntBase { 35 38 friend class TermGenerator; … … class TermGenerator::Internal : public X 43 46 public: 44 47 Internal() : stopper(NULL), termpos(0), 45 48 flags(TermGenerator::flags(0)) { } 46 void index_text(Utf8Iterator itor, 47 termcount weight, 48 const std::string & prefix, 49 bool with_positions); 49 void index_text(Utf8Iterator itor, termcount weight, const std::string & prefix, bool with_positions); 50 bool index_term(const string &prefix, const string &term, termcount &weight, int stop_mode, bool with_positions, termcount termpos_inc=1); 50 51 }; 51 52 52 53 } -
xapian-core-1.0.16
diff -bBrupN xapian-core-1.0.16.old/xapian-config.in xapian-core-1.0.16/xapian-config.in
old new while [ 0 != "$#" ] ; do 187 187 cxxflags= 188 188 [ -n "@ANSI_CXXFLAGS@" ] && cxxflags="@ANSI_CXXFLAGS@ " 189 189 [ -n "@STLPORT_CXXFLAGS@" ] && cxxflags="${cxxflags}@STLPORT_CXXFLAGS@ " 190 [ -n "@GLIB2_CFLAGS@" ] && cxxflags="${cxxflags}@GLIB2_CFLAGS@ " 190 191 echo "$cxxflags$I" 191 192 ;; 192 193