| 1 | *** xapian-core-1.3.0_svn/configure.ac 2012-03-28 18:11:06.000000000 +0800
|
|---|
| 2 | --- xapian-core-1.3.0_scws/configure.ac 2012-03-30 12:31:10.000000000 +0800
|
|---|
| 3 | ***************
|
|---|
| 4 | *** 997,1002 ****
|
|---|
| 5 | --- 997,1058 ----
|
|---|
| 6 | [Define if you want a log of methods called and other debug messages])
|
|---|
| 7 | fi
|
|---|
| 8 |
|
|---|
| 9 | + dnl **********************
|
|---|
| 10 | + dnl * Check scws library *
|
|---|
| 11 | + dnl **********************
|
|---|
| 12 | + dnl hightman.20110411: See if we want to use scws as default tokenizer
|
|---|
| 13 | + SCWS_DIR=""
|
|---|
| 14 | + AC_MSG_CHECKING(for scws)
|
|---|
| 15 | + AC_ARG_WITH(scws,
|
|---|
| 16 | + [AS_HELP_STRING([--with-scws@<:@=DIR@:>@], [use scws as default tokenizer, DIR is the install PREFIX of scws])],
|
|---|
| 17 | + [ ],[ with_scws=no ]
|
|---|
| 18 | + )
|
|---|
| 19 | +
|
|---|
| 20 | + if test "$with_scws" = "no"; then
|
|---|
| 21 | + AC_MSG_RESULT(no)
|
|---|
| 22 | + else
|
|---|
| 23 | + # Locate scws.h: probe the default prefixes for a bare --with-scws, otherwise use the DIR the user gave
|
|---|
| 24 | + if test "$with_scws" = "yes"; then
|
|---|
| 25 | + searchdirs="/usr /usr/local /usr/local/scws /opt/local"
|
|---|
| 26 | + for tmpdir in $searchdirs ; do
|
|---|
| 27 | + if test -f "$tmpdir/include/scws/scws.h" ; then
|
|---|
| 28 | + SCWS_DIR=$tmpdir
|
|---|
| 29 | + break
|
|---|
| 30 | + fi
|
|---|
| 31 | + done
|
|---|
| 32 | + if test -z "$SCWS_DIR"; then
|
|---|
| 33 | + AC_MSG_RESULT(no)
|
|---|
| 34 | + AC_MSG_ERROR([scws not found on default search directories, specify DIR plz...])
|
|---|
| 35 | + fi
|
|---|
| 36 | + elif test -f "$with_scws/include/scws/scws.h" ; then
|
|---|
| 37 | + SCWS_DIR=$with_scws
|
|---|
| 38 | + else
|
|---|
| 39 | + AC_MSG_RESULT(no)
|
|---|
| 40 | + AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $with_scws/include/scws])
|
|---|
| 41 | + fi
|
|---|
| 42 | + AC_MSG_RESULT([yes: $SCWS_DIR])
|
|---|
| 43 | +
|
|---|
| 44 | + # Etc directory
|
|---|
| 45 | + if test "$SCWS_DIR" = "/usr"; then
|
|---|
| 46 | + SCWS_ETCDIR="/etc"
|
|---|
| 47 | + else
|
|---|
| 48 | + SCWS_ETCDIR="$SCWS_DIR/etc"
|
|---|
| 49 | + fi
|
|---|
| 50 | +
|
|---|
| 51 | + # Check scws library
|
|---|
| 52 | + AC_CHECK_LIB(scws, scws_new, [
|
|---|
| 53 | + LIBS="$LIBS -L$SCWS_DIR/lib -lscws"
|
|---|
| 54 | + XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws"
|
|---|
| 55 | + CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include"
|
|---|
| 56 | + AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer])
|
|---|
| 57 | + AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules])
|
|---|
| 58 | + ],[
|
|---|
| 59 | + AC_MSG_ERROR([scws_new() NOT found in libscws, please check it first.])
|
|---|
| 60 | + ],[
|
|---|
| 61 | + -L$SCWS_DIR/lib
|
|---|
| 62 | + ])
|
|---|
| 63 | + fi
|
|---|
| 64 | +
|
|---|
| 65 | dnl ******************************
|
|---|
| 66 | dnl * Set special compiler flags *
|
|---|
| 67 | dnl ******************************
|
|---|
| 68 | *** xapian-core-1.3.0_svn/include/xapian/queryparser.h 2012-03-24 20:31:02.000000000 +0800
|
|---|
| 69 | --- xapian-core-1.3.0_scws/include/xapian/queryparser.h 2012-03-30 12:31:10.000000000 +0800
|
|---|
| 70 | ***************
|
|---|
| 71 | *** 499,504 ****
|
|---|
| 72 | --- 499,513 ----
|
|---|
| 73 | */
|
|---|
| 74 | void set_max_wildcard_expansion(Xapian::termcount limit);
|
|---|
| 75 |
|
|---|
| 76 | + #if 1 /* HAVE_SCWS */
|
|---|
| 77 | + /** hightman.20070706: Specify the dict and rules file for scws, only used when HAVE_SCWS.
|
|---|
| 78 | + * @param fpath path for dict file and rule file (char *)
|
|---|
| 79 | + * @param xmem whether to load the whole dict into memory (defaults to false)
|
|---|
| 80 | + * @param multi multiset (int 0~15)
|
|---|
| 81 | + */
|
|---|
| 82 | + void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
|
|---|
| 83 | + #endif
|
|---|
| 84 | +
|
|---|
| 85 | /** Parse a query.
|
|---|
| 86 | *
|
|---|
| 87 | * @param query_string A free-text query as entered by a user
|
|---|
| 88 | *** xapian-core-1.3.0_svn/include/xapian/termgenerator.h 2011-11-07 11:11:05.000000000 +0800
|
|---|
| 89 | --- xapian-core-1.3.0_scws/include/xapian/termgenerator.h 2012-03-30 12:31:10.000000000 +0800
|
|---|
| 90 | ***************
|
|---|
| 91 | *** 82,87 ****
|
|---|
| 92 | --- 82,92 ----
|
|---|
| 93 | /// Set the database to index spelling data to.
|
|---|
| 94 | void set_database(const Xapian::WritableDatabase &db);
|
|---|
| 95 |
|
|---|
| 96 | + #if 1 /* HAVE_SCWS */
|
|---|
| 97 | + /// hightman.20070706: Specify the dict and rules file for scws, only used when HAVE_SCWS.
|
|---|
| 98 | + void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
|
|---|
| 99 | + #endif
|
|---|
| 100 | +
|
|---|
| 101 | /// Flags to OR together and pass to TermGenerator::set_flags().
|
|---|
| 102 | enum flags {
|
|---|
| 103 | /// Index data required for spelling correction.
|
|---|
| 104 | *** xapian-core-1.3.0_svn/queryparser/queryparser_internal.h 2012-03-24 20:31:02.000000000 +0800
|
|---|
| 105 | --- xapian-core-1.3.0_scws/queryparser/queryparser_internal.h 2012-03-30 12:38:27.000000000 +0800
|
|---|
| 106 | ***************
|
|---|
| 107 | *** 29,34 ****
|
|---|
| 108 | --- 29,39 ----
|
|---|
| 109 | #include <xapian/queryparser.h>
|
|---|
| 110 | #include <xapian/stem.h>
|
|---|
| 111 |
|
|---|
| 112 | + /// hightman.20070701: use scws as default tokenizer
|
|---|
| 113 | + #ifdef HAVE_SCWS
|
|---|
| 114 | + #include <scws/scws.h>
|
|---|
| 115 | + #endif
|
|---|
| 116 | +
|
|---|
| 117 | #include <list>
|
|---|
| 118 | #include <map>
|
|---|
| 119 |
|
|---|
| 120 | ***************
|
|---|
| 121 | *** 63,68 ****
|
|---|
| 122 | --- 68,79 ----
|
|---|
| 123 | Stem stemmer;
|
|---|
| 124 | stem_strategy stem_action;
|
|---|
| 125 | const Stopper * stopper;
|
|---|
| 126 | + #ifdef HAVE_SCWS
|
|---|
| 127 | + scws_t scws;
|
|---|
| 128 | + scws_res_t rptr, rcur;
|
|---|
| 129 | + const char *qptr;
|
|---|
| 130 | + int last_off;
|
|---|
| 131 | + #endif
|
|---|
| 132 | Query::op default_op;
|
|---|
| 133 | const char * errmsg;
|
|---|
| 134 | Database db;
|
|---|
| 135 | ***************
|
|---|
| 136 | *** 88,94 ****
|
|---|
| 137 | --- 99,112 ----
|
|---|
| 138 |
|
|---|
| 139 | public:
|
|---|
| 140 | Internal() : stem_action(STEM_SOME), stopper(NULL),
|
|---|
| 141 | + #ifdef HAVE_SCWS
|
|---|
| 142 | + scws(NULL), rptr(NULL), rcur(NULL),
|
|---|
| 143 | + #endif
|
|---|
| 144 | default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { }
|
|---|
| 145 | + #ifdef HAVE_SCWS
|
|---|
| 146 | + ~Internal();
|
|---|
| 147 | + void load_libscws(const char *fpath, bool xmem, int multi);
|
|---|
| 148 | + #endif
|
|---|
| 149 |
|
|---|
| 150 | Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix);
|
|---|
| 151 | };
|
|---|
| 152 | *** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.h 2011-07-03 20:31:02.000000000 +0800
|
|---|
| 153 | --- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.h 2012-03-30 12:33:15.000000000 +0800
|
|---|
| 154 | ***************
|
|---|
| 155 | *** 26,31 ****
|
|---|
| 156 | --- 26,35 ----
|
|---|
| 157 | #include <xapian/document.h>
|
|---|
| 158 | #include <xapian/termgenerator.h>
|
|---|
| 159 | #include <xapian/stem.h>
|
|---|
| 160 | + /// hightman.20070701: use scws as default tokenizer
|
|---|
| 161 | + #ifdef HAVE_SCWS
|
|---|
| 162 | + #include <scws/scws.h>
|
|---|
| 163 | + #endif
|
|---|
| 164 |
|
|---|
| 165 | namespace Xapian {
|
|---|
| 166 |
|
|---|
| 167 | ***************
|
|---|
| 168 | *** 37,48 ****
|
|---|
| 169 | --- 41,62 ----
|
|---|
| 170 | const Stopper * stopper;
|
|---|
| 171 | Document doc;
|
|---|
| 172 | termcount termpos;
|
|---|
| 173 | + #ifdef HAVE_SCWS
|
|---|
| 174 | + scws_t scws;
|
|---|
| 175 | + #endif
|
|---|
| 176 | TermGenerator::flags flags;
|
|---|
| 177 | WritableDatabase db;
|
|---|
| 178 |
|
|---|
| 179 | public:
|
|---|
| 180 | Internal() : stopper(NULL), termpos(0),
|
|---|
| 181 | + #ifdef HAVE_SCWS
|
|---|
| 182 | + scws(NULL),
|
|---|
| 183 | + #endif
|
|---|
| 184 | flags(TermGenerator::flags(0)) { }
|
|---|
| 185 | + #ifdef HAVE_SCWS
|
|---|
| 186 | + ~Internal();
|
|---|
| 187 | + void load_libscws(const char *fpath, bool xmem, int multi);
|
|---|
| 188 | + #endif
|
|---|
| 189 | void index_text(Utf8Iterator itor,
|
|---|
| 190 | termcount weight,
|
|---|
| 191 | const std::string & prefix,
|
|---|
| 192 | *** xapian-core-1.3.0_svn/queryparser/queryparser.cc 2011-12-26 20:31:02.000000000 +0800
|
|---|
| 193 | --- xapian-core-1.3.0_scws/queryparser/queryparser.cc 2012-03-30 12:33:15.000000000 +0800
|
|---|
| 194 | ***************
|
|---|
| 195 | *** 136,141 ****
|
|---|
| 196 | --- 136,152 ----
|
|---|
| 197 | internal->max_wildcard_expansion = max;
|
|---|
| 198 | }
|
|---|
| 199 |
|
|---|
| 200 | + #if 1 /* HAVE_SCWS: compiled unconditionally so the method exists whether or not scws is enabled */
|
|---|
| 201 | + /// hightman.20070701: Pass the scws dictionary/rules settings on to the internal object; does nothing when HAVE_SCWS is undefined.
|
|---|
| 202 | + void
|
|---|
| 203 | + QueryParser::load_libscws(const char *fpath, bool xmem, int multi)
|
|---|
| 204 | + {
|
|---|
| 205 | + #ifdef HAVE_SCWS
|
|---|
| 206 | + internal->load_libscws(fpath, xmem, multi);
|
|---|
| 207 | + #endif
|
|---|
| 208 | + }
|
|---|
| 209 | + #endif
|
|---|
| 210 | +
|
|---|
| 211 | Query
|
|---|
| 212 | QueryParser::parse_query(const string &query_string, unsigned flags,
|
|---|
| 213 | const string &default_prefix)
|
|---|
| 214 | *** xapian-core-1.3.0_svn/queryparser/queryparser.lemony 2012-01-26 20:51:02.000000000 +0800
|
|---|
| 215 | --- xapian-core-1.3.0_scws/queryparser/queryparser.lemony 2012-03-30 13:46:27.000000000 +0800
|
|---|
| 216 | ***************
|
|---|
| 217 | *** 166,171 ****
|
|---|
| 218 | --- 166,174 ----
|
|---|
| 219 | string unstemmed;
|
|---|
| 220 | QueryParser::stem_strategy stem;
|
|---|
| 221 | termpos pos;
|
|---|
| 222 | + #ifdef HAVE_SCWS
|
|---|
| 223 | + vector<string> multi;
|
|---|
| 224 | + #endif
|
|---|
| 225 |
|
|---|
| 226 | Term(const string &name_, termpos pos_) : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
|
|---|
| 227 | Term(const string &name_) : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
|
|---|
| 228 | ***************
|
|---|
| 229 | *** 501,513 ****
|
|---|
| 230 | vector<Query> prefix_cjk;
|
|---|
| 231 | const list<string> & prefixes = prefix_info->prefixes;
|
|---|
| 232 | list<string>::const_iterator piter;
|
|---|
| 233 | ! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
|
|---|
| 234 | for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
|
|---|
| 235 | string cjk = *piter;
|
|---|
| 236 | cjk += *tk;
|
|---|
| 237 | prefix_cjk.push_back(Query(cjk, 1, pos));
|
|---|
| 238 | }
|
|---|
| 239 | }
|
|---|
| 240 | Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
|
|---|
| 241 | delete this;
|
|---|
| 242 | return q;
|
|---|
| 243 | --- 504,546 ----
|
|---|
| 244 | vector<Query> prefix_cjk;
|
|---|
| 245 | const list<string> & prefixes = prefix_info->prefixes;
|
|---|
| 246 | list<string>::const_iterator piter;
|
|---|
| 247 | ! /* hightman.20111223: used CJKTERM for multi segment */
|
|---|
| 248 | ! #ifdef HAVE_SCWS
|
|---|
| 249 | ! for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
|
|---|
| 250 | ! Query org = Query(*piter + name, 1, pos);
|
|---|
| 251 | ! termpos mpos = pos + 88;
|
|---|
| 252 | !
|
|---|
| 253 | ! /* hightman.20120104: get synonyms */
|
|---|
| 254 | ! if (state->flags & QueryParser::FLAG_AUTO_SYNONYMS) {
|
|---|
| 255 | ! Xapian::Database db = state->get_database();
|
|---|
| 256 | ! Xapian::TermIterator syn = db.synonyms_begin(name);
|
|---|
| 257 | ! Xapian::TermIterator end = db.synonyms_end(name);
|
|---|
| 258 | ! while (syn != end) {
|
|---|
| 259 | ! org = Query(Query::OP_SYNONYM, org, Query(*piter + *syn, 1, mpos++));
|
|---|
| 260 | ! ++syn;
|
|---|
| 261 | ! }
|
|---|
| 262 | ! }
|
|---|
| 263 | ! if (!multi.empty()) {
|
|---|
| 264 | ! vector<string>::const_iterator mi;
|
|---|
| 265 | ! vector<Query> multi_cjk;
|
|---|
| 266 | ! for (mi = multi.begin(); mi != multi.end(); ++mi) {
|
|---|
| 267 | ! // hightman: force to sort behind for get_terms()
|
|---|
| 268 | ! multi_cjk.push_back(Query(*piter + *mi, 1, mpos++));
|
|---|
| 269 | ! }
|
|---|
| 270 | ! Query syn = Query(state->default_op(), multi_cjk.begin(), multi_cjk.end());
|
|---|
| 271 | ! org = Query(Query::OP_SYNONYM, org, syn);
|
|---|
| 272 | ! }
|
|---|
| 273 | ! prefix_cjk.push_back(org);
|
|---|
| 274 | ! }
|
|---|
| 275 | ! #else
|
|---|
| 276 | ! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
|
|---|
| 277 | for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
|
|---|
| 278 | string cjk = *piter;
|
|---|
| 279 | cjk += *tk;
|
|---|
| 280 | prefix_cjk.push_back(Query(cjk, 1, pos));
|
|---|
| 281 | }
|
|---|
| 282 | }
|
|---|
| 283 | + #endif /* HAVE_SCWS */
|
|---|
| 284 | Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
|
|---|
| 285 | delete this;
|
|---|
| 286 | return q;
|
|---|
| 287 | ***************
|
|---|
| 288 | *** 618,629 ****
|
|---|
| 289 | --- 651,728 ----
|
|---|
| 290 | }
|
|---|
| 291 | }
|
|---|
| 292 |
|
|---|
| 293 | + /// hightman.20110701: load libscws
|
|---|
| 294 | + #ifdef HAVE_SCWS
|
|---|
| 295 | + QueryParser::Internal::~Internal() // Frees the scws resources still held by this object.
|
|---|
| 296 | + {
|
|---|
| 297 | + if (rptr != NULL) { // a segmentation result list is still held
|
|---|
| 298 | + scws_free_result(rptr);
|
|---|
| 299 | + rptr = NULL;
|
|---|
| 300 | + }
|
|---|
| 301 | + if (scws != NULL) { // non-NULL only after load_libscws() has created the handle
|
|---|
| 302 | + scws_free(scws);
|
|---|
| 303 | + scws = NULL;
|
|---|
| 304 | + }
|
|---|
| 305 | + }
|
|---|
| 306 | +
|
|---|
| 307 | + void
|
|---|
| 308 | + QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi)
|
|---|
| 309 | + { // Creates and configures the scws handle on first use; the multi level is applied on every call.
|
|---|
| 310 | + if (scws == NULL) {
|
|---|
| 311 | + string temp;
|
|---|
| 312 | + const string dir(fpath ? fpath : SCWS_ETCDIR); // resource directory: caller's path, else the configured default
|
|---|
| 313 | + scws = scws_new();
|
|---|
| 314 | + scws_set_charset(scws, "utf8");
|
|---|
| 315 | + scws_set_ignore(scws, SCWS_NA);
|
|---|
| 316 | + scws_set_duality(scws, SCWS_YEA);
|
|---|
| 317 | + // Pass c_str(), not data(): these are C APIs and data() is only guaranteed NUL-terminated from C++11 on.
|
|---|
| 318 | + temp = dir + "/rules.utf8.ini";
|
|---|
| 319 | + scws_set_rule(scws, temp.c_str());
|
|---|
| 320 | + temp = dir + "/dict.utf8.xdb";
|
|---|
| 321 | + scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
|
|---|
| 322 | + /* hightman.20111209: custom dict support */
|
|---|
| 323 | + temp = dir + "/dict_user.txt";
|
|---|
| 324 | + scws_add_dict(scws, temp.c_str(), SCWS_XDICT_TXT);
|
|---|
| 325 | + }
|
|---|
| 326 | + if (multi >= 0 && multi < 0x10) // values outside 0..15 leave the current setting untouched
|
|---|
| 327 | + scws_set_multi(scws, (multi<<12));
|
|---|
| 328 | + }
|
|---|
| 329 | + #endif /* HAVE_SCWS */
|
|---|
| 330 | +
|
|---|
| 331 | string
|
|---|
| 332 | QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
|
|---|
| 333 | bool cjk_ngram, bool & is_cjk_term,
|
|---|
| 334 | bool &was_acronym)
|
|---|
| 335 | {
|
|---|
| 336 | string term;
|
|---|
| 337 | + #ifdef HAVE_SCWS
|
|---|
| 338 | + int off = it.raw() - qptr;
|
|---|
| 339 | + while (rcur && (off > rcur->off)) {
|
|---|
| 340 | + rcur = rcur->next;
|
|---|
| 341 | + }
|
|---|
| 342 | + was_acronym = false;
|
|---|
| 343 | + if (rcur == NULL) {
|
|---|
| 344 | + it = end;
|
|---|
| 345 | + term.resize(0);
|
|---|
| 346 | + } else {
|
|---|
| 347 | + // sometimes, auto_duality + word-end single word char will be repeated
|
|---|
| 348 | + // 说明几句 => 说明/几/几句
|
|---|
| 349 | + if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len)
|
|---|
| 350 | + rcur = rcur->next;
|
|---|
| 351 | +
|
|---|
| 352 | + term.append(qptr + rcur->off, rcur->len);
|
|---|
| 353 | + was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false;
|
|---|
| 354 | + is_cjk_term = CJK::codepoint_is_cjk(*it);
|
|---|
| 355 | + last_off = off = rcur->off + rcur->len;
|
|---|
| 356 | + rcur = rcur->next;
|
|---|
| 357 | +
|
|---|
| 358 | + // sometimes, auto duality or multisegment
|
|---|
| 359 | + // 几句说搞笑 => 几句/句说/搞笑
|
|---|
| 360 | + if (rcur && off > rcur->off && (rcur->off + rcur->len) > off)
|
|---|
| 361 | + off = rcur->off;
|
|---|
| 362 | + while ((it.raw() - qptr) < off) it++;
|
|---|
| 363 | + }
|
|---|
| 364 | + #else /* HAVE_SCWS */
|
|---|
| 365 | // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
|
|---|
| 366 | // Don't worry if there's a trailing '.' or not.
|
|---|
| 367 | if (U_isupper(*it)) {
|
|---|
| 368 | ***************
|
|---|
| 369 | *** 708,713 ****
|
|---|
| 370 | --- 807,813 ----
|
|---|
| 371 | }
|
|---|
| 372 | }
|
|---|
| 373 | }
|
|---|
| 374 | + #endif /* HAVE_SCWS */
|
|---|
| 375 | return term;
|
|---|
| 376 | }
|
|---|
| 377 |
|
|---|
| 378 | ***************
|
|---|
| 379 | *** 759,764 ****
|
|---|
| 380 | --- 859,890 ----
|
|---|
| 381 |
|
|---|
| 382 | ParserHandler pParser(ParseAlloc());
|
|---|
| 383 |
|
|---|
| 384 | + #ifdef HAVE_SCWS
|
|---|
| 385 | + /// Pre segmentation use scws
|
|---|
| 386 | + scws_res_t res;
|
|---|
| 387 | +
|
|---|
| 388 | + if (!scws) {
|
|---|
| 389 | + load_libscws(NULL, false, 3);
|
|---|
| 390 | + }
|
|---|
| 391 | + if (rptr != NULL) {
|
|---|
| 392 | + scws_free_result(rptr);
|
|---|
| 393 | + rptr = NULL;
|
|---|
| 394 | + }
|
|---|
| 395 | + qptr = qs.data();
|
|---|
| 396 | + scws_send_text(scws, qptr, qs.size());
|
|---|
| 397 | + while ((res = scws_get_result(scws)) != NULL) {
|
|---|
| 398 | + if (rptr == NULL) {
|
|---|
| 399 | + rcur = rptr = res;
|
|---|
| 400 | + } else {
|
|---|
| 401 | + rcur->next = res;
|
|---|
| 402 | + }
|
|---|
| 403 | + while (rcur->next != NULL) {
|
|---|
| 404 | + rcur = rcur->next;
|
|---|
| 405 | + }
|
|---|
| 406 | + }
|
|---|
| 407 | + rcur = rptr;
|
|---|
| 408 | + #endif /* HAVE_SCWS */
|
|---|
| 409 | +
|
|---|
| 410 | unsigned newprev = ' ';
|
|---|
| 411 | main_lex_loop:
|
|---|
| 412 | enum {
|
|---|
| 413 | ***************
|
|---|
| 414 | *** 1162,1167 ****
|
|---|
| 415 | --- 1288,1298 ----
|
|---|
| 416 | if (!stemmer.internal.get()) {
|
|---|
| 417 | // No stemmer is set.
|
|---|
| 418 | stem_term = STEM_NONE;
|
|---|
| 419 | + #ifdef HAVE_SCWS
|
|---|
| 420 | + } else if (is_cjk_term) {
|
|---|
| 421 | + // Don't stem CJK terms.
|
|---|
| 422 | + stem_term = STEM_NONE;
|
|---|
| 423 | + #endif
|
|---|
| 424 | } else if (stem_term == STEM_SOME) {
|
|---|
| 425 | if (!should_stem(unstemmed_term) ||
|
|---|
| 426 | (it != end && is_stem_preventer(*it))) {
|
|---|
| 427 | ***************
|
|---|
| 428 | *** 1175,1180 ****
|
|---|
| 429 | --- 1306,1322 ----
|
|---|
| 430 | unstemmed_term, stem_term, term_pos++);
|
|---|
| 431 |
|
|---|
| 432 | if (is_cjk_term) {
|
|---|
| 433 | + #ifdef HAVE_SCWS
|
|---|
| 434 | + /* multi scws handler */
|
|---|
| 435 | + term_obj->multi.clear();
|
|---|
| 436 | + while (rcur && (rcur->off + rcur->len) <= last_off) {
|
|---|
| 437 | + if (rcur->len > 3)
|
|---|
| 438 | + term_obj->multi.push_back(string(qptr + rcur->off, rcur->len));
|
|---|
| 439 | + rcur = rcur->next;
|
|---|
| 440 | + }
|
|---|
| 441 | + if (mode == IN_GROUP || mode == IN_GROUP2)
|
|---|
| 442 | + mode = DEFAULT;
|
|---|
| 443 | + #endif
|
|---|
| 444 | Parse(pParser, CJKTERM, term_obj, &state);
|
|---|
| 445 | if (it == end) break;
|
|---|
| 446 | continue;
|
|---|
| 447 | ***************
|
|---|
| 448 | *** 1305,1310 ****
|
|---|
| 449 | --- 1447,1459 ----
|
|---|
| 450 | }
|
|---|
| 451 | }
|
|---|
| 452 | done:
|
|---|
| 453 | + #ifdef HAVE_SCWS
|
|---|
| 454 | + /// Free all segmented terms/words
|
|---|
| 455 | + if (rptr != NULL) {
|
|---|
| 456 | + scws_free_result(rptr);
|
|---|
| 457 | + rptr = NULL;
|
|---|
| 458 | + }
|
|---|
| 459 | + #endif
|
|---|
| 460 | if (!state.error) {
|
|---|
| 461 | // Implicitly close any unclosed quotes...
|
|---|
| 462 | if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
|
|---|
| 463 | ***************
|
|---|
| 464 | *** 1656,1661 ****
|
|---|
| 465 | --- 1805,1815 ----
|
|---|
| 466 | void
|
|---|
| 467 | Term::as_positional_cjk_term(Terms * terms) const
|
|---|
| 468 | {
|
|---|
| 469 | + #ifdef HAVE_SCWS
|
|---|
| 470 | + // Add SCWS term only
|
|---|
| 471 | + Term * c = new Term(state, name, prefix_info, unstemmed, stem, pos);
|
|---|
| 472 | + terms->add_positional_term(c);
|
|---|
| 473 | + #else
|
|---|
| 474 | // Add each individual CJK character to the phrase.
|
|---|
| 475 | string t;
|
|---|
| 476 | for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
|
|---|
| 477 | ***************
|
|---|
| 478 | *** 1664,1669 ****
|
|---|
| 479 | --- 1818,1824 ----
|
|---|
| 480 | terms->add_positional_term(c);
|
|---|
| 481 | t.resize(0);
|
|---|
| 482 | }
|
|---|
| 483 | + #endif /* HAVE_SCWS */
|
|---|
| 484 |
|
|---|
| 485 | // FIXME: we want to add the n-grams as filters too for efficiency.
|
|---|
| 486 |
|
|---|
| 487 | *** xapian-core-1.3.0_svn/queryparser/termgenerator.cc 2011-07-03 20:31:02.000000000 +0800
|
|---|
| 488 | --- xapian-core-1.3.0_scws/queryparser/termgenerator.cc 2012-03-30 12:33:15.000000000 +0800
|
|---|
| 489 | ***************
|
|---|
| 490 | *** 74,79 ****
|
|---|
| 491 | --- 74,90 ----
|
|---|
| 492 | internal->db = db;
|
|---|
| 493 | }
|
|---|
| 494 |
|
|---|
| 495 | + #if 1 /* HAVE_SCWS: compiled unconditionally so the method exists whether or not scws is enabled */
|
|---|
| 496 | + /// hightman.20070701: Pass the scws dictionary/rules settings on to the internal object; does nothing when HAVE_SCWS is undefined.
|
|---|
| 497 | + void
|
|---|
| 498 | + TermGenerator::load_libscws(const char *fpath, bool xmem, int multi)
|
|---|
| 499 | + {
|
|---|
| 500 | + #ifdef HAVE_SCWS
|
|---|
| 501 | + internal->load_libscws(fpath, xmem, multi);
|
|---|
| 502 | + #endif
|
|---|
| 503 | + }
|
|---|
| 504 | + #endif
|
|---|
| 505 | +
|
|---|
| 506 | TermGenerator::flags
|
|---|
| 507 | TermGenerator::set_flags(flags toggle, flags mask)
|
|---|
| 508 | {
|
|---|
| 509 | *** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.cc 2011-08-24 20:51:02.000000000 +0800
|
|---|
| 510 | --- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.cc 2012-03-30 13:51:10.000000000 +0800
|
|---|
| 511 | ***************
|
|---|
| 512 | *** 125,130 ****
|
|---|
| 513 | --- 125,164 ----
|
|---|
| 514 | #define STOPWORDS_IGNORE 1
|
|---|
| 515 | #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
|
|---|
| 516 |
|
|---|
| 517 | + /// hightman.20070701: load libscws
|
|---|
| 518 | + #ifdef HAVE_SCWS
|
|---|
| 519 | + TermGenerator::Internal::~Internal() // Frees the scws handle, if one was created.
|
|---|
| 520 | + {
|
|---|
| 521 | + if (scws != NULL) { // non-NULL only after load_libscws() has created the handle
|
|---|
| 522 | + scws_free(scws);
|
|---|
| 523 | + scws = NULL;
|
|---|
| 524 | + }
|
|---|
| 525 | + }
|
|---|
| 526 | +
|
|---|
| 527 | + void
|
|---|
| 528 | + TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi)
|
|---|
| 529 | + { // Creates and configures the scws handle on first use; the multi level is applied on every call.
|
|---|
| 530 | + if (scws == NULL) {
|
|---|
| 531 | + string temp;
|
|---|
| 532 | + const string dir(fpath ? fpath : SCWS_ETCDIR); // resource directory: caller's path, else the configured default
|
|---|
| 533 | + scws = scws_new();
|
|---|
| 534 | + scws_set_charset(scws, "utf8");
|
|---|
| 535 | + scws_set_ignore(scws, SCWS_NA);
|
|---|
| 536 | + scws_set_duality(scws, SCWS_YEA);
|
|---|
| 537 | + // Pass c_str(), not data(): these are C APIs and data() is only guaranteed NUL-terminated from C++11 on.
|
|---|
| 538 | + temp = dir + "/rules.utf8.ini";
|
|---|
| 539 | + scws_set_rule(scws, temp.c_str());
|
|---|
| 540 | + temp = dir + "/dict.utf8.xdb";
|
|---|
| 541 | + scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
|
|---|
| 542 | + /* hightman.20111209: custom dict support */
|
|---|
| 543 | + temp = dir + "/dict_user.txt";
|
|---|
| 544 | + scws_add_dict(scws, temp.c_str(), SCWS_XDICT_TXT);
|
|---|
| 545 | + }
|
|---|
| 546 | + if (multi >= 0 && multi < 0x10) // values outside 0..15 leave the current setting untouched
|
|---|
| 547 | + scws_set_multi(scws, (multi<<12));
|
|---|
| 548 | + }
|
|---|
| 549 | + #endif
|
|---|
| 550 | +
|
|---|
| 551 | void
|
|---|
| 552 | TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
|
|---|
| 553 | const string & prefix, bool with_positions)
|
|---|
| 554 | ***************
|
|---|
| 555 | *** 135,140 ****
|
|---|
| 556 | --- 169,205 ----
|
|---|
| 557 |
|
|---|
| 558 | if (!stopper) stop_mode = STOPWORDS_NONE;
|
|---|
| 559 |
|
|---|
| 560 | + #ifdef HAVE_SCWS
|
|---|
| 561 | + int last_endpos = 0, last_off = 0;
|
|---|
| 562 | + scws_res_t res, cur;
|
|---|
| 563 | + Utf8Iterator iterm;
|
|---|
| 564 | + const char *text = itor.raw();
|
|---|
| 565 | +
|
|---|
| 566 | + if (!scws) load_libscws(NULL, false, 3);
|
|---|
| 567 | + scws_send_text(scws, text, itor.left());
|
|---|
| 568 | + while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) {
|
|---|
| 569 | + string term;
|
|---|
| 570 | +
|
|---|
| 571 | + iterm.assign(text + cur->off, cur->len);
|
|---|
| 572 | + if (!Unicode::is_wordchar(*iterm)) {
|
|---|
| 573 | + cur = cur->next;
|
|---|
| 574 | + continue;
|
|---|
| 575 | + }
|
|---|
| 576 | + term = Unicode::tolower(string(text + cur->off, cur->len));
|
|---|
| 577 | + if (with_positions) {
|
|---|
| 578 | + /// for part word(short, duality)
|
|---|
| 579 | + if ((cur->off + cur->len) <= last_endpos)
|
|---|
| 580 | + --termpos;
|
|---|
| 581 | + else {
|
|---|
| 582 | + /// for dualities' first single word
|
|---|
| 583 | + if (cur->off == last_off)
|
|---|
| 584 | + --termpos;
|
|---|
| 585 | + last_endpos = cur->off + cur->len;
|
|---|
| 586 | + }
|
|---|
| 587 | + }
|
|---|
| 588 | + last_off = cur->off;
|
|---|
| 589 | + cur = cur->next;
|
|---|
| 590 | + #else
|
|---|
| 591 | while (true) {
|
|---|
| 592 | // Advance to the start of the next term.
|
|---|
| 593 | unsigned ch;
|
|---|
| 594 | ***************
|
|---|
| 595 | *** 254,259 ****
|
|---|
| 596 | --- 319,325 ----
|
|---|
| 597 | }
|
|---|
| 598 |
|
|---|
| 599 | endofterm:
|
|---|
| 600 | + #endif /* HAVE_SCWS */
|
|---|
| 601 | if (term.size() > MAX_PROB_TERM_LENGTH) continue;
|
|---|
| 602 |
|
|---|
| 603 | if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
|
|---|
| 604 | ***************
|
|---|
| 605 | *** 263,268 ****
|
|---|
| 606 | --- 329,338 ----
|
|---|
| 607 | } else {
|
|---|
| 608 | doc.add_term(prefix + term, wdf_inc);
|
|---|
| 609 | }
|
|---|
| 610 | + #ifdef HAVE_SCWS
|
|---|
| 611 | + /// hightman: Term start with CJK character needn't spell & stem
|
|---|
| 612 | + if (CJK::codepoint_is_cjk(*iterm)) continue;
|
|---|
| 613 | + #endif
|
|---|
| 614 | if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
|
|---|
| 615 |
|
|---|
| 616 | if (!stemmer.internal.get()) continue;
|
|---|
| 617 | ***************
|
|---|
| 618 | *** 280,285 ****
|
|---|
| 619 | --- 350,358 ----
|
|---|
| 620 | stem += stemmer(term);
|
|---|
| 621 | doc.add_term(stem, wdf_inc);
|
|---|
| 622 | }
|
|---|
| 623 | + #ifdef HAVE_SCWS
|
|---|
| 624 | + scws_free_result(res); }
|
|---|
| 625 | + #endif
|
|---|
| 626 | }
|
|---|
| 627 |
|
|---|
| 628 | }
|
|---|