Index: queryparser/termgenerator_internal.h =================================================================== --- queryparser/termgenerator_internal.h (revision 16065) +++ queryparser/termgenerator_internal.h (working copy) @@ -27,6 +27,11 @@ #include #include +/// hightman.20110701: use scws as default tokenizer +#ifdef HAVE_SCWS +#include <scws/scws.h> +#endif + namespace Xapian { class Stopper; @@ -37,12 +42,22 @@ const Stopper * stopper; Document doc; termcount termpos; +#ifdef HAVE_SCWS + scws_t scws; +#endif TermGenerator::flags flags; WritableDatabase db; public: Internal() : stopper(NULL), termpos(0), +#ifdef HAVE_SCWS + scws(NULL), +#endif flags(TermGenerator::flags(0)) { } +#ifdef HAVE_SCWS + ~Internal(); + void load_libscws(const char *fpath, bool xmem, int multi); +#endif void index_text(Utf8Iterator itor, termcount weight, const std::string & prefix, Index: queryparser/termgenerator.cc =================================================================== --- queryparser/termgenerator.cc (revision 16065) +++ queryparser/termgenerator.cc (working copy) @@ -74,6 +74,17 @@ internal->db = db; } +#if 1 /* HAVE_SCWS */ +/// hightman.20110701: load the specified dict file for scws +void +TermGenerator::load_libscws(const char *fpath, bool xmem, int multi) +{ +#ifdef HAVE_SCWS + internal->load_libscws(fpath, xmem, multi); +#endif +} +#endif + TermGenerator::flags TermGenerator::set_flags(flags toggle, flags mask) { Index: queryparser/queryparser.lemony =================================================================== --- queryparser/queryparser.lemony (revision 16065) +++ queryparser/queryparser.lemony (working copy) @@ -563,12 +563,75 @@ } } +/// hightman.20110701: scws support. The destructor frees any pending result list and the scws handle; load_libscws() creates the handle on first use, loading rules.utf8.ini and dict.utf8.xdb from fpath (SCWS_ETCDIR when fpath is NULL), and applies multi when it is in 0..15. +#ifdef HAVE_SCWS +QueryParser::Internal::~Internal() +{ + if (rptr != NULL) { + scws_free_result(rptr); + rptr = NULL; + } + if (scws != NULL) { + scws_free(scws); + scws = NULL; + } +} + +void +QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi) +{ + if (scws == NULL) { + 
string temp; + + scws = scws_new(); + scws_set_charset(scws, "utf8"); + scws_set_ignore(scws, SCWS_NA); + scws_set_duality(scws, SCWS_YEA); + + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini"); + scws_set_rule(scws, temp.c_str()); + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb"); + scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB); + } + if (multi >= 0 && multi < 0x10) + scws_set_multi(scws, (multi<<12)); +} +#endif /* HAVE_SCWS */ + string QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end, bool cjk_ngram, bool & is_cjk_term, bool &was_acronym) { string term; +#ifdef HAVE_SCWS + int off = it.raw() - qptr; + while (rcur && (off > rcur->off)) { + rcur = rcur->next; + } + was_acronym = false; + if (rcur == NULL) { + it = end; + term.resize(0); + } else { + // sometimes, auto_duality + word-end single word char will be repeated + // 说明几句 => 说明/几/几句 + if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len) + rcur = rcur->next; + + term.append(qptr + rcur->off, rcur->len); + was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false; + is_cjk_term = CJK::codepoint_is_cjk(*it); + off = rcur->off + rcur->len; + rcur = rcur->next; + + // sometimes, auto duality or multisegment + // 几句说搞笑 => 几句/句说/搞笑 + if (rcur && off > rcur->off && (rcur->off + rcur->len) > off) + off = rcur->off; + it = Utf8Iterator(qptr + off); + } +#else /* HAVE_SCWS */ // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E). // Don't worry if there's a trailing '.' or not. 
if (U_isupper(*it)) { @@ -653,6 +716,7 @@ } } } +#endif /* HAVE_SCWS */ return term; } @@ -704,6 +768,32 @@ ParserHandler pParser(ParseAlloc()); +#ifdef HAVE_SCWS + /// Pre-segment the whole query string with scws; parse_term() then walks the result list through rcur + scws_res_t res; + + if (!scws) { + load_libscws(NULL, false, 0); + } + if (rptr != NULL) { + scws_free_result(rptr); + rptr = NULL; + } + qptr = qs.c_str(); // c_str(): parse_term() builds Utf8Iterator from a bare pointer into this buffer + scws_send_text(scws, qptr, qs.size()); + while ((res = scws_get_result(scws)) != NULL) { + if (rptr == NULL) { + rcur = rptr = res; + } else { + rcur->next = res; + } + while (rcur->next != NULL) { + rcur = rcur->next; + } + } + rcur = rptr; +#endif /* HAVE_SCWS */ + unsigned newprev = ' '; main_lex_loop: enum { @@ -1101,6 +1191,11 @@ if (!stemmer.internal.get()) { // No stemmer is set. stem_term = STEM_NONE; +#ifdef HAVE_SCWS + } else if (is_cjk_term) { + // Don't stem CJK terms. + stem_term = STEM_NONE; +#endif } else if (stem_term == STEM_SOME) { if (!should_stem(unstemmed_term) || (it != end && is_stem_preventer(*it))) { @@ -1244,6 +1339,14 @@ } } done: +#ifdef HAVE_SCWS + /// Free all segmented terms/words + if (rptr != NULL) { + scws_free_result(rptr); + rptr = NULL; + } +#endif + if (!state.error) { // Implicitly close any unclosed quotes... 
if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES) Index: queryparser/queryparser_internal.h =================================================================== --- queryparser/queryparser_internal.h (revision 16065) +++ queryparser/queryparser_internal.h (working copy) @@ -29,6 +29,11 @@ #include #include +/// hightman.20110701: use scws as default tokenizer +#ifdef HAVE_SCWS +#include <scws/scws.h> +#endif + #include #include @@ -63,6 +68,11 @@ Stem stemmer; stem_strategy stem_action; const Stopper * stopper; +#ifdef HAVE_SCWS + scws_t scws; + scws_res_t rptr, rcur; + const char *qptr; +#endif Query::op default_op; const char * errmsg; Database db; @@ -88,7 +98,14 @@ public: Internal() : stem_action(STEM_NONE), stopper(NULL), +#ifdef HAVE_SCWS + scws(NULL), rptr(NULL), rcur(NULL), qptr(NULL), +#endif default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { } +#ifdef HAVE_SCWS + ~Internal(); + void load_libscws(const char *fpath, bool xmem, int multi); +#endif Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix); }; Index: queryparser/queryparser.cc =================================================================== --- queryparser/queryparser.cc (revision 16065) +++ queryparser/queryparser.cc (working copy) @@ -112,6 +112,17 @@ internal->max_wildcard_expansion = max; } +#if 1 /* HAVE_SCWS */ +/// hightman.20110701: load the specified dict file for scws +void +QueryParser::load_libscws(const char *fpath, bool xmem, int multi) +{ +#ifdef HAVE_SCWS + internal->load_libscws(fpath, xmem, multi); +#endif +} +#endif + Query QueryParser::parse_query(const string &query_string, unsigned flags, const string &default_prefix) Index: queryparser/termgenerator_internal.cc =================================================================== --- queryparser/termgenerator_internal.cc (revision 16065) +++ queryparser/termgenerator_internal.cc (working copy) @@ -125,6 +125,36 @@ #define STOPWORDS_IGNORE 1 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2 +/// 
hightman.20070701: load libscws +#ifdef HAVE_SCWS +TermGenerator::Internal::~Internal() +{ + if (scws != NULL) { + scws_free(scws); + scws = NULL; + } +} +void +TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi) +{ + if (scws == NULL) { + string temp; + + scws = scws_new(); + scws_set_charset(scws, "utf8"); + scws_set_ignore(scws, SCWS_NA); + scws_set_duality(scws, SCWS_YEA); + + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini"); + scws_set_rule(scws, temp.c_str()); + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb"); + scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB); + } + if (multi >= 0 && multi < 0x10) + scws_set_multi(scws, (multi<<12)); +} +#endif /* HAVE_SCWS */ + void TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, const string & prefix, bool with_positions) @@ -135,6 +165,37 @@ if (!stopper) stop_mode = STOPWORDS_NONE; +#ifdef HAVE_SCWS + int last_endpos = 0, last_off = 0; + scws_res_t res, cur; + Utf8Iterator iterm; + const char *text = itor.raw(); + + if (!scws) load_libscws(NULL, false, 0); + scws_send_text(scws, text, itor.left()); + while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) { + string term; + + iterm.assign(text + cur->off, cur->len); + if (!Unicode::is_wordchar(*iterm)) { + cur = cur->next; + continue; + } + term = Unicode::tolower(string(text + cur->off, cur->len)); + if (with_positions) { + /// for part word(short, duality) + if ((cur->off + cur->len) <= last_endpos) + --termpos; + else { + /// for dualities' first single word + if (cur->off == last_off) + --termpos; + last_endpos = cur->off + cur->len; + } + } + last_off = cur->off; + cur = cur->next; +#else /* HAVE_SCWS */ while (true) { // Advance to the start of the next term. 
unsigned ch; @@ -254,6 +315,7 @@ } endofterm: +#endif /* HAVE_SCWS */ if (term.size() > MAX_PROB_TERM_LENGTH) continue; if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue; @@ -263,6 +325,10 @@ } else { doc.add_term(prefix + term, wdf_inc); } +#ifdef HAVE_SCWS + /// CJK terms need neither spelling data nor stemming + if (CJK::codepoint_is_cjk(*iterm)) continue; +#endif if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term); if (!stemmer.internal.get()) continue; @@ -280,6 +346,9 @@ stem += stemmer(term); doc.add_term(stem, wdf_inc); } +#ifdef HAVE_SCWS + scws_free_result(res); } +#endif } } Index: configure.ac =================================================================== --- configure.ac (revision 16065) +++ configure.ac (working copy) @@ -1000,6 +1000,63 @@ [Define if you want a log of methods called and other debug messages]) fi +dnl ********************** +dnl * Check scws library * +dnl ********************** +dnl hightman.20110411: See if we want to use scws as the default tokenizer +SCWS_DIR="" +AC_MSG_CHECKING(for scws) +AC_ARG_WITH(scws, + [AS_HELP_STRING([--with-scws@<:@=DIR@:>@], + [use scws as the default tokenizer, DIR is the installation directory scws] + )], [ ],[ with_scws=no ] +) + +if test "$with_scws" = "no"; then + AC_MSG_RESULT(no) +else + # Check header file + if test "$with_scws" = "yes"; then + searchdirs="/usr /usr/local /usr/local/scws /opt/local" + for tmpdir in $searchdirs ; do + if test -f $tmpdir/include/scws/scws.h ; then + SCWS_DIR=$tmpdir + break + fi + done + if test "$SCWS_DIR" = ""; then + AC_MSG_RESULT(no) + AC_MSG_ERROR([scws not found in default directories, specify DIR plz...]) + fi + elif test -f $withval/include/scws/scws.h ; then + SCWS_DIR=$withval + else + AC_MSG_RESULT(no) + AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws]) + fi + AC_MSG_RESULT([yes: $SCWS_DIR]) + + # Etc directory + if test "$SCWS_DIR" = "/usr"; then + SCWS_ETCDIR="/etc" + else + 
SCWS_ETCDIR="$SCWS_DIR/etc" + fi + + # Check scws library + AC_CHECK_LIB(scws, scws_new, [ + LIBS="$LIBS -L$SCWS_DIR/lib -lscws" + XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws" + CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include" + AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer]) + AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules]) + ],[ + AC_MSG_ERROR([scws_new() NOT found in libscws, please re-install scws]) + ],[ + -L$SCWS_DIR/lib + ]) +fi + dnl ****************************** dnl * Set special compiler flags * dnl ****************************** Index: include/xapian/termgenerator.h =================================================================== --- include/xapian/termgenerator.h (revision 16065) +++ include/xapian/termgenerator.h (working copy) @@ -78,6 +78,11 @@ /// Set the database to index spelling data to. void set_database(const Xapian::WritableDatabase &db); + +#if 1 /* HAVE_SCWS */ + /// hightman.20110706: Specify the dict and rules file for scws, only used when HAVE_SCWS. + void load_libscws(const char *fpath, bool xmem = false, int multi = 0); +#endif /// Flags to OR together and pass to TermGenerator::set_flags(). enum flags { Index: include/xapian/queryparser.h =================================================================== --- include/xapian/queryparser.h (revision 16065) +++ include/xapian/queryparser.h (working copy) @@ -455,6 +455,15 @@ * can expand to, or 0 for no limit (which is the default). */ void set_max_wildcard_expansion(Xapian::termcount limit); + +#if 1 /* HAVE_SCWS */ + /** hightman.20110706: Specify the dict and rules file for scws, only used when HAVE_SCWS. 
+ * @param fpath Path of dictionary file and rule files (char *) + * @param xmem Whether to load the whole dict file into memory (default to false) + * @param multi multiset (int 0~15) + */ + void load_libscws(const char *fpath, bool xmem = false, int multi = 0); +#endif /** Parse a query. *