Index: queryparser/termgenerator_internal.h
===================================================================
--- queryparser/termgenerator_internal.h	(revision 16065)
+++ queryparser/termgenerator_internal.h	(working copy)
@@ -27,6 +27,11 @@
 #include <xapian/termgenerator.h>
 #include <xapian/stem.h>
 
+/// hightman.20110701: use scws as default tokenizer
+#ifdef HAVE_SCWS
+#include <scws/scws.h>
+#endif
+
 namespace Xapian {
 
 class Stopper;
@@ -37,12 +42,22 @@
     const Stopper * stopper;
     Document doc;
     termcount termpos;
+#ifdef HAVE_SCWS
+    scws_t scws;
+#endif    
     TermGenerator::flags flags;
     WritableDatabase db;
 
   public:
     Internal() : stopper(NULL), termpos(0),
+#ifdef HAVE_SCWS
+	scws(NULL),
+#endif
 	flags(TermGenerator::flags(0)) { }
+#ifdef HAVE_SCWS
+    ~Internal();
+    void load_libscws(const char *fpath, bool xmem, int multi);
+#endif	
     void index_text(Utf8Iterator itor,
 		    termcount weight,
 		    const std::string & prefix,
Index: queryparser/termgenerator.cc
===================================================================
--- queryparser/termgenerator.cc	(revision 16065)
+++ queryparser/termgenerator.cc	(working copy)
@@ -74,6 +74,17 @@
     internal->db = db;
 }
 
+#if 1   /* HAVE_SCWS */
+/// hightman.20110701: forward to the internal object to load the scws rules/dict (empty body without HAVE_SCWS)
+void
+TermGenerator::load_libscws(const char *fpath, bool xmem, int multi)
+{
+#ifdef HAVE_SCWS
+    internal->load_libscws(fpath, xmem, multi);
+#endif
+}
+#endif
+
 TermGenerator::flags
 TermGenerator::set_flags(flags toggle, flags mask)
 {
Index: queryparser/queryparser.lemony
===================================================================
--- queryparser/queryparser.lemony	(revision 16065)
+++ queryparser/queryparser.lemony	(working copy)
@@ -563,12 +563,75 @@
    }
 }
 
+/// hightman.20110701: load libscws
+#ifdef HAVE_SCWS
+QueryParser::Internal::~Internal()
+{
+    // Release the segmentation results left over from the last parse first,
+    // then the scws handle itself (if one was ever created).
+    if (rptr) {
+	scws_free_result(rptr);
+	rptr = NULL;
+    }
+    if (scws) scws_free(scws);
+    scws = NULL;
+}
+
+/* Create and configure the scws handle on first use (file paths are built
+ * under fpath, or SCWS_ETCDIR when fpath is NULL), then apply multi. */
+void
+QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi)
+{
+    if (scws == NULL) {
+	const string dir(fpath ? fpath : SCWS_ETCDIR);
+	scws = scws_new();
+	scws_set_charset(scws, "utf8");
+	scws_set_ignore(scws, SCWS_NA);
+	scws_set_duality(scws, SCWS_YEA);
+	// c_str() (not data()) so the C API always gets a NUL-terminated path.
+	scws_set_rule(scws, (dir + "/rules.utf8.ini").c_str());
+	scws_set_dict(scws, (dir + "/dict.utf8.xdb").c_str(),
+		      xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
+    }
+    if (multi >= 0 && multi < 0x10)
+	scws_set_multi(scws, (multi<<12));
+}
+#endif	/* HAVE_SCWS */
+
 string
 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
 				  bool cjk_ngram, bool & is_cjk_term,
 				  bool &was_acronym)
 {
     string term;
+#ifdef HAVE_SCWS
+    int off = it.raw() - qptr;
+    while (rcur && (off > rcur->off)) {
+	rcur = rcur->next;
+    }
+    was_acronym = false;
+    if (rcur == NULL) { 
+	it = end;
+	term.resize(0);
+    } else {
+	// sometimes, auto_duality + word-end single word char will be repeated
+	// 说明几句 => 说明/几/几句
+	if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len)
+	    rcur = rcur->next;
+
+	term.append(qptr + rcur->off, rcur->len);
+	was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false;
+	is_cjk_term = CJK::codepoint_is_cjk(*it);
+	off = rcur->off + rcur->len;
+	rcur = rcur->next;
+
+	// sometimes, auto duality or multisegment
+	// 几句说搞笑 => 几句/句说/搞笑
+	if (rcur && off > rcur->off && (rcur->off + rcur->len) > off)
+	    off = rcur->off;
+	it = Utf8Iterator(qptr + off);
+    }
+#else	/* HAVE_SCWS */
     // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
     // Don't worry if there's a trailing '.' or not.
     if (U_isupper(*it)) {
@@ -653,6 +716,7 @@
 	    }
 	}
     }
+#endif	/* HAVE_SCWS */
     return term;
 }
 
@@ -704,6 +768,32 @@
 
     ParserHandler pParser(ParseAlloc());
 
+#ifdef HAVE_SCWS
+    /// Pre segmentation use scws
+    scws_res_t res;
+
+    if (!scws) { 
+	load_libscws(NULL, false, 0);
+    }
+    if (rptr != NULL) {
+	scws_free_result(rptr);
+	rptr = NULL;
+    }
+    qptr = qs.data();
+    scws_send_text(scws, qptr, qs.size());
+    while ((res = scws_get_result(scws)) != NULL) {
+	if (rptr == NULL) { 
+	    rcur = rptr = res;
+        } else { 
+	    rcur->next = res;
+        }
+	while (rcur->next != NULL) { 
+	    rcur = rcur->next;
+	}
+    }
+    rcur = rptr;
+#endif	/* HAVE_SCWS */
+
     unsigned newprev = ' ';
 main_lex_loop:
     enum {
@@ -1101,6 +1191,12 @@
 		if (!stemmer.internal.get()) {
 		    // No stemmer is set.
 		    stem_term = STEM_NONE;
+#ifdef HAVE_SCWS
+		} else if (is_cjk_term) {
+		    // Don't stem CJK terms.  This branch is closed by the "}" of
+		    // the "} else if" line that follows the #endif.
+		    stem_term = STEM_NONE;
+#endif
 		} else if (stem_term == STEM_SOME) {
 		    if (!should_stem(unstemmed_term) ||
 			(it != end && is_stem_preventer(*it))) {
@@ -1244,6 +1340,14 @@
 	}
     }
 done:
+#ifdef HAVE_SCWS
+    /// Free all segmented terms/words
+    if (rptr != NULL) {
+	scws_free_result(rptr);
+	rptr = NULL;
+    }
+#endif
+
     if (!state.error) {
 	// Implicitly close any unclosed quotes...
 	if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
Index: queryparser/queryparser_internal.h
===================================================================
--- queryparser/queryparser_internal.h	(revision 16065)
+++ queryparser/queryparser_internal.h	(working copy)
@@ -29,6 +29,11 @@
 #include <xapian/queryparser.h>
 #include <xapian/stem.h>
 
+/// hightman.20110701: use scws as default tokenizer
+#ifdef HAVE_SCWS
+#include <scws/scws.h>
+#endif
+
 #include <list>
 #include <map>
 
@@ -63,6 +68,11 @@
     Stem stemmer;
     stem_strategy stem_action;
     const Stopper * stopper;
+#ifdef HAVE_SCWS
+    scws_t scws;
+    scws_res_t rptr, rcur;
+    const char *qptr;
+#endif    
     Query::op default_op;
     const char * errmsg;
     Database db;
@@ -88,7 +98,14 @@
 
   public:
     Internal() : stem_action(STEM_NONE), stopper(NULL),
+#ifdef HAVE_SCWS
+	scws(NULL), rptr(NULL), rcur(NULL),
+#endif
 	default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { }
+#ifdef HAVE_SCWS
+    ~Internal();
+    void load_libscws(const char *fpath, bool xmem, int multi);
+#endif
     Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix);
 };
 
Index: queryparser/queryparser.cc
===================================================================
--- queryparser/queryparser.cc	(revision 16065)
+++ queryparser/queryparser.cc	(working copy)
@@ -112,6 +112,17 @@
     internal->max_wildcard_expansion = max;
 }
 
+#if 1   /* HAVE_SCWS */
+/// hightman.20110701: forward to the internal object to load the scws rules/dict (empty body without HAVE_SCWS)
+void
+QueryParser::load_libscws(const char *fpath, bool xmem, int multi)
+{
+#ifdef HAVE_SCWS
+    internal->load_libscws(fpath, xmem, multi);
+#endif
+}
+#endif
+
 Query
 QueryParser::parse_query(const string &query_string, unsigned flags,
 			 const string &default_prefix)
Index: queryparser/termgenerator_internal.cc
===================================================================
--- queryparser/termgenerator_internal.cc	(revision 16065)
+++ queryparser/termgenerator_internal.cc	(working copy)
@@ -125,6 +125,36 @@
 #define STOPWORDS_IGNORE 1
 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
 
+/// hightman.20070701: load libscws
+#ifdef HAVE_SCWS
+TermGenerator::Internal::~Internal()
+{
+    // The handle starts out NULL and is only created by load_libscws(), so
+    // it may still be NULL here.
+    if (scws) scws_free(scws);
+    scws = NULL;
+}
+/* Create and configure the scws handle on first use (file paths are built
+ * under fpath, or SCWS_ETCDIR when fpath is NULL), then apply multi. */
+void
+TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi)
+{
+    if (scws == NULL) {
+	const string dir(fpath ? fpath : SCWS_ETCDIR);
+	scws = scws_new();
+	scws_set_charset(scws, "utf8");
+	scws_set_ignore(scws, SCWS_NA);
+	scws_set_duality(scws, SCWS_YEA);
+	// c_str() (not data()) so the C API always gets a NUL-terminated path.
+	scws_set_rule(scws, (dir + "/rules.utf8.ini").c_str());
+	scws_set_dict(scws, (dir + "/dict.utf8.xdb").c_str(),
+		      xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
+    }
+    if (multi >= 0 && multi < 0x10)
+	scws_set_multi(scws, (multi<<12));
+}
+#endif	/* HAVE_SCWS */
+
 void
 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
 				    const string & prefix, bool with_positions)
@@ -135,6 +165,37 @@
 
     if (!stopper) stop_mode = STOPWORDS_NONE;
 
+#ifdef HAVE_SCWS
+    int last_endpos = 0, last_off = 0;
+    scws_res_t res, cur;
+    Utf8Iterator iterm;
+    const char *text = itor.raw();
+
+    if (!scws) load_libscws(NULL, false, 0);
+    scws_send_text(scws, text, itor.left());
+    while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) {
+    string term;
+
+    iterm.assign(text + cur->off, cur->len);
+    if (!Unicode::is_wordchar(*iterm)) {
+	cur = cur->next;
+	continue;
+    }
+    term = Unicode::tolower(string(text + cur->off, cur->len));
+    if (with_positions) {
+	/// for part word(short, duality)
+ 	if ((cur->off + cur->len) <= last_endpos)
+	    --termpos;
+	else {
+	    /// for dualities' first single word
+	    if (cur->off == last_off)
+		--termpos;
+	    last_endpos = cur->off + cur->len;
+	}
+    }
+    last_off = cur->off;
+    cur = cur->next;
+#else	/* HAVE_SCWS */
     while (true) {
 	// Advance to the start of the next term.
 	unsigned ch;
@@ -254,6 +315,7 @@
 	}
 
 endofterm:
+#endif	/* HAVE_SCWS */
 	if (term.size() > MAX_PROB_TERM_LENGTH) continue;
 
 	if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
@@ -263,6 +325,10 @@
 	} else {
 	    doc.add_term(prefix + term, wdf_inc);
 	}
+#ifdef HAVE_SCWS
+	/// CJK term need not spelling & stemmer
+	if (CJK::codepoint_is_cjk(*iterm)) continue;
+#endif
 	if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
 
 	if (!stemmer.internal.get()) continue;
@@ -280,6 +346,9 @@
 	stem += stemmer(term);
 	doc.add_term(stem, wdf_inc);
     }
+#ifdef HAVE_SCWS
+    scws_free_result(res); }
+#endif
 }
 
 }
Index: configure.ac
===================================================================
--- configure.ac	(revision 16065)
+++ configure.ac	(working copy)
@@ -1000,6 +1000,63 @@
     [Define if you want a log of methods called and other debug messages])
 fi
 
+dnl **********************
+dnl * Check scws library *
+dnl **********************
+dnl hightman.20110411: See if we want to use scws as the default tokenizer
+SCWS_DIR=""
+AC_MSG_CHECKING(for scws)
+AC_ARG_WITH(scws,
+  [AS_HELP_STRING([--with-scws@<:@=DIR@:>@],
+    [use scws as the default tokenizer, DIR is the installation directory scws]
+  )], [ ],[ with_scws=no ]
+)
+
+if test "$with_scws" = "no"; then 
+  AC_MSG_RESULT(no)
+else
+  # Locate scws.h: probe the default prefixes, or check the DIR the user gave.
+  if test "$with_scws" = "yes"; then
+    searchdirs="/usr /usr/local /usr/local/scws /opt/local"
+    for tmpdir in $searchdirs ; do
+      if test -f "$tmpdir/include/scws/scws.h" ; then
+        SCWS_DIR=$tmpdir
+        break
+      fi
+    done
+    if test "$SCWS_DIR" = ""; then
+      AC_MSG_RESULT(no)
+      AC_MSG_ERROR([scws not found in default directories, specify DIR plz...])
+    fi
+  elif test -f "$withval/include/scws/scws.h" ; then
+    SCWS_DIR=$withval
+  else
+    AC_MSG_RESULT(no)
+    AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws])
+  fi
+  AC_MSG_RESULT([yes: $SCWS_DIR])
+
+  # Etc directory
+  if test "$SCWS_DIR" = "/usr"; then
+    SCWS_ETCDIR="/etc"
+  else
+    SCWS_ETCDIR="$SCWS_DIR/etc"
+  fi
+
+  # Check scws library
+  AC_CHECK_LIB(scws, scws_new, [
+    LIBS="$LIBS -L$SCWS_DIR/lib -lscws"
+    XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws"
+    CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include"
+    AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer])
+    AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules])
+  ],[
+    AC_MSG_ERROR([scws_new() NOT found in libscws, please re-install scws])
+  ],[
+    -L$SCWS_DIR/lib
+  ])
+fi  
+
 dnl ******************************
 dnl * Set special compiler flags *
 dnl ******************************
Index: include/xapian/termgenerator.h
===================================================================
--- include/xapian/termgenerator.h	(revision 16065)
+++ include/xapian/termgenerator.h	(working copy)
@@ -78,6 +78,11 @@
 
     /// Set the database to index spelling data to.
     void set_database(const Xapian::WritableDatabase &db);
+    
+#if 1   /* HAVE_SCWS */
+    /// hightman.20110706: Load the scws rules and dict from directory fpath (NULL selects the default directory); only used when HAVE_SCWS.
+    void load_libscws(const char *fpath, bool xmem = false, int multi = 0); 
+#endif    
 
     /// Flags to OR together and pass to TermGenerator::set_flags().
     enum flags {
Index: include/xapian/queryparser.h
===================================================================
--- include/xapian/queryparser.h	(revision 16065)
+++ include/xapian/queryparser.h	(working copy)
@@ -455,6 +455,15 @@
      *			can expand to, or 0 for no limit (which is the default).
      */
     void set_max_wildcard_expansion(Xapian::termcount limit);
+    
+#if 1   /* HAVE_SCWS */
+    /** hightman.20110706: Load the scws rules and dictionary, only used when HAVE_SCWS (fpath and xmem only matter on the first load).
+     *  @param fpath    Directory holding rules.utf8.ini and dict.utf8.xdb (char *); NULL selects the default directory
+     *  @param xmem     Whether to load the whole dict file into memory (default to false)
+     *  @param multi    multiset (int 0~15) handed to scws_set_multi(); values outside that range are ignored
+     */
+    void load_libscws(const char *fpath, bool xmem = false, int multi = 0); 
+#endif    
 
     /** Parse a query.
      *
