Ticket #594: xapian-scws-1.3.x-trunk.updated.patch
File xapian-scws-1.3.x-trunk.updated.patch, 14.2 KB (added 13 years ago; uploader name not preserved)
xapian-core/configure.ac
diff --git a/xapian-core/configure.ac b/xapian-core/configure.ac index b5e491d..ef95d14 100644
a b if test yes = "$enable_log"; then 997 997 [Define if you want a log of methods called and other debug messages]) 998 998 fi 999 999 1000 dnl ********************** 1001 dnl * Check scws library * 1002 dnl ********************** 1003 dnl See if we want to use scws as the default tokenizer 1004 SCWS_DIR= 1005 AC_MSG_CHECKING([for scws]) 1006 AC_ARG_WITH(scws, 1007 [AS_HELP_STRING([[--with-scws[=DIR]]], 1008 [use scws as the default tokenizer, DIR is the scws installation directory] 1009 )], [ ],[ with_scws=no ] 1010 ) 1011 1012 if test "$with_scws" = "no"; then 1013 AC_MSG_RESULT(no) 1014 else 1015 # Check header file 1016 if test "$with_scws" = "yes"; then 1017 for tmpdir in /usr /usr/local /usr/local/scws /opt/local ; do 1018 if test -f "$tmpdir/include/scws/scws.h" ; then 1019 SCWS_DIR=$tmpdir 1020 break 1021 fi 1022 done 1023 if test "$SCWS_DIR" = ""; then 1024 AC_MSG_RESULT(no) 1025 AC_MSG_ERROR([scws not found in default directories, please specify --with-scws=DIR]) 1026 fi 1027 elif test -f "$withval/include/scws/scws.h" ; then 1028 SCWS_DIR=$withval 1029 else 1030 AC_MSG_RESULT(no) 1031 AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws]) 1032 fi 1033 AC_MSG_RESULT([yes: $SCWS_DIR]) 1034 1035 dnl Etc directory 1036 if test "$SCWS_DIR" = "/usr"; then 1037 SCWS_ETCDIR="/etc" 1038 else 1039 SCWS_ETCDIR="$SCWS_DIR/etc" 1040 fi 1041 1042 dnl Check scws library 1043 AC_CHECK_LIB(scws, scws_new, [ 1044 LIBS="$LIBS -L$SCWS_DIR/lib -lscws" 1045 XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws" 1046 CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include" 1047 AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer]) 1048 AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules]) 1049 ],[ 1050 AC_MSG_ERROR([scws_new() NOT found in libscws, please re-install scws]) 1051 ],[ 1052 -L$SCWS_DIR/lib 1053 ]) 1054 fi 1055 1000 1056 dnl ****************************** 
1001 1057 dnl * Set special compiler flags * 1002 1058 dnl ****************************** -
xapian-core/include/xapian/queryparser.h
diff --git a/xapian-core/include/xapian/queryparser.h b/xapian-core/include/xapian/queryparser.h index 829e187..e3e0477 100644
a b class XAPIAN_VISIBILITY_DEFAULT QueryParser { 499 499 */ 500 500 void set_max_wildcard_expansion(Xapian::termcount limit); 501 501 502 #if 1 /* HAVE_SCWS */ 503 /** Specify the dict and rules file for scws, only used when HAVE_SCWS. 504 * 505 * @param fpath Path of directory containing dictionary file and rule files (char *) 506 * @param xmem Whether to load the whole dict file into memory (default to false) 507 * @param multi multiset (int 0~15) 508 */ 509 void load_libscws(const char *fpath, bool xmem = false, int multi = 0); 510 #endif 511 502 512 /** Parse a query. 503 513 * 504 514 * @param query_string A free-text query as entered by a user -
xapian-core/include/xapian/termgenerator.h
diff --git a/xapian-core/include/xapian/termgenerator.h b/xapian-core/include/xapian/termgenerator.h index 28f4294..d38601f 100644
a b class XAPIAN_VISIBILITY_DEFAULT TermGenerator { 82 82 /// Set the database to index spelling data to. 83 83 void set_database(const Xapian::WritableDatabase &db); 84 84 85 /// Specify the dict and rules file for scws, only used when HAVE_SCWS. 86 void load_libscws(const char *fpath, bool xmem = false, int multi = 0); 87 85 88 /// Flags to OR together and pass to TermGenerator::set_flags(). 86 89 enum flags { 87 90 /// Index data required for spelling correction. -
xapian-core/queryparser/queryparser.cc
diff --git a/xapian-core/queryparser/queryparser.cc b/xapian-core/queryparser/queryparser.cc index 5136da2..6571f8b 100644
a b QueryParser::set_max_wildcard_expansion(Xapian::termcount max) 136 136 internal->max_wildcard_expansion = max; 137 137 } 138 138 139 void 140 QueryParser::load_libscws(const char *fpath, bool xmem, int multi) 141 { 142 #ifdef HAVE_SCWS 143 internal->load_libscws(fpath, xmem, multi); 144 #else 145 (void)fpath; 146 (void)xmem; 147 (void)multi; 148 #endif 149 } 150 139 151 Query 140 152 QueryParser::parse_query(const string &query_string, unsigned flags, 141 153 const string &default_prefix) -
xapian-core/queryparser/queryparser.lemony
diff --git a/xapian-core/queryparser/queryparser.lemony b/xapian-core/queryparser/queryparser.lemony index 8dedb80..81af1bb 100644
a b QueryParser::Internal::add_prefix(const string &field, const string &prefix, 618 618 } 619 619 } 620 620 621 #ifdef HAVE_SCWS 622 QueryParser::Internal::~Internal() 623 { 624 if (rptr != NULL) { 625 scws_free_result(rptr); 626 rptr = NULL; 627 } 628 if (scws != NULL) { 629 scws_free(scws); 630 scws = NULL; 631 } 632 } 633 634 void 635 QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi) 636 { 637 if (scws == NULL) { 638 string temp; 639 640 scws = scws_new(); 641 scws_set_charset(scws, "utf8"); 642 scws_set_ignore(scws, SCWS_NA); 643 scws_set_duality(scws, SCWS_YEA); 644 645 temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini"); 646 scws_set_rule(scws, temp.data()); 647 temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb"); 648 scws_set_dict(scws, temp.data(), xmem == true ? SCWS_XDICT_MEM : SCWS_XDICT_XDB); 649 } 650 if (multi >= 0 && multi < 0x10) 651 scws_set_multi(scws, (multi<<12)); 652 } 653 #endif /* HAVE_SCWS */ 654 621 655 string 622 656 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end, 623 657 bool cjk_ngram, bool & is_cjk_term, 624 658 bool &was_acronym) 625 659 { 626 660 string term; 661 #ifdef HAVE_SCWS 662 int off = it.raw() - qptr; 663 while (rcur && (off > rcur->off)) { 664 rcur = rcur->next; 665 } 666 was_acronym = false; 667 if (rcur == NULL) { 668 it = end; 669 term.resize(0); 670 } else { 671 // sometimes, auto_duality + word-end single word char will be repeated 672 // 说明几句 => 说明/几/几句 673 if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len) 674 rcur = rcur->next; 675 676 term.append(qptr + rcur->off, rcur->len); 677 was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? 
true : false; 678 is_cjk_term = CJK::codepoint_is_cjk(*it); 679 off = rcur->off + rcur->len; 680 rcur = rcur->next; 681 682 // sometimes, auto duality or multisegment 683 // 几句说搞笑 => 几句/句说/搞笑 684 if (rcur && off > rcur->off && (rcur->off + rcur->len) > off) 685 off = rcur->off; 686 it = Utf8Iterator(qptr + off); 687 } 688 #else /* HAVE_SCWS */ 627 689 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E). 628 690 // Don't worry if there's a trailing '.' or not. 629 691 if (U_isupper(*it)) { … … QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end, 708 770 } 709 771 } 710 772 } 773 #endif /* HAVE_SCWS */ 711 774 return term; 712 775 } 713 776 … … QueryParser::Internal::parse_query(const string &qs, unsigned flags, 759 822 760 823 ParserHandler pParser(ParseAlloc()); 761 824 825 #ifdef HAVE_SCWS 826 /// Pre segmentation use scws 827 scws_res_t res; 828 829 if (!scws) { 830 load_libscws(NULL, false, 0); 831 } 832 if (rptr != NULL) { 833 scws_free_result(rptr); 834 rptr = NULL; 835 } 836 qptr = qs.data(); 837 scws_send_text(scws, qptr, qs.size()); 838 while ((res = scws_get_result(scws)) != NULL) { 839 if (rptr == NULL) { 840 rcur = rptr = res; 841 } else { 842 rcur->next = res; 843 } 844 while (rcur->next != NULL) { 845 rcur = rcur->next; 846 } 847 } 848 rcur = rptr; 849 #endif /* HAVE_SCWS */ 850 762 851 unsigned newprev = ' '; 763 852 main_lex_loop: 764 853 enum { … … phrased_term: 1162 1251 if (!stemmer.internal.get()) { 1163 1252 // No stemmer is set. 1164 1253 stem_term = STEM_NONE; 1254 #ifdef HAVE_SCWS 1255 else if (is_cjk_term) { 1256 // Don't stem CJK terms. 
1257 stem_term = STEM_NONE; 1258 } 1259 #endif 1165 1260 } else if (stem_term == STEM_SOME) { 1166 1261 if (!should_stem(unstemmed_term) || 1167 1262 (it != end && is_stem_preventer(*it))) { … … phrased_term: 1305 1400 } 1306 1401 } 1307 1402 done: 1403 #ifdef HAVE_SCWS 1404 /// Free all segmented terms/words 1405 if (rptr != NULL) { 1406 scws_free_result(rptr); 1407 rptr = NULL; 1408 } 1409 #endif 1410 1308 1411 if (!state.error) { 1309 1412 // Implicitly close any unclosed quotes... 1310 1413 if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES) -
xapian-core/queryparser/queryparser_internal.h
diff --git a/xapian-core/queryparser/queryparser_internal.h b/xapian-core/queryparser/queryparser_internal.h index 86ce563..41d516a 100644
a b 29 29 #include <xapian/queryparser.h> 30 30 #include <xapian/stem.h> 31 31 32 #ifdef HAVE_SCWS 33 #include <scws/scws.h> 34 #endif 35 32 36 #include <list> 33 37 #include <map> 34 38 … … class QueryParser::Internal : public Xapian::Internal::intrusive_base { 63 67 Stem stemmer; 64 68 stem_strategy stem_action; 65 69 const Stopper * stopper; 70 #ifdef HAVE_SCWS 71 scws_t scws; 72 scws_res_t rptr, rcur; 73 const char *qptr; 74 #endif 66 75 Query::op default_op; 67 76 const char * errmsg; 68 77 Database db; … … class QueryParser::Internal : public Xapian::Internal::intrusive_base { 88 97 89 98 public: 90 99 Internal() : stem_action(STEM_SOME), stopper(NULL), 100 #ifdef HAVE_SCWS 101 scws(NULL), rptr(NULL), rcur(NULL), 102 #endif 91 103 default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { } 92 104 105 #ifdef HAVE_SCWS 106 ~Internal(); 107 108 void load_libscws(const char *fpath, bool xmem, int multi); 109 #endif 110 93 111 Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix); 94 112 }; 95 113 -
xapian-core/queryparser/termgenerator.cc
diff --git a/xapian-core/queryparser/termgenerator.cc b/xapian-core/queryparser/termgenerator.cc index e6f745f..20846f2 100644
a b TermGenerator::set_database(const Xapian::WritableDatabase &db) 74 74 internal->db = db; 75 75 } 76 76 77 /// Load the specified dictionary file for scws. 78 void 79 TermGenerator::load_libscws(const char *fpath, bool xmem, int multi) 80 { 81 #ifdef HAVE_SCWS 82 internal->load_libscws(fpath, xmem, multi); 83 #else 84 (void)fpath; 85 (void)xmem; 86 (void)multi; 87 #endif 88 } 89 77 90 TermGenerator::flags 78 91 TermGenerator::set_flags(flags toggle, flags mask) 79 92 { -
xapian-core/queryparser/termgenerator_internal.cc
diff --git a/xapian-core/queryparser/termgenerator_internal.cc b/xapian-core/queryparser/termgenerator_internal.cc index 93f04a1..633750f 100644
a b inline unsigned check_suffix(unsigned ch) { 125 125 #define STOPWORDS_IGNORE 1 126 126 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2 127 127 128 #ifdef HAVE_SCWS 129 TermGenerator::Internal::~Internal() 130 { 131 if (scws != NULL) { 132 scws_free(scws); 133 scws = NULL; 134 } 135 } 136 137 void 138 TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi) 139 { 140 if (scws == NULL) { 141 string temp; 142 143 scws = scws_new(); 144 scws_set_charset(scws, "utf8"); 145 scws_set_ignore(scws, SCWS_NA); 146 scws_set_duality(scws, SCWS_YEA); 147 148 temp = string(fpath ? fpath : SCWS_ETCDIR) + "/rules.utf8.ini"; 149 scws_set_rule(scws, temp.c_str()); 150 temp = string(fpath ? fpath : SCWS_ETCDIR) + "/dict.utf8.xdb"; 151 scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB); 152 } 153 if (multi >= 0 && multi < 0x10) 154 scws_set_multi(scws, (multi<<12)); 155 } 156 #endif /* HAVE_SCWS */ 157 128 158 void 129 159 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, 130 160 const string & prefix, bool with_positions) … … TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, 135 165 136 166 if (!stopper) stop_mode = STOPWORDS_NONE; 137 167 168 #ifdef HAVE_SCWS 169 int last_endpos = 0, last_off = 0; 170 scws_res_t res, cur; 171 Utf8Iterator iterm; 172 const char *text = itor.raw(); 173 174 if (!scws) load_libscws(NULL, false, 0); 175 scws_send_text(scws, text, itor.left()); 176 while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) { 177 string term; 178 179 iterm.assign(text + cur->off, cur->len); 180 if (!Unicode::is_wordchar(*iterm)) { 181 cur = cur->next; 182 continue; 183 } 184 term = Unicode::tolower(string(text + cur->off, cur->len)); 185 if (with_positions) { 186 // for part word(short, duality) 187 if ((cur->off + cur->len) <= last_endpos) 188 --termpos; 189 else { 190 // for dualities' first single word 191 if (cur->off == last_off) 192 --termpos; 193 last_endpos = 
cur->off + cur->len; 194 } 195 } 196 last_off = cur->off; 197 cur = cur->next; 198 } 199 #else /* HAVE_SCWS */ 138 200 while (true) { 139 201 // Advance to the start of the next term. 140 202 unsigned ch; … … TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, 254 316 } 255 317 256 318 endofterm: 319 #endif /* HAVE_SCWS */ 257 320 if (term.size() > MAX_PROB_TERM_LENGTH) continue; 258 321 259 322 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue; … … endofterm: 263 326 } else { 264 327 doc.add_term(prefix + term, wdf_inc); 265 328 } 329 #ifdef HAVE_SCWS 330 // CJK term need not spelling & stemmer 331 if (CJK::codepoint_is_cjk(*iterm)) continue; 332 #endif 266 333 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term); 267 334 268 335 if (!stemmer.internal.get()) continue; … … endofterm: 280 347 stem += stemmer(term); 281 348 doc.add_term(stem, wdf_inc); 282 349 } 350 #ifdef HAVE_SCWS 351 scws_free_result(res); } 352 #endif 283 353 } 284 354 285 355 } -
xapian-core/queryparser/termgenerator_internal.h
diff --git a/xapian-core/queryparser/termgenerator_internal.h b/xapian-core/queryparser/termgenerator_internal.h index f074fd9..1381c54 100644
a b 27 27 #include <xapian/termgenerator.h> 28 28 #include <xapian/stem.h> 29 29 30 #ifdef HAVE_SCWS 31 #include <scws/scws.h> 32 #endif 33 30 34 namespace Xapian { 31 35 32 36 class Stopper; … … class TermGenerator::Internal : public Xapian::Internal::intrusive_base { 37 41 const Stopper * stopper; 38 42 Document doc; 39 43 termcount termpos; 44 #ifdef HAVE_SCWS 45 scws_t scws; 46 #endif 40 47 TermGenerator::flags flags; 41 48 WritableDatabase db; 42 49 43 50 public: 44 51 Internal() : stopper(NULL), termpos(0), 52 #ifdef HAVE_SCWS 53 scws(NULL), 54 #endif 45 55 flags(TermGenerator::flags(0)) { } 56 57 #ifdef HAVE_SCWS 58 ~Internal(); 59 60 void load_libscws(const char *fpath, bool xmem, int multi); 61 #endif 62 46 63 void index_text(Utf8Iterator itor, 47 64 termcount weight, 48 65 const std::string & prefix,