Ticket #594: xapian-scws-1.3.x-snap.patch

File xapian-scws-1.3.x-snap.patch, 18.1 KB (added by Olly Betts, 11 years ago)

newer patch from original author

Line 
1*** xapian-core-1.3.0_svn/configure.ac 2012-03-28 18:11:06.000000000 +0800
2--- xapian-core-1.3.0_scws/configure.ac 2012-03-30 12:31:10.000000000 +0800
3***************
4*** 997,1002 ****
5--- 997,1058 ----
6 [Define if you want a log of methods called and other debug messages])
7 fi
8
9+ dnl **********************
10+ dnl * Check scws library *
11+ dnl **********************
12+ dnl hightman.20110411: Optional --with-scws[=DIR] support; when enabled, scws becomes the default tokenizer.
13+ SCWS_DIR=""
14+ AC_MSG_CHECKING(for scws)
15+ AC_ARG_WITH(scws,
16+ [AS_HELP_STRING([--with-scws@<:@=DIR@:>@], [use scws as default tokenizer, DIR is the install PREFIX of scws])],
17+ [ ],[ with_scws=no ]
18+ )
19+
20+ if test "$with_scws" = "no"; then
21+ AC_MSG_RESULT(no)
22+ else
23+ # Locate scws.h: probe the default prefixes for a bare --with-scws, otherwise use the given DIR.
24+ if test "$with_scws" = "yes"; then
25+ searchdirs="/usr /usr/local /usr/local/scws /opt/local"
26+ for tmpdir in $searchdirs ; do
27+ if test -f "$tmpdir/include/scws/scws.h" ; then
28+ SCWS_DIR=$tmpdir
29+ break
30+ fi
31+ done
32+ if test "$SCWS_DIR" = ""; then
33+ AC_MSG_RESULT(no)
34+ AC_MSG_ERROR([scws not found on default search directories, specify DIR plz...])
35+ fi
36+ elif test -f "$with_scws/include/scws/scws.h" ; then
37+ SCWS_DIR=$with_scws
38+ else
39+ AC_MSG_RESULT(no)
40+ AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $with_scws/include/scws])
41+ fi
42+ AC_MSG_RESULT([yes: $SCWS_DIR])
43+
44+ # Resource (etc) directory: /etc for a /usr install, otherwise etc/ below the prefix.
45+ if test "$SCWS_DIR" = "/usr"; then
46+ SCWS_ETCDIR="/etc"
47+ else
48+ SCWS_ETCDIR="$SCWS_DIR/etc"
49+ fi
50+
51+ # Check that libscws provides scws_new(), then export the link/include flags and config macros.
52+ AC_CHECK_LIB(scws, scws_new, [
53+ LIBS="$LIBS -L$SCWS_DIR/lib -lscws"
54+ XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws"
55+ CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include"
56+ AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer])
57+ AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules])
58+ ],[
59+ AC_MSG_ERROR([scws_new() NOT found in libscws, please check it first.])
60+ ],[
61+ -L$SCWS_DIR/lib
62+ ])
63+ fi
64+
65 dnl ******************************
66 dnl * Set special compiler flags *
67 dnl ******************************
68*** xapian-core-1.3.0_svn/include/xapian/queryparser.h 2012-03-24 20:31:02.000000000 +0800
69--- xapian-core-1.3.0_scws/include/xapian/queryparser.h 2012-03-30 12:31:10.000000000 +0800
70***************
71*** 499,504 ****
72--- 499,513 ----
73 */
74 void set_max_wildcard_expansion(Xapian::termcount limit);
75
76+ #if 1 /* HAVE_SCWS */
77+ /** hightman.20070706: Specify where scws loads its dictionary and rules files from; has no effect unless built with HAVE_SCWS.
78+ * @param fpath directory containing the dictionary and rules files (NULL selects the configured SCWS_ETCDIR)
79+ * @param xmem whether to load the whole dictionary into memory (defaults to false)
80+ * @param multi value for scws_set_multi() (int 0~15; values outside that range are ignored)
81+ */
82+ void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
83+ #endif
84+
85 /** Parse a query.
86 *
87 * @param query_string A free-text query as entered by a user
88*** xapian-core-1.3.0_svn/include/xapian/termgenerator.h 2011-11-07 11:11:05.000000000 +0800
89--- xapian-core-1.3.0_scws/include/xapian/termgenerator.h 2012-03-30 12:31:10.000000000 +0800
90***************
91*** 82,87 ****
92--- 82,92 ----
93 /// Set the database to index spelling data to.
94 void set_database(const Xapian::WritableDatabase &db);
95
96+ #if 1 /* HAVE_SCWS */
97+ /// hightman.20070706: Specify the directory scws loads its dictionary and rules files from; has no effect unless built with HAVE_SCWS.
98+ void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
99+ #endif
100+
101 /// Flags to OR together and pass to TermGenerator::set_flags().
102 enum flags {
103 /// Index data required for spelling correction.
104*** xapian-core-1.3.0_svn/queryparser/queryparser_internal.h 2012-03-24 20:31:02.000000000 +0800
105--- xapian-core-1.3.0_scws/queryparser/queryparser_internal.h 2012-03-30 12:38:27.000000000 +0800
106***************
107*** 29,34 ****
108--- 29,39 ----
109 #include <xapian/queryparser.h>
110 #include <xapian/stem.h>
111
112+ /// hightman.20070701: use scws as default tokenizer
113+ #ifdef HAVE_SCWS
114+ #include <scws/scws.h>
115+ #endif
116+
117 #include <list>
118 #include <map>
119
120***************
121*** 63,68 ****
122--- 68,79 ----
123 Stem stemmer;
124 stem_strategy stem_action;
125 const Stopper * stopper;
126+ #ifdef HAVE_SCWS
127+ scws_t scws;
128+ scws_res_t rptr, rcur;
129+ const char *qptr;
130+ int last_off;
131+ #endif
132 Query::op default_op;
133 const char * errmsg;
134 Database db;
135***************
136*** 88,94 ****
137--- 99,112 ----
138
139 public:
140 Internal() : stem_action(STEM_SOME), stopper(NULL),
141+ #ifdef HAVE_SCWS
142+ scws(NULL), rptr(NULL), rcur(NULL),
143+ #endif
144 default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { }
145+ #ifdef HAVE_SCWS
146+ ~Internal();
147+ void load_libscws(const char *fpath, bool xmem, int multi);
148+ #endif
149
150 Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix);
151 };
152*** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.h 2011-07-03 20:31:02.000000000 +0800
153--- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.h 2012-03-30 12:33:15.000000000 +0800
154***************
155*** 26,31 ****
156--- 26,35 ----
157 #include <xapian/document.h>
158 #include <xapian/termgenerator.h>
159 #include <xapian/stem.h>
160+ /// hightman.20070701: use scws as default tokenizer
161+ #ifdef HAVE_SCWS
162+ #include <scws/scws.h>
163+ #endif
164
165 namespace Xapian {
166
167***************
168*** 37,48 ****
169--- 41,62 ----
170 const Stopper * stopper;
171 Document doc;
172 termcount termpos;
173+ #ifdef HAVE_SCWS
174+ scws_t scws;
175+ #endif
176 TermGenerator::flags flags;
177 WritableDatabase db;
178
179 public:
180 Internal() : stopper(NULL), termpos(0),
181+ #ifdef HAVE_SCWS
182+ scws(NULL),
183+ #endif
184 flags(TermGenerator::flags(0)) { }
185+ #ifdef HAVE_SCWS
186+ ~Internal();
187+ void load_libscws(const char *fpath, bool xmem, int multi);
188+ #endif
189 void index_text(Utf8Iterator itor,
190 termcount weight,
191 const std::string & prefix,
192*** xapian-core-1.3.0_svn/queryparser/queryparser.cc 2011-12-26 20:31:02.000000000 +0800
193--- xapian-core-1.3.0_scws/queryparser/queryparser.cc 2012-03-30 12:33:15.000000000 +0800
194***************
195*** 136,141 ****
196--- 136,152 ----
197 internal->max_wildcard_expansion = max;
198 }
199
200+ #if 1 /* HAVE_SCWS */
201+ /// hightman.20070701: Forward to Internal::load_libscws(); the body is empty (a no-op) when built without HAVE_SCWS.
202+ void
203+ QueryParser::load_libscws(const char *fpath, bool xmem, int multi)
204+ {
205+ #ifdef HAVE_SCWS
206+ internal->load_libscws(fpath, xmem, multi);
207+ #endif
208+ }
209+ #endif
210+
211 Query
212 QueryParser::parse_query(const string &query_string, unsigned flags,
213 const string &default_prefix)
214*** xapian-core-1.3.0_svn/queryparser/queryparser.lemony 2012-01-26 20:51:02.000000000 +0800
215--- xapian-core-1.3.0_scws/queryparser/queryparser.lemony 2012-03-30 13:46:27.000000000 +0800
216***************
217*** 166,171 ****
218--- 166,174 ----
219 string unstemmed;
220 QueryParser::stem_strategy stem;
221 termpos pos;
222+ #ifdef HAVE_SCWS
223+ vector<string> multi;
224+ #endif
225
226 Term(const string &name_, termpos pos_) : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
227 Term(const string &name_) : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
228***************
229*** 501,513 ****
230 vector<Query> prefix_cjk;
231 const list<string> & prefixes = prefix_info->prefixes;
232 list<string>::const_iterator piter;
233! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
234 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
235 string cjk = *piter;
236 cjk += *tk;
237 prefix_cjk.push_back(Query(cjk, 1, pos));
238 }
239 }
240 Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
241 delete this;
242 return q;
243--- 504,546 ----
244 vector<Query> prefix_cjk;
245 const list<string> & prefixes = prefix_info->prefixes;
246 list<string>::const_iterator piter;
247! /* hightman.20111223: used CJKTERM for multi segment */
248! #ifdef HAVE_SCWS
249! for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
250! Query org = Query(*piter + name, 1, pos);
251! termpos mpos = pos + 88;
252!
253! /* hightman.20120104: get synonyms */
254! if (state->flags & QueryParser::FLAG_AUTO_SYNONYMS) {
255! Xapian::Database db = state->get_database();
256! Xapian::TermIterator syn = db.synonyms_begin(name);
257! Xapian::TermIterator end = db.synonyms_end(name);
258! while (syn != end) {
259! org = Query(Query::OP_SYNONYM, org, Query(*piter + *syn, 1, mpos++));
260! ++syn;
261! }
262! }
263! if (!multi.empty()) {
264! vector<string>::const_iterator mi;
265! vector<Query> multi_cjk;
266! for (mi = multi.begin(); mi != multi.end(); ++mi) {
267! // hightman: force these to sort after the original term in get_terms()
268! multi_cjk.push_back(Query(*piter + *mi, 1, mpos++));
269! }
270! Query syn = Query(state->default_op(), multi_cjk.begin(), multi_cjk.end());
271! org = Query(Query::OP_SYNONYM, org, syn);
272! }
273! prefix_cjk.push_back(org);
274! }
275! #else
276! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
277 for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
278 string cjk = *piter;
279 cjk += *tk;
280 prefix_cjk.push_back(Query(cjk, 1, pos));
281 }
282 }
283+ #endif /* HAVE_SCWS */
284 Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
285 delete this;
286 return q;
287***************
288*** 618,629 ****
289--- 651,728 ----
290 }
291 }
292
293+ /// hightman.20110701: libscws support; this destructor releases whatever scws resources are still held
294+ #ifdef HAVE_SCWS
295+ QueryParser::Internal::~Internal()
296+ {
297+ if (rptr != NULL) { // a segmentation result list is still held
298+ scws_free_result(rptr);
299+ rptr = NULL;
300+ }
301+ if (scws != NULL) { // a scws handle was created by load_libscws()
302+ scws_free(scws);
303+ scws = NULL;
304+ }
305+ }
306+ /// Create and configure the scws handle on first use (rules, dictionary and user dictionary are read from fpath, or SCWS_ETCDIR when fpath is NULL; xmem selects the in-memory dictionary mode), then apply multi.
307+ void
308+ QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi)
309+ {
310+ if (scws == NULL) {
311+ // Resolve the resource directory once instead of once per file.
312+ const string dir(fpath ? fpath : SCWS_ETCDIR);
313+ scws = scws_new();
314+ scws_set_charset(scws, "utf8");
315+ scws_set_ignore(scws, SCWS_NA);
316+ scws_set_duality(scws, SCWS_YEA);
317+ // Pass c_str(), not data(): before C++11 only c_str() guarantees NUL termination.
318+ string temp = dir + "/rules.utf8.ini";
319+ scws_set_rule(scws, temp.c_str());
320+ temp = dir + "/dict.utf8.xdb";
321+ scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
322+ /* hightman.20111209: custom dict support */
323+ temp = dir + "/dict_user.txt";
324+ scws_add_dict(scws, temp.c_str(), SCWS_XDICT_TXT);
325+ }
326+ if (multi >= 0 && multi < 0x10) // out-of-range values leave the current setting untouched
327+ scws_set_multi(scws, (multi<<12));
328+ }
329+ #endif /* HAVE_SCWS */
330+
331 string
332 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
333 bool cjk_ngram, bool & is_cjk_term,
334 bool &was_acronym)
335 {
336 string term;
337+ #ifdef HAVE_SCWS
338+ int off = it.raw() - qptr;
339+ while (rcur && (off > rcur->off)) {
340+ rcur = rcur->next;
341+ }
342+ was_acronym = false;
343+ if (rcur == NULL) {
344+ it = end;
345+ term.resize(0);
346+ } else {
347+ // sometimes, auto_duality + word-end single word char will be repeated
348+ // 说明几句 => 说明/几/几句
349+ if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len)
350+ rcur = rcur->next;
351+
352+ term.append(qptr + rcur->off, rcur->len);
353+ was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false;
354+ is_cjk_term = CJK::codepoint_is_cjk(*it);
355+ last_off = off = rcur->off + rcur->len;
356+ rcur = rcur->next;
357+
358+ // sometimes, auto duality or multisegment
359+ // 几句说搞笑 => 几句/句说/搞笑
360+ if (rcur && off > rcur->off && (rcur->off + rcur->len) > off)
361+ off = rcur->off;
362+ while ((it.raw() - qptr) < off) it++;
363+ }
364+ #else /* HAVE_SCWS */
365 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
366 // Don't worry if there's a trailing '.' or not.
367 if (U_isupper(*it)) {
368***************
369*** 708,713 ****
370--- 807,813 ----
371 }
372 }
373 }
374+ #endif /* HAVE_SCWS */
375 return term;
376 }
377
378***************
379*** 759,764 ****
380--- 859,890 ----
381
382 ParserHandler pParser(ParseAlloc());
383
384+ #ifdef HAVE_SCWS
385+ /// Pre segmentation use scws
386+ scws_res_t res;
387+
388+ if (!scws) {
389+ load_libscws(NULL, false, 3);
390+ }
391+ if (rptr != NULL) {
392+ scws_free_result(rptr);
393+ rptr = NULL;
394+ }
395+ qptr = qs.data();
396+ scws_send_text(scws, qptr, qs.size());
397+ while ((res = scws_get_result(scws)) != NULL) {
398+ if (rptr == NULL) {
399+ rcur = rptr = res;
400+ } else {
401+ rcur->next = res;
402+ }
403+ while (rcur->next != NULL) {
404+ rcur = rcur->next;
405+ }
406+ }
407+ rcur = rptr;
408+ #endif /* HAVE_SCWS */
409+
410 unsigned newprev = ' ';
411 main_lex_loop:
412 enum {
413***************
414*** 1162,1167 ****
415--- 1288,1298 ----
416 if (!stemmer.internal.get()) {
417 // No stemmer is set.
418 stem_term = STEM_NONE;
419+ #ifdef HAVE_SCWS
420+ } else if (is_cjk_term) {
421+ // Don't stem CJK terms.
422+ stem_term = STEM_NONE;
423+ #endif
424 } else if (stem_term == STEM_SOME) {
425 if (!should_stem(unstemmed_term) ||
426 (it != end && is_stem_preventer(*it))) {
427***************
428*** 1175,1180 ****
429--- 1306,1322 ----
430 unstemmed_term, stem_term, term_pos++);
431
432 if (is_cjk_term) {
433+ #ifdef HAVE_SCWS
434+ /* multi scws handler */
435+ term_obj->multi.clear();
436+ while (rcur && (rcur->off + rcur->len) <= last_off) {
437+ if (rcur->len > 3)
438+ term_obj->multi.push_back(string(qptr + rcur->off, rcur->len));
439+ rcur = rcur->next;
440+ }
441+ if (mode == IN_GROUP || mode == IN_GROUP2)
442+ mode = DEFAULT;
443+ #endif
444 Parse(pParser, CJKTERM, term_obj, &state);
445 if (it == end) break;
446 continue;
447***************
448*** 1305,1310 ****
449--- 1447,1459 ----
450 }
451 }
452 done:
453+ #ifdef HAVE_SCWS
454+ /// Free all segmented terms/words
455+ if (rptr != NULL) {
456+ scws_free_result(rptr);
457+ rptr = NULL;
458+ }
459+ #endif
460 if (!state.error) {
461 // Implicitly close any unclosed quotes...
462 if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
463***************
464*** 1656,1661 ****
465--- 1805,1815 ----
466 void
467 Term::as_positional_cjk_term(Terms * terms) const
468 {
469+ #ifdef HAVE_SCWS
470+ // Add SCWS term only
471+ Term * c = new Term(state, name, prefix_info, unstemmed, stem, pos);
472+ terms->add_positional_term(c);
473+ #else
474 // Add each individual CJK character to the phrase.
475 string t;
476 for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
477***************
478*** 1664,1669 ****
479--- 1818,1824 ----
480 terms->add_positional_term(c);
481 t.resize(0);
482 }
483+ #endif /* HAVE_SCWS */
484
485 // FIXME: we want to add the n-grams as filters too for efficiency.
486
487*** xapian-core-1.3.0_svn/queryparser/termgenerator.cc 2011-07-03 20:31:02.000000000 +0800
488--- xapian-core-1.3.0_scws/queryparser/termgenerator.cc 2012-03-30 12:33:15.000000000 +0800
489***************
490*** 74,79 ****
491--- 74,90 ----
492 internal->db = db;
493 }
494
495+ #if 1 /* HAVE_SCWS */
496+ /// hightman.20070701: Forward to Internal::load_libscws(); the body is empty (a no-op) when built without HAVE_SCWS.
497+ void
498+ TermGenerator::load_libscws(const char *fpath, bool xmem, int multi)
499+ {
500+ #ifdef HAVE_SCWS
501+ internal->load_libscws(fpath, xmem, multi);
502+ #endif
503+ }
504+ #endif
505+
506 TermGenerator::flags
507 TermGenerator::set_flags(flags toggle, flags mask)
508 {
509*** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.cc 2011-08-24 20:51:02.000000000 +0800
510--- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.cc 2012-03-30 13:51:10.000000000 +0800
511***************
512*** 125,130 ****
513--- 125,164 ----
514 #define STOPWORDS_IGNORE 1
515 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
516
517+ /// hightman.20070701: libscws support; this destructor releases the scws handle if one was created
518+ #ifdef HAVE_SCWS
519+ TermGenerator::Internal::~Internal()
520+ {
521+ if (scws != NULL) { // a scws handle was created by load_libscws()
522+ scws_free(scws);
523+ scws = NULL;
524+ }
525+ }
526+ /// Create and configure the scws handle on first use (rules, dictionary and user dictionary are read from fpath, or SCWS_ETCDIR when fpath is NULL; xmem selects the in-memory dictionary mode), then apply multi.
527+ void
528+ TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi)
529+ {
530+ if (scws == NULL) {
531+ // Resolve the resource directory once instead of once per file.
532+ const string dir(fpath ? fpath : SCWS_ETCDIR);
533+ scws = scws_new();
534+ scws_set_charset(scws, "utf8");
535+ scws_set_ignore(scws, SCWS_NA);
536+ scws_set_duality(scws, SCWS_YEA);
537+ // Pass c_str(), not data(): before C++11 only c_str() guarantees NUL termination.
538+ string temp = dir + "/rules.utf8.ini";
539+ scws_set_rule(scws, temp.c_str());
540+ temp = dir + "/dict.utf8.xdb";
541+ scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
542+ /* hightman.20111209: custom dict support */
543+ temp = dir + "/dict_user.txt";
544+ scws_add_dict(scws, temp.c_str(), SCWS_XDICT_TXT);
545+ }
546+ if (multi >= 0 && multi < 0x10) // out-of-range values leave the current setting untouched
547+ scws_set_multi(scws, (multi<<12));
548+ }
549+ #endif
550+
551 void
552 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
553 const string & prefix, bool with_positions)
554***************
555*** 135,140 ****
556--- 169,205 ----
557
558 if (!stopper) stop_mode = STOPWORDS_NONE;
559
560+ #ifdef HAVE_SCWS
561+ int last_endpos = 0, last_off = 0;
562+ scws_res_t res, cur;
563+ Utf8Iterator iterm;
564+ const char *text = itor.raw();
565+
566+ if (!scws) load_libscws(NULL, false, 3);
567+ scws_send_text(scws, text, itor.left());
568+ while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) {
569+ string term;
570+
571+ iterm.assign(text + cur->off, cur->len);
572+ if (!Unicode::is_wordchar(*iterm)) {
573+ cur = cur->next;
574+ continue;
575+ }
576+ term = Unicode::tolower(string(text + cur->off, cur->len));
577+ if (with_positions) {
578+ /// for part word(short, duality)
579+ if ((cur->off + cur->len) <= last_endpos)
580+ --termpos;
581+ else {
582+ /// for dualities' first single word
583+ if (cur->off == last_off)
584+ --termpos;
585+ last_endpos = cur->off + cur->len;
586+ }
587+ }
588+ last_off = cur->off;
589+ cur = cur->next;
590+ #else
591 while (true) {
592 // Advance to the start of the next term.
593 unsigned ch;
594***************
595*** 254,259 ****
596--- 319,325 ----
597 }
598
599 endofterm:
600+ #endif /* HAVE_SCWS */
601 if (term.size() > MAX_PROB_TERM_LENGTH) continue;
602
603 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
604***************
605*** 263,268 ****
606--- 329,338 ----
607 } else {
608 doc.add_term(prefix + term, wdf_inc);
609 }
610+ #ifdef HAVE_SCWS
611+ /// hightman: Term start with CJK character needn't spell & stem
612+ if (CJK::codepoint_is_cjk(*iterm)) continue;
613+ #endif
614 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
615
616 if (!stemmer.internal.get()) continue;
617***************
618*** 280,285 ****
619--- 350,358 ----
620 stem += stemmer(term);
621 doc.add_term(stem, wdf_inc);
622 }
623+ #ifdef HAVE_SCWS
624+ scws_free_result(res); }
625+ #endif
626 }
627
628 }