Ticket #594: xapian-scws-1.3.x-trunk.patch.txt

File xapian-scws-1.3.x-trunk.patch.txt, 13.4 KB (added by Olly Betts, 12 years ago)

Updated patch from second email thread

Line 
1Index: queryparser/termgenerator_internal.h
2===================================================================
3--- queryparser/termgenerator_internal.h (revision 16065)
4+++ queryparser/termgenerator_internal.h (working copy)
5@@ -27,6 +27,11 @@
6 #include <xapian/termgenerator.h>
7 #include <xapian/stem.h>
8
9+/// hightman.20110701: use scws as default tokenizer
10+#ifdef HAVE_SCWS
11+#include <scws/scws.h>
12+#endif
13+
14 namespace Xapian {
15
16 class Stopper;
17@@ -37,12 +42,22 @@
18 const Stopper * stopper;
19 Document doc;
20 termcount termpos;
21+#ifdef HAVE_SCWS
22+ scws_t scws;
23+#endif
24 TermGenerator::flags flags;
25 WritableDatabase db;
26
27 public:
28 Internal() : stopper(NULL), termpos(0),
29+#ifdef HAVE_SCWS
30+ scws(NULL),
31+#endif
32 flags(TermGenerator::flags(0)) { }
33+#ifdef HAVE_SCWS
34+ ~Internal();
35+ void load_libscws(const char *fpath, bool xmem, int multi);
36+#endif
37 void index_text(Utf8Iterator itor,
38 termcount weight,
39 const std::string & prefix,
40Index: queryparser/termgenerator.cc
41===================================================================
42--- queryparser/termgenerator.cc (revision 16065)
43+++ queryparser/termgenerator.cc (working copy)
44@@ -74,6 +74,17 @@
45 internal->db = db;
46 }
47
48+#if 1 /* HAVE_SCWS */
49+/// hightman.20110701: load the specified dict file for scws
50+void
51+TermGenerator::load_libscws(const char *fpath, bool xmem, int multi)
52+{
53+#ifdef HAVE_SCWS
54+ internal->load_libscws(fpath, xmem, multi);
55+#endif
56+}
57+#endif
58+
59 TermGenerator::flags
60 TermGenerator::set_flags(flags toggle, flags mask)
61 {
62Index: queryparser/queryparser.lemony
63===================================================================
64--- queryparser/queryparser.lemony (revision 16065)
65+++ queryparser/queryparser.lemony (working copy)
66@@ -563,12 +563,75 @@
67 }
68 }
69
70+/// hightman.20110701: load libscws
71+#ifdef HAVE_SCWS
72+QueryParser::Internal::~Internal()
73+{
74+ if (rptr != NULL) {
75+ scws_free_result(rptr);
76+ rptr = NULL;
77+ }
78+ if (scws != NULL) {
79+ scws_free(scws);
80+ scws = NULL;
81+ }
82+}
83+
84+void
85+QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi)
86+{
87+ if (scws == NULL) {
88+ string temp;
89+
90+ scws = scws_new();
91+ scws_set_charset(scws, "utf8");
92+ scws_set_ignore(scws, SCWS_NA);
93+ scws_set_duality(scws, SCWS_YEA);
94+
95+ temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini");
96+ scws_set_rule(scws, temp.data());
97+ temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb");
98+ scws_set_dict(scws, temp.data(), xmem == true ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
99+ }
100+ if (multi >= 0 && multi < 0x10)
101+ scws_set_multi(scws, (multi<<12));
102+}
103+#endif /* HAVE_SCWS */
104+
105 string
106 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
107 bool cjk_ngram, bool & is_cjk_term,
108 bool &was_acronym)
109 {
110 string term;
111+#ifdef HAVE_SCWS
112+ int off = it.raw() - qptr;
113+ while (rcur && (off > rcur->off)) {
114+ rcur = rcur->next;
115+ }
116+ was_acronym = false;
117+ if (rcur == NULL) {
118+ it = end;
119+ term.resize(0);
120+ } else {
121+ // sometimes, auto_duality + word-end single word char will be repeated
122+ // 说明几句 => 说明/几/几句
123+ if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len)
124+ rcur = rcur->next;
125+
126+ term.append(qptr + rcur->off, rcur->len);
127+ was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false;
128+ is_cjk_term = CJK::codepoint_is_cjk(*it);
129+ off = rcur->off + rcur->len;
130+ rcur = rcur->next;
131+
132+ // sometimes, auto duality or multisegment
133+ // 几句说搞笑 => 几句/句说/搞笑
134+ if (rcur && off > rcur->off && (rcur->off + rcur->len) > off)
135+ off = rcur->off;
136+ it = Utf8Iterator(qptr + off);
137+ }
138+#else /* HAVE_SCWS */
139 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
140 // Don't worry if there's a trailing '.' or not.
141 if (U_isupper(*it)) {
142@@ -653,6 +716,7 @@
143 }
144 }
145 }
146+#endif /* HAVE_SCWS */
147 return term;
148 }
149
150@@ -704,6 +768,32 @@
151
152 ParserHandler pParser(ParseAlloc());
153
154+#ifdef HAVE_SCWS
155+ /// Pre-segment the query string using scws
156+ scws_res_t res;
157+
158+ if (!scws) {
159+ load_libscws(NULL, false, 0);
160+ }
161+ if (rptr != NULL) {
162+ scws_free_result(rptr);
163+ rptr = NULL;
164+ }
165+ qptr = qs.data();
166+ scws_send_text(scws, qptr, qs.size());
167+ while ((res = scws_get_result(scws)) != NULL) {
168+ if (rptr == NULL) {
169+ rcur = rptr = res;
170+ } else {
171+ rcur->next = res;
172+ }
173+ while (rcur->next != NULL) {
174+ rcur = rcur->next;
175+ }
176+ }
177+ rcur = rptr;
178+#endif /* HAVE_SCWS */
179+
180 unsigned newprev = ' ';
181 main_lex_loop:
182 enum {
183@@ -1101,6 +1191,12 @@
184 if (!stemmer.internal.get()) {
185 // No stemmer is set.
186 stem_term = STEM_NONE;
187+#ifdef HAVE_SCWS
188+ else if (is_cjk_term) {
189+ // Don't stem CJK terms.
190+ stem_term = STEM_NONE;
191+ }
192+#endif
193 } else if (stem_term == STEM_SOME) {
194 if (!should_stem(unstemmed_term) ||
195 (it != end && is_stem_preventer(*it))) {
196@@ -1244,6 +1340,14 @@
197 }
198 }
199 done:
200+#ifdef HAVE_SCWS
201+ /// Free all segmented terms/words
202+ if (rptr != NULL) {
203+ scws_free_result(rptr);
204+ rptr = NULL;
205+ }
206+#endif
207+
208 if (!state.error) {
209 // Implicitly close any unclosed quotes...
210 if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
211Index: queryparser/queryparser_internal.h
212===================================================================
213--- queryparser/queryparser_internal.h (revision 16065)
214+++ queryparser/queryparser_internal.h (working copy)
215@@ -29,6 +29,11 @@
216 #include <xapian/queryparser.h>
217 #include <xapian/stem.h>
218
219+/// hightman.20110701: use scws as default tokenizer
220+#ifdef HAVE_SCWS
221+#include <scws/scws.h>
222+#endif
223+
224 #include <list>
225 #include <map>
226
227@@ -63,6 +68,11 @@
228 Stem stemmer;
229 stem_strategy stem_action;
230 const Stopper * stopper;
231+#ifdef HAVE_SCWS
232+ scws_t scws;
233+ scws_res_t rptr, rcur;
234+ const char *qptr;
235+#endif
236 Query::op default_op;
237 const char * errmsg;
238 Database db;
239@@ -88,7 +98,14 @@
240
241 public:
242 Internal() : stem_action(STEM_NONE), stopper(NULL),
243+#ifdef HAVE_SCWS
244+ scws(NULL), rptr(NULL), rcur(NULL),
245+#endif
246 default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { }
247+#ifdef HAVE_SCWS
248+ ~Internal();
249+ void load_libscws(const char *fpath, bool xmem, int multi);
250+#endif
251 Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix);
252 };
253
254Index: queryparser/queryparser.cc
255===================================================================
256--- queryparser/queryparser.cc (revision 16065)
257+++ queryparser/queryparser.cc (working copy)
258@@ -112,6 +112,17 @@
259 internal->max_wildcard_expansion = max;
260 }
261
262+#if 1 /* HAVE_SCWS */
263+/// hightman.20110701: load the specified dict file for scws
264+void
265+QueryParser::load_libscws(const char *fpath, bool xmem, int multi)
266+{
267+#ifdef HAVE_SCWS
268+ internal->load_libscws(fpath, xmem, multi);
269+#endif
270+}
271+#endif
272+
273 Query
274 QueryParser::parse_query(const string &query_string, unsigned flags,
275 const string &default_prefix)
276Index: queryparser/termgenerator_internal.cc
277===================================================================
278--- queryparser/termgenerator_internal.cc (revision 16065)
279+++ queryparser/termgenerator_internal.cc (working copy)
280@@ -125,6 +125,36 @@
281 #define STOPWORDS_IGNORE 1
282 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
283
284+/// hightman.20070701: load libscws
285+#ifdef HAVE_SCWS
286+TermGenerator::Internal::~Internal()
287+{
288+ if (scws != NULL) {
289+ scws_free(scws);
290+ scws = NULL;
291+ }
292+}
293+void
294+TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi)
295+{
296+ if (scws == NULL) {
297+ string temp;
298+
299+ scws = scws_new();
300+ scws_set_charset(scws, "utf8");
301+ scws_set_ignore(scws, SCWS_NA);
302+ scws_set_duality(scws, SCWS_YEA);
303+
304+ temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini");
305+ scws_set_rule(scws, temp.data());
306+ temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb");
307+ scws_set_dict(scws, temp.data(), xmem == true ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
308+ }
309+ if (multi >= 0 && multi < 0x10)
310+ scws_set_multi(scws, (multi<<12));
311+}
312+#endif /* HAVE_SCWS */
313+
314 void
315 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
316 const string & prefix, bool with_positions)
317@@ -135,6 +165,37 @@
318
319 if (!stopper) stop_mode = STOPWORDS_NONE;
320
321+#ifdef HAVE_SCWS
322+ int last_endpos = 0, last_off = 0;
323+ scws_res_t res, cur;
324+ Utf8Iterator iterm;
325+ const char *text = itor.raw();
326+
327+ if (!scws) load_libscws(NULL, false, 0);
328+ scws_send_text(scws, text, itor.left());
329+ while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) {
330+ string term;
331+
332+ iterm.assign(text + cur->off, cur->len);
333+ if (!Unicode::is_wordchar(*iterm)) {
334+ cur = cur->next;
335+ continue;
336+ }
337+ term = Unicode::tolower(string(text + cur->off, cur->len));
338+ if (with_positions) {
339+ /// for part word(short, duality)
340+ if ((cur->off + cur->len) <= last_endpos)
341+ --termpos;
342+ else {
343+ /// for dualities' first single word
344+ if (cur->off == last_off)
345+ --termpos;
346+ last_endpos = cur->off + cur->len;
347+ }
348+ }
349+ last_off = cur->off;
350+ cur = cur->next;
351+#else /* HAVE_SCWS */
352 while (true) {
353 // Advance to the start of the next term.
354 unsigned ch;
355@@ -254,6 +315,7 @@
356 }
357
358 endofterm:
359+#endif /* HAVE_SCWS */
360 if (term.size() > MAX_PROB_TERM_LENGTH) continue;
361
362 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
363@@ -263,6 +325,10 @@
364 } else {
365 doc.add_term(prefix + term, wdf_inc);
366 }
367+#ifdef HAVE_SCWS
368+ /// CJK terms need neither spelling data nor stemming
369+ if (CJK::codepoint_is_cjk(*iterm)) continue;
370+#endif
371 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
372
373 if (!stemmer.internal.get()) continue;
374@@ -280,6 +346,9 @@
375 stem += stemmer(term);
376 doc.add_term(stem, wdf_inc);
377 }
378+#ifdef HAVE_SCWS
379+ scws_free_result(res); }
380+#endif
381 }
382
383 }
384Index: configure.ac
385===================================================================
386--- configure.ac (revision 16065)
387+++ configure.ac (working copy)
388@@ -1000,6 +1000,63 @@
389 [Define if you want a log of methods called and other debug messages])
390 fi
391
392+dnl **********************
393+dnl * Check scws library *
394+dnl **********************
395+dnl hightman.20110411: See if we want to use scws as the default tokenizer
396+SCWS_DIR=""
397+AC_MSG_CHECKING(for scws)
398+AC_ARG_WITH(scws,
399+ [AS_HELP_STRING([--with-scws@<:@=DIR@:>@],
400+ [use scws as the default tokenizer, DIR is the installation directory scws]
401+ )], [ ],[ with_scws=no ]
402+)
403+
404+if test "$with_scws" = "no"; then
405+ AC_MSG_RESULT(no)
406+else
407+ # Check header file
408+ if test "$with_scws" = "yes"; then
409+ searchdirs="/usr /usr/local /usr/local/scws /opt/local"
410+ for tmpdir in $searchdirs ; do
411+ if test -f $tmpdir/include/scws/scws.h ; then
412+ SCWS_DIR=$tmpdir
413+ break
414+ fi
415+ done
416+ if test "$SCWS_DIR" = ""; then
417+ AC_MSG_RESULT(no)
418+ AC_MSG_ERROR([scws not found in default directories, specify DIR plz...])
419+ fi
420+ elif test -f $withval/include/scws/scws.h ; then
421+ SCWS_DIR=$withval
422+ else
423+ AC_MSG_RESULT(no)
424+ AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws])
425+ fi
426+ AC_MSG_RESULT([yes: $SCWS_DIR])
427+
428+ # Etc directory
429+ if test "$SCWS_DIR" = "/usr"; then
430+ SCWS_ETCDIR="/etc"
431+ else
432+ SCWS_ETCDIR="$SCWS_DIR/etc"
433+ fi
434+
435+ # Check scws library
436+ AC_CHECK_LIB(scws, scws_new, [
437+ LIBS="$LIBS -L$SCWS_DIR/lib -lscws"
438+ XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws"
439+ CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include"
440+ AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer])
441+ AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules])
442+ ],[
443+ AC_MSG_ERROR([scws_new() NOT found in libscws, please re-install scws])
444+ ],[
445+ -L$SCWS_DIR/lib
446+ ])
447+fi
448+
449 dnl ******************************
450 dnl * Set special compiler flags *
451 dnl ******************************
452Index: include/xapian/termgenerator.h
453===================================================================
454--- include/xapian/termgenerator.h (revision 16065)
455+++ include/xapian/termgenerator.h (working copy)
456@@ -78,6 +78,11 @@
457
458 /// Set the database to index spelling data to.
459 void set_database(const Xapian::WritableDatabase &db);
460+
461+#if 1 /* HAVE_SCWS */
462+ /// hightman.20110706: Specify the dict and rules file for scws, only used when HAVE_SCWS.
463+ void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
464+#endif
465
466 /// Flags to OR together and pass to TermGenerator::set_flags().
467 enum flags {
468Index: include/xapian/queryparser.h
469===================================================================
470--- include/xapian/queryparser.h (revision 16065)
471+++ include/xapian/queryparser.h (working copy)
472@@ -455,6 +455,15 @@
473 * can expand to, or 0 for no limit (which is the default).
474 */
475 void set_max_wildcard_expansion(Xapian::termcount limit);
476+
477+#if 1 /* HAVE_SCWS */
478+ /** hightman.20110706: Specify the dict and rules file for scws, only used when HAVE_SCWS.
479+ * @param fpath Path of dictionary file and rule files (char *)
480+ * @param xmem Whether to load the whole dict file into memory (default to false)
481+ * @param multi multiset (int 0~15)
482+ */
483+ void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
484+#endif
485
486 /** Parse a query.
487 *