1 | *** xapian-core-1.3.0_svn/configure.ac 2012-03-28 18:11:06.000000000 +0800
|
---|
2 | --- xapian-core-1.3.0_scws/configure.ac 2012-03-30 12:31:10.000000000 +0800
|
---|
3 | ***************
|
---|
4 | *** 997,1002 ****
|
---|
5 | --- 997,1058 ----
|
---|
6 | [Define if you want a log of methods called and other debug messages])
|
---|
7 | fi
|
---|
8 |
|
---|
9 | + dnl **********************
|
---|
10 | + dnl * Check scws library *
|
---|
11 | + dnl **********************
|
---|
12 | + dnl hightman.20110411: See if we want to use scws as default tokenizer
|
---|
13 | + SCWS_DIR=""
|
---|
14 | + AC_MSG_CHECKING(for scws)
|
---|
15 | + AC_ARG_WITH(scws,
|
---|
16 | + [AS_HELP_STRING([--with-scws@<:@=DIR@:>@], [use scws as default tokenizer, DIR is the install PREFIX of scws])],
|
---|
17 | + [ ],[ with_scws=no ]
|
---|
18 | + )
|
---|
19 | +
|
---|
20 | + if test "$with_scws" = "no"; then
|
---|
21 | + AC_MSG_RESULT(no)
|
---|
22 | + else
|
---|
23 | + # Locate scws/scws.h: a bare --with-scws probes the default prefixes; --with-scws=DIR checks only DIR
|
---|
24 | + if test "$with_scws" = "yes"; then
|
---|
25 | + searchdirs="/usr /usr/local /usr/local/scws /opt/local"
|
---|
26 | + for tmpdir in $searchdirs ; do
|
---|
27 | + if test -f "$tmpdir/include/scws/scws.h" ; then
|
---|
28 | + SCWS_DIR=$tmpdir
|
---|
29 | + break
|
---|
30 | + fi
|
---|
31 | + done
|
---|
32 | + if test "$SCWS_DIR" = ""; then
|
---|
33 | + AC_MSG_RESULT(no)
|
---|
34 | + AC_MSG_ERROR([scws not found on default search directories, specify DIR plz...])
|
---|
35 | + fi
|
---|
36 | + elif test -f "$withval/include/scws/scws.h" ; then
|
---|
37 | + SCWS_DIR=$withval
|
---|
38 | + else
|
---|
39 | + AC_MSG_RESULT(no)
|
---|
40 | + AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws])
|
---|
41 | + fi
|
---|
42 | + AC_MSG_RESULT([yes: $SCWS_DIR])
|
---|
43 | +
|
---|
44 | + # Etc directory
|
---|
45 | + if test "$SCWS_DIR" = "/usr"; then
|
---|
46 | + SCWS_ETCDIR="/etc"
|
---|
47 | + else
|
---|
48 | + SCWS_ETCDIR="$SCWS_DIR/etc"
|
---|
49 | + fi
|
---|
50 | +
|
---|
51 | + # Check scws library
|
---|
52 | + AC_CHECK_LIB(scws, scws_new, [
|
---|
53 | + LIBS="$LIBS -L$SCWS_DIR/lib -lscws"
|
---|
54 | + XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws"
|
---|
55 | + CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include"
|
---|
56 | + AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer])
|
---|
57 | + AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules])
|
---|
58 | + ],[
|
---|
59 | + AC_MSG_ERROR([scws_new() NOT found in libscws, please check it first.])
|
---|
60 | + ],[
|
---|
61 | + -L$SCWS_DIR/lib
|
---|
62 | + ])
|
---|
63 | + fi
|
---|
64 | +
|
---|
65 | dnl ******************************
|
---|
66 | dnl * Set special compiler flags *
|
---|
67 | dnl ******************************
|
---|
68 | *** xapian-core-1.3.0_svn/include/xapian/queryparser.h 2012-03-24 20:31:02.000000000 +0800
|
---|
69 | --- xapian-core-1.3.0_scws/include/xapian/queryparser.h 2012-03-30 12:31:10.000000000 +0800
|
---|
70 | ***************
|
---|
71 | *** 499,504 ****
|
---|
72 | --- 499,513 ----
|
---|
73 | */
|
---|
74 | void set_max_wildcard_expansion(Xapian::termcount limit);
|
---|
75 |
|
---|
76 | + #if 1 /* HAVE_SCWS */
|
---|
77 | + /** hightman.20070706: Specify the dict and rules file for scws, only used when HAVE_SCWS.
|
---|
78 | + * @param fpath path for dict file and rule file (char *)
|
---|
79 | + * @param xmem whether to load the whole dict into memory (defaults to false)
|
---|
80 | + * @param multi multiset (int 0~15)
|
---|
81 | + */
|
---|
82 | + void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
|
---|
83 | + #endif
|
---|
84 | +
|
---|
85 | /** Parse a query.
|
---|
86 | *
|
---|
87 | * @param query_string A free-text query as entered by a user
|
---|
88 | *** xapian-core-1.3.0_svn/include/xapian/termgenerator.h 2011-11-07 11:11:05.000000000 +0800
|
---|
89 | --- xapian-core-1.3.0_scws/include/xapian/termgenerator.h 2012-03-30 12:31:10.000000000 +0800
|
---|
90 | ***************
|
---|
91 | *** 82,87 ****
|
---|
92 | --- 82,92 ----
|
---|
93 | /// Set the database to index spelling data to.
|
---|
94 | void set_database(const Xapian::WritableDatabase &db);
|
---|
95 |
|
---|
96 | + #if 1 /* HAVE_SCWS */
|
---|
97 | + /// hightman.20070706: Specify the dict and rules file for scws, only used when HAVE_SCWS.
|
---|
98 | + void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
|
---|
99 | + #endif
|
---|
100 | +
|
---|
101 | /// Flags to OR together and pass to TermGenerator::set_flags().
|
---|
102 | enum flags {
|
---|
103 | /// Index data required for spelling correction.
|
---|
104 | *** xapian-core-1.3.0_svn/queryparser/queryparser_internal.h 2012-03-24 20:31:02.000000000 +0800
|
---|
105 | --- xapian-core-1.3.0_scws/queryparser/queryparser_internal.h 2012-03-30 12:38:27.000000000 +0800
|
---|
106 | ***************
|
---|
107 | *** 29,34 ****
|
---|
108 | --- 29,39 ----
|
---|
109 | #include <xapian/queryparser.h>
|
---|
110 | #include <xapian/stem.h>
|
---|
111 |
|
---|
112 | + /// hightman.20070701: use scws as default tokenizer
|
---|
113 | + #ifdef HAVE_SCWS
|
---|
114 | + #include <scws/scws.h>
|
---|
115 | + #endif
|
---|
116 | +
|
---|
117 | #include <list>
|
---|
118 | #include <map>
|
---|
119 |
|
---|
120 | ***************
|
---|
121 | *** 63,68 ****
|
---|
122 | --- 68,79 ----
|
---|
123 | Stem stemmer;
|
---|
124 | stem_strategy stem_action;
|
---|
125 | const Stopper * stopper;
|
---|
126 | + #ifdef HAVE_SCWS
|
---|
127 | + scws_t scws;
|
---|
128 | + scws_res_t rptr, rcur;
|
---|
129 | + const char *qptr;
|
---|
130 | + int last_off;
|
---|
131 | + #endif
|
---|
132 | Query::op default_op;
|
---|
133 | const char * errmsg;
|
---|
134 | Database db;
|
---|
135 | ***************
|
---|
136 | *** 88,94 ****
|
---|
137 | --- 99,112 ----
|
---|
138 |
|
---|
139 | public:
|
---|
140 | Internal() : stem_action(STEM_SOME), stopper(NULL),
|
---|
141 | + #ifdef HAVE_SCWS
|
---|
142 | + scws(NULL), rptr(NULL), rcur(NULL),
|
---|
143 | + #endif
|
---|
144 | default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { }
|
---|
145 | + #ifdef HAVE_SCWS
|
---|
146 | + ~Internal();
|
---|
147 | + void load_libscws(const char *fpath, bool xmem, int multi);
|
---|
148 | + #endif
|
---|
149 |
|
---|
150 | Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix);
|
---|
151 | };
|
---|
152 | *** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.h 2011-07-03 20:31:02.000000000 +0800
|
---|
153 | --- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.h 2012-03-30 12:33:15.000000000 +0800
|
---|
154 | ***************
|
---|
155 | *** 26,31 ****
|
---|
156 | --- 26,35 ----
|
---|
157 | #include <xapian/document.h>
|
---|
158 | #include <xapian/termgenerator.h>
|
---|
159 | #include <xapian/stem.h>
|
---|
160 | + /// hightman.20070701: use scws as default tokenizer
|
---|
161 | + #ifdef HAVE_SCWS
|
---|
162 | + #include <scws/scws.h>
|
---|
163 | + #endif
|
---|
164 |
|
---|
165 | namespace Xapian {
|
---|
166 |
|
---|
167 | ***************
|
---|
168 | *** 37,48 ****
|
---|
169 | --- 41,62 ----
|
---|
170 | const Stopper * stopper;
|
---|
171 | Document doc;
|
---|
172 | termcount termpos;
|
---|
173 | + #ifdef HAVE_SCWS
|
---|
174 | + scws_t scws;
|
---|
175 | + #endif
|
---|
176 | TermGenerator::flags flags;
|
---|
177 | WritableDatabase db;
|
---|
178 |
|
---|
179 | public:
|
---|
180 | Internal() : stopper(NULL), termpos(0),
|
---|
181 | + #ifdef HAVE_SCWS
|
---|
182 | + scws(NULL),
|
---|
183 | + #endif
|
---|
184 | flags(TermGenerator::flags(0)) { }
|
---|
185 | + #ifdef HAVE_SCWS
|
---|
186 | + ~Internal();
|
---|
187 | + void load_libscws(const char *fpath, bool xmem, int multi);
|
---|
188 | + #endif
|
---|
189 | void index_text(Utf8Iterator itor,
|
---|
190 | termcount weight,
|
---|
191 | const std::string & prefix,
|
---|
192 | *** xapian-core-1.3.0_svn/queryparser/queryparser.cc 2011-12-26 20:31:02.000000000 +0800
|
---|
193 | --- xapian-core-1.3.0_scws/queryparser/queryparser.cc 2012-03-30 12:33:15.000000000 +0800
|
---|
194 | ***************
|
---|
195 | *** 136,141 ****
|
---|
196 | --- 136,152 ----
|
---|
197 | internal->max_wildcard_expansion = max;
|
---|
198 | }
|
---|
199 |
|
---|
200 | + #if 1 /* always compiled; the HAVE_SCWS check happens inside the body */
|
---|
201 | + /// hightman.20070701: Forward fpath/xmem/multi to Internal::load_libscws(); a no-op when HAVE_SCWS is not defined.
|
---|
202 | + void
|
---|
203 | + QueryParser::load_libscws(const char *fpath, bool xmem, int multi)
|
---|
204 | + {
|
---|
205 | + #ifdef HAVE_SCWS
|
---|
206 | + internal->load_libscws(fpath, xmem, multi);
|
---|
207 | + #endif
|
---|
208 | + }
|
---|
209 | + #endif
|
---|
210 | +
|
---|
211 | Query
|
---|
212 | QueryParser::parse_query(const string &query_string, unsigned flags,
|
---|
213 | const string &default_prefix)
|
---|
214 | *** xapian-core-1.3.0_svn/queryparser/queryparser.lemony 2012-01-26 20:51:02.000000000 +0800
|
---|
215 | --- xapian-core-1.3.0_scws/queryparser/queryparser.lemony 2012-03-30 13:46:27.000000000 +0800
|
---|
216 | ***************
|
---|
217 | *** 166,171 ****
|
---|
218 | --- 166,174 ----
|
---|
219 | string unstemmed;
|
---|
220 | QueryParser::stem_strategy stem;
|
---|
221 | termpos pos;
|
---|
222 | + #ifdef HAVE_SCWS
|
---|
223 | + vector<string> multi;
|
---|
224 | + #endif
|
---|
225 |
|
---|
226 | Term(const string &name_, termpos pos_) : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
|
---|
227 | Term(const string &name_) : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
|
---|
228 | ***************
|
---|
229 | *** 501,513 ****
|
---|
230 | vector<Query> prefix_cjk;
|
---|
231 | const list<string> & prefixes = prefix_info->prefixes;
|
---|
232 | list<string>::const_iterator piter;
|
---|
233 | ! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
|
---|
234 | for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
|
---|
235 | string cjk = *piter;
|
---|
236 | cjk += *tk;
|
---|
237 | prefix_cjk.push_back(Query(cjk, 1, pos));
|
---|
238 | }
|
---|
239 | }
|
---|
240 | Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
|
---|
241 | delete this;
|
---|
242 | return q;
|
---|
243 | --- 504,546 ----
|
---|
244 | vector<Query> prefix_cjk;
|
---|
245 | const list<string> & prefixes = prefix_info->prefixes;
|
---|
246 | list<string>::const_iterator piter;
|
---|
247 | ! /* hightman.20111223: used CJKTERM for multi segment */
|
---|
248 | ! #ifdef HAVE_SCWS
|
---|
249 | ! for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
|
---|
250 | ! Query org = Query(*piter + name, 1, pos);
|
---|
251 | ! termpos mpos = pos + 88;
|
---|
252 | !
|
---|
253 | ! /* hightman.20120104: get synonyms */
|
---|
254 | ! if (state->flags & QueryParser::FLAG_AUTO_SYNONYMS) {
|
---|
255 | ! Xapian::Database db = state->get_database();
|
---|
256 | ! Xapian::TermIterator syn = db.synonyms_begin(name);
|
---|
257 | ! Xapian::TermIterator end = db.synonyms_end(name);
|
---|
258 | ! while (syn != end) {
|
---|
259 | ! org = Query(Query::OP_SYNONYM, org, Query(*piter + *syn, 1, mpos++));
|
---|
260 | ! ++syn;
|
---|
261 | ! }
|
---|
262 | ! }
|
---|
263 | ! if (!multi.empty()) {
|
---|
264 | ! vector<string>::const_iterator mi;
|
---|
265 | ! vector<Query> multi_cjk;
|
---|
266 | ! for (mi = multi.begin(); mi != multi.end(); ++mi) {
|
---|
267 | ! // hightman: force to sort behind for get_terms()
|
---|
268 | ! multi_cjk.push_back(Query(*piter + *mi, 1, mpos++));
|
---|
269 | ! }
|
---|
270 | ! Query syn = Query(state->default_op(), multi_cjk.begin(), multi_cjk.end());
|
---|
271 | ! org = Query(Query::OP_SYNONYM, org, syn);
|
---|
272 | ! }
|
---|
273 | ! prefix_cjk.push_back(org);
|
---|
274 | ! }
|
---|
275 | ! #else
|
---|
276 | ! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
|
---|
277 | for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
|
---|
278 | string cjk = *piter;
|
---|
279 | cjk += *tk;
|
---|
280 | prefix_cjk.push_back(Query(cjk, 1, pos));
|
---|
281 | }
|
---|
282 | }
|
---|
283 | + #endif /* HAVE_SCWS */
|
---|
284 | Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
|
---|
285 | delete this;
|
---|
286 | return q;
|
---|
287 | ***************
|
---|
288 | *** 618,629 ****
|
---|
289 | --- 651,728 ----
|
---|
290 | }
|
---|
291 | }
|
---|
292 |
|
---|
293 | + /// hightman.20110701: load libscws
|
---|
294 | + #ifdef HAVE_SCWS
|
---|
295 | + QueryParser::Internal::~Internal() // Release the SCWS result list and tokenizer handle held by this object.
|
---|
296 | + {
|
---|
297 | + if (rptr != NULL) { // a result list is still held (rptr is filled from scws_get_result())
|
---|
298 | + scws_free_result(rptr);
|
---|
299 | + rptr = NULL;
|
---|
300 | + }
|
---|
301 | + if (scws != NULL) { // a handle was created by load_libscws()
|
---|
302 | + scws_free(scws);
|
---|
303 | + scws = NULL;
|
---|
304 | + }
|
---|
305 | + }
|
---|
306 | +
|
---|
307 | + void
|
---|
308 | + QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi)
|
---|
309 | + { // Create and configure the SCWS handle on the first call; every call may update the multi level.
|
---|
310 | + if (scws == NULL) {
|
---|
311 | + string temp;
|
---|
312 | + // A NULL fpath falls back to SCWS_ETCDIR; xmem selects SCWS_XDICT_MEM instead of SCWS_XDICT_XDB.
|
---|
313 | + scws = scws_new();
|
---|
314 | + scws_set_charset(scws, "utf8");
|
---|
315 | + scws_set_ignore(scws, SCWS_NA);
|
---|
316 | + scws_set_duality(scws, SCWS_YEA);
|
---|
317 | + // c_str() rather than data(): the scws calls receive only a pointer, so the path must be NUL-terminated.
|
---|
318 | + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini");
|
---|
319 | + scws_set_rule(scws, temp.c_str());
|
---|
320 | + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb");
|
---|
321 | + scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
|
---|
322 | + /* hightman.20111209: custom dict support */
|
---|
323 | + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict_user.txt");
|
---|
324 | + scws_add_dict(scws, temp.c_str(), SCWS_XDICT_TXT);
|
---|
325 | + }
|
---|
326 | + if (multi >= 0 && multi < 0x10) // values outside 0..15 leave the current setting untouched
|
---|
327 | + scws_set_multi(scws, (multi<<12));
|
---|
328 | + }
|
---|
329 | + #endif /* HAVE_SCWS */
|
---|
330 | +
|
---|
331 | string
|
---|
332 | QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
|
---|
333 | bool cjk_ngram, bool & is_cjk_term,
|
---|
334 | bool &was_acronym)
|
---|
335 | {
|
---|
336 | string term;
|
---|
337 | + #ifdef HAVE_SCWS
|
---|
338 | + int off = it.raw() - qptr;
|
---|
339 | + while (rcur && (off > rcur->off)) {
|
---|
340 | + rcur = rcur->next;
|
---|
341 | + }
|
---|
342 | + was_acronym = false;
|
---|
343 | + if (rcur == NULL) {
|
---|
344 | + it = end;
|
---|
345 | + term.resize(0);
|
---|
346 | + } else {
|
---|
347 | + // sometimes, auto_duality + word-end single word char will be repeated
|
---|
348 | + // 说明几句 => 说明/几/几句
|
---|
349 | + if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len)
|
---|
350 | + rcur = rcur->next;
|
---|
351 | +
|
---|
352 | + term.append(qptr + rcur->off, rcur->len);
|
---|
353 | + was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false;
|
---|
354 | + is_cjk_term = CJK::codepoint_is_cjk(*it);
|
---|
355 | + last_off = off = rcur->off + rcur->len;
|
---|
356 | + rcur = rcur->next;
|
---|
357 | +
|
---|
358 | + // sometimes, auto duality or multisegment
|
---|
359 | + // 几句说搞笑 => 几句/句说/搞笑
|
---|
360 | + if (rcur && off > rcur->off && (rcur->off + rcur->len) > off)
|
---|
361 | + off = rcur->off;
|
---|
362 | + while ((it.raw() - qptr) < off) it++;
|
---|
363 | + }
|
---|
364 | + #else /* HAVE_SCWS */
|
---|
365 | // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
|
---|
366 | // Don't worry if there's a trailing '.' or not.
|
---|
367 | if (U_isupper(*it)) {
|
---|
368 | ***************
|
---|
369 | *** 708,713 ****
|
---|
370 | --- 807,813 ----
|
---|
371 | }
|
---|
372 | }
|
---|
373 | }
|
---|
374 | + #endif /* HAVE_SCWS */
|
---|
375 | return term;
|
---|
376 | }
|
---|
377 |
|
---|
378 | ***************
|
---|
379 | *** 759,764 ****
|
---|
380 | --- 859,890 ----
|
---|
381 |
|
---|
382 | ParserHandler pParser(ParseAlloc());
|
---|
383 |
|
---|
384 | + #ifdef HAVE_SCWS
|
---|
385 | + /// Pre segmentation use scws
|
---|
386 | + scws_res_t res;
|
---|
387 | +
|
---|
388 | + if (!scws) {
|
---|
389 | + load_libscws(NULL, false, 3);
|
---|
390 | + }
|
---|
391 | + if (rptr != NULL) {
|
---|
392 | + scws_free_result(rptr);
|
---|
393 | + rptr = NULL;
|
---|
394 | + }
|
---|
395 | + qptr = qs.data();
|
---|
396 | + scws_send_text(scws, qptr, qs.size());
|
---|
397 | + while ((res = scws_get_result(scws)) != NULL) {
|
---|
398 | + if (rptr == NULL) {
|
---|
399 | + rcur = rptr = res;
|
---|
400 | + } else {
|
---|
401 | + rcur->next = res;
|
---|
402 | + }
|
---|
403 | + while (rcur->next != NULL) {
|
---|
404 | + rcur = rcur->next;
|
---|
405 | + }
|
---|
406 | + }
|
---|
407 | + rcur = rptr;
|
---|
408 | + #endif /* HAVE_SCWS */
|
---|
409 | +
|
---|
410 | unsigned newprev = ' ';
|
---|
411 | main_lex_loop:
|
---|
412 | enum {
|
---|
413 | ***************
|
---|
414 | *** 1162,1167 ****
|
---|
415 | --- 1288,1298 ----
|
---|
416 | if (!stemmer.internal.get()) {
|
---|
417 | // No stemmer is set.
|
---|
418 | stem_term = STEM_NONE;
|
---|
419 | + #ifdef HAVE_SCWS
|
---|
420 | + } else if (is_cjk_term) {
|
---|
421 | + // Don't stem CJK terms.
|
---|
422 | + stem_term = STEM_NONE;
|
---|
423 | + #endif
|
---|
424 | } else if (stem_term == STEM_SOME) {
|
---|
425 | if (!should_stem(unstemmed_term) ||
|
---|
426 | (it != end && is_stem_preventer(*it))) {
|
---|
427 | ***************
|
---|
428 | *** 1175,1180 ****
|
---|
429 | --- 1306,1322 ----
|
---|
430 | unstemmed_term, stem_term, term_pos++);
|
---|
431 |
|
---|
432 | if (is_cjk_term) {
|
---|
433 | + #ifdef HAVE_SCWS
|
---|
434 | + /* multi scws handler */
|
---|
435 | + term_obj->multi.clear();
|
---|
436 | + while (rcur && (rcur->off + rcur->len) <= last_off) {
|
---|
437 | + if (rcur->len > 3)
|
---|
438 | + term_obj->multi.push_back(string(qptr + rcur->off, rcur->len));
|
---|
439 | + rcur = rcur->next;
|
---|
440 | + }
|
---|
441 | + if (mode == IN_GROUP || mode == IN_GROUP2)
|
---|
442 | + mode = DEFAULT;
|
---|
443 | + #endif
|
---|
444 | Parse(pParser, CJKTERM, term_obj, &state);
|
---|
445 | if (it == end) break;
|
---|
446 | continue;
|
---|
447 | ***************
|
---|
448 | *** 1305,1310 ****
|
---|
449 | --- 1447,1459 ----
|
---|
450 | }
|
---|
451 | }
|
---|
452 | done:
|
---|
453 | + #ifdef HAVE_SCWS
|
---|
454 | + /// Free all segmented terms/words
|
---|
455 | + if (rptr != NULL) {
|
---|
456 | + scws_free_result(rptr);
|
---|
457 | + rptr = NULL;
|
---|
458 | + }
|
---|
459 | + #endif
|
---|
460 | if (!state.error) {
|
---|
461 | // Implicitly close any unclosed quotes...
|
---|
462 | if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
|
---|
463 | ***************
|
---|
464 | *** 1656,1661 ****
|
---|
465 | --- 1805,1815 ----
|
---|
466 | void
|
---|
467 | Term::as_positional_cjk_term(Terms * terms) const
|
---|
468 | {
|
---|
469 | + #ifdef HAVE_SCWS
|
---|
470 | + // Add SCWS term only
|
---|
471 | + Term * c = new Term(state, name, prefix_info, unstemmed, stem, pos);
|
---|
472 | + terms->add_positional_term(c);
|
---|
473 | + #else
|
---|
474 | // Add each individual CJK character to the phrase.
|
---|
475 | string t;
|
---|
476 | for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
|
---|
477 | ***************
|
---|
478 | *** 1664,1669 ****
|
---|
479 | --- 1818,1824 ----
|
---|
480 | terms->add_positional_term(c);
|
---|
481 | t.resize(0);
|
---|
482 | }
|
---|
483 | + #endif /* HAVE_SCWS */
|
---|
484 |
|
---|
485 | // FIXME: we want to add the n-grams as filters too for efficiency.
|
---|
486 |
|
---|
487 | *** xapian-core-1.3.0_svn/queryparser/termgenerator.cc 2011-07-03 20:31:02.000000000 +0800
|
---|
488 | --- xapian-core-1.3.0_scws/queryparser/termgenerator.cc 2012-03-30 12:33:15.000000000 +0800
|
---|
489 | ***************
|
---|
490 | *** 74,79 ****
|
---|
491 | --- 74,90 ----
|
---|
492 | internal->db = db;
|
---|
493 | }
|
---|
494 |
|
---|
495 | + #if 1 /* always compiled; the HAVE_SCWS check happens inside the body */
|
---|
496 | + /// hightman.20070701: Forward fpath/xmem/multi to Internal::load_libscws(); a no-op when HAVE_SCWS is not defined.
|
---|
497 | + void
|
---|
498 | + TermGenerator::load_libscws(const char *fpath, bool xmem, int multi)
|
---|
499 | + {
|
---|
500 | + #ifdef HAVE_SCWS
|
---|
501 | + internal->load_libscws(fpath, xmem, multi);
|
---|
502 | + #endif
|
---|
503 | + }
|
---|
504 | + #endif
|
---|
505 | +
|
---|
506 | TermGenerator::flags
|
---|
507 | TermGenerator::set_flags(flags toggle, flags mask)
|
---|
508 | {
|
---|
509 | *** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.cc 2011-08-24 20:51:02.000000000 +0800
|
---|
510 | --- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.cc 2012-03-30 13:51:10.000000000 +0800
|
---|
511 | ***************
|
---|
512 | *** 125,130 ****
|
---|
513 | --- 125,164 ----
|
---|
514 | #define STOPWORDS_IGNORE 1
|
---|
515 | #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
|
---|
516 |
|
---|
517 | + /// hightman.20070701: load libscws
|
---|
518 | + #ifdef HAVE_SCWS
|
---|
519 | + TermGenerator::Internal::~Internal() // Release the SCWS tokenizer handle held by this object.
|
---|
520 | + {
|
---|
521 | + if (scws != NULL) { // a handle was created by load_libscws()
|
---|
522 | + scws_free(scws);
|
---|
523 | + scws = NULL;
|
---|
524 | + }
|
---|
525 | + }
|
---|
526 | +
|
---|
527 | + void
|
---|
528 | + TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi)
|
---|
529 | + { // Create and configure the SCWS handle on the first call; every call may update the multi level.
|
---|
530 | + if (scws == NULL) {
|
---|
531 | + string temp;
|
---|
532 | + // A NULL fpath falls back to SCWS_ETCDIR; xmem selects SCWS_XDICT_MEM instead of SCWS_XDICT_XDB.
|
---|
533 | + scws = scws_new();
|
---|
534 | + scws_set_charset(scws, "utf8");
|
---|
535 | + scws_set_ignore(scws, SCWS_NA);
|
---|
536 | + scws_set_duality(scws, SCWS_YEA);
|
---|
537 | + // c_str() rather than data(): the scws calls receive only a pointer, so the path must be NUL-terminated.
|
---|
538 | + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini");
|
---|
539 | + scws_set_rule(scws, temp.c_str());
|
---|
540 | + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb");
|
---|
541 | + scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
|
---|
542 | + /* hightman.20111209: custom dict support */
|
---|
543 | + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict_user.txt");
|
---|
544 | + scws_add_dict(scws, temp.c_str(), SCWS_XDICT_TXT);
|
---|
545 | + }
|
---|
546 | + if (multi >= 0 && multi < 0x10) // values outside 0..15 leave the current setting untouched
|
---|
547 | + scws_set_multi(scws, (multi<<12));
|
---|
548 | + }
|
---|
549 | + #endif
|
---|
550 | +
|
---|
551 | void
|
---|
552 | TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
|
---|
553 | const string & prefix, bool with_positions)
|
---|
554 | ***************
|
---|
555 | *** 135,140 ****
|
---|
556 | --- 169,205 ----
|
---|
557 |
|
---|
558 | if (!stopper) stop_mode = STOPWORDS_NONE;
|
---|
559 |
|
---|
560 | + #ifdef HAVE_SCWS
|
---|
561 | + int last_endpos = 0, last_off = 0;
|
---|
562 | + scws_res_t res, cur;
|
---|
563 | + Utf8Iterator iterm;
|
---|
564 | + const char *text = itor.raw();
|
---|
565 | +
|
---|
566 | + if (!scws) load_libscws(NULL, false, 3);
|
---|
567 | + scws_send_text(scws, text, itor.left());
|
---|
568 | + while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) {
|
---|
569 | + string term;
|
---|
570 | +
|
---|
571 | + iterm.assign(text + cur->off, cur->len);
|
---|
572 | + if (!Unicode::is_wordchar(*iterm)) {
|
---|
573 | + cur = cur->next;
|
---|
574 | + continue;
|
---|
575 | + }
|
---|
576 | + term = Unicode::tolower(string(text + cur->off, cur->len));
|
---|
577 | + if (with_positions) {
|
---|
578 | + /// for part word(short, duality)
|
---|
579 | + if ((cur->off + cur->len) <= last_endpos)
|
---|
580 | + --termpos;
|
---|
581 | + else {
|
---|
582 | + /// for dualities' first single word
|
---|
583 | + if (cur->off == last_off)
|
---|
584 | + --termpos;
|
---|
585 | + last_endpos = cur->off + cur->len;
|
---|
586 | + }
|
---|
587 | + }
|
---|
588 | + last_off = cur->off;
|
---|
589 | + cur = cur->next;
|
---|
590 | + #else
|
---|
591 | while (true) {
|
---|
592 | // Advance to the start of the next term.
|
---|
593 | unsigned ch;
|
---|
594 | ***************
|
---|
595 | *** 254,259 ****
|
---|
596 | --- 319,325 ----
|
---|
597 | }
|
---|
598 |
|
---|
599 | endofterm:
|
---|
600 | + #endif /* HAVE_SCWS */
|
---|
601 | if (term.size() > MAX_PROB_TERM_LENGTH) continue;
|
---|
602 |
|
---|
603 | if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
|
---|
604 | ***************
|
---|
605 | *** 263,268 ****
|
---|
606 | --- 329,338 ----
|
---|
607 | } else {
|
---|
608 | doc.add_term(prefix + term, wdf_inc);
|
---|
609 | }
|
---|
610 | + #ifdef HAVE_SCWS
|
---|
611 | + /// hightman: Term start with CJK character needn't spell & stem
|
---|
612 | + if (CJK::codepoint_is_cjk(*iterm)) continue;
|
---|
613 | + #endif
|
---|
614 | if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
|
---|
615 |
|
---|
616 | if (!stemmer.internal.get()) continue;
|
---|
617 | ***************
|
---|
618 | *** 280,285 ****
|
---|
619 | --- 350,358 ----
|
---|
620 | stem += stemmer(term);
|
---|
621 | doc.add_term(stem, wdf_inc);
|
---|
622 | }
|
---|
623 | + #ifdef HAVE_SCWS
|
---|
624 | + scws_free_result(res); }
|
---|
625 | + #endif
|
---|
626 | }
|
---|
627 |
|
---|
628 | }
|
---|