Ticket #594: xapian-scws-1.3.x-trunk.updated.patch

File xapian-scws-1.3.x-trunk.updated.patch, 14.2 KB (added by Olly Betts, 12 years ago)

Updated patch which compiles without SCWS

  • xapian-core/configure.ac

    diff --git a/xapian-core/configure.ac b/xapian-core/configure.ac
    index b5e491d..ef95d14 100644
    a b if test yes = "$enable_log"; then  
    997997    [Define if you want a log of methods called and other debug messages])
    998998fi
    999999
     1000dnl **********************
     1001dnl * Check scws library *
     1002dnl **********************
     1003dnl See if we want to use scws as the default tokenizer
     1004SCWS_DIR=
     1005AC_MSG_CHECKING([for scws])
     1006AC_ARG_WITH(scws,
     1007  [AS_HELP_STRING([[--with-scws[=DIR]]],
     1008    [use scws as the default tokenizer, DIR is the scws installation directory]
     1009  )], [ ],[ with_scws=no ]
     1010)
     1011
     1012if test "$with_scws" = "no"; then
     1013  AC_MSG_RESULT(no)
     1014else
     1015  # Check header file
     1016  if test "$with_scws" = "yes"; then
     1017    for tmpdir in /usr /usr/local /usr/local/scws /opt/local ; do
     1018      if test -f "$tmpdir/include/scws/scws.h" ; then
     1019        SCWS_DIR=$tmpdir
     1020        break
     1021      fi
     1022    done
     1023    if test "$SCWS_DIR" = ""; then
     1024      AC_MSG_RESULT(no)
     1025      AC_MSG_ERROR([scws not found in default directories, please specify --with-scws=DIR])
     1026    fi
     1027  elif test -f "$withval/include/scws/scws.h" ; then
     1028    SCWS_DIR=$withval
     1029  else
     1030    AC_MSG_RESULT(no)
     1031    AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws])
     1032  fi
     1033  AC_MSG_RESULT([yes: $SCWS_DIR])
     1034
     1035  dnl Etc directory
     1036  if test "$SCWS_DIR" = "/usr"; then
     1037    SCWS_ETCDIR="/etc"
     1038  else
     1039    SCWS_ETCDIR="$SCWS_DIR/etc"
     1040  fi
     1041
     1042  dnl Check scws library
     1043  AC_CHECK_LIB(scws, scws_new, [
     1044    LIBS="$LIBS -L$SCWS_DIR/lib -lscws"
     1045    XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws"
     1046    CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include"
     1047    AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer])
     1048    AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules])
     1049  ],[
     1050    AC_MSG_ERROR([scws_new() NOT found in libscws, please re-install scws])
     1051  ],[
     1052    -L$SCWS_DIR/lib
     1053  ])
     1054fi
     1055
    10001056dnl ******************************
    10011057dnl * Set special compiler flags *
    10021058dnl ******************************
  • xapian-core/include/xapian/queryparser.h

    diff --git a/xapian-core/include/xapian/queryparser.h b/xapian-core/include/xapian/queryparser.h
    index 829e187..e3e0477 100644
    a b class XAPIAN_VISIBILITY_DEFAULT QueryParser {  
    499499     */
    500500    void set_max_wildcard_expansion(Xapian::termcount limit);
    501501
     502#if 1   /* HAVE_SCWS */
     503    /** Configure the scws segmenter; does nothing unless built with HAVE_SCWS.
     504     *
     505     *  @param fpath    Directory containing rules.utf8.ini and dict.utf8.xdb (NULL uses SCWS_ETCDIR)
     506     *  @param xmem     Whether to load the whole dict file into memory (default to false)
     507     *  @param multi    Value for scws_set_multi(), 0 to 15; values outside that range are ignored
     508     */
     509    void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
     510#endif
     511
    502512    /** Parse a query.
    503513     *
    504514     *  @param query_string  A free-text query as entered by a user
  • xapian-core/include/xapian/termgenerator.h

    diff --git a/xapian-core/include/xapian/termgenerator.h b/xapian-core/include/xapian/termgenerator.h
    index 28f4294..d38601f 100644
    a b class XAPIAN_VISIBILITY_DEFAULT TermGenerator {  
    8282    /// Set the database to index spelling data to.
    8383    void set_database(const Xapian::WritableDatabase &db);
    8484
     85    /// Set the scws dict/rules directory, dict load mode and multi value; no-op without HAVE_SCWS.
     86    void load_libscws(const char *fpath, bool xmem = false, int multi = 0);
     87
    8588    /// Flags to OR together and pass to TermGenerator::set_flags().
    8689    enum flags {
    8790        /// Index data required for spelling correction.
  • xapian-core/queryparser/queryparser.cc

    diff --git a/xapian-core/queryparser/queryparser.cc b/xapian-core/queryparser/queryparser.cc
    index 5136da2..6571f8b 100644
    a b QueryParser::set_max_wildcard_expansion(Xapian::termcount max)  
    136136    internal->max_wildcard_expansion = max;
    137137}
    138138
     139void
     140QueryParser::load_libscws(const char *fpath, bool xmem, int multi)
     141{
     142#ifdef HAVE_SCWS
     143    internal->load_libscws(fpath, xmem, multi);
     144#else
     145    (void)fpath;
     146    (void)xmem;
     147    (void)multi;
     148#endif
     149}
     150
    139151Query
    140152QueryParser::parse_query(const string &query_string, unsigned flags,
    141153                         const string &default_prefix)
  • xapian-core/queryparser/queryparser.lemony

    diff --git a/xapian-core/queryparser/queryparser.lemony b/xapian-core/queryparser/queryparser.lemony
    index 8dedb80..81af1bb 100644
    a b QueryParser::Internal::add_prefix(const string &field, const string &prefix,  
    618618   }
    619619}
    620620
     621#ifdef HAVE_SCWS
     622QueryParser::Internal::~Internal()
     623{
     624    if (rptr != NULL) {
     625        scws_free_result(rptr); // release any segmentation results still held in rptr
     626        rptr = NULL;
     627    }
     628    if (scws != NULL) {
     629        scws_free(scws); // destroy the handle obtained from scws_new() in load_libscws()
     630        scws = NULL;
     631    }
     632}
     633
     634void
     635QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi)
     636{
     637    if (scws == NULL) {
     638        string temp;
     639
     640        scws = scws_new();
     641        scws_set_charset(scws, "utf8");
     642        scws_set_ignore(scws, SCWS_NA);
     643        scws_set_duality(scws, SCWS_YEA);
     644
     645        temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini");
     646        scws_set_rule(scws, temp.data());
     647        temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb");
     648        scws_set_dict(scws, temp.data(), xmem == true ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
     649    }
     650    if (multi >= 0 && multi < 0x10)
     651        scws_set_multi(scws, (multi<<12));
     652}
     653#endif  /* HAVE_SCWS */
     654
    621655string
    622656QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
    623657                                  bool cjk_ngram, bool & is_cjk_term,
    624658                                  bool &was_acronym)
    625659{
    626660    string term;
     661#ifdef HAVE_SCWS
     662    int off = it.raw() - qptr;
     663    while (rcur && (off > rcur->off)) {
     664        rcur = rcur->next;
     665    }
     666    was_acronym = false;
     667    if (rcur == NULL) {
     668        it = end;
     669        term.resize(0);
     670    } else {
     671        // sometimes, auto_duality + word-end single word char will be repeated
     672        // 说明几句 => 说明/几/几句
     673        if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len)
     674            rcur = rcur->next;
     675
     676        term.append(qptr + rcur->off, rcur->len);
     677        was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false;
     678        is_cjk_term = CJK::codepoint_is_cjk(*it);
     679        off = rcur->off + rcur->len;
     680        rcur = rcur->next;
     681
     682        // sometimes, auto duality or multisegment
     683        // 几句说搞笑 => 几句/句说/搞笑
     684        if (rcur && off > rcur->off && (rcur->off + rcur->len) > off)
     685            off = rcur->off;
     686        it = Utf8Iterator(qptr + off);
     687    }
     688#else   /* HAVE_SCWS */
    627689    // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
    628690    // Don't worry if there's a trailing '.' or not.
    629691    if (U_isupper(*it)) {
    QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,  
    708770            }
    709771        }
    710772    }
     773#endif  /* HAVE_SCWS */
    711774    return term;
    712775}
    713776
    QueryParser::Internal::parse_query(const string &qs, unsigned flags,  
    759822
    760823    ParserHandler pParser(ParseAlloc());
    761824
     825#ifdef HAVE_SCWS
     826    /// Pre segmentation use scws
     827    scws_res_t res;
     828
     829    if (!scws) {
     830        load_libscws(NULL, false, 0);
     831    }
     832    if (rptr != NULL) {
     833        scws_free_result(rptr);
     834        rptr = NULL;
     835    }
     836    qptr = qs.data();
     837    scws_send_text(scws, qptr, qs.size());
     838    while ((res = scws_get_result(scws)) != NULL) {
     839        if (rptr == NULL) {
     840            rcur = rptr = res;
     841        } else {
     842            rcur->next = res;
     843        }
     844        while (rcur->next != NULL) {
     845            rcur = rcur->next;
     846        }
     847    }
     848    rcur = rptr;
     849#endif  /* HAVE_SCWS */
     850
    762851    unsigned newprev = ' ';
    763852main_lex_loop:
    764853    enum {
    phrased_term:  
    11621251                if (!stemmer.internal.get()) {
    11631252                    // No stemmer is set.
    11641253                    stem_term = STEM_NONE;
     1254#ifdef HAVE_SCWS
     1255                else if (is_cjk_term) {
     1256                    // Don't stem CJK terms.
     1257                    stem_term = STEM_NONE;
     1258                }
     1259#endif
    11651260                } else if (stem_term == STEM_SOME) {
    11661261                    if (!should_stem(unstemmed_term) ||
    11671262                        (it != end && is_stem_preventer(*it))) {
    phrased_term:  
    13051400        }
    13061401    }
    13071402done:
     1403#ifdef HAVE_SCWS
     1404    /// Free all segmented terms/words
     1405    if (rptr != NULL) {
     1406        scws_free_result(rptr);
     1407        rptr = NULL;
     1408    }
     1409#endif
     1410
    13081411    if (!state.error) {
    13091412        // Implicitly close any unclosed quotes...
    13101413        if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
  • xapian-core/queryparser/queryparser_internal.h

    diff --git a/xapian-core/queryparser/queryparser_internal.h b/xapian-core/queryparser/queryparser_internal.h
    index 86ce563..41d516a 100644
    a b  
    2929#include <xapian/queryparser.h>
    3030#include <xapian/stem.h>
    3131
     32#ifdef HAVE_SCWS
     33#include <scws/scws.h>
     34#endif
     35
    3236#include <list>
    3337#include <map>
    3438
    class QueryParser::Internal : public Xapian::Internal::intrusive_base {  
    6367    Stem stemmer;
    6468    stem_strategy stem_action;
    6569    const Stopper * stopper;
     70#ifdef HAVE_SCWS
     71    scws_t scws;
     72    scws_res_t rptr, rcur;
     73    const char *qptr;
     74#endif
    6675    Query::op default_op;
    6776    const char * errmsg;
    6877    Database db;
    class QueryParser::Internal : public Xapian::Internal::intrusive_base {  
    8897
    8998  public:
    9099    Internal() : stem_action(STEM_SOME), stopper(NULL),
     100#ifdef HAVE_SCWS
     101        scws(NULL), rptr(NULL), rcur(NULL),
     102#endif
    91103        default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { }
    92104
     105#ifdef HAVE_SCWS
     106    ~Internal();
     107
     108    void load_libscws(const char *fpath, bool xmem, int multi);
     109#endif
     110
    93111    Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix);
    94112};
    95113
  • xapian-core/queryparser/termgenerator.cc

    diff --git a/xapian-core/queryparser/termgenerator.cc b/xapian-core/queryparser/termgenerator.cc
    index e6f745f..20846f2 100644
    a b TermGenerator::set_database(const Xapian::WritableDatabase &db)  
    7474    internal->db = db;
    7575}
    7676
     77/// Load the specified dictionary file for scws.
     78void
     79TermGenerator::load_libscws(const char *fpath, bool xmem, int multi)
     80{
     81#ifdef HAVE_SCWS
     82    internal->load_libscws(fpath, xmem, multi);
     83#else
     84    (void)fpath;
     85    (void)xmem;
     86    (void)multi;
     87#endif
     88}
     89
    7790TermGenerator::flags
    7891TermGenerator::set_flags(flags toggle, flags mask)
    7992{
  • xapian-core/queryparser/termgenerator_internal.cc

    diff --git a/xapian-core/queryparser/termgenerator_internal.cc b/xapian-core/queryparser/termgenerator_internal.cc
    index 93f04a1..633750f 100644
    a b inline unsigned check_suffix(unsigned ch) {  
    125125#define STOPWORDS_IGNORE 1
    126126#define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
    127127
     128#ifdef HAVE_SCWS
     129TermGenerator::Internal::~Internal()
     130{
     131    if (scws != NULL) {
     132        scws_free(scws); // destroy the handle obtained from scws_new() in load_libscws()
     133        scws = NULL;
     134    }
     135}
     136
     137void
     138TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi)
     139{
     140    if (scws == NULL) {
     141        string temp;
     142
     143        scws = scws_new();
     144        scws_set_charset(scws, "utf8");
     145        scws_set_ignore(scws, SCWS_NA);
     146        scws_set_duality(scws, SCWS_YEA);
     147
     148        temp = string(fpath ? fpath : SCWS_ETCDIR) + "/rules.utf8.ini";
     149        scws_set_rule(scws, temp.c_str());
     150        temp = string(fpath ? fpath : SCWS_ETCDIR) + "/dict.utf8.xdb";
     151        scws_set_dict(scws, temp.c_str(), xmem ? SCWS_XDICT_MEM : SCWS_XDICT_XDB);
     152    }
     153    if (multi >= 0 && multi < 0x10)
     154        scws_set_multi(scws, (multi<<12));
     155}
     156#endif  /* HAVE_SCWS */
     157
    128158void
    129159TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
    130160                                    const string & prefix, bool with_positions)
    TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,  
    135165
    136166    if (!stopper) stop_mode = STOPWORDS_NONE;
    137167
     168#ifdef HAVE_SCWS
     169    int last_endpos = 0, last_off = 0;
     170    scws_res_t res, cur;
     171    Utf8Iterator iterm;
     172    const char *text = itor.raw();
     173
     174    if (!scws) load_libscws(NULL, false, 0);
     175    scws_send_text(scws, text, itor.left());
     176    while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) {
     177        string term;
     178
     179        iterm.assign(text + cur->off, cur->len);
     180        if (!Unicode::is_wordchar(*iterm)) {
     181            cur = cur->next;
     182            continue;
     183        }
     184        term = Unicode::tolower(string(text + cur->off, cur->len));
     185        if (with_positions) {
     186            // for part word(short, duality)
     187            if ((cur->off + cur->len) <= last_endpos)
     188                --termpos;
     189            else {
     190                // for dualities' first single word
     191                if (cur->off == last_off)
     192                    --termpos;
     193                last_endpos = cur->off + cur->len;
     194            }
     195        }
     196        last_off = cur->off;
     197        cur = cur->next;
     198    }
     199#else   /* HAVE_SCWS */
    138200    while (true) {
    139201        // Advance to the start of the next term.
    140202        unsigned ch;
    TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,  
    254316        }
    255317
    256318endofterm:
     319#endif  /* HAVE_SCWS */
    257320        if (term.size() > MAX_PROB_TERM_LENGTH) continue;
    258321
    259322        if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
    endofterm:  
    263326        } else {
    264327            doc.add_term(prefix + term, wdf_inc);
    265328        }
     329#ifdef HAVE_SCWS
     330        // CJK term need not spelling & stemmer
     331        if (CJK::codepoint_is_cjk(*iterm)) continue;
     332#endif
    266333        if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
    267334
    268335        if (!stemmer.internal.get()) continue;
    endofterm:  
    280347        stem += stemmer(term);
    281348        doc.add_term(stem, wdf_inc);
    282349    }
     350#ifdef HAVE_SCWS
     351    scws_free_result(res); }
     352#endif
    283353}
    284354
    285355}
  • xapian-core/queryparser/termgenerator_internal.h

    diff --git a/xapian-core/queryparser/termgenerator_internal.h b/xapian-core/queryparser/termgenerator_internal.h
    index f074fd9..1381c54 100644
    a b  
    2727#include <xapian/termgenerator.h>
    2828#include <xapian/stem.h>
    2929
     30#ifdef HAVE_SCWS
     31#include <scws/scws.h>
     32#endif
     33
    3034namespace Xapian {
    3135
    3236class Stopper;
    class TermGenerator::Internal : public Xapian::Internal::intrusive_base {  
    3741    const Stopper * stopper;
    3842    Document doc;
    3943    termcount termpos;
     44#ifdef HAVE_SCWS
     45    scws_t scws;
     46#endif
    4047    TermGenerator::flags flags;
    4148    WritableDatabase db;
    4249
    4350  public:
    4451    Internal() : stopper(NULL), termpos(0),
     52#ifdef HAVE_SCWS
     53        scws(NULL),
     54#endif
    4555        flags(TermGenerator::flags(0)) { }
     56
     57#ifdef HAVE_SCWS
     58    ~Internal();
     59
     60    void load_libscws(const char *fpath, bool xmem, int multi);
     61#endif
     62
    4663    void index_text(Utf8Iterator itor,
    4764                    termcount weight,
    4865                    const std::string & prefix,