Ticket #448: xapian-core-stem-implementation.patch

File xapian-core-stem-implementation.patch, 12.6 KB (added by Evgeny Sizikov, 14 years ago)

Initial implementation of Stem::Implementation, as described in comment:8.

  • xapian-core/include/xapian/stem.h

     
    3232class XAPIAN_VISIBILITY_DEFAULT Stem {
    3333  public:
    3434    /// @private @internal Class representing the stemmer internals.
    35     class Internal;
     35    class Implementation;
     36
     37    /// @private @internal Class representing the Snowball stemmer internals.
     38    class SnowballImplementation;
     39           
    3640    /// @private @internal Reference counted internals.
    37     Xapian::Internal::RefCntPtr<Internal> internal;
     41    Xapian::Internal::RefCntPtr<Implementation> internal;
    3842
    3943    /// Copy constructor.
    4044    Stem(const Stem & o);
     
    4852     */
    4953    Stem();
    5054
     55    /** Construct a Xapian::Stem object with a user-provided stemming algorithm.
     56     *
     57     *  Users can create a subclass of Xapian::Stem::Implementation and wrap
     58     *  it in a Xapian::Stem object to pass to the Xapian API.
     59     */
     60    Stem(Implementation * p);
     61
    5162    /** Construct a Xapian::Stem object for a particular language.
    5263     *
    5364     *  @param language Either the English name for the language
  • xapian-core/languages/Makefile.mk

     
    5656        $(CC_FOR_BUILD) -o languages/snowball -DDISABLE_JAVA `for f in $(snowball_sources) ; do test -f $$f && echo $$f || echo $(srcdir)/$$f ; done`
    5757
    5858.sbl.cc:
    59         languages/snowball $< -o `echo $@|sed 's!\.cc$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::Internal
     59        languages/snowball $< -o `echo $@|sed 's!\.cc$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::SnowballImplementation
    6060
    6161.sbl.h:
    62         languages/snowball $< -o `echo $@|sed 's!\.h$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::Internal
     62        languages/snowball $< -o `echo $@|sed 's!\.h$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::SnowballImplementation
    6363
    6464languages/allsnowballheaders.h: languages/generate-allsnowballheaders languages/Makefile.mk
    6565        languages/generate-allsnowballheaders $(snowball_built_sources)
  • xapian-core/languages/steminternal.cc

     
    129129
    130130namespace Xapian {
    131131
    132 Stem::Internal::~Internal()
     132Stem::Implementation::Implementation()
    133133{
     134}
     135   
     136Stem::Implementation::~Implementation()
     137{
     138}
     139   
     140Stem::SnowballImplementation::~SnowballImplementation()
     141{
    134142    lose_s(p);
    135143}
    136144
    137145string
    138 Stem::Internal::operator()(const string & word)
     146Stem::SnowballImplementation::operator()(const string & word)
    139147{
    140148    const symbol * s = reinterpret_cast<const symbol *>(word.data());
    141149    replace_s(0, l, word.size(), s);
     
    149157
    150158/* Code for character groupings: utf8 cases */
    151159
    152 int Stem::Internal::get_utf8(int * slot) {
     160int Stem::SnowballImplementation::get_utf8(int * slot) {
    153161    int b0, b1;
    154162    int tmp = c;
    155163    if (tmp >= l) return 0;
     
    164172    * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
    165173}
    166174
    167 int Stem::Internal::get_b_utf8(int * slot) {
     175int Stem::SnowballImplementation::get_b_utf8(int * slot) {
    168176    int b0, b1;
    169177    int tmp = c;
    170178    if (tmp <= lb) return 0;
     
    179187    * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
    180188}
    181189
    182 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
     190int
     191Stem::SnowballImplementation::in_grouping_U(const unsigned char * s, int min,
     192                                            int max, int repeat) {
    183193    do {
    184194        int ch;
    185195        int w = get_utf8(&ch);
    186196        if (!w) return -1;
    187         if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
     197        if (ch > max || (ch -= min) < 0 ||
     198            (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
    188199            return w;
    189200        c += w;
    190201    } while (repeat);
    191202    return 0;
    192203}
    193204
    194 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
     205int
     206Stem::SnowballImplementation::in_grouping_b_U(const unsigned char * s, int min,
     207                                              int max, int repeat) {
    195208    do {
    196209        int ch;
    197210        int w = get_b_utf8(&ch);
    198211        if (!w) return -1;
    199         if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
     212        if (ch > max || (ch -= min) < 0 ||
     213            (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
    200214            return w;
    201215        c -= w;
    202216    } while (repeat);
    203217    return 0;
    204218}
    205219
    206 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
     220int
     221Stem::SnowballImplementation::out_grouping_U(const unsigned char * s, int min,
     222                                             int max, int repeat) {
    207223    do {
    208224        int ch;
    209225        int w = get_utf8(&ch);
    210226        if (!w) return -1;
    211         if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
    212             /* FIXME: try adding this so gopast in generated code is simpler: if (repeat == 2) c += w; */ return w;
     227        if (!(ch > max || (ch -= min) < 0 ||
     228            (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
     229            /* FIXME: try adding this so gopast in generated code is simpler:
     230               if (repeat == 2) c += w; */ return w;
    213231        c += w;
    214232    } while (repeat);
    215233    return 0;
    216234}
    217235
    218 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
     236int
     237Stem::SnowballImplementation::out_grouping_b_U(const unsigned char * s, int min,
     238                                               int max, int repeat) {
    219239    do {
    220240        int ch;
    221241        int w = get_b_utf8(&ch);
    222242        if (!w) return -1;
    223         if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
     243        if (!(ch > max || (ch -= min) < 0 ||
     244            (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
    224245            return w;
    225246        c -= w;
    226247    } while (repeat);
    227248    return 0;
    228249}
    229250
    230 int Stem::Internal::eq_s(int s_size, const symbol * s) {
     251int Stem::SnowballImplementation::eq_s(int s_size, const symbol * s) {
    231252    if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
    232253        return 0;
    233254    c += s_size;
    234255    return 1;
    235256}
    236257
    237 int Stem::Internal::eq_s_b(int s_size, const symbol * s) {
    238     if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
     258int Stem::SnowballImplementation::eq_s_b(int s_size, const symbol * s) {
     259    if (c - lb < s_size ||
     260        memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
    239261        return 0;
    240262    c -= s_size;
    241263    return 1;
    242264}
    243265
    244266int
    245 Stem::Internal::find_among(const symbol * pool, const struct among * v,
    246                            int v_size, const unsigned char * fnum,
    247                            const among_function * f)
     267Stem::SnowballImplementation::find_among(const symbol * pool,
     268                                         const struct among * v, int v_size,
     269                                         const unsigned char * fnum,
     270                                         const among_function * f)
    248271{
    249272    int i = 0;
    250273    int j = v_size;
     
    300323
    301324/* find_among_b is for backwards processing. Same comments apply */
    302325int
    303 Stem::Internal::find_among_b(const symbol * pool, const struct among * v,
    304                              int v_size, const unsigned char * fnum,
    305                              const among_function * f)
     326Stem::SnowballImplementation::find_among_b(const symbol * pool,
     327                                           const struct among * v, int v_size,
     328                                           const unsigned char * fnum,
     329                                           const among_function * f)
    306330{
    307331    int i = 0;
    308332    int j = v_size;
     
    352376}
    353377
    354378int
    355 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
     379Stem::SnowballImplementation::replace_s(int c_bra, int c_ket, int s_size,
     380                                        const symbol * s)
    356381{
    357382    int adjustment;
    358383    int len;
     
    378403    return adjustment;
    379404}
    380405
    381 int Stem::Internal::slice_check() {
     406int Stem::SnowballImplementation::slice_check() {
    382407    Assert(p);
    383408    if (bra < 0 || bra > ket || ket > l) {
    384409#if 0
     
    390415    return 0;
    391416}
    392417
    393 int Stem::Internal::slice_from_s(int s_size, const symbol * s) {
     418int Stem::SnowballImplementation::slice_from_s(int s_size, const symbol * s) {
    394419    if (slice_check()) return -1;
    395420    replace_s(bra, ket, s_size, s);
    396421    return 0;
    397422}
    398423
    399 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
     424void
     425Stem::SnowballImplementation::insert_s(int c_bra, int c_ket, int s_size,
     426                                       const symbol * s) {
    400427    int adjustment = replace_s(c_bra, c_ket, s_size, s);
    401428    if (c_bra <= bra) bra += adjustment;
    402429    if (c_bra <= ket) ket += adjustment;
    403430}
    404431
    405 symbol * Stem::Internal::slice_to(symbol * v) {
     432symbol * Stem::SnowballImplementation::slice_to(symbol * v) {
    406433    if (slice_check()) return NULL;
    407434    {
    408435        int len = ket - bra;
     
    415442    return v;
    416443}
    417444
    418 symbol * Stem::Internal::assign_to(symbol * v) {
     445symbol * Stem::SnowballImplementation::assign_to(symbol * v) {
    419446    int len = l;
    420447    if (CAPACITY(v) < len) {
    421448        v = increase_size(v, len);
     
    426453}
    427454
    428455#if 0
    429 void Stem::Internal::debug(int number, int line_count) {
     456void Stem::SnowballImplementation::debug(int number, int line_count) {
    430457    int i;
    431458    int limit = SIZE(p);
    432459    /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
  • xapian-core/languages/steminternal.h

     
    6565    reinterpret_cast<int *>(void_p)[-2] = n;
    6666}
    6767
    68 typedef int (*among_function)(Xapian::Stem::Internal *);
     68typedef int (*among_function)(Xapian::Stem::Implementation *);
    6969
    7070struct among {
    7171    int s_size;         /* length of search string (in symbols) */
     
    8484
    8585namespace Xapian {
    8686
    87 class Stem::Internal : public Xapian::Internal::RefCntBase {
     87/// Class representing a stemming algorithm implementation.
     88struct Stem::Implementation : public Xapian::Internal::RefCntBase
     89{
     90    /// Perform initialisation common to all stemmers.
     91    Implementation();
     92
     93    /// Perform cleanup common to all stemmers.
     94    virtual ~Implementation();
     95
     96    /// Stem the specified word.
     97    virtual std::string operator()(const std::string & word) = 0;
     98
     99    /// Return string describing this object.
     100    virtual const char * get_description() const = 0;
     101};
     102
     103class Stem::SnowballImplementation : public Stem::Implementation {
    88104    int slice_check();
    89105
    90106  protected:
     
    129145
    130146  public:
    131147    /// Perform initialisation common to all Snowball stemmers.
    132     Internal() : p(create_s()), c(0), l(0), lb(0), bra(0), ket(0) { }
     148    SnowballImplementation() : p(create_s()),
     149                               c(0), l(0), lb(0), bra(0), ket(0) { }
    133150
    134151    /// Perform cleanup common to all Snowball stemmers.
    135     virtual ~Internal();
     152    virtual ~SnowballImplementation();
    136153
    137154    /// Stem the specified word.
    138     std::string operator()(const std::string & word);
     155    virtual std::string operator()(const std::string & word);
    139156
    140157    /// Virtual method implemented by the subclass to actually do the work.
    141158    virtual int stem() = 0;
    142 
    143     /// Return string describing this object.
    144     virtual const char * get_description() const = 0;
    145159};
    146160
    147161}
  • xapian-core/languages/stem.cc

     
    4444
    4545Stem::Stem() : internal(0) { }
    4646
     47Stem::Stem(Stem::Implementation * p) : internal(p) { }
     48
    4749Stem::Stem(const std::string &language) : internal(0) {
    4850    if (language.empty()) return;
    4951    switch (language[0]) {
  • xapian-core/languages/compiler/generator.c

     
    15251525        if (q->type == t_routine && q->routine_called_from_among) {
    15261526            q->among_func_count = ++among_func_count;
    15271527            g->V[0] = q;
    1528             w(g, "static int t~V0(Xapian::Stem::Internal * this_ptr) {~N"
     1528            w(g, "static int t~V0(Xapian::Stem::Implementation * this_ptr) {~N"
    15291529                 "    return (static_cast<Xapian::~S0 *>(this_ptr))->~V0();~N"
    15301530                 "}~N"
    15311531                 "~N");