Ticket #186: patch

File patch, 15.4 KB (added by Richard Boulton, 16 years ago)

Patch which allows user-defined stemmers: adds a subclassable BaseStem class, with Stem becoming a reference-counted wrapper around it

  • include/xapian/stem.h

     
    2828
    2929namespace Xapian {
    3030
    31 /// Class representing a stemming algorithm.
    32 class XAPIAN_VISIBILITY_DEFAULT Stem {
     31/// Base class representing a stemming algorithm.
     32class XAPIAN_VISIBILITY_DEFAULT BaseStem : public Xapian::Internal::RefCntBase {
     33    /// No copying allowed.
     34    BaseStem(const BaseStem & o);
     35
     36    /// No assignment allowed.
     37    void operator=(const BaseStem & o);
     38
     39  protected:
     40    /** Destructor is protected since it should only be called by subclasses
     41     *  and RefCntPtr.
     42     */
     43    virtual ~BaseStem() {}
     44
     45    friend class Xapian::Internal::RefCntPtr<BaseStem>;
     46
    3347  public:
    34     /// @private @internal Class representing the stemmer internals.
    35     class Internal;
    36     /// @private @internal Reference counted internals.
    37     Xapian::Internal::RefCntPtr<Internal> internal;
     48    BaseStem() {}
    3849
    39     /// Copy constructor.
    40     Stem(const Stem & o);
     50    /** Stem a word.
     51     *
     52     *  @param word  a word to stem.
     53     *  @return      the stemmed form of the word.
     54     */
     55    virtual std::string operator()(const std::string &word) const = 0;
    4156
    42     /// Assignment.
    43     void operator=(const Stem & o);
     57    /// Return a string describing this object.
     58    virtual std::string get_description() const = 0;
     59};
    4460
    45     /** Construct a Xapian::Stem object which doesn't change terms.
     61/// A stemming algorithm which doesn't change words at all.
     62class XAPIAN_VISIBILITY_DEFAULT IdentityStem : public BaseStem {
     63    /// No copying allowed.
     64    IdentityStem(const IdentityStem & o);
     65
     66    /// No assignment allowed.
     67    void operator=(const IdentityStem & o);
     68
     69  public:
     70    IdentityStem() {}
     71
     72    /** Stem a word.
    4673     *
    47      *  Equivalent to Stem("none").
     74     *  Since this stemmer doesn't change words, this simply returns the word.
     75     *
     76     *  @param word  a word to stem.
     77     *  @return      the word supplied.
    4878     */
    49     Stem();
     79    std::string operator()(const std::string &word) const;
    5080
    51     /** Construct a Xapian::Stem object for a particular language.
     81    /// Return a string describing this object.
     82    std::string get_description() const;
     83};
     84
     85/// Class representing one of the snowball stemming algorithms.
     86class XAPIAN_VISIBILITY_DEFAULT SnowballStem : public BaseStem {
     87    /// No copying allowed.
     88    SnowballStem(const SnowballStem & o);
     89
     90    /// No assignment allowed.
     91    void operator=(const SnowballStem & o);
     92
     93  public:
     94    /// @private @internal Class representing the snowball stemmer internals.
     95    class Internal;
     96
     97  private:
     98    /// @private @internal Snowball stemmer internals.
     99    Internal * internal;
     100
     101  public:
     102    /** Construct a Xapian::SnowballStem object for a particular language.
    52103     *
    53104     *  @param language Either the English name for the language
    54105     *                  or the two letter ISO639 code.
     
    56107     *  The following language names are understood (aliases follow the
    57108     *  name):
    58109     *
    59      *  - none - don't stem terms
    60110     *  - danish (da)
    61111     *  - dutch (nl)
    62112     *  - english (en) - Martin Porter's 2002 revision of his stemmer
     
    76126     *  @exception              Xapian::InvalidArgumentError is thrown if
    77127     *                  language isn't recognised.
    78128     */
    79     explicit Stem(const std::string &language);
     129    explicit SnowballStem(const std::string &language);
    80130
    81131    /// Destructor.
    82     ~Stem();
     132    ~SnowballStem();
    83133
    84134    /** Stem a word.
    85135     *
    86      *  @param word             a word to stem.
    87      *  @return         the stem
     136     *  @param word  a word to stem.
     137     *  @return      the stemmed form of the word.
    88138     */
    89139    std::string operator()(const std::string &word) const;
    90140
     
    104154    static std::string get_available_languages();
    105155};
    106156
     157/// Class wrapping a reference counted stemming algorithm.
     158class XAPIAN_VISIBILITY_DEFAULT Stem {
     159  public:
     160    /// @private @internal Reference counted internals.
     161    Xapian::Internal::RefCntPtr<Xapian::BaseStem> internal;
     162
     163    /// Copy constructor.
     164    Stem(const Stem & o) : internal(o.internal) { }
     165
     166    /// Assignment.
     167    void operator=(const Stem & o) { internal = o.internal; }
     168
     169    /** Construct a Xapian::Stem object from a pointer to a BaseStem.
     170     */
     171    Stem(Xapian::Internal::RefCntPtr<Xapian::BaseStem> internal_)
     172            : internal(internal_) {}
     173
     174    /** Construct a Xapian::Stem object which doesn't change terms.
     175     */
     176    Stem() : internal(new Xapian::IdentityStem()) {}
     177
     178    /** Construct a Xapian::Stem object for a particular language.
     179     *
     180     *  This constructor is included for convenience, and is equivalent to
     181     *  Stem(new SnowballStem(language)) - except that a language parameter of
     182     *  "none" will produce a stemmer which leaves words unchanged (i.e. an
     183     *  IdentityStem).
     184     *
     185     *  See Xapian::SnowballStem for details.
     186     */
     187    explicit Stem(const std::string &language)
     188    {
     189        if (language == "none")
     190            internal = new Xapian::IdentityStem();
     191        else
     192            internal = new Xapian::SnowballStem(language);
     193    }
     194
     195    /** Stem a word.
     196     *
     197     *  @param word  a word to stem.
     198     *  @return      the stemmed form of the word.
     199     */
     200    std::string operator()(const std::string &word) const
     201    {
     202        return internal->operator()(word);
     203    }
     204
     205    /// Return a string describing this object.
     206    std::string get_description() const
     207    {
     208        return "Xapian::Stem(" + internal->get_description() + ")";
     209    }
     210
     211    /** Return a list of available languages.
     212     *
     213     *  This is included for convenience, and is equivalent to
     214     *  SnowballStem::get_available_languages().
     215     *
     216     *  See Xapian::SnowballStem for details.
     217     */
     218    static std::string get_available_languages()
     219    {
     220        return SnowballStem::get_available_languages();
     221    }
     222};
     223
    107224}
    108225
    109226#endif // XAPIAN_INCLUDED_STEM_H
  • languages/Makefile.mk

     
    5656        $(CC_FOR_BUILD) -o languages/snowball -DDISABLE_JAVA `for f in $(snowball_sources) ; do test -f $$f && echo $$f || echo $(srcdir)/$$f ; done`
    5757
    5858.sbl.cc:
    59         languages/snowball $< -o `echo $@|sed 's!\.cc$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::Internal
     59        languages/snowball $< -o `echo $@|sed 's!\.cc$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p SnowballStem::Internal
    6060
    6161.sbl.h:
    62         languages/snowball $< -o `echo $@|sed 's!\.h$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::Internal
     62        languages/snowball $< -o `echo $@|sed 's!\.h$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p SnowballStem::Internal
    6363
    6464languages/allsnowballheaders.h: languages/generate-allsnowballheaders languages/Makefile.mk
    6565        languages/generate-allsnowballheaders $(snowball_built_sources)
  • languages/steminternal.cc

     
    131131
    132132namespace Xapian {
    133133
    134 Stem::Internal::Internal()
     134SnowballStem::Internal::Internal()
    135135    : p(create_s()), c(0), l(0), lb(0), bra(0), ket(0)
    136136{
    137137}
    138138
    139 Stem::Internal::~Internal()
     139SnowballStem::Internal::~Internal()
    140140{
    141141    lose_s(p);
    142142}
    143143
    144144string
    145 Stem::Internal::operator()(const string & word)
     145SnowballStem::Internal::operator()(const string & word)
    146146{
    147147    const symbol * s = reinterpret_cast<const symbol *>(word.data());
    148148    replace_s(0, l, word.size(), s);
     
    156156
    157157/* Code for character groupings: utf8 cases */
    158158
    159 int Stem::Internal::get_utf8(int * slot) {
     159int SnowballStem::Internal::get_utf8(int * slot) {
    160160    int b0, b1;
    161161    int tmp = c;
    162162    if (tmp >= l) return 0;
     
    171171    * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
    172172}
    173173
    174 int Stem::Internal::get_b_utf8(int * slot) {
     174int SnowballStem::Internal::get_b_utf8(int * slot) {
    175175    int b0, b1;
    176176    int tmp = c;
    177177    if (tmp <= lb) return 0;
     
    186186    * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
    187187}
    188188
    189 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
     189int SnowballStem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
    190190    do {
    191191        int ch;
    192192        int w = get_utf8(&ch);
     
    198198    return 0;
    199199}
    200200
    201 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
     201int SnowballStem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
    202202    do {
    203203        int ch;
    204204        int w = get_b_utf8(&ch);
     
    210210    return 0;
    211211}
    212212
    213 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
     213int SnowballStem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
    214214    do {
    215215        int ch;
    216216        int w = get_utf8(&ch);
     
    222222    return 0;
    223223}
    224224
    225 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
     225int SnowballStem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
    226226    do {
    227227        int ch;
    228228        int w = get_b_utf8(&ch);
     
    234234    return 0;
    235235}
    236236
    237 int Stem::Internal::eq_s(int s_size, const symbol * s) {
     237int SnowballStem::Internal::eq_s(int s_size, const symbol * s) {
    238238    if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
    239239        return 0;
    240240    c += s_size;
    241241    return 1;
    242242}
    243243
    244 int Stem::Internal::eq_s_b(int s_size, const symbol * s) {
     244int SnowballStem::Internal::eq_s_b(int s_size, const symbol * s) {
    245245    if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
    246246        return 0;
    247247    c -= s_size;
    248248    return 1;
    249249}
    250250
    251 int Stem::Internal::find_among(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
     251int SnowballStem::Internal::find_among(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
    252252    int i = 0;
    253253    int j = v_size;
    254254
     
    302302}
    303303
    304304/* find_among_b is for backwards processing. Same comments apply */
    305 int Stem::Internal::find_among_b(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
     305int SnowballStem::Internal::find_among_b(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
    306306    int i = 0;
    307307    int j = v_size;
    308308
     
    351351}
    352352
    353353int
    354 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
     354SnowballStem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
    355355{
    356356    int adjustment;
    357357    int len;
     
    377377    return adjustment;
    378378}
    379379
    380 int Stem::Internal::slice_check() {
     380int SnowballStem::Internal::slice_check() {
    381381    Assert(p);
    382382    if (bra < 0 || bra > ket || ket > l) {
    383383#if 0
     
    389389    return 0;
    390390}
    391391
    392 int Stem::Internal::slice_from_s(int s_size, const symbol * s) {
     392int SnowballStem::Internal::slice_from_s(int s_size, const symbol * s) {
    393393    if (slice_check()) return -1;
    394394    replace_s(bra, ket, s_size, s);
    395395    return 0;
    396396}
    397397
    398 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
     398void SnowballStem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
    399399    int adjustment = replace_s(c_bra, c_ket, s_size, s);
    400400    if (c_bra <= bra) bra += adjustment;
    401401    if (c_bra <= ket) ket += adjustment;
    402402}
    403403
    404 symbol * Stem::Internal::slice_to(symbol * v) {
     404symbol * SnowballStem::Internal::slice_to(symbol * v) {
    405405    if (slice_check()) return NULL;
    406406    {
    407407        int len = ket - bra;
     
    414414    return v;
    415415}
    416416
    417 symbol * Stem::Internal::assign_to(symbol * v) {
     417symbol * SnowballStem::Internal::assign_to(symbol * v) {
    418418    int len = l;
    419419    if (CAPACITY(v) < len) {
    420420        v = increase_size(v, len);
     
    425425}
    426426
    427427#if 0
    428 void Stem::Internal::debug(int number, int line_count) {
     428void SnowballStem::Internal::debug(int number, int line_count) {
    429429    int i;
    430430    int limit = SIZE(p);
    431431    /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
  • languages/steminternal.h

     
    4141#define CAPACITY(P)    ((const int *)(const void *)(P))[-2]
    4242#define SET_CAPACITY(P, N) ((int *)(void *)(P))[-2] = N
    4343
    44 typedef int (*among_function)(Xapian::Stem::Internal *);
     44typedef int (*among_function)(Xapian::SnowballStem::Internal *);
    4545
    4646struct among {
    4747    int s_size;         /* length of search string (in symbols) */
     
    6060
    6161namespace Xapian {
    6262
    63 class Stem::Internal : public Xapian::Internal::RefCntBase {
     63class SnowballStem::Internal {
    6464    int slice_check();
    6565
    6666  protected:
  • languages/stem.cc

     
    3333
    3434namespace Xapian {
    3535
    36 Stem::Stem(const Stem & o) : internal(o.internal) { }
     36std::string
     37IdentityStem::operator()(const std::string &word) const
     38{
     39    return word;
     40}
    3741
    38 void
    39 Stem::operator=(const Stem & o)
     42std::string
     43IdentityStem::get_description() const
    4044{
    41     internal = o.internal;
     45    return "Xapian::IdentityStem";
    4246}
    4347
    44 Stem::Stem() : internal(0) { }
    4548
    46 Stem::Stem(const std::string &language) : internal(0) {
     49SnowballStem::SnowballStem(const std::string &language)
     50        : internal(0)
     51{
    4752    if (language.empty()) return;
    4853    switch (language[0]) {
    4954        case 'd':
     
    119124                internal = new InternalStemNorwegian;
    120125                return;
    121126            }
    122             if (language == "none") {
    123                 return;
    124             }
    125127            break;
    126128        case 'p':
    127129            if (language == "pt" || language == "portuguese") {
     
    163165    throw Xapian::InvalidArgumentError("Language code " + language + " unknown");
    164166}
    165167
    166 Stem::~Stem() { }
     168SnowballStem::~SnowballStem()
     169{
     170    delete internal;
     171}
    167172
    168173string
    169 Stem::operator()(const std::string &word) const
     174SnowballStem::operator()(const std::string &word) const
    170175{
    171     if (!internal.get() || word.empty()) return word;
     176    if (word.empty()) return word;
    172177    return internal->operator()(word);
    173178}
    174179
    175180string
    176 Stem::get_description() const
     181SnowballStem::get_description() const
    177182{
    178     string desc = "Xapian::Stem(";
    179     if (internal.get()) {
     183    string desc = "Xapian::SnowballStem(";
     184    if (internal) {
    180185        desc += internal->get_description();
    181186        desc += ')';
    182187    } else {
     
    186191}
    187192
    188193string
    189 Stem::get_available_languages()
     194SnowballStem::get_available_languages()
    190195{
    191196    return LANGSTRING;
    192197}
  • languages/compiler/generator.c

     
    14811481        if (q->type == t_routine && q->routine_called_from_among) {
    14821482            q->among_func_count = ++among_func_count;
    14831483            g->V[0] = q;
    1484             w(g, "static int t~V0(Xapian::Stem::Internal * this_ptr) {~N"
     1484            w(g, "static int t~V0(Xapian::SnowballStem::Internal * this_ptr) {~N"
    14851485                 "    return (static_cast<Xapian::~S0 *>(this_ptr))->~V0();~N"
    14861486                 "}~N"
    14871487                 "~N");