Ticket #186: patch
File patch, 15.4 KB (added by , 17 years ago) |
---|
-
include/xapian/stem.h
28 28 29 29 namespace Xapian { 30 30 31 /// Class representing a stemming algorithm. 32 class XAPIAN_VISIBILITY_DEFAULT Stem { 31 /// Base representing a stemming algorithm. 32 class XAPIAN_VISIBILITY_DEFAULT BaseStem : public Xapian::Internal::RefCntBase { 33 /// No copying allowed. 34 BaseStem(const BaseStem & o); 35 36 /// No assignment allowed. 37 void operator=(const BaseStem & o); 38 39 protected: 40 /** Destructor is protected since it should only be called by subclasses 41 * and RefCntPtr. 42 */ 43 virtual ~BaseStem() {} 44 45 friend class Xapian::Internal::RefCntPtr<BaseStem>; 46 33 47 public: 34 /// @private @internal Class representing the stemmer internals. 35 class Internal; 36 /// @private @internal Reference counted internals. 37 Xapian::Internal::RefCntPtr<Internal> internal; 48 BaseStem() {} 38 49 39 /// Copy constructor. 40 Stem(const Stem & o); 50 /** Stem a word. 51 * 52 * @param word a word to stem. 53 * @return the stemmed form of the word. 54 */ 55 virtual std::string operator()(const std::string &word) const = 0; 41 56 42 /// Assignment. 43 void operator=(const Stem & o); 57 /// Return a string describing this object. 58 virtual std::string get_description() const = 0; 59 }; 44 60 45 /** Construct a Xapian::Stem object which doesn't change terms. 61 /// A stemming algorithm which doesn't change words at all. 62 class XAPIAN_VISIBILITY_DEFAULT IdentityStem : public BaseStem { 63 /// No copying allowed. 64 IdentityStem(const IdentityStem & o); 65 66 /// No assignment allowed. 67 void operator=(const IdentityStem & o); 68 69 public: 70 IdentityStem() {} 71 72 /** Stem a word. 46 73 * 47 * Equivalent to Stem("none"). 74 * Since this stemmer doesn't change words, this simply returns the word. 75 * 76 * @param word a word to stem. 77 * @return the word supplied. 48 78 */ 49 Stem(); 79 std::string operator()(const std::string &word) const; 50 80 51 /** Construct a Xapian::Stem object for a particular language. 
81 /// Return a string describing this object. 82 std::string get_description() const; 83 }; 84 85 /// Class representing one of the snowball stemming algorithms. 86 class XAPIAN_VISIBILITY_DEFAULT SnowballStem : public BaseStem { 87 /// No copying allowed. 88 SnowballStem(const SnowballStem & o); 89 90 /// No assignment allowed. 91 void operator=(const SnowballStem & o); 92 93 public: 94 /// @private @internal Class representing the snowball stemmer internals. 95 class Internal; 96 97 private: 98 /// @private @internal Snowball stemmer internals. 99 Internal * internal; 100 101 public: 102 /** Construct a Xapian::SnowballStem object for a particular language. 52 103 * 53 104 * @param language Either the English name for the language 54 105 * or the two letter ISO639 code. … … 56 107 * The following language names are understood (aliases follow the 57 108 * name): 58 109 * 59 * - none - don't stem terms 60 110 * - danish (da) 61 111 * - dutch (nl) 62 112 * - english (en) - Martin Porter's 2002 revision of his stemmer … … 76 126 * @exception Xapian::InvalidArgumentError is thrown if 77 127 * language isn't recognised. 78 128 */ 79 explicit Stem(const std::string &language); 129 explicit SnowballStem(const std::string &language); 80 130 81 131 /// Destructor. 82 ~Stem(); 132 ~SnowballStem(); 83 133 84 134 /** Stem a word. 85 135 * 86 * @param word 87 * @return the stem 136 * @param word a word to stem. 137 * @return the stemmed form of the word. 88 138 */ 89 139 std::string operator()(const std::string &word) const; 90 140 … … 104 154 static std::string get_available_languages(); 105 155 }; 106 156 157 /// Class wrapping a reference counted stemming algorithm. 158 class XAPIAN_VISIBILITY_DEFAULT Stem { 159 public: 160 /// @private @internal Reference counted internals. 161 Xapian::Internal::RefCntPtr<Xapian::BaseStem> internal; 162 163 /// Copy constructor. 164 Stem(const Stem & o) : internal(o.internal) { } 165 166 /// Assignment. 
167 void operator=(const Stem & o) { internal = o.internal; } 168 169 /** Construct a Xapian::Stem object from a pointer to a BaseStem. 170 */ 171 Stem(Xapian::Internal::RefCntPtr<Xapian::BaseStem> internal_) 172 : internal(internal_) {} 173 174 /** Construct a Xapian::Stem object which doesn't change terms. 175 */ 176 Stem() : internal(new Xapian::IdentityStem()) {} 177 178 /** Construct a Xapian::Stem object for a particular language. 179 * 180 * This constructor is included for convenience, and is equivalent to 181 * Stem(new SnowballStem(language)) - except that a language parameter of 182 * "none" will produce a stemmer which doesn't remove any stems (ie, an 183 * IdentityStem stemmer). 184 * 185 * See Xapian::SnowballStem for details. 186 */ 187 explicit Stem(const std::string &language) 188 { 189 if (language == "none") 190 internal = new Xapian::IdentityStem(); 191 else 192 internal = new Xapian::SnowballStem(language); 193 } 194 195 /** Stem a word. 196 * 197 * @param word a word to stem. 198 * @return the stemmed form of the word. 199 */ 200 std::string operator()(const std::string &word) const 201 { 202 return internal->operator()(word); 203 } 204 205 /// Return a string describing this object. 206 std::string get_description() const 207 { 208 return "Xapian::Stem(" + internal->get_description() + ")"; 209 } 210 211 /** Return a list of available languages. 212 * 213 * This is included for convenience, and is equivalent to 214 * SnowballStem.get_available_languages(). 215 * 216 * See Xapian::SnowballStem for details. 217 */ 218 static std::string get_available_languages() 219 { 220 return SnowballStem::get_available_languages(); 221 } 222 }; 223 107 224 } 108 225 109 226 #endif // XAPIAN_INCLUDED_STEM_H -
languages/Makefile.mk
56 56 $(CC_FOR_BUILD) -o languages/snowball -DDISABLE_JAVA `for f in $(snowball_sources) ; do test -f $$f && echo $$f || echo $(srcdir)/$$f ; done` 57 57 58 58 .sbl.cc: 59 languages/snowball $< -o `echo $@|sed 's!\.cc$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::Internal 59 languages/snowball $< -o `echo $@|sed 's!\.cc$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p SnowballStem::Internal 60 60 61 61 .sbl.h: 62 languages/snowball $< -o `echo $@|sed 's!\.h$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p Stem::Internal 62 languages/snowball $< -o `echo $@|sed 's!\.h$$!!'` -c++ -u -n InternalStem`echo $<|sed 's!.*/\(.\).*!\1!'|tr a-z A-Z``echo $<|sed 's!.*/.!!;s!\.sbl!!'` -p SnowballStem::Internal 63 63 64 64 languages/allsnowballheaders.h: languages/generate-allsnowballheaders languages/Makefile.mk 65 65 languages/generate-allsnowballheaders $(snowball_built_sources) -
languages/steminternal.cc
131 131 132 132 namespace Xapian { 133 133 134 Stem::Internal::Internal() 134 SnowballStem::Internal::Internal() 135 135 : p(create_s()), c(0), l(0), lb(0), bra(0), ket(0) 136 136 { 137 137 } 138 138 139 Stem::Internal::~Internal() 139 SnowballStem::Internal::~Internal() 140 140 { 141 141 lose_s(p); 142 142 } 143 143 144 144 string 145 Stem::Internal::operator()(const string & word) 145 SnowballStem::Internal::operator()(const string & word) 146 146 { 147 147 const symbol * s = reinterpret_cast<const symbol *>(word.data()); 148 148 replace_s(0, l, word.size(), s); … … 156 156 157 157 /* Code for character groupings: utf8 cases */ 158 158 159 int Stem::Internal::get_utf8(int * slot) { 159 int SnowballStem::Internal::get_utf8(int * slot) { 160 160 int b0, b1; 161 161 int tmp = c; 162 162 if (tmp >= l) return 0; … … 171 171 * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3; 172 172 } 173 173 174 int Stem::Internal::get_b_utf8(int * slot) { 174 int SnowballStem::Internal::get_b_utf8(int * slot) { 175 175 int b0, b1; 176 176 int tmp = c; 177 177 if (tmp <= lb) return 0; … … 186 186 * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3; 187 187 } 188 188 189 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) { 189 int SnowballStem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) { 190 190 do { 191 191 int ch; 192 192 int w = get_utf8(&ch); … … 198 198 return 0; 199 199 } 200 200 201 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) { 201 int SnowballStem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) { 202 202 do { 203 203 int ch; 204 204 int w = get_b_utf8(&ch); … … 210 210 return 0; 211 211 } 212 212 213 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) { 213 int SnowballStem::Internal::out_grouping_U(const unsigned char * s, int min, int max, 
int repeat) { 214 214 do { 215 215 int ch; 216 216 int w = get_utf8(&ch); … … 222 222 return 0; 223 223 } 224 224 225 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) { 225 int SnowballStem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) { 226 226 do { 227 227 int ch; 228 228 int w = get_b_utf8(&ch); … … 234 234 return 0; 235 235 } 236 236 237 int Stem::Internal::eq_s(int s_size, const symbol * s) { 237 int SnowballStem::Internal::eq_s(int s_size, const symbol * s) { 238 238 if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0) 239 239 return 0; 240 240 c += s_size; 241 241 return 1; 242 242 } 243 243 244 int Stem::Internal::eq_s_b(int s_size, const symbol * s) { 244 int SnowballStem::Internal::eq_s_b(int s_size, const symbol * s) { 245 245 if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0) 246 246 return 0; 247 247 c -= s_size; 248 248 return 1; 249 249 } 250 250 251 int Stem::Internal::find_among(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) { 251 int SnowballStem::Internal::find_among(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) { 252 252 int i = 0; 253 253 int j = v_size; 254 254 … … 302 302 } 303 303 304 304 /* find_among_b is for backwards processing. 
Same comments apply */ 305 int Stem::Internal::find_among_b(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) { 305 int SnowballStem::Internal::find_among_b(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) { 306 306 int i = 0; 307 307 int j = v_size; 308 308 … … 351 351 } 352 352 353 353 int 354 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s) 354 SnowballStem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s) 355 355 { 356 356 int adjustment; 357 357 int len; … … 377 377 return adjustment; 378 378 } 379 379 380 int Stem::Internal::slice_check() { 380 int SnowballStem::Internal::slice_check() { 381 381 Assert(p); 382 382 if (bra < 0 || bra > ket || ket > l) { 383 383 #if 0 … … 389 389 return 0; 390 390 } 391 391 392 int Stem::Internal::slice_from_s(int s_size, const symbol * s) { 392 int SnowballStem::Internal::slice_from_s(int s_size, const symbol * s) { 393 393 if (slice_check()) return -1; 394 394 replace_s(bra, ket, s_size, s); 395 395 return 0; 396 396 } 397 397 398 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) { 398 void SnowballStem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) { 399 399 int adjustment = replace_s(c_bra, c_ket, s_size, s); 400 400 if (c_bra <= bra) bra += adjustment; 401 401 if (c_bra <= ket) ket += adjustment; 402 402 } 403 403 404 symbol * Stem::Internal::slice_to(symbol * v) { 404 symbol * SnowballStem::Internal::slice_to(symbol * v) { 405 405 if (slice_check()) return NULL; 406 406 { 407 407 int len = ket - bra; … … 414 414 return v; 415 415 } 416 416 417 symbol * Stem::Internal::assign_to(symbol * v) { 417 symbol * SnowballStem::Internal::assign_to(symbol * v) { 418 418 int len = l; 419 419 if (CAPACITY(v) < len) { 420 420 v = increase_size(v, len); … … 425 425 } 426 426 427 427 #if 0 428 void Stem::Internal::debug(int number, int line_count) 
{ 428 void SnowballStem::Internal::debug(int number, int line_count) { 429 429 int i; 430 430 int limit = SIZE(p); 431 431 /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/ -
languages/steminternal.h
41 41 #define CAPACITY(P) ((const int *)(const void *)(P))[-2] 42 42 #define SET_CAPACITY(P, N) ((int *)(void *)(P))[-2] = N 43 43 44 typedef int (*among_function)(Xapian::Stem::Internal *); 44 typedef int (*among_function)(Xapian::SnowballStem::Internal *); 45 45 46 46 struct among { 47 47 int s_size; /* length of search string (in symbols) */ … … 60 60 61 61 namespace Xapian { 62 62 63 class Stem::Internal : public Xapian::Internal::RefCntBase { 63 class SnowballStem::Internal { 64 64 int slice_check(); 65 65 66 66 protected: -
languages/stem.cc
33 33 34 34 namespace Xapian { 35 35 36 Stem::Stem(const Stem & o) : internal(o.internal) { } 36 std::string 37 IdentityStem::operator()(const std::string &word) const 38 { 39 return word; 40 } 37 41 38 void 39 Stem::operator=(const Stem & o) 42 std::string 43 IdentityStem::get_description() const 40 44 { 41 internal = o.internal; 45 return "Xapian::IdentityStem"; 42 46 } 43 47 44 Stem::Stem() : internal(0) { } 45 48 46 Stem::Stem(const std::string &language) : internal(0) { 49 SnowballStem::SnowballStem(const std::string &language) 50 : internal(0) 51 { 47 52 if (language.empty()) return; 48 53 switch (language[0]) { 49 54 case 'd': … … 119 124 internal = new InternalStemNorwegian; 120 125 return; 121 126 } 122 if (language == "none") { 123 return; 124 } 125 127 break; 126 128 case 'p': 127 129 if (language == "pt" || language == "portuguese") { … … 163 165 throw Xapian::InvalidArgumentError("Language code " + language + " unknown"); 164 166 } 165 167 166 Stem::~Stem() { } 168 SnowballStem::~SnowballStem() 169 { 170 delete internal; 171 } 167 172 168 173 string 169 Stem::operator()(const std::string &word) const 174 SnowballStem::operator()(const std::string &word) const 170 175 { 171 if (!internal.get() || word.empty()) return word; 176 if (word.empty()) return word; 172 177 return internal->operator()(word); 173 178 } 174 179 175 180 string 176 Stem::get_description() const 181 SnowballStem::get_description() const 177 182 { 178 string desc = "Xapian::Stem("; 179 if (internal.get()) { 183 string desc = "Xapian::SnowballStem("; 184 if (internal) { 180 185 desc += internal->get_description(); 181 186 desc += ')'; 182 187 } else { … … 186 191 } 187 192 188 193 string 189 Stem::get_available_languages() 194 SnowballStem::get_available_languages() 190 195 { 191 196 return LANGSTRING; 192 197 } -
languages/compiler/generator.c
1481 1481 if (q->type == t_routine && q->routine_called_from_among) { 1482 1482 q->among_func_count = ++among_func_count; 1483 1483 g->V[0] = q; 1484 w(g, "static int t~V0(Xapian::Stem::Internal * this_ptr) {~N" 1484 w(g, "static int t~V0(Xapian::SnowballStem::Internal * this_ptr) {~N" 1485 1485 " return (static_cast<Xapian::~S0 *>(this_ptr))->~V0();~N" 1486 1486 "}~N" 1487 1487 "~N");