Ticket #165: patch

File patch, 16.7 KB (added by Richard Boulton, 17 years ago)

Updated implementation of fix

  • tests/queryparsertest.cc

     
    2222#include <xapian.h>
    2323#include <iostream>
    2424#include <string>
     25#include <math.h>
    2526#include "utils.h"
    2627
    2728using namespace std;
     
    10401041
    10411042static test test_value_range2_queries[] = {
    10421043    { "a..b", "VALUE_RANGE 3 a b" },
    1043     { "1..12", "VALUE_RANGE 2 1 12" },
     1044    { "1..12", "VALUE_RANGE 2 \2044 \2047\200" },
    10441045    { "20070201..20070228", "VALUE_RANGE 1 20070201 20070228" },
    1045     { "$10..20", "VALUE_RANGE 4 10 20" },
    1046     { "$10..$20", "VALUE_RANGE 4 10 20" },
    1047     { "12..42kg", "VALUE_RANGE 5 12 42" },
    1048     { "12kg..42kg", "VALUE_RANGE 5 12 42" },
     1046    { "$10..20", "VALUE_RANGE 4 \2047@ \2048@" },
     1047    { "$10..$20", "VALUE_RANGE 4 \2047@ \2048@" },
     1048    { "12..42kg", "VALUE_RANGE 5 \2047\200 \2049P" },
     1049    { "12kg..42kg", "VALUE_RANGE 5 \2047\200 \2049P" },
    10491050    { "12kg..42", "VALUE_RANGE 3 12kg 42" },
    10501051    { "10..$20", "VALUE_RANGE 3 10 $20" },
    10511052    { "1999-03-12..2020-12-30", "VALUE_RANGE 1 19990312 20201230" },
     
    10541055    { "12/03/99..12/04/01", "VALUE_RANGE 1 19990312 20010412" },
    10551056    { "03-12-99..04-14-01", "VALUE_RANGE 1 19990312 20010414" },
    10561057    { "(test:a..test:b hello)", "(hello:(pos=1) FILTER VALUE_RANGE 3 test:a test:b)" },
    1057     { "12..42kg 5..6kg 1..12", "(VALUE_RANGE 2 1 12 AND (VALUE_RANGE 5 12 42 OR VALUE_RANGE 5 5 6))" },
     1058    { "12..42kg 5..6kg 1..12", "(VALUE_RANGE 2 \2044 \2047\200 AND (VALUE_RANGE 5 \2047\200 \2049P OR VALUE_RANGE 5 \2046@ \2046\200))" },
    10581059    { NULL, NULL }
    10591060};
    10601061
     
    10961097    return true;
    10971098}
    10981099
    1099 // Test NumberValueRangeProcessors with actual data..
     1100// Test NumberValueRangeProcessors with actual data.
    11001101static bool test_qp_value_range3()
    11011102{
    11021103    Xapian::WritableDatabase db(Xapian::InMemory::open());
    1103     int low = 0;  // FIXME - should it work with negative numbers?
    1104                   // If so, test it with some by setting low to -10
    1105     int high = 9; // Currently the test passes if high is 9, but not if it is 10.
     1104    double low = -10;
     1105    double high = 20;
     1106    double step = 0.5;
    11061107
    1107     for (int i = low; i <= high; ++i) {
     1108    for (double i = low; i <= high; i += step) {
    11081109        Xapian::Document doc;
    1109         doc.add_value(1, om_tostring(i));
     1110        doc.add_value(1, Xapian::NumberValueRangeProcessor::float_to_string(i));
     1111        tout << "Value: " << i << " = " << Xapian::NumberValueRangeProcessor::float_to_string(i) << "\n";
    11101112        db.add_document(doc);
    11111113    }
    11121114
     
    11141116    Xapian::QueryParser qp;
    11151117    qp.add_valuerangeprocessor(&vrp_num);
    11161118
    1117     for (int start = low; start <= high; ++start) {
    1118         for (int end = low; end <= high; ++end) {
     1119    for (double start = low; start <= high; start += step) {
     1120        for (double end = low; end <= high; end += step) {
    11191121            string query = om_tostring(start) + ".." + om_tostring(end);
    11201122            tout << "Query: " << query << '\n';
    11211123            Xapian::Query qobj = qp.parse_query(query);
     1124            tout << "Qobj: " << qobj.get_description() << '\n';
    11221125            Xapian::Enquire enq(db);
    11231126            enq.set_query(qobj);
    1124             Xapian::MSet mset = enq.get_mset(0, 1 + high - low);
     1127            Xapian::MSet mset = enq.get_mset(0, 1 + static_cast<int>(floor((high - low) / step)));
    11251128            if (end < start) {
    11261129                TEST_EQUAL(mset.size(), 0);
    11271130            } else {
    1128                 //TEST_EQUAL(mset.size(), 1u + end - start);
     1131                tout << "Expect " << mset.size() << " == " << (1u + (end - start) / step) << "\n";
     1132                TEST_EQUAL(mset.size(), 1u + (end - start) / step);
    11291133                for (unsigned int j = 0; j != mset.size(); j++) {
    11301134                    TEST_EQUAL(mset[j].get_document().get_value(1),
    1131                                om_tostring(static_cast<int>(j) + start));
     1135                               Xapian::NumberValueRangeProcessor::float_to_string(j * step + start));
    11321136                }
    11331137            }
    11341138        }
     
    11361140    return true;
    11371141}
    11381142
     1143static double test_value_range_numbers[] = {
     1144    -pow(2, 1022),
     1145    -1024.5,
     1146    -3.14159265358979323846,
     1147    -2,
     1148    -1.8,
     1149    -1.1,
     1150    -1,
     1151    -0.5,
     1152    -0.2,
     1153    -0.1,
     1154    -0.000005,
     1155    -0.000002,
     1156    -0.000001,
     1157    -pow(2, -1023),
     1158    -pow(2, -1024),
     1159    -pow(2, -1074),
     1160    0,
     1161    pow(2, -1074),
     1162    pow(2, -1024),
     1163    pow(2, -1023),
     1164    0.000001,
     1165    0.000002,
     1166    0.000005,
     1167    0.1,
     1168    0.2,
     1169    0.5,
     1170    1,
     1171    1.1,
     1172    1.8,
     1173    2,
     1174    3.14159265358979323846,
     1175    1024.5,
     1176    pow(2, 1022),
     1177
     1178    64 // Magic number which we stop at.
     1179};
     1180
     1181// Test serialisation and unserialisation of various numbers.
     1182static bool test_value_range_serialise1()
     1183{
     1184    double prevnum = 0;
     1185    string prevstr = "";
     1186    bool started = false;
     1187    for (double *p = test_value_range_numbers; *p != 64; ++p) {
     1188        double num = *p;
     1189        tout << "Number: " << num << '\n';
     1190        string str = Xapian::NumberValueRangeProcessor::float_to_string(num);
     1191        tout << "String: " << str << '\n';
     1192        TEST_EQUAL(Xapian::NumberValueRangeProcessor::string_to_float(str), num);
     1193
     1194        if (started) {
     1195            TEST_AND_EXPLAIN(prevnum < num, "Expected previous number (" <<
     1196                             prevnum << ") to be less than current number (" <<
     1197                             num << ")");
     1198            TEST_AND_EXPLAIN(prevstr < str, "Expected previous string (" <<
     1199                             prevstr << ") to be less than current string (" <<
     1200                             str << ")");
     1201        }
     1202
     1203        prevnum = num;
     1204        prevstr = str;
     1205        started = true;
     1206    }
     1207    return true;
     1208}
     1209
    11391210static test test_value_daterange1_queries[] = {
    11401211    { "12/03/99..12/04/01", "VALUE_RANGE 1 19991203 20011204" },
    11411212    { "03-12-99..04-14-01", "VALUE_RANGE 1 19990312 20010414" },
     
    12721343    TESTCASE(qp_value_range2),
    12731344    TESTCASE(qp_value_range3),
    12741345    TESTCASE(qp_value_daterange1),
     1346    TESTCASE(value_range_serialise1),
    12751347    TESTCASE(qp_value_customrange1),
    12761348    TESTCASE(qp_stoplist1),
    12771349    END_OF_TESTCASES
  • include/xapian/queryparser.h

     
    152152
    153153/** Handle a number range.
    154154 *
    155  *  This class currently has a design bug - a string comparison is used so the
    156  *  numbers must be the same length for it to work, but you can't just zero
    157  *  pad the values in the database because those from the query aren't.  We
    158  *  therefore recommend that you avoid using this class at present.
     155 *  This class requires that the values stored which the range is being applied
     156 *  to are numbers which have been converted to strings using its \a
     157 *  float_to_string() method.  This method produces strings which will sort in
     158 *  numeric order, so you can use it if you want to be able to sort based on
     159 *  the value in numeric order, too.
    159160 */
    160161class XAPIAN_VISIBILITY_DEFAULT NumberValueRangeProcessor : public ValueRangeProcessor {
    161162    Xapian::valueno valno;
     
    163164    std::string str;
    164165
    165166  public:
     167    /** Constructor.
     168     *
     169     *  @param valno_   The value number to return from operator().
     170     */
    166171    NumberValueRangeProcessor(Xapian::valueno valno_)
    167172        : valno(valno_), prefix(false) { }
    168173
     174    /** Constructor.
     175     *
     176     *  @param valno_   The value number to return from operator().
     177     *
     178     *  @param str_     A string to look for to recognise values as belonging
     179     *                  to this numeric range.
     180     *
     181     *  @param prefix_  Whether to look for the string at the start or end of
     182     *                  the values.  If true, the string is a prefix; if
     183     *                  false, the string is a suffix.
     184     *
     185     *  The string supplied in str_ is used by \a operator() to decide whether
     186     *  the pair of strings supplied to it constitute a valid range.  If
     187     *  prefix_ is true, the first value in a range must begin with str_ (and
     188     *  the second value may also begin with str_, but this is not compulsory);
     189     *  if prefix_ is false, the second value in a range must end with str_
     190     *  (and the first value may also end with str_, but this is not
     191     *  compulsory).
     192     *
     193     *  If str_ is empty, the setting of prefix_ is irrelevant, and no special
     194     *  strings are required at the start or end of the strings defining the
     195     *  range.
     196     *
     197     *  The remainder of both strings defining the endpoints must be valid
     198     *  floating point numbers, as defined by the ANSI C standard library
     199     *  function `strtod()`.
     200     *
     201     *  For example, if str_ is "$" and prefix_ is true, and the range
     202     *  processor has been added to the queryparser, the queryparser will
     203     *  accept "$10..50" or "$10..50", but not "10..50" or "10..$50" as valid
     204     *  ranges.  If str_ is "kg" and prefix_ is false, the queryparser will
     205     *  accept "10..50kg" or "10kg..50kg", but not "10..50" or "10kg..50" as
     206     *  valid ranges.
     207     */
    169208    NumberValueRangeProcessor(Xapian::valueno valno_, const std::string &str_,
    170209                              bool prefix_ = true)
    171210        : valno(valno_), prefix(prefix_), str(str_) { }
    172211
     212    /** See if <begin>..<end> is a valid numeric value range.
     213     *
     214     *  If <begin>..<end> is a valid numeric value range, and has the
     215     *  appropriate prefix or suffix (if specified) required for this
     216     *  NumberValueRangeProcessor, this method returns the value number of
     217     *  range filter on, and sets begin and end to the appropriate serialised
     218     *  values needed to delimit the range.  Otherwise it returns
     219     *  Xapian::BAD_VALUENO.
     220     */
    173221    Xapian::valueno operator()(std::string &begin, std::string &end);
     222
     223    /** Convert a floating point number to a string, preserving sort order.
     224     *
     225     *  This method converts a floating point number to a string, suitable for
     226     *  using as a value for numeric range restriction, or for use as a sort
     227     *  key.
     228     *
     229     *  The conversion attempts to ensure that, for any pair of values supplied
     230     *  to the conversion algorithm, the result of comparing the original
     231     *  values (with a numeric comparison operator) will be the same as the
     232     *  result of comparing the resulting values (with a string comparison
     233     *  operator).  On platforms which represent doubles with the precisions
     234     *  specified by IEEE_754, this will be the case: if the representation of
     235     *  doubles is more precise, it is possible that two very close doubles
     236     *  will be mapped to the same string, so will compare equal.
     237     *
     238     *  The conversion is platform independent.
     239     */
     240    static std::string float_to_string(double value);
     241
     242    /** Convert a string to a floating point number.
     243     *
     244     *  This expects the input to be a string produced by \a float_to_string().
     245     *  If the input is not such a string, the value returned is undefined (but
     246     *  no error will be thrown).
     247     *
     248     *  The result of the conversion will be exactly the value which was
     249     *  supplied to \a string_to_float() when making the string on platforms
     250     *  which represent doubles with the precisions specified by IEEE_754, but
     251     *  may be a different (nearby) value on other platforms.
     252     */
     253    static double string_to_float(const std::string & value);
     254
    174255};
    175256
    176257/// Build a Xapian::Query object from a user query string.
  • api/valuerangeproc.cc

     
    2222
    2323#include <xapian/queryparser.h>
    2424
     25#include <math.h>
    2526#include <stdio.h>
    2627#include <stdlib.h>
    2728
    2829#include <string>
    2930#include "stringutils.h"
     31#include "safeerrno.h"
     32#include "omassert.h"
    3033
    3134using namespace std;
    3235
     
    151154
    152155    if (str.size()) {
    153156        if (prefix) {
    154             // If there's a prefix, require it on the start.
     157            // If there's a prefix, require it on the start of the range.
    155158            if (!begins_with(begin, str)) {
    156159                // Prefix not given.
    157160                return Xapian::BAD_VALUENO;
    158161            }
    159162            b_b = str.size();
    160             // But it's optional on the end, e.g. $10..50
     163            // But it's optional on the end of the range, e.g. $10..50
    161164            if (begins_with(end, str)) {
    162165                e_b = str.size();
    163166            }
    164167        } else {
    165             // If there's a suffix, require it on the end.
     168            // If there's a suffix, require it on the end of the range.
    166169            if (!ends_with(end, str)) {
    167                 // Prefix not given.
     170                // Suffix not given.
    168171                return Xapian::BAD_VALUENO;
    169172            }
    170173            e_e = end.size() - str.size();
    171             // But it's optional on the start, e.g. 10..50kg
     174            // But it's optional on the start of the range, e.g. 10..50kg
    172175            if (ends_with(begin, str)) {
    173176                b_e = begin.size() - str.size();
    174177            }
    175178        }
    176179    }
    177180
    178     if (begin.find_first_not_of("0123456789", b_b) != b_e)
    179         // Not a number.
    180         return Xapian::BAD_VALUENO;
    181 
    182     if (end.find_first_not_of("0123456789", e_b) != e_e)
    183         // Not a number.
    184         return Xapian::BAD_VALUENO;
    185 
    186181    // Adjust begin string if necessary.
    187182    if (b_b)
    188183        begin.erase(0, b_b);
     
    195190    else if (e_e != string::npos)
    196191        end.resize(e_e);
    197192
     193
     194    // Parse the numbers to floating point.
     195    double beginnum, endnum;
     196    const char * startptr;
     197    char * endptr;
     198
     199    errno = 0;
     200    startptr = begin.c_str();
     201    beginnum = strtod(startptr, &endptr);
     202    if (endptr != startptr + begin.size())
     203        // Invalid characters in string
     204        return Xapian::BAD_VALUENO;
     205    if (errno)
     206        // Overflow or underflow
     207        return Xapian::BAD_VALUENO;
     208
     209    errno = 0;
     210    startptr = end.c_str();
     211    endnum = strtod(startptr, &endptr);
     212    if (endptr != startptr + end.size())
     213        // Invalid characters in string
     214        return Xapian::BAD_VALUENO;
     215    if (errno)
     216        // Overflow or underflow
     217        return Xapian::BAD_VALUENO;
     218
     219    begin.assign(float_to_string(beginnum));
     220    end.assign(float_to_string(endnum));
     221
    198222    return valno;
    199223}
     224
     225string
     226Xapian::NumberValueRangeProcessor::float_to_string(double value)
     227{
     228    double mantissa;
     229    int exponent;
     230
     231    mantissa = frexp(value, &exponent);
     232
     233    bool negative = false;
     234    if (mantissa < 0) {
     235        negative = true;
     236        mantissa = -mantissa;
     237    }
     238
     239    /* IEEE representation of doubles uses 11 bits for the exponent, with a
     240     * bias of 1023.  There's then another 52 bits in the mantissa, so we need
     241     * to add 1075 to be sure that the exponent won't be negative.  Even then,
     242     * we check that the exponent isn't negative, and consider the value to be
     243     * equal to zero if it is, to be safe on architectures which use a
     244     * different representation.
     245     */
     246    exponent += 1075;
     247    if (exponent < 0) {
     248        /* Note - this can't happen on most architectures. */
     249        exponent = 0;
     250        mantissa = 0;
     251        negative = false;
     252    } else if (mantissa == 0) {
     253        exponent = 0;
     254    }
     255
     256    // First, store the exponent, as two bytes
     257    // Top bit of first byte is a sign bit.
     258    // If the sign bit is set, number is positive.
     259    // If the sign bit is unset, number is negative.
     260    // For negative numbers, we invert the bytes, so that the sort order
     261    // is reversed (so that larger negative numbers come first).
     262    int n = (exponent & 0x7f00) >> 8;
     263    Assert(exponent >= 0);
     264    Assert(exponent < 128);
     265    string digits;
     266    digits.push_back(negative ? 127 - n : 128 + n);
     267
     268    n = exponent & 0xff;
     269    digits.push_back(negative ? 255 - n: n);
     270
     271    // Now, store the mantissa, in 7 bytes.
     272    // For negative numbers, we invert the bytes, as for the exponent.
     273    // Mantissa is in range .5 <= m < 1.
     274    //
     275    // Therefore, we first multiply by 512 and subtract 256, to get the first
     276    // byte.  For subsequent bytes, we multiply by 256.
     277    mantissa = mantissa * 512 - 256;
     278    Assert(mantissa >= 0);
     279    Assert(mantissa < 256);
     280    int i;
     281    for (i = 0; i != 7; ++i) {
     282        n = static_cast<int>(floor(mantissa));
     283        digits.push_back(negative ? 255 - n : n);
     284        mantissa -= n;
     285        Assert(mantissa >= 0);
     286        Assert(mantissa < 1.0);
     287        mantissa *= 256;
     288    }
     289
     290    // Finally, we can chop off any trailing zeros.
     291    i = digits.size();
     292    while (i > 0 && digits[i - 1] == '\0') {
     293        i--;
     294    }
     295    digits.resize(i);
     296
     297    return digits;
     298}
     299
     300/// Get a number from the character at a given position in a string, returning
     301/// 0 if the string isn't long enough.
     302static inline unsigned int
     303numfromstr(const std::string & str, std::string::size_type pos)
     304{
     305    return (str.size() > pos) ? static_cast<unsigned char>(str[pos]) : 0;
     306}
     307
     308double
     309Xapian::NumberValueRangeProcessor::string_to_float(const std::string & value)
     310{
     311    // Read the exponent
     312    unsigned int n = numfromstr(value, 0);
     313    bool negative = (n < 128);
     314    int exponent = (negative ? 127 - n : n - 128) << 8;
     315    n = numfromstr(value, 1);
     316    exponent += negative ? 255 - n : n;
     317    exponent -= 1075;
     318
     319    // Read the mantissa
     320    double mantissa = 0;
     321
     322    for (int i = 8; i != 2; --i)
     323    {
     324        n = numfromstr(value, i);
     325        double byteval(negative ? 255 - n : n);
     326        mantissa += ldexp(byteval, 8 * (1 - i) - 1);
     327    }
     328
     329    n = numfromstr(value, 2);
     330    if (negative) n = 255 - n;
     331    n += 256;
     332    mantissa += ldexp(n, -9);
     333
     334    return (negative ? -1 : 1) * ldexp(mantissa, exponent);
     335}