Context Navigation

Back to Ticket #165

Ticket #165: patch

File patch, 16.7 KB (added by Richard Boulton, 18 years ago)
Updated implementation of fix

tests/queryparsertest.cc

 #include <xapian.h>
 #include <iostream>
 #include <string>
+#include <math.h>
 #include "utils.h"
 using namespace std;
 …
 static test test_value_range2_queries[] = {
     { "a..b", "VALUE_RANGE 3 a b" },
     { "1..12", "VALUE_RANGE 2 1 12" },
+    { "1..12", "VALUE_RANGE 2 \2044 \2047\200" },
     { "20070201..20070228", "VALUE_RANGE 1 20070201 20070228" },
     { "$10..20", "VALUE_RANGE 4 10 20" },
     { "$10..$20", "VALUE_RANGE 4 10 20" },
     { "12..42kg", "VALUE_RANGE 5 12 42" },
     { "12kg..42kg", "VALUE_RANGE 5 12 42" },
+    { "$10..20", "VALUE_RANGE 4 \2047@ \2048@" },
+    { "$10..$20", "VALUE_RANGE 4 \2047@ \2048@" },
+    { "12..42kg", "VALUE_RANGE 5 \2047\200 \2049P" },
+    { "12kg..42kg", "VALUE_RANGE 5 \2047\200 \2049P" },
     { "12kg..42", "VALUE_RANGE 3 12kg 42" },
     { "10..$20", "VALUE_RANGE 3 10 $20" },
     { "1999-03-12..2020-12-30", "VALUE_RANGE 1 19990312 20201230" },
 …
     { "12/03/99..12/04/01", "VALUE_RANGE 1 19990312 20010412" },
     { "03-12-99..04-14-01", "VALUE_RANGE 1 19990312 20010414" },
     { "(test:a..test:b hello)", "(hello:(pos=1) FILTER VALUE_RANGE 3 test:a test:b)" },
     { "12..42kg 5..6kg 1..12", "(VALUE_RANGE 2 1 12 AND (VALUE_RANGE 5 12 42 OR VALUE_RANGE 5 5 6))" },
+    { "12..42kg 5..6kg 1..12", "(VALUE_RANGE 2 \2044 \2047\200 AND (VALUE_RANGE 5 \2047\200 \2049P OR VALUE_RANGE 5 \2046@ \2046\200))" },
     { NULL, NULL }
 };
 …
     return true;
+}
 // Test NumberValueRangeProcessors with actual data..
+// Test NumberValueRangeProcessors with actual data.
 static bool test_qp_value_range3()
+{
     Xapian::WritableDatabase db(Xapian::InMemory::open());
     int low = 0;  // FIXME - should it work with negative numbers?
                   // If so, test it with some by setting low to -10
     int high = 9; // Currently the test passes if high is 9, but not if it is 10.
+    double low = -10;
+    double high = 20;
+    double step = 0.5;
     for (int i = low; i <= high; ++i) {
+    for (double i = low; i <= high; i += step) {
         Xapian::Document doc;
+        doc.add_value(1, om_tostring(i));
+        doc.add_value(1, Xapian::NumberValueRangeProcessor::float_to_string(i));
+        tout << "Value: " << i << " = " << Xapian::NumberValueRangeProcessor::float_to_string(i) << "\n";
         db.add_document(doc);
+    }
 …
     Xapian::QueryParser qp;
     qp.add_valuerangeprocessor(&vrp_num);
     for (int start = low; start <= high; ++start) {
         for (int end = low; end <= high; ++end) {
+    for (double start = low; start <= high; start += step) {
+        for (double end = low; end <= high; end += step) {
             string query = om_tostring(start) + ".." + om_tostring(end);
             tout << "Query: " << query << '\n';
             Xapian::Query qobj = qp.parse_query(query);
+            tout << "Qobj: " << qobj.get_description() << '\n';
             Xapian::Enquire enq(db);
             enq.set_query(qobj);
             Xapian::MSet mset = enq.get_mset(0, 1 + high - low);
+            Xapian::MSet mset = enq.get_mset(0, 1 + static_cast<int>(floor((high - low) / step)));
             if (end < start) {
                 TEST_EQUAL(mset.size(), 0);
             } else {
+                //TEST_EQUAL(mset.size(), 1u + end - start);
+                tout << "Expect " << mset.size() << " == " << (1u + (end - start) / step) << "\n";
+                TEST_EQUAL(mset.size(), 1u + (end - start) / step);
                 for (unsigned int j = 0; j != mset.size(); j++) {
                     TEST_EQUAL(mset[j].get_document().get_value(1),
                                om_tostring(static_cast<int>(j) + start));
+                               Xapian::NumberValueRangeProcessor::float_to_string(j * step + start));
+                }
+            }
+        }
 …
     return true;
+}
+static double test_value_range_numbers[] = {
+    -pow(2, 1022),
+    -1024.5,
+    -3.14159265358979323846,
+    -2,
+    -1.8,
+    -1.1,
+    -1,
+    -0.5,
+    -0.2,
+    -0.1,
+    -0.000005,
+    -0.000002,
+    -0.000001,
+    -pow(2, -1023),
+    -pow(2, -1024),
+    -pow(2, -1074),
+    0,
+    pow(2, -1074),
+    pow(2, -1024),
+    pow(2, -1023),
+    0.000001,
+    0.000002,
+    0.000005,
+    0.1,
+    0.2,
+    0.5,
+    1,
+    1.1,
+    1.8,
+    2,
+    3.14159265358979323846,
+    1024.5,
+    pow(2, 1022),
+    64 // Magic number which we stop at.
+};
+// Round-trip every entry of test_value_range_numbers through
+// float_to_string() and string_to_float(), and check that both the numbers
+// and their serialised forms are strictly increasing from one entry to the
+// next.
+static bool test_value_range_serialise1()
+{
+    // Entry handled on the previous iteration; only meaningful once
+    // have_prev is set.
+    double prevnum = 0;
+    string prevstr;
+    bool have_prev = false;
+    // Walk the table until the terminating value 64 is reached.
+    const double *entry = test_value_range_numbers;
+    while (*entry != 64) {
+        const double num = *entry;
+        ++entry;
+        tout << "Number: " << num << '\n';
+        const string str = Xapian::NumberValueRangeProcessor::float_to_string(num);
+        tout << "String: " << str << '\n';
+        // Unserialising must give back exactly the number we started with.
+        TEST_EQUAL(Xapian::NumberValueRangeProcessor::string_to_float(str), num);
+        if (have_prev) {
+            // Both the table itself and the serialised strings must ascend.
+            TEST_AND_EXPLAIN(prevnum < num, "Expected previous number (" <<
+                             prevnum << ") to be less than current number (" <<
+                             num << ")");
+            TEST_AND_EXPLAIN(prevstr < str, "Expected previous string (" <<
+                             prevstr << ") to be less than current string (" <<
+                             str << ")");
+        }
+        have_prev = true;
+        prevstr = str;
+        prevnum = num;
+    }
+    return true;
+}
 static test test_value_daterange1_queries[] = {
     { "12/03/99..12/04/01", "VALUE_RANGE 1 19991203 20011204" },
     { "03-12-99..04-14-01", "VALUE_RANGE 1 19990312 20010414" },
 …
     TESTCASE(qp_value_range2),
     TESTCASE(qp_value_range3),
     TESTCASE(qp_value_daterange1),
+    TESTCASE(value_range_serialise1),
     TESTCASE(qp_value_customrange1),
     TESTCASE(qp_stoplist1),
     END_OF_TESTCASES

include/xapian/queryparser.h

 /** Handle a number range.
+ *
+ *  This class currently has a design bug - a string comparison is used so the
+ *  numbers must be the same length for it to work, but you can't just zero
+ *  pad the values in the database because those from the query aren't.  We
+ *  therefore recommend that you avoid using this class at present.
+ *  This class requires that the stored values to which the range is applied
+ *  are numbers which have been converted to strings using its \a
+ *  float_to_string() method.  This method produces strings which will sort in
+ *  numeric order, so you can use it if you want to be able to sort based on
+ *  the value in numeric order, too.
  */
 class XAPIAN_VISIBILITY_DEFAULT NumberValueRangeProcessor : public ValueRangeProcessor {
     Xapian::valueno valno;
 …
     std::string str;
   public:
+    /** Constructor.
+     *
+     *  @param valno_   The value number to return from operator().
+     */
     NumberValueRangeProcessor(Xapian::valueno valno_)
         : valno(valno_), prefix(false) { }
+    /** Constructor.
+     *
+     *  @param valno_   The value number to return from operator().
+     *
+     *  @param str_     A string to look for to recognise values as belonging
+     *                  to this numeric range.
+     *
+     *  @param prefix_  Whether to look for the string at the start or end of
+     *                  the values.  If true, the string is a prefix; if
+     *                  false, the string is a suffix.
+     *
+     *  The string supplied in str_ is used by \a operator() to decide whether
+     *  the pair of strings supplied to it constitute a valid range.  If
+     *  prefix_ is true, the first value in a range must begin with str_ (and
+     *  the second value may also begin with str_, but this is not compulsory);
+     *  if prefix_ is false, the second value in a range must end with str_
+     *  (and the first value may also end with str_, but this is not
+     *  compulsory).
+     *
+     *  If str_ is empty, the setting of prefix_ is irrelevant, and no special
+     *  strings are required at the start or end of the strings defining the
+     *  range.
+     *
+     *  The remainder of both strings defining the endpoints must be valid
+     *  floating point numbers, as defined by the ANSI C standard library
+     *  function `strtod()`.
+     *
+     *  For example, if str_ is "$" and prefix_ is true, and the range
+     *  processor has been added to the queryparser, the queryparser will
+     *  accept "$10..50" or "$10..$50", but not "10..50" or "10..$50" as valid
+     *  ranges.  If str_ is "kg" and prefix_ is false, the queryparser will
+     *  accept "10..50kg" or "10kg..50kg", but not "10..50" or "10kg..50" as
+     *  valid ranges.
+     */
     NumberValueRangeProcessor(Xapian::valueno valno_, const std::string &str_,
                               bool prefix_ = true)
         : valno(valno_), prefix(prefix_), str(str_) { }
+    /** See if <begin>..<end> is a valid numeric value range.
+     *
+     *  If <begin>..<end> is a valid numeric value range, and has the
+     *  appropriate prefix or suffix (if specified) required for this
+     *  NumberValueRangeProcessor, this method returns the value number to
+     *  filter on, and sets begin and end to the appropriate serialised
+     *  values needed to delimit the range.  Otherwise it returns
+     *  Xapian::BAD_VALUENO.
+     */
     Xapian::valueno operator()(std::string &begin, std::string &end);
+    /** Convert a floating point number to a string, preserving sort order.
+     *
+     *  This method converts a floating point number to a string, suitable for
+     *  using as a value for numeric range restriction, or for use as a sort
+     *  key.
+     *
+     *  The conversion attempts to ensure that, for any pair of values supplied
+     *  to the conversion algorithm, the result of comparing the original
+     *  values (with a numeric comparison operator) will be the same as the
+     *  result of comparing the resulting values (with a string comparison
+     *  operator).  On platforms which represent doubles with the precisions
+     *  specified by IEEE_754, this will be the case: if the representation of
+     *  doubles is more precise, it is possible that two very close doubles
+     *  will be mapped to the same string, so will compare equal.
+     *
+     *  The conversion is platform independent.
+     */
+    static std::string float_to_string(double value);
+    /** Convert a string to a floating point number.
+     *
+     *  This expects the input to be a string produced by \a float_to_string().
+     *  If the input is not such a string, the value returned is undefined (but
+     *  no error will be thrown).
+     *
+     *  The result of the conversion will be exactly the value which was
+     *  supplied to \a float_to_string() when making the string on platforms
+     *  which represent doubles with the precisions specified by IEEE_754, but
+     *  may be a different (nearby) value on other platforms.
+     */
+    static double string_to_float(const std::string & value);
 };
 /// Build a Xapian::Query object from a user query string.

api/valuerangeproc.cc

 #include <xapian/queryparser.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string>
 #include "stringutils.h"
+#include "safeerrno.h"
+#include "omassert.h"
 using namespace std;
 …
     if (str.size()) {
         if (prefix) {
             // If there's a prefix, require it on the start.
+            // If there's a prefix, require it on the start of the range.
             if (!begins_with(begin, str)) {
                 // Prefix not given.
                 return Xapian::BAD_VALUENO;
+            }
             b_b = str.size();
             // But it's optional on the end, e.g. $10..50
+            // But it's optional on the end of the range, e.g. $10..50
             if (begins_with(end, str)) {
                 e_b = str.size();
+            }
         } else {
             // If there's a suffix, require it on the end.
+            // If there's a suffix, require it on the end of the range.
             if (!ends_with(end, str)) {
                 // Prefix not given.
+                // Suffix not given.
                 return Xapian::BAD_VALUENO;
+            }
             e_e = end.size() - str.size();
             // But it's optional on the start, e.g. 10..50kg
+            // But it's optional on the start of the range, e.g. 10..50kg
             if (ends_with(begin, str)) {
                 b_e = begin.size() - str.size();
+            }
+        }
+    }
-    if (begin.find_first_not_of("0123456789", b_b) != b_e)
-        // Not a number.
-        return Xapian::BAD_VALUENO;
-    if (end.find_first_not_of("0123456789", e_b) != e_e)
-        // Not a number.
-        return Xapian::BAD_VALUENO;
     // Adjust begin string if necessary.
     if (b_b)
         begin.erase(0, b_b);
 …
     else if (e_e != string::npos)
         end.resize(e_e);
+    // Parse the numbers to floating point.
+    double beginnum, endnum;
+    const char * startptr;
+    char * endptr;
+    errno = 0;
+    startptr = begin.c_str();
+    beginnum = strtod(startptr, &endptr);
+    if (endptr != startptr + begin.size())
+        // Invalid characters in string
+        return Xapian::BAD_VALUENO;
+    if (errno)
+        // Overflow or underflow
+        return Xapian::BAD_VALUENO;
+    errno = 0;
+    startptr = end.c_str();
+    endnum = strtod(startptr, &endptr);
+    if (endptr != startptr + end.size())
+        // Invalid characters in string
+        return Xapian::BAD_VALUENO;
+    if (errno)
+        // Overflow or underflow
+        return Xapian::BAD_VALUENO;
+    begin.assign(float_to_string(beginnum));
+    end.assign(float_to_string(endnum));
     return valno;
+}
+/** Serialise a double as a string intended to sort in numeric order.
+ *
+ *  Layout of the result before trailing zero bytes are removed:
+ *   - bytes 0-1: a sign flag in the top bit of byte 0 (set for non-negative
+ *     values) followed by 15 bits holding the frexp() exponent plus 1075;
+ *   - bytes 2-8: the mantissa, most significant byte first, with its
+ *     leading bit removed.
+ *  For negative values every byte is complemented, so that the sort order is
+ *  reversed.  Zero (including negative zero) is the single byte 0x80.
+ *
+ *  NOTE(review): infinities and NaN are not handled specially here; callers
+ *  should presumably not pass them - confirm.
+ *
+ *  @param value  The number to serialise.
+ *  @return       The serialised form of value.
+ */
+string
+Xapian::NumberValueRangeProcessor::float_to_string(double value)
+{
+    double mantissa;
+    int exponent;
+    mantissa = frexp(value, &exponent);
+    bool negative = false;
+    if (mantissa < 0) {
+        negative = true;
+        mantissa = -mantissa;
+    }
+    /* IEEE representation of doubles uses 11 bits for the exponent, with a
+     * bias of 1023.  There's then another 52 bits in the mantissa, so we need
+     * to add 1075 to be sure that the exponent won't be negative.  Even then,
+     * we check that the exponent isn't negative, and consider the value to be
+     * equal to zero if it is, to be safe on architectures which use a
+     * different representation.
+     */
+    exponent += 1075;
+    if (exponent < 0 || mantissa == 0) {
+        /* Zero, or (which can't happen on most architectures) a value too
+         * small to represent.  The general code below assumes a mantissa in
+         * the range .5 <= m < 1, so return the encoding of zero directly:
+         * the sign byte for a non-negative number with a zero exponent,
+         * with all the trailing zero bytes already removed.
+         */
+        return string(1, '\x80');
+    }
+    // First, store the exponent, as two bytes
+    // Top bit of first byte is a sign bit.
+    // If the sign bit is set, number is positive.
+    // If the sign bit is unset, number is negative.
+    // For negative numbers, we invert the bytes, so that the sort order
+    // is reversed (so that larger negative numbers come first).
+    //
+    // The exponent has to fit in the 15 bits left after the sign bit.
+    Assert(exponent >= 0);
+    Assert(exponent < 0x8000);
+    int n = (exponent & 0x7f00) >> 8;
+    string digits;
+    digits.push_back(static_cast<char>(negative ? 127 - n : 128 + n));
+    n = exponent & 0xff;
+    digits.push_back(static_cast<char>(negative ? 255 - n : n));
+    // Now, store the mantissa, in 7 bytes.
+    // For negative numbers, we invert the bytes, as for the exponent.
+    // Mantissa is in range .5 <= m < 1.
+    //
+    // Therefore, we first multiply by 512 and subtract 256, to get the first
+    // byte.  For subsequent bytes, we multiply by 256.
+    mantissa = mantissa * 512 - 256;
+    Assert(mantissa >= 0);
+    Assert(mantissa < 256);
+    for (int i = 0; i != 7; ++i) {
+        n = static_cast<int>(floor(mantissa));
+        digits.push_back(static_cast<char>(negative ? 255 - n : n));
+        mantissa -= n;
+        Assert(mantissa >= 0);
+        Assert(mantissa < 1.0);
+        mantissa *= 256;
+    }
+    // Finally, we can chop off any trailing zeros.
+    string::size_type len = digits.size();
+    while (len > 0 && digits[len - 1] == '\0') {
+        --len;
+    }
+    digits.resize(len);
+    return digits;
+}
+/// Return the character at offset pos of str as an unsigned value, or 0 if
+/// str is too short to have a character at that offset.
+static inline unsigned int
+numfromstr(const std::string & str, std::string::size_type pos)
+{
+    if (pos >= str.size())
+        return 0;
+    return static_cast<unsigned char>(str[pos]);
+}
+/** Rebuild a double from the first nine bytes of a serialised string.
+ *
+ *  Bytes missing from the end of the string are read as 0.  A first byte
+ *  below 128 marks a negative number, in which case every byte is
+ *  complemented before use.  Bytes 0 and 1 give the exponent (offset by
+ *  1075) and bytes 2 to 8 give the mantissa, most significant byte first.
+ *
+ *  @param value  The serialised string to decode.
+ *  @return       The decoded number.
+ */
+double
+Xapian::NumberValueRangeProcessor::string_to_float(const std::string & value)
+{
+    // Byte 0 carries the sign flag plus the top seven bits of the exponent;
+    // byte 1 carries the low eight bits.
+    unsigned int byte = numfromstr(value, 0);
+    const bool negative = (byte < 128);
+    int exponent = (negative ? 127 - byte : byte - 128) << 8;
+    byte = numfromstr(value, 1);
+    exponent += negative ? 255 - byte : byte;
+    exponent -= 1075;
+    // Add up the mantissa bytes starting from the least significant one
+    // (byte 8) and working back to byte 3.
+    double mantissa = 0;
+    for (int pos = 8; pos > 2; --pos) {
+        byte = numfromstr(value, pos);
+        const double byteval(negative ? 255 - byte : byte);
+        mantissa += ldexp(byteval, 8 * (1 - pos) - 1);
+    }
+    // Byte 2 is the most significant mantissa byte; adding 256 to it puts
+    // back the leading bit, which is not stored in the string.
+    byte = numfromstr(value, 2);
+    if (negative)
+        byte = 255 - byte;
+    mantissa += ldexp(static_cast<double>(byte + 256), -9);
+    const double magnitude = ldexp(mantissa, exponent);
+    return negative ? -magnitude : magnitude;
+}

Download in other formats:

Original Format