root / tags / 1.0.8 / xapian-core / tests / api_unicode.cc

Revision 10148, 4.9 kB (checked in by olly, 10 months ago)

Backport change from trunk:
include/xapian/unicode.h: Add Unicode::toupper() to complement
Unicode::tolower().
tests/api_unicode.cc: Add caseconvert1 testcase to test
Unicode::tolower() and Unicode::toupper().

Line 
1/** @file api_unicode.cc
2 * @brief test the Unicode and UTF-8 classes and functions.
3 */
4/* Copyright (C) 2006,2007,2008 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19 */
20
21#include <config.h>
22
23#include "api_unicode.h"
24
25#include <xapian.h>
26
27#include "apitest.h"
28#include "testutils.h"
29
30#include <cctype>
31
32using namespace std;
33
// A test vector for utf8iterator1: two NUL-terminated byte strings which
// should produce the same sequence of values when iterated with
// Xapian::Utf8Iterator.
struct testcase {
    const char * a;
    const char * b;
};

// Test vectors for utf8iterator1.  The list is terminated by an entry whose
// members are both null pointers.
static const testcase testcases[] = {
    // Sanity check!
    { "abcd", "abcd" },

    // A lone 0x80 byte versus the two byte sequence 0xc2 0x80.  The string
    // literals are split after the escape so that "bcd" isn't parsed as
    // further hex digits.
    { "a\x80""bcd", "a\xc2\x80""bcd" },

    // A lone 0xa0 byte versus the two byte sequence 0xc2 0xa0.
    { "a\xa0", "a\xc2\xa0" },

    // End of list marker.
    { 0, 0 }
};
44
45// Test handling of invalid UTF-8 is as desired.
46DEFINE_TESTCASE(utf8iterator1,!backend) {
47    const testcase * p;
48    for (p = testcases; p->a; ++p) {
49        tout << '"' << p->a << "\" and \"" << p->b << '"' << endl;
50        size_t a_len = strlen(p->a);
51        Xapian::Utf8Iterator a(p->a, a_len);
52
53        size_t b_len = strlen(p->b);
54        Xapian::Utf8Iterator b(p->b, b_len);
55
56        while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
57            TEST_EQUAL(*a, *b);
58            ++a;
59            ++b;
60        }
61
62        // Test that we don't reach the end of one before the other.
63        TEST(a == Xapian::Utf8Iterator());
64        TEST(b == Xapian::Utf8Iterator());
65    }
66    return true;
67}
68
// A test vector for utf8iterator2: a NUL-terminated byte string and the
// single value which iterating over it should yield.
struct testcase2 {
    const char * a;
    unsigned long n;
};

// Test vectors for utf8iterator2.  The list is terminated by an entry whose
// string member is a null pointer.
static const testcase2 testcases2[] = {
    // Plain ASCII.
    { "a", 0x61 },

    // Single bytes with the top bit set give that byte's value.
    { "\x80", 0x80 },
    { "\xa0", 0xa0 },

    // Two byte sequences giving the same values as the lone bytes above.
    { "\xc2\x80", 0x80 },
    { "\xc2\xa0", 0xa0 },

    // Four byte sequence.
    { "\xf0\xa8\xa8\x8f", 0x28a0f },

    // End of list marker.
    { 0, 0 }
};
83
84// Test decoding of UTF-8.
85DEFINE_TESTCASE(utf8iterator2,!backend) {
86    const testcase2 * p;
87    for (p = testcases2; p->a; ++p) {
88        Xapian::Utf8Iterator a(p->a, strlen(p->a));
89
90        TEST(a != Xapian::Utf8Iterator());
91        TEST_EQUAL(*a, p->n);
92        TEST(++a == Xapian::Utf8Iterator());
93    }
94    return true;
95}
96
97// Test Unicode categorisation.
98DEFINE_TESTCASE(unicode1,!backend) {
99    using namespace Xapian;
100    TEST_EQUAL(Unicode::get_category('a'), Unicode::LOWERCASE_LETTER);
101    TEST_EQUAL(Unicode::get_category('0'), Unicode::DECIMAL_DIGIT_NUMBER);
102    TEST_EQUAL(Unicode::get_category('$'), Unicode::CURRENCY_SYMBOL);
103    TEST_EQUAL(Unicode::get_category(0xa3), Unicode::CURRENCY_SYMBOL);
104    // U+0242 was added in Unicode 5.0.0.
105    TEST_EQUAL(Unicode::get_category(0x242), Unicode::LOWERCASE_LETTER);
106    TEST_EQUAL(Unicode::get_category(0xFFFF), Unicode::UNASSIGNED);
107    // Test characters outside BMP.
108    TEST_EQUAL(Unicode::get_category(0x10345), Unicode::OTHER_LETTER);
109    TEST_EQUAL(Unicode::get_category(0x10FFFD), Unicode::PRIVATE_USE);
110    TEST_EQUAL(Unicode::get_category(0x10FFFF), Unicode::UNASSIGNED);
111    // Test some invalid Unicode values.
112    TEST_EQUAL(Unicode::get_category(0x110000), Unicode::UNASSIGNED);
113    TEST_EQUAL(Unicode::get_category(0xFFFFFFFF), Unicode::UNASSIGNED);
114    return true;
115}
116
117DEFINE_TESTCASE(caseconvert1,!backend) {
118    using namespace Xapian;
119    for (unsigned ch = 0; ch < 128; ++ch) {
120        if (isupper((char)ch)) {
121            TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower((char)ch)));
122        } else {
123            TEST_EQUAL(Unicode::tolower(ch), ch);
124        }
125        if (islower((char)ch)) {
126            TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper((char)ch)));
127        } else {
128            TEST_EQUAL(Unicode::toupper(ch), ch);
129        }
130    }
131
132    // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
133    TEST_EQUAL(Unicode::tolower(0x242), 0x242);
134    TEST_EQUAL(Unicode::toupper(0x242), 0x241);
135    TEST_EQUAL(Unicode::toupper(0x241), 0x241);
136    TEST_EQUAL(Unicode::tolower(0x241), 0x242);
137
138    // Pound currency symbol:
139    TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
140    TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
141    // Unassigned:
142    TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
143    TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
144    // Test characters outside BMP.
145    TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
146    TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
147    TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
148    TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
149    TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
150    TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
151    // Test some invalid Unicode values.
152    TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
153    TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
154    TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
155    TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
156
157    return true;
158}
Note: See TracBrowser for help on using the browser.