1 /* Test of canonical normalization of UTF-8 strings.
2 Copyright (C) 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2009. */
21 #if GNULIB_UNINORM_U8_NORMALIZE
/* Number of elements in ARRAY.  ARRAY must be a real array object, not a
   pointer: applied to a pointer the quotient is meaningless.  The argument
   is parenthesized before indexing so that any array-valued expression
   expands with the intended precedence.  */
#define SIZEOF(array) (sizeof (array) / sizeof ((array)[0]))
/* Test assertion on EXPR.  The visible part of the body prints
   "<file>:<line>: assertion failed" to stderr.
   NOTE(review): the remaining lines of the macro are not visible in this
   excerpt -- presumably the message is printed only when EXPR is false and
   the test is then terminated; confirm against the full file.  */
33 #define ASSERT(expr) \
38 fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \
/* Normalize INPUT (INPUT_LENGTH uint8_t units) to NFC with u8_normalize and
   compare the outcome with EXPECTED (EXPECTED_LENGTH units).  The call is
   made up to three times to exercise the result-buffer conventions: with no
   buffer at all, with a buffer one unit too short (only when EXPECTED_LENGTH
   is positive), and with a buffer of exactly the needed size.  Each time the
   returned pointer, the returned length and the returned contents are
   verified.  Callers in this file treat a return value of 0 as success.
   NOTE(review): the return statements and any free() calls are not visible
   in this excerpt -- presumably every failed check returns its own nonzero
   code; confirm against the full file.  */
46 check (const uint8_t *input, size_t input_length,
47 const uint8_t *expected, size_t expected_length)
52 /* Test return conventions with resultbuf == NULL. */
53 result = u8_normalize (UNINORM_NFC, input, input_length, NULL, &length);
/* A result must be returned, with the expected length and contents.  */
54 if (!(result != NULL))
56 if (!(length == expected_length))
58 if (!(u8_cmp (result, expected, expected_length) == 0))
62 /* Test return conventions with resultbuf too small. */
63 if (expected_length > 0)
65 uint8_t *preallocated;
/* Offer a buffer that is one unit short of what the result needs.  */
67 length = expected_length - 1;
/* NOTE(review): the malloc result is not checked in the visible lines.  When
   expected_length is 1 this requests 0 bytes, for which malloc may return
   NULL without that being an error.  */
68 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
69 result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
70 if (!(result != NULL))
/* The buffer could not hold the result, so the result must live elsewhere.  */
72 if (!(result != preallocated))
/* On return, length must have been updated to the real result length.  */
74 if (!(length == expected_length))
76 if (!(u8_cmp (result, expected, expected_length) == 0))
82 /* Test return conventions with resultbuf large enough. */
84 uint8_t *preallocated;
86 length = expected_length;
/* NOTE(review): the malloc result is not checked in the visible lines, and
   no guard against expected_length == 0 is visible for this phase.  */
87 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
88 result = u8_normalize (UNINORM_NFC, input, input_length, preallocated, &length);
89 if (!(result != NULL))
/* The buffer was big enough, so the result must have been stored in it.  */
91 if (!(result == preallocated))
93 if (!(length == expected_length))
95 if (!(u8_cmp (result, expected, expected_length) == 0))
/* Every case below hands a string to check () and requires the result 0.
   A string checked against itself must come out of NFC normalization
   unchanged; a `decomposed' string must come out as its composed form.  */
/* The single byte 0x20 must be left as it is.  */
107 static const uint8_t input[] = { 0x20 };
108 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
111 { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
112 static const uint8_t input[] = { 0xC3, 0x84 };
113 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
114 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
115 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
118 { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
119 static const uint8_t input[] = { 0xC7, 0x9E };
120 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
121 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
122 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
/* Here the input is not its own normal form: the sign, its decomposition
   and the two-byte `expected' string must all normalize to `expected'.  */
125 { /* ANGSTROM SIGN */
126 static const uint8_t input[] = { 0xE2, 0x84, 0xAB };
127 static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
128 static const uint8_t expected[] = { 0xC3, 0x85 };
129 ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
130 ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
131 ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
/* All cases from here up to the Hangul tests expect the input back
   unchanged.  NOTE(review): presumably because these characters have only
   compatibility decompositions, which NFC does not apply -- confirm.  */
134 { /* GREEK DIALYTIKA AND PERISPOMENI */
135 static const uint8_t input[] = { 0xE1, 0xBF, 0x81 };
136 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
139 { /* SCRIPT SMALL L */
140 static const uint8_t input[] = { 0xE2, 0x84, 0x93 };
141 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
144 { /* NO-BREAK SPACE */
145 static const uint8_t input[] = { 0xC2, 0xA0 };
146 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
149 { /* ARABIC LETTER VEH INITIAL FORM */
150 static const uint8_t input[] = { 0xEF, 0xAD, 0xAC };
151 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
154 { /* ARABIC LETTER VEH MEDIAL FORM */
155 static const uint8_t input[] = { 0xEF, 0xAD, 0xAD };
156 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
159 { /* ARABIC LETTER VEH FINAL FORM */
160 static const uint8_t input[] = { 0xEF, 0xAD, 0xAB };
161 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
164 { /* ARABIC LETTER VEH ISOLATED FORM */
165 static const uint8_t input[] = { 0xEF, 0xAD, 0xAA };
166 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
169 { /* CIRCLED NUMBER FIFTEEN */
170 static const uint8_t input[] = { 0xE2, 0x91, 0xAE };
171 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
174 { /* TRADE MARK SIGN */
175 static const uint8_t input[] = { 0xE2, 0x84, 0xA2 };
176 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
179 { /* LATIN SUBSCRIPT SMALL LETTER I */
180 static const uint8_t input[] = { 0xE1, 0xB5, 0xA2 };
181 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
184 { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
185 static const uint8_t input[] = { 0xEF, 0xB8, 0xB5 };
186 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
189 { /* FULLWIDTH LATIN CAPITAL LETTER A */
190 static const uint8_t input[] = { 0xEF, 0xBC, 0xA1 };
191 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
194 { /* HALFWIDTH IDEOGRAPHIC COMMA */
195 static const uint8_t input[] = { 0xEF, 0xBD, 0xA4 };
196 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
199 { /* SMALL IDEOGRAPHIC COMMA */
200 static const uint8_t input[] = { 0xEF, 0xB9, 0x91 };
201 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
205 static const uint8_t input[] = { 0xE3, 0x8E, 0x92 };
206 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
209 { /* VULGAR FRACTION THREE EIGHTHS */
210 static const uint8_t input[] = { 0xE2, 0x85, 0x9C };
211 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
215 static const uint8_t input[] = { 0xC2, 0xB5 };
216 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
219 { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
220 static const uint8_t input[] = { 0xEF, 0xB7, 0xBA };
221 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
/* Hangul: the three-character and two-character `decomposed' sequences
   must each compose to the single precomposed syllable in `input'.  */
224 { /* HANGUL SYLLABLE GEUL */
225 static const uint8_t input[] = { 0xEA, 0xB8, 0x80 };
226 static const uint8_t decomposed[] =
227 { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
228 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
229 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
232 { /* HANGUL SYLLABLE GEU */
233 static const uint8_t input[] = { 0xEA, 0xB7, 0xB8 };
234 static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
235 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
236 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
/* Mixed-script sentence.  The byte arrays contain two spaces before the
   CJK part and end with a newline; `decomposed' spells some of the letters
   and the Hangul syllables as base character plus combining sequence.  */
239 { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" followed by '\n' */
240 static const uint8_t input[] =
241 { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
242 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
243 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
244 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
245 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
246 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
247 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
249 0xEA, 0xB8, 0x80, '\n'
251 static const uint8_t decomposed[] =
252 { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
253 ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
254 0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
255 0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
256 's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
257 '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
258 0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
259 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
260 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
262 ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
263 ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
267 /* Declare failure if test takes too long, by using default abort
268 caused by SIGALRM. */
/* Restore the default SIGALRM disposition.  NOTE(review): the call that
   arms the timer is not visible in this excerpt.  */
269 signal (SIGALRM, SIG_DFL);
273 /* Check that the sorting is not O(n²) but O(n log n). */
/* NOTE(review): "the sorting" presumably means the reordering of combining
   characters done inside u8_normalize; confirm.  Three passes are run; the
   values of m, m1 and repeat are set on lines not visible here.  */
276 for (pass = 0; pass < 3; pass++)
/* A single allocation holds both strings: the input uses the first
   2 * m - 1 bytes and the expected result starts right after it.
   NOTE(review): the malloc result is not checked in the visible lines.  */
280 uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
283 uint8_t *expected = input + (2 * m - 1);
285 size_t m2 = (m - 1) / 2;
286 /* NB: m1 + m2 == m - 1. */
295 for (i = 0; i < m1; i++)
300 for (i = 0; i < m2; i++)
308 for (i = 0; i < m2; i++)
313 for (i = 0; i < m1; i++)
321 for (i = 0; i < m2; i++)
342 for (i = 0; i < m1; i++)
/* NOTE(review): m2 - 1 is computed in an unsigned type and would wrap
   around to a huge bound if m2 were 0 (that is, if m <= 2); confirm that m
   is always large enough.  */
347 for (i = 0; i < m2 - 1; i++)
353 for (; repeat > 0; repeat--)
/* The 2 * m - 1 byte input must normalize to the 2 * m - 2 byte expected
   string ...  */
355 ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0);
/* ... and normalizing the expected string again must leave it unchanged.  */
356 ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);