1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
24 #include "linebreak.h"
31 #include "utf8-ucs4.h"
33 #include "utf16-ucs4.h"
37 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
45 /* Help GCC to generate good code for string comparisons with
46 immediate strings. */
47 #if defined (__GNUC__) && defined (__OPTIMIZE__)
50 streq9 (const char *s1, const char *s2)
52 return strcmp (s1 + 9, s2 + 9) == 0;
56 streq8 (const char *s1, const char *s2, char s28)
63 return streq9 (s1, s2);
70 streq7 (const char *s1, const char *s2, char s27, char s28)
77 return streq8 (s1, s2, s28);
84 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
91 return streq7 (s1, s2, s27, s28);
98 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
105 return streq6 (s1, s2, s26, s27, s28);
112 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
119 return streq5 (s1, s2, s25, s26, s27, s28);
126 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
133 return streq4 (s1, s2, s24, s25, s26, s27, s28);
140 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
147 return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
154 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
161 return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
168 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
175 return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
/* Optimizing form of STREQ: hands S1 and S2 to streq0 together with the
   leading characters of S2 (s20..s28), which the callers in this file spell
   out as character constants padded with 0. */
181 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
182 streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
/* Plain form of STREQ: compares S1 and S2 with strcmp.  The character
   arguments s20..s28 are accepted so that call sites stay the same, but
   they are not used in the expansion. */
186 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
187 (strcmp (s1, s2) == 0)
193 is_cjk_encoding (const char *encoding)
196 /* Legacy Japanese encodings */
197 || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
198 /* Legacy Chinese encodings */
199 || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
200 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
201 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
202 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
203 /* Legacy Korean encodings */
204 || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
205 || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
206 || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
212 is_utf8_encoding (const char *encoding)
214 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
220 /* Determine number of column positions required for UC. */
221 int uc_width (unsigned int uc, const char *encoding);
223 /*
224 * Non-spacing attribute table.
225 * Consists of:
226 * - Non-spacing characters; generated from PropList.txt or
227 * "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
228 * - Format control characters; generated from
229 * "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
230 * - Zero width characters; generated from
231 * "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
232 */
233 static const unsigned char nonspacing_table_data[16*64] = {
235 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
236 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
237 0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
238 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
239 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
240 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
241 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
242 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
244 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
245 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
246 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
247 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
248 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
249 0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
250 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
251 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
253 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
254 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
255 0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
256 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
257 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
258 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
259 0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
260 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
262 0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
263 0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
264 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
265 0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
266 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
267 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
268 0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
269 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
271 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
272 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
273 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
274 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
275 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
276 0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
277 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
278 0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
280 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
281 0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
282 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
283 0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
284 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
285 0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
286 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
287 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
289 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
290 0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
291 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
292 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
293 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
294 0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
295 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
296 0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
298 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
299 0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
300 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
301 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
302 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
303 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
304 0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
305 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
307 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
308 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
309 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
310 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
311 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
312 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
313 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
314 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
316 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
317 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
318 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
319 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
320 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
321 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
322 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
323 0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
325 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
326 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
327 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
328 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
329 0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
330 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
331 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
332 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
334 0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
335 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
336 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
337 0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
338 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
339 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
340 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
341 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
343 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
344 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
345 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
346 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
347 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
348 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
349 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
350 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
352 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
353 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
354 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
355 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
356 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
357 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
358 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
359 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
361 0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
362 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
363 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
364 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
365 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
366 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
367 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
368 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
369 /* 0x1d000-0x1d1ff */
370 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
371 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
372 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
373 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
374 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
375 0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
376 0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
377 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1d1c0-0x1d1ff */
379 static const signed char nonspacing_table_ind[240] = {
380 0, 1, 2, 3, 4, 5, 6, 7, /* 0x0000-0x0fff */
381 8, -1, -1, 9, 10, -1, -1, -1, /* 0x1000-0x1fff */
382 11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
383 12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
384 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
385 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
386 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
387 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
388 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
389 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
390 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
391 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
392 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
393 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
394 -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
395 -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
396 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
397 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
398 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
399 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
400 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
401 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
402 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
403 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
404 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
405 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
406 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
407 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
408 -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
409 15, -1, -1, -1, -1, -1, -1, -1 /* 0x1d000-0x1dfff */
412 /* Determine number of column positions required for UC. */
414 uc_width (unsigned int uc, const char *encoding)
416 /* Test for non-spacing or control character. */
419 int ind = nonspacing_table_ind[uc >> 9];
421 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
423 if (uc > 0 && uc < 0xa0)
429 else if ((uc >> 9) == (0xe0000 >> 9))
432 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
436 /* Test for double-width character.
437 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
438 * and "grep '^....;[^WF]' EastAsianWidth.txt"
441 && ((uc < 0x1160) /* Hangul Jamo */
442 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
444 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
445 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
446 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
447 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
448 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
449 || (uc >= 0xffe0 && uc < 0xffe7)
450 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
451 || (uc >= 0x30000 && uc <= 0x3fffd)
454 /* In ancient CJK encodings, Cyrillic and most other characters are
455 double-width as well. */
456 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
457 && is_cjk_encoding (encoding))
463 /* Determine number of column positions required for first N units
464 (or fewer if S ends before this) in S. */
467 u8_width (const unsigned char *s, size_t n, const char *encoding)
469 const unsigned char *s_end = s + n;
477 s += u8_mbtouc (&uc, s, s_end - s);
480 break; /* end of string reached */
482 w = uc_width (uc, encoding);
483 if (w >= 0) /* ignore control characters in the string */
491 u16_width (const unsigned short *s, size_t n, const char *encoding)
493 const unsigned short *s_end = s + n;
501 s += u16_mbtouc (&uc, s, s_end - s);
504 break; /* end of string reached */
506 w = uc_width (uc, encoding);
507 if (w >= 0) /* ignore control characters in the string */
515 u32_width (const unsigned int *s, size_t n, const char *encoding)
517 const unsigned int *s_end = s + n;
522 unsigned int uc = *s++;
526 break; /* end of string reached */
528 w = uc_width (uc, encoding);
529 if (w >= 0) /* ignore control characters in the string */
537 /* Determine the line break points in S, and store the result at p[0..n-1]. */
538 /* We don't support line breaking of complex-context dependent characters
539 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
541 /* Line breaking classification. */
545 /* Values >= 20 are resolved at run time. */
546 LBP_BK = 0, /* mandatory break */
547 /*LBP_CR, carriage return - not used here because it's a DOSism */
548 /*LBP_LF, line feed - not used here because it's a DOSism */
549 LBP_CM = 20, /* attached characters and combining marks */
550 /*LBP_SG, surrogates - not used here because they are not characters */
551 LBP_ZW = 1, /* zero width space */
552 LBP_IN = 2, /* inseparable */
553 LBP_GL = 3, /* non-breaking (glue) */
554 LBP_CB = 22, /* contingent break opportunity */
555 LBP_SP = 21, /* space */
556 LBP_BA = 4, /* break opportunity after */
557 LBP_BB = 5, /* break opportunity before */
558 LBP_B2 = 6, /* break opportunity before and after */
559 LBP_HY = 7, /* hyphen */
560 LBP_NS = 8, /* non starter */
561 LBP_OP = 9, /* opening punctuation */
562 LBP_CL = 10, /* closing punctuation */
563 LBP_QU = 11, /* ambiguous quotation */
564 LBP_EX = 12, /* exclamation/interrogation */
565 LBP_ID = 13, /* ideographic */
566 LBP_NU = 14, /* numeric */
567 LBP_IS = 15, /* infix separator (numeric) */
568 LBP_SY = 16, /* symbols allowing breaks */
569 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
570 LBP_PR = 18, /* prefix (numeric) */
571 LBP_PO = 19, /* postfix (numeric) */
572 LBP_SA = 23, /* complex context (South East Asian) */
573 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
574 LBP_XX = 25 /* unknown */
577 #include "lbrkprop.h"
579 static inline unsigned char
580 lbrkprop_lookup (unsigned int uc)
582 unsigned int index1 = uc >> lbrkprop_header_0;
583 if (index1 < lbrkprop_header_1)
585 int lookup1 = lbrkprop.level1[index1];
588 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
589 int lookup2 = lbrkprop.level2[lookup1 + index2];
592 unsigned int index3 = uc & lbrkprop_header_4;
593 return lbrkprop.level3[lookup2 + index3];
600 /* Table indexed by two line breaking classifications. */
601 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
602 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
603 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
604 static const unsigned char lbrk_table[19][19] = {
606 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
607 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
608 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
609 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
610 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
611 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
612 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
613 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
614 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
615 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
616 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
617 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
618 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
619 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
620 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
621 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
622 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
623 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
624 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
625 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
629 /* Note: The (B2,B2) entry should probably be D instead of P. */
630 /* Note: The (PR,ID) entry should probably be D instead of I. */
633 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
635 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
636 const unsigned char *s_end = s + n;
637 int last_prop = LBP_BK; /* line break property of last non-space character */
638 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
639 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
641 /* Don't break inside multibyte characters. */
642 memset (p, UC_BREAK_PROHIBITED, n);
647 int count = u8_mbtouc (&uc, s, s_end - s);
648 int prop = lbrkprop_lookup (uc);
652 /* Mandatory break. */
653 *p = UC_BREAK_MANDATORY;
662 /* Resolve property values whose behaviour is not fixed. */
666 /* Resolve ambiguous. */
667 prop = LBP_AI_REPLACEMENT;
670 /* This is arbitrary. */
674 /* We don't handle complex scripts yet.
675 Treat LBP_SA like LBP_XX. */
677 /* This is arbitrary. */
682 /* Deal with combining characters. */
686 /* Don't break just before a combining character. */
687 *p = UC_BREAK_PROHIBITED;
688 /* A combining character turns a preceding space into LBP_AL. */
689 if (seen_space != NULL)
692 seen_space = seen_space2;
694 goto lookup_via_table;
697 else if (prop == LBP_SP)
699 /* Don't break just before a space. */
700 *p = UC_BREAK_PROHIBITED;
701 seen_space2 = seen_space;
707 /* prop must be usable as an index for table 7.3 of UTR #14. */
708 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
711 if (last_prop == LBP_BK)
713 /* Don't break at the beginning of a line. */
714 *q = UC_BREAK_PROHIBITED;
718 switch (lbrk_table [last_prop-1] [prop-1])
721 *q = UC_BREAK_POSSIBLE;
724 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
727 *q = UC_BREAK_PROHIBITED;
745 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
747 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
748 const unsigned short *s_end = s + n;
749 int last_prop = LBP_BK; /* line break property of last non-space character */
750 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
751 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
753 /* Don't break inside multibyte characters. */
754 memset (p, UC_BREAK_PROHIBITED, n);
759 int count = u16_mbtouc (&uc, s, s_end - s);
760 int prop = lbrkprop_lookup (uc);
764 /* Mandatory break. */
765 *p = UC_BREAK_MANDATORY;
774 /* Resolve property values whose behaviour is not fixed. */
778 /* Resolve ambiguous. */
779 prop = LBP_AI_REPLACEMENT;
782 /* This is arbitrary. */
786 /* We don't handle complex scripts yet.
787 Treat LBP_SA like LBP_XX. */
789 /* This is arbitrary. */
794 /* Deal with combining characters. */
798 /* Don't break just before a combining character. */
799 *p = UC_BREAK_PROHIBITED;
800 /* A combining character turns a preceding space into LBP_AL. */
801 if (seen_space != NULL)
804 seen_space = seen_space2;
806 goto lookup_via_table;
809 else if (prop == LBP_SP)
811 /* Don't break just before a space. */
812 *p = UC_BREAK_PROHIBITED;
813 seen_space2 = seen_space;
819 /* prop must be usable as an index for table 7.3 of UTR #14. */
820 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
823 if (last_prop == LBP_BK)
825 /* Don't break at the beginning of a line. */
826 *q = UC_BREAK_PROHIBITED;
830 switch (lbrk_table [last_prop-1] [prop-1])
833 *q = UC_BREAK_POSSIBLE;
836 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
839 *q = UC_BREAK_PROHIBITED;
857 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
859 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
860 const unsigned int *s_end = s + n;
861 int last_prop = LBP_BK; /* line break property of last non-space character */
862 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
863 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
867 unsigned int uc = *s;
868 int prop = lbrkprop_lookup (uc);
872 /* Mandatory break. */
873 *p = UC_BREAK_MANDATORY;
882 /* Resolve property values whose behaviour is not fixed. */
886 /* Resolve ambiguous. */
887 prop = LBP_AI_REPLACEMENT;
890 /* This is arbitrary. */
894 /* We don't handle complex scripts yet.
895 Treat LBP_SA like LBP_XX. */
897 /* This is arbitrary. */
902 /* Deal with combining characters. */
906 /* Don't break just before a combining character. */
907 *p = UC_BREAK_PROHIBITED;
908 /* A combining character turns a preceding space into LBP_AL. */
909 if (seen_space != NULL)
912 seen_space = seen_space2;
914 goto lookup_via_table;
917 else if (prop == LBP_SP)
919 /* Don't break just before a space. */
920 *p = UC_BREAK_PROHIBITED;
921 seen_space2 = seen_space;
927 /* prop must be usable as an index for table 7.3 of UTR #14. */
928 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
931 if (last_prop == LBP_BK)
933 /* Don't break at the beginning of a line. */
934 *q = UC_BREAK_PROHIBITED;
938 switch (lbrk_table [last_prop-1] [prop-1])
941 *q = UC_BREAK_POSSIBLE;
944 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
947 *q = UC_BREAK_PROHIBITED;
965 /* Choose the best line breaks, assuming the uc_width function.
966 Return the column after the end of the string. */
969 u8_width_linebreaks (const unsigned char *s, size_t n,
970 int width, int start_column, int at_end_columns,
971 const char *o, const char *encoding,
974 const unsigned char *s_end;
979 u8_possible_linebreaks (s, n, encoding, p);
983 last_column = start_column;
988 int count = u8_mbtouc (&uc, s, s_end - s);
990 /* Respect the override. */
991 if (o != NULL && *o != UC_BREAK_UNDEFINED)
994 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
996 /* An atomic piece of text ends here. */
997 if (last_p != NULL && last_column + piece_width > width)
999 /* Insert a line break. */
1000 *last_p = UC_BREAK_POSSIBLE;
1005 if (*p == UC_BREAK_MANDATORY)
1007 /* uc is a line break character. */
1008 /* Start a new piece at column 0. */
1015 /* uc is not a line break character. */
1018 if (*p == UC_BREAK_POSSIBLE)
1020 /* Start a new piece. */
1022 last_column += piece_width;
1024 /* No line break for the moment, may be turned into
1025 UC_BREAK_POSSIBLE later, via last_p. */
1028 *p = UC_BREAK_PROHIBITED;
1030 w = uc_width (uc, encoding);
1031 if (w >= 0) /* ignore control characters in the string */
1041 /* The last atomic piece of text ends here. */
1042 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1044 /* Insert a line break. */
1045 *last_p = UC_BREAK_POSSIBLE;
1049 return last_column + piece_width;
1053 u16_width_linebreaks (const unsigned short *s, size_t n,
1054 int width, int start_column, int at_end_columns,
1055 const char *o, const char *encoding,
1058 const unsigned short *s_end;
1063 u16_possible_linebreaks (s, n, encoding, p);
1067 last_column = start_column;
1072 int count = u16_mbtouc (&uc, s, s_end - s);
1074 /* Respect the override. */
1075 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1078 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1080 /* An atomic piece of text ends here. */
1081 if (last_p != NULL && last_column + piece_width > width)
1083 /* Insert a line break. */
1084 *last_p = UC_BREAK_POSSIBLE;
1089 if (*p == UC_BREAK_MANDATORY)
1091 /* uc is a line break character. */
1092 /* Start a new piece at column 0. */
1099 /* uc is not a line break character. */
1102 if (*p == UC_BREAK_POSSIBLE)
1104 /* Start a new piece. */
1106 last_column += piece_width;
1108 /* No line break for the moment, may be turned into
1109 UC_BREAK_POSSIBLE later, via last_p. */
1112 *p = UC_BREAK_PROHIBITED;
1114 w = uc_width (uc, encoding);
1115 if (w >= 0) /* ignore control characters in the string */
1125 /* The last atomic piece of text ends here. */
1126 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1128 /* Insert a line break. */
1129 *last_p = UC_BREAK_POSSIBLE;
1133 return last_column + piece_width;
1137 u32_width_linebreaks (const unsigned int *s, size_t n,
1138 int width, int start_column, int at_end_columns,
1139 const char *o, const char *encoding,
1142 const unsigned int *s_end;
1147 u32_possible_linebreaks (s, n, encoding, p);
1151 last_column = start_column;
1155 unsigned int uc = *s;
1157 /* Respect the override. */
1158 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1161 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1163 /* An atomic piece of text ends here. */
1164 if (last_p != NULL && last_column + piece_width > width)
1166 /* Insert a line break. */
1167 *last_p = UC_BREAK_POSSIBLE;
1172 if (*p == UC_BREAK_MANDATORY)
1174 /* uc is a line break character. */
1175 /* Start a new piece at column 0. */
1182 /* uc is not a line break character. */
1185 if (*p == UC_BREAK_POSSIBLE)
1187 /* Start a new piece. */
1189 last_column += piece_width;
1191 /* No line break for the moment, may be turned into
1192 UC_BREAK_POSSIBLE later, via last_p. */
1195 *p = UC_BREAK_PROHIBITED;
1197 w = uc_width (uc, encoding);
1198 if (w >= 0) /* ignore control characters in the string */
1208 /* The last atomic piece of text ends here. */
1209 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1211 /* Insert a line break. */
1212 *last_p = UC_BREAK_POSSIBLE;
1216 return last_column + piece_width;
1224 /* Read the contents of an input stream, and return it, terminated with a NUL
1225 byte. */
1227 read_file (FILE *stream)
1229 #define BUFSIZE 4096
1235 while (! feof (stream))
1237 if (size + BUFSIZE > alloc)
1239 alloc = alloc + alloc / 2;
1240 if (alloc < size + BUFSIZE)
1241 alloc = size + BUFSIZE;
1242 buf = realloc (buf, alloc);
1245 fprintf (stderr, "out of memory\n");
1249 count = fread (buf + size, 1, BUFSIZE, stream);
1252 if (ferror (stream))
1261 buf = realloc (buf, size + 1);
1264 fprintf (stderr, "out of memory\n");
/* Driver program for the UTF-8 functions.  It reads all of standard input
   with read_file and echoes it with line-break information, in one of the two
   modes shown below: marking every break opportunity, or inserting newlines
   for a given width.
   NOTE(review): the tests that select a mode (presumably on argc) are not
   visible in this listing -- confirm.  */
1273 main (int argc, char * argv[])
1277 /* Display all the break opportunities in the input string. */
1278 char *input = read_file (stdin);
/* strlen stops at the first NUL byte, so only the input up to that point is
   processed.  */
1279 int length = strlen (input);
/* One result byte per input byte.  NOTE(review): no NULL check on this
   allocation is visible here.  */
1280 char *breaks = malloc (length);
1283 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
/* Echo the input byte by byte.  NOTE(review): the case labels below presumably
   belong to a switch on breaks[i]; the switch header is not visible.  */
1285 for (i = 0; i < length; i++)
1289 case UC_BREAK_POSSIBLE:
1290 /* U+2027 in UTF-8 encoding */
1291 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1293 case UC_BREAK_MANDATORY:
1294 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1295 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1297 case UC_BREAK_PROHIBITED:
/* The input byte itself is written in every case visible here.  */
1302 putc (input[i], stdout);
1311 /* Insert line breaks for a given width. */
/* atoi reports no errors: text that does not start with a number yields 0.
   NOTE(review): a check that argv[1] exists is not visible here.  */
1312 int width = atoi (argv[1]);
1313 char *input = read_file (stdin);
1314 int length = strlen (input);
/* NOTE(review): as above, no NULL check on this allocation is visible.  */
1315 char *breaks = malloc (length);
/* Start column 0, no extra columns at the end, no override array.  */
1318 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1320 for (i = 0; i < length; i++)
/* A position marked UC_BREAK_POSSIBLE gets a newline written for it.  */
1324 case UC_BREAK_POSSIBLE:
1325 putc ('\n', stdout);
1327 case UC_BREAK_MANDATORY:
1329 case UC_BREAK_PROHIBITED:
1334 putc (input[i], stdout);
1348 /* Now the same thing with an arbitrary encoding.
1350 We convert the input string to Unicode.
1352 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1353 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1354 \U0000FFFF. UTF-16 and variants support only characters up to
1355 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1356 UCS-4 specification leaves doubts about endianness and byte order mark.
1357 glibc currently interprets it as big endian without byte order mark,
1358 but this is not backed by an RFC. So we use UTF-8. It supports
1359 characters up to \U7FFFFFFF and is unambiguously defined. */
1366 /* Luckily, the encoding's name is platform independent. */
1367 #define UTF8_NAME "UTF-8"
1369 /* Return the length of a string after conversion through an iconv_t. */
/* CD is the conversion descriptor, S the input and N its length in bytes.
   The converted bytes are written to a scratch buffer and thrown away; only
   their number is accumulated in COUNT.  (size_t)(-1) is returned when an
   iconv call fails for a reason other than a full scratch buffer.
   NOTE(review): the loop header around the first iconv call and the final
   return of COUNT are presumably among the lines not visible here.  */
1371 iconv_string_length (iconv_t cd, const char *s, size_t n)
1373 #define TMPBUFSIZE 4096
1375 char tmpbuf[TMPBUFSIZE];
1376 const char *inptr = s;
/* Every conversion call starts again at the beginning of the scratch
   buffer.  */
1380 char *outptr = tmpbuf;
1381 size_t outsize = TMPBUFSIZE;
1382 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
/* E2BIG only says that the scratch buffer filled up, which is not treated as
   a failure; any other error makes the string unconvertible.  */
1383 if (res == (size_t)(-1) && errno != E2BIG)
1384 return (size_t)(-1);
/* Number of bytes this call produced.  */
1385 count += outptr - tmpbuf;
1387 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1388 #if defined _LIBICONV_VERSION \
1389 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* With a NULL input pointer and a non-NULL output buffer, iconv writes the
   bytes needed to bring the output back to its initial shift state; these
   bytes are counted as part of the length as well.  */
1391 char *outptr = tmpbuf;
1392 size_t outsize = TMPBUFSIZE;
1393 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1394 if (res == (size_t)(-1))
1395 return (size_t)(-1);
1396 count += outptr - tmpbuf;
1398 /* Return to the initial state. */
1399 iconv (cd, NULL, NULL, NULL, NULL);
/* iconv_string_keeping_offsets: convert the N-byte string S through CD into
   the buffer T, which is expected to receive exactly M bytes, and record in
   OFFTABLE where each conversion step of the input landed in the output.
   OFFTABLE has N entries; an entry is the output offset for an input position
   at which a conversion step started, and (size_t)(-1) for every other input
   position.
   NOTE(review): the actions taken when one of the consistency checks below
   fails are not visible in this listing.  */
1406 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1407 size_t *offtable, char *t, size_t m)
1414 /* Avoid glibc-2.1 bug. */
1415 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1416 const size_t extra = 1;
1418 const size_t extra = 0;
/* Mark every input position as "no corresponding output offset" first.  */
1421 for (i = 0; i < n; i++)
1422 offtable[i] = (size_t)(-1);
/* The output space handed to iconv is M bytes plus the workaround slack.  */
1427 outsize = m + extra;
1428 while (inptr < s_end)
1430 const char *saved_inptr;
/* Remember where in T the output for the input at INPTR begins.  */
1434 offtable[inptr - s] = outptr - t;
1436 saved_inptr = inptr;
/* Offer iconv 1, 2, 3, ... input bytes until the result is something other
   than EINVAL, i.e. until the bytes offered no longer form an incomplete
   input sequence.  */
1438 for (insize = 1; inptr + insize <= s_end; insize++)
1440 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1441 if (!(res == (size_t)(-1) && errno == EINVAL))
1443 /* We expect that no input bytes have been consumed so far. */
1444 if (inptr != saved_inptr)
1447 /* After we verified the convertibility and computed the translation's
1448 size m, there shouldn't be any conversion error here. */
1449 if (res == (size_t)(-1))
1452 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1453 #if defined _LIBICONV_VERSION \
1454 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
/* Flush: let iconv emit the bytes that return the output to the initial
   shift state.  */
1455 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1458 /* We should have produced exactly m output bytes. */
/* OUTSIZE started as M + EXTRA, so exactly EXTRA bytes must be left.  */
1459 if (outsize != extra)
1463 #endif /* HAVE_ICONV */
1467 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
1468 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */
/* S is the string and N its length in bytes; the string need not be
   NUL-terminated, since the loop is bounded by N alone.  */
1470 is_all_ascii (const char *s, size_t n)
/* Examine the N bytes one at a time.  */
1472 for (; n > 0; s++, n--)
/* Work on the byte as an unsigned value, whatever the signedness of char.  */
1474 unsigned char c = (unsigned char) *s;
/* A byte that is neither printable nor whitespace according to c_isprint and
   c_isspace fails the test.  NOTE(review): the statement executed in that
   case (presumably `return 0`) is not visible here.  */
1476 if (!(c_isprint (c) || c_isspace (c)))
1482 #endif /* C_CTYPE_ASCII */
/* mbs_possible_linebreaks: determine the line break points of the N-byte
   string S, given in ENCODING, and store one result byte per input byte in P.
   Strategy, as far as visible below:
     - UTF-8 input is handed straight to u8_possible_linebreaks;
     - otherwise the string is converted to UTF-8 with iconv, analysed there,
       and the results are mapped back through an offset table;
     - if that is not possible, an all-ASCII string is analysed as UTF-8, and
       any other string only gets UC_BREAK_MANDATORY at its newline bytes.
   NOTE(review): several lines of this function are not visible in this
   listing, so the exact nesting of these branches should be confirmed.  */
1485 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
/* No conversion is needed when the input already is UTF-8.  */
1490 if (is_utf8_encoding (encoding))
1491 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1496 /* Avoid glibc-2.1 bug with EUC-KR. */
1497 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1498 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1499 to_utf8 = (iconv_t)(-1);
/* NOTE(review): the comment that follows is cut off in this listing and has
   no terminator of its own.  Judging from the comparisons after it, the list
   of affected encodings ends with GB18030.  */
1502 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1504 # if defined __sun && !defined _LIBICONV_VERSION
1505 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1506 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1507 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1508 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1509 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1510 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1511 to_utf8 = (iconv_t)(-1);
1514 to_utf8 = iconv_open (UTF8_NAME, encoding);
1515 if (to_utf8 != (iconv_t)(-1))
1517 /* Determine the length of the resulting UTF-8 string. */
1518 size_t m = iconv_string_length (to_utf8, s, n);
/* (size_t)(-1) means that the string could not be converted.  */
1519 if (m != (size_t)(-1))
1521 /* Convert the string to UTF-8 and build a translation table
1522 from offsets into s to offsets into the translated string. */
/* A single allocation holds three arrays: N offsets, the M bytes of UTF-8
   text, and M result bytes.  NOTE(review): xtimes, xsum3 and
   size_in_bounds_p presumably guard the size computation against overflow;
   their definitions are not visible here.  */
1523 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1525 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
/* Carve the allocation up: OFFTABLE first, then T (UTF-8 text), then Q
   (break results for T).  */
1528 size_t *offtable = (size_t *) memory;
1529 char *t = (char *) (offtable + n);
1530 char *q = (char *) (t + m);
1533 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1535 /* Determine the possible line breaks of the UTF-8 string. */
1536 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1538 /* Translate the result back to the original string. */
/* Input bytes without an entry in OFFTABLE keep the UC_BREAK_PROHIBITED
   value written by the memset; the others receive the result computed for
   the corresponding position of the UTF-8 text.  */
1539 memset (p, UC_BREAK_PROHIBITED, n);
1540 for (i = 0; i < n; i++)
1541 if (offtable[i] != (size_t)(-1))
1542 p[i] = q[offtable[i]];
/* NOTE(review): two iconv_close calls are visible; presumably they sit on
   different paths (conversion done / conversion abandoned) -- confirm.  */
1545 iconv_close (to_utf8);
1549 iconv_close (to_utf8);
1552 /* Impossible to convert. */
1554 if (is_all_ascii (s, n))
1556 /* ASCII is a subset of UTF-8. */
1557 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1561 /* We have a non-ASCII string and cannot convert it.
1562 Don't produce line breaks except those already present in the
1563 input string. All we assume here is that the encoding is
1564 minimally ASCII compatible. */
1566 const char *s_end = s + n;
/* Only a newline byte yields a (mandatory) break.  */
1569 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
/* mbs_width_linebreaks: choose line breaks for the N-byte string S, given in
   ENCODING, so that it fits a line width of WIDTH columns, and store one
   result byte per input byte in P.  START_COLUMN and AT_END_COLUMNS are
   passed through to u8_width_linebreaks; O, when not NULL, holds one override
   value per input byte.
   Strategy, as far as visible below:
     - UTF-8 input is handed straight to u8_width_linebreaks;
     - otherwise the string and the overrides are translated to UTF-8 with
       iconv, processed there, and the results are mapped back through an
       offset table;
     - if that is not possible, an all-ASCII string is processed as UTF-8, and
       any other string only gets the mandatory breaks, with START_COLUMN
       returned unchanged.
   NOTE(review): several lines of this function are not visible in this
   listing, so the exact nesting of these branches should be confirmed.  */
1578 mbs_width_linebreaks (const char *s, size_t n,
1579 int width, int start_column, int at_end_columns,
1580 const char *o, const char *encoding,
/* NOTE(review): early exit; its condition is not visible here (presumably
   the empty-string case) -- confirm.  */
1584 return start_column;
/* No conversion is needed when the input already is UTF-8.  */
1585 if (is_utf8_encoding (encoding))
1586 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1591 /* Avoid glibc-2.1 bug with EUC-KR. */
1592 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1593 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1594 to_utf8 = (iconv_t)(-1);
/* NOTE(review): the comment that follows is cut off in this listing and has
   no terminator of its own.  Judging from the comparisons after it, the list
   of affected encodings ends with GB18030.  */
1597 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1599 # if defined __sun && !defined _LIBICONV_VERSION
1600 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1601 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1602 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1603 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1604 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1605 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1606 to_utf8 = (iconv_t)(-1);
1609 to_utf8 = iconv_open (UTF8_NAME, encoding);
1610 if (to_utf8 != (iconv_t)(-1))
1612 /* Determine the length of the resulting UTF-8 string. */
1613 size_t m = iconv_string_length (to_utf8, s, n);
/* (size_t)(-1) means that the string could not be converted.  */
1614 if (m != (size_t)(-1))
1616 /* Convert the string to UTF-8 and build a translation table
1617 from offsets into s to offsets into the translated string. */
1618 size_t memory_size =
1619 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1620 (o != NULL ? m : 0));
/* The allocation holds N offsets, the M bytes of UTF-8 text, M result bytes
   and, only when overrides were supplied, M translated override bytes.  */
1622 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
/* Carve the allocation up in that order: OFFTABLE, T, Q, and optionally O8.  */
1625 size_t *offtable = (size_t *) memory;
1626 char *t = (char *) (offtable + n);
1627 char *q = (char *) (t + m);
1628 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1632 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1634 /* Translate the overrides to the UTF-8 string. */
/* Positions of the UTF-8 text that no input byte maps to stay
   UC_BREAK_UNDEFINED; the others take over the caller's override value.  */
1637 memset (o8, UC_BREAK_UNDEFINED, m);
1638 for (i = 0; i < n; i++)
1639 if (offtable[i] != (size_t)(-1))
1640 o8[offtable[i]] = o[i];
1643 /* Determine the line breaks of the UTF-8 string. */
1645 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1647 /* Translate the result back to the original string. */
/* Input bytes without an entry in OFFTABLE keep the UC_BREAK_PROHIBITED
   value written by the memset.  */
1648 memset (p, UC_BREAK_PROHIBITED, n);
1649 for (i = 0; i < n; i++)
1650 if (offtable[i] != (size_t)(-1))
1651 p[i] = q[offtable[i]];
/* NOTE(review): two iconv_close calls are visible; presumably they sit on
   different paths (conversion done / conversion abandoned) -- confirm.  */
1654 iconv_close (to_utf8);
1658 iconv_close (to_utf8);
1661 /* Impossible to convert. */
1663 if (is_all_ascii (s, n))
1665 /* ASCII is a subset of UTF-8. */
1666 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1669 /* We have a non-ASCII string and cannot convert it.
1670 Don't produce line breaks except those already present in the
1671 input string. All we assume here is that the encoding is
1672 minimally ASCII compatible. */
1674 const char *s_end = s + n;
/* A break is mandatory where the caller's override demands one or where the
   input has a newline byte; everywhere else breaking is prohibited.  */
1677 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1678 ? UC_BREAK_MANDATORY
1679 : UC_BREAK_PROHIBITED);
1685 /* We cannot compute widths in this case. */
1686 return start_column;
1697 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
/* read_file: collect the data readable from STREAM in a heap buffer that is
   enlarged as needed.
   NOTE(review): only part of the body is visible in this listing; the notes
   below describe the visible statements and hedge everything else.  */
1700 read_file (FILE *stream)
/* Chunk size: every fread call below asks for at most BUFSIZE bytes.  */
1702 #define BUFSIZE 4096
/* Keep going until the stream's end-of-file indicator is set.  */
1708 while (! feof (stream))
/* Ensure there is room for one more full chunk behind the SIZE bytes that
   are already stored.  */
1710 if (size + BUFSIZE > alloc)
/* Grow by half of the current allocation ...  */
1712 alloc = alloc + alloc / 2;
/* ... but never to less than what the next chunk requires.  */
1713 if (alloc < size + BUFSIZE)
1714 alloc = size + BUFSIZE;
1715 buf = realloc (buf, alloc);
/* NOTE(review): presumably reached only when realloc returned NULL; the
   guarding test and what follows the message are not visible -- confirm.  */
1718 fprintf (stderr, "out of memory\n");
/* Append the next chunk directly after the data read so far.  */
1722 count = fread (buf + size, 1, BUFSIZE, stream);
/* Test the stream's error indicator after the read.  */
1725 if (ferror (stream))
/* Resize the buffer to SIZE + 1 bytes.  NOTE(review): the extra byte is
   presumably for the terminating NUL; the store itself is not visible.  */
1734 buf = realloc (buf, size + 1);
1737 fprintf (stderr, "out of memory\n");
1746 main (int argc, char * argv[])
1748 setlocale (LC_CTYPE, "");
1751 /* Display all the break opportunities in the input string. */
1752 char *input = read_file (stdin);
1753 int length = strlen (input);
1754 char *breaks = malloc (length);
1757 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1759 for (i = 0; i < length; i++)
1763 case UC_BREAK_POSSIBLE:
1766 case UC_BREAK_MANDATORY:
1768 case UC_BREAK_PROHIBITED:
1773 putc (input[i], stdout);
1782 /* Insert line breaks for a given width. */
1783 int width = atoi (argv[1]);
1784 char *input = read_file (stdin);
1785 int length = strlen (input);
1786 char *breaks = malloc (length);
1789 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1791 for (i = 0; i < length; i++)
1795 case UC_BREAK_POSSIBLE:
1796 putc ('\n', stdout);
1798 case UC_BREAK_MANDATORY:
1800 case UC_BREAK_PROHIBITED:
1805 putc (input[i], stdout);