1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003, 2006-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include "linebreak.h"
29 #include "uniwidth/cjk.h"
/* Test whether ENCODING is the name "UTF-8".  STREQ is handed the name both
   as a string literal and as individual characters padded with zeros.  */
34 is_utf8_encoding (const char *encoding)
36 if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
42 /* Determine the line break points in S, and store the result at p[0..n-1]. */
43 /* We don't support line breaking of complex-context dependent characters
44 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
46 /* Line breaking classification. */
50 /* Values >= 20 are resolved at run time. */
51 LBP_BK = 0, /* mandatory break */
52 /*LBP_CR, carriage return - not used here because it's a DOSism */
53 /*LBP_LF, line feed - not used here because it's a DOSism */
54 LBP_CM = 20, /* attached characters and combining marks */
55 /*LBP_SG, surrogates - not used here because they are not characters */
56 LBP_ZW = 1, /* zero width space */
57 LBP_IN = 2, /* inseparable */
58 LBP_GL = 3, /* non-breaking (glue) */
59 LBP_CB = 22, /* contingent break opportunity */
60 LBP_SP = 21, /* space */
61 LBP_BA = 4, /* break opportunity after */
62 LBP_BB = 5, /* break opportunity before */
63 LBP_B2 = 6, /* break opportunity before and after */
64 LBP_HY = 7, /* hyphen */
65 LBP_NS = 8, /* non starter */
66 LBP_OP = 9, /* opening punctuation */
67 LBP_CL = 10, /* closing punctuation */
68 LBP_QU = 11, /* ambiguous quotation */
69 LBP_EX = 12, /* exclamation/interrogation */
70 LBP_ID = 13, /* ideographic */
71 LBP_NU = 14, /* numeric */
72 LBP_IS = 15, /* infix separator (numeric) */
73 LBP_SY = 16, /* symbols allowing breaks */
74 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
75 LBP_PR = 18, /* prefix (numeric) */
76 LBP_PO = 19, /* postfix (numeric) */
77 LBP_SA = 23, /* complex context (South East Asian) */
78 LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
79 LBP_XX = 25 /* unknown */
/* Look up the line breaking property of the code point UC in the
   three-level table lbrkprop.  Each level is indexed by a slice of the bits
   of UC; the shift counts and masks are the lbrkprop_header_* constants.  */
84 static inline unsigned char
85 lbrkprop_lookup (unsigned int uc)
/* The high bits of UC select an entry of level1.  */
87 unsigned int index1 = uc >> lbrkprop_header_0;
/* Only indices below lbrkprop_header_1 are looked up in level1.  */
88 if (index1 < lbrkprop_header_1)
90 int lookup1 = lbrkprop.level1[index1];
/* The middle bits, masked with lbrkprop_header_3, index into level2
   relative to the offset found in level1.  */
93 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
94 int lookup2 = lbrkprop.level2[lookup1 + index2];
/* The low bits, masked with lbrkprop_header_4, index into level3
   relative to the offset found in level2.  */
97 unsigned int index3 = uc & lbrkprop_header_4;
98 return lbrkprop.level3[lookup2 + index3];
105 /* Table indexed by two line breaking classifications. */
106 #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
107 #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
108 #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
109 static const unsigned char lbrk_table[19][19] = {
111 /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
112 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
113 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
114 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
115 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
116 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
117 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
118 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
119 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
120 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
121 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
122 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
123 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
124 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
125 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
126 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
127 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
128 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
129 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
130 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
134 /* Note: The (B2,B2) entry should probably be D instead of P. */
135 /* Note: The (PR,ID) entry should probably be D instead of I. */
138 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
140 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
141 const unsigned char *s_end = s + n;
142 int last_prop = LBP_BK; /* line break property of last non-space character */
143 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
144 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
146 /* Don't break inside multibyte characters. */
147 memset (p, UC_BREAK_PROHIBITED, n);
152 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
153 int prop = lbrkprop_lookup (uc);
157 /* Mandatory break. */
158 *p = UC_BREAK_MANDATORY;
167 /* Resolve property values whose behaviour is not fixed. */
171 /* Resolve ambiguous. */
172 prop = LBP_AI_REPLACEMENT;
175 /* This is arbitrary. */
179 /* We don't handle complex scripts yet.
180 Treat LBP_SA like LBP_XX. */
182 /* This is arbitrary. */
187 /* Deal with combining characters. */
191 /* Don't break just before a combining character. */
192 *p = UC_BREAK_PROHIBITED;
193 /* A combining character turns a preceding space into LBP_AL. */
194 if (seen_space != NULL)
197 seen_space = seen_space2;
199 goto lookup_via_table;
202 else if (prop == LBP_SP)
204 /* Don't break just before a space. */
205 *p = UC_BREAK_PROHIBITED;
206 seen_space2 = seen_space;
212 /* prop must be usable as an index for table 7.3 of UTR #14. */
213 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
216 if (last_prop == LBP_BK)
218 /* Don't break at the beginning of a line. */
219 *q = UC_BREAK_PROHIBITED;
223 switch (lbrk_table [last_prop-1] [prop-1])
226 *q = UC_BREAK_POSSIBLE;
229 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
232 *q = UC_BREAK_PROHIBITED;
250 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
252 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
253 const unsigned short *s_end = s + n;
254 int last_prop = LBP_BK; /* line break property of last non-space character */
255 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
256 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
258 /* Don't break inside multibyte characters. */
259 memset (p, UC_BREAK_PROHIBITED, n);
264 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
265 int prop = lbrkprop_lookup (uc);
269 /* Mandatory break. */
270 *p = UC_BREAK_MANDATORY;
279 /* Resolve property values whose behaviour is not fixed. */
283 /* Resolve ambiguous. */
284 prop = LBP_AI_REPLACEMENT;
287 /* This is arbitrary. */
291 /* We don't handle complex scripts yet.
292 Treat LBP_SA like LBP_XX. */
294 /* This is arbitrary. */
299 /* Deal with combining characters. */
303 /* Don't break just before a combining character. */
304 *p = UC_BREAK_PROHIBITED;
305 /* A combining character turns a preceding space into LBP_AL. */
306 if (seen_space != NULL)
309 seen_space = seen_space2;
311 goto lookup_via_table;
314 else if (prop == LBP_SP)
316 /* Don't break just before a space. */
317 *p = UC_BREAK_PROHIBITED;
318 seen_space2 = seen_space;
324 /* prop must be usable as an index for table 7.3 of UTR #14. */
325 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
328 if (last_prop == LBP_BK)
330 /* Don't break at the beginning of a line. */
331 *q = UC_BREAK_PROHIBITED;
335 switch (lbrk_table [last_prop-1] [prop-1])
338 *q = UC_BREAK_POSSIBLE;
341 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
344 *q = UC_BREAK_PROHIBITED;
362 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
364 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
365 const unsigned int *s_end = s + n;
366 int last_prop = LBP_BK; /* line break property of last non-space character */
367 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
368 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
372 unsigned int uc = *s;
373 int prop = lbrkprop_lookup (uc);
377 /* Mandatory break. */
378 *p = UC_BREAK_MANDATORY;
387 /* Resolve property values whose behaviour is not fixed. */
391 /* Resolve ambiguous. */
392 prop = LBP_AI_REPLACEMENT;
395 /* This is arbitrary. */
399 /* We don't handle complex scripts yet.
400 Treat LBP_SA like LBP_XX. */
402 /* This is arbitrary. */
407 /* Deal with combining characters. */
411 /* Don't break just before a combining character. */
412 *p = UC_BREAK_PROHIBITED;
413 /* A combining character turns a preceding space into LBP_AL. */
414 if (seen_space != NULL)
417 seen_space = seen_space2;
419 goto lookup_via_table;
422 else if (prop == LBP_SP)
424 /* Don't break just before a space. */
425 *p = UC_BREAK_PROHIBITED;
426 seen_space2 = seen_space;
432 /* prop must be usable as an index for table 7.3 of UTR #14. */
433 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
436 if (last_prop == LBP_BK)
438 /* Don't break at the beginning of a line. */
439 *q = UC_BREAK_PROHIBITED;
443 switch (lbrk_table [last_prop-1] [prop-1])
446 *q = UC_BREAK_POSSIBLE;
449 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
452 *q = UC_BREAK_PROHIBITED;
470 /* Choose the best line breaks, assuming the uc_width function.
471 Return the column after the end of the string. */
474 u8_width_linebreaks (const unsigned char *s, size_t n,
475 int width, int start_column, int at_end_columns,
476 const char *o, const char *encoding,
479 const unsigned char *s_end;
484 u8_possible_linebreaks (s, n, encoding, p);
488 last_column = start_column;
493 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
495 /* Respect the override. */
496 if (o != NULL && *o != UC_BREAK_UNDEFINED)
499 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
501 /* An atomic piece of text ends here. */
502 if (last_p != NULL && last_column + piece_width > width)
504 /* Insert a line break. */
505 *last_p = UC_BREAK_POSSIBLE;
510 if (*p == UC_BREAK_MANDATORY)
512 /* uc is a line break character. */
513 /* Start a new piece at column 0. */
520 /* uc is not a line break character. */
523 if (*p == UC_BREAK_POSSIBLE)
525 /* Start a new piece. */
527 last_column += piece_width;
529 /* No line break for the moment, may be turned into
530 UC_BREAK_POSSIBLE later, via last_p. */
533 *p = UC_BREAK_PROHIBITED;
535 w = uc_width (uc, encoding);
536 if (w >= 0) /* ignore control characters in the string */
546 /* The last atomic piece of text ends here. */
547 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
549 /* Insert a line break. */
550 *last_p = UC_BREAK_POSSIBLE;
554 return last_column + piece_width;
558 u16_width_linebreaks (const unsigned short *s, size_t n,
559 int width, int start_column, int at_end_columns,
560 const char *o, const char *encoding,
563 const unsigned short *s_end;
568 u16_possible_linebreaks (s, n, encoding, p);
572 last_column = start_column;
577 int count = u16_mbtouc_unsafe (&uc, s, s_end - s);
579 /* Respect the override. */
580 if (o != NULL && *o != UC_BREAK_UNDEFINED)
583 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
585 /* An atomic piece of text ends here. */
586 if (last_p != NULL && last_column + piece_width > width)
588 /* Insert a line break. */
589 *last_p = UC_BREAK_POSSIBLE;
594 if (*p == UC_BREAK_MANDATORY)
596 /* uc is a line break character. */
597 /* Start a new piece at column 0. */
604 /* uc is not a line break character. */
607 if (*p == UC_BREAK_POSSIBLE)
609 /* Start a new piece. */
611 last_column += piece_width;
613 /* No line break for the moment, may be turned into
614 UC_BREAK_POSSIBLE later, via last_p. */
617 *p = UC_BREAK_PROHIBITED;
619 w = uc_width (uc, encoding);
620 if (w >= 0) /* ignore control characters in the string */
630 /* The last atomic piece of text ends here. */
631 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
633 /* Insert a line break. */
634 *last_p = UC_BREAK_POSSIBLE;
638 return last_column + piece_width;
642 u32_width_linebreaks (const unsigned int *s, size_t n,
643 int width, int start_column, int at_end_columns,
644 const char *o, const char *encoding,
647 const unsigned int *s_end;
652 u32_possible_linebreaks (s, n, encoding, p);
656 last_column = start_column;
660 unsigned int uc = *s;
662 /* Respect the override. */
663 if (o != NULL && *o != UC_BREAK_UNDEFINED)
666 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
668 /* An atomic piece of text ends here. */
669 if (last_p != NULL && last_column + piece_width > width)
671 /* Insert a line break. */
672 *last_p = UC_BREAK_POSSIBLE;
677 if (*p == UC_BREAK_MANDATORY)
679 /* uc is a line break character. */
680 /* Start a new piece at column 0. */
687 /* uc is not a line break character. */
690 if (*p == UC_BREAK_POSSIBLE)
692 /* Start a new piece. */
694 last_column += piece_width;
696 /* No line break for the moment, may be turned into
697 UC_BREAK_POSSIBLE later, via last_p. */
700 *p = UC_BREAK_PROHIBITED;
702 w = uc_width (uc, encoding);
703 if (w >= 0) /* ignore control characters in the string */
713 /* The last atomic piece of text ends here. */
714 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
716 /* Insert a line break. */
717 *last_p = UC_BREAK_POSSIBLE;
721 return last_column + piece_width;
729 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
732 read_file (FILE *stream)
740 while (! feof (stream))
742 if (size + BUFSIZE > alloc)
744 alloc = alloc + alloc / 2;
745 if (alloc < size + BUFSIZE)
746 alloc = size + BUFSIZE;
747 buf = realloc (buf, alloc);
750 fprintf (stderr, "out of memory\n");
754 count = fread (buf + size, 1, BUFSIZE, stream);
766 buf = realloc (buf, size + 1);
769 fprintf (stderr, "out of memory\n");
778 main (int argc, char * argv[])
782 /* Display all the break opportunities in the input string. */
783 char *input = read_file (stdin);
784 int length = strlen (input);
785 char *breaks = malloc (length);
788 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
790 for (i = 0; i < length; i++)
794 case UC_BREAK_POSSIBLE:
795 /* U+2027 in UTF-8 encoding */
796 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
798 case UC_BREAK_MANDATORY:
799 /* U+21B2 (or U+21B5) in UTF-8 encoding */
800 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
802 case UC_BREAK_PROHIBITED:
807 putc (input[i], stdout);
816 /* Insert line breaks for a given width. */
817 int width = atoi (argv[1]);
818 char *input = read_file (stdin);
819 int length = strlen (input);
820 char *breaks = malloc (length);
823 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
825 for (i = 0; i < length; i++)
829 case UC_BREAK_POSSIBLE:
832 case UC_BREAK_MANDATORY:
834 case UC_BREAK_PROHIBITED:
839 putc (input[i], stdout);
853 /* Now the same thing with an arbitrary encoding.
855 We convert the input string to Unicode.
857 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
858 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
859 \U0000FFFF. UTF-16 and variants support only characters up to
860 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
861 UCS-4 specification leaves doubts about endianness and byte order mark.
862 glibc currently interprets it as big endian without byte order mark,
863 but this is not backed by an RFC. So we use UTF-8. It supports
864 characters up to \U7FFFFFFF and is unambiguously defined. */
871 /* Luckily, the encoding's name is platform independent. */
872 #define UTF8_NAME "UTF-8"
874 /* Return the length of a string after conversion through an iconv_t. */
876 iconv_string_length (iconv_t cd, const char *s, size_t n)
/* Size of the scratch buffer that receives the converted bytes.  */
878 #define TMPBUFSIZE 4096
880 char tmpbuf[TMPBUFSIZE];
881 const char *inptr = s;
/* Convert into the scratch buffer, starting at its beginning each time.
   Only the number of bytes produced is kept, not the bytes themselves.  */
885 char *outptr = tmpbuf;
886 size_t outsize = TMPBUFSIZE;
887 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
/* E2BIG (scratch buffer full) is tolerated; this test singles out every
   other iconv failure.  */
888 if (res == (size_t)(-1) && errno != E2BIG)
/* Add the number of bytes this call wrote into tmpbuf.  */
890 count += outptr - tmpbuf;
892 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
893 #if defined _LIBICONV_VERSION \
894 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
896 char *outptr = tmpbuf;
897 size_t outsize = TMPBUFSIZE;
/* A NULL input makes iconv emit whatever is needed to reach the initial
   shift state; those bytes are counted as well.  */
898 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
899 if (res == (size_t)(-1))
901 count += outptr - tmpbuf;
903 /* Return to the initial state. */
904 iconv (cd, NULL, NULL, NULL, NULL);
/* Convert S[0..N-1] through CD into the buffer T, and fill OFFTABLE[0..N-1]:
   for each input offset at which a conversion step starts, OFFTABLE holds the
   offset into T at which its output starts; all other entries are
   (size_t)(-1).  M is the output size computed beforehand; the final check
   verifies that exactly that much output was produced.  */
911 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
912 size_t *offtable, char *t, size_t m)
919 /* Avoid glibc-2.1 bug. */
920 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
921 const size_t extra = 1;
923 const size_t extra = 0;
/* Start with every input offset marked as unmapped.  */
926 for (i = 0; i < n; i++)
927 offtable[i] = (size_t)(-1);
933 while (inptr < s_end)
935 const char *saved_inptr;
/* Record where in T the output for the input at this offset begins.  */
939 offtable[inptr - s] = outptr - t;
/* Offer iconv one more input byte at a time, until it no longer reports
   EINVAL (an incomplete multibyte sequence).  */
943 for (insize = 1; inptr + insize <= s_end; insize++)
945 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
946 if (!(res == (size_t)(-1) && errno == EINVAL))
948 /* We expect that no input bytes have been consumed so far. */
949 if (inptr != saved_inptr)
952 /* After we verified the convertibility and computed the translation's
953 size m, there shouldn't be any conversion error here. */
954 if (res == (size_t)(-1))
957 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
958 #if defined _LIBICONV_VERSION \
959 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
960 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
963 /* We should have produced exactly m output bytes. */
964 if (outsize != extra)
968 #endif /* HAVE_ICONV */
972 /* Tests whether a string is entirely ASCII. Returns 1 if yes.
973 Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */
975 is_all_ascii (const char *s, size_t n)
/* Walk over all N bytes of S.  */
977 for (; n > 0; s++, n--)
979 unsigned char c = (unsigned char) *s;
/* Look for a byte that is neither printable nor whitespace according to
   the c_isprint and c_isspace predicates.  */
981 if (!(c_isprint (c) || c_isspace (c)))
987 #endif /* C_CTYPE_ASCII */
990 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
995 if (is_utf8_encoding (encoding))
996 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1001 /* Avoid glibc-2.1 bug with EUC-KR. */
1002 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1003 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1004 to_utf8 = (iconv_t)(-1);
1007 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
1009 # if defined __sun && !defined _LIBICONV_VERSION
1010 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1011 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1012 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1013 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1014 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1015 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1016 to_utf8 = (iconv_t)(-1);
1019 to_utf8 = iconv_open (UTF8_NAME, encoding);
1020 if (to_utf8 != (iconv_t)(-1))
1022 /* Determine the length of the resulting UTF-8 string. */
1023 size_t m = iconv_string_length (to_utf8, s, n);
1024 if (m != (size_t)(-1))
1026 /* Convert the string to UTF-8 and build a translation table
1027 from offsets into s to offsets into the translated string. */
1028 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1030 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1033 size_t *offtable = (size_t *) memory;
1034 char *t = (char *) (offtable + n);
1035 char *q = (char *) (t + m);
1038 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1040 /* Determine the possible line breaks of the UTF-8 string. */
1041 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1043 /* Translate the result back to the original string. */
1044 memset (p, UC_BREAK_PROHIBITED, n);
1045 for (i = 0; i < n; i++)
1046 if (offtable[i] != (size_t)(-1))
1047 p[i] = q[offtable[i]];
1050 iconv_close (to_utf8);
1054 iconv_close (to_utf8);
1057 /* Impossible to convert. */
1059 if (is_all_ascii (s, n))
1061 /* ASCII is a subset of UTF-8. */
1062 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1066 /* We have a non-ASCII string and cannot convert it.
1067 Don't produce line breaks except those already present in the
1068 input string. All we assume here is that the encoding is
1069 minimally ASCII compatible. */
1071 const char *s_end = s + n;
1074 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1083 mbs_width_linebreaks (const char *s, size_t n,
1084 int width, int start_column, int at_end_columns,
1085 const char *o, const char *encoding,
1089 return start_column;
1090 if (is_utf8_encoding (encoding))
1091 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1096 /* Avoid glibc-2.1 bug with EUC-KR. */
1097 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1098 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1099 to_utf8 = (iconv_t)(-1);
1102 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
1104 # if defined __sun && !defined _LIBICONV_VERSION
1105 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1106 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1107 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1108 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1109 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1110 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1111 to_utf8 = (iconv_t)(-1);
1114 to_utf8 = iconv_open (UTF8_NAME, encoding);
1115 if (to_utf8 != (iconv_t)(-1))
1117 /* Determine the length of the resulting UTF-8 string. */
1118 size_t m = iconv_string_length (to_utf8, s, n);
1119 if (m != (size_t)(-1))
1121 /* Convert the string to UTF-8 and build a translation table
1122 from offsets into s to offsets into the translated string. */
1123 size_t memory_size =
1124 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1125 (o != NULL ? m : 0));
1128 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1131 size_t *offtable = (size_t *) memory;
1132 char *t = (char *) (offtable + n);
1133 char *q = (char *) (t + m);
1134 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1138 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1140 /* Translate the overrides to the UTF-8 string. */
1143 memset (o8, UC_BREAK_UNDEFINED, m);
1144 for (i = 0; i < n; i++)
1145 if (offtable[i] != (size_t)(-1))
1146 o8[offtable[i]] = o[i];
1149 /* Determine the line breaks of the UTF-8 string. */
1151 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1153 /* Translate the result back to the original string. */
1154 memset (p, UC_BREAK_PROHIBITED, n);
1155 for (i = 0; i < n; i++)
1156 if (offtable[i] != (size_t)(-1))
1157 p[i] = q[offtable[i]];
1160 iconv_close (to_utf8);
1164 iconv_close (to_utf8);
1167 /* Impossible to convert. */
1169 if (is_all_ascii (s, n))
1171 /* ASCII is a subset of UTF-8. */
1172 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1175 /* We have a non-ASCII string and cannot convert it.
1176 Don't produce line breaks except those already present in the
1177 input string. All we assume here is that the encoding is
1178 minimally ASCII compatible. */
1180 const char *s_end = s + n;
1183 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1184 ? UC_BREAK_MANDATORY
1185 : UC_BREAK_PROHIBITED);
1191 /* We cannot compute widths in this case. */
1192 return start_column;
1203 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
1206 read_file (FILE *stream)
1208 #define BUFSIZE 4096
1214 while (! feof (stream))
1216 if (size + BUFSIZE > alloc)
1218 alloc = alloc + alloc / 2;
1219 if (alloc < size + BUFSIZE)
1220 alloc = size + BUFSIZE;
1221 buf = realloc (buf, alloc);
1224 fprintf (stderr, "out of memory\n");
1228 count = fread (buf + size, 1, BUFSIZE, stream);
1231 if (ferror (stream))
1240 buf = realloc (buf, size + 1);
1243 fprintf (stderr, "out of memory\n");
1252 main (int argc, char * argv[])
1254 setlocale (LC_CTYPE, "");
1257 /* Display all the break opportunities in the input string. */
1258 char *input = read_file (stdin);
1259 int length = strlen (input);
1260 char *breaks = malloc (length);
1263 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1265 for (i = 0; i < length; i++)
1269 case UC_BREAK_POSSIBLE:
1272 case UC_BREAK_MANDATORY:
1274 case UC_BREAK_PROHIBITED:
1279 putc (input[i], stdout);
1288 /* Insert line breaks for a given width. */
1289 int width = atoi (argv[1]);
1290 char *input = read_file (stdin);
1291 int length = strlen (input);
1292 char *breaks = malloc (length);
1295 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1297 for (i = 0; i < length; i++)
1301 case UC_BREAK_POSSIBLE:
1302 putc ('\n', stdout);
1304 case UC_BREAK_MANDATORY:
1306 case UC_BREAK_PROHIBITED:
1311 putc (input[i], stdout);