1 /* Line breaking of strings.
2 Copyright (C) 2001-2003, 2006-2008 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
32 #include "unilbrk/ulc-common.h"
34 /* Line breaking of a string in an arbitrary encoding.
36 We convert the input string to Unicode.
38 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
39 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
40 \U0000FFFF. UTF-16 and variants support only characters up to
41 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
42 UCS-4 specification leaves doubts about endianness and byte order mark.
43 glibc currently interprets it as big endian without byte order mark,
44 but this is not backed by an RFC. So we use UTF-8. It supports
45 characters up to \U7FFFFFFF and is unambiguously defined. */
48 ulc_width_linebreaks (const char *s, size_t n,
49 int width, int start_column, int at_end_columns,
50 const char *o, const char *encoding,
55 if (is_utf8_encoding (encoding))
56 return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
61 /* Avoid glibc-2.1 bug with EUC-KR. */
62 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
63 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
64 to_utf8 = (iconv_t)(-1);
67 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, GB18030. */
69 # if defined __sun && !defined _LIBICONV_VERSION
70 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
71 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
72 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
73 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
74 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
75 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
76 to_utf8 = (iconv_t)(-1);
79 to_utf8 = iconv_open (UTF8_NAME, encoding);
80 if (to_utf8 != (iconv_t)(-1))
82 /* Determine the length of the resulting UTF-8 string. */
83 size_t m = iconv_string_length (to_utf8, s, n);
84 if (m != (size_t)(-1))
86 /* Convert the string to UTF-8 and build a translation table
87 from offsets into s to offsets into the translated string. */
89 xsum4 (xtimes (n, sizeof (size_t)), m, m,
93 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
96 size_t *offtable = (size_t *) memory;
97 char *t = (char *) (offtable + n);
98 char *q = (char *) (t + m);
99 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
103 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
105 /* Translate the overrides to the UTF-8 string. */
108 memset (o8, UC_BREAK_UNDEFINED, m);
109 for (i = 0; i < n; i++)
110 if (offtable[i] != (size_t)(-1))
111 o8[offtable[i]] = o[i];
114 /* Determine the line breaks of the UTF-8 string. */
116 u8_width_linebreaks ((const uint8_t *) t, m, width, start_column, at_end_columns, o8, encoding, q);
118 /* Translate the result back to the original string. */
119 memset (p, UC_BREAK_PROHIBITED, n);
120 for (i = 0; i < n; i++)
121 if (offtable[i] != (size_t)(-1))
122 p[i] = q[offtable[i]];
125 iconv_close (to_utf8);
129 iconv_close (to_utf8);
132 /* Impossible to convert. */
134 if (is_all_ascii (s, n))
136 /* ASCII is a subset of UTF-8. */
137 return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
140 /* We have a non-ASCII string and cannot convert it.
141 Don't produce line breaks except those already present in the
142 input string. All we assume here is that the encoding is
143 minimally ASCII compatible. */
145 const char *s_end = s + n;
148 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
150 : UC_BREAK_PROHIBITED);
156 /* We cannot compute widths in this case. */
168 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
171 read_file (FILE *stream)
179 while (! feof (stream))
181 if (size + BUFSIZE > alloc)
183 alloc = alloc + alloc / 2;
184 if (alloc < size + BUFSIZE)
185 alloc = size + BUFSIZE;
186 buf = realloc (buf, alloc);
189 fprintf (stderr, "out of memory\n");
193 count = fread (buf + size, 1, BUFSIZE, stream);
205 buf = realloc (buf, size + 1);
208 fprintf (stderr, "out of memory\n");
217 main (int argc, char * argv[])
219 setlocale (LC_CTYPE, "");
222 /* Insert line breaks for a given width. */
223 int width = atoi (argv[1]);
224 char *input = read_file (stdin);
225 int length = strlen (input);
226 char *breaks = malloc (length);
229 ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
231 for (i = 0; i < length; i++)
235 case UC_BREAK_POSSIBLE:
238 case UC_BREAK_MANDATORY:
240 case UC_BREAK_PROHIBITED:
245 putc (input[i], stdout);