pintos-os.org Git - pspp/blob - lib/unilbrk/ulc-width-linebreaks.c

   1 /* Line breaking of strings.
   2    Copyright (C) 2001-2003, 2006-2008 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This program is free software: you can redistribute it and/or modify it
   6    under the terms of the GNU Lesser General Public License as published
   7    by the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "unilbrk.h"
  22
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #if HAVE_ICONV
  26 # include <iconv.h>
  27 #endif
  28
  29 #include "c-ctype.h"
  30 #include "streq.h"
  31 #include "xsize.h"
  32 #include "unilbrk/ulc-common.h"
  33
  34 /* Line breaking of a string in an arbitrary encoding.
  35
  36    We convert the input string to Unicode.
  37
  38    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
  39    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
  40    \U0000FFFF.  UTF-16 and variants support only characters up to
  41    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
  42    UCS-4 specification leaves doubts about endianness and byte order mark.
  43    glibc currently interprets it as big endian without byte order mark,
  44    but this is not backed by an RFC.  So we use UTF-8. It supports
  45    characters up to \U7FFFFFFF and is unambiguously defined.  */
  46
  47 int
  48 ulc_width_linebreaks (const char *s, size_t n,
  49                       int width, int start_column, int at_end_columns,
  50                       const char *o, const char *encoding,
  51                       char *p)
  52 {
  53   if (n == 0)
  54     return start_column;
  55   if (is_utf8_encoding (encoding))
  56     return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
  57   else
  58     {
  59 #if HAVE_ICONV
  60       iconv_t to_utf8;
  61       /* Avoid glibc-2.1 bug with EUC-KR.  */
  62 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
  63       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
  64         to_utf8 = (iconv_t)(-1);
  65       else
  66 # endif
  67       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
  68          GB18030.  */
  69 # if defined __sun && !defined _LIBICONV_VERSION
  70       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
  71           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
  72           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
  73           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
  74           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
  75           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
  76         to_utf8 = (iconv_t)(-1);
  77       else
  78 # endif
  79       to_utf8 = iconv_open (UTF8_NAME, encoding);
  80       if (to_utf8 != (iconv_t)(-1))
  81         {
  82           /* Determine the length of the resulting UTF-8 string.  */
  83           size_t m = iconv_string_length (to_utf8, s, n);
  84           if (m != (size_t)(-1))
  85             {
  86               /* Convert the string to UTF-8 and build a translation table
  87                  from offsets into s to offsets into the translated string.  */
  88               size_t memory_size =
  89                 xsum4 (xtimes (n, sizeof (size_t)), m, m,
  90                        (o != NULL ? m : 0));
  91               char *memory =
  92                 (char *)
  93                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
  94               if (memory != NULL)
  95                 {
  96                   size_t *offtable = (size_t *) memory;
  97                   char *t = (char *) (offtable + n);
  98                   char *q = (char *) (t + m);
  99                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
 100                   int res_column;
 101                   size_t i;
 102
 103                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
 104
 105                   /* Translate the overrides to the UTF-8 string.  */
 106                   if (o != NULL)
 107                     {
 108                       memset (o8, UC_BREAK_UNDEFINED, m);
 109                       for (i = 0; i < n; i++)
 110                         if (offtable[i] != (size_t)(-1))
 111                           o8[offtable[i]] = o[i];
 112                     }
 113
 114                   /* Determine the line breaks of the UTF-8 string.  */
 115                   res_column =
 116                     u8_width_linebreaks ((const uint8_t *) t, m, width, start_column, at_end_columns, o8, encoding, q);
 117
 118                   /* Translate the result back to the original string.  */
 119                   memset (p, UC_BREAK_PROHIBITED, n);
 120                   for (i = 0; i < n; i++)
 121                     if (offtable[i] != (size_t)(-1))
 122                       p[i] = q[offtable[i]];
 123
 124                   free (memory);
 125                   iconv_close (to_utf8);
 126                   return res_column;
 127                 }
 128             }
 129           iconv_close (to_utf8);
 130         }
 131 #endif
 132       /* Impossible to convert.  */
 133 #if C_CTYPE_ASCII
 134       if (is_all_ascii (s, n))
 135         {
 136           /* ASCII is a subset of UTF-8.  */
 137           return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
 138         }
 139 #endif
 140       /* We have a non-ASCII string and cannot convert it.
 141          Don't produce line breaks except those already present in the
 142          input string.  All we assume here is that the encoding is
 143          minimally ASCII compatible.  */
 144       {
 145         const char *s_end = s + n;
 146         while (s < s_end)
 147           {
 148             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
 149                   ? UC_BREAK_MANDATORY
 150                   : UC_BREAK_PROHIBITED);
 151             s++;
 152             p++;
 153             if (o != NULL)
 154               o++;
 155           }
 156         /* We cannot compute widths in this case.  */
 157         return start_column;
 158       }
 159     }
 160 }
 161
 162
 163 #ifdef TEST
 164
 165 #include <stdio.h>
 166 #include <locale.h>
 167
 168 /* Read the contents of an input stream, and return it, terminated with a NUL
 169    byte. */
 170 char *
 171 read_file (FILE *stream)
 172 {
 173 #define BUFSIZE 4096
 174   char *buf = NULL;
 175   int alloc = 0;
 176   int size = 0;
 177   int count;
 178
 179   while (! feof (stream))
 180     {
 181       if (size + BUFSIZE > alloc)
 182         {
 183           alloc = alloc + alloc / 2;
 184           if (alloc < size + BUFSIZE)
 185             alloc = size + BUFSIZE;
 186           buf = realloc (buf, alloc);
 187           if (buf == NULL)
 188             {
 189               fprintf (stderr, "out of memory\n");
 190               exit (1);
 191             }
 192         }
 193       count = fread (buf + size, 1, BUFSIZE, stream);
 194       if (count == 0)
 195         {
 196           if (ferror (stream))
 197             {
 198               perror ("fread");
 199               exit (1);
 200             }
 201         }
 202       else
 203         size += count;
 204     }
 205   buf = realloc (buf, size + 1);
 206   if (buf == NULL)
 207     {
 208       fprintf (stderr, "out of memory\n");
 209       exit (1);
 210     }
 211   buf[size] = '\0';
 212   return buf;
 213 #undef BUFSIZE
 214 }
 215
 216 int
 217 main (int argc, char * argv[])
 218 {
 219   setlocale (LC_CTYPE, "");
 220   if (argc == 2)
 221     {
 222       /* Insert line breaks for a given width.  */
 223       int width = atoi (argv[1]);
 224       char *input = read_file (stdin);
 225       int length = strlen (input);
 226       char *breaks = malloc (length);
 227       int i;
 228
 229       ulc_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
 230
 231       for (i = 0; i < length; i++)
 232         {
 233           switch (breaks[i])
 234             {
 235             case UC_BREAK_POSSIBLE:
 236               putc ('\n', stdout);
 237               break;
 238             case UC_BREAK_MANDATORY:
 239               break;
 240             case UC_BREAK_PROHIBITED:
 241               break;
 242             default:
 243               abort ();
 244             }
 245           putc (input[i], stdout);
 246         }
 247
 248       free (breaks);
 249
 250       return 0;
 251     }
 252   else
 253     return 1;
 254 }
 255
 256 #endif /* TEST */