pintos-os.org Git - pspp/blob - lib/unilbrk/gen-lbrk.c

   1 /* Generate a Unicode conforming Line Break Properties tables from a
   2    UnicodeData file.
   3    Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.
   4    Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 /* Usage example:
  20      $ gen-lbrk /usr/local/share/Unidata/UnicodeData.txt \
  21                 /usr/local/share/Unidata/EastAsianWidth.txt \
  22                 /usr/local/share/Unidata/LineBreak.txt \
  23                 5.0.0
  24  */
  25
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdbool.h>
  29 #include <stdint.h>
  30 #include <string.h>
  31 #include <time.h>
  32
  33 /* This structure represents one line in the UnicodeData.txt file.  */
  34 struct unicode_attribute
  35 {
  36   const char *name;           /* Character name */
  37   const char *category;       /* General category */
  38   const char *combining;      /* Canonical combining classes */
  39   const char *bidi;           /* Bidirectional category */
  40   const char *decomposition;  /* Character decomposition mapping */
  41   const char *decdigit;       /* Decimal digit value */
  42   const char *digit;          /* Digit value */
  43   const char *numeric;        /* Numeric value */
  44   int mirrored;               /* mirrored */
  45   const char *oldname;        /* Old Unicode 1.0 name */
  46   const char *comment;        /* Comment */
  47   unsigned int upper;         /* Uppercase mapping */
  48   unsigned int lower;         /* Lowercase mapping */
  49   unsigned int title;         /* Titlecase mapping */
  50 };
  51
  52 /* Missing fields are represented with "" for strings, and NONE for
  53    characters.  */
  54 #define NONE (~(unsigned int)0)
  55
  56 /* The entire contents of the UnicodeData.txt file.  */
  57 struct unicode_attribute unicode_attributes [0x110000];
  58
  59 /* Stores in unicode_attributes[i] the values from the given fields.  */
  60 static void
  61 fill_attribute (unsigned int i,
  62                 const char *field1, const char *field2,
  63                 const char *field3, const char *field4,
  64                 const char *field5, const char *field6,
  65                 const char *field7, const char *field8,
  66                 const char *field9, const char *field10,
  67                 const char *field11, const char *field12,
  68                 const char *field13, const char *field14)
  69 {
  70   struct unicode_attribute * uni;
  71
  72   if (i >= 0x110000)
  73     {
  74       fprintf (stderr, "index too large\n");
  75       exit (1);
  76     }
  77   uni = &unicode_attributes[i];
  78   /* Copy the strings.  */
  79   uni->name          = strdup (field1);
  80   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
  81   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
  82   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
  83   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
  84   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
  85   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
  86   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
  87   uni->mirrored      = (field9[0] == 'Y');
  88   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
  89   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
  90   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
  91   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
  92   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
  93 }
  94
  95 /* Maximum length of a field in the UnicodeData.txt file.  */
  96 #define FIELDLEN 120
  97
  98 /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
  99    Reads up to (but excluding) DELIM.
 100    Returns 1 when a field was successfully read, otherwise 0.  */
 101 static int
 102 getfield (FILE *stream, char *buffer, int delim)
 103 {
 104   int count = 0;
 105   int c;
 106
 107   for (; (c = getc (stream)), (c != EOF && c != delim); )
 108     {
 109       /* The original unicode.org UnicodeData.txt file happens to have
 110          CR/LF line terminators.  Silently convert to LF.  */
 111       if (c == '\r')
 112         continue;
 113
 114       /* Put c into the buffer.  */
 115       if (++count >= FIELDLEN - 1)
 116         {
 117           fprintf (stderr, "field too long\n");
 118           exit (1);
 119         }
 120       *buffer++ = c;
 121     }
 122
 123   if (c == EOF)
 124     return 0;
 125
 126   *buffer = '\0';
 127   return 1;
 128 }
 129
 130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
 131    file.  */
 132 static void
 133 fill_attributes (const char *unicodedata_filename)
 134 {
 135   unsigned int i, j;
 136   FILE *stream;
 137   char field0[FIELDLEN];
 138   char field1[FIELDLEN];
 139   char field2[FIELDLEN];
 140   char field3[FIELDLEN];
 141   char field4[FIELDLEN];
 142   char field5[FIELDLEN];
 143   char field6[FIELDLEN];
 144   char field7[FIELDLEN];
 145   char field8[FIELDLEN];
 146   char field9[FIELDLEN];
 147   char field10[FIELDLEN];
 148   char field11[FIELDLEN];
 149   char field12[FIELDLEN];
 150   char field13[FIELDLEN];
 151   char field14[FIELDLEN];
 152   int lineno = 0;
 153
 154   for (i = 0; i < 0x110000; i++)
 155     unicode_attributes[i].name = NULL;
 156
 157   stream = fopen (unicodedata_filename, "r");
 158   if (stream == NULL)
 159     {
 160       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
 161       exit (1);
 162     }
 163
 164   for (;;)
 165     {
 166       int n;
 167
 168       lineno++;
 169       n = getfield (stream, field0, ';');
 170       n += getfield (stream, field1, ';');
 171       n += getfield (stream, field2, ';');
 172       n += getfield (stream, field3, ';');
 173       n += getfield (stream, field4, ';');
 174       n += getfield (stream, field5, ';');
 175       n += getfield (stream, field6, ';');
 176       n += getfield (stream, field7, ';');
 177       n += getfield (stream, field8, ';');
 178       n += getfield (stream, field9, ';');
 179       n += getfield (stream, field10, ';');
 180       n += getfield (stream, field11, ';');
 181       n += getfield (stream, field12, ';');
 182       n += getfield (stream, field13, ';');
 183       n += getfield (stream, field14, '\n');
 184       if (n == 0)
 185         break;
 186       if (n != 15)
 187         {
 188           fprintf (stderr, "short line in'%s':%d\n",
 189                    unicodedata_filename, lineno);
 190           exit (1);
 191         }
 192       i = strtoul (field0, NULL, 16);
 193       if (field1[0] == '<'
 194           && strlen (field1) >= 9
 195           && !strcmp (field1 + strlen(field1) - 8, ", First>"))
 196         {
 197           /* Deal with a range. */
 198           lineno++;
 199           n = getfield (stream, field0, ';');
 200           n += getfield (stream, field1, ';');
 201           n += getfield (stream, field2, ';');
 202           n += getfield (stream, field3, ';');
 203           n += getfield (stream, field4, ';');
 204           n += getfield (stream, field5, ';');
 205           n += getfield (stream, field6, ';');
 206           n += getfield (stream, field7, ';');
 207           n += getfield (stream, field8, ';');
 208           n += getfield (stream, field9, ';');
 209           n += getfield (stream, field10, ';');
 210           n += getfield (stream, field11, ';');
 211           n += getfield (stream, field12, ';');
 212           n += getfield (stream, field13, ';');
 213           n += getfield (stream, field14, '\n');
 214           if (n != 15)
 215             {
 216               fprintf (stderr, "missing end range in '%s':%d\n",
 217                        unicodedata_filename, lineno);
 218               exit (1);
 219             }
 220           if (!(field1[0] == '<'
 221                 && strlen (field1) >= 8
 222                 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
 223             {
 224               fprintf (stderr, "missing end range in '%s':%d\n",
 225                        unicodedata_filename, lineno);
 226               exit (1);
 227             }
 228           field1[strlen (field1) - 7] = '\0';
 229           j = strtoul (field0, NULL, 16);
 230           for (; i <= j; i++)
 231             fill_attribute (i, field1+1, field2, field3, field4, field5,
 232                                field6, field7, field8, field9, field10,
 233                                field11, field12, field13, field14);
 234         }
 235       else
 236         {
 237           /* Single character line */
 238           fill_attribute (i, field1, field2, field3, field4, field5,
 239                              field6, field7, field8, field9, field10,
 240                              field11, field12, field13, field14);
 241         }
 242     }
 243   if (ferror (stream) || fclose (stream))
 244     {
 245       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
 246       exit (1);
 247     }
 248 }
 249
 250 /* The width property from the EastAsianWidth.txt file.
 251    Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
 252 const char * unicode_width[0x110000];
 253
 254 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
 255    file.  */
 256 static void
 257 fill_width (const char *width_filename)
 258 {
 259   unsigned int i, j;
 260   FILE *stream;
 261   char field0[FIELDLEN];
 262   char field1[FIELDLEN];
 263   char field2[FIELDLEN];
 264   int lineno = 0;
 265
 266   for (i = 0; i < 0x110000; i++)
 267     unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
 268
 269   stream = fopen (width_filename, "r");
 270   if (stream == NULL)
 271     {
 272       fprintf (stderr, "error during fopen of '%s'\n", width_filename);
 273       exit (1);
 274     }
 275
 276   for (;;)
 277     {
 278       int n;
 279       int c;
 280
 281       lineno++;
 282       c = getc (stream);
 283       if (c == EOF)
 284         break;
 285       if (c == '#')
 286         {
 287           do c = getc (stream); while (c != EOF && c != '\n');
 288           continue;
 289         }
 290       ungetc (c, stream);
 291       n = getfield (stream, field0, ';');
 292       n += getfield (stream, field1, ' ');
 293       n += getfield (stream, field2, '\n');
 294       if (n == 0)
 295         break;
 296       if (n != 3)
 297         {
 298           fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
 299           exit (1);
 300         }
 301       i = strtoul (field0, NULL, 16);
 302       if (strstr (field0, "..") != NULL)
 303         {
 304           /* Deal with a range.  */
 305           j = strtoul (strstr (field0, "..") + 2, NULL, 16);
 306           for (; i <= j; i++)
 307             unicode_width[i] = strdup (field1);
 308         }
 309       else
 310         {
 311           /* Single character line.  */
 312           unicode_width[i] = strdup (field1);
 313         }
 314     }
 315   if (ferror (stream) || fclose (stream))
 316     {
 317       fprintf (stderr, "error reading from '%s'\n", width_filename);
 318       exit (1);
 319     }
 320 }
 321
 322 /* Line breaking classification.  */
 323
 324 enum
 325 {
 326   /* Values >= 24 are resolved at run time. */
 327   LBP_BK = 24, /* mandatory break */
 328 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 329 /*LBP_LF,         line feed - not used here because it's a DOSism */
 330   LBP_CM = 25, /* attached characters and combining marks */
 331 /*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
 332 /*LBP_SG,         surrogates - not used here because they are not characters */
 333   LBP_WJ =  0, /* word joiner */
 334   LBP_ZW = 26, /* zero width space */
 335   LBP_GL =  1, /* non-breaking (glue) */
 336   LBP_SP = 27, /* space */
 337   LBP_B2 =  2, /* break opportunity before and after */
 338   LBP_BA =  3, /* break opportunity after */
 339   LBP_BB =  4, /* break opportunity before */
 340   LBP_HY =  5, /* hyphen */
 341   LBP_CB = 28, /* contingent break opportunity */
 342   LBP_CL =  6, /* closing punctuation */
 343   LBP_EX =  7, /* exclamation/interrogation */
 344   LBP_IN =  8, /* inseparable */
 345   LBP_NS =  9, /* non starter */
 346   LBP_OP = 10, /* opening punctuation */
 347   LBP_QU = 11, /* ambiguous quotation */
 348   LBP_IS = 12, /* infix separator (numeric) */
 349   LBP_NU = 13, /* numeric */
 350   LBP_PO = 14, /* postfix (numeric) */
 351   LBP_PR = 15, /* prefix (numeric) */
 352   LBP_SY = 16, /* symbols allowing breaks */
 353   LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
 354   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
 355   LBP_H2 = 18, /* Hangul LV syllable */
 356   LBP_H3 = 19, /* Hangul LVT syllable */
 357   LBP_ID = 20, /* ideographic */
 358   LBP_JL = 21, /* Hangul L Jamo */
 359   LBP_JV = 22, /* Hangul V Jamo */
 360   LBP_JT = 23, /* Hangul T Jamo */
 361   LBP_SA = 30, /* complex context (South East Asian) */
 362   LBP_XX = 31  /* unknown */
 363 };
 364
 365 /* Returns the line breaking classification for ch, as a bit mask.  */
 366 static int
 367 get_lbp (unsigned int ch)
 368 {
 369   int attr = 0;
 370
 371   if (unicode_attributes[ch].name != NULL)
 372     {
 373       /* mandatory break */
 374       if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
 375           || ch == 0x000C /* form feed */
 376           || ch == 0x000B /* line tabulation */
 377           || ch == 0x2028 /* LINE SEPARATOR */
 378           || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
 379         attr |= 1 << LBP_BK;
 380
 381       if (ch == 0x2060 /* WORD JOINER */
 382           || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
 383         attr |= 1 << LBP_WJ;
 384
 385       /* zero width space */
 386       if (ch == 0x200B /* ZERO WIDTH SPACE */)
 387         attr |= 1 << LBP_ZW;
 388
 389       /* non-breaking (glue) */
 390       if (ch == 0x00A0 /* NO-BREAK SPACE */
 391           || ch == 0x202F /* NARROW NO-BREAK SPACE */
 392           || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
 393           || ch == 0x034F /* COMBINING GRAPHEME JOINER */
 394           || ch == 0x2007 /* FIGURE SPACE */
 395           || ch == 0x2011 /* NON-BREAKING HYPHEN */
 396           || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
 397           || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
 398           || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
 399           || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
 400         attr |= 1 << LBP_GL;
 401
 402       /* space */
 403       if (ch == 0x0020 /* SPACE */)
 404         attr |= 1 << LBP_SP;
 405
 406       /* break opportunity before and after */
 407       if (ch == 0x2014 /* EM DASH */)
 408         attr |= 1 << LBP_B2;
 409
 410       /* break opportunity after */
 411       if (ch == 0x1680 /* OGHAM SPACE MARK */
 412           || ch == 0x2000 /* EN QUAD */
 413           || ch == 0x2001 /* EM QUAD */
 414           || ch == 0x2002 /* EN SPACE */
 415           || ch == 0x2003 /* EM SPACE */
 416           || ch == 0x2004 /* THREE-PER-EM SPACE */
 417           || ch == 0x2005 /* FOUR-PER-EM SPACE */
 418           || ch == 0x2006 /* SIX-PER-EM SPACE */
 419           || ch == 0x2008 /* PUNCTUATION SPACE */
 420           || ch == 0x2009 /* THIN SPACE */
 421           || ch == 0x200A /* HAIR SPACE */
 422           || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
 423           || ch == 0x0009 /* tab */
 424           || ch == 0x00AD /* SOFT HYPHEN */
 425           || ch == 0x058A /* ARMENIAN HYPHEN */
 426           || ch == 0x2010 /* HYPHEN */
 427           || ch == 0x2012 /* FIGURE DASH */
 428           || ch == 0x2013 /* EN DASH */
 429           || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
 430           || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
 431           || ch == 0x1361 /* ETHIOPIC WORDSPACE */
 432           || ch == 0x17D8 /* KHMER SIGN BEYYAL */
 433           || ch == 0x17DA /* KHMER SIGN KOOMUUT */
 434           || ch == 0x2027 /* HYPHENATION POINT */
 435           || ch == 0x007C /* VERTICAL LINE */
 436           || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
 437           || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
 438           || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
 439           || ch == 0x2056 /* THREE DOT PUNCTUATION */
 440           || ch == 0x2058 /* FOUR DOT PUNCTUATION */
 441           || ch == 0x2059 /* FIVE DOT PUNCTUATION */
 442           || ch == 0x205A /* TWO DOT PUNCTUATION */
 443           || ch == 0x205B /* FOUR DOT MARK */
 444           || ch == 0x205D /* TRICOLON */
 445           || ch == 0x205E /* VERTICAL FOUR DOTS */
 446           || ch == 0x2E19 /* PALM BRANCH */
 447           || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
 448           || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
 449           || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
 450           || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
 451           || ch == 0x2E30 /* RING POINT */
 452           || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
 453           || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
 454           || ch == 0x10102 /* AEGEAN CHECK MARK */
 455           || ch == 0x1039F /* UGARITIC WORD DIVIDER */
 456           || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
 457           || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
 458           || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
 459           || ch == 0x0964 /* DEVANAGARI DANDA */
 460           || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
 461           || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
 462           || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
 463           || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
 464           || ch == 0x104B /* MYANMAR SIGN SECTION */
 465           || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
 466           || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
 467           || ch == 0x17D4 /* KHMER SIGN KHAN */
 468           || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
 469           || ch == 0x1B5E /* BALINESE CARIK SIKI */
 470           || ch == 0x1B5F /* BALINESE CARIK PAREREN */
 471           || ch == 0xA8CE /* SAURASHTRA DANDA */
 472           || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
 473           || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
 474           || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
 475           || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
 476           || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
 477           || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
 478           || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
 479           || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
 480           || ch == 0x0F85 /* TIBETAN MARK PALUTA */
 481           || ch == 0x0FBE /* TIBETAN KU RU KHA */
 482           || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
 483           || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
 484 #if !REVISION_22
 485           || ch == 0x1802 /* MONGOLIAN COMMA */
 486           || ch == 0x1803 /* MONGOLIAN FULL STOP */
 487 #endif
 488           || ch == 0x1804 /* MONGOLIAN COLON */
 489           || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
 490 #if !REVISION_22
 491           || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
 492           || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
 493 #endif
 494           || ch == 0x1B5A /* BALINESE PANTI */
 495           || ch == 0x1B5B /* BALINESE PAMADA */
 496           || ch == 0x1B5C /* BALINESE WINDU */
 497           || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
 498           || ch == 0x1B60 /* BALINESE PAMENENG */
 499           || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
 500           || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
 501           || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
 502           || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
 503           || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
 504           || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
 505           || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
 506 #if !REVISION_22
 507           || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
 508 #endif
 509           || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
 510           || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
 511           || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
 512 #if !REVISION_22
 513           || ch == 0x2CFE /* COPTIC FULL STOP */
 514 #endif
 515           || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
 516           || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
 517           || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
 518           || ch == 0xA60D /* VAI COMMA */
 519           || ch == 0xA60F /* VAI QUESTION MARK */
 520           || ch == 0xA92E /* KAYAH LI SIGN CWI */
 521           || ch == 0xA92F /* KAYAH LI SIGN SHYA */
 522           || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
 523           || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
 524           || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
 525           || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
 526           || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
 527           || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
 528           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
 529 #if !REVISION_22
 530           || ch == 0x1A1E /* BUGINESE PALLAWA */
 531 #endif
 532           || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
 533           || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
 534           || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
 535         attr |= 1 << LBP_BA;
 536
 537       /* break opportunity before */
 538       if (ch == 0x00B4 /* ACUTE ACCENT */
 539 #if REVISION_22
 540           || ch == 0x1FFD /* GREEK OXIA */
 541           || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
 542 #endif
 543           || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
 544           || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
 545           || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
 546           || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
 547           || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
 548           || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
 549           || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
 550           || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
 551           || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
 552           || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
 553           || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
 554           || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
 555           || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
 556           || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
 557           || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
 558           || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
 559         attr |= 1 << LBP_BB;
 560
 561       /* hyphen */
 562       if (ch == 0x002D /* HYPHEN-MINUS */)
 563         attr |= 1 << LBP_HY;
 564
 565       /* contingent break opportunity */
 566       if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
 567         attr |= 1 << LBP_CB;
 568
 569       /* closing punctuation */
 570       if ((unicode_attributes[ch].category[0] == 'P'
 571            && unicode_attributes[ch].category[1] == 'e')
 572           || ch == 0x3001 /* IDEOGRAPHIC COMMA */
 573           || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
 574           || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
 575           || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
 576           || ch == 0xFE50 /* SMALL COMMA */
 577           || ch == 0xFE52 /* SMALL FULL STOP */
 578           || ch == 0xFF0C /* FULLWIDTH COMMA */
 579           || ch == 0xFF0E /* FULLWIDTH FULL STOP */
 580           || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
 581           || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
 582         attr |= 1 << LBP_CL;
 583
 584       /* exclamation/interrogation */
 585       if (ch == 0x0021 /* EXCLAMATION MARK */
 586           || ch == 0x003F /* QUESTION MARK */
 587           || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
 588 #if !REVISION_22
 589           || ch == 0x060C /* ARABIC COMMA */
 590 #endif
 591           || ch == 0x061B /* ARABIC SEMICOLON */
 592           || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
 593           || ch == 0x061F /* ARABIC QUESTION MARK */
 594 #if !REVISION_22
 595           || ch == 0x066A /* ARABIC PERCENT SIGN */
 596 #endif
 597           || ch == 0x06D4 /* ARABIC FULL STOP */
 598           || ch == 0x07F9 /* NKO EXCLAMATION MARK */
 599           || ch == 0x0F0D /* TIBETAN MARK SHAD */
 600           || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
 601           || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
 602           || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
 603           || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
 604           || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
 605 #if REVISION_22
 606           || ch == 0x1802 /* MONGOLIAN COMMA */
 607           || ch == 0x1803 /* MONGOLIAN FULL STOP */
 608           || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
 609           || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
 610 #endif
 611           || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
 612           || ch == 0x1945 /* LIMBU QUESTION MARK */
 613           || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
 614           || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
 615 #if REVISION_22
 616           || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
 617           || ch == 0x2CFE /* COPTIC FULL STOP */
 618 #endif
 619           || ch == 0x2E2E /* REVERSED QUESTION MARK */
 620           || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
 621           || ch == 0xA60E /* VAI FULL STOP */
 622           || ch == 0xA876 /* PHAGS-PA MARK SHAD */
 623           || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
 624           || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
 625           || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
 626           || ch == 0xFE56 /* SMALL QUESTION MARK */
 627           || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
 628           || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
 629           || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
 630         attr |= 1 << LBP_EX;
 631
 632       /* inseparable */
 633       if (ch == 0x2024 /* ONE DOT LEADER */
 634           || ch == 0x2025 /* TWO DOT LEADER */
 635           || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
 636           || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
 637         attr |= 1 << LBP_IN;
 638
 639       /* non starter */
 640       if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
 641           || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
 642           || ch == 0x203D /* INTERROBANG */
 643           || ch == 0x2047 /* DOUBLE QUESTION MARK */
 644           || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
 645           || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
 646           || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
 647           || ch == 0x301C /* WAVE DASH */
 648           || ch == 0x303C /* MASU MARK */
 649           || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
 650           || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
 651           || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
 652           || ch == 0x309D /* HIRAGANA ITERATION MARK */
 653           || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
 654           || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
 655           || ch == 0x30FB /* KATAKANA MIDDLE DOT */
 656           || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
 657           || ch == 0x30FD /* KATAKANA ITERATION MARK */
 658           || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
 659           || ch == 0xA015 /* YI SYLLABLE WU */
 660           || ch == 0xFE54 /* SMALL SEMICOLON */
 661           || ch == 0xFE55 /* SMALL COLON */
 662           || ch == 0xFF1A /* FULLWIDTH COLON */
 663           || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
 664           || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
 665           || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
 666           || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
 667           || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
 668           || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
 669           || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
 670         attr |= 1 << LBP_NS;
 671
 672       /* opening punctuation */
 673       if ((unicode_attributes[ch].category[0] == 'P'
 674            && unicode_attributes[ch].category[1] == 's')
 675 #if REVISION_22
 676           || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
 677           || ch == 0x00BF /* INVERTED QUESTION MARK */
 678 #endif
 679           || ch == 0x2E18 /* INVERTED INTERROBANG */)
 680         attr |= 1 << LBP_OP;
 681
 682       /* ambiguous quotation */
 683       if ((unicode_attributes[ch].category[0] == 'P'
 684            && (unicode_attributes[ch].category[1] == 'f'
 685                || unicode_attributes[ch].category[1] == 'i'))
 686           || ch == 0x0022 /* QUOTATION MARK */
 687           || ch == 0x0027 /* APOSTROPHE */
 688           || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
 689           || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
 690           || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
 691           || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
 692           || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
 693           || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
 694           || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
 695           || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
 696           || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
 697           || ch == 0x2E0B /* RAISED SQUARE */)
 698         attr |= 1 << LBP_QU;
 699
 700       /* infix separator (numeric) */
 701       if (ch == 0x002C /* COMMA */
 702           || ch == 0x002E /* FULL STOP */
 703           || ch == 0x003A /* COLON */
 704           || ch == 0x003B /* SEMICOLON */
 705           || ch == 0x037E /* GREEK QUESTION MARK */
 706           || ch == 0x0589 /* ARMENIAN FULL STOP */
 707 #if REVISION_22
 708           || ch == 0x060C /* ARABIC COMMA */
 709 #endif
 710           || ch == 0x060D /* ARABIC DATE SEPARATOR */
 711           || ch == 0x07F8 /* NKO COMMA */
 712           || ch == 0x2044 /* FRACTION SLASH */
 713           || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
 714           || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
 715           || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
 716         attr |= 1 << LBP_IS;
 717
 718       /* numeric */
 719       if ((unicode_attributes[ch].category[0] == 'N'
 720            && unicode_attributes[ch].category[1] == 'd'
 721            && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
 722           || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
 723           || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
 724         attr |= 1 << LBP_NU;
 725
 726       /* postfix (numeric) */
 727       if (ch == 0x0025 /* PERCENT SIGN */
 728           || ch == 0x00A2 /* CENT SIGN */
 729           || ch == 0x00B0 /* DEGREE SIGN */
 730           || ch == 0x060B /* AFGHANI SIGN */
 731 #if REVISION_22
 732           || ch == 0x066A /* ARABIC PERCENT SIGN */
 733 #endif
 734           || ch == 0x2030 /* PER MILLE SIGN */
 735           || ch == 0x2031 /* PER TEN THOUSAND SIGN */
 736           || ch == 0x2032 /* PRIME */
 737           || ch == 0x2033 /* DOUBLE PRIME */
 738           || ch == 0x2034 /* TRIPLE PRIME */
 739           || ch == 0x2035 /* REVERSED PRIME */
 740           || ch == 0x2036 /* REVERSED DOUBLE PRIME */
 741           || ch == 0x2037 /* REVERSED TRIPLE PRIME */
 742           || ch == 0x20A7 /* PESETA SIGN */
 743           || ch == 0x2103 /* DEGREE CELSIUS */
 744           || ch == 0x2109 /* DEGREE FAHRENHEIT */
 745           || ch == 0xFDFC /* RIAL SIGN */
 746           || ch == 0xFE6A /* SMALL PERCENT SIGN */
 747           || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
 748           || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */)
 749         attr |= 1 << LBP_PO;
 750
 751       /* prefix (numeric) */
 752       if ((unicode_attributes[ch].category[0] == 'S'
 753            && unicode_attributes[ch].category[1] == 'c')
 754           || ch == 0x002B /* PLUS SIGN */
 755           || ch == 0x005C /* REVERSE SOLIDUS */
 756           || ch == 0x00B1 /* PLUS-MINUS SIGN */
 757           || ch == 0x2116 /* NUMERO SIGN */
 758           || ch == 0x2212 /* MINUS SIGN */
 759           || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
 760         if (!(attr & (1 << LBP_PO)))
 761           attr |= 1 << LBP_PR;
 762
 763       /* symbols allowing breaks */
 764       if (ch == 0x002F /* SOLIDUS */)
 765         attr |= 1 << LBP_SY;
 766
 767       if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
 768         attr |= 1 << LBP_H2;
 769
 770       if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
 771         attr |= 1 << LBP_H3;
 772
 773       if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
 774         attr |= 1 << LBP_JL;
 775
 776       if (ch >= 0x1160 && ch <= 0x11A2)
 777         attr |= 1 << LBP_JV;
 778
 779       if (ch >= 0x11A8 && ch <= 0x11F9)
 780         attr |= 1 << LBP_JT;
 781
 782       /* complex context (South East Asian) */
 783       if (((unicode_attributes[ch].category[0] == 'C'
 784             && unicode_attributes[ch].category[1] == 'f')
 785            || (unicode_attributes[ch].category[0] == 'L'
 786                && (unicode_attributes[ch].category[1] == 'm'
 787                    || unicode_attributes[ch].category[1] == 'o'))
 788            || (unicode_attributes[ch].category[0] == 'M'
 789                && (unicode_attributes[ch].category[1] == 'c'
 790                    || unicode_attributes[ch].category[1] == 'n'))
 791            /* Extra characters for compatibility with Unicode LineBreak.txt.  */
 792            || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
 793            || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
 794           && ((ch >= 0x0E00 && ch <= 0x0EFF)
 795               || (ch >= 0x1000 && ch <= 0x109F)
 796               || (ch >= 0x1780 && ch <= 0x17FF)
 797               || (ch >= 0x1950 && ch <= 0x19DF)))
 798         attr |= 1 << LBP_SA;
 799
 800       /* attached characters and combining marks */
 801       if ((unicode_attributes[ch].category[0] == 'M'
 802            && (unicode_attributes[ch].category[1] == 'c'
 803                || unicode_attributes[ch].category[1] == 'e'
 804                || unicode_attributes[ch].category[1] == 'n'))
 805           || (unicode_attributes[ch].category[0] == 'C'
 806               && (unicode_attributes[ch].category[1] == 'c'
 807                   || unicode_attributes[ch].category[1] == 'f')))
 808         if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
 809           attr |= 1 << LBP_CM;
 810
 811       /* ideographic */
 812       if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
 813           || ch == 0x3000 /* IDEOGRAPHIC SPACE */
 814           || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
 815           || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
 816           || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
 817           || (ch >= 0x4E00 && ch <= 0x9FBB) /* CJK Ideograph */
 818           || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
 819           || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
 820           || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
 821           || ch == 0xFE62 /* SMALL PLUS SIGN */
 822           || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
 823           || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
 824           || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
 825           || ch == 0xFE66 /* SMALL EQUALS SIGN */
 826           || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
 827           || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
 828           || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
 829           || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
 830           || (ch >= 0x3000 && ch <= 0x33FF
 831               && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
 832           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
 833           || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
 834           || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
 835           || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
 836           || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
 837           || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
 838           || ch == 0xFE45 /* SESAME DOT */
 839           || ch == 0xFE46 /* WHITE SESAME DOT */
 840           || ch == 0xFE49 /* DASHED OVERLINE */
 841           || ch == 0xFE4A /* CENTRELINE OVERLINE */
 842           || ch == 0xFE4B /* WAVY OVERLINE */
 843           || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
 844           || ch == 0xFE4D /* DASHED LOW LINE */
 845           || ch == 0xFE4E /* CENTRELINE LOW LINE */
 846           || ch == 0xFE4F /* WAVY LOW LINE */
 847           || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
 848           || ch == 0xFE58 /* SMALL EM DASH */
 849           || ch == 0xFE5F /* SMALL NUMBER SIGN */
 850           || ch == 0xFE60 /* SMALL AMPERSAND */
 851           || ch == 0xFE61 /* SMALL ASTERISK */
 852           || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
 853           || ch == 0xFE6B /* SMALL COMMERCIAL AT */
 854           || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
 855           || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
 856           || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
 857           || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
 858           || ch == 0xFF0A /* FULLWIDTH ASTERISK */
 859           || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
 860           || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
 861           || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
 862           || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
 863           || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
 864           || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
 865           || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
 866           || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
 867           || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
 868           || ch == 0xFF3F /* FULLWIDTH LOW LINE */
 869           || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
 870           || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
 871           || ch == 0xFF5E /* FULLWIDTH TILDE */
 872           || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
 873           || ch == 0xFFE3 /* FULLWIDTH MACRON */
 874           || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
 875         if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
 876           {
 877             /* ambiguous (ideograph) ? */
 878             if ((unicode_width[ch] != NULL
 879                  && unicode_width[ch][0] == 'A'
 880                  && ch >= 0x2000)
 881                 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
 882                 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
 883               attr |= 1 << LBP_AI;
 884             else
 885               attr |= 1 << LBP_ID;
 886           }
 887
 888       /* ordinary alphabetic and symbol characters */
 889       if ((unicode_attributes[ch].category[0] == 'L'
 890            && (unicode_attributes[ch].category[1] == 'u'
 891                || unicode_attributes[ch].category[1] == 'l'
 892                || unicode_attributes[ch].category[1] == 't'
 893                || unicode_attributes[ch].category[1] == 'm'
 894                || unicode_attributes[ch].category[1] == 'o'))
 895           || (unicode_attributes[ch].category[0] == 'S'
 896               && (unicode_attributes[ch].category[1] == 'm'
 897                   || unicode_attributes[ch].category[1] == 'k'
 898                   || unicode_attributes[ch].category[1] == 'o'))
 899           || (unicode_attributes[ch].category[0] == 'N'
 900               && (unicode_attributes[ch].category[1] == 'l'
 901                   || unicode_attributes[ch].category[1] == 'o'))
 902           || (unicode_attributes[ch].category[0] == 'P'
 903               && (unicode_attributes[ch].category[1] == 'c'
 904                   || unicode_attributes[ch].category[1] == 'd'
 905                   || unicode_attributes[ch].category[1] == 'o'))
 906           || ch == 0x0600 /* ARABIC NUMBER SIGN */
 907           || ch == 0x0601 /* ARABIC SIGN SANAH */
 908           || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
 909           || ch == 0x0603 /* ARABIC SIGN SAFHA */
 910           || ch == 0x06DD /* ARABIC END OF AYAH */
 911           || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
 912           || ch == 0x2061 /* FUNCTION APPLICATION */
 913           || ch == 0x2062 /* INVISIBLE TIMES */
 914           || ch == 0x2063 /* INVISIBLE SEPARATOR */
 915           || ch == 0x2064 /* INVISIBLE PLUS */)
 916         if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
 917           {
 918             /* ambiguous (alphabetic) ? */
 919             if ((unicode_width[ch] != NULL
 920                  && unicode_width[ch][0] == 'A'
 921                  && ch >= 0x2000
 922                  /* Extra exceptions for compatibility with Unicode LineBreak.txt.  */
 923                  && ch != 0x2022 /* BULLET */
 924                  && ch != 0x203E /* OVERLINE */
 925                  && ch != 0x2126 /* OHM SIGN */
 926                  && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
 927                  && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
 928                  && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
 929                  && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
 930                  && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
 931                  && ch != 0x21E7 /* UPWARDS WHITE ARROW */
 932                  && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
 933                  && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
 934 #if !REVISION_22
 935                 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
 936                 || ch == 0x00A7 /* SECTION SIGN */
 937                 || ch == 0x00A8 /* DIAERESIS */
 938                 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
 939                 || ch == 0x00B2 /* SUPERSCRIPT TWO */
 940                 || ch == 0x00B3 /* SUPERSCRIPT THREE */
 941                 || ch == 0x00B6 /* PILCROW SIGN */
 942                 || ch == 0x00B7 /* MIDDLE DOT */
 943                 || ch == 0x00B8 /* CEDILLA */
 944                 || ch == 0x00B9 /* SUPERSCRIPT ONE */
 945                 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
 946                 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
 947                 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
 948                 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
 949                 || ch == 0x00BF /* INVERTED QUESTION MARK */
 950                 || ch == 0x00D7 /* MULTIPLICATION SIGN */
 951                 || ch == 0x00F7 /* DIVISION SIGN */
 952                 || ch == 0x02C7 /* CARON */
 953                 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
 954                 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
 955                 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
 956                 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
 957                 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
 958                 || ch == 0x02D8 /* BREVE */
 959                 || ch == 0x02D9 /* DOT ABOVE */
 960                 || ch == 0x02DA /* RING ABOVE */
 961                 || ch == 0x02DB /* OGONEK */
 962                 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
 963 #endif
 964                 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
 965                 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
 966                 /* Extra characters for compatibility with Unicode LineBreak.txt.  */
 967                 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
 968                 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
 969                 || ch == 0x2616 /* WHITE SHOGI PIECE */
 970                 || ch == 0x2617 /* BLACK SHOGI PIECE */)
 971               attr |= 1 << LBP_AI;
 972             else
 973               attr |= 1 << LBP_AL;
 974             attr &= ~(1 << LBP_CM);
 975           }
 976     }
 977
 978   if (attr == 0)
 979     /* unknown */
 980     attr |= 1 << LBP_XX;
 981
 982   return attr;
 983 }
 984
 985 /* Output the line breaking properties in a human readable format.  */
 986 static void
 987 debug_output_lbp (FILE *stream)
 988 {
 989   unsigned int i;
 990
 991   for (i = 0; i < 0x110000; i++)
 992     {
 993       int attr = get_lbp (i);
 994       if (attr != 1 << LBP_XX)
 995         {
 996           fprintf (stream, "0x%04X", i);
 997 #define PRINT_BIT(attr,bit) \
 998   if (attr & (1 << bit)) fprintf (stream, " " #bit);
 999           PRINT_BIT(attr,LBP_BK);
1000           PRINT_BIT(attr,LBP_CM);
1001           PRINT_BIT(attr,LBP_WJ);
1002           PRINT_BIT(attr,LBP_ZW);
1003           PRINT_BIT(attr,LBP_GL);
1004           PRINT_BIT(attr,LBP_SP);
1005           PRINT_BIT(attr,LBP_B2);
1006           PRINT_BIT(attr,LBP_BA);
1007           PRINT_BIT(attr,LBP_BB);
1008           PRINT_BIT(attr,LBP_HY);
1009           PRINT_BIT(attr,LBP_CB);
1010           PRINT_BIT(attr,LBP_CL);
1011           PRINT_BIT(attr,LBP_EX);
1012           PRINT_BIT(attr,LBP_IN);
1013           PRINT_BIT(attr,LBP_NS);
1014           PRINT_BIT(attr,LBP_OP);
1015           PRINT_BIT(attr,LBP_QU);
1016           PRINT_BIT(attr,LBP_IS);
1017           PRINT_BIT(attr,LBP_NU);
1018           PRINT_BIT(attr,LBP_PO);
1019           PRINT_BIT(attr,LBP_PR);
1020           PRINT_BIT(attr,LBP_SY);
1021           PRINT_BIT(attr,LBP_AI);
1022           PRINT_BIT(attr,LBP_AL);
1023           PRINT_BIT(attr,LBP_H2);
1024           PRINT_BIT(attr,LBP_H3);
1025           PRINT_BIT(attr,LBP_ID);
1026           PRINT_BIT(attr,LBP_JL);
1027           PRINT_BIT(attr,LBP_JV);
1028           PRINT_BIT(attr,LBP_JT);
1029           PRINT_BIT(attr,LBP_SA);
1030           PRINT_BIT(attr,LBP_XX);
1031 #undef PRINT_BIT
1032           fprintf (stream, "\n");
1033         }
1034     }
1035 }
1036
/* Create (or truncate) the file FILENAME and write the human readable
   dump produced by debug_output_lbp() into it.  Any failure to open or
   write the file is reported on stderr and terminates the program with
   exit status 1.  */
static void
debug_output_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* Success requires both a clean stream and a successful close; the
     close is only attempted when no write error has been recorded.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
1057
1058 /* The line breaking property from the LineBreak.txt file.  */
1059 int unicode_org_lbp[0x110000];
1060
1061 /* Stores in unicode_org_lbp[] the line breaking property from the
1062    LineBreak.txt file.  */
1063 static void
1064 fill_org_lbp (const char *linebreak_filename)
1065 {
1066   unsigned int i, j;
1067   FILE *stream;
1068   char field0[FIELDLEN];
1069   char field1[FIELDLEN];
1070   char field2[FIELDLEN];
1071   int lineno = 0;
1072
1073   for (i = 0; i < 0x110000; i++)
1074     unicode_org_lbp[i] = LBP_XX;
1075
1076   stream = fopen (linebreak_filename, "r");
1077   if (stream == NULL)
1078     {
1079       fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
1080       exit (1);
1081     }
1082
1083   for (;;)
1084     {
1085       int n;
1086       int c;
1087       int value;
1088
1089       lineno++;
1090       c = getc (stream);
1091       if (c == EOF)
1092         break;
1093       if (c == '#')
1094         {
1095           do c = getc (stream); while (c != EOF && c != '\n');
1096           continue;
1097         }
1098       ungetc (c, stream);
1099       n = getfield (stream, field0, ';');
1100       n += getfield (stream, field1, ' ');
1101       n += getfield (stream, field2, '\n');
1102       if (n == 0)
1103         break;
1104       if (n != 3)
1105         {
1106           fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
1107                    lineno);
1108           exit (1);
1109         }
1110 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
1111       if (false) {}
1112       TRY(LBP_BK)
1113       TRY(LBP_CM)
1114       TRY(LBP_WJ)
1115       TRY(LBP_ZW)
1116       TRY(LBP_GL)
1117       TRY(LBP_SP)
1118       TRY(LBP_B2)
1119       TRY(LBP_BA)
1120       TRY(LBP_BB)
1121       TRY(LBP_HY)
1122       TRY(LBP_CB)
1123       TRY(LBP_CL)
1124       TRY(LBP_EX)
1125       TRY(LBP_IN)
1126       TRY(LBP_NS)
1127       TRY(LBP_OP)
1128       TRY(LBP_QU)
1129       TRY(LBP_IS)
1130       TRY(LBP_NU)
1131       TRY(LBP_PO)
1132       TRY(LBP_PR)
1133       TRY(LBP_SY)
1134       TRY(LBP_AI)
1135       TRY(LBP_AL)
1136       TRY(LBP_H2)
1137       TRY(LBP_H3)
1138       TRY(LBP_ID)
1139       TRY(LBP_JL)
1140       TRY(LBP_JV)
1141       TRY(LBP_JT)
1142       TRY(LBP_SA)
1143       TRY(LBP_XX)
1144 #undef TRY
1145       else if (strcmp (field1, "LF") == 0) value = LBP_BK;
1146       else if (strcmp (field1, "CR") == 0) value = LBP_BK;
1147       else if (strcmp (field1, "NL") == 0) value = LBP_BK;
1148       else if (strcmp (field1, "SG") == 0) value = LBP_XX;
1149       else
1150         {
1151           fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
1152                    field1, linebreak_filename, lineno);
1153           exit (1);
1154         }
1155       i = strtoul (field0, NULL, 16);
1156       if (strstr (field0, "..") != NULL)
1157         {
1158           /* Deal with a range.  */
1159           j = strtoul (strstr (field0, "..") + 2, NULL, 16);
1160           for (; i <= j; i++)
1161             unicode_org_lbp[i] = value;
1162         }
1163       else
1164         {
1165           /* Single character line.  */
1166           unicode_org_lbp[i] = value;
1167         }
1168     }
1169   if (ferror (stream) || fclose (stream))
1170     {
1171       fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
1172       exit (1);
1173     }
1174 }
1175
1176 /* Output the line breaking properties in a human readable format.  */
1177 static void
1178 debug_output_org_lbp (FILE *stream)
1179 {
1180   unsigned int i;
1181
1182   for (i = 0; i < 0x110000; i++)
1183     {
1184       int attr = unicode_org_lbp[i];
1185       if (attr != LBP_XX)
1186         {
1187           fprintf (stream, "0x%04X", i);
1188 #define PRINT_BIT(attr,bit) \
1189   if (attr == bit) fprintf (stream, " " #bit);
1190           PRINT_BIT(attr,LBP_BK);
1191           PRINT_BIT(attr,LBP_CM);
1192           PRINT_BIT(attr,LBP_WJ);
1193           PRINT_BIT(attr,LBP_ZW);
1194           PRINT_BIT(attr,LBP_GL);
1195           PRINT_BIT(attr,LBP_SP);
1196           PRINT_BIT(attr,LBP_B2);
1197           PRINT_BIT(attr,LBP_BA);
1198           PRINT_BIT(attr,LBP_BB);
1199           PRINT_BIT(attr,LBP_HY);
1200           PRINT_BIT(attr,LBP_CB);
1201           PRINT_BIT(attr,LBP_CL);
1202           PRINT_BIT(attr,LBP_EX);
1203           PRINT_BIT(attr,LBP_IN);
1204           PRINT_BIT(attr,LBP_NS);
1205           PRINT_BIT(attr,LBP_OP);
1206           PRINT_BIT(attr,LBP_QU);
1207           PRINT_BIT(attr,LBP_IS);
1208           PRINT_BIT(attr,LBP_NU);
1209           PRINT_BIT(attr,LBP_PO);
1210           PRINT_BIT(attr,LBP_PR);
1211           PRINT_BIT(attr,LBP_SY);
1212           PRINT_BIT(attr,LBP_AI);
1213           PRINT_BIT(attr,LBP_AL);
1214           PRINT_BIT(attr,LBP_H2);
1215           PRINT_BIT(attr,LBP_H3);
1216           PRINT_BIT(attr,LBP_ID);
1217           PRINT_BIT(attr,LBP_JL);
1218           PRINT_BIT(attr,LBP_JV);
1219           PRINT_BIT(attr,LBP_JT);
1220           PRINT_BIT(attr,LBP_SA);
1221           PRINT_BIT(attr,LBP_XX);
1222 #undef PRINT_BIT
1223           fprintf (stream, "\n");
1224         }
1225     }
1226 }
1227
/* Create (or truncate) the file FILENAME and write the human readable
   dump produced by debug_output_org_lbp() into it.  Any failure to open
   or write the file is reported on stderr and terminates the program
   with exit status 1.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *out;
  bool failed;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* A recorded write error counts as failure without attempting the
     close; otherwise the result of the close decides.  */
  failed = ferror (out) != 0;
  if (!failed)
    failed = fclose (out) != 0;

  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1248
1249 /* Construction of sparse 3-level tables.  */
1250 #define TABLE lbp_table
1251 #define ELEMENT unsigned char
1252 #define DEFAULT LBP_XX
1253 #define xmalloc malloc
1254 #define xrealloc realloc
1255 #include "3level.h"
1256
1257 static void
1258 output_lbp (FILE *stream1, FILE *stream2)
1259 {
1260   unsigned int i;
1261   struct lbp_table t;
1262   unsigned int level1_offset, level2_offset, level3_offset;
1263
1264   t.p = 7;
1265   t.q = 9;
1266   lbp_table_init (&t);
1267
1268   for (i = 0; i < 0x110000; i++)
1269     {
1270       int attr = get_lbp (i);
1271
1272       /* Now attr should contain exactly one bit.  */
1273       if (attr == 0 || ((attr & (attr - 1)) != 0))
1274         abort ();
1275
1276       if (attr != 1 << LBP_XX)
1277         {
1278           unsigned int log2_attr;
1279           for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
1280
1281           lbp_table_add (&t, i, log2_attr);
1282         }
1283     }
1284
1285   lbp_table_finalize (&t);
1286
1287   level1_offset =
1288     5 * sizeof (uint32_t);
1289   level2_offset =
1290     5 * sizeof (uint32_t)
1291     + t.level1_size * sizeof (uint32_t);
1292   level3_offset =
1293     5 * sizeof (uint32_t)
1294     + t.level1_size * sizeof (uint32_t)
1295     + (t.level2_size << t.q) * sizeof (uint32_t);
1296
1297   for (i = 0; i < 5; i++)
1298     fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
1299              ((uint32_t *) t.result)[i]);
1300   fprintf (stream1, "\n");
1301   fprintf (stream1, "typedef struct\n");
1302   fprintf (stream1, "  {\n");
1303   fprintf (stream1, "    int level1[%d];\n", t.level1_size);
1304   fprintf (stream1, "    int level2[%d << %d];\n", t.level2_size, t.q);
1305   fprintf (stream1, "    unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1306   fprintf (stream1, "  }\n");
1307   fprintf (stream1, "lbrkprop_t;\n");
1308   fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
1309
1310   fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
1311   fprintf (stream2, "{\n");
1312   fprintf (stream2, "  {");
1313   if (t.level1_size > 8)
1314     fprintf (stream2, "\n   ");
1315   for (i = 0; i < t.level1_size; i++)
1316     {
1317       uint32_t offset;
1318       if (i > 0 && (i % 8) == 0)
1319         fprintf (stream2, "\n   ");
1320       offset = ((uint32_t *) (t.result + level1_offset))[i];
1321       fprintf (stream2, " %5d%s",
1322                offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
1323                (i+1 < t.level1_size ? "," : ""));
1324     }
1325   if (t.level1_size > 8)
1326     fprintf (stream2, "\n ");
1327   fprintf (stream2, " },\n");
1328   fprintf (stream2, "  {");
1329   if (t.level2_size << t.q > 8)
1330     fprintf (stream2, "\n   ");
1331   for (i = 0; i < t.level2_size << t.q; i++)
1332     {
1333       uint32_t offset;
1334       if (i > 0 && (i % 8) == 0)
1335         fprintf (stream2, "\n   ");
1336       offset = ((uint32_t *) (t.result + level2_offset))[i];
1337       fprintf (stream2, " %5d%s",
1338                offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
1339                (i+1 < t.level2_size << t.q ? "," : ""));
1340     }
1341   if (t.level2_size << t.q > 8)
1342     fprintf (stream2, "\n ");
1343   fprintf (stream2, " },\n");
1344   fprintf (stream2, "  {");
1345   if (t.level3_size << t.p > 8)
1346     fprintf (stream2, "\n   ");
1347   for (i = 0; i < t.level3_size << t.p; i++)
1348     {
1349       unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
1350       const char *value_string;
1351       switch (value)
1352         {
1353 #define CASE(x) case x: value_string = #x; break;
1354           CASE(LBP_BK);
1355           CASE(LBP_CM);
1356           CASE(LBP_WJ);
1357           CASE(LBP_ZW);
1358           CASE(LBP_GL);
1359           CASE(LBP_SP);
1360           CASE(LBP_B2);
1361           CASE(LBP_BA);
1362           CASE(LBP_BB);
1363           CASE(LBP_HY);
1364           CASE(LBP_CB);
1365           CASE(LBP_CL);
1366           CASE(LBP_EX);
1367           CASE(LBP_IN);
1368           CASE(LBP_NS);
1369           CASE(LBP_OP);
1370           CASE(LBP_QU);
1371           CASE(LBP_IS);
1372           CASE(LBP_NU);
1373           CASE(LBP_PO);
1374           CASE(LBP_PR);
1375           CASE(LBP_SY);
1376           CASE(LBP_AI);
1377           CASE(LBP_AL);
1378           CASE(LBP_H2);
1379           CASE(LBP_H3);
1380           CASE(LBP_ID);
1381           CASE(LBP_JL);
1382           CASE(LBP_JV);
1383           CASE(LBP_JT);
1384           CASE(LBP_SA);
1385           CASE(LBP_XX);
1386 #undef CASE
1387           default:
1388             abort ();
1389         }
1390       if (i > 0 && (i % 8) == 0)
1391         fprintf (stream2, "\n   ");
1392       fprintf (stream2, " %s%s", value_string,
1393                (i+1 < t.level3_size << t.p ? "," : ""));
1394     }
1395   if (t.level3_size << t.p > 8)
1396     fprintf (stream2, "\n ");
1397   fprintf (stream2, " }\n");
1398   fprintf (stream2, "};\n");
1399 }
1400
/* Generate the two table files.  FILENAME1 and FILENAME2 are both
   opened for writing, each receives the same banner comment (naming
   VERSION as the Unicode version) and license header, and then
   output_lbp() writes its first part to FILENAME1 and its second part
   to FILENAME2.  Any failure to open or write a file is reported on
   stderr and terminates the program with exit status 1.  */
static void
output_tables (const char *filename1, const char *filename2, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n",
      "\n",
      "   This program is free software: you can redistribute it and/or modify\n",
      "   it under the terms of the GNU General Public License as published by\n",
      "   the Free Software Foundation; either version 3 of the License, or\n",
      "   (at your option) any later version.\n",
      "\n",
      "   This program is distributed in the hope that it will be useful,\n",
      "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n",
      "   GNU General Public License for more details.\n",
      "\n",
      "   You should have received a copy of the GNU General Public License\n",
      "   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */\n",
      "\n"
    };
  const size_t n_license_lines =
    sizeof license_lines / sizeof license_lines[0];
  const char *paths[2];
  FILE *outs[2];
  size_t k;

  paths[0] = filename1;
  paths[1] = filename2;

  /* Open both files before anything is written to either of them.  */
  for (k = 0; k < 2; k++)
    {
      outs[k] = fopen (paths[k], "w");
      if (outs[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", paths[k]);
          exit (1);
        }
    }

  /* Identical banner and license header in both files.  */
  for (k = 0; k < 2; k++)
    {
      size_t line;

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", outs[k]);
      fputs ("/* Line breaking properties of Unicode characters.  */\n",
             outs[k]);
      fprintf (outs[k],
               "/* Generated automatically by gen-lbrk for Unicode %s.  */\n",
               version);
      fputs ("\n", outs[k]);

      for (line = 0; line < n_license_lines; line++)
        fputs (license_lines[line], outs[k]);
    }

  output_lbp (outs[0], outs[1]);

  for (k = 0; k < 2; k++)
    {
      if (!ferror (outs[k]) && fclose (outs[k]) == 0)
        continue;

      fprintf (stderr, "error writing to '%s'\n", paths[k]);
      exit (1);
    }
}
1462
/* Program entry point.  Expects exactly four arguments: the names of
   the UnicodeData.txt, EastAsianWidth.txt and LineBreak.txt files, and
   the Unicode version string.  Loads the three data files, writes the
   debugging dumps "lbrkprop.txt" and "lbrkprop_org.txt", and generates
   "lbrkprop1.h" and "lbrkprop2.h".  Returns 0 on success; exits with
   status 1 after printing a usage message if the argument count is
   wrong.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *eastasianwidth_filename;
  const char *linebreak_filename;
  const char *version;

  if (argc != 5)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  eastasianwidth_filename = argv[2];
  linebreak_filename = argv[3];
  version = argv[4];

  /* Load the input data.  */
  fill_attributes (unicodedata_filename);
  fill_width (eastasianwidth_filename);
  fill_org_lbp (linebreak_filename);

  /* Human readable dumps of the computed and the reference properties.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");

  /* The generated C tables.  */
  output_tables ("lbrkprop1.h", "lbrkprop2.h", version);

  return 0;
}
1484
1485 /*
1486  * For Emacs M-x compile
1487  * Local Variables:
1488  * compile-command: "
1489    gcc -O -Wall -I../unictype gen-lbrk.c -o gen-lbrk && \
1490    ./gen-lbrk \
1491         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/UnicodeData.txt \
1492         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/EastAsianWidth.txt \
1493         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/LineBreak.txt \
1494         5.0.0
1495    "
1496  * End:
1497  */