X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=lib%2Fgen-uni-tables.c;h=add1a42aa62ca2a5a175f674f412911adb40fe24;hb=fc492e92429a239ba32ffe14d3236685963818a3;hp=59d246bca29031063df3bf948ba622b3e2f5bb8a;hpb=b3bff7a868656967a8007faa831a89f3a1601eb2;p=pspp

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 59d246bca2..add1a42aa6 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -31,7 +31,7 @@
                       /usr/local/share/Unidata/CompositionExclusions.txt \
                       /usr/local/share/Unidata/SpecialCasing.txt \
                       /usr/local/share/Unidata/CaseFolding.txt \
-                      5.1.0
+                      5.2.0
  */
 
 #include <stdbool.h>
@@ -690,7 +690,7 @@ output_predicate (const char *filename, bool (*predicate) (unsigned int), const
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* %s of Unicode characters.  */\n", comment);
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 4; /* or: 5 */
@@ -994,7 +994,7 @@ output_category (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Categories of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7;
@@ -1158,7 +1158,7 @@ output_combclass (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Combining class of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7;
@@ -1498,7 +1498,7 @@ output_bidi_category (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Bidi categories of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7;
@@ -1655,7 +1655,7 @@ output_decimal_digit_test (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Decimal digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   need_comma = false;
@@ -1702,7 +1702,7 @@ output_decimal_digit (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Decimal digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7;
@@ -1842,7 +1842,7 @@ output_digit_test (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   need_comma = false;
@@ -1889,7 +1889,7 @@ output_digit (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Digit values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7;
@@ -2046,7 +2046,7 @@ output_numeric_test (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Numeric values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   need_comma = false;
@@ -2102,7 +2102,7 @@ output_numeric (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Numeric values of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   /* Create table of occurring fractions.  */
@@ -2387,7 +2387,7 @@ output_mirror (const char *filename, const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Mirrored Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7;
@@ -2491,6 +2491,24 @@ output_mirror (const char *filename, const char *version)
 
 /* ========================================================================= */
 
+/* Particular values of the word break property.  */
+
+static bool
+is_WBP_MIDNUMLET (unsigned int ch)
+{
+  return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
+          || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
+}
+
+static bool
+is_WBP_MIDLETTER (unsigned int ch)
+{
+  return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
+          || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
+}
+
+/* ========================================================================= */
+
 /* Properties.  */
 
 /* Reading PropList.txt and DerivedCoreProperties.txt.  */
@@ -2534,6 +2552,13 @@ enum
   PROP_ALPHABETIC,
   PROP_LOWERCASE,
   PROP_UPPERCASE,
+  PROP_CASED,
+  PROP_CASE_IGNORABLE,
+  PROP_CHANGES_WHEN_LOWERCASED,
+  PROP_CHANGES_WHEN_UPPERCASED,
+  PROP_CHANGES_WHEN_TITLECASED,
+  PROP_CHANGES_WHEN_CASEFOLDED,
+  PROP_CHANGES_WHEN_CASEMAPPED,
   PROP_ID_START,
   PROP_ID_CONTINUE,
   PROP_XID_START,
@@ -2632,6 +2657,13 @@ fill_properties (const char *proplist_filename)
       PROP ("Alphabetic", PROP_ALPHABETIC)
       PROP ("Lowercase", PROP_LOWERCASE)
       PROP ("Uppercase", PROP_UPPERCASE)
+      PROP ("Cased", PROP_CASED)
+      PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
+      PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
+      PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
+      PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
+      PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
+      PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
       PROP ("ID_Start", PROP_ID_START)
       PROP ("ID_Continue", PROP_ID_CONTINUE)
       PROP ("XID_Start", PROP_XID_START)
@@ -2777,6 +2809,7 @@ is_property_alphabetic (unsigned int ch)
     || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
     || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
     || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
+    || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
     || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
     || (ch == 0x10341) /* GOTHIC LETTER NINETY */
     || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
@@ -2812,7 +2845,10 @@ is_property_default_ignorable_code_point (unsigned int ch)
   bool result1 =
     (is_category_Cf (ch)
      && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
-     && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
+     && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
+     /* For some reason, the following are not listed as having property
+        Default_Ignorable_Code_Point.  */
+     && !(ch == 0x110BD))
     || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
     || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
   bool result2 =
@@ -2921,6 +2957,79 @@ is_property_titlecase (unsigned int ch)
   return is_category_Lt (ch);
 }
 
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_cased (unsigned int ch)
+{
+  bool result1 = (is_property_lowercase (ch)
+                  || is_property_uppercase (ch)
+                  || is_category_Lt (ch));
+  bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
+
+  if (result1 != result2)
+    abort ();
+  return result1;
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_case_ignorable (unsigned int ch)
+{
+  bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
+                  || is_category_Mn (ch)
+                  || is_category_Me (ch)
+                  || is_category_Cf (ch)
+                  || is_category_Lm (ch)
+                  || is_category_Sk (ch));
+  bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
+
+  if (result1 != result2)
+    abort ();
+  return result1;
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_lowercased (unsigned int ch)
+{
+  bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
+  bool result2 = (unicode_attributes[ch].name != NULL
+                  && unicode_attributes[ch].lower != NONE
+                  && unicode_attributes[ch].lower != ch);
+
+  if (result1 != result2)
+    abort ();
+  return result1;
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_uppercased (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_titlecased (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_casefolded (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
+}
+
+/* See DerivedCoreProperties.txt.  */
+static bool
+is_property_changes_when_casemapped (unsigned int ch)
+{
+  return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
+}
+
 /* See PropList.txt, UCD.html.  */
 static bool
 is_property_soft_dotted (unsigned int ch)
@@ -3467,6 +3576,13 @@ output_properties (const char *version)
   PROPERTY(lowercase)
   PROPERTY(other_lowercase)
   PROPERTY(titlecase)
+  PROPERTY(cased)
+  PROPERTY(case_ignorable)
+  PROPERTY(changes_when_lowercased)
+  PROPERTY(changes_when_uppercased)
+  PROPERTY(changes_when_titlecased)
+  PROPERTY(changes_when_casefolded)
+  PROPERTY(changes_when_casemapped)
   PROPERTY(soft_dotted)
   PROPERTY(id_start)
   PROPERTY(other_id_start)
@@ -3648,7 +3764,7 @@ output_scripts (const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Unicode scripts.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   for (s = 0; s < numscripts; s++)
@@ -3825,7 +3941,7 @@ output_scripts_byname (const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Unicode scripts.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
   fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
   fprintf (stream, "%%struct-type\n");
@@ -3965,7 +4081,7 @@ output_blocks (const char *version)
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Unicode blocks.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   fprintf (stream, "static const uc_block_t blocks[] =\n");
@@ -4382,7 +4498,7 @@ output_ident_category (const char *filename, int (*predicate) (unsigned int), co
 
   fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
   fprintf (stream, "/* Language syntax properties of Unicode characters.  */\n");
-  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s.  */\n",
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
            version);
 
   t.p = 7; /* or 8 */
@@ -5277,9 +5393,13 @@ symbolic_width (unsigned int ch)
   if (is_property_unassigned_code_value (ch))
     {
       /* Unicode TR#11 section "Unassigned and Private-Use Characters".  */
-      if (ch >= 0xE000 && ch <= 0xF8FF)
+      if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
         return 'A';
-      if ((ch >= 0x20000 && ch <= 0x2FFFD) || (ch >= 0x30000 && ch <= 0x3FFFD))
+      if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
+          || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
+          || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
+          || (ch >= 0x20000 && ch <= 0x2FFFD) /* Supplementary Ideographic Plane */
+          || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane */)
         return '2';
       return 0;
     }
@@ -5365,47 +5485,49 @@ output_width_property_test (const char *filename)
 
 /* ========================================================================= */
 
-/* Line breaking classification.  */
+/* Line breaking classification.
+   Updated for Unicode TR #14 revision 24.  */
 
 enum
 {
-  /* Values >= 24 are resolved at run time. */
-  LBP_BK = 24, /* mandatory break */
+  /* Values >= 25 are resolved at run time. */
+  LBP_BK = 25, /* mandatory break */
 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 /*LBP_LF,         line feed - not used here because it's a DOSism */
-  LBP_CM = 25, /* attached characters and combining marks */
+  LBP_CM = 26, /* attached characters and combining marks */
 /*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
 /*LBP_SG,         surrogates - not used here because they are not characters */
   LBP_WJ =  0, /* word joiner */
-  LBP_ZW = 26, /* zero width space */
+  LBP_ZW = 27, /* zero width space */
   LBP_GL =  1, /* non-breaking (glue) */
-  LBP_SP = 27, /* space */
+  LBP_SP = 28, /* space */
   LBP_B2 =  2, /* break opportunity before and after */
   LBP_BA =  3, /* break opportunity after */
   LBP_BB =  4, /* break opportunity before */
   LBP_HY =  5, /* hyphen */
-  LBP_CB = 28, /* contingent break opportunity */
+  LBP_CB = 29, /* contingent break opportunity */
   LBP_CL =  6, /* closing punctuation */
-  LBP_EX =  7, /* exclamation/interrogation */
-  LBP_IN =  8, /* inseparable */
-  LBP_NS =  9, /* non starter */
-  LBP_OP = 10, /* opening punctuation */
-  LBP_QU = 11, /* ambiguous quotation */
-  LBP_IS = 12, /* infix separator (numeric) */
-  LBP_NU = 13, /* numeric */
-  LBP_PO = 14, /* postfix (numeric) */
-  LBP_PR = 15, /* prefix (numeric) */
-  LBP_SY = 16, /* symbols allowing breaks */
-  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
-  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
-  LBP_H2 = 18, /* Hangul LV syllable */
-  LBP_H3 = 19, /* Hangul LVT syllable */
-  LBP_ID = 20, /* ideographic */
-  LBP_JL = 21, /* Hangul L Jamo */
-  LBP_JV = 22, /* Hangul V Jamo */
-  LBP_JT = 23, /* Hangul T Jamo */
-  LBP_SA = 30, /* complex context (South East Asian) */
-  LBP_XX = 31  /* unknown */
+  LBP_CP =  7, /* closing parenthesis */
+  LBP_EX =  8, /* exclamation/interrogation */
+  LBP_IN =  9, /* inseparable */
+  LBP_NS = 10, /* non starter */
+  LBP_OP = 11, /* opening punctuation */
+  LBP_QU = 12, /* ambiguous quotation */
+  LBP_IS = 13, /* infix separator (numeric) */
+  LBP_NU = 14, /* numeric */
+  LBP_PO = 15, /* postfix (numeric) */
+  LBP_PR = 16, /* prefix (numeric) */
+  LBP_SY = 17, /* symbols allowing breaks */
+  LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
+  LBP_AL = 18, /* ordinary alphabetic and symbol characters */
+  LBP_H2 = 19, /* Hangul LV syllable */
+  LBP_H3 = 20, /* Hangul LVT syllable */
+  LBP_ID = 21, /* ideographic */
+  LBP_JL = 22, /* Hangul L Jamo */
+  LBP_JV = 23, /* Hangul V Jamo */
+  LBP_JT = 24, /* Hangul T Jamo */
+  LBP_SA = 31, /* complex context (South East Asian) */
+  LBP_XX = 32  /* unknown */
 };
 
 /* Returns the line breaking classification for ch, as a bit mask.  */
@@ -5454,7 +5576,8 @@ get_lbp (unsigned int ch)
         attr |= (int64_t) 1 << LBP_B2;
 
       /* break opportunity after */
-      if (ch == 0x1680 /* OGHAM SPACE MARK */
+      if (/* Breaking Spaces */
+          ch == 0x1680 /* OGHAM SPACE MARK */
           || ch == 0x2000 /* EN QUAD */
           || ch == 0x2001 /* EM QUAD */
           || ch == 0x2002 /* EN SPACE */
@@ -5466,12 +5589,17 @@ get_lbp (unsigned int ch)
           || ch == 0x2009 /* THIN SPACE */
           || ch == 0x200A /* HAIR SPACE */
           || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
+          /* Tabs */
           || ch == 0x0009 /* tab */
+          /* Conditional Hyphens */
           || ch == 0x00AD /* SOFT HYPHEN */
+          /* Breaking Hyphens */
           || ch == 0x058A /* ARMENIAN HYPHEN */
+          || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
           || ch == 0x2010 /* HYPHEN */
           || ch == 0x2012 /* FIGURE DASH */
           || ch == 0x2013 /* EN DASH */
+          /* Visible Word Dividers */
           || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
           || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
           || ch == 0x1361 /* ETHIOPIC WORDSPACE */
@@ -5479,6 +5607,7 @@ get_lbp (unsigned int ch)
           || ch == 0x17DA /* KHMER SIGN KOOMUUT */
           || ch == 0x2027 /* HYPHENATION POINT */
           || ch == 0x007C /* VERTICAL LINE */
+          /* Historic Word Separators */
           || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
           || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
           || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
@@ -5495,6 +5624,7 @@ get_lbp (unsigned int ch)
           || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
           || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
           || ch == 0x2E30 /* RING POINT */
+          || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
           || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
           || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
           || ch == 0x10102 /* AEGEAN CHECK MARK */
@@ -5502,6 +5632,7 @@ get_lbp (unsigned int ch)
           || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
           || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
           || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
+          /* Dandas */
           || ch == 0x0964 /* DEVANAGARI DANDA */
           || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
           || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
@@ -5521,17 +5652,18 @@ get_lbp (unsigned int ch)
           || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
           || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
           || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
+          /* Tibetan */
           || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
           || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
           || ch == 0x0F85 /* TIBETAN MARK PALUTA */
           || ch == 0x0FBE /* TIBETAN KU RU KHA */
           || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
           || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
+          /* Other Terminating Punctuation */
           || ch == 0x1804 /* MONGOLIAN COLON */
           || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
           || ch == 0x1B5A /* BALINESE PANTI */
           || ch == 0x1B5B /* BALINESE PAMADA */
-          || ch == 0x1B5C /* BALINESE WINDU */
           || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
           || ch == 0x1B60 /* BALINESE PAMENENG */
           || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
@@ -5558,6 +5690,29 @@ get_lbp (unsigned int ch)
           || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
           || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
+          || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
+          || ch == 0xA6F3 /* BAMUM FULL STOP */
+          || ch == 0xA6F4 /* BAMUM COLON */
+          || ch == 0xA6F5 /* BAMUM COMMA */
+          || ch == 0xA6F6 /* BAMUM SEMICOLON */
+          || ch == 0xA6F7 /* BAMUM QUESTION MARK */
+          || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
+          || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
+          || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
+          || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
+          || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
+          || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
+          || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
+          || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
+          || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
+          || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
+          || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
+          || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
+          || ch == 0x110BE /* KAITHI SECTION MARK */
+          || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
+          || ch == 0x110C0 /* KAITHI DANDA */
+          || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
           || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
           || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
           || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
@@ -5593,9 +5748,15 @@ get_lbp (unsigned int ch)
       if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
         attr |= (int64_t) 1 << LBP_CB;
 
+      /* closing parenthesis */
+      if (ch == 0x0029 /* RIGHT PARENTHESIS */
+          || ch == 0x005D /* RIGHT SQUARE BRACKET */)
+        attr |= (int64_t) 1 << LBP_CP;
+
       /* closing punctuation */
       if ((unicode_attributes[ch].category[0] == 'P'
-           && unicode_attributes[ch].category[1] == 'e')
+           && unicode_attributes[ch].category[1] == 'e'
+           && !(attr & ((int64_t) 1 << LBP_CP)))
           || ch == 0x3001 /* IDEOGRAPHIC COMMA */
           || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
           || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
@@ -5605,7 +5766,16 @@ get_lbp (unsigned int ch)
           || ch == 0xFF0C /* FULLWIDTH COMMA */
           || ch == 0xFF0E /* FULLWIDTH FULL STOP */
           || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
-          || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
+          || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
+          || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
+          || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
+          || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
+          || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
+          || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
+          || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
+          || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
         attr |= (int64_t) 1 << LBP_CL;
 
       /* exclamation/interrogation */
@@ -5690,7 +5860,14 @@ get_lbp (unsigned int ch)
            && unicode_attributes[ch].category[1] == 's')
           || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
           || ch == 0x00BF /* INVERTED QUESTION MARK */
-          || ch == 0x2E18 /* INVERTED INTERROBANG */)
+          || ch == 0x2E18 /* INVERTED INTERROBANG */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
+          || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
+          || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
+          || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
+          || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
+          || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
         attr |= (int64_t) 1 << LBP_OP;
 
       /* ambiguous quotation */
@@ -5759,7 +5936,12 @@ get_lbp (unsigned int ch)
           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
           || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
           || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
-          || ch == 0x0D79 /* MALAYALAM DATE MARK */)
+          || ch == 0x09F2 /* BENGALI RUPEE MARK */
+          || ch == 0x09F3 /* BENGALI RUPEE SIGN */
+          || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
+          || ch == 0x0D79 /* MALAYALAM DATE MARK */
+          || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
+          || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
         attr |= (int64_t) 1 << LBP_PO;
 
       /* prefix (numeric) */
@@ -5784,13 +5966,13 @@ get_lbp (unsigned int ch)
       if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
         attr |= (int64_t) 1 << LBP_H3;
 
-      if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
+      if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
         attr |= (int64_t) 1 << LBP_JL;
 
-      if (ch >= 0x1160 && ch <= 0x11A2)
+      if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
         attr |= (int64_t) 1 << LBP_JV;
 
-      if (ch >= 0x11A8 && ch <= 0x11F9)
+      if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
         attr |= (int64_t) 1 << LBP_JT;
 
       /* complex context (South East Asian) */
@@ -5801,16 +5983,22 @@ get_lbp (unsigned int ch)
                    || unicode_attributes[ch].category[1] == 'o'))
            || (unicode_attributes[ch].category[0] == 'M'
                && (unicode_attributes[ch].category[1] == 'c'
-                   || unicode_attributes[ch].category[1] == 'n'))
+                   || unicode_attributes[ch].category[1] == 'n')
+               && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
            /* Extra characters for compatibility with Unicode LineBreak.txt.  */
            || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
            || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
            || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
-           || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
-          && ((ch >= 0x0E00 && ch <= 0x0EFF)
-              || (ch >= 0x1000 && ch <= 0x109F)
-              || (ch >= 0x1780 && ch <= 0x17FF)
-              || (ch >= 0x1950 && ch <= 0x19DF)))
+           || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
+           || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
+           || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
+           || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
+          && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
+              || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
+              || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
+              || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
+              || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
+              || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
         attr |= (int64_t) 1 << LBP_SA;
 
       /* attached characters and combining marks */
@@ -5820,7 +6008,8 @@ get_lbp (unsigned int ch)
                || unicode_attributes[ch].category[1] == 'n'))
           || (unicode_attributes[ch].category[0] == 'C'
               && (unicode_attributes[ch].category[1] == 'c'
-                  || unicode_attributes[ch].category[1] == 'f')))
+                  || unicode_attributes[ch].category[1] == 'f')
+              && ch != 0x110BD /* KAITHI NUMBER SIGN */))
         if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
           attr |= (int64_t) 1 << LBP_CM;
 
@@ -5829,8 +6018,8 @@ get_lbp (unsigned int ch)
           || ch == 0x3000 /* IDEOGRAPHIC SPACE */
           || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
           || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
-          || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
-          || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
+          || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
+          || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
           || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
           || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
           || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
@@ -5844,7 +6033,7 @@ get_lbp (unsigned int ch)
           || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
           || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
           || (ch >= 0x3000 && ch <= 0x33FF
-              && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL))))
+              && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
           || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
           || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
@@ -5887,7 +6076,10 @@ get_lbp (unsigned int ch)
           || ch == 0xFF5E /* FULLWIDTH TILDE */
           || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
           || ch == 0xFFE3 /* FULLWIDTH MACRON */
-          || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
+          || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
+          || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */)
         if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
           {
             /* ambiguous (ideograph) ? */
@@ -5928,8 +6120,10 @@ get_lbp (unsigned int ch)
           || ch == 0x2061 /* FUNCTION APPLICATION */
           || ch == 0x2062 /* INVISIBLE TIMES */
           || ch == 0x2063 /* INVISIBLE SEPARATOR */
-          || ch == 0x2064 /* INVISIBLE PLUS */)
-        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+          || ch == 0x2064 /* INVISIBLE PLUS */
+          /* Extra characters for compatibility with Unicode LineBreak.txt.  */
+          || ch == 0x110BD /* KAITHI NUMBER SIGN */)
+        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
           {
             /* ambiguous (alphabetic) ? */
             if ((unicode_width[ch] != NULL
@@ -5986,6 +6180,20 @@ get_lbp (unsigned int ch)
             attr &= ~((int64_t) 1 << LBP_CM);
           }
     }
+  else
+    {
+      /* Unassigned character.  */
+      if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
+          || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
+          || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
+          || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
+          || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
+                                                 Supplementary Ideographic Plane (Plane 2) outside of blocks */
+          || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
+                                                 Supplementary Ideographic Plane (Plane 2) outside of blocks */
+          || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
+        attr |= (int64_t) 1 << LBP_ID;
+    }
 
   if (attr == 0)
     /* unknown */
@@ -6007,7 +6215,7 @@ debug_output_lbp (FILE *stream)
         {
           fprintf (stream, "0x%04X", i);
 #define PRINT_BIT(attr,bit) \
-  if (attr & (1 << bit)) fprintf (stream, " " #bit);
+  if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
           PRINT_BIT(attr,LBP_BK);
           PRINT_BIT(attr,LBP_CM);
           PRINT_BIT(attr,LBP_WJ);
@@ -6020,6 +6228,7 @@ debug_output_lbp (FILE *stream)
           PRINT_BIT(attr,LBP_HY);
           PRINT_BIT(attr,LBP_CB);
           PRINT_BIT(attr,LBP_CL);
+          PRINT_BIT(attr,LBP_CP);
           PRINT_BIT(attr,LBP_EX);
           PRINT_BIT(attr,LBP_IN);
           PRINT_BIT(attr,LBP_NS);
@@ -6133,6 +6342,7 @@ fill_org_lbp (const char *linebreak_filename)
       TRY(LBP_HY)
       TRY(LBP_CB)
       TRY(LBP_CL)
+      TRY(LBP_CP)
       TRY(LBP_EX)
       TRY(LBP_IN)
       TRY(LBP_NS)
@@ -6211,6 +6421,7 @@ debug_output_org_lbp (FILE *stream)
           PRINT_BIT(attr,LBP_HY);
           PRINT_BIT(attr,LBP_CB);
           PRINT_BIT(attr,LBP_CL);
+          PRINT_BIT(attr,LBP_CP);
           PRINT_BIT(attr,LBP_EX);
           PRINT_BIT(attr,LBP_IN);
           PRINT_BIT(attr,LBP_NS);
@@ -6383,6 +6594,7 @@ output_lbp (FILE *stream1, FILE *stream2)
           CASE(LBP_HY);
           CASE(LBP_CB);
           CASE(LBP_CL);
+          CASE(LBP_CP);
           CASE(LBP_EX);
           CASE(LBP_IN);
           CASE(LBP_NS);
@@ -6482,7 +6694,8 @@ output_lbrk_tables (const char *filename1, const char *filename2, const char *ve
 
 /* ========================================================================= */
 
-/* Word break property.  */
+/* Word break property.
+   Updated for Unicode TR #29 revision 15.  */
 
 /* Possible values of the Word_Break property.  */
 enum
@@ -6528,7 +6741,7 @@ get_wbp (unsigned int ch)
 
       if (unicode_attributes[ch].category != NULL
           && strcmp (unicode_attributes[ch].category, "Cf") == 0
-          && ch != 0x200C && ch != 0x200D)
+          && ch != 0x200B && ch != 0x200C && ch != 0x200D)
         attr |= 1 << WBP_FORMAT;
 
       if ((unicode_scripts[ch] < numscripts
@@ -6548,12 +6761,10 @@ get_wbp (unsigned int ch)
           && (attr & (1 << WBP_EXTEND)) == 0)
         attr |= 1 << WBP_ALETTER;
 
-      if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
-          || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
+      if (is_WBP_MIDNUMLET (ch))
         attr |= 1 << WBP_MIDNUMLET;
 
-      if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
-          || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
+      if (is_WBP_MIDLETTER (ch))
         attr |= 1 << WBP_MIDLETTER;
 
       if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
@@ -6960,7 +7171,8 @@ output_wbrk_tables (const char *filename, const char *version)
 
 /* ========================================================================= */
 
-/* Grapheme break property.  */
+/* Grapheme break property.
+   Updated for Unicode TR #29 revision 15.  */
 
 /* Possible values of the Grapheme_Cluster_Break property.  */
 enum
@@ -7265,6 +7477,9 @@ fill_org_gbp (const char *graphemebreakproperty_filename)
 
 /* ========================================================================= */
 
+/* Composition and decomposition.
+   Updated for Unicode TR #15 revision 31.  */
+
 /* Maximum number of characters into which a single Unicode character can be
    decomposed.  */
 #define MAX_DECOMP_LENGTH 18
@@ -7713,7 +7928,7 @@ output_composition_tables (const char *filename, const char *version)
      1527, which is quite good (60% filled).  It requires an auxiliary table
      lookup in a table of size 0.5 KB.  The total tables size is 11 KB.  */
 
-  fprintf (stream, "struct composition_rule { char codes[4]; };\n");
+  fprintf (stream, "struct composition_rule { char codes[6]; };\n");
   fprintf (stream, "%%struct-type\n");
   fprintf (stream, "%%language=ANSI-C\n");
   fprintf (stream, "%%define slot-name codes\n");
@@ -7751,16 +7966,9 @@ output_composition_tables (const char *filename, const char *version)
               if (strcmp (unicode_attributes[combined].combining, "0") != 0)
                 abort ();
 
-              if (!(code1 < 0x10000))
-                abort ();
-              if (!(code2 < 0x10000))
-                abort ();
-              if (!(combined < 0x10000))
-                abort ();
-
-              fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
-                       (code1 >> 8) & 0xff, code1 & 0xff,
-                       (code2 >> 8) & 0xff, code2 & 0xff,
+              fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
+                       (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
+                       (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
                        combined);
             }
         }
@@ -8900,20 +9108,20 @@ main (int argc, char * argv[])
  * compile-command: "
    gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
    ./gen-uni-tables \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/UnicodeData.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/PropList.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/DerivedCoreProperties.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/Scripts.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/Blocks.txt \
         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
-        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
-        5.1.0 \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/EastAsianWidth.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/LineBreak.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/auxiliary/WordBreakProperty.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/auxiliary/GraphemeBreakProperty.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/CompositionExclusions.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/SpecialCasing.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.2.0/ucd/CaseFolding.txt \
+        5.2.0 \
    && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
    && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt
    "