/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      5.1.0
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ========================================================================= */

/* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   A NULL name marks a code point that has no entry; string fields that are
   empty in the file are stored as "". */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   the numeric case mapping fields (upper, lower, title). */
#define NONE (~(unsigned int)0)
72 /* The entire contents of the UnicodeData.txt file. */
73 struct unicode_attribute unicode_attributes [0x110000];
75 /* Stores in unicode_attributes[i] the values from the given fields. */
77 fill_attribute (unsigned int i,
78 const char *field1, const char *field2,
79 const char *field3, const char *field4,
80 const char *field5, const char *field6,
81 const char *field7, const char *field8,
82 const char *field9, const char *field10,
83 const char *field11, const char *field12,
84 const char *field13, const char *field14)
86 struct unicode_attribute * uni;
90 fprintf (stderr, "index too large\n");
93 if (strcmp (field2, "Cs") == 0)
94 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
96 uni = &unicode_attributes[i];
97 /* Copy the strings. */
98 uni->name = strdup (field1);
99 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
100 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
101 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
102 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
103 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
104 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
105 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
106 uni->mirrored = (field9[0] == 'Y');
107 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
108 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
109 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
110 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
111 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file. */
/* NOTE(review): the value was lost from this copy of the file; 120 is the
   value restored here - confirm against the reference sources. */
#define FIELDLEN 120

/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are skipped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   was reached before DELIM; BUFFER is then not NUL-terminated).
   Terminates the program when the field does not fit into BUFFER. */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators. Silently convert to LF. */
      if (c == '\r')
        continue;

      /* Put c into the buffer. */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
149 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
152 fill_attributes (const char *unicodedata_filename)
156 char field0[FIELDLEN];
157 char field1[FIELDLEN];
158 char field2[FIELDLEN];
159 char field3[FIELDLEN];
160 char field4[FIELDLEN];
161 char field5[FIELDLEN];
162 char field6[FIELDLEN];
163 char field7[FIELDLEN];
164 char field8[FIELDLEN];
165 char field9[FIELDLEN];
166 char field10[FIELDLEN];
167 char field11[FIELDLEN];
168 char field12[FIELDLEN];
169 char field13[FIELDLEN];
170 char field14[FIELDLEN];
173 for (i = 0; i < 0x110000; i++)
174 unicode_attributes[i].name = NULL;
176 stream = fopen (unicodedata_filename, "r");
179 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 n = getfield (stream, field0, ';');
189 n += getfield (stream, field1, ';');
190 n += getfield (stream, field2, ';');
191 n += getfield (stream, field3, ';');
192 n += getfield (stream, field4, ';');
193 n += getfield (stream, field5, ';');
194 n += getfield (stream, field6, ';');
195 n += getfield (stream, field7, ';');
196 n += getfield (stream, field8, ';');
197 n += getfield (stream, field9, ';');
198 n += getfield (stream, field10, ';');
199 n += getfield (stream, field11, ';');
200 n += getfield (stream, field12, ';');
201 n += getfield (stream, field13, ';');
202 n += getfield (stream, field14, '\n');
207 fprintf (stderr, "short line in '%s':%d\n",
208 unicodedata_filename, lineno);
211 i = strtoul (field0, NULL, 16);
213 && strlen (field1) >= 9
214 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
216 /* Deal with a range. */
218 n = getfield (stream, field0, ';');
219 n += getfield (stream, field1, ';');
220 n += getfield (stream, field2, ';');
221 n += getfield (stream, field3, ';');
222 n += getfield (stream, field4, ';');
223 n += getfield (stream, field5, ';');
224 n += getfield (stream, field6, ';');
225 n += getfield (stream, field7, ';');
226 n += getfield (stream, field8, ';');
227 n += getfield (stream, field9, ';');
228 n += getfield (stream, field10, ';');
229 n += getfield (stream, field11, ';');
230 n += getfield (stream, field12, ';');
231 n += getfield (stream, field13, ';');
232 n += getfield (stream, field14, '\n');
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
239 if (!(field1[0] == '<'
240 && strlen (field1) >= 8
241 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
247 field1[strlen (field1) - 7] = '\0';
248 j = strtoul (field0, NULL, 16);
250 fill_attribute (i, field1+1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
256 /* Single character line */
257 fill_attribute (i, field1, field2, field3, field4, field5,
258 field6, field7, field8, field9, field10,
259 field11, field12, field13, field14);
262 if (ferror (stream) || fclose (stream))
264 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
/* ========================================================================= */

/* General category. */
/* See Unicode 3.0 book, section 4.5,
   UCD.html. */
276 is_category_L (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L');
283 is_category_Lu (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'u');
291 is_category_Ll (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 'l');
299 is_category_Lt (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 't');
307 is_category_Lm (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'm');
315 is_category_Lo (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 'o');
323 is_category_M (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M');
330 is_category_Mn (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'n');
338 is_category_Mc (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'c');
346 is_category_Me (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'e');
354 is_category_N (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N');
361 is_category_Nd (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'd');
369 is_category_Nl (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'l');
377 is_category_No (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'o');
385 is_category_P (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P');
392 is_category_Pc (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'c');
400 is_category_Pd (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 'd');
408 is_category_Ps (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 's');
416 is_category_Pe (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'e');
424 is_category_Pi (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'i');
432 is_category_Pf (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'f');
440 is_category_Po (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'o');
448 is_category_S (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S');
455 is_category_Sm (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'm');
463 is_category_Sc (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'c');
471 is_category_Sk (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'k');
479 is_category_So (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'o');
487 is_category_Z (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z');
494 is_category_Zs (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 's');
502 is_category_Zl (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'l');
510 is_category_Zp (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 'p');
518 is_category_C (unsigned int ch)
520 return (unicode_attributes[ch].name == NULL
521 || unicode_attributes[ch].category[0] == 'C');
525 is_category_Cc (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'c');
533 is_category_Cf (unsigned int ch)
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'C'
537 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the surrogate range 0xD800..0xDFFF.
   This is decided by range alone, because fill_attribute does not store
   entries of category "Cs" in unicode_attributes[]. */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
547 is_category_Co (unsigned int ch)
549 return (unicode_attributes[ch].name != NULL
550 && unicode_attributes[ch].category[0] == 'C'
551 && unicode_attributes[ch].category[1] == 'o');
555 is_category_Cn (unsigned int ch)
557 return (unicode_attributes[ch].name == NULL
558 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   for which PREDICATE is true: "0xFIRST..0xLAST" for a run of several code
   points, "0xCODE" for a single one.
   Terminates the program if the file cannot be opened or written. */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

#if 0 /* This yields huge text output. */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        fprintf (stream, "0x%04X\n", ch);
      }
#else
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true. */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }
#endif

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment: a license header, an include of
   "test-predicate-part1.h", one "{ first, last }" initializer per maximal
   run of code points for which PREDICATE is true, a definition of
   PREDICATE(c) as EXPRESSION, and an include of "test-predicate-part2.h".
   Terminates the program if the file cannot be opened or written. */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* Separate the initializers with ",\n" without a trailing comma. */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
669 /* Construction of sparse 3-level tables. */
670 #define TABLE predicate_table
671 #define xmalloc malloc
672 #define xrealloc realloc
673 #include "3levelbit.h"
675 /* Output a boolean property in a three-level bitmap. */
677 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
681 struct predicate_table t;
682 unsigned int level1_offset, level2_offset, level3_offset;
684 stream = fopen (filename, "w");
687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
692 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
693 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
698 predicate_table_init (&t);
700 for (ch = 0; ch < 0x110000; ch++)
702 predicate_table_add (&t, ch);
704 predicate_table_finalize (&t);
706 /* Offsets in t.result, in memory of this process. */
708 5 * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t);
713 5 * sizeof (uint32_t)
714 + t.level1_size * sizeof (uint32_t)
715 + (t.level2_size << t.q) * sizeof (uint32_t);
717 for (i = 0; i < 5; i++)
719 fprintf (stream, "#define header_%d %d\n", i,
720 ((uint32_t *) t.result)[i]);
722 fprintf (stream, "static const\n");
723 fprintf (stream, "struct\n");
724 fprintf (stream, " {\n");
725 fprintf (stream, " int header[1];\n");
726 fprintf (stream, " int level1[%zu];\n", t.level1_size);
727 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
728 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
729 fprintf (stream, " }\n");
730 fprintf (stream, "%s =\n", name);
731 fprintf (stream, "{\n");
732 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
733 fprintf (stream, " {");
734 if (t.level1_size > 1)
735 fprintf (stream, "\n ");
736 for (i = 0; i < t.level1_size; i++)
739 if (i > 0 && (i % 1) == 0)
740 fprintf (stream, "\n ");
741 offset = ((uint32_t *) (t.result + level1_offset))[i];
743 fprintf (stream, " %5d", -1);
745 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
746 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
747 if (i+1 < t.level1_size)
748 fprintf (stream, ",");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 fprintf (stream, " },\n");
753 fprintf (stream, " {");
754 if (t.level2_size << t.q > 1)
755 fprintf (stream, "\n ");
756 for (i = 0; i < t.level2_size << t.q; i++)
759 if (i > 0 && (i % 1) == 0)
760 fprintf (stream, "\n ");
761 offset = ((uint32_t *) (t.result + level2_offset))[i];
763 fprintf (stream, " %5d", -1);
765 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
766 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
767 if (i+1 < t.level2_size << t.q)
768 fprintf (stream, ",");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 fprintf (stream, " },\n");
773 fprintf (stream, " {");
774 if (t.level3_size << t.p > 4)
775 fprintf (stream, "\n ");
776 for (i = 0; i < t.level3_size << t.p; i++)
778 if (i > 0 && (i % 4) == 0)
779 fprintf (stream, "\n ");
780 fprintf (stream, " 0x%08X",
781 ((uint32_t *) (t.result + level3_offset))[i]);
782 if (i+1 < t.level3_size << t.p)
783 fprintf (stream, ",");
785 if (t.level3_size << t.p > 4)
786 fprintf (stream, "\n ");
787 fprintf (stream, " }\n");
788 fprintf (stream, "};\n");
790 if (ferror (stream) || fclose (stream))
792 fprintf (stderr, "error writing to '%s'\n", filename);
797 /* Output all categories. */
799 output_categories (const char *version)
801 #define CATEGORY(C) \
802 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
803 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
804 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks of the general categories.  Each two-letter category owns one
   bit; each one-letter category is the union of the bits of its
   subcategories. */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
887 general_category_byname (const char *category_name)
889 if (category_name[0] != '\0'
890 && (category_name[1] == '\0' || category_name[2] == '\0'))
891 switch (category_name[0])
894 switch (category_name[1])
896 case '\0': return UC_CATEGORY_MASK_L;
897 case 'u': return UC_CATEGORY_MASK_Lu;
898 case 'l': return UC_CATEGORY_MASK_Ll;
899 case 't': return UC_CATEGORY_MASK_Lt;
900 case 'm': return UC_CATEGORY_MASK_Lm;
901 case 'o': return UC_CATEGORY_MASK_Lo;
905 switch (category_name[1])
907 case '\0': return UC_CATEGORY_MASK_M;
908 case 'n': return UC_CATEGORY_MASK_Mn;
909 case 'c': return UC_CATEGORY_MASK_Mc;
910 case 'e': return UC_CATEGORY_MASK_Me;
914 switch (category_name[1])
916 case '\0': return UC_CATEGORY_MASK_N;
917 case 'd': return UC_CATEGORY_MASK_Nd;
918 case 'l': return UC_CATEGORY_MASK_Nl;
919 case 'o': return UC_CATEGORY_MASK_No;
923 switch (category_name[1])
925 case '\0': return UC_CATEGORY_MASK_P;
926 case 'c': return UC_CATEGORY_MASK_Pc;
927 case 'd': return UC_CATEGORY_MASK_Pd;
928 case 's': return UC_CATEGORY_MASK_Ps;
929 case 'e': return UC_CATEGORY_MASK_Pe;
930 case 'i': return UC_CATEGORY_MASK_Pi;
931 case 'f': return UC_CATEGORY_MASK_Pf;
932 case 'o': return UC_CATEGORY_MASK_Po;
936 switch (category_name[1])
938 case '\0': return UC_CATEGORY_MASK_S;
939 case 'm': return UC_CATEGORY_MASK_Sm;
940 case 'c': return UC_CATEGORY_MASK_Sc;
941 case 'k': return UC_CATEGORY_MASK_Sk;
942 case 'o': return UC_CATEGORY_MASK_So;
946 switch (category_name[1])
948 case '\0': return UC_CATEGORY_MASK_Z;
949 case 's': return UC_CATEGORY_MASK_Zs;
950 case 'l': return UC_CATEGORY_MASK_Zl;
951 case 'p': return UC_CATEGORY_MASK_Zp;
955 switch (category_name[1])
957 case '\0': return UC_CATEGORY_MASK_C;
958 case 'c': return UC_CATEGORY_MASK_Cc;
959 case 'f': return UC_CATEGORY_MASK_Cf;
960 case 's': return UC_CATEGORY_MASK_Cs;
961 case 'o': return UC_CATEGORY_MASK_Co;
962 case 'n': return UC_CATEGORY_MASK_Cn;
966 /* Invalid category name. */
970 /* Construction of sparse 3-level tables. */
971 #define TABLE category_table
972 #define ELEMENT uint8_t
973 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
974 #define xmalloc malloc
975 #define xrealloc realloc
978 /* Output the per-character category table. */
980 output_category (const char *filename, const char *version)
984 struct category_table t;
985 unsigned int level1_offset, level2_offset, level3_offset;
986 uint16_t *level3_packed;
988 stream = fopen (filename, "w");
991 fprintf (stderr, "cannot open '%s' for writing\n", filename);
995 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
996 fprintf (stream, "/* Categories of Unicode characters. */\n");
997 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1002 category_table_init (&t);
1004 for (ch = 0; ch < 0x110000; ch++)
1007 unsigned int log2_value;
1009 if (is_category_Cs (ch))
1010 value = UC_CATEGORY_MASK_Cs;
1011 else if (unicode_attributes[ch].name != NULL)
1012 value = general_category_byname (unicode_attributes[ch].category);
1016 /* Now value should contain exactly one bit. */
1017 if (value == 0 || ((value & (value - 1)) != 0))
1020 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1022 category_table_add (&t, ch, log2_value);
1025 category_table_finalize (&t);
1027 /* Offsets in t.result, in memory of this process. */
1029 5 * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t);
1034 5 * sizeof (uint32_t)
1035 + t.level1_size * sizeof (uint32_t)
1036 + (t.level2_size << t.q) * sizeof (uint32_t);
1038 for (i = 0; i < 5; i++)
1039 fprintf (stream, "#define category_header_%d %d\n", i,
1040 ((uint32_t *) t.result)[i]);
1041 fprintf (stream, "static const\n");
1042 fprintf (stream, "struct\n");
1043 fprintf (stream, " {\n");
1044 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1045 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1046 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1047 (1 << t.p) * 5 / 16);
1048 fprintf (stream, " }\n");
1049 fprintf (stream, "u_category =\n");
1050 fprintf (stream, "{\n");
1051 fprintf (stream, " {");
1052 if (t.level1_size > 8)
1053 fprintf (stream, "\n ");
1054 for (i = 0; i < t.level1_size; i++)
1057 if (i > 0 && (i % 8) == 0)
1058 fprintf (stream, "\n ");
1059 offset = ((uint32_t *) (t.result + level1_offset))[i];
1061 fprintf (stream, " %5d", -1);
1063 fprintf (stream, " %5zu",
1064 (offset - level2_offset) / sizeof (uint32_t));
1065 if (i+1 < t.level1_size)
1066 fprintf (stream, ",");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
1070 fprintf (stream, " },\n");
1071 fprintf (stream, " {");
1072 if (t.level2_size << t.q > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level2_size << t.q; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level2_offset))[i];
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level3_offset) / sizeof (uint8_t));
1085 if (i+1 < t.level2_size << t.q)
1086 fprintf (stream, ",");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1092 not 32-bit units, in order to make the lookup function easier. */
1095 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1096 for (i = 0; i < t.level3_size << t.p; i++)
1098 unsigned int j = (i * 5) / 16;
1099 unsigned int k = (i * 5) % 16;
1100 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1101 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1102 level3_packed[j] = value & 0xffff;
1103 level3_packed[j+1] = value >> 16;
1105 fprintf (stream, " {");
1106 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1107 fprintf (stream, "\n ");
1108 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1110 if (i > 0 && (i % 8) == 0)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " 0x%04x", level3_packed[i]);
1113 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1114 fprintf (stream, ",");
1116 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1117 fprintf (stream, "\n ");
1118 fprintf (stream, " }\n");
1119 free (level3_packed);
1120 fprintf (stream, "};\n");
1122 if (ferror (stream) || fclose (stream))
1124 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */

/* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2,
   UCD.html. */
1135 /* Construction of sparse 3-level tables. */
1136 #define TABLE combclass_table
1137 #define ELEMENT uint8_t
1139 #define xmalloc malloc
1140 #define xrealloc realloc
1143 /* Output the per-character combining class table. */
1145 output_combclass (const char *filename, const char *version)
1149 struct combclass_table t;
1150 unsigned int level1_offset, level2_offset, level3_offset;
1152 stream = fopen (filename, "w");
1155 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1159 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1160 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1161 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1166 combclass_table_init (&t);
1168 for (ch = 0; ch < 0x110000; ch++)
1169 if (unicode_attributes[ch].name != NULL)
1171 int value = atoi (unicode_attributes[ch].combining);
1172 if (!(value >= 0 && value <= 255))
1174 combclass_table_add (&t, ch, value);
1177 combclass_table_finalize (&t);
1179 /* Offsets in t.result, in memory of this process. */
1181 5 * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t);
1186 5 * sizeof (uint32_t)
1187 + t.level1_size * sizeof (uint32_t)
1188 + (t.level2_size << t.q) * sizeof (uint32_t);
1190 for (i = 0; i < 5; i++)
1191 fprintf (stream, "#define combclass_header_%d %d\n", i,
1192 ((uint32_t *) t.result)[i]);
1193 fprintf (stream, "static const\n");
1194 fprintf (stream, "struct\n");
1195 fprintf (stream, " {\n");
1196 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1197 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1198 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1199 fprintf (stream, " }\n");
1200 fprintf (stream, "u_combclass =\n");
1201 fprintf (stream, "{\n");
1202 fprintf (stream, " {");
1203 if (t.level1_size > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < t.level1_size; i++)
1208 if (i > 0 && (i % 8) == 0)
1209 fprintf (stream, "\n ");
1210 offset = ((uint32_t *) (t.result + level1_offset))[i];
1212 fprintf (stream, " %5d", -1);
1214 fprintf (stream, " %5zu",
1215 (offset - level2_offset) / sizeof (uint32_t));
1216 if (i+1 < t.level1_size)
1217 fprintf (stream, ",");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
1221 fprintf (stream, " },\n");
1222 fprintf (stream, " {");
1223 if (t.level2_size << t.q > 8)
1224 fprintf (stream, "\n ");
1225 for (i = 0; i < t.level2_size << t.q; i++)
1228 if (i > 0 && (i % 8) == 0)
1229 fprintf (stream, "\n ");
1230 offset = ((uint32_t *) (t.result + level2_offset))[i];
1232 fprintf (stream, " %5d", -1);
1234 fprintf (stream, " %5zu",
1235 (offset - level3_offset) / sizeof (uint8_t));
1236 if (i+1 < t.level2_size << t.q)
1237 fprintf (stream, ",");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " },\n");
1242 fprintf (stream, " {");
1243 if (t.level3_size << t.p > 8)
1244 fprintf (stream, "\n ");
1245 for (i = 0; i < t.level3_size << t.p; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1250 if (i+1 < t.level3_size << t.p)
1251 fprintf (stream, ",");
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream, "\n ");
1255 fprintf (stream, " }\n");
1256 fprintf (stream, "};\n");
1258 if (ferror (stream) || fclose (stream))
1260 fprintf (stderr, "error writing to '%s'\n", filename);
1265 /* ========================================================================= */
1267 /* Bidirectional category. */
1268 /* See Unicode 3.0 book, section 4.3,
1273 UC_BIDI_L, /* Left-to-Right */
1274 UC_BIDI_LRE, /* Left-to-Right Embedding */
1275 UC_BIDI_LRO, /* Left-to-Right Override */
1276 UC_BIDI_R, /* Right-to-Left */
1277 UC_BIDI_AL, /* Right-to-Left Arabic */
1278 UC_BIDI_RLE, /* Right-to-Left Embedding */
1279 UC_BIDI_RLO, /* Right-to-Left Override */
1280 UC_BIDI_PDF, /* Pop Directional Format */
1281 UC_BIDI_EN, /* European Number */
1282 UC_BIDI_ES, /* European Number Separator */
1283 UC_BIDI_ET, /* European Number Terminator */
1284 UC_BIDI_AN, /* Arabic Number */
1285 UC_BIDI_CS, /* Common Number Separator */
1286 UC_BIDI_NSM, /* Non-Spacing Mark */
1287 UC_BIDI_BN, /* Boundary Neutral */
1288 UC_BIDI_B, /* Paragraph Separator */
1289 UC_BIDI_S, /* Segment Separator */
1290 UC_BIDI_WS, /* Whitespace */
1291 UC_BIDI_ON /* Other Neutral */
/* Classifies the bidi category name CATEGORY_NAME by switching on its
   characters one position at a time.  A candidate is accepted only when the
   string ends right after the matched characters (the comparisons against
   the NUL terminator below), so a longer name sharing a prefix is not
   mistaken for a shorter one.
   NOTE(review): the case labels and return statements are on lines missing
   from this extract; presumably each accepted name returns the matching
   UC_BIDI_* constant -- confirm against the complete file.  */
1295 bidi_category_byname (const char *category_name)
/* First level of dispatch: the first character of the name.  */
1297 switch (category_name[0])
1300 switch (category_name[1])
1303 if (category_name[2] == '\0')
1307 if (category_name[2] == '\0')
1313 switch (category_name[1])
1318 if (category_name[2] == '\0')
1324 switch (category_name[1])
1327 if (category_name[2] == '\0')
1333 switch (category_name[1])
1336 if (category_name[2] == '\0')
1340 if (category_name[2] == '\0')
1344 if (category_name[2] == '\0')
1350 switch (category_name[1])
/* Three-letter names need one more level of dispatch.  */
1355 switch (category_name[2])
1358 if (category_name[3] == '\0')
1362 if (category_name[3] == '\0')
1370 switch (category_name[1])
1373 switch (category_name[2])
1376 if (category_name[3] == '\0')
1384 switch (category_name[1])
1387 if (category_name[2] == '\0')
1393 switch (category_name[1])
1396 switch (category_name[2])
1399 if (category_name[3] == '\0')
1407 switch (category_name[1])
1412 switch (category_name[2])
1415 if (category_name[3] == '\0')
1419 if (category_name[3] == '\0')
/* A one-letter name: the string must end after the first character.  */
1427 if (category_name[1] == '\0')
1431 switch (category_name[1])
1434 if (category_name[2] == '\0')
/* Reached when no branch above accepted the name; the statement that
   handles this case is not visible in this extract.  */
1440 /* Invalid bidi category name. */
/* Returns the bidi category of the code point CH.
   Assigned characters (those with a non-NULL name in unicode_attributes)
   get the category named in their bidi field, converted by
   bidi_category_byname.  Unassigned characters are classified by the code
   point ranges tested below.
   NOTE(review): the value returned for each range, and the final fallback,
   are on lines missing from this extract -- confirm against the complete
   file and DerivedBidiClass.txt.  */
1445 get_bidi_category (unsigned int ch)
1447 if (unicode_attributes[ch].name != NULL)
1448 return bidi_category_byname (unicode_attributes[ch].bidi);
1451 /* The bidi category of unassigned characters depends on the range.
1452 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges.  */
1453 if ((ch >= 0x0590 && ch <= 0x05FF)
1454 || (ch >= 0x07FB && ch <= 0x08FF)
1455 || (ch >= 0xFB37 && ch <= 0xFB45)
1456 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges.  */
1458 else if ((ch >= 0x0600 && ch <= 0x07BF)
1459 || (ch >= 0x2064 && ch <= 0x2069)
1460 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1461 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group.  The two masked comparisons match the last two code points
   of every plane (xxFFFE and xxFFFF), whatever the plane number is.  */
1463 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1464 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1465 || (ch & 0xFFFF) == 0xFFFE
1466 || (ch & 0xFFFF) == 0xFFFF
1467 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1474 /* Construction of sparse 3-level tables.
   These macros parameterize a generic table template.  The code that
   consumes them (presumably an include directive right after this group)
   is not visible in this extract.  */
/* Name of the generated table type: output_bidi_category uses
   struct bidi_category_table together with bidi_category_table_init,
   bidi_category_table_add and bidi_category_table_finalize.  */
1475 #define TABLE bidi_category_table
/* One byte per stored element.  */
1476 #define ELEMENT uint8_t
/* Presumably the value of entries that were never added -- TODO confirm
   against the template.  */
1477 #define DEFAULT UC_BIDI_L
/* Map the xmalloc and xrealloc names onto the plain C allocators.  */
1478 #define xmalloc malloc
1479 #define xrealloc realloc
1482 /* Output the per-character bidi category table.
   Writes a C source fragment to FILENAME: a banner comment, five
   bidi_category_header_N macros, and the initialized three-level table
   u_bidi_category whose third level is packed at 5 bits per character.
   VERSION is presumably the Unicode version substituted into the banner
   (the argument line of that fprintf is missing from this extract).
   NOTE(review): the banner names gen-ctype.c although the usage comment at
   the top of the file calls the program gen-uni-tables -- confirm which
   name is intended.  */
1484 output_bidi_category (const char *filename, const char *version)
1488 struct bidi_category_table t;
1489 unsigned int level1_offset, level2_offset, level3_offset;
1490 uint16_t *level3_packed;
1492 stream = fopen (filename, "w");
1495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1500 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1501 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1506 bidi_category_table_init (&t);
/* Record the category of every code point from 0 up to 0x10FFFF.  */
1508 for (ch = 0; ch < 0x110000; ch++)
1510 int value = get_bidi_category (ch);
1512 bidi_category_table_add (&t, ch, value);
1515 bidi_category_table_finalize (&t);
1517 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: a header of
   five uint32_t words comes first, then the level1 entries, then the
   level2 entries, then the level3 data.  The assignment targets are on
   lines missing from this extract (presumably level1_offset, level2_offset
   and level3_offset, in that order).  */
1519 5 * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t);
1524 5 * sizeof (uint32_t)
1525 + t.level1_size * sizeof (uint32_t)
1526 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.
   NOTE(review): both arguments are unsigned but are printed with %d.  */
1528 for (i = 0; i < 5; i++)
1529 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1530 ((uint32_t *) t.result)[i]);
1531 fprintf (stream, "static const\n");
1532 fprintf (stream, "struct\n");
1533 fprintf (stream, " {\n");
1534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Declared length of the packed array.  It equals the number of
   initializers written further down as long as (1 << t.p) * 5 is a
   multiple of 16.  */
1536 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1537 (1 << t.p) * 5 / 16);
1538 fprintf (stream, " }\n");
1539 fprintf (stream, "u_bidi_category =\n");
1540 fprintf (stream, "{\n");
/* Level 1 initializer, eight values per output line.  */
1541 fprintf (stream, " {");
1542 if (t.level1_size > 8)
1543 fprintf (stream, "\n ");
1544 for (i = 0; i < t.level1_size; i++)
1547 if (i > 0 && (i % 8) == 0)
1548 fprintf (stream, "\n ");
1549 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Two output forms follow: a literal -1, or the entry converted from a
   byte offset in t.result to an index into level2.  The condition choosing
   between them is on a missing line (presumably a test for offset 0).  */
1551 fprintf (stream, " %5d", -1);
1553 fprintf (stream, " %5zu",
1554 (offset - level2_offset) / sizeof (uint32_t));
1555 if (i+1 < t.level1_size)
1556 fprintf (stream, ",");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
1560 fprintf (stream, " },\n");
/* Level 2 initializer, same layout; entries become indices into the
   unpacked level3 bytes.  */
1561 fprintf (stream, " {");
1562 if (t.level2_size << t.q > 8)
1563 fprintf (stream, "\n ");
1564 for (i = 0; i < t.level2_size << t.q; i++)
1567 if (i > 0 && (i % 8) == 0)
1568 fprintf (stream, "\n ");
1569 offset = ((uint32_t *) (t.result + level2_offset))[i];
1571 fprintf (stream, " %5d", -1);
1573 fprintf (stream, " %5zu",
1574 (offset - level3_offset) / sizeof (uint8_t));
1575 if (i+1 < t.level2_size << t.q)
1576 fprintf (stream, ",");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
1580 fprintf (stream, " },\n");
1581 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1582 not 32-bit units, in order to make the lookup function easier. */
/* The extra word in the allocation is what the access to
   level3_packed[j+1] below relies on; it stays in bounds when the entry
   count is a multiple of 16 (presumably guaranteed by t.p -- confirm).
   NOTE(review): no NULL check of the calloc result is visible here.  */
1585 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1586 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 5*i .. 5*i+4 of the packed bit stream: j is the
   16-bit word holding its first bit, k the bit position inside that word.  */
1588 unsigned int j = (i * 5) / 16;
1589 unsigned int k = (i * 5) % 16;
1590 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* Merge through a 32-bit window over words j and j+1, so an entry that
   straddles a word boundary is split across both words.  */
1591 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1592 level3_packed[j] = value & 0xffff;
1593 level3_packed[j+1] = value >> 16;
/* Packed level 3 initializer, eight hexadecimal words per output line.  */
1595 fprintf (stream, " {");
1596 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1597 fprintf (stream, "\n ");
1598 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1600 if (i > 0 && (i % 8) == 0)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " 0x%04x", level3_packed[i]);
1603 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1604 fprintf (stream, ",");
1606 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1607 fprintf (stream, "\n ");
1608 fprintf (stream, " }\n");
1609 free (level3_packed);
1610 fprintf (stream, "};\n");
/* A failed write or a failed close are both reported as a write error.  */
1612 if (ferror (stream) || fclose (stream))
1614 fprintf (stderr, "error writing to '%s'\n", filename);
1619 /* ========================================================================= */
1621 /* Decimal digit value. */
1622 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH, parsed with atoi
   from the decdigit field, when CH is assigned (non-NULL name) and that
   field is not empty.
   NOTE(review): the fallback return is on a line missing from this
   extract; the callers accept values from -1 to 9, so it is presumably
   -1 -- confirm.  */
1625 get_decdigit_value (unsigned int ch)
1627 if (unicode_attributes[ch].name != NULL
1628 && unicode_attributes[ch].decdigit[0] != '\0')
1629 return atoi (unicode_attributes[ch].decdigit);
1633 /* Construction of sparse 3-level tables.
   These macros parameterize a generic table template.  The code that
   consumes them, and the DEFAULT definition that the sibling groups in
   this file have, are on lines missing from this extract.  */
/* Name of the generated table type.  Both output_decimal_digit and
   output_digit declare a struct decdigit_table and call
   decdigit_table_init, decdigit_table_add and decdigit_table_finalize.  */
1634 #define TABLE decdigit_table
/* One byte per stored element.  */
1635 #define ELEMENT uint8_t
/* Map the xmalloc and xrealloc names onto the plain C allocators.  */
1637 #define xmalloc malloc
1638 #define xrealloc realloc
1641 /* Output the unit test for the per-character decimal digit value table.
   Writes to FILENAME a banner comment followed by a list of
   { code point, value } entries.  VERSION is presumably the Unicode
   version substituted into the banner (its argument line is missing from
   this extract).  */
1643 output_decimal_digit_test (const char *filename, const char *version)
1649 stream = fopen (filename, "w");
1652 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1656 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1657 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1658 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1662 for (ch = 0; ch < 0x110000; ch++)
1664 int value = get_decdigit_value (ch);
/* Sanity check: only -1 and the digits 0 to 9 are acceptable.  The
   statement executed on failure is on a missing line.  */
1666 if (!(value >= -1 && value < 10))
/* Entry separator, then the entry itself.  The tests deciding which
   characters are listed and when the separator is needed are on lines
   missing from this extract.  */
1672 fprintf (stream, ",\n");
1673 fprintf (stream, " { 0x%04X, %d }", ch, value);
1678 fprintf (stream, "\n");
/* A failed write or a failed close are both reported as a write error.  */
1680 if (ferror (stream) || fclose (stream))
1682 fprintf (stderr, "error writing to '%s'\n", filename);
1687 /* Output the per-character decimal digit value table.
   Writes a C source fragment to FILENAME: a banner comment, five
   decdigit_header_N macros, and the initialized three-level table
   u_decdigit whose third level stores two 4-bit entries per byte.
   VERSION is presumably the Unicode version substituted into the banner
   (its argument line is missing from this extract).  */
1689 output_decimal_digit (const char *filename, const char *version)
1693 struct decdigit_table t;
1694 unsigned int level1_offset, level2_offset, level3_offset;
1696 stream = fopen (filename, "w");
1699 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1703 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1704 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1705 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1710 decdigit_table_init (&t);
1712 for (ch = 0; ch < 0x110000; ch++)
/* Store the value biased by one.  The check below accepts 0 to 10, which
   fits the 4 bits used per entry in the packed output.  */
1714 int value = 1 + get_decdigit_value (ch);
1716 if (!(value >= 0 && value <= 10))
1719 decdigit_table_add (&t, ch, value);
1722 decdigit_table_finalize (&t);
1724 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: a header of
   five uint32_t words comes first, then the level1 entries, then the
   level2 entries, then the level3 data.  The assignment targets are on
   lines missing from this extract.  */
1726 5 * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t);
1731 5 * sizeof (uint32_t)
1732 + t.level1_size * sizeof (uint32_t)
1733 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
1735 for (i = 0; i < 5; i++)
1736 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1737 ((uint32_t *) t.result)[i]);
1738 fprintf (stream, "static const\n");
1739 fprintf (stream, "struct\n");
1740 fprintf (stream, " {\n");
1741 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1742 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The shift count passed for the second conversion is on a missing line;
   presumably t.p - 1, matching the packed element count used below.  */
1743 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1745 fprintf (stream, " }\n");
1746 fprintf (stream, "u_decdigit =\n");
1747 fprintf (stream, "{\n");
/* Level 1 initializer, eight values per output line.  */
1748 fprintf (stream, " {");
1749 if (t.level1_size > 8)
1750 fprintf (stream, "\n ");
1751 for (i = 0; i < t.level1_size; i++)
1754 if (i > 0 && (i % 8) == 0)
1755 fprintf (stream, "\n ");
1756 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Two output forms follow: a literal -1, or the entry converted from a
   byte offset in t.result to an index into level2.  The condition choosing
   between them is on a missing line (presumably a test for offset 0).  */
1758 fprintf (stream, " %5d", -1);
1760 fprintf (stream, " %5zu",
1761 (offset - level2_offset) / sizeof (uint32_t));
1762 if (i+1 < t.level1_size)
1763 fprintf (stream, ",");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
1767 fprintf (stream, " },\n");
/* Level 2 initializer, same layout; entries become indices into the
   unpacked level3 bytes.  */
1768 fprintf (stream, " {");
1769 if (t.level2_size << t.q > 8)
1770 fprintf (stream, "\n ");
1771 for (i = 0; i < t.level2_size << t.q; i++)
1774 if (i > 0 && (i % 8) == 0)
1775 fprintf (stream, "\n ");
1776 offset = ((uint32_t *) (t.result + level2_offset))[i];
1778 fprintf (stream, " %5d", -1);
1780 fprintf (stream, " %5zu",
1781 (offset - level3_offset) / sizeof (uint8_t));
1782 if (i+1 < t.level2_size << t.q)
1783 fprintf (stream, ",");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
1787 fprintf (stream, " },\n");
1788 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines two consecutive entries: entry 2*i in the low
   nibble and entry 2*i+1 in the high nibble, so half as many bytes are
   written as there are entries.  */
1789 fprintf (stream, " {");
1790 if (t.level3_size << (t.p - 1) > 8)
1791 fprintf (stream, "\n ");
1792 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1794 if (i > 0 && (i % 8) == 0)
1795 fprintf (stream, "\n ");
1796 fprintf (stream, " 0x%02x",
1797 ((uint8_t *) (t.result + level3_offset))[2*i]
1798 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1799 if (i+1 < t.level3_size << (t.p - 1))
1800 fprintf (stream, ",");
1802 if (t.level3_size << (t.p - 1) > 8)
1803 fprintf (stream, "\n ");
1804 fprintf (stream, " }\n");
1805 fprintf (stream, "};\n");
/* A failed write or a failed close are both reported as a write error.  */
1807 if (ferror (stream) || fclose (stream))
1809 fprintf (stderr, "error writing to '%s'\n", filename);
1814 /* ========================================================================= */
1817 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH, parsed with atoi from the
   digit field, when CH is assigned (non-NULL name) and that field is not
   empty.
   NOTE(review): the fallback return is on a line missing from this
   extract; the callers accept values from -1 to 9, so it is presumably
   -1 -- confirm.  */
1820 get_digit_value (unsigned int ch)
1822 if (unicode_attributes[ch].name != NULL
1823 && unicode_attributes[ch].digit[0] != '\0')
1824 return atoi (unicode_attributes[ch].digit);
1828 /* Output the unit test for the per-character digit value table.
   Writes to FILENAME a banner comment followed by a list of
   { code point, value } entries.  VERSION is presumably the Unicode
   version substituted into the banner (its argument line is missing from
   this extract).  */
1830 output_digit_test (const char *filename, const char *version)
1836 stream = fopen (filename, "w");
1839 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1843 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1844 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1845 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1849 for (ch = 0; ch < 0x110000; ch++)
1851 int value = get_digit_value (ch);
/* Sanity check: only -1 and the digits 0 to 9 are acceptable.  The
   statement executed on failure is on a missing line.  */
1853 if (!(value >= -1 && value < 10))
/* Entry separator, then the entry itself.  The tests deciding which
   characters are listed and when the separator is needed are on lines
   missing from this extract.  */
1859 fprintf (stream, ",\n");
1860 fprintf (stream, " { 0x%04X, %d }", ch, value);
1865 fprintf (stream, "\n");
/* A failed write or a failed close are both reported as a write error.  */
1867 if (ferror (stream) || fclose (stream))
1869 fprintf (stderr, "error writing to '%s'\n", filename);
1874 /* Output the per-character digit value table.
   Writes a C source fragment to FILENAME: a banner comment, five
   digit_header_N macros, and the initialized three-level table u_digit
   whose third level stores two 4-bit entries per byte.  The table is built
   with the decdigit_table type, which this function shares with
   output_decimal_digit.  VERSION is presumably the Unicode version
   substituted into the banner (its argument line is missing from this
   extract).  */
1876 output_digit (const char *filename, const char *version)
1880 struct decdigit_table t;
1881 unsigned int level1_offset, level2_offset, level3_offset;
1883 stream = fopen (filename, "w");
1886 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1890 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1891 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1892 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1897 decdigit_table_init (&t);
1899 for (ch = 0; ch < 0x110000; ch++)
/* Store the value biased by one.  The check below accepts 0 to 10, which
   fits the 4 bits used per entry in the packed output.  */
1901 int value = 1 + get_digit_value (ch);
1903 if (!(value >= 0 && value <= 10))
1906 decdigit_table_add (&t, ch, value);
1909 decdigit_table_finalize (&t);
1911 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: a header of
   five uint32_t words comes first, then the level1 entries, then the
   level2 entries, then the level3 data.  The assignment targets are on
   lines missing from this extract.  */
1913 5 * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t);
1918 5 * sizeof (uint32_t)
1919 + t.level1_size * sizeof (uint32_t)
1920 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
1922 for (i = 0; i < 5; i++)
1923 fprintf (stream, "#define digit_header_%d %d\n", i,
1924 ((uint32_t *) t.result)[i]);
1925 fprintf (stream, "static const\n");
1926 fprintf (stream, "struct\n");
1927 fprintf (stream, " {\n");
1928 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1929 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The shift count passed for the second conversion is on a missing line;
   presumably t.p - 1, matching the packed element count used below.  */
1930 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1932 fprintf (stream, " }\n");
1933 fprintf (stream, "u_digit =\n");
1934 fprintf (stream, "{\n");
/* Level 1 initializer, eight values per output line.  */
1935 fprintf (stream, " {");
1936 if (t.level1_size > 8)
1937 fprintf (stream, "\n ");
1938 for (i = 0; i < t.level1_size; i++)
1941 if (i > 0 && (i % 8) == 0)
1942 fprintf (stream, "\n ");
1943 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Two output forms follow: a literal -1, or the entry converted from a
   byte offset in t.result to an index into level2.  The condition choosing
   between them is on a missing line (presumably a test for offset 0).  */
1945 fprintf (stream, " %5d", -1);
1947 fprintf (stream, " %5zu",
1948 (offset - level2_offset) / sizeof (uint32_t));
1949 if (i+1 < t.level1_size)
1950 fprintf (stream, ",");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
1954 fprintf (stream, " },\n");
/* Level 2 initializer, same layout; entries become indices into the
   unpacked level3 bytes.  */
1955 fprintf (stream, " {");
1956 if (t.level2_size << t.q > 8)
1957 fprintf (stream, "\n ");
1958 for (i = 0; i < t.level2_size << t.q; i++)
1961 if (i > 0 && (i % 8) == 0)
1962 fprintf (stream, "\n ");
1963 offset = ((uint32_t *) (t.result + level2_offset))[i];
1965 fprintf (stream, " %5d", -1);
1967 fprintf (stream, " %5zu",
1968 (offset - level3_offset) / sizeof (uint8_t));
1969 if (i+1 < t.level2_size << t.q)
1970 fprintf (stream, ",");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
1974 fprintf (stream, " },\n");
1975 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines two consecutive entries: entry 2*i in the low
   nibble and entry 2*i+1 in the high nibble, so half as many bytes are
   written as there are entries.  */
1976 fprintf (stream, " {");
1977 if (t.level3_size << (t.p - 1) > 8)
1978 fprintf (stream, "\n ");
1979 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1981 if (i > 0 && (i % 8) == 0)
1982 fprintf (stream, "\n ");
1983 fprintf (stream, " 0x%02x",
1984 ((uint8_t *) (t.result + level3_offset))[2*i]
1985 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1986 if (i+1 < t.level3_size << (t.p - 1))
1987 fprintf (stream, ",");
1989 if (t.level3_size << (t.p - 1) > 8)
1990 fprintf (stream, "\n ");
1991 fprintf (stream, " }\n");
1992 fprintf (stream, "};\n");
/* A failed write or a failed close are both reported as a write error.  */
1994 if (ferror (stream) || fclose (stream))
1996 fprintf (stderr, "error writing to '%s'\n", filename);
2001 /* ========================================================================= */
2003 /* Numeric value. */
2004 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.  get_numeric_value
   stores 0 in both fields for a character without a numeric value.  */
2006 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of the code point CH as a fraction.
   When CH is assigned (non-NULL name) and its numeric field is not empty,
   the field is parsed; the other assignments below store 0 in both fields.
   The else lines and the return statement are missing from this extract.  */
2008 static uc_fraction_t
2009 get_numeric_value (unsigned int ch)
2011 uc_fraction_t value;
2013 if (unicode_attributes[ch].name != NULL
2014 && unicode_attributes[ch].numeric[0] != '\0')
2016 const char *str = unicode_attributes[ch].numeric;
2017 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first character that is not part of the integer, so it
   reads just the numerator when a slash follows.  */
2018 value.numerator = atoi (str);
/* With a slash, the denominator is the integer right after it.  */
2019 if (strchr (str, '/') != NULL)
2020 value.denominator = atoi (strchr (str, '/') + 1);
/* Denominator used for the plain integer form (the else keyword is on a
   missing line).  */
2022 value.denominator = 1;
/* The 0/0 placeholder meaning no numeric value (the enclosing else branch
   is on missing lines).  */
2026 value.numerator = 0;
2027 value.denominator = 0;
2032 /* Output the unit test for the per-character numeric value table.
   Writes to FILENAME a banner comment followed by a list of
   { code point, numerator, denominator } entries.  VERSION is presumably
   the Unicode version substituted into the banner (its argument line is
   missing from this extract).  */
2034 output_numeric_test (const char *filename, const char *version)
2040 stream = fopen (filename, "w");
2043 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2047 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2048 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2049 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2053 for (ch = 0; ch < 0x110000; ch++)
2055 uc_fraction_t value = get_numeric_value (ch);
/* Skip the 0/0 placeholder.  A genuine zero, stored as 0/1, still passes
   because its denominator is not 0.  */
2057 if (value.numerator != 0 || value.denominator != 0)
/* Entry separator, then the entry itself.  The test deciding when the
   separator is needed is on a line missing from this extract.  */
2060 fprintf (stream, ",\n");
2061 fprintf (stream, " { 0x%04X, %d, %d }",
2062 ch, value.numerator, value.denominator);
2067 fprintf (stream, "\n");
/* A failed write or a failed close are both reported as a write error.  */
2069 if (ferror (stream) || fclose (stream))
2071 fprintf (stderr, "error writing to '%s'\n", filename);
2076 /* Construction of sparse 3-level tables.
   These macros parameterize a generic table template.  The code that
   consumes them, and the DEFAULT definition that the sibling groups in
   this file have, are on lines missing from this extract.  */
/* Name of the generated table type: output_numeric uses
   struct numeric_table together with numeric_table_init,
   numeric_table_add and numeric_table_finalize.  */
2077 #define TABLE numeric_table
/* One byte per stored element; output_numeric stores an index into its
   table of fractions, which holds at most 128 entries.  */
2078 #define ELEMENT uint8_t
/* Map the xmalloc and xrealloc names onto the plain C allocators.  */
2080 #define xmalloc malloc
2081 #define xrealloc realloc
2084 /* Output the per-character numeric value table.
   Writes a C source fragment to FILENAME in two parts: the array
   u_numeric_values listing every distinct fraction that occurs, and the
   three-level table u_numeric mapping each character to an index into that
   array, with the third level packed at 7 bits per character.  VERSION is
   presumably the Unicode version substituted into the banner (its argument
   line is missing from this extract).  */
2086 output_numeric (const char *filename, const char *version)
/* At most 128 distinct fractions, so every index fits in 7 bits.  */
2089 uc_fraction_t fractions[128];
2090 unsigned int nfractions;
2091 unsigned int ch, i, j;
2092 struct numeric_table t;
2093 unsigned int level1_offset, level2_offset, level3_offset;
2094 uint16_t *level3_packed;
2096 stream = fopen (filename, "w");
2099 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2103 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2104 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2105 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2108 /* Create table of occurring fractions. */
2110 for (ch = 0; ch < 0x110000; ch++)
2112 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for an existing equal fraction.  The statement that leaves
   the loop on a match is on a missing line.  */
2114 for (i = 0; i < nfractions; i++)
2115 if (value.numerator == fractions[i].numerator
2116 && value.denominator == fractions[i].denominator)
/* Not found: insert the new fraction.  */
2118 if (i == nfractions)
/* Capacity check; the statement executed when the table is full is on a
   missing line.  */
2120 if (nfractions == 128)
/* Find the insertion point that keeps the array sorted by denominator
   first and numerator second.  The 0/0 placeholder has denominator 0 and
   therefore sorts first as long as no denominator is negative.  */
2122 for (i = 0; i < nfractions; i++)
2123 if (value.denominator < fractions[i].denominator
2124 || (value.denominator == fractions[i].denominator
2125 && value.numerator < fractions[i].numerator))
/* Shift the tail up by one slot and store the new fraction at index i.  */
2127 for (j = nfractions; j > i; j--)
2128 fractions[j] = fractions[j - 1];
2129 fractions[i] = value;
/* Emit the array of distinct fractions.  */
2134 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2136 fprintf (stream, "{\n");
2137 for (i = 0; i < nfractions; i++)
2139 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2140 fractions[i].denominator);
2141 if (i+1 < nfractions)
2142 fprintf (stream, ",");
2143 fprintf (stream, "\n");
2145 fprintf (stream, "};\n");
2149 numeric_table_init (&t);
/* Second pass: map every character to the index of its fraction.  */
2151 for (ch = 0; ch < 0x110000; ch++)
2153 uc_fraction_t value = get_numeric_value (ch);
2155 for (i = 0; i < nfractions; i++)
2156 if (value.numerator == fractions[i].numerator
2157 && value.denominator == fractions[i].denominator)
/* Every value was entered in the first pass, so a miss here would be an
   internal error; the statement handling it is on a missing line.  */
2159 if (i == nfractions)
2162 numeric_table_add (&t, ch, i);
2165 numeric_table_finalize (&t);
2167 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: a header of
   five uint32_t words comes first, then the level1 entries, then the
   level2 entries, then the level3 data.  The assignment targets are on
   lines missing from this extract.  */
2169 5 * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t);
2174 5 * sizeof (uint32_t)
2175 + t.level1_size * sizeof (uint32_t)
2176 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
2178 for (i = 0; i < 5; i++)
2179 fprintf (stream, "#define numeric_header_%d %d\n", i,
2180 ((uint32_t *) t.result)[i]);
2181 fprintf (stream, "static const\n");
2182 fprintf (stream, "struct\n");
2183 fprintf (stream, " {\n");
2184 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2185 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* Declared length of the packed array.  It equals the number of
   initializers written further down as long as (1 << t.p) * 7 is a
   multiple of 16.  */
2186 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2187 (1 << t.p) * 7 / 16);
2188 fprintf (stream, " }\n");
2189 fprintf (stream, "u_numeric =\n");
2190 fprintf (stream, "{\n");
/* Level 1 initializer, eight values per output line.  */
2191 fprintf (stream, " {");
2192 if (t.level1_size > 8)
2193 fprintf (stream, "\n ");
2194 for (i = 0; i < t.level1_size; i++)
2197 if (i > 0 && (i % 8) == 0)
2198 fprintf (stream, "\n ");
2199 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Two output forms follow: a literal -1, or the entry converted from a
   byte offset in t.result to an index into level2.  The condition choosing
   between them is on a missing line (presumably a test for offset 0).  */
2201 fprintf (stream, " %5d", -1);
2203 fprintf (stream, " %5zu",
2204 (offset - level2_offset) / sizeof (uint32_t));
2205 if (i+1 < t.level1_size)
2206 fprintf (stream, ",");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
2210 fprintf (stream, " },\n");
/* Level 2 initializer, same layout; entries become indices into the
   unpacked level3 bytes.  */
2211 fprintf (stream, " {");
2212 if (t.level2_size << t.q > 8)
2213 fprintf (stream, "\n ");
2214 for (i = 0; i < t.level2_size << t.q; i++)
2217 if (i > 0 && (i % 8) == 0)
2218 fprintf (stream, "\n ");
2219 offset = ((uint32_t *) (t.result + level2_offset))[i];
2221 fprintf (stream, " %5d", -1);
2223 fprintf (stream, " %5zu",
2224 (offset - level3_offset) / sizeof (uint8_t));
2225 if (i+1 < t.level2_size << t.q)
2226 fprintf (stream, ",");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
2230 fprintf (stream, " },\n");
2231 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2232 not 32-bit units, in order to make the lookup function easier. */
/* The extra word in the allocation is what the access to
   level3_packed[j+1] below relies on; it stays in bounds when the entry
   count is a multiple of 16 (presumably guaranteed by t.p -- confirm).
   NOTE(review): no NULL check of the calloc result is visible here.  */
2235 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2236 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 7*i .. 7*i+6 of the packed bit stream: j is the
   16-bit word holding its first bit, k the bit position inside that word.
   NOTE(review): this j shadows the j declared at function scope.  */
2238 unsigned int j = (i * 7) / 16;
2239 unsigned int k = (i * 7) % 16;
2240 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* Merge through a 32-bit window over words j and j+1, so an entry that
   straddles a word boundary is split across both words.  */
2241 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2242 level3_packed[j] = value & 0xffff;
2243 level3_packed[j+1] = value >> 16;
/* Packed level 3 initializer, eight hexadecimal words per output line.  */
2245 fprintf (stream, " {");
2246 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2247 fprintf (stream, "\n ");
2248 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2250 if (i > 0 && (i % 8) == 0)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " 0x%04x", level3_packed[i]);
2253 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2254 fprintf (stream, ",");
2256 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2257 fprintf (stream, "\n ");
2258 fprintf (stream, " }\n");
2259 free (level3_packed);
2260 fprintf (stream, "};\n");
/* A failed write or a failed close are both reported as a write error.  */
2262 if (ferror (stream) || fclose (stream))
2264 fprintf (stderr, "error writing to '%s'\n", filename);
2269 /* ========================================================================= */
2272 /* See Unicode 3.0 book, section 4.7,
2275 /* List of mirrored character pairs. This is a subset of the characters
2276 having the BidiMirrored property. */
2277 static unsigned int mirror_pairs[][2] =
/* Computes the mirroring information of the code point CH.
   The visible return statement yields the signed distance from CH to its
   partner in mirror_pairs.
   NOTE(review): the conditions guarding that return, and what is returned
   for characters without a partner, are on lines missing from this
   extract -- confirm against the complete file.  */
2334 get_mirror_value (unsigned int ch)
2337 unsigned int mirror_char;
/* A character counts as mirrored when it is assigned (non-NULL name) and
   its mirrored attribute is set.  */
2340 mirrored = (unicode_attributes[ch].name != NULL
2341 && unicode_attributes[ch].mirrored);
/* 0xfffd serves as the marker for no partner found.  */
2342 mirror_char = 0xfffd;
/* The pair table is searched in both directions: CH may be either member
   of a pair, and its partner is the other one.  */
2343 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2344 if (ch == mirror_pairs[i][0])
2346 mirror_char = mirror_pairs[i][1];
2349 else if (ch == mirror_pairs[i][1])
2351 mirror_char = mirror_pairs[i][0];
/* Signed offset from CH to its partner.  */
2355 return (int) mirror_char - (int) ch;
/* Tests whether a partner was found; the branch this belongs to and the
   statement it guards are on missing lines.  */
2358 if (mirror_char != 0xfffd)
2364 /* Construction of sparse 3-level tables.
   These macros parameterize a generic table template.  The code that
   consumes them, and the DEFAULT definition that the sibling groups in
   this file have, are on lines missing from this extract.  */
/* Name of the generated table type: output_mirror uses
   struct mirror_table together with mirror_table_init, mirror_table_add
   and mirror_table_finalize.  */
2365 #define TABLE mirror_table
/* Elements are signed 32-bit values; get_mirror_value returns a signed
   difference between two code points.  */
2366 #define ELEMENT int32_t
/* Map the xmalloc and xrealloc names onto the plain C allocators.  */
2368 #define xmalloc malloc
2369 #define xrealloc realloc
2372 /* Output the per-character mirror table.
   Writes a C source fragment to FILENAME: a banner comment, five
   mirror_header_N macros, and the initialized three-level table u_mirror.
   Unlike the other tables in this file, the third level is written
   unpacked, one int per character slot.  VERSION is presumably the Unicode
   version substituted into the banner (its argument line is missing from
   this extract).  */
2374 output_mirror (const char *filename, const char *version)
2378 struct mirror_table t;
2379 unsigned int level1_offset, level2_offset, level3_offset;
2381 stream = fopen (filename, "w");
2384 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2388 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2389 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2390 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2395 mirror_table_init (&t);
/* Record the mirror value of every code point from 0 up to 0x10FFFF.  */
2397 for (ch = 0; ch < 0x110000; ch++)
2399 int value = get_mirror_value (ch);
2401 mirror_table_add (&t, ch, value);
2404 mirror_table_finalize (&t);
2406 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: a header of
   five uint32_t words comes first, then the level1 entries, then the
   level2 entries, then the level3 data.  The assignment targets are on
   lines missing from this extract.  */
2408 5 * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t);
2413 5 * sizeof (uint32_t)
2414 + t.level1_size * sizeof (uint32_t)
2415 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as macros.  */
2417 for (i = 0; i < 5; i++)
2418 fprintf (stream, "#define mirror_header_%d %d\n", i,
2419 ((uint32_t *) t.result)[i]);
2420 fprintf (stream, "static const\n");
2421 fprintf (stream, "struct\n");
2422 fprintf (stream, " {\n");
2423 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2424 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2425 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2426 fprintf (stream, " }\n");
2427 fprintf (stream, "u_mirror =\n");
2428 fprintf (stream, "{\n");
/* Level 1 initializer, eight values per output line.  */
2429 fprintf (stream, " {");
2430 if (t.level1_size > 8)
2431 fprintf (stream, "\n ");
2432 for (i = 0; i < t.level1_size; i++)
2435 if (i > 0 && (i % 8) == 0)
2436 fprintf (stream, "\n ");
2437 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Two output forms follow: a literal -1, or the entry converted from a
   byte offset in t.result to an index into level2.  The condition choosing
   between them is on a missing line (presumably a test for offset 0).  */
2439 fprintf (stream, " %5d", -1);
2441 fprintf (stream, " %5zu",
2442 (offset - level2_offset) / sizeof (uint32_t));
2443 if (i+1 < t.level1_size)
2444 fprintf (stream, ",");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
2448 fprintf (stream, " },\n");
/* Level 2 initializer, same layout.  Entries become indices into level3,
   so the divisor is the size of one int32_t element.  */
2449 fprintf (stream, " {");
2450 if (t.level2_size << t.q > 8)
2451 fprintf (stream, "\n ");
2452 for (i = 0; i < t.level2_size << t.q; i++)
2455 if (i > 0 && (i % 8) == 0)
2456 fprintf (stream, "\n ");
2457 offset = ((uint32_t *) (t.result + level2_offset))[i];
2459 fprintf (stream, " %5d", -1);
2461 fprintf (stream, " %5zu",
2462 (offset - level3_offset) / sizeof (int32_t));
2463 if (i+1 < t.level2_size << t.q)
2464 fprintf (stream, ",");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " },\n");
/* Level 3 initializer: the signed mirror values, written as they are.  */
2469 fprintf (stream, " {");
2470 if (t.level3_size << t.p > 8)
2471 fprintf (stream, "\n ");
2472 for (i = 0; i < t.level3_size << t.p; i++)
2474 if (i > 0 && (i % 8) == 0)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2477 if (i+1 < t.level3_size << t.p)
2478 fprintf (stream, ",");
2480 if (t.level3_size << t.p > 8)
2481 fprintf (stream, "\n ");
2482 fprintf (stream, " }\n");
2483 fprintf (stream, "};\n");
/* A failed write or a failed close are both reported as a write error.  */
2485 if (ferror (stream) || fclose (stream))
2487 fprintf (stderr, "error writing to '%s'\n", filename);
2492 /* ========================================================================= */
2496 /* Reading PropList.txt and DerivedCoreProperties.txt. */
2505 PROP_QUOTATION_MARK,
2506 PROP_TERMINAL_PUNCTUATION,
2509 PROP_ASCII_HEX_DIGIT,
2510 PROP_OTHER_ALPHABETIC,
2514 PROP_OTHER_LOWERCASE,
2515 PROP_OTHER_UPPERCASE,
2516 PROP_NONCHARACTER_CODE_POINT,
2517 PROP_OTHER_GRAPHEME_EXTEND,
2518 PROP_IDS_BINARY_OPERATOR,
2519 PROP_IDS_TRINARY_OPERATOR,
2521 PROP_UNIFIED_IDEOGRAPH,
2522 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2525 PROP_LOGICAL_ORDER_EXCEPTION,
2526 PROP_OTHER_ID_START,
2527 PROP_OTHER_ID_CONTINUE,
2529 PROP_VARIATION_SELECTOR,
2530 PROP_PATTERN_WHITE_SPACE,
2531 PROP_PATTERN_SYNTAX,
2532 /* DerivedCoreProperties.txt */
2541 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2542 PROP_GRAPHEME_EXTEND,
/* Property bit set of every code point.  fill_properties sets bit number
   PROP_xxx (by OR-ing in 1ULL shifted by the property value) for each
   character that has that property; clear_properties resets all entries
   to 0.  One 64-bit word per character limits the property values to the
   range 0 to 63.  */
2546 unsigned long long unicode_properties[0x110000];
/* Resets the property bit set of every code point from 0 up to 0x10FFFF
   to empty.  */
2549 clear_properties (void)
2553 for (i = 0; i < 0x110000; i++)
2554 unicode_properties[i] = 0;
2557 /* Stores in unicode_properties[] the properties from the
2558 PropList.txt or DerivedCoreProperties.txt file.
   PROPLIST_FILENAME is read line by line.  Each data line names a code
   point or a code point range and a property; the matching property bit is
   OR-ed into unicode_properties for every code point of the range.  Bits
   that are already set are kept, so the function can be called once per
   input file.  */
2560 fill_properties (const char *proplist_filename)
2565 stream = fopen (proplist_filename, "r");
2568 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2575 unsigned int i1, i2;
2576 char padding[200+1];
2577 char propname[200+1];
2578 unsigned int propvalue;
/* Read one line of at most 200 characters.  The declaration of buf is on a
   missing line; presumably it has room for 200 characters plus the
   terminator.  A result below 1 means nothing more could be read.  */
2580 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and comment lines carry no data.  */
2583 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  The two
   bracket conversions have no field width; they cannot overflow padding or
   propname only because the line itself is limited to 200 characters.
   In the single form i2 is presumably set equal to i1 on a missing line.  */
2586 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2588 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2590 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP use expands to one link of an if / else chain that compares
   propname against a known property name and selects its value.  */
2595 #define PROP(name,value) \
2596 if (strcmp (propname, name) == 0) propvalue = value; else
2598 PROP ("White_Space", PROP_WHITE_SPACE)
2599 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2600 PROP ("Join_Control", PROP_JOIN_CONTROL)
2601 PROP ("Dash", PROP_DASH)
2602 PROP ("Hyphen", PROP_HYPHEN)
2603 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2604 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2605 PROP ("Other_Math", PROP_OTHER_MATH)
2606 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2607 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2608 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2609 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2610 PROP ("Diacritic", PROP_DIACRITIC)
2611 PROP ("Extender", PROP_EXTENDER)
2612 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2613 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2614 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2615 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2616 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2617 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2618 PROP ("Radical", PROP_RADICAL)
2619 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2620 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2621 PROP ("Deprecated", PROP_DEPRECATED)
2622 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2623 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2624 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2625 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2626 PROP ("STerm", PROP_STERM)
2627 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2628 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2629 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2630 /* DerivedCoreProperties.txt */
2631 PROP ("Math", PROP_MATH)
2632 PROP ("Alphabetic", PROP_ALPHABETIC)
2633 PROP ("Lowercase", PROP_LOWERCASE)
2634 PROP ("Uppercase", PROP_UPPERCASE)
2635 PROP ("ID_Start", PROP_ID_START)
2636 PROP ("ID_Continue", PROP_ID_CONTINUE)
2637 PROP ("XID_Start", PROP_XID_START)
2638 PROP ("XID_Continue", PROP_XID_CONTINUE)
2639 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2640 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2641 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2642 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
/* End of the chain: reached when no known property name matched.  */
2645 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must end inside the Unicode code space.
   The statement executed on failure is on a missing line.  */
2649 if (!(i1 <= i2 && i2 < 0x110000))
/* Set the property bit on every code point of the range.  */
2652 for (i = i1; i <= i2; i++)
2653 unicode_properties[i] |= 1ULL << propvalue;
/* A read error or a failed close are both reported as a read error.  */
2656 if (ferror (stream) || fclose (stream))
2658 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* fill_property30: reads one named property dump from a Unicode 3.0 style
   PropList file into ARRAY, which has one entry per code point.
   Visible steps: a loop over all 0x110000 entries (body not visible here),
   fopen of PROPLIST_FILENAME for reading, a scan for the first line that
   contains PROPERTY_NAME, then parsing of either "XXXX..YYYY" ranges or
   single "XXXX" code points (four hex digits only), a range sanity check,
   and a loop over i1..i2.  Each failure path prints a diagnostic to stderr.
   NOTE(review): this listing omits short lines (declarations, braces, loop
   bodies, the statements after each diagnostic), so the value stored into
   ARRAY and the exact loop termination are not visible - confirm against
   the full file.  */
2663 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2666 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2672 for (i = 0; i < 0x110000; i++)
2675 stream = fopen (proplist_filename, "r");
2678 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2682 /* Search for the "Property dump for: ..." line. */
2685 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2687 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2691 while (strstr (buf, property_name) == NULL);
2695 unsigned int i1, i2;
2697 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2701 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2703 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2705 fprintf (stderr, "parse error in property in '%s'\n",
2710 else if (strlen (buf) >= 4)
2712 if (sscanf (buf, "%4X", &i1) < 1)
2714 fprintf (stderr, "parse error in property in '%s'\n",
2722 fprintf (stderr, "parse error in property in '%s'\n",
2726 if (!(i1 <= i2 && i2 < 0x110000))
2728 for (i = i1; i <= i2; i++)
2731 if (ferror (stream) || fclose (stream))
2733 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* Two per-code-point tables (0x110000 entries each).  They are passed to
   fill_property30 by fill_properties30 and read back by
   is_property_paired_punctuation / is_property_left_of_pair.  */
2738 /* Properties from Unicode 3.0 PropList.txt file. */
2740 /* The paired punctuation property from the PropList.txt file. */
2741 char unicode_pairedpunctuation[0x110000];
2743 /* The left of pair property from the PropList.txt file. */
2744 char unicode_leftofpair[0x110000];
2747 fill_properties30 (const char *proplist30_filename)
2749 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2750 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2753 /* ------------------------------------------------------------------------- */
2755 /* See PropList.txt, UCD.html. */
2757 is_property_white_space (unsigned int ch)
2759 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
/* is_property_alphabetic: computes the Alphabetic property two ways and
   compares them.  The first value is an OR of terms including the
   PROP_OTHER_ALPHABETIC bit and a hard-coded list of numeral-like ranges;
   the second is the PROP_ALPHABETIC bit of unicode_properties[CH].
   NOTE(review): short lines are missing from this listing (the two
   declarations, the first term of the OR, and whatever follows the
   `result1 != result2' test), so the leading term and the mismatch
   handling are not visible here - confirm against the full file.  */
2762 /* See Unicode 3.0 book, section 4.10,
2763 PropList.txt, UCD.html,
2764 DerivedCoreProperties.txt, UCD.html. */
2766 is_property_alphabetic (unsigned int ch)
2770 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2771 /* For some reason, the following are listed as having property
2772 Alphabetic but not as having property Other_Alphabetic. */
2773 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2774 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2775 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2776 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2777 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2778 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2779 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2780 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2781 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2782 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2783 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2784 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2786 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2788 if (result1 != result2)
2793 /* See PropList.txt, UCD.html. */
2795 is_property_other_alphabetic (unsigned int ch)
2797 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2800 /* See PropList.txt, UCD.html. */
2802 is_property_not_a_character (unsigned int ch)
2804 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
/* is_property_default_ignorable_code_point: derives the property from
   category Cf (minus the annotation characters U+FFF9..U+FFFB and the
   listed Arabic/Syriac format characters), the
   PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit and the
   PROP_VARIATION_SELECTOR bit, and separately reads the
   PROP_DEFAULT_IGNORABLE_CODE_POINT bit; the two values are compared.
   NOTE(review): the declarations and the statements following the
   comparison are not visible in this listing.  */
2807 /* See PropList.txt, UCD.html,
2808 DerivedCoreProperties.txt, UCD.html. */
2810 is_property_default_ignorable_code_point (unsigned int ch)
2813 (is_category_Cf (ch)
2814 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2815 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2816 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2817 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2819 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2821 if (result1 != result2)
2826 /* See PropList.txt, UCD.html. */
2828 is_property_other_default_ignorable_code_point (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2833 /* See PropList.txt, UCD.html. */
2835 is_property_deprecated (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2840 /* See PropList.txt, UCD.html. */
2842 is_property_logical_order_exception (unsigned int ch)
2844 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2847 /* See PropList.txt, UCD.html. */
2849 is_property_variation_selector (unsigned int ch)
2851 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* See PropList-3.0.1.txt.
   True iff CH lies in one of three hard-coded Private Use ranges:
   U+E000..U+F8FF, U+F0000..U+FFFFD or U+100000..U+10FFFD.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* See PropList-3.0.1.txt.
   True iff CH is in category Cn and is not flagged as a noncharacter.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
/* is_property_uppercase: compares a derived value (an OR whose visible
   term is the PROP_OTHER_UPPERCASE bit) with the PROP_UPPERCASE bit of
   unicode_properties[CH].
   NOTE(review): the declarations, the first term of the OR and the
   statements after the comparison are not visible in this listing.  */
2871 /* See PropList.txt, UCD.html,
2872 DerivedCoreProperties.txt, UCD.html. */
2874 is_property_uppercase (unsigned int ch)
2878 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2880 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2882 if (result1 != result2)
2887 /* See PropList.txt, UCD.html. */
2889 is_property_other_uppercase (unsigned int ch)
2891 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* is_property_lowercase: compares a derived value (an OR whose visible
   term is the PROP_OTHER_LOWERCASE bit) with the PROP_LOWERCASE bit of
   unicode_properties[CH].
   NOTE(review): the declarations, the first term of the OR and the
   statements after the comparison are not visible in this listing.  */
2894 /* See PropList.txt, UCD.html,
2895 DerivedCoreProperties.txt, UCD.html. */
2897 is_property_lowercase (unsigned int ch)
2901 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2903 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2905 if (result1 != result2)
2910 /* See PropList.txt, UCD.html. */
2912 is_property_other_lowercase (unsigned int ch)
2914 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* See PropList-3.0.1.txt.
   Titlecase is exactly general category Lt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  return is_category_Lt (ch) ? true : false;
}
2924 /* See PropList.txt, UCD.html. */
2926 is_property_soft_dotted (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2931 /* See DerivedCoreProperties.txt, UCD.html. */
2933 is_property_id_start (unsigned int ch)
2935 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2938 /* See PropList.txt, UCD.html. */
2940 is_property_other_id_start (unsigned int ch)
2942 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2945 /* See DerivedCoreProperties.txt, UCD.html. */
2947 is_property_id_continue (unsigned int ch)
2949 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2952 /* See PropList.txt, UCD.html. */
2954 is_property_other_id_continue (unsigned int ch)
2956 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2959 /* See DerivedCoreProperties.txt, UCD.html. */
2961 is_property_xid_start (unsigned int ch)
2963 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2966 /* See DerivedCoreProperties.txt, UCD.html. */
2968 is_property_xid_continue (unsigned int ch)
2970 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2973 /* See PropList.txt, UCD.html. */
2975 is_property_pattern_white_space (unsigned int ch)
2977 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2980 /* See PropList.txt, UCD.html. */
2982 is_property_pattern_syntax (unsigned int ch)
2984 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2987 /* See PropList.txt, UCD.html. */
2989 is_property_join_control (unsigned int ch)
2991 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2994 /* See DerivedCoreProperties.txt, UCD.html. */
2996 is_property_grapheme_base (unsigned int ch)
2998 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3001 /* See DerivedCoreProperties.txt, UCD.html. */
3003 is_property_grapheme_extend (unsigned int ch)
3005 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3008 /* See PropList.txt, UCD.html. */
3010 is_property_other_grapheme_extend (unsigned int ch)
3012 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3015 /* See DerivedCoreProperties.txt, UCD.html. */
3017 is_property_grapheme_link (unsigned int ch)
3019 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3022 /* See PropList.txt, UCD.html. */
3024 is_property_bidi_control (unsigned int ch)
3026 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3029 /* See PropList-3.0.1.txt. */
3031 is_property_bidi_left_to_right (unsigned int ch)
3033 return (get_bidi_category (ch) == UC_BIDI_L);
3036 /* See PropList-3.0.1.txt. */
3038 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3040 return (get_bidi_category (ch) == UC_BIDI_R);
3043 /* See PropList-3.0.1.txt. */
3045 is_property_bidi_arabic_right_to_left (unsigned int ch)
3047 return (get_bidi_category (ch) == UC_BIDI_AL);
3050 /* See PropList-3.0.1.txt. */
3052 is_property_bidi_european_digit (unsigned int ch)
3054 return (get_bidi_category (ch) == UC_BIDI_EN);
3057 /* See PropList-3.0.1.txt. */
3059 is_property_bidi_eur_num_separator (unsigned int ch)
3061 return (get_bidi_category (ch) == UC_BIDI_ES);
3064 /* See PropList-3.0.1.txt. */
3066 is_property_bidi_eur_num_terminator (unsigned int ch)
3068 return (get_bidi_category (ch) == UC_BIDI_ET);
3071 /* See PropList-3.0.1.txt. */
3073 is_property_bidi_arabic_digit (unsigned int ch)
3075 return (get_bidi_category (ch) == UC_BIDI_AN);
3078 /* See PropList-3.0.1.txt. */
3080 is_property_bidi_common_separator (unsigned int ch)
3082 return (get_bidi_category (ch) == UC_BIDI_CS);
3085 /* See PropList-3.0.1.txt. */
3087 is_property_bidi_block_separator (unsigned int ch)
3089 return (get_bidi_category (ch) == UC_BIDI_B);
3092 /* See PropList-3.0.1.txt. */
3094 is_property_bidi_segment_separator (unsigned int ch)
3096 return (get_bidi_category (ch) == UC_BIDI_S);
3099 /* See PropList-3.0.1.txt. */
3101 is_property_bidi_whitespace (unsigned int ch)
3103 return (get_bidi_category (ch) == UC_BIDI_WS);
3106 /* See PropList-3.0.1.txt. */
3108 is_property_bidi_non_spacing_mark (unsigned int ch)
3110 return (get_bidi_category (ch) == UC_BIDI_NSM);
3113 /* See PropList-3.0.1.txt. */
3115 is_property_bidi_boundary_neutral (unsigned int ch)
3117 return (get_bidi_category (ch) == UC_BIDI_BN);
3120 /* See PropList-3.0.1.txt. */
3122 is_property_bidi_pdf (unsigned int ch)
3124 return (get_bidi_category (ch) == UC_BIDI_PDF);
3127 /* See PropList-3.0.1.txt. */
3129 is_property_bidi_embedding_or_override (unsigned int ch)
3131 int category = get_bidi_category (ch);
3132 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3133 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3136 /* See PropList-3.0.1.txt. */
3138 is_property_bidi_other_neutral (unsigned int ch)
3140 return (get_bidi_category (ch) == UC_BIDI_ON);
3143 /* See PropList.txt, UCD.html. */
3145 is_property_hex_digit (unsigned int ch)
3147 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3150 /* See PropList.txt, UCD.html. */
3152 is_property_ascii_hex_digit (unsigned int ch)
3154 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3157 /* See Unicode 3.0 book, section 4.10,
3158 PropList.txt, UCD.html. */
3160 is_property_ideographic (unsigned int ch)
3162 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3165 /* See PropList.txt, UCD.html. */
3167 is_property_unified_ideograph (unsigned int ch)
3169 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3172 /* See PropList.txt, UCD.html. */
3174 is_property_radical (unsigned int ch)
3176 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3179 /* See PropList.txt, UCD.html. */
3181 is_property_ids_binary_operator (unsigned int ch)
3183 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3186 /* See PropList.txt, UCD.html. */
3188 is_property_ids_trinary_operator (unsigned int ch)
3190 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3193 /* See PropList-3.0.1.txt. */
3195 is_property_zero_width (unsigned int ch)
3197 return is_category_Cf (ch)
3198 || (unicode_attributes[ch].name != NULL
3199 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* See PropList-3.0.1.txt.
   Space is exactly general category Zs.  */
static bool
is_property_space (unsigned int ch)
{
  return is_category_Zs (ch) ? true : false;
}
/* See PropList-3.0.1.txt.
   True iff CH is one of a fixed list of sixteen code points.  The original
   comment relates this list to a line breaking property; its name is not
   visible here (presumably GL - confirm against LineBreak.txt).  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
/* is_property_iso_control: compares two derivations of "ISO control":
   the character name being exactly "<control>", and membership in
   category Cc.
   NOTE(review): the declarations and the statements after the
   `result1 != result2' test are not visible in this listing.  */
3233 /* See PropList-3.0.1.txt. */
3235 is_property_iso_control (unsigned int ch)
3238 (unicode_attributes[ch].name != NULL
3239 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3241 is_category_Cc (ch);
3243 if (result1 != result2)
/* is_property_format_control: a conjunction whose visible terms are
   category Cf, bidi category UC_BIDI_BN, and not being a join control.
   NOTE(review): the expression is not closed in this listing, so at least
   one further line (possibly another conjunct) is missing - confirm
   against the full file.  */
3248 /* See PropList-3.0.1.txt. */
3250 is_property_format_control (unsigned int ch)
3252 return (is_category_Cf (ch)
3253 && get_bidi_category (ch) == UC_BIDI_BN
3254 && !is_property_join_control (ch)
3258 /* See PropList.txt, UCD.html. */
3260 is_property_dash (unsigned int ch)
3262 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3265 /* See PropList.txt, UCD.html. */
3267 is_property_hyphen (unsigned int ch)
3269 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* See PropList-3.0.1.txt.
   Punctuation is exactly general category P.  */
static bool
is_property_punctuation (unsigned int ch)
{
  return is_category_P (ch) ? true : false;
}
/* See PropList-3.0.1.txt.
   Line separator is exactly general category Zl.  */
static bool
is_property_line_separator (unsigned int ch)
{
  return is_category_Zl (ch) ? true : false;
}
/* See PropList-3.0.1.txt.
   Paragraph separator is exactly general category Zp.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  return is_category_Zp (ch) ? true : false;
}
3293 /* See PropList.txt, UCD.html. */
3295 is_property_quotation_mark (unsigned int ch)
3297 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3300 /* See PropList.txt, UCD.html. */
3302 is_property_sentence_terminal (unsigned int ch)
3304 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3307 /* See PropList.txt, UCD.html. */
3309 is_property_terminal_punctuation (unsigned int ch)
3311 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* See PropList-3.0.1.txt.
   Currency symbol is exactly general category Sc.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  return is_category_Sc (ch) ? true : false;
}
/* is_property_math: compares a derived value (an OR whose visible term is
   the PROP_OTHER_MATH bit) with the PROP_MATH bit of
   unicode_properties[CH].
   NOTE(review): the declarations, the first term of the OR and the
   statements after the comparison are not visible in this listing.  */
3321 /* See Unicode 3.0 book, section 4.9,
3322 PropList.txt, UCD.html,
3323 DerivedCoreProperties.txt, UCD.html. */
3325 is_property_math (unsigned int ch)
3329 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3331 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3333 if (result1 != result2)
3338 /* See PropList.txt, UCD.html. */
3340 is_property_other_math (unsigned int ch)
3342 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3345 /* See PropList-3.0.1.txt. */
3347 is_property_paired_punctuation (unsigned int ch)
3349 return unicode_pairedpunctuation[ch];
3352 /* See PropList-3.0.1.txt. */
3354 is_property_left_of_pair (unsigned int ch)
3356 return unicode_leftofpair[ch];
3359 /* See PropList-3.0.1.txt. */
3361 is_property_combining (unsigned int ch)
3363 return (unicode_attributes[ch].name != NULL
3364 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3365 || is_category_Mc (ch)
3366 || is_category_Me (ch)
3367 || is_category_Mn (ch)));
/* Disabled code (opened by `#if 0'): a non-spacing predicate that tests
   for an assigned character whose bidi category is UC_BIDI_NSM.  The
   directive's own comment says it is the same as
   is_property_bidi_non_spacing_mark.  The closing directive is not
   visible in this listing.  */
3370 #if 0 /* same as is_property_bidi_non_spacing_mark */
3371 /* See PropList-3.0.1.txt. */
3373 is_property_non_spacing (unsigned int ch)
3375 return (unicode_attributes[ch].name != NULL
3376 && get_bidi_category (ch) == UC_BIDI_NSM);
3380 /* See PropList-3.0.1.txt. */
3382 is_property_composite (unsigned int ch)
3384 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3385 logical in some sense. */
3386 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3388 if (unicode_attributes[ch].name != NULL
3389 && unicode_attributes[ch].decomposition != NULL)
3391 /* Test whether the decomposition contains more than one character,
3392 and the first is not a space. */
3393 const char *decomp = unicode_attributes[ch].decomposition;
3394 if (decomp[0] == '<')
3396 decomp = strchr (decomp, '>') + 1;
3397 if (decomp[0] == ' ')
3400 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* See PropList-3.0.1.txt.
   Decimal digit is exactly general category Nd.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  return is_category_Nd (ch) ? true : false;
}
3412 /* See PropList-3.0.1.txt. */
3414 is_property_numeric (unsigned int ch)
3416 return ((get_numeric_value (ch)).denominator > 0)
3417 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3418 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3421 /* See PropList.txt, UCD.html. */
3423 is_property_diacritic (unsigned int ch)
3425 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3428 /* See PropList.txt, UCD.html. */
3430 is_property_extender (unsigned int ch)
3432 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
/* is_property_ignorable_control: the visible part accepts characters that
   are either (category Cc with bidi category UC_BIDI_BN) or category Cf.
   NOTE(review): the return statement is not terminated in this listing,
   so a trailing term or exclusion is missing - confirm against the full
   file.  */
3435 /* See PropList-3.0.1.txt. */
3437 is_property_ignorable_control (unsigned int ch)
3439 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3440 || is_category_Cf (ch))
3444 /* ------------------------------------------------------------------------- */
/* output_properties: for every property P listed below, the PROPERTY
   macro makes three calls with the predicate is_property_P:
   debug_output_predicate to "unictype/pr_P.txt", output_predicate_test to
   "../tests/unictype/test-pr_P.c", and output_predicate to
   "unictype/pr_P.h" (passing VERSION).
   NOTE(review): several predicates defined earlier in the file (for
   example uppercase, lowercase, titlecase, hex_digit, math, composite)
   have no PROPERTY line visible in this listing; short lines appear to
   have been dropped, so confirm the full list against the real file.  */
3446 /* Output all properties. */
3448 output_properties (const char *version)
3450 #define PROPERTY(P) \
3451 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3452 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3453 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3454 PROPERTY(white_space)
3455 PROPERTY(alphabetic)
3456 PROPERTY(other_alphabetic)
3457 PROPERTY(not_a_character)
3458 PROPERTY(default_ignorable_code_point)
3459 PROPERTY(other_default_ignorable_code_point)
3460 PROPERTY(deprecated)
3461 PROPERTY(logical_order_exception)
3462 PROPERTY(variation_selector)
3463 PROPERTY(private_use)
3464 PROPERTY(unassigned_code_value)
3466 PROPERTY(other_uppercase)
3468 PROPERTY(other_lowercase)
3470 PROPERTY(soft_dotted)
3472 PROPERTY(other_id_start)
3473 PROPERTY(id_continue)
3474 PROPERTY(other_id_continue)
3476 PROPERTY(xid_continue)
3477 PROPERTY(pattern_white_space)
3478 PROPERTY(pattern_syntax)
3479 PROPERTY(join_control)
3480 PROPERTY(grapheme_base)
3481 PROPERTY(grapheme_extend)
3482 PROPERTY(other_grapheme_extend)
3483 PROPERTY(grapheme_link)
3484 PROPERTY(bidi_control)
3485 PROPERTY(bidi_left_to_right)
3486 PROPERTY(bidi_hebrew_right_to_left)
3487 PROPERTY(bidi_arabic_right_to_left)
3488 PROPERTY(bidi_european_digit)
3489 PROPERTY(bidi_eur_num_separator)
3490 PROPERTY(bidi_eur_num_terminator)
3491 PROPERTY(bidi_arabic_digit)
3492 PROPERTY(bidi_common_separator)
3493 PROPERTY(bidi_block_separator)
3494 PROPERTY(bidi_segment_separator)
3495 PROPERTY(bidi_whitespace)
3496 PROPERTY(bidi_non_spacing_mark)
3497 PROPERTY(bidi_boundary_neutral)
3499 PROPERTY(bidi_embedding_or_override)
3500 PROPERTY(bidi_other_neutral)
3502 PROPERTY(ascii_hex_digit)
3503 PROPERTY(ideographic)
3504 PROPERTY(unified_ideograph)
3506 PROPERTY(ids_binary_operator)
3507 PROPERTY(ids_trinary_operator)
3508 PROPERTY(zero_width)
3511 PROPERTY(iso_control)
3512 PROPERTY(format_control)
3515 PROPERTY(punctuation)
3516 PROPERTY(line_separator)
3517 PROPERTY(paragraph_separator)
3518 PROPERTY(quotation_mark)
3519 PROPERTY(sentence_terminal)
3520 PROPERTY(terminal_punctuation)
3521 PROPERTY(currency_symbol)
3523 PROPERTY(other_math)
3524 PROPERTY(paired_punctuation)
3525 PROPERTY(left_of_pair)
3528 PROPERTY(decimal_digit)
3532 PROPERTY(ignorable_control)
3536 /* ========================================================================= */
/* Script bookkeeping shared by fill_scripts and the output_scripts*
   functions: scripts[] holds the distinct script names in order of first
   appearance, numscripts counts them, and unicode_scripts[] maps each code
   point to an index into scripts[].  fill_scripts initialises every entry
   to (uint8_t)~(uint8_t)0, which therefore means "no script".  */
3540 static const char *scripts[256];
3541 static unsigned int numscripts;
3543 static uint8_t unicode_scripts[0x110000];
/* fill_scripts: reads SCRIPTS_FILENAME and fills scripts[], numscripts
   and unicode_scripts[].
   Visible steps: open the file for reading; set every unicode_scripts
   entry to the all-ones byte; read lines of up to 200 characters; test
   for empty lines and lines starting with '#'; parse either
   "XXXX..YYYY ; Name" or "XXXX ; Name"; search scripts[] from the newest
   entry backwards for the name; store a strdup'ed copy for a new name;
   report code points that already carry a script; record the script index
   for every code point in i1..i2.
   NOTE(review): the strdup result is stored without a visible NULL check.
   Short lines (declarations, braces, break/exit/abort statements, the
   numscripts increment) are missing from this listing, so the exact error
   handling cannot be confirmed from here.  */
3546 fill_scripts (const char *scripts_filename)
3551 stream = fopen (scripts_filename, "r");
3554 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
3560 for (i = 0; i < 0x110000; i++)
3561 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3566 unsigned int i1, i2;
3567 char padding[200+1];
3568 char scriptname[200+1];
3571 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3574 if (buf[0] == '\0' || buf[0] == '#')
3577 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3579 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3581 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
3591 for (script = numscripts - 1; script >= 0; script--)
3592 if (strcmp (scripts[script], scriptname) == 0)
3596 scripts[numscripts] = strdup (scriptname);
3597 script = numscripts;
3599 if (numscripts == 256)
3603 for (i = i1; i <= i2; i++)
3605 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3606 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3607 unicode_scripts[i] = script;
3611 if (ferror (stream) || fclose (stream))
3613 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
/* Parameters for a sparse 3-level table over uint8_t elements, with the
   all-ones byte as the default (absent) value.  output_scripts uses
   struct script_table together with script_table_init, script_table_add
   and script_table_finalize, so TABLE supplies that name prefix.
   NOTE(review): the line that pulls in the table template itself is not
   visible in this listing - presumably an #include follows; confirm.  */
3618 /* Construction of sparse 3-level tables. */
3619 #define TABLE script_table
3620 #define ELEMENT uint8_t
3621 #define DEFAULT (uint8_t)~(uint8_t)0
3622 #define xmalloc malloc
3623 #define xrealloc realloc
/* output_scripts: writes "unictype/scripts.h".
   Visible steps:
     1. For each script, make a copy of its name (the loop over the copy
        tests for 'A'..'Z'; the field is called lowercase_name).
     2. For each script, emit a uc_interval_t array listing the maximal
        runs of code points whose unicode_scripts entry equals that script.
     3. Emit the scripts[] array of uc_script_t entries (interval count,
        interval array, script name).
     4. Build a script_table from unicode_scripts (skipping the all-ones
        "no script" value), finalize it, and emit its five header words as
        #defines followed by the three levels as the u_script structure.
        Level 1 and level 2 entries are printed as indices relative to the
        start of the next level.
   NOTE(review): numscripts and i are unsigned int but are printed with
   %d; the uint32_t header words are also printed with %d.  Consider
   %u / PRIu32.
   NOTE(review): short lines are missing from this listing (declarations,
   braces, the branches around the " %5d" / " %5zu" prints, exit calls),
   so the conditions selecting -1 versus an index are not visible here.  */
3627 output_scripts (const char *version)
3629 const char *filename = "unictype/scripts.h";
3631 unsigned int ch, s, i;
3632 struct script_table t;
3633 unsigned int level1_offset, level2_offset, level3_offset;
3637 const char *lowercase_name;
3640 scriptinfo_t scriptinfo[256];
3642 stream = fopen (filename, "w");
3645 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3649 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3650 fprintf (stream, "/* Unicode scripts. */\n");
3651 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3654 for (s = 0; s < numscripts; s++)
3656 char *lcp = strdup (scripts[s]);
3659 for (cp = lcp; *cp != '\0'; cp++)
3660 if (*cp >= 'A' && *cp <= 'Z')
3663 scriptinfo[s].lowercase_name = lcp;
3666 for (s = 0; s < numscripts; s++)
3668 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3669 scriptinfo[s].lowercase_name);
3670 fprintf (stream, "{\n");
3672 for (ch = 0; ch < 0x110000; ch++)
3673 if (unicode_scripts[ch] == s)
3679 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3684 fprintf (stream, ",\n");
3686 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3688 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3692 fprintf (stream, "\n");
3693 fprintf (stream, "};\n");
3696 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3697 fprintf (stream, "{\n");
3698 for (s = 0; s < numscripts; s++)
3700 fprintf (stream, " {\n");
3701 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3702 scriptinfo[s].lowercase_name);
3703 fprintf (stream, " script_%s_intervals,\n",
3704 scriptinfo[s].lowercase_name);
3705 fprintf (stream, " \"%s\"\n", scripts[s]);
3706 fprintf (stream, " }");
3707 if (s+1 < numscripts)
3708 fprintf (stream, ",");
3709 fprintf (stream, "\n");
3711 fprintf (stream, "};\n");
3715 script_table_init (&t);
3717 for (ch = 0; ch < 0x110000; ch++)
3719 unsigned int s = unicode_scripts[ch];
3720 if (s != (uint8_t)~(uint8_t)0)
3721 script_table_add (&t, ch, s);
3724 script_table_finalize (&t);
3726 /* Offsets in t.result, in memory of this process. */
3728 5 * sizeof (uint32_t);
3730 5 * sizeof (uint32_t)
3731 + t.level1_size * sizeof (uint32_t);
3733 5 * sizeof (uint32_t)
3734 + t.level1_size * sizeof (uint32_t)
3735 + (t.level2_size << t.q) * sizeof (uint32_t);
3737 for (i = 0; i < 5; i++)
3738 fprintf (stream, "#define script_header_%d %d\n", i,
3739 ((uint32_t *) t.result)[i]);
3740 fprintf (stream, "static const\n");
3741 fprintf (stream, "struct\n");
3742 fprintf (stream, " {\n");
3743 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3744 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3745 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3746 fprintf (stream, " }\n");
3747 fprintf (stream, "u_script =\n");
3748 fprintf (stream, "{\n");
3749 fprintf (stream, " {");
3750 if (t.level1_size > 8)
3751 fprintf (stream, "\n ");
3752 for (i = 0; i < t.level1_size; i++)
3755 if (i > 0 && (i % 8) == 0)
3756 fprintf (stream, "\n ");
3757 offset = ((uint32_t *) (t.result + level1_offset))[i];
3759 fprintf (stream, " %5d", -1);
3761 fprintf (stream, " %5zu",
3762 (offset - level2_offset) / sizeof (uint32_t));
3763 if (i+1 < t.level1_size)
3764 fprintf (stream, ",");
3766 if (t.level1_size > 8)
3767 fprintf (stream, "\n ");
3768 fprintf (stream, " },\n");
3769 fprintf (stream, " {");
3770 if (t.level2_size << t.q > 8)
3771 fprintf (stream, "\n ");
3772 for (i = 0; i < t.level2_size << t.q; i++)
3775 if (i > 0 && (i % 8) == 0)
3776 fprintf (stream, "\n ");
3777 offset = ((uint32_t *) (t.result + level2_offset))[i];
3779 fprintf (stream, " %5d", -1);
3781 fprintf (stream, " %5zu",
3782 (offset - level3_offset) / sizeof (uint8_t));
3783 if (i+1 < t.level2_size << t.q)
3784 fprintf (stream, ",");
3786 if (t.level2_size << t.q > 8)
3787 fprintf (stream, "\n ");
3788 fprintf (stream, " },\n");
3789 fprintf (stream, " {");
3790 if (t.level3_size << t.p > 8)
3791 fprintf (stream, "\n ");
3792 for (i = 0; i < t.level3_size << t.p; i++)
3794 if (i > 0 && (i % 8) == 0)
3795 fprintf (stream, "\n ");
3796 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3797 if (i+1 < t.level3_size << t.p)
3798 fprintf (stream, ",");
3800 if (t.level3_size << t.p > 8)
3801 fprintf (stream, "\n ");
3802 fprintf (stream, " }\n");
3803 fprintf (stream, "};\n");
3805 if (ferror (stream) || fclose (stream))
3807 fprintf (stderr, "error writing to '%s'\n", filename);
/* output_scripts_byname: writes "unictype/scripts_byname.gperf", a gperf
   input file.  It emits a header comment naming the Unicode version, the
   declaration of struct named_script, the gperf options (hash function
   scripts_hash, lookup function uc_script_lookup, word array
   script_names), the "%%" separator, and then one "name, index" line per
   entry of scripts[].  Open and write failures are reported on stderr.
   NOTE(review): declarations, braces and the statements following the
   diagnostics are not visible in this listing.  */
3813 output_scripts_byname (const char *version)
3815 const char *filename = "unictype/scripts_byname.gperf";
3819 stream = fopen (filename, "w");
3822 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3826 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3827 fprintf (stream, "/* Unicode scripts. */\n");
3828 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3830 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3831 fprintf (stream, "%%struct-type\n");
3832 fprintf (stream, "%%language=ANSI-C\n");
3833 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3834 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3835 fprintf (stream, "%%readonly-tables\n");
3836 fprintf (stream, "%%global-table\n");
3837 fprintf (stream, "%%define word-array-name script_names\n");
3838 fprintf (stream, "%%%%\n");
3839 for (s = 0; s < numscripts; s++)
3840 fprintf (stream, "%s, %u\n", scripts[s], s);
3842 if (ferror (stream) || fclose (stream))
3844 fprintf (stderr, "error writing to '%s'\n", filename);
3849 /* ========================================================================= */
/* One Unicode block: an inclusive code point range [start, end] and the
   block name.  blocks[] holds the entries in the order fill_blocks read
   them (it checks that each start lies above the previous end), and
   numblocks counts them.
   NOTE(review): the line carrying the typedef's name is not visible in
   this listing.  */
3853 typedef struct { unsigned int start; unsigned int end; const char *name; }
3855 static block_t blocks[256];
3856 static unsigned int numblocks;
/* fill_blocks: reads BLOCKS_FILENAME into blocks[] / numblocks.
   Visible steps: open the file for reading; read lines of up to 200
   characters; test for empty lines and lines starting with '#'; parse
   "XXXX..YYYY; Name" with the name running up to a carriage return; store
   start, end and a strdup'ed name; check that the new block starts after
   the previous block ends; test numblocks against the array capacity 256.
   NOTE(review): the strdup result is stored without a visible NULL check.
   Short lines (declarations, braces, the numblocks increment, and the
   abort/exit statements) are missing from this listing.  */
3859 fill_blocks (const char *blocks_filename)
3863 stream = fopen (blocks_filename, "r");
3866 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3873 unsigned int i1, i2;
3874 char padding[200+1];
3875 char blockname[200+1];
3877 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3880 if (buf[0] == '\0' || buf[0] == '#')
3883 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3885 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3888 blocks[numblocks].start = i1;
3889 blocks[numblocks].end = i2;
3890 blocks[numblocks].name = strdup (blockname);
3891 /* It must be sorted. */
3892 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3895 if (numblocks == 256)
3899 if (ferror (stream) || fclose (stream))
3901 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
/* block_first_index: binary search over blocks[0..numblocks) keyed on the
   `end' field.  The stated invariant is that every block below lo ends
   before CH and every block at or above hi ends at or after CH.
   NOTE(review): the loop header, the two branch bodies and the return
   statement are not visible in this listing.  */
3906 /* Return the smallest block index among the blocks for characters >= ch. */
3908 block_first_index (unsigned int ch)
3910 /* Binary search. */
3911 unsigned int lo = 0;
3912 unsigned int hi = numblocks;
3914 All blocks[i], i < lo, have blocks[i].end < ch,
3915 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3918 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3919 if (blocks[mid].end < ch)
/* block_last_index: binary search over blocks[0..numblocks) keyed on the
   `start' field.  The stated invariant is that every block below lo
   starts at or before CH and every block at or above hi starts after CH.
   NOTE(review): the end of the original header comment, the loop header,
   the branch bodies and the return statement are not visible in this
   listing, so whether the result is the last matching index or one past
   it cannot be confirmed from here.  */
3927 /* Return the largest block index among the blocks for characters <= ch,
3930 block_last_index (unsigned int ch)
3932 /* Binary search. */
3933 unsigned int lo = 0;
3934 unsigned int hi = numblocks;
3936 All blocks[i], i < lo, have blocks[i].start <= ch,
3937 all blocks[i], i >= hi, have blocks[i].start > ch. */
3940 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3941 if (blocks[mid].start <= ch)
/* output_blocks: writes "unictype/blocks.h".
   It emits the blocks[] array (start, end, name per block), the macros
   blocks_level1_shift (8) and blocks_level1_threshold (0x30000), and the
   blocks_level1[] array, which holds for every 256-code-point page below
   the threshold the pair block_first_index (first code point of the page)
   and block_last_index (last code point of the page).  It then emits
   blocks_upper_first_index and blocks_upper_last_index, computed from the
   threshold and from 0x10FFFF.
   NOTE(review): shift, threshold >> shift, first_index and last_index are
   unsigned int but are printed with %d; consider %u.  The array is
   declared uint8_t, so indices above 255 would not fit.
   NOTE(review): declarations, braces and the statements following the
   diagnostics are not visible in this listing.  */
3950 output_blocks (const char *version)
3952 const char *filename = "unictype/blocks.h";
3953 const unsigned int shift = 8; /* bits to shift away for array access */
3954 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3959 stream = fopen (filename, "w");
3962 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3966 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3967 fprintf (stream, "/* Unicode blocks. */\n");
3968 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3971 fprintf (stream, "static const uc_block_t blocks[] =\n");
3972 fprintf (stream, "{\n");
3973 for (i = 0; i < numblocks; i++)
3975 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3976 blocks[i].end, blocks[i].name);
3977 if (i+1 < numblocks)
3978 fprintf (stream, ",");
3979 fprintf (stream, "\n");
3981 fprintf (stream, "};\n");
3982 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3983 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3984 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3985 threshold >> shift);
3986 fprintf (stream, "{\n");
3987 for (i1 = 0; i1 < (threshold >> shift); i1++)
3989 unsigned int first_index = block_first_index (i1 << shift);
3990 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3991 fprintf (stream, " %3d, %3d", first_index, last_index);
3992 if (i1+1 < (threshold >> shift))
3993 fprintf (stream, ",");
3994 fprintf (stream, "\n");
3996 fprintf (stream, "};\n");
3997 fprintf (stream, "#define blocks_upper_first_index %d\n",
3998 block_first_index (threshold));
3999 fprintf (stream, "#define blocks_upper_last_index %d\n",
4000 block_last_index (0x10FFFF));
4002 if (ferror (stream) || fclose (stream))
4004 fprintf (stderr, "error writing to '%s'\n", filename);
4009 /* ========================================================================= */
4011 /* C and Java syntax. */
/* Identifier syntax categories, as returned by c_ident_category and
   java_ident_category.  The enumerators take the values 0..3, so a category
   fits in the two bits per character that the generated identifier syntax
   tables use.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Returns true if CH is a white-space character of C source text: space,
   horizontal tab, new-line (LF or CR), vertical tab or form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4032 /* ISO C 99 section 6.4.2.1 and appendix D.
   Returns the identifier syntax category of CH for the C language:
   UC_IDENTIFIER_VALID for the ASCII digits, UC_IDENTIFIER_START for the
   ASCII letters, '_' and every character matched by the long condition
   chain below, UC_IDENTIFIER_INVALID for everything else.  */
4034 c_ident_category (unsigned int ch)
4036 /* Section 6.4.2.1. */
/* ASCII digits may continue an identifier but not start one.  */
4037 if (ch >= '0' && ch <= '9')
4038 return UC_IDENTIFIER_VALID;
4039 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4040 return UC_IDENTIFIER_START;
/* The chain that follows lists code point ranges, grouped by script.  */
/* Latin */
4046 || (ch >= 0x00C0 && ch <= 0x00D6)
4047 || (ch >= 0x00D8 && ch <= 0x00F6)
4048 || (ch >= 0x00F8 && ch <= 0x01F5)
4049 || (ch >= 0x01FA && ch <= 0x0217)
4050 || (ch >= 0x0250 && ch <= 0x02A8)
4051 || (ch >= 0x1E00 && ch <= 0x1E9B)
4052 || (ch >= 0x1EA0 && ch <= 0x1EF9)
/* Greek */
4056 || (ch >= 0x0388 && ch <= 0x038A)
4058 || (ch >= 0x038E && ch <= 0x03A1)
4059 || (ch >= 0x03A3 && ch <= 0x03CE)
4060 || (ch >= 0x03D0 && ch <= 0x03D6)
4065 || (ch >= 0x03E2 && ch <= 0x03F3)
4066 || (ch >= 0x1F00 && ch <= 0x1F15)
4067 || (ch >= 0x1F18 && ch <= 0x1F1D)
4068 || (ch >= 0x1F20 && ch <= 0x1F45)
4069 || (ch >= 0x1F48 && ch <= 0x1F4D)
4070 || (ch >= 0x1F50 && ch <= 0x1F57)
4074 || (ch >= 0x1F5F && ch <= 0x1F7D)
4075 || (ch >= 0x1F80 && ch <= 0x1FB4)
4076 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4077 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4078 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4079 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4080 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4081 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4082 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4083 || (ch >= 0x1FF6 && ch <= 0x1FFC)
/* Cyrillic */
4085 || (ch >= 0x0401 && ch <= 0x040C)
4086 || (ch >= 0x040E && ch <= 0x044F)
4087 || (ch >= 0x0451 && ch <= 0x045C)
4088 || (ch >= 0x045E && ch <= 0x0481)
4089 || (ch >= 0x0490 && ch <= 0x04C4)
4090 || (ch >= 0x04C7 && ch <= 0x04C8)
4091 || (ch >= 0x04CB && ch <= 0x04CC)
4092 || (ch >= 0x04D0 && ch <= 0x04EB)
4093 || (ch >= 0x04EE && ch <= 0x04F5)
4094 || (ch >= 0x04F8 && ch <= 0x04F9)
/* Armenian */
4096 || (ch >= 0x0531 && ch <= 0x0556)
4097 || (ch >= 0x0561 && ch <= 0x0587)
/* Hebrew */
4099 || (ch >= 0x05B0 && ch <= 0x05B9)
4100 || (ch >= 0x05BB && ch <= 0x05BD)
4102 || (ch >= 0x05C1 && ch <= 0x05C2)
4103 || (ch >= 0x05D0 && ch <= 0x05EA)
4104 || (ch >= 0x05F0 && ch <= 0x05F2)
/* Arabic */
4106 || (ch >= 0x0621 && ch <= 0x063A)
4107 || (ch >= 0x0640 && ch <= 0x0652)
4108 || (ch >= 0x0670 && ch <= 0x06B7)
4109 || (ch >= 0x06BA && ch <= 0x06BE)
4110 || (ch >= 0x06C0 && ch <= 0x06CE)
4111 || (ch >= 0x06D0 && ch <= 0x06DC)
4112 || (ch >= 0x06E5 && ch <= 0x06E8)
4113 || (ch >= 0x06EA && ch <= 0x06ED)
/* Devanagari */
4115 || (ch >= 0x0901 && ch <= 0x0903)
4116 || (ch >= 0x0905 && ch <= 0x0939)
4117 || (ch >= 0x093E && ch <= 0x094D)
4118 || (ch >= 0x0950 && ch <= 0x0952)
4119 || (ch >= 0x0958 && ch <= 0x0963)
/* Bengali */
4121 || (ch >= 0x0981 && ch <= 0x0983)
4122 || (ch >= 0x0985 && ch <= 0x098C)
4123 || (ch >= 0x098F && ch <= 0x0990)
4124 || (ch >= 0x0993 && ch <= 0x09A8)
4125 || (ch >= 0x09AA && ch <= 0x09B0)
4127 || (ch >= 0x09B6 && ch <= 0x09B9)
4128 || (ch >= 0x09BE && ch <= 0x09C4)
4129 || (ch >= 0x09C7 && ch <= 0x09C8)
4130 || (ch >= 0x09CB && ch <= 0x09CD)
4131 || (ch >= 0x09DC && ch <= 0x09DD)
4132 || (ch >= 0x09DF && ch <= 0x09E3)
4133 || (ch >= 0x09F0 && ch <= 0x09F1)
/* Gurmukhi */
4136 || (ch >= 0x0A05 && ch <= 0x0A0A)
4137 || (ch >= 0x0A0F && ch <= 0x0A10)
4138 || (ch >= 0x0A13 && ch <= 0x0A28)
4139 || (ch >= 0x0A2A && ch <= 0x0A30)
4140 || (ch >= 0x0A32 && ch <= 0x0A33)
4141 || (ch >= 0x0A35 && ch <= 0x0A36)
4142 || (ch >= 0x0A38 && ch <= 0x0A39)
4143 || (ch >= 0x0A3E && ch <= 0x0A42)
4144 || (ch >= 0x0A47 && ch <= 0x0A48)
4145 || (ch >= 0x0A4B && ch <= 0x0A4D)
4146 || (ch >= 0x0A59 && ch <= 0x0A5C)
/* Gujarati */
4150 || (ch >= 0x0A81 && ch <= 0x0A83)
4151 || (ch >= 0x0A85 && ch <= 0x0A8B)
4153 || (ch >= 0x0A8F && ch <= 0x0A91)
4154 || (ch >= 0x0A93 && ch <= 0x0AA8)
4155 || (ch >= 0x0AAA && ch <= 0x0AB0)
4156 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4157 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4158 || (ch >= 0x0ABD && ch <= 0x0AC5)
4159 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4160 || (ch >= 0x0ACB && ch <= 0x0ACD)
/* Oriya */
4164 || (ch >= 0x0B01 && ch <= 0x0B03)
4165 || (ch >= 0x0B05 && ch <= 0x0B0C)
4166 || (ch >= 0x0B0F && ch <= 0x0B10)
4167 || (ch >= 0x0B13 && ch <= 0x0B28)
4168 || (ch >= 0x0B2A && ch <= 0x0B30)
4169 || (ch >= 0x0B32 && ch <= 0x0B33)
4170 || (ch >= 0x0B36 && ch <= 0x0B39)
4171 || (ch >= 0x0B3E && ch <= 0x0B43)
4172 || (ch >= 0x0B47 && ch <= 0x0B48)
4173 || (ch >= 0x0B4B && ch <= 0x0B4D)
4174 || (ch >= 0x0B5C && ch <= 0x0B5D)
4175 || (ch >= 0x0B5F && ch <= 0x0B61)
/* Tamil */
4177 || (ch >= 0x0B82 && ch <= 0x0B83)
4178 || (ch >= 0x0B85 && ch <= 0x0B8A)
4179 || (ch >= 0x0B8E && ch <= 0x0B90)
4180 || (ch >= 0x0B92 && ch <= 0x0B95)
4181 || (ch >= 0x0B99 && ch <= 0x0B9A)
4183 || (ch >= 0x0B9E && ch <= 0x0B9F)
4184 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4185 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4186 || (ch >= 0x0BAE && ch <= 0x0BB5)
4187 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4188 || (ch >= 0x0BBE && ch <= 0x0BC2)
4189 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4190 || (ch >= 0x0BCA && ch <= 0x0BCD)
/* Telugu */
4192 || (ch >= 0x0C01 && ch <= 0x0C03)
4193 || (ch >= 0x0C05 && ch <= 0x0C0C)
4194 || (ch >= 0x0C0E && ch <= 0x0C10)
4195 || (ch >= 0x0C12 && ch <= 0x0C28)
4196 || (ch >= 0x0C2A && ch <= 0x0C33)
4197 || (ch >= 0x0C35 && ch <= 0x0C39)
4198 || (ch >= 0x0C3E && ch <= 0x0C44)
4199 || (ch >= 0x0C46 && ch <= 0x0C48)
4200 || (ch >= 0x0C4A && ch <= 0x0C4D)
4201 || (ch >= 0x0C60 && ch <= 0x0C61)
/* Kannada */
4203 || (ch >= 0x0C82 && ch <= 0x0C83)
4204 || (ch >= 0x0C85 && ch <= 0x0C8C)
4205 || (ch >= 0x0C8E && ch <= 0x0C90)
4206 || (ch >= 0x0C92 && ch <= 0x0CA8)
4207 || (ch >= 0x0CAA && ch <= 0x0CB3)
4208 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4209 || (ch >= 0x0CBE && ch <= 0x0CC4)
4210 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4211 || (ch >= 0x0CCA && ch <= 0x0CCD)
4213 || (ch >= 0x0CE0 && ch <= 0x0CE1)
/* Malayalam */
4215 || (ch >= 0x0D02 && ch <= 0x0D03)
4216 || (ch >= 0x0D05 && ch <= 0x0D0C)
4217 || (ch >= 0x0D0E && ch <= 0x0D10)
4218 || (ch >= 0x0D12 && ch <= 0x0D28)
4219 || (ch >= 0x0D2A && ch <= 0x0D39)
4220 || (ch >= 0x0D3E && ch <= 0x0D43)
4221 || (ch >= 0x0D46 && ch <= 0x0D48)
4222 || (ch >= 0x0D4A && ch <= 0x0D4D)
4223 || (ch >= 0x0D60 && ch <= 0x0D61)
/* Thai */
4225 || (ch >= 0x0E01 && ch <= 0x0E3A)
4226 || (ch >= 0x0E40 && ch <= 0x0E5B)
/* Lao */
4228 || (ch >= 0x0E81 && ch <= 0x0E82)
4230 || (ch >= 0x0E87 && ch <= 0x0E88)
4233 || (ch >= 0x0E94 && ch <= 0x0E97)
4234 || (ch >= 0x0E99 && ch <= 0x0E9F)
4235 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4238 || (ch >= 0x0EAA && ch <= 0x0EAB)
4239 || (ch >= 0x0EAD && ch <= 0x0EAE)
4240 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4241 || (ch >= 0x0EBB && ch <= 0x0EBD)
4242 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4244 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4245 || (ch >= 0x0EDC && ch <= 0x0EDD)
/* Tibetan */
4248 || (ch >= 0x0F18 && ch <= 0x0F19)
4252 || (ch >= 0x0F3E && ch <= 0x0F47)
4253 || (ch >= 0x0F49 && ch <= 0x0F69)
4254 || (ch >= 0x0F71 && ch <= 0x0F84)
4255 || (ch >= 0x0F86 && ch <= 0x0F8B)
4256 || (ch >= 0x0F90 && ch <= 0x0F95)
4258 || (ch >= 0x0F99 && ch <= 0x0FAD)
4259 || (ch >= 0x0FB1 && ch <= 0x0FB7)
/* Georgian */
4262 || (ch >= 0x10A0 && ch <= 0x10C5)
4263 || (ch >= 0x10D0 && ch <= 0x10F6)
/* Hiragana */
4265 || (ch >= 0x3041 && ch <= 0x3093)
4266 || (ch >= 0x309B && ch <= 0x309C)
/* Katakana */
4268 || (ch >= 0x30A1 && ch <= 0x30F6)
4269 || (ch >= 0x30FB && ch <= 0x30FC)
/* Bopomofo */
4271 || (ch >= 0x3105 && ch <= 0x312C)
4272 /* CJK Unified Ideographs */
4273 || (ch >= 0x4E00 && ch <= 0x9FA5)
/* Hangul */
4275 || (ch >= 0xAC00 && ch <= 0xD7A3)
/* Non-ASCII decimal digits of the scripts above.  They sit in the same
   chain, so this function reports them as UC_IDENTIFIER_START too.  */
4277 || (ch >= 0x0660 && ch <= 0x0669)
4278 || (ch >= 0x06F0 && ch <= 0x06F9)
4279 || (ch >= 0x0966 && ch <= 0x096F)
4280 || (ch >= 0x09E6 && ch <= 0x09EF)
4281 || (ch >= 0x0A66 && ch <= 0x0A6F)
4282 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4283 || (ch >= 0x0B66 && ch <= 0x0B6F)
4284 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4285 || (ch >= 0x0C66 && ch <= 0x0C6F)
4286 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4287 || (ch >= 0x0D66 && ch <= 0x0D6F)
4288 || (ch >= 0x0E50 && ch <= 0x0E59)
4289 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4290 || (ch >= 0x0F20 && ch <= 0x0F33)
4291 /* Special characters */
4294 || (ch >= 0x02B0 && ch <= 0x02B8)
4296 || (ch >= 0x02BD && ch <= 0x02C1)
4297 || (ch >= 0x02D0 && ch <= 0x02D1)
4298 || (ch >= 0x02E0 && ch <= 0x02E4)
4304 || (ch >= 0x203F && ch <= 0x2040)
4307 || (ch >= 0x210A && ch <= 0x2113)
4309 || (ch >= 0x2118 && ch <= 0x211D)
4313 || (ch >= 0x212A && ch <= 0x2131)
4314 || (ch >= 0x2133 && ch <= 0x2138)
4315 || (ch >= 0x2160 && ch <= 0x2182)
4316 || (ch >= 0x3005 && ch <= 0x3007)
4317 || (ch >= 0x3021 && ch <= 0x3029)
/* Any match in the chain: usable anywhere in an identifier.  */
4319 return UC_IDENTIFIER_START;
/* No match at all.  */
4320 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Returns true if CH is a white-space character of Java source text:
   space, horizontal tab, form-feed, LF or CR.  Unlike in C, the vertical
   tab is not included.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4332 /* The Java Language Specification, 3rd edition, §3.8.
4333 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4334 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4336 java_ident_category (unsigned int ch)
4338 /* FIXME: Check this against Sun's JDK implementation. */
4339 if (is_category_L (ch) /* = Character.isLetter(ch) */
4340 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4341 || is_category_Sc (ch) /* currency symbol */
4342 || is_category_Pc (ch) /* connector punctuation */
4344 return UC_IDENTIFIER_START;
4345 if (is_category_Nd (ch) /* digit */
4346 || is_category_Mc (ch) /* combining mark */
4347 || is_category_Mn (ch) /* non-spacing mark */
4349 return UC_IDENTIFIER_VALID;
4350 if ((ch >= 0x0000 && ch <= 0x0008)
4351 || (ch >= 0x000E && ch <= 0x001B)
4352 || (ch >= 0x007F && ch <= 0x009F)
4353 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4355 return UC_IDENTIFIER_IGNORABLE;
4356 return UC_IDENTIFIER_INVALID;
4359 /* Construction of sparse 3-level tables.
   These macros parameterize the table code used by output_ident_category:
   the structure is named identsyntax_table (with functions
   identsyntax_table_init, _add and _finalize), its elements are uint8_t
   and an entry that was never added reads as UC_IDENTIFIER_INVALID.
   Presumably a generic table implementation is included right after these
   definitions; that line is not visible here.  */
4360 #define TABLE identsyntax_table
4361 #define ELEMENT uint8_t
4362 #define DEFAULT UC_IDENTIFIER_INVALID
/* Plain malloc/realloc stand in for the checking allocators.  */
4363 #define xmalloc malloc
4364 #define xrealloc realloc
4367 /* Output an identifier syntax categorization in a three-level bitmap.
   FILENAME is the generated header to write.  PREDICATE maps a code point
   to one of the UC_IDENTIFIER_* values.  NAME is the name given to the
   generated table object.  VERSION is a string; presumably it fills the %s
   of the "Generated automatically" banner (the argument line is not visible
   here).  */
4369 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4373 struct identsyntax_table t;
4374 unsigned int level1_offset, level2_offset, level3_offset;
4376 stream = fopen (filename, "w");
4379 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4383 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4384 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4385 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4390 identsyntax_table_init (&t);
/* Only code points whose category differs from UC_IDENTIFIER_INVALID are
   added to the table.  */
4392 for (ch = 0; ch < 0x110000; ch++)
4394 int syntaxcode = predicate (ch);
4395 if (syntaxcode != UC_IDENTIFIER_INVALID)
4396 identsyntax_table_add (&t, ch, syntaxcode);
4399 identsyntax_table_finalize (&t);
4401 /* Offsets in t.result, in memory of this process.
   t.result starts with five uint32_t header words, followed by the level1
   array (level1_size words), the level2 array (level2_size << q words) and
   then the level3 data.  */
4403 5 * sizeof (uint32_t);
4405 5 * sizeof (uint32_t)
4406 + t.level1_size * sizeof (uint32_t);
4408 5 * sizeof (uint32_t)
4409 + t.level1_size * sizeof (uint32_t)
4410 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become macros.  */
4412 for (i = 0; i < 5; i++)
4413 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4414 ((uint32_t *) t.result)[i]);
/* Declaration of the generated structure.  level3 is declared in units of
   unsigned short, each holding 16 / 2 = 8 two-bit entries.  */
4415 fprintf (stream, "static const\n");
4416 fprintf (stream, "struct\n");
4417 fprintf (stream, " {\n");
4418 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4419 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4420 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4421 (1 << t.p) * 2 / 16);
4422 fprintf (stream, " }\n");
4423 fprintf (stream, "%s =\n", name);
4424 fprintf (stream, "{\n");
/* level1 initializer, eight values per output line.  An entry is printed
   either as -1 or as an index obtained by rebasing the stored byte offset
   to the start of level2; the condition choosing between the two is not
   visible here (presumably -1 marks an empty entry).  */
4425 fprintf (stream, " {");
4426 if (t.level1_size > 8)
4427 fprintf (stream, "\n ");
4428 for (i = 0; i < t.level1_size; i++)
4431 if (i > 0 && (i % 8) == 0)
4432 fprintf (stream, "\n ");
4433 offset = ((uint32_t *) (t.result + level1_offset))[i];
4435 fprintf (stream, " %5d", -1);
4437 fprintf (stream, " %5zu",
4438 (offset - level2_offset) / sizeof (uint32_t));
4439 if (i+1 < t.level1_size)
4440 fprintf (stream, ",");
4442 if (t.level1_size > 8)
4443 fprintf (stream, "\n ");
4444 fprintf (stream, " },\n");
/* level2 initializer, same scheme; its offsets are rebased to the start of
   level3 and counted in uint8_t elements.  */
4445 fprintf (stream, " {");
4446 if (t.level2_size << t.q > 8)
4447 fprintf (stream, "\n ");
4448 for (i = 0; i < t.level2_size << t.q; i++)
4451 if (i > 0 && (i % 8) == 0)
4452 fprintf (stream, "\n ");
4453 offset = ((uint32_t *) (t.result + level2_offset))[i];
4455 fprintf (stream, " %5d", -1);
4457 fprintf (stream, " %5zu",
4458 (offset - level3_offset) / sizeof (uint8_t));
4459 if (i+1 < t.level2_size << t.q)
4460 fprintf (stream, ",");
4462 if (t.level2_size << t.q > 8)
4463 fprintf (stream, "\n ");
4464 fprintf (stream, " },\n");
4465 /* Pack the level3 array. Each entry needs 2 bits only.
   Eight consecutive uint8_t entries are combined into one 16-bit word;
   entry k of a group occupies bits 2k and 2k+1.  */
4466 fprintf (stream, " {");
4467 if ((t.level3_size << t.p) * 2 / 16 > 8)
4468 fprintf (stream, "\n ");
4469 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4471 if (i > 0 && (i % 8) == 0)
4472 fprintf (stream, "\n ");
4473 fprintf (stream, " 0x%04x",
4474 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4478 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4479 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4480 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4481 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4482 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4483 fprintf (stream, ",");
4485 if ((t.level3_size << t.p) * 2 / 16 > 8)
4486 fprintf (stream, "\n ");
4487 fprintf (stream, " }\n");
4488 fprintf (stream, "};\n");
/* A pending write error or a failing fclose is reported on stderr.  */
4490 if (ferror (stream) || fclose (stream))
4492 fprintf (stderr, "error writing to '%s'\n", filename);
4498 output_ident_properties (const char *version)
4500 #define PROPERTY(P) \
4501 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4502 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4503 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4504 PROPERTY(c_whitespace)
4505 PROPERTY(java_whitespace)
4508 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4509 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4512 /* ========================================================================= */
4514 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
4515 glibc/localedata/locales/i18n file, generated by
4516 glibc/localedata/gen-unicode-ctype.c. */
4518 /* Character mappings. */
4521 to_upper (unsigned int ch)
4523 if (unicode_attributes[ch].name != NULL
4524 && unicode_attributes[ch].upper != NONE)
4525 return unicode_attributes[ch].upper;
4531 to_lower (unsigned int ch)
4533 if (unicode_attributes[ch].name != NULL
4534 && unicode_attributes[ch].lower != NONE)
4535 return unicode_attributes[ch].lower;
4541 to_title (unsigned int ch)
4543 if (unicode_attributes[ch].name != NULL
4544 && unicode_attributes[ch].title != NONE)
4545 return unicode_attributes[ch].title;
4550 /* Character class properties. */
/* A character counts as uppercase if to_lower maps it to a different
   character.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* A character counts as lowercase if to_upper maps it to a different
   character, plus one exception listed by hand.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping. */
         || (ch == 0x00DF);
}
4567 is_alpha (unsigned int ch)
4569 return (unicode_attributes[ch].name != NULL
4570 && ((unicode_attributes[ch].category[0] == 'L'
4571 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4572 <U0E2F>, <U0E46> should belong to is_punct. */
4573 && (ch != 0x0E2F) && (ch != 0x0E46))
4574 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4575 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4577 || (ch >= 0x0E34 && ch <= 0x0E3A)
4578 || (ch >= 0x0E47 && ch <= 0x0E4E)
4579 /* Avoid warning for <U0345>. */
4581 /* Avoid warnings for <U2160>..<U217F>. */
4582 || (unicode_attributes[ch].category[0] == 'N'
4583 && unicode_attributes[ch].category[1] == 'l')
4584 /* Avoid warnings for <U24B6>..<U24E9>. */
4585 || (unicode_attributes[ch].category[0] == 'S'
4586 && unicode_attributes[ch].category[1] == 'o'
4587 && strstr (unicode_attributes[ch].name, " LETTER ")
4589 /* Consider all the non-ASCII digits as alphabetic.
4590 ISO C 99 forbids us to have them in category "digit",
4591 but we want iswalnum to return true on them. */
4592 || (unicode_attributes[ch].category[0] == 'N'
4593 && unicode_attributes[ch].category[1] == 'd'
4594 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is a digit in the sense of ISO C: only the ten ASCII
   digits U+0030..U+0039.  The general category based definition is kept
   for reference but disabled.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero. Must add <0> in front of them by hand. */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     The iswdigit function tests for any wide character that corresponds
     to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
     the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH belongs to the "outdigit" class: the ten ASCII digits
   U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH is alphabetic or a digit, as defined by is_alpha and
   is_digit.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4632 is_blank (unsigned int ch)
4634 return (ch == 0x0009 /* '\t' */
4635 /* Category Zs without mention of "<noBreak>" */
4636 || (unicode_attributes[ch].name != NULL
4637 && unicode_attributes[ch].category[0] == 'Z'
4638 && unicode_attributes[ch].category[1] == 's'
4639 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4643 is_space (unsigned int ch)
4645 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4646 should treat it like a punctuation character, not like a space. */
4647 return (ch == 0x0020 /* ' ' */
4648 || ch == 0x000C /* '\f' */
4649 || ch == 0x000A /* '\n' */
4650 || ch == 0x000D /* '\r' */
4651 || ch == 0x0009 /* '\t' */
4652 || ch == 0x000B /* '\v' */
4653 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4654 || (unicode_attributes[ch].name != NULL
4655 && unicode_attributes[ch].category[0] == 'Z'
4656 && (unicode_attributes[ch].category[1] == 'l'
4657 || unicode_attributes[ch].category[1] == 'p'
4658 || (unicode_attributes[ch].category[1] == 's'
4659 && !strstr (unicode_attributes[ch].decomposition,
4664 is_cntrl (unsigned int ch)
4666 return (unicode_attributes[ch].name != NULL
4667 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4668 /* Categories Zl and Zp */
4669 || (unicode_attributes[ch].category[0] == 'Z'
4670 && (unicode_attributes[ch].category[1] == 'l'
4671 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is a hexadecimal digit in the sense of ISO C: one of
   0-9, A-F, a-f.  The variant built on is_digit is kept for reference but
   disabled.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     The iswxdigit function tests for any wide character that corresponds
     to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
     hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
4697 is_graph (unsigned int ch)
4699 return (unicode_attributes[ch].name != NULL
4700 && strcmp (unicode_attributes[ch].name, "<control>")
4705 is_print (unsigned int ch)
4707 return (unicode_attributes[ch].name != NULL
4708 && strcmp (unicode_attributes[ch].name, "<control>")
4709 /* Categories Zl and Zp */
4710 && !(unicode_attributes[ch].name != NULL
4711 && unicode_attributes[ch].category[0] == 'Z'
4712 && (unicode_attributes[ch].category[1] == 'l'
4713 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH belongs to the "punct" class: graphic but neither
   alphabetic nor a digit.  The general category based definition is kept
   for reference but disabled.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character. */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
4729 /* Output all properties.
   For a property P, the PROPERTY macro below expands to three calls, all
   using the predicate is_P: a debugging text file "unictype/ctype_P.txt",
   a test source file "../tests/unictype/test-ctype_P.c", and the table
   header "unictype/ctype_P.h" (table name u_is_P).  VERSION is handed to
   output_predicate.  The list of PROPERTY invocations is not visible
   here.  */
4731 output_old_ctype (const char *version)
4733 #define PROPERTY(P) \
4734 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4735 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4736 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4755 is_combining (unsigned int ch)
4757 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4758 file. In 3.0.1 it was identical to the union of the general categories
4759 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4760 PropList.txt file, so we take the latter definition. */
4761 return (unicode_attributes[ch].name != NULL
4762 && unicode_attributes[ch].category[0] == 'M'
4763 && (unicode_attributes[ch].category[1] == 'n'
4764 || unicode_attributes[ch].category[1] == 'c'
4765 || unicode_attributes[ch].category[1] == 'e'));
4769 is_combining_level3 (unsigned int ch)
4771 return is_combining (ch)
4772 && !(unicode_attributes[ch].combining[0] != '\0'
4773 && unicode_attributes[ch].combining[0] != '0'
4774 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
   uppercase hex digits for I < 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites, so
   callers must copy it if they need it for longer.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* The longest form, "<U%08X>", needs 11 characters plus the NUL; the
     bounded call guarantees that the buffer is never overrun.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, "..", and the symbol of HIGH.  The result lives in a
   static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each plus the 2-character
     separator, plus the NUL.  */
  static char buf[24+1];

  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
4799 /* Output a character class (= property) table.
   Writes CLASSNAME to STREAM, followed by the characters for which FUNC
   returns true.  Runs of consecutive characters are collected into
   low..high intervals.  Entries are separated by ';' and the output is
   continued on a new line, after a '/', when an entry would exceed
   max_column.  */
4802 output_charclass (FILE *stream, const char *classname,
4803 bool (*func) (unsigned int))
/* One membership flag per code point.  Note that this is an automatic
   array of 0x110000 bytes.  */
4805 char table[0x110000];
4807 bool need_semicolon;
4808 const int max_column = 75;
4811 for (i = 0; i < 0x110000; i++)
4812 table[i] = (int) func (i);
4814 fprintf (stream, "%s ", classname);
4815 need_semicolon = false;
/* The index is advanced inside the loop body, not in the loop header.  */
4817 for (i = 0; i < 0x110000; )
4823 unsigned int low, high;
/* Extend the current run as long as the flag stays set.  */
4829 while (i < 0x110000 && table[i]);
/* A run is printed either as a single symbol or as a symbol range.  */
4833 strcpy (buf, ucs_symbol (low));
4835 strcpy (buf, ucs_symbol_range (low, high));
4839 fprintf (stream, ";");
4843 if (column + strlen (buf) > max_column)
4845 fprintf (stream, "/\n ");
4849 fprintf (stream, "%s", buf);
4850 column += strlen (buf);
4851 need_semicolon = true;
4854 fprintf (stream, "\n");
4857 /* Output a character mapping table.
   Writes MAPNAME to STREAM, followed by one entry for every character that
   FUNC maps to a different character.  Each entry is built from the UCS
   symbol of the character and the UCS symbol of its image.  Entries are
   separated by ';' and the output is continued on a new line, after a '/',
   when an entry would exceed max_column.  */
4860 output_charmap (FILE *stream, const char *mapname,
4861 unsigned int (*func) (unsigned int))
/* One flag per code point: nonzero if FUNC changes the character.  Note
   that this is an automatic array of 0x110000 bytes.  */
4863 char table[0x110000];
4865 bool need_semicolon;
4866 const int max_column = 75;
4869 for (i = 0; i < 0x110000; i++)
4870 table[i] = (func (i) != i);
4872 fprintf (stream, "%s ", mapname);
4873 need_semicolon = false;
4875 for (i = 0; i < 0x110000; i++)
/* ucs_symbol returns a static buffer, so each result is appended to buf
   before the next call is made.  */
4881 strcat (buf, ucs_symbol (i));
4883 strcat (buf, ucs_symbol (func (i)));
4888 fprintf (stream, ";");
4892 if (column + strlen (buf) > max_column)
4894 fprintf (stream, "/\n ");
4898 fprintf (stream, "%s", buf);
4899 column += strlen (buf);
4900 need_semicolon = true;
4902 fprintf (stream, "\n");
/* Output the width table.
   Currently a no-op: nothing is written to STREAM.  The function exists so
   that output_tables has a place to call for the width map.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream;
}
4912 /* Output the tables to the given file.
   FILENAME receives a locale definition: an LC_IDENTIFICATION section,
   followed by an LC_CTYPE section with the character classes and the case
   mappings.  VERSION is the Unicode version string that is written into
   the title and revision lines.  Before the LC_CTYPE section is written,
   the classification functions are checked against the restrictions quoted
   below; every violation is reported on stderr.  */
4915 output_tables (const char *filename, const char *version)
4920 stream = fopen (filename, "w");
4923 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* In the format strings, %% produces a single '%', the comment character
   declared in the output.  */
4927 fprintf (stream, "escape_char /\n");
4928 fprintf (stream, "comment_char %%\n");
4929 fprintf (stream, "\n");
4930 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4932 fprintf (stream, "\n");
4934 fprintf (stream, "LC_IDENTIFICATION\n");
4935 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4936 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4937 fprintf (stream, "address \"\"\n");
4938 fprintf (stream, "contact \"\"\n");
4939 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4940 fprintf (stream, "tel \"\"\n");
4941 fprintf (stream, "fax \"\"\n");
4942 fprintf (stream, "language \"\"\n");
4943 fprintf (stream, "territory \"Earth\"\n");
4944 fprintf (stream, "revision \"%s\"\n", version);
/* The date line is formatted as YYYY-MM-DD from the time in NOW, in UTC.  */
4949 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4950 fprintf (stream, "date \"%s\"\n", date);
4952 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4953 fprintf (stream, "END LC_IDENTIFICATION\n");
4954 fprintf (stream, "\n");
4956 /* Verifications. */
4957 for (ch = 0; ch < 0x110000; ch++)
4959 /* toupper restriction: "Only characters specified for the keywords
   lower and upper shall be specified. */
4961 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4963 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4964 ucs_symbol (ch), ch, to_upper (ch));
4966 /* tolower restriction: "Only characters specified for the keywords
   lower and upper shall be specified. */
4968 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4970 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4971 ucs_symbol (ch), ch, to_lower (ch));
4973 /* alpha restriction: "Characters classified as either upper or lower
   shall automatically belong to this class. */
4975 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4976 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4978 /* alpha restriction: "No character specified for the keywords cntrl,
   digit, punct or space shall be specified." */
4980 if (is_alpha (ch) && is_cntrl (ch))
4981 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4982 if (is_alpha (ch) && is_digit (ch))
4983 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4984 if (is_alpha (ch) && is_punct (ch))
4985 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4986 if (is_alpha (ch) && is_space (ch))
4987 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4989 /* space restriction: "No character specified for the keywords upper,
   lower, alpha, digit, graph or xdigit shall be specified."
   upper, lower, alpha already checked above. */
4992 if (is_space (ch) && is_digit (ch))
4993 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4994 if (is_space (ch) && is_graph (ch))
4995 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4996 if (is_space (ch) && is_xdigit (ch))
4997 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4999 /* cntrl restriction: "No character specified for the keywords upper,
   lower, alpha, digit, punct, graph, print or xdigit shall be
   specified." upper, lower, alpha already checked above. */
5002 if (is_cntrl (ch) && is_digit (ch))
5003 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_punct (ch))
5005 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5006 if (is_cntrl (ch) && is_graph (ch))
5007 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5008 if (is_cntrl (ch) && is_print (ch))
5009 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5010 if (is_cntrl (ch) && is_xdigit (ch))
5011 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5013 /* punct restriction: "No character specified for the keywords upper,
   lower, alpha, digit, cntrl, xdigit or as the <space> character shall
   be specified." upper, lower, alpha, cntrl already checked above. */
5016 if (is_punct (ch) && is_digit (ch))
5017 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5018 if (is_punct (ch) && is_xdigit (ch))
5019 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5020 if (is_punct (ch) && (ch == 0x0020))
5021 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5023 /* graph restriction: "No character specified for the keyword cntrl
   shall be specified." Already checked above. */
5026 /* print restriction: "No character specified for the keyword cntrl
   shall be specified." Already checked above. */
5029 /* graph - print relation: differ only in the <space> character.
   How is this possible if there are more than one space character?!
   I think susv2/xbd/locale.html should speak of "space characters",
   not "space character". */
5033 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5035 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5036 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5038 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section: the character classes first, then the case
   mappings, then the width map.  */
5041 fprintf (stream, "LC_CTYPE\n");
5042 output_charclass (stream, "upper", is_upper);
5043 output_charclass (stream, "lower", is_lower);
5044 output_charclass (stream, "alpha", is_alpha);
5045 output_charclass (stream, "digit", is_digit);
5046 output_charclass (stream, "outdigit", is_outdigit);
5047 output_charclass (stream, "blank", is_blank);
5048 output_charclass (stream, "space", is_space);
5049 output_charclass (stream, "cntrl", is_cntrl);
5050 output_charclass (stream, "punct", is_punct);
5051 output_charclass (stream, "xdigit", is_xdigit);
5052 output_charclass (stream, "graph", is_graph);
5053 output_charclass (stream, "print", is_print);
5054 output_charclass (stream, "class \"combining\";", is_combining);
5055 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5056 output_charmap (stream, "toupper", to_upper);
5057 output_charmap (stream, "tolower", to_lower);
5058 output_charmap (stream, "map \"totitle\";", to_title);
5059 output_widthmap (stream);
5060 fprintf (stream, "END LC_CTYPE\n");
/* A pending write error or a failing fclose is reported on stderr.  */
5062 if (ferror (stream) || fclose (stream))
5064 fprintf (stderr, "error writing to '%s'\n", filename);
5071 /* ========================================================================= */
5073 /* The width property from the EastAsianWidth.txt file, indexed by code
   point (one slot for each of the 0x110000 code points).
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
5075 const char * unicode_width[0x110000];
5077 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5080 fill_width (const char *width_filename)
5084 char field0[FIELDLEN];
5085 char field1[FIELDLEN];
5086 char field2[FIELDLEN];
5089 for (i = 0; i < 0x110000; i++)
5090 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5092 stream = fopen (width_filename, "r");
5095 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5110 do c = getc (stream); while (c != EOF && c != '\n');
5114 n = getfield (stream, field0, ';');
5115 n += getfield (stream, field1, ' ');
5116 n += getfield (stream, field2, '\n');
5121 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5124 i = strtoul (field0, NULL, 16);
5125 if (strstr (field0, "..") != NULL)
5127 /* Deal with a range. */
5128 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5130 unicode_width[i] = strdup (field1);
5134 /* Single character line. */
5135 unicode_width[i] = strdup (field1);
5138 if (ferror (stream) || fclose (stream))
5140 fprintf (stderr, "error reading from '%s'\n", width_filename);
5145 /* ========================================================================= */
5147 /* Line breaking classification. */
/* Line breaking classes, as returned by get_lbp.  The values 0..23 are
   assigned to one group of classes and the values 24..31 to the classes
   that, per the note below, are resolved at run time; this is why the
   enumerators are not numbered in declaration order.  */
enum
{
  /* Values >= 24 are resolved at run time. */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
5190 /* Returns the line breaking classification for ch, as a bit mask.
   Bit number LBP_xx of the result is set when ch is assigned to class
   LBP_xx.  The classes are tested one after the other, based on the code
   point itself and on the name and general category recorded in
   unicode_attributes[ch]; the East Asian width in unicode_width[ch] decides
   between the "ambiguous" class LBP_AI and LBP_ID / LBP_AL.
   Several of the later tests (PR, CM, ID/AI, AL/AI) are suppressed when
   certain earlier classes have already been assigned.
   The tests are guarded by a check that ch has a name in
   unicode_attributes[].  */
5192 get_lbp (unsigned int ch)
5196 if (unicode_attributes[ch].name != NULL)
5198 /* mandatory break */
5199 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5200 || ch == 0x000C /* form feed */
5201 || ch == 0x000B /* line tabulation */
5202 || ch == 0x2028 /* LINE SEPARATOR */
5203 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5204 attr |= 1 << LBP_BK;
/* word joiner */
5206 if (ch == 0x2060 /* WORD JOINER */
5207 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5208 attr |= 1 << LBP_WJ;
5210 /* zero width space */
5211 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5212 attr |= 1 << LBP_ZW;
5214 /* non-breaking (glue) */
5215 if (ch == 0x00A0 /* NO-BREAK SPACE */
5216 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5217 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5218 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5219 || ch == 0x2007 /* FIGURE SPACE */
5220 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5221 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5222 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5223 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5224 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5225 attr |= 1 << LBP_GL;
/* space */
5228 if (ch == 0x0020 /* SPACE */)
5229 attr |= 1 << LBP_SP;
5231 /* break opportunity before and after */
5232 if (ch == 0x2014 /* EM DASH */)
5233 attr |= 1 << LBP_B2;
5235 /* break opportunity after */
5236 if (ch == 0x1680 /* OGHAM SPACE MARK */
5237 || ch == 0x2000 /* EN QUAD */
5238 || ch == 0x2001 /* EM QUAD */
5239 || ch == 0x2002 /* EN SPACE */
5240 || ch == 0x2003 /* EM SPACE */
5241 || ch == 0x2004 /* THREE-PER-EM SPACE */
5242 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5243 || ch == 0x2006 /* SIX-PER-EM SPACE */
5244 || ch == 0x2008 /* PUNCTUATION SPACE */
5245 || ch == 0x2009 /* THIN SPACE */
5246 || ch == 0x200A /* HAIR SPACE */
5247 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5248 || ch == 0x0009 /* tab */
5249 || ch == 0x00AD /* SOFT HYPHEN */
5250 || ch == 0x058A /* ARMENIAN HYPHEN */
5251 || ch == 0x2010 /* HYPHEN */
5252 || ch == 0x2012 /* FIGURE DASH */
5253 || ch == 0x2013 /* EN DASH */
5254 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5255 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5256 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5257 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5258 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5259 || ch == 0x2027 /* HYPHENATION POINT */
5260 || ch == 0x007C /* VERTICAL LINE */
5261 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5262 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5263 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5264 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5265 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5266 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5267 || ch == 0x205A /* TWO DOT PUNCTUATION */
5268 || ch == 0x205B /* FOUR DOT MARK */
5269 || ch == 0x205D /* TRICOLON */
5270 || ch == 0x205E /* VERTICAL FOUR DOTS */
5271 || ch == 0x2E19 /* PALM BRANCH */
5272 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5273 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5274 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5275 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
5276 || ch == 0x2E30 /* RING POINT */
5277 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5278 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5279 || ch == 0x10102 /* AEGEAN CHECK MARK */
5280 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5281 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5282 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5283 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5284 || ch == 0x0964 /* DEVANAGARI DANDA */
5285 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5286 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5287 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5288 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5289 || ch == 0x104B /* MYANMAR SIGN SECTION */
5290 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5291 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5292 || ch == 0x17D4 /* KHMER SIGN KHAN */
5293 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5294 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5295 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5296 || ch == 0xA8CE /* SAURASHTRA DANDA */
5297 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5298 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5299 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5300 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5301 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5302 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5303 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5304 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5305 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5306 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5307 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5308 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5309 || ch == 0x1804 /* MONGOLIAN COLON */
5310 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5311 || ch == 0x1B5A /* BALINESE PANTI */
5312 || ch == 0x1B5B /* BALINESE PAMADA */
5313 || ch == 0x1B5C /* BALINESE WINDU */
5314 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5315 || ch == 0x1B60 /* BALINESE PAMENENG */
5316 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5317 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5318 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5319 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5320 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5321 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5322 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5323 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5324 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5325 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5326 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5327 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5328 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5329 || ch == 0xA60D /* VAI COMMA */
5330 || ch == 0xA60F /* VAI QUESTION MARK */
5331 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5332 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5333 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5334 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5335 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5336 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5337 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5338 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5339 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5340 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5341 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5342 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5343 attr |= 1 << LBP_BA;
5345 /* break opportunity before */
5346 if (ch == 0x00B4 /* ACUTE ACCENT */
5347 || ch == 0x1FFD /* GREEK OXIA */
5348 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5349 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5350 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5351 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5352 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5353 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5354 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5355 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5356 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5357 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5358 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5359 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5360 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5361 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5362 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5363 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5364 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5365 attr |= 1 << LBP_BB;
/* hyphen */
5368 if (ch == 0x002D /* HYPHEN-MINUS */)
5369 attr |= 1 << LBP_HY;
5371 /* contingent break opportunity */
5372 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5373 attr |= 1 << LBP_CB;
5375 /* closing punctuation */
5376 if ((unicode_attributes[ch].category[0] == 'P'
5377 && unicode_attributes[ch].category[1] == 'e')
5378 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5379 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5380 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5381 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5382 || ch == 0xFE50 /* SMALL COMMA */
5383 || ch == 0xFE52 /* SMALL FULL STOP */
5384 || ch == 0xFF0C /* FULLWIDTH COMMA */
5385 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5386 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5387 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5388 attr |= 1 << LBP_CL;
5390 /* exclamation/interrogation */
5391 if (ch == 0x0021 /* EXCLAMATION MARK */
5392 || ch == 0x003F /* QUESTION MARK */
5393 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5394 || ch == 0x061B /* ARABIC SEMICOLON */
5395 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5396 || ch == 0x061F /* ARABIC QUESTION MARK */
5397 || ch == 0x06D4 /* ARABIC FULL STOP */
5398 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5399 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5400 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5401 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5402 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5403 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5404 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5405 || ch == 0x1802 /* MONGOLIAN COMMA */
5406 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5407 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5408 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5409 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5410 || ch == 0x1945 /* LIMBU QUESTION MARK */
5411 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5412 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5413 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5414 || ch == 0x2CFE /* COPTIC FULL STOP */
5415 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5417 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5419 || ch == 0xA60E /* VAI FULL STOP */
5420 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5421 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5422 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5423 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5424 || ch == 0xFE56 /* SMALL QUESTION MARK */
5425 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5426 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5427 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5428 attr |= 1 << LBP_EX;
/* inseparable */
5431 if (ch == 0x2024 /* ONE DOT LEADER */
5432 || ch == 0x2025 /* TWO DOT LEADER */
5433 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5434 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5435 attr |= 1 << LBP_IN;
/* non starter */
5438 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5439 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5440 || ch == 0x203D /* INTERROBANG */
5441 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5442 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5443 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5444 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5445 || ch == 0x301C /* WAVE DASH */
5446 || ch == 0x303C /* MASU MARK */
5447 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5448 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5449 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5450 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5451 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5452 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5453 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5454 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5455 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5456 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5457 || ch == 0xA015 /* YI SYLLABLE WU */
5458 || ch == 0xFE54 /* SMALL SEMICOLON */
5459 || ch == 0xFE55 /* SMALL COLON */
5460 || ch == 0xFF1A /* FULLWIDTH COLON */
5461 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5462 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5463 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5464 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5465 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5466 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5467 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5468 attr |= 1 << LBP_NS;
5470 /* opening punctuation */
5471 if ((unicode_attributes[ch].category[0] == 'P'
5472 && unicode_attributes[ch].category[1] == 's')
5473 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5474 || ch == 0x00BF /* INVERTED QUESTION MARK */
5475 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5476 attr |= 1 << LBP_OP;
5478 /* ambiguous quotation */
5479 if ((unicode_attributes[ch].category[0] == 'P'
5480 && (unicode_attributes[ch].category[1] == 'f'
5481 || unicode_attributes[ch].category[1] == 'i'))
5482 || ch == 0x0022 /* QUOTATION MARK */
5483 || ch == 0x0027 /* APOSTROPHE */
5484 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5485 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5486 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5487 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5488 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5489 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5490 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5491 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5492 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5493 || ch == 0x2E0B /* RAISED SQUARE */)
5494 attr |= 1 << LBP_QU;
5496 /* infix separator (numeric) */
5497 if (ch == 0x002C /* COMMA */
5498 || ch == 0x002E /* FULL STOP */
5499 || ch == 0x003A /* COLON */
5500 || ch == 0x003B /* SEMICOLON */
5501 || ch == 0x037E /* GREEK QUESTION MARK */
5502 || ch == 0x0589 /* ARMENIAN FULL STOP */
5503 || ch == 0x060C /* ARABIC COMMA */
5504 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5505 || ch == 0x07F8 /* NKO COMMA */
5506 || ch == 0x2044 /* FRACTION SLASH */
5507 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5508 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5509 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5510 attr |= 1 << LBP_IS;
/* numeric: decimal digits (category Nd) other than the fullwidth ones,
   plus two Arabic separators */
5513 if ((unicode_attributes[ch].category[0] == 'N'
5514 && unicode_attributes[ch].category[1] == 'd'
5515 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5516 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5517 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5518 attr |= 1 << LBP_NU;
5520 /* postfix (numeric) */
5521 if (ch == 0x0025 /* PERCENT SIGN */
5522 || ch == 0x00A2 /* CENT SIGN */
5523 || ch == 0x00B0 /* DEGREE SIGN */
5524 || ch == 0x060B /* AFGHANI SIGN */
5525 || ch == 0x066A /* ARABIC PERCENT SIGN */
5526 || ch == 0x2030 /* PER MILLE SIGN */
5527 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5528 || ch == 0x2032 /* PRIME */
5529 || ch == 0x2033 /* DOUBLE PRIME */
5530 || ch == 0x2034 /* TRIPLE PRIME */
5531 || ch == 0x2035 /* REVERSED PRIME */
5532 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5533 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5534 || ch == 0x20A7 /* PESETA SIGN */
5535 || ch == 0x2103 /* DEGREE CELSIUS */
5536 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5537 || ch == 0xFDFC /* RIAL SIGN */
5538 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5539 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5540 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5541 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5542 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5543 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5544 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5545 attr |= 1 << LBP_PO;
5547 /* prefix (numeric) */
5548 if ((unicode_attributes[ch].category[0] == 'S'
5549 && unicode_attributes[ch].category[1] == 'c')
5550 || ch == 0x002B /* PLUS SIGN */
5551 || ch == 0x005C /* REVERSE SOLIDUS */
5552 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5553 || ch == 0x2116 /* NUMERO SIGN */
5554 || ch == 0x2212 /* MINUS SIGN */
5555 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* Postfix wins: a character already classified as PO does not also
   become PR.  */
5556 if (!(attr & (1 << LBP_PO)))
5557 attr |= 1 << LBP_PR;
5559 /* symbols allowing breaks */
5560 if (ch == 0x002F /* SOLIDUS */)
5561 attr |= 1 << LBP_SY;
/* Hangul LV syllable: every 28th code point of 0xAC00..0xD7A3 */
5563 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5564 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: the remaining code points of that range */
5566 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5567 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5569 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5570 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5572 if (ch >= 0x1160 && ch <= 0x11A2)
5573 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5575 if (ch >= 0x11A8 && ch <= 0x11F9)
5576 attr |= 1 << LBP_JT;
5578 /* complex context (South East Asian) */
5579 if (((unicode_attributes[ch].category[0] == 'C'
5580 && unicode_attributes[ch].category[1] == 'f')
5581 || (unicode_attributes[ch].category[0] == 'L'
5582 && (unicode_attributes[ch].category[1] == 'm'
5583 || unicode_attributes[ch].category[1] == 'o'))
5584 || (unicode_attributes[ch].category[0] == 'M'
5585 && (unicode_attributes[ch].category[1] == 'c'
5586 || unicode_attributes[ch].category[1] == 'n'))
5587 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5588 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5589 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5590 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5591 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
/* ... and only inside one of the following four code point ranges.  */
5592 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5593 || (ch >= 0x1000 && ch <= 0x109F)
5594 || (ch >= 0x1780 && ch <= 0x17FF)
5595 || (ch >= 0x1950 && ch <= 0x19DF)))
5596 attr |= 1 << LBP_SA;
5598 /* attached characters and combining marks */
5599 if ((unicode_attributes[ch].category[0] == 'M'
5600 && (unicode_attributes[ch].category[1] == 'c'
5601 || unicode_attributes[ch].category[1] == 'e'
5602 || unicode_attributes[ch].category[1] == 'n'))
5603 || (unicode_attributes[ch].category[0] == 'C'
5604 && (unicode_attributes[ch].category[1] == 'c'
5605 || unicode_attributes[ch].category[1] == 'f')))
/* CM is assigned only if none of BK, BA, GL, SA, WJ, ZW has been
   assigned above.  */
5606 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5607 attr |= 1 << LBP_CM;
/* ideographic */
5610 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5611 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5612 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5613 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5614 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5615 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5616 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5617 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5618 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5619 || ch == 0xFE62 /* SMALL PLUS SIGN */
5620 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5621 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5622 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5623 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5624 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5625 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5626 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5627 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5628 || (ch >= 0x3000 && ch <= 0x33FF
5629 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5630 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5631 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5632 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5633 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5634 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5635 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5636 || ch == 0xFE45 /* SESAME DOT */
5637 || ch == 0xFE46 /* WHITE SESAME DOT */
5638 || ch == 0xFE49 /* DASHED OVERLINE */
5639 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5640 || ch == 0xFE4B /* WAVY OVERLINE */
5641 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5642 || ch == 0xFE4D /* DASHED LOW LINE */
5643 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5644 || ch == 0xFE4F /* WAVY LOW LINE */
5645 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5646 || ch == 0xFE58 /* SMALL EM DASH */
5647 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5648 || ch == 0xFE60 /* SMALL AMPERSAND */
5649 || ch == 0xFE61 /* SMALL ASTERISK */
5650 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5651 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5652 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5653 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5654 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5655 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5656 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5657 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5658 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5659 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5660 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5661 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5662 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5663 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5664 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5665 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5666 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5667 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5668 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5669 || ch == 0xFF5E /* FULLWIDTH TILDE */
5670 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5671 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5672 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* Characters already classified as NS or CM are left alone.  */
5673 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5675 /* ambiguous (ideograph) ? */
5676 if ((unicode_width[ch] != NULL
5677 && unicode_width[ch][0] == 'A'
5679 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5680 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5681 attr |= 1 << LBP_AI;
5683 attr |= 1 << LBP_ID;
5686 /* ordinary alphabetic and symbol characters */
5687 if ((unicode_attributes[ch].category[0] == 'L'
5688 && (unicode_attributes[ch].category[1] == 'u'
5689 || unicode_attributes[ch].category[1] == 'l'
5690 || unicode_attributes[ch].category[1] == 't'
5691 || unicode_attributes[ch].category[1] == 'm'
5692 || unicode_attributes[ch].category[1] == 'o'))
5693 || (unicode_attributes[ch].category[0] == 'S'
5694 && (unicode_attributes[ch].category[1] == 'm'
5695 || unicode_attributes[ch].category[1] == 'k'
5696 || unicode_attributes[ch].category[1] == 'o'))
5697 || (unicode_attributes[ch].category[0] == 'N'
5698 && (unicode_attributes[ch].category[1] == 'l'
5699 || unicode_attributes[ch].category[1] == 'o'))
5700 || (unicode_attributes[ch].category[0] == 'P'
5701 && (unicode_attributes[ch].category[1] == 'c'
5702 || unicode_attributes[ch].category[1] == 'd'
5703 || unicode_attributes[ch].category[1] == 'o'))
5704 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5705 || ch == 0x0601 /* ARABIC SIGN SANAH */
5706 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5707 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5708 || ch == 0x06DD /* ARABIC END OF AYAH */
5709 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5710 || ch == 0x2061 /* FUNCTION APPLICATION */
5711 || ch == 0x2062 /* INVISIBLE TIMES */
5712 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5713 || ch == 0x2064 /* INVISIBLE PLUS */)
/* AL (or AI) is a fallback: it is assigned only if none of the classes
   listed in the mask below has been assigned above.  */
5714 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5716 /* ambiguous (alphabetic) ? */
5717 if ((unicode_width[ch] != NULL
5718 && unicode_width[ch][0] == 'A'
5720 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5721 && ch != 0x2022 /* BULLET */
5722 && ch != 0x203E /* OVERLINE */
5723 && ch != 0x2126 /* OHM SIGN */
5724 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5725 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5726 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5727 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5728 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5729 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5730 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5731 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5733 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5734 || ch == 0x00A7 /* SECTION SIGN */
5735 || ch == 0x00A8 /* DIAERESIS */
5736 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5737 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5738 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5739 || ch == 0x00B6 /* PILCROW SIGN */
5740 || ch == 0x00B7 /* MIDDLE DOT */
5741 || ch == 0x00B8 /* CEDILLA */
5742 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5743 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5744 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5745 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5746 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5747 || ch == 0x00BF /* INVERTED QUESTION MARK */
5748 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5749 || ch == 0x00F7 /* DIVISION SIGN */
5750 || ch == 0x02C7 /* CARON */
5751 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5752 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5753 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5754 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5755 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5756 || ch == 0x02D8 /* BREVE */
5757 || ch == 0x02D9 /* DOT ABOVE */
5758 || ch == 0x02DA /* RING ABOVE */
5759 || ch == 0x02DB /* OGONEK */
5760 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5762 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5763 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5764 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5765 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5766 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5767 || ch == 0x2616 /* WHITE SHOGI PIECE */
5768 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5769 attr |= 1 << LBP_AI;
5771 attr |= 1 << LBP_AL;
/* Drop a CM classification that was assigned above.  */
5772 attr &= ~(1 << LBP_CM);
/* unknown */
/* NOTE(review): LBP_XX is 31, so 1 << LBP_XX shifts a signed int 1 into
   the sign bit - confirm this is acceptable on the supported compilers,
   or switch the masks to unsigned.  */
5778 attr |= 1 << LBP_XX;
5783 /* Output the line breaking properties in a human readable format. */
5785 debug_output_lbp (FILE *stream)
5789 for (i = 0; i < 0x110000; i++)
5791 int attr = get_lbp (i);
5792 if (attr != 1 << LBP_XX)
5794 fprintf (stream, "0x%04X", i);
5795 #define PRINT_BIT(attr,bit) \
5796 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5797 PRINT_BIT(attr,LBP_BK);
5798 PRINT_BIT(attr,LBP_CM);
5799 PRINT_BIT(attr,LBP_WJ);
5800 PRINT_BIT(attr,LBP_ZW);
5801 PRINT_BIT(attr,LBP_GL);
5802 PRINT_BIT(attr,LBP_SP);
5803 PRINT_BIT(attr,LBP_B2);
5804 PRINT_BIT(attr,LBP_BA);
5805 PRINT_BIT(attr,LBP_BB);
5806 PRINT_BIT(attr,LBP_HY);
5807 PRINT_BIT(attr,LBP_CB);
5808 PRINT_BIT(attr,LBP_CL);
5809 PRINT_BIT(attr,LBP_EX);
5810 PRINT_BIT(attr,LBP_IN);
5811 PRINT_BIT(attr,LBP_NS);
5812 PRINT_BIT(attr,LBP_OP);
5813 PRINT_BIT(attr,LBP_QU);
5814 PRINT_BIT(attr,LBP_IS);
5815 PRINT_BIT(attr,LBP_NU);
5816 PRINT_BIT(attr,LBP_PO);
5817 PRINT_BIT(attr,LBP_PR);
5818 PRINT_BIT(attr,LBP_SY);
5819 PRINT_BIT(attr,LBP_AI);
5820 PRINT_BIT(attr,LBP_AL);
5821 PRINT_BIT(attr,LBP_H2);
5822 PRINT_BIT(attr,LBP_H3);
5823 PRINT_BIT(attr,LBP_ID);
5824 PRINT_BIT(attr,LBP_JL);
5825 PRINT_BIT(attr,LBP_JV);
5826 PRINT_BIT(attr,LBP_JT);
5827 PRINT_BIT(attr,LBP_SA);
5828 PRINT_BIT(attr,LBP_XX);
5830 fprintf (stream, "\n");
/* Writes the human readable dump produced by debug_output_lbp() to the
   file FILENAME.  If the file cannot be opened or written, a message is
   printed to stderr and the program exits with status 1.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  int failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* The stream is closed only when no write error is pending; either kind
     of failure is fatal.  */
  failed = (ferror (out) != 0);
  if (!failed)
    failed = (fclose (out) != 0);
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
5856 /* The line breaking property from the LineBreak.txt file, indexed by
   code point (0 .. 0x10FFFF).
   Each entry holds a single LBP_* enumeration value, not a bit mask.
   fill_org_lbp() sets every entry to LBP_XX before it reads the file.  */
5857 int unicode_org_lbp[0x110000];
5859 /* Stores in unicode_org_lbp[] the line breaking property from the
5860 LineBreak.txt file. */
5862 fill_org_lbp (const char *linebreak_filename)
5866 char field0[FIELDLEN];
5867 char field1[FIELDLEN];
5868 char field2[FIELDLEN];
5871 for (i = 0; i < 0x110000; i++)
5872 unicode_org_lbp[i] = LBP_XX;
5874 stream = fopen (linebreak_filename, "r");
5877 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
5893 do c = getc (stream); while (c != EOF && c != '\n');
5897 n = getfield (stream, field0, ';');
5898 n += getfield (stream, field1, ' ');
5899 n += getfield (stream, field2, '\n');
5904 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
5908 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
5943 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5944 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5945 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5946 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5949 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5950 field1, linebreak_filename, lineno);
5953 i = strtoul (field0, NULL, 16);
5954 if (strstr (field0, "..") != NULL)
5956 /* Deal with a range. */
5957 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5959 unicode_org_lbp[i] = value;
5963 /* Single character line. */
5964 unicode_org_lbp[i] = value;
5967 if (ferror (stream) || fclose (stream))
5969 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5974 /* Output the line breaking properties in a human readable format. */
/* Dumps unicode_org_lbp[] to STREAM: the code point as "0x%04X" followed by
   " LBP_xx" for the class stored for it, then a newline.
   NOTE(review): this listing omits short lines (declarations, braces and
   possibly a filter on the default class), so exactly which code points
   get a line cannot be confirmed from here.  */
5976 debug_output_org_lbp (FILE *stream)
5980 for (i = 0; i < 0x110000; i++)
5982 int attr = unicode_org_lbp[i];
5985 fprintf (stream, "0x%04X", i);
/* PRINT_BIT compares its first argument for equality with one class
   constant and, on a match, prints that constant's identifier (obtained by
   stringification).  Despite the name, the value tested here is a plain
   class number, not a bit mask.  */
5986 #define PRINT_BIT(attr,bit) \
5987 if (attr == bit) fprintf (stream, " " #bit);
5988 PRINT_BIT(attr,LBP_BK);
5989 PRINT_BIT(attr,LBP_CM);
5990 PRINT_BIT(attr,LBP_WJ);
5991 PRINT_BIT(attr,LBP_ZW);
5992 PRINT_BIT(attr,LBP_GL);
5993 PRINT_BIT(attr,LBP_SP);
5994 PRINT_BIT(attr,LBP_B2);
5995 PRINT_BIT(attr,LBP_BA);
5996 PRINT_BIT(attr,LBP_BB);
5997 PRINT_BIT(attr,LBP_HY);
5998 PRINT_BIT(attr,LBP_CB);
5999 PRINT_BIT(attr,LBP_CL);
6000 PRINT_BIT(attr,LBP_EX);
6001 PRINT_BIT(attr,LBP_IN);
6002 PRINT_BIT(attr,LBP_NS);
6003 PRINT_BIT(attr,LBP_OP);
6004 PRINT_BIT(attr,LBP_QU);
6005 PRINT_BIT(attr,LBP_IS);
6006 PRINT_BIT(attr,LBP_NU);
6007 PRINT_BIT(attr,LBP_PO);
6008 PRINT_BIT(attr,LBP_PR);
6009 PRINT_BIT(attr,LBP_SY);
6010 PRINT_BIT(attr,LBP_AI);
6011 PRINT_BIT(attr,LBP_AL);
6012 PRINT_BIT(attr,LBP_H2);
6013 PRINT_BIT(attr,LBP_H3);
6014 PRINT_BIT(attr,LBP_ID);
6015 PRINT_BIT(attr,LBP_JL);
6016 PRINT_BIT(attr,LBP_JV);
6017 PRINT_BIT(attr,LBP_JT);
6018 PRINT_BIT(attr,LBP_SA);
6019 PRINT_BIT(attr,LBP_XX);
6021 fprintf (stream, "\n");
/* Write the dump produced by debug_output_org_lbp to the file FILENAME,
   which is opened for writing.  A diagnostic goes to stderr when the file
   cannot be opened or when writing/closing fails.
   NOTE(review): the statements following each diagnostic are not part of
   this listing; presumably the program terminates there -- confirm.  */
6027 debug_output_org_lbrk_tables (const char *filename)
6031 stream = fopen (filename, "w");
6034 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6038 debug_output_org_lbp (stream);
/* fclose is only attempted when no stream error is pending; either failure
   is reported as a write error.  */
6040 if (ferror (stream) || fclose (stream))
6042 fprintf (stderr, "error writing to '%s'\n", filename);
6047 /* Construction of sparse 3-level tables. */
/* Parameters for the table type used by output_lbp below: names are built
   on the lbp_table prefix (lbp_table_init, lbp_table_add,
   lbp_table_finalize are called there), elements are unsigned char, and
   DEFAULT is LBP_XX.  xmalloc/xrealloc are mapped to plain malloc/realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   listing -- presumably a template #include follows; confirm.  */
6048 #define TABLE lbp_table
6049 #define ELEMENT unsigned char
6050 #define DEFAULT LBP_XX
6051 #define xmalloc malloc
6052 #define xrealloc realloc
/* Build the 3-level line break property table from get_lbp () for every
   code point and emit it as C source: the header macros, the lbrkprop_t
   typedef and the extern declaration go to STREAM1, the initializer of
   unilbrkprop goes to STREAM2.
   NOTE(review): short lines (declarations, braces, the actions of several
   `if's) are missing from this listing; comments below describe only what
   the listed lines show.  */
6056 output_lbp (FILE *stream1, FILE *stream2)
6060 unsigned int level1_offset, level2_offset, level3_offset;
6064 lbp_table_init (&t);
6066 for (i = 0; i < 0x110000; i++)
6068 int attr = get_lbp (i);
6070 /* Now attr should contain exactly one bit. */
/* The test below is true when attr is zero or has more than one bit set;
   the reaction to that is not visible here.  */
6071 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is LBP_XX are not added to the table.  */
6074 if (attr != 1 << LBP_XX)
6076 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index.  */
6077 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6079 lbp_table_add (&t, i, log2_attr);
6083 lbp_table_finalize (&t);
/* Byte offsets of the three levels inside t.result: five uint32_t header
   words come first, then level 1, then level 2 (level2_size << q entries),
   then level 3.  The assignment targets are on lines not shown here;
   presumably level1_offset, level2_offset and level3_offset in order.  */
6086 5 * sizeof (uint32_t);
6088 5 * sizeof (uint32_t)
6089 + t.level1_size * sizeof (uint32_t);
6091 5 * sizeof (uint32_t)
6092 + t.level1_size * sizeof (uint32_t)
6093 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words are exported as preprocessor constants.  */
6095 for (i = 0; i < 5; i++)
6096 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6097 ((uint32_t *) t.result)[i]);
6098 fprintf (stream1, "\n");
6099 fprintf (stream1, "typedef struct\n");
6100 fprintf (stream1, " {\n");
6101 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6102 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6103 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6104 fprintf (stream1, " }\n");
6105 fprintf (stream1, "lbrkprop_t;\n");
6106 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6108 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6109 fprintf (stream2, "{\n");
6110 fprintf (stream2, " {");
/* Level 1: eight entries per output line.  Each entry is printed either as
   -1 or as the index of a level-2 block, i.e. the byte offset rebased to
   level2_offset and divided by the entry size; the condition selecting -1
   is on a line not shown.  */
6111 if (t.level1_size > 8)
6112 fprintf (stream2, "\n ");
6113 for (i = 0; i < t.level1_size; i++)
6116 if (i > 0 && (i % 8) == 0)
6117 fprintf (stream2, "\n ");
6118 offset = ((uint32_t *) (t.result + level1_offset))[i];
6120 fprintf (stream2, " %5d", -1);
6122 fprintf (stream2, " %5zu",
6123 (offset - level2_offset) / sizeof (uint32_t));
6124 if (i+1 < t.level1_size)
6125 fprintf (stream2, ",");
6127 if (t.level1_size > 8)
6128 fprintf (stream2, "\n ");
6129 fprintf (stream2, " },\n");
6130 fprintf (stream2, " {");
/* Level 2: same layout; entries are rebased to level3_offset and counted
   in level-3 elements (unsigned char).  */
6131 if (t.level2_size << t.q > 8)
6132 fprintf (stream2, "\n ");
6133 for (i = 0; i < t.level2_size << t.q; i++)
6136 if (i > 0 && (i % 8) == 0)
6137 fprintf (stream2, "\n ");
6138 offset = ((uint32_t *) (t.result + level2_offset))[i];
6140 fprintf (stream2, " %5d", -1);
6142 fprintf (stream2, " %5zu",
6143 (offset - level3_offset) / sizeof (unsigned char));
6144 if (i+1 < t.level2_size << t.q)
6145 fprintf (stream2, ",");
6147 if (t.level2_size << t.q > 8)
6148 fprintf (stream2, "\n ");
6149 fprintf (stream2, " },\n");
6150 fprintf (stream2, " {");
/* Level 3: each stored byte is printed symbolically.  CASE maps a class
   constant to its own identifier as a string; the switch using it is on
   lines not shown.  */
6151 if (t.level3_size << t.p > 8)
6152 fprintf (stream2, "\n ");
6153 for (i = 0; i < t.level3_size << t.p; i++)
6155 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6156 const char *value_string;
6159 #define CASE(x) case x: value_string = #x; break;
6196 if (i > 0 && (i % 8) == 0)
6197 fprintf (stream2, "\n ");
6198 fprintf (stream2, " %s%s", value_string,
6199 (i+1 < t.level3_size << t.p ? "," : ""));
6201 if (t.level3_size << t.p > 8)
6202 fprintf (stream2, "\n ");
6203 fprintf (stream2, " }\n");
6204 fprintf (stream2, "};\n");
/* Create the two generated line break files: FILENAME1 and FILENAME2 are
   opened for writing, both receive the same banner and license comment,
   and then output_lbp fills them (first file as its stream1, second as its
   stream2).  VERSION is announced as the Unicode version in the banner; it
   is presumably the argument of the "%s" below (the argument line is not
   shown).  Open and write/close failures are reported on stderr.
   NOTE(review): the banner names the generator "gen-lbrk", whereas the
   other writers in this file say "gen-uni-tables" -- looks stale; confirm
   before changing generated output.  */
6208 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6210 const char *filenames[2];
6214 filenames[0] = filename1;
6215 filenames[1] = filename2;
6217 for (i = 0; i < 2; i++)
6219 streams[i] = fopen (filenames[i], "w");
6220 if (streams[i] == NULL)
6222 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical preamble for both files.  */
6227 for (i = 0; i < 2; i++)
6229 FILE *stream = streams[i];
6231 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6232 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6233 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6235 fprintf (stream, "\n");
6237 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6238 still carries the GPL header), and it's gnulib-tool which replaces the
6239 GPL header with an LGPL header. */
6240 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6241 fprintf (stream, "\n");
6242 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6243 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6244 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6245 fprintf (stream, " (at your option) any later version.\n");
6246 fprintf (stream, "\n");
6247 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6248 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6249 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6250 fprintf (stream, " GNU General Public License for more details.\n");
6251 fprintf (stream, "\n");
6252 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6253 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6254 fprintf (stream, "\n");
6257 output_lbp (streams[0], streams[1]);
/* Individual fprintf results are not checked; errors are detected here,
   once per file, via ferror and the result of fclose.  */
6259 for (i = 0; i < 2; i++)
6261 if (ferror (streams[i]) || fclose (streams[i]))
6263 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6269 /* ========================================================================= */
6271 /* Word break property. */
6273 /* Possible values of the Word_Break property. */
6288 WBP_EXTENDNUMLET = 7
6291 /* Returns the word breaking property for ch, as a bit mask. */
/* Each applicable class contributes the bit (1 << WBP_xxx) to the result.
   NOTE(review): several condition lines and all braces are missing from
   this listing, so the notes below cover only the visible tests; where an
   `attr |=' line appears without its condition, the condition is simply
   not shown.  */
6293 get_wbp (unsigned int ch)
/* Classification appears to be attempted only for code points that have a
   name in unicode_attributes[] -- extent of this guard not visible.  */
6297 if (unicode_attributes[ch].name != NULL)
6300 attr |= 1 << WBP_CR;
6303 attr |= 1 << WBP_LF;
/* Newline: VT, FF, LINE SEPARATOR, PARAGRAPH SEPARATOR (one more operand
   line of this condition is not shown).  */
6305 if (ch == 0x000B || ch == 0x000C
6307 || ch == 0x2028 || ch == 0x2029)
6308 attr |= 1 << WBP_NEWLINE;
/* Extend: Grapheme_Extend bit set, or general category Mc.  */
6310 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6311 || (unicode_attributes[ch].category != NULL
6312 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6313 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200C and U+200D.  */
6315 if (unicode_attributes[ch].category != NULL
6316 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6317 && ch != 0x200C && ch != 0x200D)
6318 attr |= 1 << WBP_FORMAT;
/* Katakana: script "Katakana" plus a list of individual code points (the
   last operand line is not shown).  */
6320 if ((unicode_scripts[ch] < numscripts
6321 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6322 || (ch >= 0x3031 && ch <= 0x3035)
6323 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6325 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic (an alternative on a hidden line may widen this),
   and not ideographic, not already Katakana, not line break class SA, not
   script "Hiragana", not already Extend.  Note that this depends on the
   Katakana and Extend bits computed above.  */
6327 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6329 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6330 && (attr & (1 << WBP_KATAKANA)) == 0
6331 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6332 && !(unicode_scripts[ch] < numscripts
6333 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6334 && (attr & (1 << WBP_EXTEND)) == 0)
6335 attr |= 1 << WBP_ALETTER;
/* MidNumLet and MidLetter are fixed code point lists.  */
6337 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6338 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6339 attr |= 1 << WBP_MIDNUMLET;
6341 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6342 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6343 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line break class IS or one of the listed code points (one
   operand line hidden), minus U+003A, U+FE13 and U+002E, which are
   classified by the two lists above.  */
6345 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6346 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6348 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6349 attr |= 1 << WBP_MIDNUM;
/* Numeric: line break class NU (the rest of the condition is hidden).  */
6351 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6353 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
6355 if (unicode_attributes[ch].category != NULL
6356 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6357 attr |= 1 << WBP_EXTENDNUMLET;
/* Fallback class; presumably applied only when no bit was set above --
   the guarding condition is not shown, confirm.  */
6362 attr |= 1 << WBP_OTHER;
6367 /* Output the word break property in a human readable format. */
/* Writes to STREAM one line per code point whose get_wbp () mask differs
   from exactly (1 << WBP_OTHER): the code point as "0x%04X" followed by
   the name of every class whose bit is set.  Bits are tested
   independently, so a code point carrying several bits shows all of
   them.  */
6369 debug_output_wbp (FILE *stream)
6373 for (i = 0; i < 0x110000; i++)
6375 int attr = get_wbp (i);
6376 if (attr != 1 << WBP_OTHER)
6378 fprintf (stream, "0x%04X", i);
6379 if (attr & (1 << WBP_CR))
6380 fprintf (stream, " CR");
6381 if (attr & (1 << WBP_LF))
6382 fprintf (stream, " LF");
6383 if (attr & (1 << WBP_NEWLINE))
6384 fprintf (stream, " Newline");
6385 if (attr & (1 << WBP_EXTEND))
6386 fprintf (stream, " Extend");
6387 if (attr & (1 << WBP_FORMAT))
6388 fprintf (stream, " Format");
6389 if (attr & (1 << WBP_KATAKANA))
6390 fprintf (stream, " Katakana");
6391 if (attr & (1 << WBP_ALETTER))
6392 fprintf (stream, " ALetter");
6393 if (attr & (1 << WBP_MIDNUMLET))
6394 fprintf (stream, " MidNumLet");
6395 if (attr & (1 << WBP_MIDLETTER))
6396 fprintf (stream, " MidLetter");
6397 if (attr & (1 << WBP_MIDNUM))
6398 fprintf (stream, " MidNum");
6399 if (attr & (1 << WBP_NUMERIC))
6400 fprintf (stream, " Numeric");
6401 if (attr & (1 << WBP_EXTENDNUMLET))
6402 fprintf (stream, " ExtendNumLet");
6403 fprintf (stream, "\n");
/* Write the dump produced by debug_output_wbp to the file FILENAME, which
   is opened for writing.  Failure to open, and failure while writing or
   closing, are each reported on stderr.
   NOTE(review): what follows each diagnostic is not part of this listing;
   presumably the program terminates -- confirm.  */
6409 debug_output_wbrk_tables (const char *filename)
6413 stream = fopen (filename, "w");
6416 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6420 debug_output_wbp (stream);
6422 if (ferror (stream) || fclose (stream))
6424 fprintf (stderr, "error writing to '%s'\n", filename);
6429 /* The word break property from the WordBreakProperty.txt file. */
6430 int unicode_org_wbp[0x110000];
6432 /* Stores in unicode_org_wbp[] the word break property from the
6433 WordBreakProperty.txt file. */
/* WORDBREAKPROPERTY_FILENAME is opened for reading.  Every entry of
   unicode_org_wbp[] is first reset to WBP_OTHER; each data line then
   overrides the entries for one code point or one inclusive range.
   NOTE(review): loop header, `break'/`continue'/exit statements and short
   declarations are missing from this listing; the reactions to the tests
   below are therefore not visible.  */
6435 fill_org_wbp (const char *wordbreakproperty_filename)
6440 for (i = 0; i < 0x110000; i++)
6441 unicode_org_wbp[i] = WBP_OTHER;
6443 stream = fopen (wordbreakproperty_filename, "r");
6446 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6453 unsigned int i1, i2;
6454 char padding[200+1];
6455 char propname[200+1];
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes following whitespace.  A result below 1 means nothing was
   read (end of input, or an empty line at the read position).  */
6458 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
6461 if (buf[0] == '\0' || buf[0] == '#')
/* Try "XXXX..YYYY ; Name" first, then the single code point form
   "XXXX ; Name".  The name ends at the first space.  */
6464 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6466 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6468 fprintf (stderr, "parse error in '%s'\n",
6469 wordbreakproperty_filename);
/* PROP expands to one link of an if/else-if chain mapping the property
   name to its WBP_ value; the diagnostic after the chain is its final else
   branch.  */
6474 #define PROP(name,value) \
6475 if (strcmp (propname, name) == 0) propvalue = value; else
6478 PROP ("Newline", WBP_NEWLINE)
6479 PROP ("Extend", WBP_EXTEND)
6480 PROP ("Format", WBP_FORMAT)
6481 PROP ("Katakana", WBP_KATAKANA)
6482 PROP ("ALetter", WBP_ALETTER)
6483 PROP ("MidNumLet", WBP_MIDNUMLET)
6484 PROP ("MidLetter", WBP_MIDLETTER)
6485 PROP ("MidNum", WBP_MIDNUM)
6486 PROP ("Numeric", WBP_NUMERIC)
6487 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6490 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6491 wordbreakproperty_filename);
/* Sanity check: ordered range inside the Unicode code space.  */
6494 if (!(i1 <= i2 && i2 < 0x110000))
6497 for (i = i1; i <= i2; i++)
6498 unicode_org_wbp[i] = propvalue;
6501 if (ferror (stream) || fclose (stream))
6503 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6508 /* Output the word break property in a human readable format. */
/* Writes to STREAM one line per code point whose unicode_org_wbp[] entry
   is not WBP_OTHER: the code point as "0x%04X", then the property name.
   Unlike debug_output_wbp, the stored value is a plain class number, so
   exactly one name (or " ??") is printed.  */
6510 debug_output_org_wbp (FILE *stream)
6514 for (i = 0; i < 0x110000; i++)
6516 int propvalue = unicode_org_wbp[i];
6517 if (propvalue != WBP_OTHER)
6519 fprintf (stream, "0x%04X", i);
/* PROP expands to one link of an if/else-if chain; the " ??" output after
   the chain is its final else branch, reached for a value that matches
   none of the listed classes.  */
6520 #define PROP(name,value) \
6521 if (propvalue == value) fprintf (stream, " " name); else
6524 PROP ("Newline", WBP_NEWLINE)
6525 PROP ("Extend", WBP_EXTEND)
6526 PROP ("Format", WBP_FORMAT)
6527 PROP ("Katakana", WBP_KATAKANA)
6528 PROP ("ALetter", WBP_ALETTER)
6529 PROP ("MidNumLet", WBP_MIDNUMLET)
6530 PROP ("MidLetter", WBP_MIDLETTER)
6531 PROP ("MidNum", WBP_MIDNUM)
6532 PROP ("Numeric", WBP_NUMERIC)
6533 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6535 fprintf (stream, " ??");
6536 fprintf (stream, "\n");
/* Write the dump produced by debug_output_org_wbp to the file FILENAME,
   which is opened for writing.  Failure to open, and failure while writing
   or closing, are each reported on stderr.
   NOTE(review): what follows each diagnostic is not part of this listing;
   presumably the program terminates -- confirm.  */
6542 debug_output_org_wbrk_tables (const char *filename)
6546 stream = fopen (filename, "w");
6549 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6553 debug_output_org_wbp (stream);
6555 if (ferror (stream) || fclose (stream))
6557 fprintf (stderr, "error writing to '%s'\n", filename);
6562 /* Construction of sparse 3-level tables. */
/* Parameters for the table type used by output_wbp below: names are built
   on the wbp_table prefix (wbp_table_init, wbp_table_add,
   wbp_table_finalize are called there), elements are unsigned char, and
   DEFAULT is WBP_OTHER.  xmalloc/xrealloc are mapped to plain
   malloc/realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   listing -- presumably a template #include follows; confirm.  */
6563 #define TABLE wbp_table
6564 #define ELEMENT unsigned char
6565 #define DEFAULT WBP_OTHER
6566 #define xmalloc malloc
6567 #define xrealloc realloc
/* Build the 3-level word break property table from get_wbp () for every
   code point and write it to STREAM as C source: header macros, the
   wbrkprop_t typedef, and the static initialized object uniwbrkprop.
   NOTE(review): short lines (declarations, braces, the actions of several
   `if's) are missing from this listing; comments below describe only what
   the listed lines show.  */
6571 output_wbp (FILE *stream)
6575 unsigned int level1_offset, level2_offset, level3_offset;
6579 wbp_table_init (&t);
6581 for (i = 0; i < 0x110000; i++)
6583 int attr = get_wbp (i);
6585 /* Now attr should contain exactly one bit. */
/* True when attr is zero or has more than one bit set; the reaction is not
   visible here.  */
6586 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is WBP_OTHER are not added to the table.  */
6589 if (attr != 1 << WBP_OTHER)
6591 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index.  */
6592 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6594 wbp_table_add (&t, i, log2_attr);
6598 wbp_table_finalize (&t);
/* Byte offsets of the three levels inside t.result: five uint32_t header
   words, then level 1, then level 2 (level2_size << q entries), then
   level 3.  The assignment targets are on lines not shown; presumably
   level1_offset, level2_offset and level3_offset in order.  */
6601 5 * sizeof (uint32_t);
6603 5 * sizeof (uint32_t)
6604 + t.level1_size * sizeof (uint32_t);
6606 5 * sizeof (uint32_t)
6607 + t.level1_size * sizeof (uint32_t)
6608 + (t.level2_size << t.q) * sizeof (uint32_t);
6610 for (i = 0; i < 5; i++)
6611 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6612 ((uint32_t *) t.result)[i]);
6613 fprintf (stream, "\n");
6614 fprintf (stream, "typedef struct\n");
6615 fprintf (stream, " {\n");
6616 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6617 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6618 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6619 fprintf (stream, " }\n");
6620 fprintf (stream, "wbrkprop_t;\n");
6621 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6622 fprintf (stream, "{\n");
6623 fprintf (stream, " {");
/* Level 1: eight entries per output line, each printed either as -1 or as
   the index of a level-2 block (byte offset rebased to level2_offset and
   divided by the entry size); the condition selecting -1 is on a line not
   shown.  */
6624 if (t.level1_size > 8)
6625 fprintf (stream, "\n ");
6626 for (i = 0; i < t.level1_size; i++)
6629 if (i > 0 && (i % 8) == 0)
6630 fprintf (stream, "\n ");
6631 offset = ((uint32_t *) (t.result + level1_offset))[i];
6633 fprintf (stream, " %5d", -1);
6635 fprintf (stream, " %5zu",
6636 (offset - level2_offset) / sizeof (uint32_t));
6637 if (i+1 < t.level1_size)
6638 fprintf (stream, ",");
6640 if (t.level1_size > 8)
6641 fprintf (stream, "\n ");
6642 fprintf (stream, " },\n");
6643 fprintf (stream, " {");
/* Level 2: same layout; entries are rebased to level3_offset and counted
   in level-3 elements (unsigned char).  */
6644 if (t.level2_size << t.q > 8)
6645 fprintf (stream, "\n ");
6646 for (i = 0; i < t.level2_size << t.q; i++)
6649 if (i > 0 && (i % 8) == 0)
6650 fprintf (stream, "\n ");
6651 offset = ((uint32_t *) (t.result + level2_offset))[i];
6653 fprintf (stream, " %5d", -1);
6655 fprintf (stream, " %5zu",
6656 (offset - level3_offset) / sizeof (unsigned char));
6657 if (i+1 < t.level2_size << t.q)
6658 fprintf (stream, ",");
6660 if (t.level2_size << t.q > 8)
6661 fprintf (stream, "\n ");
6662 fprintf (stream, " },\n");
6663 fprintf (stream, " {");
/* Level 3: printed symbolically, four entries per output line (the WBP_
   names are longer than the numeric entries above).  CASE maps a class
   constant to its own identifier as a string; the enclosing switch and
   most of its rows are on lines not shown.  */
6664 if (t.level3_size << t.p > 4)
6665 fprintf (stream, "\n ");
6666 for (i = 0; i < t.level3_size << t.p; i++)
6668 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6669 const char *value_string;
6672 #define CASE(x) case x: value_string = #x; break;
6681 CASE(WBP_MIDNUMLET);
6682 CASE(WBP_MIDLETTER);
6685 CASE(WBP_EXTENDNUMLET);
6690 if (i > 0 && (i % 4) == 0)
6691 fprintf (stream, "\n ");
6692 fprintf (stream, " %s%s", value_string,
6693 (i+1 < t.level3_size << t.p ? "," : ""));
6695 if (t.level3_size << t.p > 4)
6696 fprintf (stream, "\n ");
6697 fprintf (stream, " }\n");
6698 fprintf (stream, "};\n");
/* Create the generated word break property file.

   FILENAME is opened for writing and receives, in order: the
   "DO NOT EDIT" banner, a one-line description, the generator/Unicode
   VERSION line, the license comment, and finally the tables written by
   output_wbp.

   Failure to open the file, or any error detected on the stream when it
   is closed, is reported on stderr and terminates the program with
   status 1.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n",
      "\n",
      " This program is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published by\n",
      " the Free Software Foundation; either version 3 of the License, or\n",
      " (at your option) any later version.\n",
      "\n",
      " This program is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n",
      "\n"
    };
  FILE *stream;
  size_t i;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file holds the word break tables; the description used to be a
     verbatim copy of the one in the line break file.  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  for (i = 0; i < sizeof (license_lines) / sizeof (license_lines[0]); i++)
    fputs (license_lines[i], stream);

  output_wbp (stream);

  /* Individual write results are not checked above; any failure leaves the
     stream's error indicator set and is caught here, as is a failing
     fclose.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6747 /* ========================================================================= */
6749 /* Grapheme break property. */
6751 /* Possible values of the Grapheme_Cluster_Break property. */
6760 GBP_SPACINGMARK = 6,
6768 /* Construction of sparse 3-level tables. */
/* Parameters for the table type used by output_gbp_table below: names are
   built on the gbp_table prefix (gbp_table_init, gbp_table_add,
   gbp_table_finalize are called there), elements are unsigned char, and
   DEFAULT is GBP_OTHER.  xmalloc/xrealloc are mapped to plain
   malloc/realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   listing -- presumably a template #include follows; confirm.  */
6769 #define TABLE gbp_table
6770 #define ELEMENT unsigned char
6771 #define DEFAULT GBP_OTHER
6772 #define xmalloc malloc
6773 #define xrealloc realloc
6776 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* One entry per code point; filled by fill_org_gbp and read by
   output_gbp_table.  */
6777 int unicode_org_gbp[0x110000];
6779 /* Output the per-character grapheme break property table. */
/* FILENAME is opened for writing and receives a banner (VERSION is
   presumably the argument of its "%s"; that line is not shown), the header
   macros, and the static object unigbrkprop built from unicode_org_gbp[].
   Unlike the line/word break writers, every code point is added to the
   table unconditionally, and level 3 is emitted packed, two 4-bit values
   per byte.
   NOTE(review): short lines (declarations, braces, the actions of several
   `if's) are missing from this listing.  */
6781 output_gbp_table (const char *filename, const char *version)
6786 unsigned int level1_offset, level2_offset, level3_offset;
6788 stream = fopen (filename, "w");
6791 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6795 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6796 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
6797 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
6802 gbp_table_init (&t);
6804 for (ch = 0; ch < 0x110000; ch++)
6805 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
6807 gbp_table_finalize (&t);
6809 /* Offsets in t.result, in memory of this process. */
/* Five uint32_t header words come first, then level 1, then level 2
   (level2_size << q entries), then level 3.  The assignment targets are on
   lines not shown; presumably level1_offset, level2_offset and
   level3_offset in order.  */
6811 5 * sizeof (uint32_t);
6813 5 * sizeof (uint32_t)
6814 + t.level1_size * sizeof (uint32_t);
6816 5 * sizeof (uint32_t)
6817 + t.level1_size * sizeof (uint32_t)
6818 + (t.level2_size << t.q) * sizeof (uint32_t);
6820 for (i = 0; i < 5; i++)
6821 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
6822 ((uint32_t *) t.result)[i]);
6823 fprintf (stream, "static const\n");
6824 fprintf (stream, "struct\n");
6825 fprintf (stream, " {\n");
6826 fprintf (stream, " int level1[%zu];\n", t.level1_size);
/* level2 is declared `short' in the generated code, and level3 has half as
   many bytes as there are elements because of the nibble packing.  */
6827 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
6828 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
6829 t.level3_size, t.p);
6830 fprintf (stream, " }\n");
6831 fprintf (stream, "unigbrkprop =\n");
6832 fprintf (stream, "{\n");
6833 fprintf (stream, " {");
/* Level 1: eight entries per output line, each printed either as -1 or as
   the index of a level-2 block; the condition selecting -1 is on a line
   not shown.  */
6834 if (t.level1_size > 8)
6835 fprintf (stream, "\n ");
6836 for (i = 0; i < t.level1_size; i++)
6839 if (i > 0 && (i % 8) == 0)
6840 fprintf (stream, "\n ");
6841 offset = ((uint32_t *) (t.result + level1_offset))[i];
6843 fprintf (stream, " %5d", -1);
6845 fprintf (stream, " %5zu",
6846 (offset - level2_offset) / sizeof (uint32_t));
6847 if (i+1 < t.level1_size)
6848 fprintf (stream, ",");
6850 if (t.level1_size > 8)
6851 fprintf (stream, "\n ");
6852 fprintf (stream, " },\n");
6853 fprintf (stream, " {");
/* Level 2: offsets are halved so that they index the packed level-3 byte
   array rather than individual elements.  */
6854 if (t.level2_size << t.q > 8)
6855 fprintf (stream, "\n ");
6856 for (i = 0; i < t.level2_size << t.q; i++)
6859 if (i > 0 && (i % 8) == 0)
6860 fprintf (stream, "\n ");
6861 offset = ((uint32_t *) (t.result + level2_offset))[i];
6863 fprintf (stream, " %5d", -1);
6865 fprintf (stream, " %5zu",
6866 (offset - level3_offset) / sizeof (uint8_t) / 2);
6867 if (i+1 < t.level2_size << t.q)
6868 fprintf (stream, ",");
6870 if (t.level2_size << t.q > 8)
6871 fprintf (stream, "\n ");
6872 fprintf (stream, " },\n");
6873 fprintf (stream, " {");
/* Level 3: consecutive elements 2*i and 2*i+1 are combined into one byte,
   the even element in the low nibble and the odd one in the high nibble.
   This assumes every stored value fits in 4 bits.  */
6874 if (t.level3_size << t.p > 8)
6875 fprintf (stream, "\n ");
6876 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
6878 unsigned char *p = (unsigned char *) (t.result + level3_offset);
6879 unsigned char value0 = p[i * 2];
6880 unsigned char value1 = p[i * 2 + 1];
6881 if (i > 0 && (i % 8) == 0)
6882 fprintf (stream, "\n ");
6883 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
6884 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
6886 if (t.level3_size << t.p > 8)
6887 fprintf (stream, "\n ");
6888 fprintf (stream, " }\n");
6889 fprintf (stream, "};\n");
6891 if (ferror (stream) || fclose (stream))
6893 fprintf (stderr, "error writing to '%s'\n", filename);
6898 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
6899 GraphemeBreakProperty.txt file. */
/* GRAPHEMEBREAKPROPERTY_FILENAME is opened for reading.  Every entry of
   unicode_org_gbp[] is first reset to GBP_OTHER; each data line then
   overrides the entries for one code point or one inclusive range.
   NOTE(review): loop header, `break'/`continue'/exit statements, short
   declarations and the maintenance of `lineno' are missing from this
   listing; the reactions to the tests below are therefore not visible.  */
6901 fill_org_gbp (const char *graphemebreakproperty_filename)
6907 for (i = 0; i < 0x110000; i++)
6908 unicode_org_gbp[i] = GBP_OTHER;
6910 stream = fopen (graphemebreakproperty_filename, "r");
6913 fprintf (stderr, "error during fopen of '%s'\n",
6914 graphemebreakproperty_filename);
6921 unsigned int i1, i2;
6922 char padding[200+1];
6923 char propname[200+1];
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes following whitespace.  A result below 1 means nothing was
   read.  */
6927 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
6930 if (buf[0] == '\0' || buf[0] == '#')
/* Try "XXXX..YYYY ; Name" first, then the single code point form
   "XXXX ; Name".  The name ends at the first space.  */
6933 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6935 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6937 fprintf (stderr, "parse error in '%s'\n",
6938 graphemebreakproperty_filename);
/* PROP expands to one link of an if/else-if chain mapping the property
   name to its GBP_ value; the diagnostic after the chain is its final else
   branch.  Further rows of the chain are on lines not shown.  */
6943 #define PROP(name,value) \
6944 if (strcmp (propname, name) == 0) propvalue = value; else
6947 PROP ("Control", GBP_CONTROL)
6948 PROP ("Extend", GBP_EXTEND)
6949 PROP ("Prepend", GBP_PREPEND)
6950 PROP ("SpacingMark", GBP_SPACINGMARK)
6955 PROP ("LVT", GBP_LVT)
6958 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
6959 graphemebreakproperty_filename, lineno);
/* Sanity check: ordered range inside the Unicode code space.  */
6962 if (!(i1 <= i2 && i2 < 0x110000))
6965 for (i = i1; i <= i2; i++)
6966 unicode_org_gbp[i] = propvalue;
6968 if (ferror (stream) || fclose (stream))
6970 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
6975 /* ========================================================================= */
6977 /* Maximum number of characters into which a single Unicode character can be
6979 #define MAX_DECOMP_LENGTH 18
/* Decomposition type tags, as returned by get_decomposition.  Apart from
   the canonical type, each corresponds to a <tag> that may prefix the
   decomposition field.  output_decomposition stores the value shifted left
   by 18 underneath a flag in bit 23, so the values must stay below 32.
   NOTE(review): the enum header line is not part of this listing, so the
   numeric value of the first enumerator is not visible here.  */
6983 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
6984 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
6985 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
6986 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
6987 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
6988 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
6989 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
6990 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
6991 UC_DECOMP_SUPER, /* <super> A superscript form. */
6992 UC_DECOMP_SUB, /* <sub> A subscript form. */
6993 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
6994 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
6995 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
6996 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
6997 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
6998 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
6999 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
7002 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
7003 decompositions). Return the type, or -1 for none. */
7005 get_decomposition (unsigned int ch,
7006 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
7008 const char *decomposition = unicode_attributes[ch].decomposition;
7010 if (decomposition != NULL && decomposition[0] != '\0')
7012 int type = UC_DECOMP_CANONICAL;
7013 unsigned int length;
7016 if (decomposition[0] == '<')
7021 rangle = strchr (decomposition + 1, '>');
7024 typelen = rangle + 1 - decomposition;
7025 #define TYPE(t1,t2) \
7026 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
7029 TYPE ("<font>", UC_DECOMP_FONT)
7030 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
7031 TYPE ("<initial>", UC_DECOMP_INITIAL)
7032 TYPE ("<medial>", UC_DECOMP_MEDIAL)
7033 TYPE ("<final>", UC_DECOMP_FINAL)
7034 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
7035 TYPE ("<circle>", UC_DECOMP_CIRCLE)
7036 TYPE ("<super>", UC_DECOMP_SUPER)
7037 TYPE ("<sub>", UC_DECOMP_SUB)
7038 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
7039 TYPE ("<wide>", UC_DECOMP_WIDE)
7040 TYPE ("<narrow>", UC_DECOMP_NARROW)
7041 TYPE ("<small>", UC_DECOMP_SMALL)
7042 TYPE ("<square>", UC_DECOMP_SQUARE)
7043 TYPE ("<fraction>", UC_DECOMP_FRACTION)
7044 TYPE ("<compat>", UC_DECOMP_COMPAT)
7046 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
7050 decomposition = rangle + 1;
7051 if (decomposition[0] == ' ')
7054 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
7056 decomposed[length] = strtoul (decomposition, &endptr, 16);
7057 if (endptr == decomposition)
7059 decomposition = endptr;
7060 if (decomposition[0] == ' ')
7063 if (*decomposition != '\0')
7064 /* MAX_DECOMP_LENGTH is too small. */
7074 /* Construction of sparse 3-level tables. */
/* Parameters for the table type used by output_decomposition below: names
   are built on the decomp_table prefix (struct decomp_table,
   decomp_table_init, decomp_table_add, decomp_table_finalize are used
   there), elements are uint16_t, and DEFAULT is the all-ones value, which
   output_decomposition prints as -1.  xmalloc/xrealloc are mapped to plain
   malloc/realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   listing -- presumably a template #include follows; confirm.  */
7075 #define TABLE decomp_table
7076 #define ELEMENT uint16_t
7077 #define DEFAULT (uint16_t)(-1)
7078 #define xmalloc malloc
7079 #define xrealloc realloc
/* Emit the decomposition data as two C objects.  STREAM1 receives the
   declarations (extern byte array, header macros, decomp_index_table_t
   typedef, extern index table); STREAM2 receives the definitions: first
   the byte array gl_uninorm_decomp_chars_table, written while scanning all
   code points, then the 3-level index table
   gl_uninorm_decomp_index_table.
   NOTE(review): short lines (declarations, braces, several `if'
   conditions/actions and the updates of `offset') are missing from this
   listing; comments below describe only what the listed lines show.  */
7083 output_decomposition (FILE *stream1, FILE *stream2)
7085 struct decomp_table t;
7086 unsigned int level1_offset, level2_offset, level3_offset;
7087 unsigned int offset;
7093 decomp_table_init (&t);
7095 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
7096 fprintf (stream1, "\n");
7097 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
7100 for (ch = 0; ch < 0x110000; ch++)
7102 unsigned int length;
7103 unsigned int decomposed[MAX_DECOMP_LENGTH];
7104 int type = get_decomposition (ch, &length, decomposed);
/* The index entry has 16 bits: bit 15 is set for a non-canonical
   decomposition, the low 15 bits hold `offset', which therefore must stay
   below 1 << 15 (reaction to a violation not shown).  */
7108 if (!(offset < (1 << 15)))
7110 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
7112 /* Produce length 3-bytes entries. */
7114 /* We would need a special representation of zero-length entries. */
7116 for (i = 0; i < length; i++)
7119 fprintf (stream2, ",");
7120 if ((offset % 4) == 0)
7121 fprintf (stream2, "\n ");
/* Each code point becomes three bytes, most significant first: bit 23
   flags that another entry follows, bits 18 and up of the first entry
   carry the decomposition type, and the low 18 bits are the code point
   (hence the bound tested here).  */
7122 if (!(decomposed[i] < (1 << 18)))
7124 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
7125 (((i+1 < length ? (1 << 23) : 0)
7126 | (i == 0 ? (type << 18) : 0)
7127 | decomposed[i]) >> 16) & 0xff,
7128 (decomposed[i] >> 8) & 0xff,
7129 decomposed[i] & 0xff);
7135 fprintf (stream2, "\n};\n");
7136 fprintf (stream2, "\n");
7138 decomp_table_finalize (&t);
/* Byte offsets of the three levels inside t.result: five uint32_t header
   words, then level 1, then level 2 (level2_size << q entries), then
   level 3.  The assignment targets are on lines not shown; presumably
   level1_offset, level2_offset and level3_offset in order.  */
7141 5 * sizeof (uint32_t);
7143 5 * sizeof (uint32_t)
7144 + t.level1_size * sizeof (uint32_t);
7146 5 * sizeof (uint32_t)
7147 + t.level1_size * sizeof (uint32_t)
7148 + (t.level2_size << t.q) * sizeof (uint32_t);
7150 for (i = 0; i < 5; i++)
7151 fprintf (stream1, "#define decomp_header_%d %d\n", i,
7152 ((uint32_t *) t.result)[i]);
7153 fprintf (stream1, "\n");
7154 fprintf (stream1, "typedef struct\n");
7155 fprintf (stream1, " {\n");
7156 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7157 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7158 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
7159 fprintf (stream1, " }\n");
7160 fprintf (stream1, "decomp_index_table_t;\n");
7161 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
7162 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
7163 fprintf (stream2, "{\n");
7164 fprintf (stream2, " {");
/* Level 1: eight entries per output line, each printed either as -1 or as
   the index of a level-2 block; the condition selecting -1 is on a line
   not shown.  Here `offset' is reused for the table offsets.  */
7165 if (t.level1_size > 8)
7166 fprintf (stream2, "\n ");
7167 for (i = 0; i < t.level1_size; i++)
7170 if (i > 0 && (i % 8) == 0)
7171 fprintf (stream2, "\n ");
7172 offset = ((uint32_t *) (t.result + level1_offset))[i];
7174 fprintf (stream2, " %5d", -1);
7176 fprintf (stream2, " %5zu",
7177 (offset - level2_offset) / sizeof (uint32_t));
7178 if (i+1 < t.level1_size)
7179 fprintf (stream2, ",");
7181 if (t.level1_size > 8)
7182 fprintf (stream2, "\n ");
7183 fprintf (stream2, " },\n");
7184 fprintf (stream2, " {");
/* Level 2: entries are rebased to level3_offset and counted in level-3
   elements (uint16_t).  */
7185 if (t.level2_size << t.q > 8)
7186 fprintf (stream2, "\n ");
7187 for (i = 0; i < t.level2_size << t.q; i++)
7190 if (i > 0 && (i % 8) == 0)
7191 fprintf (stream2, "\n ");
7192 offset = ((uint32_t *) (t.result + level2_offset))[i];
7194 fprintf (stream2, " %5d", -1);
7196 fprintf (stream2, " %5zu",
7197 (offset - level3_offset) / sizeof (uint16_t));
7198 if (i+1 < t.level2_size << t.q)
7199 fprintf (stream2, ",");
7201 if (t.level2_size << t.q > 8)
7202 fprintf (stream2, "\n ");
7203 fprintf (stream2, " },\n");
7204 fprintf (stream2, " {");
/* Level 3: the 16-bit index entries themselves; the all-ones default is
   written as -1.  */
7205 if (t.level3_size << t.p > 8)
7206 fprintf (stream2, "\n ");
7207 for (i = 0; i < t.level3_size << t.p; i++)
7209 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
7210 if (i > 0 && (i % 8) == 0)
7211 fprintf (stream2, "\n ");
7212 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
7213 if (i+1 < t.level3_size << t.p)
7214 fprintf (stream2, ",");
7216 if (t.level3_size << t.p > 8)
7217 fprintf (stream2, "\n ");
7218 fprintf (stream2, " }\n");
7219 fprintf (stream2, "};\n");
/* Create the two generated decomposition files: FILENAME1 and FILENAME2
   are opened for writing, both receive the same banner, and then
   output_decomposition fills them (first file as its stream1, second as
   its stream2).  VERSION is presumably the argument of the banner's "%s"
   (that line is not shown).  Open and write/close failures are reported
   on stderr.
   NOTE(review): what follows each diagnostic is not part of this listing;
   presumably the program terminates -- confirm.  */
7223 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
7225 const char *filenames[2];
7229 filenames[0] = filename1;
7230 filenames[1] = filename2;
7232 for (i = 0; i < 2; i++)
7234 streams[i] = fopen (filenames[i], "w");
7235 if (streams[i] == NULL)
7237 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical preamble for both files.  */
7242 for (i = 0; i < 2; i++)
7244 FILE *stream = streams[i];
7246 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7247 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
7248 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7250 fprintf (stream, "\n");
7253 output_decomposition (streams[0], streams[1]);
/* Errors are detected once per file, via ferror and the result of
   fclose.  */
7255 for (i = 0; i < 2; i++)
7257 if (ferror (streams[i]) || fclose (streams[i]))
7259 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7265 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
7266 char unicode_composition_exclusions[0x110000];
/* Fill unicode_composition_exclusions[] from the file
   COMPOSITIONEXCLUSIONS_FILENAME: all entries are cleared, then the entry
   of every code point that starts a data line is set to 1.
   NOTE(review): loop header, `break'/`continue'/exit statements and short
   declarations are missing from this listing; the reactions to the tests
   below are therefore not visible.  */
7269 fill_composition_exclusions (const char *compositionexclusions_filename)
7274 stream = fopen (compositionexclusions_filename, "r");
7277 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
7281 for (i = 0; i < 0x110000; i++)
7282 unicode_composition_exclusions[i] = 0;
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes following whitespace.  A result below 1 means nothing was
   read.  */
7289 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
7292 if (buf[0] == '\0' || buf[0] == '#')
/* Only the leading hexadecimal code point of the line is used; anything
   after it is ignored.  */
7295 if (sscanf (buf, "%X", &i) != 1)
7297 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
/* Sanity check: inside the Unicode code space.  */
7300 if (!(i < 0x110000))
7303 unicode_composition_exclusions[i] = 1;
7306 if (ferror (stream) || fclose (stream))
7308 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
7314 debug_output_composition_tables (const char *filename)
7319 stream = fopen (filename, "w");
7322 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7326 for (ch = 0; ch < 0x110000; ch++)
7328 unsigned int length;
7329 unsigned int decomposed[MAX_DECOMP_LENGTH];
7330 int type = get_decomposition (ch, &length, decomposed);
7332 if (type == UC_DECOMP_CANONICAL
7333 /* Consider only binary decompositions.
7334 Exclude singleton decompositions. */
7337 unsigned int code1 = decomposed[0];
7338 unsigned int code2 = decomposed[1];
7339 unsigned int combined = ch;
7341 /* Exclude decompositions where the first part is not a starter,
7342 i.e. is not of canonical combining class 0. */
7343 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7344 /* Exclude characters listed in CompositionExclusions.txt. */
7345 && !unicode_composition_exclusions[combined])
7347 /* The combined character must now also be a starter.
7349 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7352 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7356 unicode_attributes[code2].combining);
7361 if (ferror (stream) || fclose (stream))
7363 fprintf (stderr, "error writing to '%s'\n", filename);
7369 output_composition_tables (const char *filename, const char *version)
7374 stream = fopen (filename, "w");
7377 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7381 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7382 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7383 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7385 fprintf (stream, "\n");
7387 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7388 still carries the GPL header), and it's gnulib-tool which replaces the
7389 GPL header with an LGPL header. */
7390 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7391 fprintf (stream, "\n");
7392 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7393 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7394 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7395 fprintf (stream, " (at your option) any later version.\n");
7396 fprintf (stream, "\n");
7397 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7398 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7399 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7400 fprintf (stream, " GNU General Public License for more details.\n");
7401 fprintf (stream, "\n");
7402 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7403 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7404 fprintf (stream, "\n");
7406 /* The composition table is a set of mappings (code1, code2) -> combined,
7408 367 values for code1 (from 0x003C to 0x30FD),
7409 54 values for code2 (from 0x0300 to 0x309A).
7410 For a fixed code1, there are from 1 to 19 possible values for code2.
7411 For a fixed code2, there are from 1 to 117 possible values for code1.
7412 This is a very sparse matrix.
7414 We want an O(1) hash lookup.
7416 We could implement the hash lookup by mapping (code1, code2) to a linear
7417 combination mul1*code1 + mul2*code2, which is then used as an index into
7418 a 3-level table. But this leads to a table of size 37 KB.
7420 We use gperf to implement the hash lookup, giving it the 928 sets of
7421 4 bytes (code1, code2) as input. gperf generates a hash table of size
7422 1527, which is quite good (60% filled). It requires an auxiliary table
7423 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
7425 fprintf (stream, "struct composition_rule { char codes[4]; };\n");
7426 fprintf (stream, "%%struct-type\n");
7427 fprintf (stream, "%%language=ANSI-C\n");
7428 fprintf (stream, "%%define slot-name codes\n");
7429 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7430 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7431 fprintf (stream, "%%compare-lengths\n");
7432 fprintf (stream, "%%compare-strncmp\n");
7433 fprintf (stream, "%%readonly-tables\n");
7434 fprintf (stream, "%%omit-struct-type\n");
7435 fprintf (stream, "%%%%\n");
7437 for (ch = 0; ch < 0x110000; ch++)
7439 unsigned int length;
7440 unsigned int decomposed[MAX_DECOMP_LENGTH];
7441 int type = get_decomposition (ch, &length, decomposed);
7443 if (type == UC_DECOMP_CANONICAL
7444 /* Consider only binary decompositions.
7445 Exclude singleton decompositions. */
7448 unsigned int code1 = decomposed[0];
7449 unsigned int code2 = decomposed[1];
7450 unsigned int combined = ch;
7452 /* Exclude decompositions where the first part is not a starter,
7453 i.e. is not of canonical combining class 0. */
7454 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7455 /* Exclude characters listed in CompositionExclusions.txt. */
7456 && !unicode_composition_exclusions[combined])
7458 /* The combined character must now also be a starter.
7460 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7463 if (!(code1 < 0x10000))
7465 if (!(code2 < 0x10000))
7467 if (!(combined < 0x10000))
7470 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7471 (code1 >> 8) & 0xff, code1 & 0xff,
7472 (code2 >> 8) & 0xff, code2 & 0xff,
7478 if (ferror (stream) || fclose (stream))
7480 fprintf (stderr, "error writing to '%s'\n", filename);
7485 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the file to create.
   FUNCTION_NAME is the name of the function under test; it is emitted
   as the expansion of the MAP(c) macro.
   FUNC is the mapping; every code point below 0x110000 that FUNC does not
   map to itself is listed as a { code, value } pair.
   VERSION is the Unicode version mentioned in the banner.
   Terminates the program with status 1 if the file cannot be opened or
   written.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character mapping functions.\n");
  fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  /* Name the generator consistently with the other generated files.  */
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-mapping-part1.h\"\n");
  fprintf (stream, "\n");

  /* Emit the pairs separated by ",\n", without a trailing comma.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int value = func (ch);

      if (value != ch)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fprintf (stream, "#include \"test-mapping-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7556 /* Construction of sparse 3-level tables. */
7557 #define TABLE mapping_table
7558 #define ELEMENT int32_t
7560 #define xmalloc malloc
7561 #define xrealloc realloc
7564 /* Output a simple character mapping table to the given file. */
7567 output_simple_mapping (const char *filename,
7568 unsigned int (*func) (unsigned int),
7569 const char *version)
7573 struct mapping_table t;
7574 unsigned int level1_offset, level2_offset, level3_offset;
7576 stream = fopen (filename, "w");
7579 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7583 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7584 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
7585 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
7590 mapping_table_init (&t);
7592 for (ch = 0; ch < 0x110000; ch++)
7594 int value = (int) func (ch) - (int) ch;
7596 mapping_table_add (&t, ch, value);
7599 mapping_table_finalize (&t);
7601 /* Offsets in t.result, in memory of this process. */
7603 5 * sizeof (uint32_t);
7605 5 * sizeof (uint32_t)
7606 + t.level1_size * sizeof (uint32_t);
7608 5 * sizeof (uint32_t)
7609 + t.level1_size * sizeof (uint32_t)
7610 + (t.level2_size << t.q) * sizeof (uint32_t);
7612 for (i = 0; i < 5; i++)
7613 fprintf (stream, "#define mapping_header_%d %d\n", i,
7614 ((uint32_t *) t.result)[i]);
7615 fprintf (stream, "static const\n");
7616 fprintf (stream, "struct\n");
7617 fprintf (stream, " {\n");
7618 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7619 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7620 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
7621 fprintf (stream, " }\n");
7622 fprintf (stream, "u_mapping =\n");
7623 fprintf (stream, "{\n");
7624 fprintf (stream, " {");
7625 if (t.level1_size > 8)
7626 fprintf (stream, "\n ");
7627 for (i = 0; i < t.level1_size; i++)
7630 if (i > 0 && (i % 8) == 0)
7631 fprintf (stream, "\n ");
7632 offset = ((uint32_t *) (t.result + level1_offset))[i];
7634 fprintf (stream, " %5d", -1);
7636 fprintf (stream, " %5zu",
7637 (offset - level2_offset) / sizeof (uint32_t));
7638 if (i+1 < t.level1_size)
7639 fprintf (stream, ",");
7641 if (t.level1_size > 8)
7642 fprintf (stream, "\n ");
7643 fprintf (stream, " },\n");
7644 fprintf (stream, " {");
7645 if (t.level2_size << t.q > 8)
7646 fprintf (stream, "\n ");
7647 for (i = 0; i < t.level2_size << t.q; i++)
7650 if (i > 0 && (i % 8) == 0)
7651 fprintf (stream, "\n ");
7652 offset = ((uint32_t *) (t.result + level2_offset))[i];
7654 fprintf (stream, " %5d", -1);
7656 fprintf (stream, " %5zu",
7657 (offset - level3_offset) / sizeof (int32_t));
7658 if (i+1 < t.level2_size << t.q)
7659 fprintf (stream, ",");
7661 if (t.level2_size << t.q > 8)
7662 fprintf (stream, "\n ");
7663 fprintf (stream, " },\n");
7664 fprintf (stream, " {");
7665 if (t.level3_size << t.p > 8)
7666 fprintf (stream, "\n ");
7667 for (i = 0; i < t.level3_size << t.p; i++)
7669 if (i > 0 && (i % 8) == 0)
7670 fprintf (stream, "\n ");
7671 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
7672 if (i+1 < t.level3_size << t.p)
7673 fprintf (stream, ",");
7675 if (t.level3_size << t.p > 8)
7676 fprintf (stream, "\n ");
7677 fprintf (stream, " }\n");
7678 fprintf (stream, "};\n");
7680 if (ferror (stream) || fclose (stream))
7682 fprintf (stderr, "error writing to '%s'\n", filename);
7687 /* ========================================================================= */
/* A special casing context.
   A context is negated through x -> -x.  */
enum
{
  SCC_ALWAYS,
  SCC_FINAL_SIGMA,
  SCC_AFTER_SOFT_DOTTED,
  SCC_MORE_ABOVE,
  SCC_BEFORE_DOT,
  SCC_AFTER_I
};

/* A special casing rule.  */
struct special_casing_rule
{
  unsigned int code;                /* The code point the rule applies to */
  unsigned int lower_mapping[3];    /* Mappings, padded with 0 */
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  const char *language;             /* Language restriction, or NULL */
  int context;                      /* SCC_* value, or its negation */
};

/* The special casing rules.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Append NEW_RULE to casing_rules.  Only the pointer is stored.
   The array grows by doubling, with a minimum capacity of 16.
   Terminates the program with status 1 if memory is exhausted.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      /* Do not store through a null pointer when realloc fails.  */
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
7733 /* Stores in casing_rules the special casing rules found in
7734 specialcasing_filename. */
7736 fill_casing_rules (const char *specialcasing_filename)
7740 stream = fopen (specialcasing_filename, "r");
7743 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
7747 casing_rules = NULL;
7748 num_casing_rules = 0;
7749 allocated_casing_rules = 0;
7759 unsigned int lower_mapping[3];
7760 unsigned int title_mapping[3];
7761 unsigned int upper_mapping[3];
7765 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7768 if (buf[0] == '\0' || buf[0] == '#')
7773 code = strtoul (scanptr, &endptr, 16);
7774 if (endptr == scanptr)
7776 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7780 if (*scanptr != ';')
7782 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7787 /* Scan lower mapping. */
7788 for (i = 0; i < 3; i++)
7789 lower_mapping[i] = 0;
7790 for (i = 0; i < 3; i++)
7792 while (*scanptr == ' ')
7794 if (*scanptr == ';')
7796 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
7797 if (endptr == scanptr)
7799 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7804 if (*scanptr != ';')
7806 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7811 /* Scan title mapping. */
7812 for (i = 0; i < 3; i++)
7813 title_mapping[i] = 0;
7814 for (i = 0; i < 3; i++)
7816 while (*scanptr == ' ')
7818 if (*scanptr == ';')
7820 title_mapping[i] = strtoul (scanptr, &endptr, 16);
7821 if (endptr == scanptr)
7823 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7828 if (*scanptr != ';')
7830 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7835 /* Scan upper mapping. */
7836 for (i = 0; i < 3; i++)
7837 upper_mapping[i] = 0;
7838 for (i = 0; i < 3; i++)
7840 while (*scanptr == ' ')
7842 if (*scanptr == ';')
7844 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
7845 if (endptr == scanptr)
7847 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7852 if (*scanptr != ';')
7854 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7859 /* Scan language and context. */
7861 context = SCC_ALWAYS;
7862 while (*scanptr == ' ')
7864 if (*scanptr != '\0' && *scanptr != '#')
7866 const char *word_begin = scanptr;
7867 const char *word_end;
7869 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7873 while (*scanptr == ' ')
7876 if (word_end - word_begin == 2)
7878 language = (char *) malloc ((word_end - word_begin) + 1);
7879 memcpy (language, word_begin, 2);
7880 language[word_end - word_begin] = '\0';
7881 word_begin = word_end = NULL;
7883 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7885 word_begin = scanptr;
7886 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7892 if (word_end > word_begin)
7894 bool negate = false;
7896 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
7901 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
7902 context = SCC_FINAL_SIGMA;
7903 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
7904 context = SCC_AFTER_SOFT_DOTTED;
7905 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
7906 context = SCC_MORE_ABOVE;
7907 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
7908 context = SCC_BEFORE_DOT;
7909 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
7910 context = SCC_AFTER_I;
7913 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
7917 context = - context;
7920 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7922 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7927 /* Store the rule. */
7929 struct special_casing_rule *new_rule =
7930 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
7931 new_rule->code = code;
7932 new_rule->language = language;
7933 new_rule->context = context;
7934 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
7935 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
7936 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
7938 add_casing_rule (new_rule);
7942 if (ferror (stream) || fclose (stream))
7944 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* A casefolding rule.  */
struct casefold_rule
{
  unsigned int code;         /* The code point the rule applies to */
  unsigned int mapping[3];   /* The casefolded form, padded with 0 */
  const char *language;      /* Language restriction, or NULL */
};

/* The casefolding rules.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;

/* Report a syntax error in the case folding file FILENAME and terminate
   the program with status 1.  */
static void
casefolding_parse_error (const char *filename)
{
  fprintf (stderr, "parse error in '%s'\n", filename);
  exit (1);
}

/* Stores in casefolding_rules the case folding rules found in
   casefolding_filename.
   Each data line consists of ';'-separated fields: the code point, the
   rule type (one of C, F, S, T) and the mapping of up to 3 code points.
   Lines that are empty or start with '#' are ignored.
   Rules of type 'S' are dropped; a rule of type 'T' is stored twice, for
   the languages "tr" and "az"; all other rules are stored once, without
   language restriction.
   Terminates the program with status 1 on I/O errors, syntax errors, or
   memory exhaustion.  */
static void
fill_casefolding_rules (const char *casefolding_filename)
{
  FILE *stream;

  stream = fopen (casefolding_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
      exit (1);
    }

  casefolding_rules = NULL;
  num_casefolding_rules = 0;
  allocated_casefolding_rules = 0;

  for (;;)
    {
      char buf[200+1];
      char *scanptr;
      char *endptr;
      unsigned int i;

      unsigned int code;
      char type;
      unsigned int mapping[3];

      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Scan code.  */
      scanptr = buf;
      code = strtoul (scanptr, &endptr, 16);
      if (endptr == scanptr)
        casefolding_parse_error (casefolding_filename);
      scanptr = endptr;
      if (*scanptr != ';')
        casefolding_parse_error (casefolding_filename);
      scanptr++;

      /* Scan type.  */
      while (*scanptr == ' ')
        scanptr++;

      type = *scanptr;
      if (!(type == 'C' || type == 'F' || type == 'S' || type == 'T'))
        casefolding_parse_error (casefolding_filename);
      scanptr++;
      if (*scanptr != ';')
        casefolding_parse_error (casefolding_filename);
      scanptr++;

      /* Scan casefold mapping.  */
      for (i = 0; i < 3; i++)
        mapping[i] = 0;
      for (i = 0; i < 3; i++)
        {
          while (*scanptr == ' ')
            scanptr++;
          if (*scanptr == ';')
            break;
          mapping[i] = strtoul (scanptr, &endptr, 16);
          if (endptr == scanptr)
            casefolding_parse_error (casefolding_filename);
          scanptr = endptr;
        }
      if (*scanptr != ';')
        casefolding_parse_error (casefolding_filename);
      scanptr++;

      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
      if (type != 'S')
        {
          const char * const *languages;
          unsigned int languages_count;

          /* Type 'T' indicates that the rule is applicable to Turkish
             languages only.  */
          if (type == 'T')
            {
              static const char * const turkish_languages[] = { "tr", "az" };
              languages = turkish_languages;
              languages_count = 2;
            }
          else
            {
              static const char * const all_languages[] = { NULL };
              languages = all_languages;
              languages_count = 1;
            }

          for (i = 0; i < languages_count; i++)
            {
              /* Store a new rule.  */
              struct casefold_rule *new_rule =
                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
              if (new_rule == NULL)
                {
                  fprintf (stderr, "out of memory\n");
                  exit (1);
                }
              new_rule->code = code;
              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
              new_rule->language = languages[i];

              if (num_casefolding_rules == allocated_casefolding_rules)
                {
                  unsigned int new_allocated = 2 * allocated_casefolding_rules;
                  struct casefold_rule **new_rules;

                  if (new_allocated < 16)
                    new_allocated = 16;
                  new_rules =
                    (struct casefold_rule **)
                    realloc (casefolding_rules,
                             new_allocated * sizeof (struct casefold_rule *));
                  /* Do not store through a null pointer when realloc
                     fails.  */
                  if (new_rules == NULL)
                    {
                      fprintf (stderr, "out of memory\n");
                      exit (1);
                    }
                  casefolding_rules = new_rules;
                  allocated_casefolding_rules = new_allocated;
                }
              casefolding_rules[num_casefolding_rules++] = new_rule;
            }
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
      exit (1);
    }
}
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Return the entry of unicode_casefold[] for the code point CH.
   CH is used as an index without a range check.  */
static unsigned int
to_casefold (unsigned int ch)
{
  const unsigned int *entry = unicode_casefold + ch;

  return *entry;
}
8119 /* Redistribute the casefolding_rules:
8120 - Rules that map to a single character, language independently, are stored
8121 in unicode_casefold.
8122 - Other rules are merged into casing_rules. */
8124 redistribute_casefolding_rules (void)
8126 unsigned int ch, i, j;
8128 /* Fill unicode_casefold[]. */
8129 for (ch = 0; ch < 0x110000; ch++)
8130 unicode_casefold[ch] = ch;
8131 for (i = 0; i < num_casefolding_rules; i++)
8133 struct casefold_rule *cfrule = casefolding_rules[i];
8135 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
8138 if (!(ch < 0x110000))
8140 unicode_casefold[ch] = cfrule->mapping[0];
8144 /* Extend the special casing rules by filling in their casefold_mapping[]
8146 for (j = 0; j < num_casing_rules; j++)
8148 struct special_casing_rule *rule = casing_rules[j];
8151 rule->casefold_mapping[0] = to_casefold (rule->code);
8152 for (k = 1; k < 3; k++)
8153 rule->casefold_mapping[k] = 0;
8156 /* Now merge the other casefolding rules into casing_rules. */
8157 for (i = 0; i < num_casefolding_rules; i++)
8159 struct casefold_rule *cfrule = casefolding_rules[i];
8161 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
8163 /* Find a rule that applies to the same code, same language, and it
8164 has context SCC_ALWAYS. At the same time, update all rules that
8165 have the same code and same or more specific language. */
8166 struct special_casing_rule *found_rule = NULL;
8168 for (j = 0; j < num_casing_rules; j++)
8170 struct special_casing_rule *rule = casing_rules[j];
8172 if (rule->code == cfrule->code
8173 && (cfrule->language == NULL
8174 || (rule->language != NULL
8175 && strcmp (rule->language, cfrule->language) == 0)))
8177 memcpy (rule->casefold_mapping, cfrule->mapping,
8178 sizeof (rule->casefold_mapping));
8180 if ((cfrule->language == NULL
8181 ? rule->language == NULL
8182 : rule->language != NULL
8183 && strcmp (rule->language, cfrule->language) == 0)
8184 && rule->context == SCC_ALWAYS)
8192 if (found_rule == NULL)
8194 /* Create a new rule. */
8195 struct special_casing_rule *new_rule =
8196 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8198 /* Try to find a rule that applies to the same code, no language
8199 restriction, and with context SCC_ALWAYS. */
8200 for (j = 0; j < num_casing_rules; j++)
8202 struct special_casing_rule *rule = casing_rules[j];
8204 if (rule->code == cfrule->code
8205 && rule->context == SCC_ALWAYS
8206 && rule->language == NULL)
8214 new_rule->code = cfrule->code;
8215 new_rule->language = cfrule->language;
8216 new_rule->context = SCC_ALWAYS;
8217 if (found_rule != NULL)
8219 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
8220 sizeof (new_rule->lower_mapping));
8221 memcpy (new_rule->title_mapping, found_rule->title_mapping,
8222 sizeof (new_rule->title_mapping));
8223 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
8224 sizeof (new_rule->upper_mapping));
8230 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8231 for (k = 1; k < 3; k++)
8232 new_rule->lower_mapping[k] = 0;
8233 new_rule->title_mapping[0] = to_title (cfrule->code);
8234 for (k = 1; k < 3; k++)
8235 new_rule->title_mapping[k] = 0;
8236 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8237 for (k = 1; k < 3; k++)
8238 new_rule->upper_mapping[k] = 0;
8240 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8241 sizeof (new_rule->casefold_mapping));
8243 add_casing_rule (new_rule);
8250 compare_casing_rules (const void *a, const void *b)
8252 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8253 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8254 unsigned int a_code = a_rule->code;
8255 unsigned int b_code = b_rule->code;
8257 if (a_code < b_code)
8259 if (a_code > b_code)
8262 /* Sort the more specific rules before the more general ones. */
8263 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8264 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8268 sort_casing_rules (void)
8270 /* Sort the rules 1. by code, 2. by specificity. */
8271 if (num_casing_rules > 1)
8272 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8273 compare_casing_rules);
8276 /* Output the special casing rules. */
8278 output_casing_rules (const char *filename, const char *version)
8284 stream = fopen (filename, "w");
8287 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8291 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8292 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8293 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8295 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8296 fprintf (stream, "%%struct-type\n");
8297 fprintf (stream, "%%language=ANSI-C\n");
8298 fprintf (stream, "%%define slot-name code\n");
8299 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8300 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8301 fprintf (stream, "%%compare-lengths\n");
8302 fprintf (stream, "%%compare-strncmp\n");
8303 fprintf (stream, "%%readonly-tables\n");
8304 fprintf (stream, "%%omit-struct-type\n");
8305 fprintf (stream, "%%%%\n");
8308 for (i = 0; i < num_casing_rules; i++)
8310 struct special_casing_rule *rule = casing_rules[i];
8313 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8318 if (!(rule->code < 0x10000))
8320 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8324 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8325 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8327 fprintf (stream, "%d, ",
8328 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8330 context = rule->context;
8333 fprintf (stream, "-");
8334 context = - context;
8337 fprintf (stream, " ");
8341 fprintf (stream, "SCC_ALWAYS ");
8343 case SCC_FINAL_SIGMA:
8344 fprintf (stream, "SCC_FINAL_SIGMA ");
8346 case SCC_AFTER_SOFT_DOTTED:
8347 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8349 case SCC_MORE_ABOVE:
8350 fprintf (stream, "SCC_MORE_ABOVE ");
8352 case SCC_BEFORE_DOT:
8353 fprintf (stream, "SCC_BEFORE_DOT ");
8356 fprintf (stream, "SCC_AFTER_I ");
8361 fprintf (stream, ", ");
8363 if (rule->language != NULL)
8365 if (strlen (rule->language) != 2)
8367 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8370 fprintf (stream, "{ '\\0', '\\0' }, ");
8372 fprintf (stream, "{ ");
8373 for (j = 0; j < 3; j++)
8376 fprintf (stream, ", ");
8377 if (!(rule->upper_mapping[j] < 0x10000))
8379 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8382 if (rule->upper_mapping[j] != 0)
8383 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8385 fprintf (stream, " 0");
8387 fprintf (stream, " }, { ");
8388 for (j = 0; j < 3; j++)
8391 fprintf (stream, ", ");
8392 if (!(rule->lower_mapping[j] < 0x10000))
8394 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8397 if (rule->lower_mapping[j] != 0)
8398 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8400 fprintf (stream, " 0");
8402 fprintf (stream, " }, { ");
8403 for (j = 0; j < 3; j++)
8406 fprintf (stream, ", ");
8407 if (!(rule->title_mapping[j] < 0x10000))
8409 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8412 if (rule->title_mapping[j] != 0)
8413 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8415 fprintf (stream, " 0");
8417 fprintf (stream, " }, { ");
8418 for (j = 0; j < 3; j++)
8421 fprintf (stream, ", ");
8422 if (!(rule->casefold_mapping[j] < 0x10000))
8424 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8427 if (rule->casefold_mapping[j] != 0)
8428 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8430 fprintf (stream, " 0");
8432 fprintf (stream, " }\n");
8435 if (ferror (stream) || fclose (stream))
8437 fprintf (stderr, "error writing to '%s'\n", filename);
8442 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.
   Returns true if CH is "cased" in this sense.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
8456 /* Quoting the Unicode standard:
8457 Definition: A character is defined to be "case-ignorable" if it has the
8458 value MidLetter {or the value MidNumLet} for the Word_Break property or
8459 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8460 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8461 The text marked in braces was added in Unicode 5.1.0, see
8462 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8463 Definition of case-ignorable". */
8464 /* Since this predicate is only used for the "Before C" and "After C"
8465 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8466 This simplifies the evaluation of the regular expressions
8467 \p{cased} (\p{case-ignorable})* C
8469 C (\p{case-ignorable})* \p{cased}
8472 is_case_ignorable (unsigned int ch)
8474 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8475 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8476 || is_category_Mn (ch)
8477 || is_category_Me (ch)
8478 || is_category_Cf (ch)
8479 || is_category_Lm (ch)
8480 || is_category_Sk (ch))
8484 /* ------------------------------------------------------------------------- */
8486 /* Output all case related properties. */
8488 output_casing_properties (const char *version)
8490 #define PROPERTY(FN,P) \
8491 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
8492 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
8493 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
8494 PROPERTY(cased, cased)
8495 PROPERTY(ignorable, case_ignorable)
8499 /* ========================================================================= */
8502 main (int argc, char * argv[])
8504 const char *unicodedata_filename;
8505 const char *proplist_filename;
8506 const char *derivedproplist_filename;
8507 const char *scripts_filename;
8508 const char *blocks_filename;
8509 const char *proplist30_filename;
8510 const char *eastasianwidth_filename;
8511 const char *linebreak_filename;
8512 const char *wordbreakproperty_filename;
8513 const char *graphemebreakproperty_filename;
8514 const char *compositionexclusions_filename;
8515 const char *specialcasing_filename;
8516 const char *casefolding_filename;
8517 const char *version;
8521 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
8526 unicodedata_filename = argv[1];
8527 proplist_filename = argv[2];
8528 derivedproplist_filename = argv[3];
8529 scripts_filename = argv[4];
8530 blocks_filename = argv[5];
8531 proplist30_filename = argv[6];
8532 eastasianwidth_filename = argv[7];
8533 linebreak_filename = argv[8];
8534 wordbreakproperty_filename = argv[9];
8535 graphemebreakproperty_filename = argv[10];
8536 compositionexclusions_filename = argv[11];
8537 specialcasing_filename = argv[12];
8538 casefolding_filename = argv[13];
8541 fill_attributes (unicodedata_filename);
8542 clear_properties ();
8543 fill_properties (proplist_filename);
8544 fill_properties (derivedproplist_filename);
8545 fill_properties30 (proplist30_filename);
8546 fill_scripts (scripts_filename);
8547 fill_blocks (blocks_filename);
8548 fill_width (eastasianwidth_filename);
8549 fill_org_lbp (linebreak_filename);
8550 fill_org_wbp (wordbreakproperty_filename);
8551 fill_org_gbp (graphemebreakproperty_filename);
8552 fill_composition_exclusions (compositionexclusions_filename);
8553 fill_casing_rules (specialcasing_filename);
8554 fill_casefolding_rules (casefolding_filename);
8555 redistribute_casefolding_rules ();
8556 sort_casing_rules ();
8558 output_categories (version);
8559 output_category ("unictype/categ_of.h", version);
8560 output_combclass ("unictype/combining.h", version);
8561 output_bidi_category ("unictype/bidi_of.h", version);
8562 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
8563 output_decimal_digit ("unictype/decdigit.h", version);
8564 output_digit_test ("../tests/unictype/test-digit.h", version);
8565 output_digit ("unictype/digit.h", version);
8566 output_numeric_test ("../tests/unictype/test-numeric.h", version);
8567 output_numeric ("unictype/numeric.h", version);
8568 output_mirror ("unictype/mirror.h", version);
8569 output_properties (version);
8570 output_scripts (version);
8571 output_scripts_byname (version);
8572 output_blocks (version);
8573 output_ident_properties (version);
8574 output_old_ctype (version);
8576 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
8577 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
8578 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
8580 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
8581 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
8582 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
8584 output_gbp_table ("unigbrk/gbrkprop.h", version);
8586 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
8587 debug_output_composition_tables ("uninorm/composition.txt");
8588 output_composition_tables ("uninorm/composition-table.gperf", version);
8590 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
8591 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
8592 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
8593 output_simple_mapping ("unicase/toupper.h", to_upper, version);
8594 output_simple_mapping ("unicase/tolower.h", to_lower, version);
8595 output_simple_mapping ("unicase/totitle.h", to_title, version);
8596 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
8597 output_casing_rules ("unicase/special-casing-table.gperf", version);
8598 output_casing_properties (version);
8604 * For Emacs M-x compile
8606 * compile-command: "
8607 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
8609 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
8610 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
8611 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
8612 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
8613 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
8614 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
8615 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
8616 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
8617 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
8618 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \
8619 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
8620 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
8621 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \