/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/GraphemeBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      <unicode-version>
   (The final argument is presumably the Unicode version string that the
   output functions print into the generated files; confirm against main.)
 */
/* ========================================================================= */

/* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   String fields hold the raw text of the corresponding field; the three
   case mappings are parsed as hexadecimal code points.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
68 /* Missing fields are represented with "" for strings, and NONE for
70 #define NONE (~(unsigned int)0)
72 /* The entire contents of the UnicodeData.txt file. */
73 struct unicode_attribute unicode_attributes [0x110000];
75 /* Stores in unicode_attributes[i] the values from the given fields. */
77 fill_attribute (unsigned int i,
78 const char *field1, const char *field2,
79 const char *field3, const char *field4,
80 const char *field5, const char *field6,
81 const char *field7, const char *field8,
82 const char *field9, const char *field10,
83 const char *field11, const char *field12,
84 const char *field13, const char *field14)
86 struct unicode_attribute * uni;
90 fprintf (stderr, "index too large\n");
93 if (strcmp (field2, "Cs") == 0)
94 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
96 uni = &unicode_attributes[i];
97 /* Copy the strings. */
98 uni->name = strdup (field1);
99 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
100 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
101 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
102 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
103 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
104 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
105 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
106 uni->mirrored = (field9[0] == 'Y');
107 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
108 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
109 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
110 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
111 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file. */
/* NOTE(review): the line defining FIELDLEN was lost from this copy of the
   file; 120 is believed to be the original value -- confirm.  */
#define FIELDLEN 120

/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   reached before DELIM; BUFFER is then not NUL-terminated).
   Exits with status 1 if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators. Silently convert to LF. */
      if (c == '\r')
        continue;

      /* Put c into the buffer. */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
149 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
152 fill_attributes (const char *unicodedata_filename)
156 char field0[FIELDLEN];
157 char field1[FIELDLEN];
158 char field2[FIELDLEN];
159 char field3[FIELDLEN];
160 char field4[FIELDLEN];
161 char field5[FIELDLEN];
162 char field6[FIELDLEN];
163 char field7[FIELDLEN];
164 char field8[FIELDLEN];
165 char field9[FIELDLEN];
166 char field10[FIELDLEN];
167 char field11[FIELDLEN];
168 char field12[FIELDLEN];
169 char field13[FIELDLEN];
170 char field14[FIELDLEN];
173 for (i = 0; i < 0x110000; i++)
174 unicode_attributes[i].name = NULL;
176 stream = fopen (unicodedata_filename, "r");
179 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 n = getfield (stream, field0, ';');
189 n += getfield (stream, field1, ';');
190 n += getfield (stream, field2, ';');
191 n += getfield (stream, field3, ';');
192 n += getfield (stream, field4, ';');
193 n += getfield (stream, field5, ';');
194 n += getfield (stream, field6, ';');
195 n += getfield (stream, field7, ';');
196 n += getfield (stream, field8, ';');
197 n += getfield (stream, field9, ';');
198 n += getfield (stream, field10, ';');
199 n += getfield (stream, field11, ';');
200 n += getfield (stream, field12, ';');
201 n += getfield (stream, field13, ';');
202 n += getfield (stream, field14, '\n');
207 fprintf (stderr, "short line in '%s':%d\n",
208 unicodedata_filename, lineno);
211 i = strtoul (field0, NULL, 16);
213 && strlen (field1) >= 9
214 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
216 /* Deal with a range. */
218 n = getfield (stream, field0, ';');
219 n += getfield (stream, field1, ';');
220 n += getfield (stream, field2, ';');
221 n += getfield (stream, field3, ';');
222 n += getfield (stream, field4, ';');
223 n += getfield (stream, field5, ';');
224 n += getfield (stream, field6, ';');
225 n += getfield (stream, field7, ';');
226 n += getfield (stream, field8, ';');
227 n += getfield (stream, field9, ';');
228 n += getfield (stream, field10, ';');
229 n += getfield (stream, field11, ';');
230 n += getfield (stream, field12, ';');
231 n += getfield (stream, field13, ';');
232 n += getfield (stream, field14, '\n');
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
239 if (!(field1[0] == '<'
240 && strlen (field1) >= 8
241 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
247 field1[strlen (field1) - 7] = '\0';
248 j = strtoul (field0, NULL, 16);
250 fill_attribute (i, field1+1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
256 /* Single character line */
257 fill_attribute (i, field1, field2, field3, field4, field5,
258 field6, field7, field8, field9, field10,
259 field11, field12, field13, field14);
262 if (ferror (stream) || fclose (stream))
264 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
/* ========================================================================= */

/* General category. */
/* See Unicode 3.0 book, section 4.5. */
276 is_category_L (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L');
283 is_category_Lu (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'u');
291 is_category_Ll (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 'l');
299 is_category_Lt (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 't');
307 is_category_Lm (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'm');
315 is_category_Lo (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 'o');
323 is_category_M (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M');
330 is_category_Mn (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'n');
338 is_category_Mc (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'c');
346 is_category_Me (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'e');
354 is_category_N (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N');
361 is_category_Nd (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'd');
369 is_category_Nl (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'l');
377 is_category_No (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'o');
385 is_category_P (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P');
392 is_category_Pc (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'c');
400 is_category_Pd (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 'd');
408 is_category_Ps (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 's');
416 is_category_Pe (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'e');
424 is_category_Pi (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'i');
432 is_category_Pf (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'f');
440 is_category_Po (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'o');
448 is_category_S (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S');
455 is_category_Sm (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'm');
463 is_category_Sc (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'c');
471 is_category_Sk (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'k');
479 is_category_So (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'o');
487 is_category_Z (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z');
494 is_category_Zs (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 's');
502 is_category_Zl (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'l');
510 is_category_Zp (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 'p');
518 is_category_C (unsigned int ch)
520 return (unicode_attributes[ch].name == NULL
521 || unicode_attributes[ch].category[0] == 'C');
525 is_category_Cc (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'c');
533 is_category_Cf (unsigned int ch)
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'C'
537 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the range 0xD800..0xDFFF.  This is decided
   from the code point alone, without consulting unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
547 is_category_Co (unsigned int ch)
549 return (unicode_attributes[ch].name != NULL
550 && unicode_attributes[ch].category[0] == 'C'
551 && unicode_attributes[ch].category[1] == 'o');
555 is_category_Cn (unsigned int ch)
557 return (unicode_attributes[ch].name == NULL
558 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to the file FILENAME one line per maximal run of consecutive code
   points for which PREDICATE returns true: "0xFIRST..0xLAST" for a run of
   more than one code point, "0xCH" for a single one.  (Listing every code
   point on its own line would yield huge text output.)
   Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate holds.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to the file FILENAME a C source fragment consisting of a license
   header, an include of "test-predicate-part1.h", one "{ first, last }"
   initializer per maximal run of code points for which PREDICATE is true,
   a definition of the macro PREDICATE(c) as EXPRESSION, and an include of
   "test-predicate-part2.h".
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* Entries are separated by ",\n"; the separator is emitted before every
     entry except the first.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
669 /* Construction of sparse 3-level tables. */
670 #define TABLE predicate_table
671 #define xmalloc malloc
672 #define xrealloc realloc
673 #include "3levelbit.h"
675 /* Output a boolean property in a three-level bitmap. */
677 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
681 struct predicate_table t;
682 unsigned int level1_offset, level2_offset, level3_offset;
684 stream = fopen (filename, "w");
687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
692 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
693 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
698 predicate_table_init (&t);
700 for (ch = 0; ch < 0x110000; ch++)
702 predicate_table_add (&t, ch);
704 predicate_table_finalize (&t);
706 /* Offsets in t.result, in memory of this process. */
708 5 * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t);
713 5 * sizeof (uint32_t)
714 + t.level1_size * sizeof (uint32_t)
715 + (t.level2_size << t.q) * sizeof (uint32_t);
717 for (i = 0; i < 5; i++)
719 fprintf (stream, "#define header_%d %d\n", i,
720 ((uint32_t *) t.result)[i]);
722 fprintf (stream, "static const\n");
723 fprintf (stream, "struct\n");
724 fprintf (stream, " {\n");
725 fprintf (stream, " int header[1];\n");
726 fprintf (stream, " int level1[%zu];\n", t.level1_size);
727 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
728 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
729 fprintf (stream, " }\n");
730 fprintf (stream, "%s =\n", name);
731 fprintf (stream, "{\n");
732 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
733 fprintf (stream, " {");
734 if (t.level1_size > 1)
735 fprintf (stream, "\n ");
736 for (i = 0; i < t.level1_size; i++)
739 if (i > 0 && (i % 1) == 0)
740 fprintf (stream, "\n ");
741 offset = ((uint32_t *) (t.result + level1_offset))[i];
743 fprintf (stream, " %5d", -1);
745 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
746 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
747 if (i+1 < t.level1_size)
748 fprintf (stream, ",");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 fprintf (stream, " },\n");
753 fprintf (stream, " {");
754 if (t.level2_size << t.q > 1)
755 fprintf (stream, "\n ");
756 for (i = 0; i < t.level2_size << t.q; i++)
759 if (i > 0 && (i % 1) == 0)
760 fprintf (stream, "\n ");
761 offset = ((uint32_t *) (t.result + level2_offset))[i];
763 fprintf (stream, " %5d", -1);
765 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
766 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
767 if (i+1 < t.level2_size << t.q)
768 fprintf (stream, ",");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 fprintf (stream, " },\n");
773 fprintf (stream, " {");
774 if (t.level3_size << t.p > 4)
775 fprintf (stream, "\n ");
776 for (i = 0; i < t.level3_size << t.p; i++)
778 if (i > 0 && (i % 4) == 0)
779 fprintf (stream, "\n ");
780 fprintf (stream, " 0x%08X",
781 ((uint32_t *) (t.result + level3_offset))[i]);
782 if (i+1 < t.level3_size << t.p)
783 fprintf (stream, ",");
785 if (t.level3_size << t.p > 4)
786 fprintf (stream, "\n ");
787 fprintf (stream, " }\n");
788 fprintf (stream, "};\n");
790 if (ferror (stream) || fclose (stream))
792 fprintf (stderr, "error writing to '%s'\n", filename);
797 /* Output all categories. */
799 output_categories (const char *version)
801 #define CATEGORY(C) \
802 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
803 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
804 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks of the general categories.  Each two-letter category has
   exactly one bit; each one-letter category is the union of the bits of
   its two-letter subcategories.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};

/* Returns the UC_CATEGORY_MASK_* value for the one- or two-letter general
   category name CATEGORY_NAME.  Calls abort () if the name is empty, longer
   than two characters, or not a known category.  */
static int
general_category_byname (const char *category_name)
{
  /* Only names of length 1 or 2 can be valid.  */
  if (category_name[0] != '\0'
      && (category_name[1] == '\0' || category_name[2] == '\0'))
    switch (category_name[0])
      {
      case 'L':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_L;
          case 'u': return UC_CATEGORY_MASK_Lu;
          case 'l': return UC_CATEGORY_MASK_Ll;
          case 't': return UC_CATEGORY_MASK_Lt;
          case 'm': return UC_CATEGORY_MASK_Lm;
          case 'o': return UC_CATEGORY_MASK_Lo;
          }
        break;
      case 'M':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_M;
          case 'n': return UC_CATEGORY_MASK_Mn;
          case 'c': return UC_CATEGORY_MASK_Mc;
          case 'e': return UC_CATEGORY_MASK_Me;
          }
        break;
      case 'N':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_N;
          case 'd': return UC_CATEGORY_MASK_Nd;
          case 'l': return UC_CATEGORY_MASK_Nl;
          case 'o': return UC_CATEGORY_MASK_No;
          }
        break;
      case 'P':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_P;
          case 'c': return UC_CATEGORY_MASK_Pc;
          case 'd': return UC_CATEGORY_MASK_Pd;
          case 's': return UC_CATEGORY_MASK_Ps;
          case 'e': return UC_CATEGORY_MASK_Pe;
          case 'i': return UC_CATEGORY_MASK_Pi;
          case 'f': return UC_CATEGORY_MASK_Pf;
          case 'o': return UC_CATEGORY_MASK_Po;
          }
        break;
      case 'S':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_S;
          case 'm': return UC_CATEGORY_MASK_Sm;
          case 'c': return UC_CATEGORY_MASK_Sc;
          case 'k': return UC_CATEGORY_MASK_Sk;
          case 'o': return UC_CATEGORY_MASK_So;
          }
        break;
      case 'Z':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_Z;
          case 's': return UC_CATEGORY_MASK_Zs;
          case 'l': return UC_CATEGORY_MASK_Zl;
          case 'p': return UC_CATEGORY_MASK_Zp;
          }
        break;
      case 'C':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_C;
          case 'c': return UC_CATEGORY_MASK_Cc;
          case 'f': return UC_CATEGORY_MASK_Cf;
          case 's': return UC_CATEGORY_MASK_Cs;
          case 'o': return UC_CATEGORY_MASK_Co;
          case 'n': return UC_CATEGORY_MASK_Cn;
          }
        break;
      }
  /* Invalid category name. */
  abort ();
}
970 /* Construction of sparse 3-level tables. */
971 #define TABLE category_table
972 #define ELEMENT uint8_t
973 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
974 #define xmalloc malloc
975 #define xrealloc realloc
978 /* Output the per-character category table. */
980 output_category (const char *filename, const char *version)
984 struct category_table t;
985 unsigned int level1_offset, level2_offset, level3_offset;
986 uint16_t *level3_packed;
988 stream = fopen (filename, "w");
991 fprintf (stderr, "cannot open '%s' for writing\n", filename);
995 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
996 fprintf (stream, "/* Categories of Unicode characters. */\n");
997 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1002 category_table_init (&t);
1004 for (ch = 0; ch < 0x110000; ch++)
1007 unsigned int log2_value;
1009 if (is_category_Cs (ch))
1010 value = UC_CATEGORY_MASK_Cs;
1011 else if (unicode_attributes[ch].name != NULL)
1012 value = general_category_byname (unicode_attributes[ch].category);
1016 /* Now value should contain exactly one bit. */
1017 if (value == 0 || ((value & (value - 1)) != 0))
1020 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1022 category_table_add (&t, ch, log2_value);
1025 category_table_finalize (&t);
1027 /* Offsets in t.result, in memory of this process. */
1029 5 * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t);
1034 5 * sizeof (uint32_t)
1035 + t.level1_size * sizeof (uint32_t)
1036 + (t.level2_size << t.q) * sizeof (uint32_t);
1038 for (i = 0; i < 5; i++)
1039 fprintf (stream, "#define category_header_%d %d\n", i,
1040 ((uint32_t *) t.result)[i]);
1041 fprintf (stream, "static const\n");
1042 fprintf (stream, "struct\n");
1043 fprintf (stream, " {\n");
1044 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1045 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1046 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1047 (1 << t.p) * 5 / 16);
1048 fprintf (stream, " }\n");
1049 fprintf (stream, "u_category =\n");
1050 fprintf (stream, "{\n");
1051 fprintf (stream, " {");
1052 if (t.level1_size > 8)
1053 fprintf (stream, "\n ");
1054 for (i = 0; i < t.level1_size; i++)
1057 if (i > 0 && (i % 8) == 0)
1058 fprintf (stream, "\n ");
1059 offset = ((uint32_t *) (t.result + level1_offset))[i];
1061 fprintf (stream, " %5d", -1);
1063 fprintf (stream, " %5zu",
1064 (offset - level2_offset) / sizeof (uint32_t));
1065 if (i+1 < t.level1_size)
1066 fprintf (stream, ",");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
1070 fprintf (stream, " },\n");
1071 fprintf (stream, " {");
1072 if (t.level2_size << t.q > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level2_size << t.q; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level2_offset))[i];
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level3_offset) / sizeof (uint8_t));
1085 if (i+1 < t.level2_size << t.q)
1086 fprintf (stream, ",");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1092 not 32-bit units, in order to make the lookup function easier. */
1095 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1096 for (i = 0; i < t.level3_size << t.p; i++)
1098 unsigned int j = (i * 5) / 16;
1099 unsigned int k = (i * 5) % 16;
1100 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1101 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1102 level3_packed[j] = value & 0xffff;
1103 level3_packed[j+1] = value >> 16;
1105 fprintf (stream, " {");
1106 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1107 fprintf (stream, "\n ");
1108 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1110 if (i > 0 && (i % 8) == 0)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " 0x%04x", level3_packed[i]);
1113 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1114 fprintf (stream, ",");
1116 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1117 fprintf (stream, "\n ");
1118 fprintf (stream, " }\n");
1119 free (level3_packed);
1120 fprintf (stream, "};\n");
1122 if (ferror (stream) || fclose (stream))
1124 fprintf (stderr, "error writing to '%s'\n", filename);
/* ========================================================================= */

/* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2. */
1135 /* Construction of sparse 3-level tables. */
1136 #define TABLE combclass_table
1137 #define ELEMENT uint8_t
1139 #define xmalloc malloc
1140 #define xrealloc realloc
1143 /* Output the per-character combining class table. */
1145 output_combclass (const char *filename, const char *version)
1149 struct combclass_table t;
1150 unsigned int level1_offset, level2_offset, level3_offset;
1152 stream = fopen (filename, "w");
1155 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1159 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1160 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1161 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1166 combclass_table_init (&t);
1168 for (ch = 0; ch < 0x110000; ch++)
1169 if (unicode_attributes[ch].name != NULL)
1171 int value = atoi (unicode_attributes[ch].combining);
1172 if (!(value >= 0 && value <= 255))
1174 combclass_table_add (&t, ch, value);
1177 combclass_table_finalize (&t);
1179 /* Offsets in t.result, in memory of this process. */
1181 5 * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t);
1186 5 * sizeof (uint32_t)
1187 + t.level1_size * sizeof (uint32_t)
1188 + (t.level2_size << t.q) * sizeof (uint32_t);
1190 for (i = 0; i < 5; i++)
1191 fprintf (stream, "#define combclass_header_%d %d\n", i,
1192 ((uint32_t *) t.result)[i]);
1193 fprintf (stream, "static const\n");
1194 fprintf (stream, "struct\n");
1195 fprintf (stream, " {\n");
1196 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1197 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1198 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1199 fprintf (stream, " }\n");
1200 fprintf (stream, "u_combclass =\n");
1201 fprintf (stream, "{\n");
1202 fprintf (stream, " {");
1203 if (t.level1_size > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < t.level1_size; i++)
1208 if (i > 0 && (i % 8) == 0)
1209 fprintf (stream, "\n ");
1210 offset = ((uint32_t *) (t.result + level1_offset))[i];
1212 fprintf (stream, " %5d", -1);
1214 fprintf (stream, " %5zu",
1215 (offset - level2_offset) / sizeof (uint32_t));
1216 if (i+1 < t.level1_size)
1217 fprintf (stream, ",");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
1221 fprintf (stream, " },\n");
1222 fprintf (stream, " {");
1223 if (t.level2_size << t.q > 8)
1224 fprintf (stream, "\n ");
1225 for (i = 0; i < t.level2_size << t.q; i++)
1228 if (i > 0 && (i % 8) == 0)
1229 fprintf (stream, "\n ");
1230 offset = ((uint32_t *) (t.result + level2_offset))[i];
1232 fprintf (stream, " %5d", -1);
1234 fprintf (stream, " %5zu",
1235 (offset - level3_offset) / sizeof (uint8_t));
1236 if (i+1 < t.level2_size << t.q)
1237 fprintf (stream, ",");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " },\n");
1242 fprintf (stream, " {");
1243 if (t.level3_size << t.p > 8)
1244 fprintf (stream, "\n ");
1245 for (i = 0; i < t.level3_size << t.p; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1250 if (i+1 < t.level3_size << t.p)
1251 fprintf (stream, ",");
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream, "\n ");
1255 fprintf (stream, " }\n");
1256 fprintf (stream, "};\n");
1258 if (ferror (stream) || fclose (stream))
1260 fprintf (stderr, "error writing to '%s'\n", filename);
1265 /* ========================================================================= */
1267 /* Bidirectional category. */
1268 /* See Unicode 3.0 book, section 4.3. */
/* Enumerators for the bidirectional categories used in this section.  The
   enum header and closing brace are on lines not visible in this excerpt.
   UC_BIDI_L doubles as the DEFAULT element of bidi_category_table below.
   NOTE(review): the 19 enumerators presumably take the values 0..18, which
   fits the 5 bits per entry used when the level3 table is packed; confirm
   against the full declaration.  */
1273 UC_BIDI_L, /* Left-to-Right */
1274 UC_BIDI_LRE, /* Left-to-Right Embedding */
1275 UC_BIDI_LRO, /* Left-to-Right Override */
1276 UC_BIDI_R, /* Right-to-Left */
1277 UC_BIDI_AL, /* Right-to-Left Arabic */
1278 UC_BIDI_RLE, /* Right-to-Left Embedding */
1279 UC_BIDI_RLO, /* Right-to-Left Override */
1280 UC_BIDI_PDF, /* Pop Directional Format */
1281 UC_BIDI_EN, /* European Number */
1282 UC_BIDI_ES, /* European Number Separator */
1283 UC_BIDI_ET, /* European Number Terminator */
1284 UC_BIDI_AN, /* Arabic Number */
1285 UC_BIDI_CS, /* Common Number Separator */
1286 UC_BIDI_NSM, /* Non-Spacing Mark */
1287 UC_BIDI_BN, /* Boundary Neutral */
1288 UC_BIDI_B, /* Paragraph Separator */
1289 UC_BIDI_S, /* Segment Separator */
1290 UC_BIDI_WS, /* Whitespace */
1291 UC_BIDI_ON /* Other Neutral */
/* Maps a bidirectional category name (callers pass the bidi field of a
   UnicodeData.txt entry) to a category code.  The name is decoded one
   character at a time through nested switches, and a match additionally
   requires the terminating NUL directly after the last expected character.
   Only the switch/if skeleton is visible in this excerpt: the case labels
   and the return statements are on missing lines, so the exact
   name-to-value mapping cannot be read off here.
   NOTE(review): presumably each accepted name returns the corresponding
   UC_BIDI_* enumerator; confirm against the full source.  */
1295 bidi_category_byname (const char *category_name)
1297 switch (category_name[0])
1300 switch (category_name[1])
1303 if (category_name[2] == '\0')
1307 if (category_name[2] == '\0')
1313 switch (category_name[1])
1318 if (category_name[2] == '\0')
1324 switch (category_name[1])
1327 if (category_name[2] == '\0')
1333 switch (category_name[1])
1336 if (category_name[2] == '\0')
1340 if (category_name[2] == '\0')
1344 if (category_name[2] == '\0')
1350 switch (category_name[1])
1355 switch (category_name[2])
1358 if (category_name[3] == '\0')
1362 if (category_name[3] == '\0')
1370 switch (category_name[1])
1373 switch (category_name[2])
1376 if (category_name[3] == '\0')
1384 switch (category_name[1])
1387 if (category_name[2] == '\0')
1393 switch (category_name[1])
1396 switch (category_name[2])
1399 if (category_name[3] == '\0')
1407 switch (category_name[1])
1412 switch (category_name[2])
1415 if (category_name[3] == '\0')
1419 if (category_name[3] == '\0')
1427 if (category_name[1] == '\0')
1431 switch (category_name[1])
1434 if (category_name[2] == '\0')
/* Reached when no branch above accepted the name.  The statement that
   handles this case is not visible here.  */
1440 /* Invalid bidi category name. */
/* Returns the bidi category of code point CH.  Assigned characters (those
   with a non-NULL name) take the category named in their bidi field.  For
   unassigned code points the result is chosen by range; the three range
   tests below each select one category, but the return statements (and any
   final fallback) are on lines not visible in this excerpt.  */
1445 get_bidi_category (unsigned int ch)
1447 if (unicode_attributes[ch].name != NULL)
1448 return bidi_category_byname (unicode_attributes[ch].bidi);
1451 /* The bidi category of unassigned characters depends on the range.
1452 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges.  NOTE(review): presumably the right-to-left
   default; the returned enumerator is not visible.  */
1453 if ((ch >= 0x0590 && ch <= 0x05FF)
1454 || (ch >= 0x07FB && ch <= 0x08FF)
1455 || (ch >= 0xFB37 && ch <= 0xFB45)
1456 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges.  NOTE(review): presumably the Arabic
   right-to-left default; the returned enumerator is not visible.  */
1458 else if ((ch >= 0x0600 && ch <= 0x07BF)
1459 || (ch >= 0x2064 && ch <= 0x2069)
1460 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1461 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group: U+FDD0..U+FDEF, U+FFF0..U+FFFF, the last two code points of
   every plane (low 16 bits equal to FFFE or FFFF), and U+E0000..U+E0FFF.
   NOTE(review): presumably boundary neutral; the returned enumerator is not
   visible.  */
1463 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1464 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1465 || (ch & 0xFFFF) == 0xFFFE
1466 || (ch & 0xFFFF) == 0xFFFF
1467 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1474 /* Construction of sparse 3-level tables. */
/* Template parameters: TABLE names the generated struct and the prefix of
   its _init, _add and _finalize functions (used by output_bidi_category),
   ELEMENT is the stored value type and DEFAULT the value of code points that
   were never added.  NOTE(review): the include that consumes these macros is
   presumably on a line not visible in this excerpt.  */
1475 #define TABLE bidi_category_table
1476 #define ELEMENT uint8_t
1477 #define DEFAULT UC_BIDI_L
/* The allocator names are mapped straight onto the libc functions, so there
   is no out-of-memory wrapper behind them.  */
1478 #define xmalloc malloc
1479 #define xrealloc realloc
1482 /* Output the per-character bidi category table. */
/* Writes FILENAME as generated C text: a banner, five
   bidi_category_header_N defines taken from the first five 32-bit words of
   t.result, and the u_bidi_category structure with its three lookup levels.
   The level3 values are bit-packed, 5 bits per entry, into 16-bit units.
   NOTE(review): VERSION presumably fills the %s of the banner; its argument
   line, like several declarations, braces and assignment targets of this
   function, is not visible in this excerpt.  */
1484 output_bidi_category (const char *filename, const char *version)
1488 struct bidi_category_table t;
1489 unsigned int level1_offset, level2_offset, level3_offset;
1490 uint16_t *level3_packed;
1492 stream = fopen (filename, "w");
1495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1500 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1501 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1506 bidi_category_table_init (&t);
/* Record the category of every code point from 0 to 0x10FFFF.  */
1508 for (ch = 0; ch < 0x110000; ch++)
1510 int value = get_bidi_category (ch);
1512 bidi_category_table_add (&t, ch, value);
1515 bidi_category_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   5-word header, past the level1 words, and past the level2 words.
   NOTE(review): the assignment targets are on missing lines; presumably
   level1_offset, level2_offset and level3_offset in that order.  */
1517 /* Offsets in t.result, in memory of this process. */
1519 5 * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t);
1524 5 * sizeof (uint32_t)
1525 + t.level1_size * sizeof (uint32_t)
1526 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as preprocessor constants.  */
1528 for (i = 0; i < 5; i++)
1529 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1530 ((uint32_t *) t.result)[i]);
1531 fprintf (stream, "static const\n");
1532 fprintf (stream, "struct\n");
1533 fprintf (stream, " {\n");
1534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1536 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1537 (1 << t.p) * 5 / 16);
1538 fprintf (stream, " }\n");
1539 fprintf (stream, "u_bidi_category =\n");
1540 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  Each in-memory byte offset is
   rewritten as an index into the emitted level2 array.
   NOTE(review): the test choosing between the -1 form and the index form is
   on a missing line; presumably a zero offset means no level2 block.  */
1541 fprintf (stream, " {");
1542 if (t.level1_size > 8)
1543 fprintf (stream, "\n ");
1544 for (i = 0; i < t.level1_size; i++)
1547 if (i > 0 && (i % 8) == 0)
1548 fprintf (stream, "\n ");
1549 offset = ((uint32_t *) (t.result + level1_offset))[i];
1551 fprintf (stream, " %5d", -1);
1553 fprintf (stream, " %5zu",
1554 (offset - level2_offset) / sizeof (uint32_t));
1555 if (i+1 < t.level1_size)
1556 fprintf (stream, ",");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
1560 fprintf (stream, " },\n");
/* Level 2: same layout; offsets become indexes counted in unpacked level3
   elements of one byte each.  */
1561 fprintf (stream, " {");
1562 if (t.level2_size << t.q > 8)
1563 fprintf (stream, "\n ");
1564 for (i = 0; i < t.level2_size << t.q; i++)
1567 if (i > 0 && (i % 8) == 0)
1568 fprintf (stream, "\n ");
1569 offset = ((uint32_t *) (t.result + level2_offset))[i];
1571 fprintf (stream, " %5d", -1);
1573 fprintf (stream, " %5zu",
1574 (offset - level3_offset) / sizeof (uint8_t));
1575 if (i+1 < t.level2_size << t.q)
1576 fprintf (stream, ",");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
1580 fprintf (stream, " },\n");
1581 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1582 not 32-bit units, in order to make the lookup function easier. */
/* Entry i starts at bit k of unit j and may spill into unit j+1, hence the
   32-bit read-modify-write over two adjacent units and the one extra unit in
   the allocation.  NOTE(review): no NULL check of the calloc result is
   visible before the loop dereferences it.  */
1585 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1586 for (i = 0; i < t.level3_size << t.p; i++)
1588 unsigned int j = (i * 5) / 16;
1589 unsigned int k = (i * 5) % 16;
1590 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1591 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1592 level3_packed[j] = value & 0xffff;
1593 level3_packed[j+1] = value >> 16;
/* Level 3: the packed units, eight per output line, in hexadecimal.  */
1595 fprintf (stream, " {");
1596 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1597 fprintf (stream, "\n ");
1598 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1600 if (i > 0 && (i % 8) == 0)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " 0x%04x", level3_packed[i]);
1603 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1604 fprintf (stream, ",");
1606 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1607 fprintf (stream, "\n ");
1608 fprintf (stream, " }\n");
1609 free (level3_packed);
1610 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1612 if (ferror (stream) || fclose (stream))
1614 fprintf (stderr, "error writing to '%s'\n", filename);
1619 /* ========================================================================= */
1621 /* Decimal digit value. */
1622 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of code point CH, parsed with atoi from
   its decdigit field, when CH is assigned (non-NULL name) and that field is
   not empty.  The return statement for all other code points is on a line
   not visible in this excerpt.
   NOTE(review): the callers in this file accept results in -1..9, so the
   fallback is presumably -1; confirm against the full source.  */
1625 get_decdigit_value (unsigned int ch)
1627 if (unicode_attributes[ch].name != NULL
1628 && unicode_attributes[ch].decdigit[0] != '\0')
1629 return atoi (unicode_attributes[ch].decdigit);
1633 /* Construction of sparse 3-level tables. */
/* Template parameters for the table shared by output_decimal_digit and
   output_digit: TABLE names the struct and function prefix, ELEMENT is the
   stored value type.  NOTE(review): the DEFAULT definition and the include
   that consumes these macros are presumably on lines not visible here.  */
1634 #define TABLE decdigit_table
1635 #define ELEMENT uint8_t
/* The allocator names are mapped straight onto the libc functions.  */
1637 #define xmalloc malloc
1638 #define xrealloc realloc
1641 /* Output the unit test for the per-character decimal digit value table. */
/* Writes FILENAME as a banner followed by a comma-separated list of
   { code point, value } initializers, the code point in upper-case
   hexadecimal.  NOTE(review): the condition selecting which code points are
   listed, and the argument line passing VERSION to the banner, are not
   visible in this excerpt.  */
1643 output_decimal_digit_test (const char *filename, const char *version)
1649 stream = fopen (filename, "w");
1652 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1656 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1657 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1658 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1662 for (ch = 0; ch < 0x110000; ch++)
1664 int value = get_decdigit_value (ch);
/* Sanity check: anything outside -1..9 is rejected.  The statement taken on
   failure is on a missing line.  */
1666 if (!(value >= -1 && value < 10))
/* Separator between entries.  NOTE(review): presumably guarded so that it is
   skipped before the first entry; the guard is not visible.  */
1672 fprintf (stream, ",\n");
1673 fprintf (stream, " { 0x%04X, %d }", ch, value);
1678 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1680 if (ferror (stream) || fclose (stream))
1682 fprintf (stderr, "error writing to '%s'\n", filename);
1687 /* Output the per-character decimal digit value table. */
/* Writes FILENAME as generated C text: a banner, five decdigit_header_N
   defines taken from the first five 32-bit words of t.result, and the
   u_decdigit structure with its three lookup levels.  The table stores
   1 + the decimal digit value of each code point, and level3 is emitted with
   two 4-bit entries per byte.  NOTE(review): several declarations, braces,
   assignment targets and trailing arguments of this function are on lines
   not visible in this excerpt.  */
1689 output_decimal_digit (const char *filename, const char *version)
1693 struct decdigit_table t;
1694 unsigned int level1_offset, level2_offset, level3_offset;
1696 stream = fopen (filename, "w");
1699 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1703 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1704 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1705 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1710 decdigit_table_init (&t);
/* Values are biased by one so that they are never negative; the stored
   range is checked to be 0..10.  The statement taken when the check fails is
   on a missing line.  */
1712 for (ch = 0; ch < 0x110000; ch++)
1714 int value = 1 + get_decdigit_value (ch);
1716 if (!(value >= 0 && value <= 10))
1719 decdigit_table_add (&t, ch, value);
1722 decdigit_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   5-word header, past the level1 words, and past the level2 words.
   NOTE(review): the assignment targets are on missing lines; presumably
   level1_offset, level2_offset and level3_offset in that order.  */
1724 /* Offsets in t.result, in memory of this process. */
1726 5 * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t);
1731 5 * sizeof (uint32_t)
1732 + t.level1_size * sizeof (uint32_t)
1733 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as preprocessor constants.  */
1735 for (i = 0; i < 5; i++)
1736 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1737 ((uint32_t *) t.result)[i]);
1738 fprintf (stream, "static const\n");
1739 fprintf (stream, "struct\n");
1740 fprintf (stream, " {\n");
1741 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1742 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1743 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1745 fprintf (stream, " }\n");
1746 fprintf (stream, "u_decdigit =\n");
1747 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; byte offsets become indexes into
   the emitted level2 array.  NOTE(review): the test choosing between the -1
   form and the index form is on a missing line.  */
1748 fprintf (stream, " {");
1749 if (t.level1_size > 8)
1750 fprintf (stream, "\n ");
1751 for (i = 0; i < t.level1_size; i++)
1754 if (i > 0 && (i % 8) == 0)
1755 fprintf (stream, "\n ");
1756 offset = ((uint32_t *) (t.result + level1_offset))[i];
1758 fprintf (stream, " %5d", -1);
1760 fprintf (stream, " %5zu",
1761 (offset - level2_offset) / sizeof (uint32_t));
1762 if (i+1 < t.level1_size)
1763 fprintf (stream, ",");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
1767 fprintf (stream, " },\n");
/* Level 2: same layout; offsets become indexes counted in unpacked level3
   elements of one byte each.  */
1768 fprintf (stream, " {");
1769 if (t.level2_size << t.q > 8)
1770 fprintf (stream, "\n ");
1771 for (i = 0; i < t.level2_size << t.q; i++)
1774 if (i > 0 && (i % 8) == 0)
1775 fprintf (stream, "\n ");
1776 offset = ((uint32_t *) (t.result + level2_offset))[i];
1778 fprintf (stream, " %5d", -1);
1780 fprintf (stream, " %5zu",
1781 (offset - level3_offset) / sizeof (uint8_t));
1782 if (i+1 < t.level2_size << t.q)
1783 fprintf (stream, ",");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
1787 fprintf (stream, " },\n");
1788 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i carries element 2*i in its low nibble and element 2*i+1 in
   its high nibble, so only half as many bytes as elements are written.  */
1789 fprintf (stream, " {");
1790 if (t.level3_size << (t.p - 1) > 8)
1791 fprintf (stream, "\n ");
1792 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1794 if (i > 0 && (i % 8) == 0)
1795 fprintf (stream, "\n ");
1796 fprintf (stream, " 0x%02x",
1797 ((uint8_t *) (t.result + level3_offset))[2*i]
1798 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1799 if (i+1 < t.level3_size << (t.p - 1))
1800 fprintf (stream, ",");
1802 if (t.level3_size << (t.p - 1) > 8)
1803 fprintf (stream, "\n ");
1804 fprintf (stream, " }\n");
1805 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1807 if (ferror (stream) || fclose (stream))
1809 fprintf (stderr, "error writing to '%s'\n", filename);
1814 /* ========================================================================= */
1817 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of code point CH, parsed with atoi from its digit
   field, when CH is assigned (non-NULL name) and that field is not empty.
   The return statement for all other code points is on a line not visible in
   this excerpt.
   NOTE(review): the callers in this file accept results in -1..9, so the
   fallback is presumably -1; confirm against the full source.  */
1820 get_digit_value (unsigned int ch)
1822 if (unicode_attributes[ch].name != NULL
1823 && unicode_attributes[ch].digit[0] != '\0')
1824 return atoi (unicode_attributes[ch].digit);
1828 /* Output the unit test for the per-character digit value table. */
/* Writes FILENAME as a banner followed by a comma-separated list of
   { code point, value } initializers, the code point in upper-case
   hexadecimal.  NOTE(review): the condition selecting which code points are
   listed, and the argument line passing VERSION to the banner, are not
   visible in this excerpt.  */
1830 output_digit_test (const char *filename, const char *version)
1836 stream = fopen (filename, "w");
1839 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1843 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1844 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1845 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1849 for (ch = 0; ch < 0x110000; ch++)
1851 int value = get_digit_value (ch);
/* Sanity check: anything outside -1..9 is rejected.  The statement taken on
   failure is on a missing line.  */
1853 if (!(value >= -1 && value < 10))
/* Separator between entries.  NOTE(review): presumably guarded so that it is
   skipped before the first entry; the guard is not visible.  */
1859 fprintf (stream, ",\n");
1860 fprintf (stream, " { 0x%04X, %d }", ch, value);
1865 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1867 if (ferror (stream) || fclose (stream))
1869 fprintf (stderr, "error writing to '%s'\n", filename);
1874 /* Output the per-character digit value table. */
/* Writes FILENAME as generated C text: a banner, five digit_header_N
   defines taken from the first five 32-bit words of t.result, and the
   u_digit structure with its three lookup levels.  It reuses the
   decdigit_table type and functions, stores 1 + the digit value of each code
   point, and emits level3 with two 4-bit entries per byte.
   NOTE(review): several declarations, braces, assignment targets and
   trailing arguments of this function are on lines not visible in this
   excerpt.  */
1876 output_digit (const char *filename, const char *version)
1880 struct decdigit_table t;
1881 unsigned int level1_offset, level2_offset, level3_offset;
1883 stream = fopen (filename, "w");
1886 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1890 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1891 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1892 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1897 decdigit_table_init (&t);
/* Values are biased by one so that they are never negative; the stored
   range is checked to be 0..10.  The statement taken when the check fails is
   on a missing line.  */
1899 for (ch = 0; ch < 0x110000; ch++)
1901 int value = 1 + get_digit_value (ch);
1903 if (!(value >= 0 && value <= 10))
1906 decdigit_table_add (&t, ch, value);
1909 decdigit_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   5-word header, past the level1 words, and past the level2 words.
   NOTE(review): the assignment targets are on missing lines; presumably
   level1_offset, level2_offset and level3_offset in that order.  */
1911 /* Offsets in t.result, in memory of this process. */
1913 5 * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t);
1918 5 * sizeof (uint32_t)
1919 + t.level1_size * sizeof (uint32_t)
1920 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as preprocessor constants.  */
1922 for (i = 0; i < 5; i++)
1923 fprintf (stream, "#define digit_header_%d %d\n", i,
1924 ((uint32_t *) t.result)[i]);
1925 fprintf (stream, "static const\n");
1926 fprintf (stream, "struct\n");
1927 fprintf (stream, " {\n");
1928 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1929 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1930 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1932 fprintf (stream, " }\n");
1933 fprintf (stream, "u_digit =\n");
1934 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; byte offsets become indexes into
   the emitted level2 array.  NOTE(review): the test choosing between the -1
   form and the index form is on a missing line.  */
1935 fprintf (stream, " {");
1936 if (t.level1_size > 8)
1937 fprintf (stream, "\n ");
1938 for (i = 0; i < t.level1_size; i++)
1941 if (i > 0 && (i % 8) == 0)
1942 fprintf (stream, "\n ");
1943 offset = ((uint32_t *) (t.result + level1_offset))[i];
1945 fprintf (stream, " %5d", -1);
1947 fprintf (stream, " %5zu",
1948 (offset - level2_offset) / sizeof (uint32_t));
1949 if (i+1 < t.level1_size)
1950 fprintf (stream, ",");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
1954 fprintf (stream, " },\n");
/* Level 2: same layout; offsets become indexes counted in unpacked level3
   elements of one byte each.  */
1955 fprintf (stream, " {");
1956 if (t.level2_size << t.q > 8)
1957 fprintf (stream, "\n ");
1958 for (i = 0; i < t.level2_size << t.q; i++)
1961 if (i > 0 && (i % 8) == 0)
1962 fprintf (stream, "\n ");
1963 offset = ((uint32_t *) (t.result + level2_offset))[i];
1965 fprintf (stream, " %5d", -1);
1967 fprintf (stream, " %5zu",
1968 (offset - level3_offset) / sizeof (uint8_t));
1969 if (i+1 < t.level2_size << t.q)
1970 fprintf (stream, ",");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
1974 fprintf (stream, " },\n");
1975 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i carries element 2*i in its low nibble and element 2*i+1 in
   its high nibble, so only half as many bytes as elements are written.  */
1976 fprintf (stream, " {");
1977 if (t.level3_size << (t.p - 1) > 8)
1978 fprintf (stream, "\n ");
1979 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1981 if (i > 0 && (i % 8) == 0)
1982 fprintf (stream, "\n ");
1983 fprintf (stream, " 0x%02x",
1984 ((uint8_t *) (t.result + level3_offset))[2*i]
1985 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1986 if (i+1 < t.level3_size << (t.p - 1))
1987 fprintf (stream, ",");
1989 if (t.level3_size << (t.p - 1) > 8)
1990 fprintf (stream, "\n ");
1991 fprintf (stream, " }\n");
1992 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1994 if (ferror (stream) || fclose (stream))
1996 fprintf (stderr, "error writing to '%s'\n", filename);
2001 /* ========================================================================= */
2003 /* Numeric value. */
2004 /* See Unicode 3.0 book, section 4.6. */
2006 typedef struct { int numerator; int denominator; } uc_fraction_t;
2008 static uc_fraction_t
2009 get_numeric_value (unsigned int ch)
2011 uc_fraction_t value;
2013 if (unicode_attributes[ch].name != NULL
2014 && unicode_attributes[ch].numeric[0] != '\0')
2016 const char *str = unicode_attributes[ch].numeric;
2017 /* str is of the form "integer" or "integer/posinteger". */
2018 value.numerator = atoi (str);
2019 if (strchr (str, '/') != NULL)
2020 value.denominator = atoi (strchr (str, '/') + 1);
2022 value.denominator = 1;
2026 value.numerator = 0;
2027 value.denominator = 0;
2032 /* Output the unit test for the per-character numeric value table. */
/* Writes FILENAME as a banner followed by a comma-separated list of
   { code point, numerator, denominator } initializers for the code points
   whose numeric value is not the 0/0 marker.  NOTE(review): the argument
   line passing VERSION to the banner and the braces of the loop body are not
   visible in this excerpt.  */
2034 output_numeric_test (const char *filename, const char *version)
2040 stream = fopen (filename, "w");
2043 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2047 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2048 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2049 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2053 for (ch = 0; ch < 0x110000; ch++)
2055 uc_fraction_t value = get_numeric_value (ch);
/* Skip code points that carry the 0/0 marker.  */
2057 if (value.numerator != 0 || value.denominator != 0)
/* Separator between entries.  NOTE(review): presumably guarded so that it is
   skipped before the first entry; the guard is not visible.  */
2060 fprintf (stream, ",\n");
2061 fprintf (stream, " { 0x%04X, %d, %d }",
2062 ch, value.numerator, value.denominator);
2067 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2069 if (ferror (stream) || fclose (stream))
2071 fprintf (stderr, "error writing to '%s'\n", filename);
2076 /* Construction of sparse 3-level tables. */
/* Template parameters for the table used by output_numeric: TABLE names the
   struct and function prefix, ELEMENT is the stored value type (an index
   into the list of distinct fractions).  NOTE(review): the DEFAULT
   definition and the include that consumes these macros are presumably on
   lines not visible here.  */
2077 #define TABLE numeric_table
2078 #define ELEMENT uint8_t
/* The allocator names are mapped straight onto the libc functions.  */
2080 #define xmalloc malloc
2081 #define xrealloc realloc
2084 /* Output the per-character numeric value table. */
/* Writes FILENAME as generated C text in two parts.  First the array
   u_numeric_values listing every distinct fraction returned by
   get_numeric_value, then five numeric_header_N defines and the u_numeric
   structure, a 3-level table mapping each code point to the index of its
   fraction in that array.  Level3 indexes are bit-packed, 7 bits per entry,
   into 16-bit units, which matches the limit of 128 distinct fractions.
   NOTE(review): several declarations, braces, break statements, assignment
   targets and trailing arguments of this function are on lines not visible
   in this excerpt.  */
2086 output_numeric (const char *filename, const char *version)
2089 uc_fraction_t fractions[128];
2090 unsigned int nfractions;
2091 unsigned int ch, i, j;
2092 struct numeric_table t;
2093 unsigned int level1_offset, level2_offset, level3_offset;
2094 uint16_t *level3_packed;
2096 stream = fopen (filename, "w");
2099 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2103 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2104 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2105 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2108 /* Create table of occurring fractions. */
2110 for (ch = 0; ch < 0x110000; ch++)
2112 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for an existing equal fraction.  */
2114 for (i = 0; i < nfractions; i++)
2115 if (value.numerator == fractions[i].numerator
2116 && value.denominator == fractions[i].denominator)
/* Not found: insert it, keeping the array ordered by denominator first and
   numerator second.  The capacity test guards the fixed 128-entry array; the
   statement taken when it is full is on a missing line.  */
2118 if (i == nfractions)
2120 if (nfractions == 128)
2122 for (i = 0; i < nfractions; i++)
2123 if (value.denominator < fractions[i].denominator
2124 || (value.denominator == fractions[i].denominator
2125 && value.numerator < fractions[i].numerator))
/* Shift the tail up by one slot and store the new fraction at position i.  */
2127 for (j = nfractions; j > i; j--)
2128 fractions[j] = fractions[j - 1];
2129 fractions[i] = value;
2134 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2136 fprintf (stream, "{\n");
2137 for (i = 0; i < nfractions; i++)
2139 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2140 fractions[i].denominator);
2141 if (i+1 < nfractions)
2142 fprintf (stream, ",");
2143 fprintf (stream, "\n");
2145 fprintf (stream, "};\n");
2149 numeric_table_init (&t);
/* Second pass: store, for each code point, the index of its fraction.  The
   statement taken when a fraction is unexpectedly absent is on a missing
   line.  */
2151 for (ch = 0; ch < 0x110000; ch++)
2153 uc_fraction_t value = get_numeric_value (ch);
2155 for (i = 0; i < nfractions; i++)
2156 if (value.numerator == fractions[i].numerator
2157 && value.denominator == fractions[i].denominator)
2159 if (i == nfractions)
2162 numeric_table_add (&t, ch, i);
2165 numeric_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   5-word header, past the level1 words, and past the level2 words.
   NOTE(review): the assignment targets are on missing lines; presumably
   level1_offset, level2_offset and level3_offset in that order.  */
2167 /* Offsets in t.result, in memory of this process. */
2169 5 * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t);
2174 5 * sizeof (uint32_t)
2175 + t.level1_size * sizeof (uint32_t)
2176 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as preprocessor constants.  */
2178 for (i = 0; i < 5; i++)
2179 fprintf (stream, "#define numeric_header_%d %d\n", i,
2180 ((uint32_t *) t.result)[i]);
2181 fprintf (stream, "static const\n");
2182 fprintf (stream, "struct\n");
2183 fprintf (stream, " {\n");
2184 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2185 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2186 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2187 (1 << t.p) * 7 / 16);
2188 fprintf (stream, " }\n");
2189 fprintf (stream, "u_numeric =\n");
2190 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; byte offsets become indexes into
   the emitted level2 array.  NOTE(review): the test choosing between the -1
   form and the index form is on a missing line.  */
2191 fprintf (stream, " {");
2192 if (t.level1_size > 8)
2193 fprintf (stream, "\n ");
2194 for (i = 0; i < t.level1_size; i++)
2197 if (i > 0 && (i % 8) == 0)
2198 fprintf (stream, "\n ");
2199 offset = ((uint32_t *) (t.result + level1_offset))[i];
2201 fprintf (stream, " %5d", -1);
2203 fprintf (stream, " %5zu",
2204 (offset - level2_offset) / sizeof (uint32_t));
2205 if (i+1 < t.level1_size)
2206 fprintf (stream, ",");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
2210 fprintf (stream, " },\n");
/* Level 2: same layout; offsets become indexes counted in unpacked level3
   elements of one byte each.  */
2211 fprintf (stream, " {");
2212 if (t.level2_size << t.q > 8)
2213 fprintf (stream, "\n ");
2214 for (i = 0; i < t.level2_size << t.q; i++)
2217 if (i > 0 && (i % 8) == 0)
2218 fprintf (stream, "\n ");
2219 offset = ((uint32_t *) (t.result + level2_offset))[i];
2221 fprintf (stream, " %5d", -1);
2223 fprintf (stream, " %5zu",
2224 (offset - level3_offset) / sizeof (uint8_t));
2225 if (i+1 < t.level2_size << t.q)
2226 fprintf (stream, ",");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
2230 fprintf (stream, " },\n");
2231 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2232 not 32-bit units, in order to make the lookup function easier. */
/* Entry i starts at bit k of unit j and may spill into unit j+1, hence the
   32-bit read-modify-write over two adjacent units and the one extra unit in
   the allocation.  NOTE(review): no NULL check of the calloc result is
   visible before the loop dereferences it.  */
2235 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2236 for (i = 0; i < t.level3_size << t.p; i++)
2238 unsigned int j = (i * 7) / 16;
2239 unsigned int k = (i * 7) % 16;
2240 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2241 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2242 level3_packed[j] = value & 0xffff;
2243 level3_packed[j+1] = value >> 16;
/* Level 3: the packed units, eight per output line, in hexadecimal.  */
2245 fprintf (stream, " {");
2246 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2247 fprintf (stream, "\n ");
2248 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2250 if (i > 0 && (i % 8) == 0)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " 0x%04x", level3_packed[i]);
2253 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2254 fprintf (stream, ",");
2256 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2257 fprintf (stream, "\n ");
2258 fprintf (stream, " }\n");
2259 free (level3_packed);
2260 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2262 if (ferror (stream) || fclose (stream))
2264 fprintf (stderr, "error writing to '%s'\n", filename);
2269 /* ========================================================================= */
2272 /* See Unicode 3.0 book, section 4.7. */
2275 /* List of mirrored character pairs. This is a subset of the characters
2276 having the BidiMirrored property. */
/* Table of two-element rows, each row holding a character and its mirror
   image.  The initializer rows are on lines not visible in this excerpt.  */
2277 static unsigned int mirror_pairs[][2] =
/* Computes the mirroring information for code point CH.  The pair table is
   searched in both directions, so either member of a pair finds the other;
   0xfffd serves as the marker for no partner found.  One visible return
   yields the signed distance from CH to its mirror character.
   NOTE(review): the conditions selecting between the return paths, and the
   other return statements, are on lines not visible here.  */
2334 get_mirror_value (unsigned int ch)
2337 unsigned int mirror_char;
/* Only assigned characters (non-NULL name) can count as mirrored.  */
2340 mirrored = (unicode_attributes[ch].name != NULL
2341 && unicode_attributes[ch].mirrored);
2342 mirror_char = 0xfffd;
2343 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2344 if (ch == mirror_pairs[i][0])
2346 mirror_char = mirror_pairs[i][1];
2349 else if (ch == mirror_pairs[i][1])
2351 mirror_char = mirror_pairs[i][0];
2355 return (int) mirror_char - (int) ch;
/* A partner was found although this path was reached.  The statement taken
   in that case is on a missing line.  */
2358 if (mirror_char != 0xfffd)
2364 /* Construction of sparse 3-level tables. */
/* Template parameters for the table used by output_mirror: TABLE names the
   struct and function prefix, ELEMENT is the stored value type, signed here
   because the stored values are differences between code points.
   NOTE(review): the DEFAULT definition and the include that consumes these
   macros are presumably on lines not visible here.  */
2365 #define TABLE mirror_table
2366 #define ELEMENT int32_t
/* The allocator names are mapped straight onto the libc functions.  */
2368 #define xmalloc malloc
2369 #define xrealloc realloc
2372 /* Output the per-character mirror table. */
/* Writes FILENAME as generated C text: a banner, five mirror_header_N
   defines taken from the first five 32-bit words of t.result, and the
   u_mirror structure with its three lookup levels.  Level3 holds the value
   of get_mirror_value for each code point as a plain int, without bit
   packing.  NOTE(review): several declarations, braces, assignment targets
   and trailing arguments of this function are on lines not visible in this
   excerpt.  */
2374 output_mirror (const char *filename, const char *version)
2378 struct mirror_table t;
2379 unsigned int level1_offset, level2_offset, level3_offset;
2381 stream = fopen (filename, "w");
2384 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2388 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2389 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2390 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2395 mirror_table_init (&t);
/* Record the mirror value of every code point from 0 to 0x10FFFF.  */
2397 for (ch = 0; ch < 0x110000; ch++)
2399 int value = get_mirror_value (ch);
2401 mirror_table_add (&t, ch, value);
2404 mirror_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: past the
   5-word header, past the level1 words, and past the level2 words.
   NOTE(review): the assignment targets are on missing lines; presumably
   level1_offset, level2_offset and level3_offset in that order.  */
2406 /* Offsets in t.result, in memory of this process. */
2408 5 * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t);
2413 5 * sizeof (uint32_t)
2414 + t.level1_size * sizeof (uint32_t)
2415 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the five header words as preprocessor constants.  */
2417 for (i = 0; i < 5; i++)
2418 fprintf (stream, "#define mirror_header_%d %d\n", i,
2419 ((uint32_t *) t.result)[i]);
2420 fprintf (stream, "static const\n");
2421 fprintf (stream, "struct\n");
2422 fprintf (stream, " {\n");
2423 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2424 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2425 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2426 fprintf (stream, " }\n");
2427 fprintf (stream, "u_mirror =\n");
2428 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; byte offsets become indexes into
   the emitted level2 array.  NOTE(review): the test choosing between the -1
   form and the index form is on a missing line.  */
2429 fprintf (stream, " {");
2430 if (t.level1_size > 8)
2431 fprintf (stream, "\n ");
2432 for (i = 0; i < t.level1_size; i++)
2435 if (i > 0 && (i % 8) == 0)
2436 fprintf (stream, "\n ");
2437 offset = ((uint32_t *) (t.result + level1_offset))[i];
2439 fprintf (stream, " %5d", -1);
2441 fprintf (stream, " %5zu",
2442 (offset - level2_offset) / sizeof (uint32_t));
2443 if (i+1 < t.level1_size)
2444 fprintf (stream, ",");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
2448 fprintf (stream, " },\n");
/* Level 2: same layout; offsets become indexes counted in level3 elements of
   four bytes each.  */
2449 fprintf (stream, " {");
2450 if (t.level2_size << t.q > 8)
2451 fprintf (stream, "\n ");
2452 for (i = 0; i < t.level2_size << t.q; i++)
2455 if (i > 0 && (i % 8) == 0)
2456 fprintf (stream, "\n ");
2457 offset = ((uint32_t *) (t.result + level2_offset))[i];
2459 fprintf (stream, " %5d", -1);
2461 fprintf (stream, " %5zu",
2462 (offset - level3_offset) / sizeof (int32_t));
2463 if (i+1 < t.level2_size << t.q)
2464 fprintf (stream, ",");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " },\n");
/* Level 3: the signed values themselves, eight per output line.  */
2469 fprintf (stream, " {");
2470 if (t.level3_size << t.p > 8)
2471 fprintf (stream, "\n ");
2472 for (i = 0; i < t.level3_size << t.p; i++)
2474 if (i > 0 && (i % 8) == 0)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2477 if (i+1 < t.level3_size << t.p)
2478 fprintf (stream, ",");
2480 if (t.level3_size << t.p > 8)
2481 fprintf (stream, "\n ");
2482 fprintf (stream, " }\n");
2483 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2485 if (ferror (stream) || fclose (stream))
2487 fprintf (stderr, "error writing to '%s'\n", filename);
2492 /* ========================================================================= */
2494 /* Particular values of the word break property. */
/* Returns 1 if CH is one of the eight code points this program treats as
   having the MidNumLet word break property value, and 0 otherwise.  The
   list is hard-coded rather than read from a data file.  */
static int
is_WBP_MIDNUMLET (unsigned int ch)
{
  return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
          || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E);
}
/* Returns 1 if CH is one of the eight code points this program treats as
   having the MidLetter word break property value, and 0 otherwise.  The
   list is hard-coded rather than read from a data file.  */
static int
is_WBP_MIDLETTER (unsigned int ch)
{
  return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
          || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A);
}
2510 /* ========================================================================= */
2514 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties uses each one as a bit position in
   the 64-bit word kept per code point in unicode_properties.  The enum
   header, the closing brace and the shorter enumerator lines are not visible
   in this excerpt, so this list is incomplete and the numeric values cannot
   be derived from it.  */
2523 PROP_QUOTATION_MARK,
2524 PROP_TERMINAL_PUNCTUATION,
2527 PROP_ASCII_HEX_DIGIT,
2528 PROP_OTHER_ALPHABETIC,
2532 PROP_OTHER_LOWERCASE,
2533 PROP_OTHER_UPPERCASE,
2534 PROP_NONCHARACTER_CODE_POINT,
2535 PROP_OTHER_GRAPHEME_EXTEND,
2536 PROP_IDS_BINARY_OPERATOR,
2537 PROP_IDS_TRINARY_OPERATOR,
2539 PROP_UNIFIED_IDEOGRAPH,
2540 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2543 PROP_LOGICAL_ORDER_EXCEPTION,
2544 PROP_OTHER_ID_START,
2545 PROP_OTHER_ID_CONTINUE,
2547 PROP_VARIATION_SELECTOR,
2548 PROP_PATTERN_WHITE_SPACE,
2549 PROP_PATTERN_SYNTAX,
2550 /* DerivedCoreProperties.txt */
2556 PROP_CASE_IGNORABLE,
2557 PROP_CHANGES_WHEN_LOWERCASED,
2558 PROP_CHANGES_WHEN_UPPERCASED,
2559 PROP_CHANGES_WHEN_TITLECASED,
2560 PROP_CHANGES_WHEN_CASEFOLDED,
2561 PROP_CHANGES_WHEN_CASEMAPPED,
2566 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2567 PROP_GRAPHEME_EXTEND,
/* One 64-bit word per Unicode code point (0 .. 0x10FFFF).  Bit N is set
   when the code point has the property whose PROP_* identifier equals N;
   the bits are set by fill_properties.  */
unsigned long long unicode_properties[0x110000];

/* Resets the property set of every code point to empty.  */
static void
clear_properties (void)
{
  /* The bound is derived from the array so that the two cannot drift
     apart.  */
  const unsigned int count =
    sizeof (unicode_properties) / sizeof (unicode_properties[0]);
  unsigned int i;

  for (i = 0; i < count; i++)
    unicode_properties[i] = 0;
}
2582 /* Stores in unicode_properties[] the properties from the
2583 PropList.txt or DerivedCoreProperties.txt file. */
/* Reads PROPLIST_FILENAME line by line.  Empty lines and lines starting with
   a hash sign are tested for separately.  Every other line must have the
   form of a hexadecimal range FIRST..LAST or a single hexadecimal code
   point, followed by blanks or semicolons and a property name.  The name is
   translated to its PROP_* identifier and the corresponding bit is set in
   unicode_properties for every code point of the range.
   NOTE(review): the loop header, the declaration of buf, the statements
   taken on errors and several braces are on lines not visible in this
   excerpt.  */
2585 fill_properties (const char *proplist_filename)
2590 stream = fopen (proplist_filename, "r");
2593 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2600 unsigned int i1, i2;
2601 char padding[200+1];
2602 char propname[200+1];
2603 unsigned int propvalue;
/* At most 200 characters of a line are stored in buf; the trailing newline
   directive also consumes the whitespace that follows.  */
2605 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2608 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  The scanset
   conversions carry no field width; they stay within padding and propname
   only because buf itself holds at most 200 characters.  */
2611 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2613 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2615 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP use expands to one link of an if / else chain and ends in a
   dangling else, so the statement following the last use is the branch taken
   for unknown names.  */
2620 #define PROP(name,value) \
2621 if (strcmp (propname, name) == 0) propvalue = value; else
2623 PROP ("White_Space", PROP_WHITE_SPACE)
2624 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2625 PROP ("Join_Control", PROP_JOIN_CONTROL)
2626 PROP ("Dash", PROP_DASH)
2627 PROP ("Hyphen", PROP_HYPHEN)
2628 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2629 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2630 PROP ("Other_Math", PROP_OTHER_MATH)
2631 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2632 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2633 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2634 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2635 PROP ("Diacritic", PROP_DIACRITIC)
2636 PROP ("Extender", PROP_EXTENDER)
2637 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2638 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2639 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2640 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2641 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2642 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2643 PROP ("Radical", PROP_RADICAL)
2644 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2645 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2646 PROP ("Deprecated", PROP_DEPRECATED)
2647 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2648 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2649 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2650 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2651 PROP ("STerm", PROP_STERM)
2652 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2653 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2654 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2655 /* DerivedCoreProperties.txt */
2656 PROP ("Math", PROP_MATH)
2657 PROP ("Alphabetic", PROP_ALPHABETIC)
2658 PROP ("Lowercase", PROP_LOWERCASE)
2659 PROP ("Uppercase", PROP_UPPERCASE)
2660 PROP ("Cased", PROP_CASED)
2661 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2662 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2663 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2664 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2665 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2666 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2667 PROP ("ID_Start", PROP_ID_START)
2668 PROP ("ID_Continue", PROP_ID_CONTINUE)
2669 PROP ("XID_Start", PROP_XID_START)
2670 PROP ("XID_Continue", PROP_XID_CONTINUE)
2671 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2672 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2673 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2674 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2677 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* Reject reversed ranges and code points beyond U+10FFFF before indexing the
   array.  The statement taken on failure is on a missing line.  */
2681 if (!(i1 <= i2 && i2 < 0x110000))
/* The shift is done on a 64-bit constant because property identifiers can
   exceed 31.  */
2684 for (i = i1; i <= i2; i++)
2685 unicode_properties[i] |= 1ULL << propvalue;
/* A pending stream error or a failing fclose is reported as a read error.  */
2688 if (ferror (stream) || fclose (stream))
2690 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2695 /* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.
   ARRAY has one slot per code point (0 .. 0x10FFFF).  PROPLIST_FILENAME is
   opened for reading and scanned for the first line that contains
   PROPERTY_NAME; the lines after it are parsed either as a range
   "XXXX..XXXX" or as a single "XXXX" code point (at most 4 hex digits each).
   NOTE(review): the loop bodies, error exits and braces of this function
   are not visible in this listing -- confirm what is stored into ARRAY and
   how the entry loop terminates.  */
2698 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
/* Visit every slot of ARRAY before the file is read.  */
2704 for (i = 0; i < 0x110000; i++)
2707 stream = fopen (proplist_filename, "r");
2710 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2714 /* Search for the "Property dump for: ..." line. */
/* Lines are read with a 100-character limit; the trailing "\n" in the
   format also consumes the whitespace that follows the line.  */
2717 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2719 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2723 while (strstr (buf, property_name) == NULL);
2727 unsigned int i1, i2;
2729 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A range entry has ".." at offsets 4 and 5.  */
2733 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2735 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2737 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least 4 characters is read as one code point.  */
2742 else if (strlen (buf) >= 4)
2744 if (sscanf (buf, "%4X", &i1) < 1)
2746 fprintf (stderr, "parse error in property in '%s'\n",
2754 fprintf (stderr, "parse error in property in '%s'\n",
/* Sanity check: the range must be ordered and inside the Unicode code
   space.  */
2758 if (!(i1 <= i2 && i2 < 0x110000))
2760 for (i = i1; i <= i2; i++)
/* A read error and a failing fclose are reported the same way.  */
2763 if (ferror (stream) || fclose (stream))
2765 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2770 /* Properties from Unicode 3.0 PropList.txt file. */
/* Both arrays have one entry per code point (0 .. 0x10FFFF) and are passed
   to fill_property30 by fill_properties30.  */
2772 /* The paired punctuation property from the PropList.txt file. */
/* Filled from the section selected by "(Paired Punctuation)".  */
2773 char unicode_pairedpunctuation[0x110000];
2775 /* The left of pair property from the PropList.txt file. */
/* Filled from the section selected by "(Left of Pair)".  */
2776 char unicode_leftofpair[0x110000];
2779 fill_properties30 (const char *proplist30_filename)
2781 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2782 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2785 /* ------------------------------------------------------------------------- */
2787 /* See PropList.txt, UCD.html. */
2789 is_property_white_space (unsigned int ch)
2791 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2794 /* See Unicode 3.0 book, section 4.10,
2795 PropList.txt, UCD.html,
2796 DerivedCoreProperties.txt, UCD.html. */
/* The Alphabetic property of CH is evaluated two ways and the two values
   are compared: one is an OR of the Other_Alphabetic bit with the
   hard-coded code point ranges below, the other is the PROP_ALPHABETIC bit
   of unicode_properties.
   NOTE(review): the first term of the OR chain, the declarations of
   result1/result2 and the statements after the comparison are not visible
   in this listing -- confirm them against the full file.  */
2798 is_property_alphabetic (unsigned int ch)
2802 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2803 /* For some reason, the following are listed as having property
2804 Alphabetic but not as having property Other_Alphabetic. */
2805 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2806 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2807 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2808 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2809 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2810 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2811 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2812 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
2813 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2814 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2815 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2816 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2817 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
/* Value read directly from the DerivedCoreProperties.txt data.  */
2819 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* Consistency check between the two derivations.  */
2821 if (result1 != result2)
2826 /* See PropList.txt, UCD.html. */
2828 is_property_other_alphabetic (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2833 /* See PropList.txt, UCD.html. */
2835 is_property_not_a_character (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2840 /* See PropList.txt, UCD.html,
2841 DerivedCoreProperties.txt, UCD.html. */
/* The Default_Ignorable_Code_Point property of CH is evaluated two ways and
   the two values are compared: one is derived from category Cf (minus the
   exclusions below) OR Other_Default_Ignorable_Code_Point OR
   Variation_Selector; the other is the PROP_DEFAULT_IGNORABLE_CODE_POINT
   bit of unicode_properties.
   NOTE(review): the declarations of result1/result2 and the statements
   after the comparison are not visible in this listing.  */
2843 is_property_default_ignorable_code_point (unsigned int ch)
2846 (is_category_Cf (ch)
2847 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
/* Further Cf code points excluded by explicit value.  */
2848 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
2849 /* For some reason, the following are not listed as having property
2850 Default_Ignorable_Code_Point. */
2851 && !(ch == 0x110BD))
2852 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2853 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Value read directly from the DerivedCoreProperties.txt data.  */
2855 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* Consistency check between the two derivations.  */
2857 if (result1 != result2)
2862 /* See PropList.txt, UCD.html. */
2864 is_property_other_default_ignorable_code_point (unsigned int ch)
2866 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2869 /* See PropList.txt, UCD.html. */
2871 is_property_deprecated (unsigned int ch)
2873 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2876 /* See PropList.txt, UCD.html. */
2878 is_property_logical_order_exception (unsigned int ch)
2880 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2883 /* See PropList.txt, UCD.html. */
2885 is_property_variation_selector (unsigned int ch)
2887 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Private use: true when CH lies in one of the three private use ranges.
   The ranges were determined through
   "grep 'Private Use,' UnicodeData-3.1.0.txt".
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Range inside the Basic Multilingual Plane.  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  /* Plane 15.  */
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  /* Plane 16.  */
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
/* Unassigned code value: CH is in category Cn but is not a noncharacter.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
2907 /* See PropList.txt, UCD.html,
2908 DerivedCoreProperties.txt, UCD.html. */
/* The Uppercase property of CH is evaluated two ways and the two values are
   compared: one includes the Other_Uppercase bit, the other is the
   PROP_UPPERCASE bit of unicode_properties.
   NOTE(review): the first term of the OR, the declarations of
   result1/result2 and the statements after the comparison are not visible
   in this listing.  */
2910 is_property_uppercase (unsigned int ch)
2914 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* Value read directly from the DerivedCoreProperties.txt data.  */
2916 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
/* Consistency check between the two derivations.  */
2918 if (result1 != result2)
2923 /* See PropList.txt, UCD.html. */
2925 is_property_other_uppercase (unsigned int ch)
2927 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2930 /* See PropList.txt, UCD.html,
2931 DerivedCoreProperties.txt, UCD.html. */
/* The Lowercase property of CH is evaluated two ways and the two values are
   compared: one includes the Other_Lowercase bit, the other is the
   PROP_LOWERCASE bit of unicode_properties.
   NOTE(review): the first term of the OR, the declarations of
   result1/result2 and the statements after the comparison are not visible
   in this listing.  */
2933 is_property_lowercase (unsigned int ch)
2937 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Value read directly from the DerivedCoreProperties.txt data.  */
2939 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
/* Consistency check between the two derivations.  */
2941 if (result1 != result2)
2946 /* See PropList.txt, UCD.html. */
2948 is_property_other_lowercase (unsigned int ch)
2950 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Titlecase: identical to membership in general category Lt.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool in_category_Lt = is_category_Lt (ch);

  return in_category_Lt;
}
2960 /* See DerivedCoreProperties.txt. */
/* The Cased property of CH is evaluated two ways and the two values are
   compared.
   NOTE(review): the statements after the comparison are not visible in
   this listing -- confirm what happens on a mismatch and which value is
   returned.  */
2962 is_property_cased (unsigned int ch)
/* Derived value: Lowercase, Uppercase, or general category Lt.  */
2964 bool result1 = (is_property_lowercase (ch)
2965 || is_property_uppercase (ch)
2966 || is_category_Lt (ch));
/* Value read directly from the PROP_CASED bit.  */
2967 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
/* Consistency check between the two derivations.  */
2969 if (result1 != result2)
2974 /* See DerivedCoreProperties.txt. */
/* The Case_Ignorable property of CH is evaluated two ways and the two
   values are compared.
   NOTE(review): the statements after the comparison are not visible in
   this listing -- confirm what happens on a mismatch and which value is
   returned.  */
2976 is_property_case_ignorable (unsigned int ch)
/* Derived value: word break property MidLetter or MidNumLet, or general
   category Mn, Me, Cf, Lm or Sk.  */
2978 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
2979 || is_category_Mn (ch)
2980 || is_category_Me (ch)
2981 || is_category_Cf (ch)
2982 || is_category_Lm (ch)
2983 || is_category_Sk (ch));
/* Value read directly from the PROP_CASE_IGNORABLE bit.  */
2984 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
/* Consistency check between the two derivations.  */
2986 if (result1 != result2)
2991 /* See DerivedCoreProperties.txt. */
/* The Changes_When_Lowercased property of CH is evaluated two ways and the
   two values are compared.
   NOTE(review): the statements after the comparison are not visible in
   this listing -- confirm what happens on a mismatch and which value is
   returned.  */
2993 is_property_changes_when_lowercased (unsigned int ch)
/* Value read directly from the PROP_CHANGES_WHEN_LOWERCASED bit.  */
2995 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
/* Derived value: CH has a UnicodeData entry whose lowercase mapping is
   present (not NONE) and differs from CH itself.  */
2996 bool result2 = (unicode_attributes[ch].name != NULL
2997 && unicode_attributes[ch].lower != NONE
2998 && unicode_attributes[ch].lower != ch);
/* Consistency check between the two derivations.  */
3000 if (result1 != result2)
3005 /* See DerivedCoreProperties.txt. */
3007 is_property_changes_when_uppercased (unsigned int ch)
3009 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3012 /* See DerivedCoreProperties.txt. */
3014 is_property_changes_when_titlecased (unsigned int ch)
3016 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3019 /* See DerivedCoreProperties.txt. */
3021 is_property_changes_when_casefolded (unsigned int ch)
3023 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3026 /* See DerivedCoreProperties.txt. */
3028 is_property_changes_when_casemapped (unsigned int ch)
3030 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3033 /* See PropList.txt, UCD.html. */
3035 is_property_soft_dotted (unsigned int ch)
3037 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3040 /* See DerivedCoreProperties.txt, UCD.html. */
3042 is_property_id_start (unsigned int ch)
3044 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3047 /* See PropList.txt, UCD.html. */
3049 is_property_other_id_start (unsigned int ch)
3051 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3054 /* See DerivedCoreProperties.txt, UCD.html. */
3056 is_property_id_continue (unsigned int ch)
3058 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3061 /* See PropList.txt, UCD.html. */
3063 is_property_other_id_continue (unsigned int ch)
3065 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3068 /* See DerivedCoreProperties.txt, UCD.html. */
3070 is_property_xid_start (unsigned int ch)
3072 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3075 /* See DerivedCoreProperties.txt, UCD.html. */
3077 is_property_xid_continue (unsigned int ch)
3079 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3082 /* See PropList.txt, UCD.html. */
3084 is_property_pattern_white_space (unsigned int ch)
3086 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3089 /* See PropList.txt, UCD.html. */
3091 is_property_pattern_syntax (unsigned int ch)
3093 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3096 /* See PropList.txt, UCD.html. */
3098 is_property_join_control (unsigned int ch)
3100 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3103 /* See DerivedCoreProperties.txt, UCD.html. */
3105 is_property_grapheme_base (unsigned int ch)
3107 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3110 /* See DerivedCoreProperties.txt, UCD.html. */
3112 is_property_grapheme_extend (unsigned int ch)
3114 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3117 /* See PropList.txt, UCD.html. */
3119 is_property_other_grapheme_extend (unsigned int ch)
3121 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3124 /* See DerivedCoreProperties.txt, UCD.html. */
3126 is_property_grapheme_link (unsigned int ch)
3128 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3131 /* See PropList.txt, UCD.html. */
3133 is_property_bidi_control (unsigned int ch)
3135 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3138 /* See PropList-3.0.1.txt. */
3140 is_property_bidi_left_to_right (unsigned int ch)
3142 return (get_bidi_category (ch) == UC_BIDI_L);
3145 /* See PropList-3.0.1.txt. */
3147 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3149 return (get_bidi_category (ch) == UC_BIDI_R);
3152 /* See PropList-3.0.1.txt. */
3154 is_property_bidi_arabic_right_to_left (unsigned int ch)
3156 return (get_bidi_category (ch) == UC_BIDI_AL);
3159 /* See PropList-3.0.1.txt. */
3161 is_property_bidi_european_digit (unsigned int ch)
3163 return (get_bidi_category (ch) == UC_BIDI_EN);
3166 /* See PropList-3.0.1.txt. */
3168 is_property_bidi_eur_num_separator (unsigned int ch)
3170 return (get_bidi_category (ch) == UC_BIDI_ES);
3173 /* See PropList-3.0.1.txt. */
3175 is_property_bidi_eur_num_terminator (unsigned int ch)
3177 return (get_bidi_category (ch) == UC_BIDI_ET);
3180 /* See PropList-3.0.1.txt. */
3182 is_property_bidi_arabic_digit (unsigned int ch)
3184 return (get_bidi_category (ch) == UC_BIDI_AN);
3187 /* See PropList-3.0.1.txt. */
3189 is_property_bidi_common_separator (unsigned int ch)
3191 return (get_bidi_category (ch) == UC_BIDI_CS);
3194 /* See PropList-3.0.1.txt. */
3196 is_property_bidi_block_separator (unsigned int ch)
3198 return (get_bidi_category (ch) == UC_BIDI_B);
3201 /* See PropList-3.0.1.txt. */
3203 is_property_bidi_segment_separator (unsigned int ch)
3205 return (get_bidi_category (ch) == UC_BIDI_S);
3208 /* See PropList-3.0.1.txt. */
3210 is_property_bidi_whitespace (unsigned int ch)
3212 return (get_bidi_category (ch) == UC_BIDI_WS);
3215 /* See PropList-3.0.1.txt. */
3217 is_property_bidi_non_spacing_mark (unsigned int ch)
3219 return (get_bidi_category (ch) == UC_BIDI_NSM);
3222 /* See PropList-3.0.1.txt. */
3224 is_property_bidi_boundary_neutral (unsigned int ch)
3226 return (get_bidi_category (ch) == UC_BIDI_BN);
3229 /* See PropList-3.0.1.txt. */
3231 is_property_bidi_pdf (unsigned int ch)
3233 return (get_bidi_category (ch) == UC_BIDI_PDF);
3236 /* See PropList-3.0.1.txt. */
3238 is_property_bidi_embedding_or_override (unsigned int ch)
3240 int category = get_bidi_category (ch);
3241 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3242 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3245 /* See PropList-3.0.1.txt. */
3247 is_property_bidi_other_neutral (unsigned int ch)
3249 return (get_bidi_category (ch) == UC_BIDI_ON);
3252 /* See PropList.txt, UCD.html. */
3254 is_property_hex_digit (unsigned int ch)
3256 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3259 /* See PropList.txt, UCD.html. */
3261 is_property_ascii_hex_digit (unsigned int ch)
3263 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3266 /* See Unicode 3.0 book, section 4.10,
3267 PropList.txt, UCD.html. */
3269 is_property_ideographic (unsigned int ch)
3271 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3274 /* See PropList.txt, UCD.html. */
3276 is_property_unified_ideograph (unsigned int ch)
3278 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3281 /* See PropList.txt, UCD.html. */
3283 is_property_radical (unsigned int ch)
3285 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3288 /* See PropList.txt, UCD.html. */
3290 is_property_ids_binary_operator (unsigned int ch)
3292 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3295 /* See PropList.txt, UCD.html. */
3297 is_property_ids_trinary_operator (unsigned int ch)
3299 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3302 /* See PropList-3.0.1.txt. */
3304 is_property_zero_width (unsigned int ch)
3306 return is_category_Cf (ch)
3307 || (unicode_attributes[ch].name != NULL
3308 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Space: identical to membership in general category Zs.
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  const bool in_category_Zs = is_category_Zs (ch);

  return in_category_Zs;
}
/* Non-break: true for a fixed list of code points.  The original comment
   describes the list as exactly the set of characters having a particular
   line breaking property (presumably GL; the end of that comment is not
   visible in this listing).  See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3342 /* See PropList-3.0.1.txt. */
/* The ISO control property of CH is evaluated two ways and the two values
   are compared: by character name and by general category.
   NOTE(review): the declarations of result1/result2 and the statements
   after the comparison are not visible in this listing.  */
3344 is_property_iso_control (unsigned int ch)
/* By name: CH has a UnicodeData entry named exactly "<control>".  */
3347 (unicode_attributes[ch].name != NULL
3348 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
/* By category: CH is in general category Cc.  */
3350 is_category_Cc (ch);
/* Consistency check between the two derivations.  */
3352 if (result1 != result2)
3357 /* See PropList-3.0.1.txt. */
/* Format control: CH must be in category Cf, have bidi category
   UC_BIDI_BN, and not be a Join_Control character.
   NOTE(review): the expression continues on a line that is not visible in
   this listing -- there may be a further condition.  */
3359 is_property_format_control (unsigned int ch)
3361 return (is_category_Cf (ch)
3362 && get_bidi_category (ch) == UC_BIDI_BN
3363 && !is_property_join_control (ch)
3367 /* See PropList.txt, UCD.html. */
3369 is_property_dash (unsigned int ch)
3371 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3374 /* See PropList.txt, UCD.html. */
3376 is_property_hyphen (unsigned int ch)
3378 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Punctuation: identical to the is_category_P test.
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  const bool in_category_P = is_category_P (ch);

  return in_category_P;
}
/* Line separator: identical to membership in general category Zl.
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  const bool in_category_Zl = is_category_Zl (ch);

  return in_category_Zl;
}
/* Paragraph separator: identical to membership in general category Zp.
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  const bool in_category_Zp = is_category_Zp (ch);

  return in_category_Zp;
}
3402 /* See PropList.txt, UCD.html. */
3404 is_property_quotation_mark (unsigned int ch)
3406 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3409 /* See PropList.txt, UCD.html. */
3411 is_property_sentence_terminal (unsigned int ch)
3413 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3416 /* See PropList.txt, UCD.html. */
3418 is_property_terminal_punctuation (unsigned int ch)
3420 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Currency symbol: identical to membership in general category Sc.
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  const bool in_category_Sc = is_category_Sc (ch);

  return in_category_Sc;
}
3430 /* See Unicode 3.0 book, section 4.9,
3431 PropList.txt, UCD.html,
3432 DerivedCoreProperties.txt, UCD.html. */
/* The Math property of CH is evaluated two ways and the two values are
   compared: one includes the Other_Math bit, the other is the PROP_MATH bit
   of unicode_properties.
   NOTE(review): the first term of the OR, the declarations of
   result1/result2 and the statements after the comparison are not visible
   in this listing.  */
3434 is_property_math (unsigned int ch)
3438 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
/* Value read directly from the DerivedCoreProperties.txt data.  */
3440 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
/* Consistency check between the two derivations.  */
3442 if (result1 != result2)
3447 /* See PropList.txt, UCD.html. */
3449 is_property_other_math (unsigned int ch)
3451 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3454 /* See PropList-3.0.1.txt. */
3456 is_property_paired_punctuation (unsigned int ch)
3458 return unicode_pairedpunctuation[ch];
3461 /* See PropList-3.0.1.txt. */
3463 is_property_left_of_pair (unsigned int ch)
3465 return unicode_leftofpair[ch];
3468 /* See PropList-3.0.1.txt. */
3470 is_property_combining (unsigned int ch)
3472 return (unicode_attributes[ch].name != NULL
3473 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3474 || is_category_Mc (ch)
3475 || is_category_Me (ch)
3476 || is_category_Mn (ch)));
/* This definition is compiled out by the "#if 0" below.  */
3479 #if 0 /* same as is_property_bidi_non_spacing_mark */
3480 /* See PropList-3.0.1.txt. */
/* Non-spacing: CH has a UnicodeData entry and bidi category UC_BIDI_NSM.  */
3482 is_property_non_spacing (unsigned int ch)
3484 return (unicode_attributes[ch].name != NULL
3485 && get_bidi_category (ch) == UC_BIDI_NSM);
3489 /* See PropList-3.0.1.txt. */
3491 is_property_composite (unsigned int ch)
3493 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3494 logical in some sense. */
3495 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3497 if (unicode_attributes[ch].name != NULL
3498 && unicode_attributes[ch].decomposition != NULL)
3500 /* Test whether the decomposition contains more than one character,
3501 and the first is not a space. */
3502 const char *decomp = unicode_attributes[ch].decomposition;
3503 if (decomp[0] == '<')
3505 decomp = strchr (decomp, '>') + 1;
3506 if (decomp[0] == ' ')
3509 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Decimal digit: identical to membership in general category Nd.
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  const bool in_category_Nd = is_category_Nd (ch);

  return in_category_Nd;
}
3521 /* See PropList-3.0.1.txt. */
3523 is_property_numeric (unsigned int ch)
3525 return ((get_numeric_value (ch)).denominator > 0)
3526 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3527 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3530 /* See PropList.txt, UCD.html. */
3532 is_property_diacritic (unsigned int ch)
3534 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3537 /* See PropList.txt, UCD.html. */
3539 is_property_extender (unsigned int ch)
3541 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3544 /* See PropList-3.0.1.txt. */
/* Ignorable control: CH is in category Cc with bidi category UC_BIDI_BN,
   or is in category Cf.
   NOTE(review): the return expression has no terminating semicolon in this
   listing, so it continues on a line that is not visible -- there may be a
   further condition.  */
3546 is_property_ignorable_control (unsigned int ch)
3548 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3549 || is_category_Cf (ch))
3553 /* ------------------------------------------------------------------------- */
3555 /* Output all properties. */
/* VERSION is passed to output_predicate for every property.
   Each PROPERTY(P) line expands into three calls that use the predicate
   is_property_P:
     - debug_output_predicate writing "unictype/pr_P.txt",
     - output_predicate_test writing "../tests/unictype/test-pr_P.c" with the
       expression "uc_is_property_P (c)",
     - output_predicate writing "unictype/pr_P.h" with the table name
       "u_property_P".
   NOTE(review): some PROPERTY lines (and the matching #undef, if any) are
   not visible in this listing; the list below is therefore incomplete.  */
3557 output_properties (const char *version)
3559 #define PROPERTY(P) \
3560 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3561 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3562 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3563 PROPERTY(white_space)
3564 PROPERTY(alphabetic)
3565 PROPERTY(other_alphabetic)
3566 PROPERTY(not_a_character)
3567 PROPERTY(default_ignorable_code_point)
3568 PROPERTY(other_default_ignorable_code_point)
3569 PROPERTY(deprecated)
3570 PROPERTY(logical_order_exception)
3571 PROPERTY(variation_selector)
3572 PROPERTY(private_use)
3573 PROPERTY(unassigned_code_value)
3575 PROPERTY(other_uppercase)
3577 PROPERTY(other_lowercase)
3580 PROPERTY(case_ignorable)
3581 PROPERTY(changes_when_lowercased)
3582 PROPERTY(changes_when_uppercased)
3583 PROPERTY(changes_when_titlecased)
3584 PROPERTY(changes_when_casefolded)
3585 PROPERTY(changes_when_casemapped)
3586 PROPERTY(soft_dotted)
3588 PROPERTY(other_id_start)
3589 PROPERTY(id_continue)
3590 PROPERTY(other_id_continue)
3592 PROPERTY(xid_continue)
3593 PROPERTY(pattern_white_space)
3594 PROPERTY(pattern_syntax)
3595 PROPERTY(join_control)
3596 PROPERTY(grapheme_base)
3597 PROPERTY(grapheme_extend)
3598 PROPERTY(other_grapheme_extend)
3599 PROPERTY(grapheme_link)
3600 PROPERTY(bidi_control)
3601 PROPERTY(bidi_left_to_right)
3602 PROPERTY(bidi_hebrew_right_to_left)
3603 PROPERTY(bidi_arabic_right_to_left)
3604 PROPERTY(bidi_european_digit)
3605 PROPERTY(bidi_eur_num_separator)
3606 PROPERTY(bidi_eur_num_terminator)
3607 PROPERTY(bidi_arabic_digit)
3608 PROPERTY(bidi_common_separator)
3609 PROPERTY(bidi_block_separator)
3610 PROPERTY(bidi_segment_separator)
3611 PROPERTY(bidi_whitespace)
3612 PROPERTY(bidi_non_spacing_mark)
3613 PROPERTY(bidi_boundary_neutral)
3615 PROPERTY(bidi_embedding_or_override)
3616 PROPERTY(bidi_other_neutral)
3618 PROPERTY(ascii_hex_digit)
3619 PROPERTY(ideographic)
3620 PROPERTY(unified_ideograph)
3622 PROPERTY(ids_binary_operator)
3623 PROPERTY(ids_trinary_operator)
3624 PROPERTY(zero_width)
3627 PROPERTY(iso_control)
3628 PROPERTY(format_control)
3631 PROPERTY(punctuation)
3632 PROPERTY(line_separator)
3633 PROPERTY(paragraph_separator)
3634 PROPERTY(quotation_mark)
3635 PROPERTY(sentence_terminal)
3636 PROPERTY(terminal_punctuation)
3637 PROPERTY(currency_symbol)
3639 PROPERTY(other_math)
3640 PROPERTY(paired_punctuation)
3641 PROPERTY(left_of_pair)
3644 PROPERTY(decimal_digit)
3648 PROPERTY(ignorable_control)
3652 /* ========================================================================= */
/* Script names; fill_scripts appends a strdup'ed copy of each new name.  */
3656 static const char *scripts[256];
/* Number of entries of scripts[] in use.  */
3657 static unsigned int numscripts;
/* Script index of every code point (0 .. 0x10FFFF).  fill_scripts
   initializes each entry to (uint8_t)~(uint8_t)0, which the other
   functions treat as "no script assigned".  */
3659 static uint8_t unicode_scripts[0x110000];
/* Reads SCRIPTS_FILENAME and fills scripts[], numscripts and
   unicode_scripts[].  Each data line is either "XXXX..YYYY ; Name" or
   "XXXX ; Name"; lines that are empty or start with '#' are tested for
   separately.
   NOTE(review): braces, error exits and several short statements of this
   function are not visible in this listing.  */
3662 fill_scripts (const char *scripts_filename)
3667 stream = fopen (scripts_filename, "r");
3670 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Mark every code point as having no script.  */
3676 for (i = 0; i < 0x110000; i++)
3677 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3682 unsigned int i1, i2;
3683 char padding[200+1];
3684 char scriptname[200+1];
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes the whitespace that follows.  */
3687 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3690 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  */
3693 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3695 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3697 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the scripts seen so far, newest first.  */
3707 for (script = numscripts - 1; script >= 0; script--)
3708 if (strcmp (scripts[script], scriptname) == 0)
/* Record a name at the next free index.
   NOTE(review): the condition guarding this and the increment of
   numscripts are not visible here.  */
3712 scripts[numscripts] = strdup (scriptname);
3713 script = numscripts;
/* Capacity check against the size of scripts[].  */
3715 if (numscripts == 256)
3719 for (i = i1; i <= i2; i++)
/* Report a code point that already has a script; the entry is then
   assigned SCRIPT.  */
3721 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3722 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3723 unicode_scripts[i] = script;
/* A read error and a failing fclose are reported the same way.  */
3727 if (ferror (stream) || fclose (stream))
3729 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3734 /* Construction of sparse 3-level tables. */
/* Parameters for the table code: the table type is named script_table, its
   elements are uint8_t, and (uint8_t)~(uint8_t)0 is the default element.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the include that consumes these macros is not visible in
   this listing.  */
3735 #define TABLE script_table
3736 #define ELEMENT uint8_t
3737 #define DEFAULT (uint8_t)~(uint8_t)0
3738 #define xmalloc malloc
3739 #define xrealloc realloc
/* Writes "unictype/scripts.h".  The file contains, in this order: a header
   comment naming VERSION, one script_<name>_intervals array per script, the
   scripts[] array of uc_script_t entries, five script_header_N macros, and
   the 3-level lookup table u_script.
   NOTE(review): braces, error exits and several short statements of this
   function are not visible in this listing.  */
3743 output_scripts (const char *version)
3745 const char *filename = "unictype/scripts.h";
3747 unsigned int ch, s, i;
3748 struct script_table t;
3749 unsigned int level1_offset, level2_offset, level3_offset;
3753 const char *lowercase_name;
3756 scriptinfo_t scriptinfo[256];
3758 stream = fopen (filename, "w");
3761 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3765 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3766 fprintf (stream, "/* Unicode scripts. */\n");
3767 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
/* For each script, work on a strdup'ed copy of its name and test each
   character for 'A'..'Z'; the copy becomes scriptinfo[s].lowercase_name.
   NOTE(review): the statement applied to uppercase letters is not visible
   here.  */
3770 for (s = 0; s < numscripts; s++)
3772 char *lcp = strdup (scripts[s]);
3775 for (cp = lcp; *cp != '\0'; cp++)
3776 if (*cp >= 'A' && *cp <= 'Z')
3779 scriptinfo[s].lowercase_name = lcp;
/* Emit one interval array per script by scanning the whole code space for
   runs of code points whose script is S.  */
3782 for (s = 0; s < numscripts; s++)
3784 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3785 scriptinfo[s].lowercase_name);
3786 fprintf (stream, "{\n");
3788 for (ch = 0; ch < 0x110000; ch++)
3789 if (unicode_scripts[ch] == s)
/* Extend the run while the next code point has the same script.  */
3795 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3800 fprintf (stream, ",\n");
/* A run is written either as one entry or as a pair of entries.
   NOTE(review): the condition choosing between the two is not visible.  */
3802 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3804 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3808 fprintf (stream, "\n");
3809 fprintf (stream, "};\n");
/* Emit the scripts[] array: interval count, interval array and name of each
   script.  */
3812 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3813 fprintf (stream, "{\n");
3814 for (s = 0; s < numscripts; s++)
3816 fprintf (stream, " {\n");
3817 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3818 scriptinfo[s].lowercase_name);
3819 fprintf (stream, " script_%s_intervals,\n",
3820 scriptinfo[s].lowercase_name);
3821 fprintf (stream, " \"%s\"\n", scripts[s]);
3822 fprintf (stream, " }");
3823 if (s+1 < numscripts)
3824 fprintf (stream, ",");
3825 fprintf (stream, "\n");
3827 fprintf (stream, "};\n");
/* Build the 3-level table from every code point that has a script.  */
3831 script_table_init (&t);
3833 for (ch = 0; ch < 0x110000; ch++)
3835 unsigned int s = unicode_scripts[ch];
3836 if (s != (uint8_t)~(uint8_t)0)
3837 script_table_add (&t, ch, s);
3840 script_table_finalize (&t);
3842 /* Offsets in t.result, in memory of this process. */
/* t.result starts with five uint32_t header words, followed by the level 1,
   level 2 and level 3 data, as the three sums below spell out.  */
3844 5 * sizeof (uint32_t);
3846 5 * sizeof (uint32_t)
3847 + t.level1_size * sizeof (uint32_t);
3849 5 * sizeof (uint32_t)
3850 + t.level1_size * sizeof (uint32_t)
3851 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become script_header_0 .. script_header_4.  */
3853 for (i = 0; i < 5; i++)
3854 fprintf (stream, "#define script_header_%d %d\n", i,
3855 ((uint32_t *) t.result)[i]);
3856 fprintf (stream, "static const\n");
3857 fprintf (stream, "struct\n");
3858 fprintf (stream, " {\n");
3859 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3860 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3861 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3862 fprintf (stream, " }\n");
3863 fprintf (stream, "u_script =\n");
3864 fprintf (stream, "{\n");
/* Level 1, eight values per output line.  Each value is printed either as
   -1 or as an index relative to the start of level 2.
   NOTE(review): the test selecting -1 is not visible here.  */
3865 fprintf (stream, " {");
3866 if (t.level1_size > 8)
3867 fprintf (stream, "\n ");
3868 for (i = 0; i < t.level1_size; i++)
3871 if (i > 0 && (i % 8) == 0)
3872 fprintf (stream, "\n ");
3873 offset = ((uint32_t *) (t.result + level1_offset))[i];
3875 fprintf (stream, " %5d", -1);
3877 fprintf (stream, " %5zu",
3878 (offset - level2_offset) / sizeof (uint32_t));
3879 if (i+1 < t.level1_size)
3880 fprintf (stream, ",");
3882 if (t.level1_size > 8)
3883 fprintf (stream, "\n ");
3884 fprintf (stream, " },\n");
/* Level 2, same layout; indices are relative to the start of level 3.  */
3885 fprintf (stream, " {");
3886 if (t.level2_size << t.q > 8)
3887 fprintf (stream, "\n ");
3888 for (i = 0; i < t.level2_size << t.q; i++)
3891 if (i > 0 && (i % 8) == 0)
3892 fprintf (stream, "\n ");
3893 offset = ((uint32_t *) (t.result + level2_offset))[i];
3895 fprintf (stream, " %5d", -1);
3897 fprintf (stream, " %5zu",
3898 (offset - level3_offset) / sizeof (uint8_t));
3899 if (i+1 < t.level2_size << t.q)
3900 fprintf (stream, ",");
3902 if (t.level2_size << t.q > 8)
3903 fprintf (stream, "\n ");
3904 fprintf (stream, " },\n");
/* Level 3: the uint8_t elements themselves.  */
3905 fprintf (stream, " {");
3906 if (t.level3_size << t.p > 8)
3907 fprintf (stream, "\n ");
3908 for (i = 0; i < t.level3_size << t.p; i++)
3910 if (i > 0 && (i % 8) == 0)
3911 fprintf (stream, "\n ");
3912 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3913 if (i+1 < t.level3_size << t.p)
3914 fprintf (stream, ",");
3916 if (t.level3_size << t.p > 8)
3917 fprintf (stream, "\n ");
3918 fprintf (stream, " }\n");
3919 fprintf (stream, "};\n");
/* A write error and a failing fclose are reported the same way.  */
3921 if (ferror (stream) || fclose (stream))
3923 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes "unictype/scripts_byname.gperf": a header comment naming VERSION,
   the struct named_script declaration, a fixed set of gperf "%" directives,
   the "%%" separator, and then one "name, index" line per entry of
   scripts[].
   NOTE(review): braces, error exits and the remaining arguments of the
   truncated fprintf are not visible in this listing.  */
3929 output_scripts_byname (const char *version)
3931 const char *filename = "unictype/scripts_byname.gperf";
3935 stream = fopen (filename, "w");
3938 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3942 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3943 fprintf (stream, "/* Unicode scripts. */\n");
3944 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
3946 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
/* In the format strings below, "%%" prints a single '%'.  */
3947 fprintf (stream, "%%struct-type\n");
3948 fprintf (stream, "%%language=ANSI-C\n");
3949 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3950 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3951 fprintf (stream, "%%readonly-tables\n");
3952 fprintf (stream, "%%global-table\n");
3953 fprintf (stream, "%%define word-array-name script_names\n");
3954 fprintf (stream, "%%pic\n");
3955 fprintf (stream, "%%define string-pool-name script_stringpool\n");
3956 fprintf (stream, "%%%%\n");
/* One keyword line per script: its name and its index in scripts[].  */
3957 for (s = 0; s < numscripts; s++)
3958 fprintf (stream, "%s, %u\n", scripts[s], s);
/* A write error and a failing fclose are reported the same way.  */
3960 if (ferror (stream) || fclose (stream))
3962 fprintf (stderr, "error writing to '%s'\n", filename);
3967 /* ========================================================================= */
/* One block: the code point range start .. end and the block name.
   NOTE(review): the line that names this typedef is not visible in this
   listing; the array below uses the name block_t.  */
3971 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* The blocks read by fill_blocks, which requires each entry to start after
   the end of the previous one.  */
3973 static block_t blocks[256];
/* Number of entries of blocks[] in use.  */
3974 static unsigned int numblocks;
/* Read the file BLOCKS_FILENAME and store one entry per parsed line into
   blocks[], using numblocks as the insertion index.  Each data line has the
   form "START..END; NAME" with hexadecimal START and END.
   NOTE(review): the return type, the loop header, the braces and several
   short statements of this function are not part of this listing; the
   comments below describe only the visible lines.  */
3977 fill_blocks (const char *blocks_filename)
3981 stream = fopen (blocks_filename, "r");
3984 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3991 unsigned int i1, i2;
3992 char padding[200+1];
3993 char blockname[200+1];
/* Read one line of at most 200 characters; the trailing "\n" in the format
   also consumes the whitespace that follows the line.  */
3995 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Test for empty lines and '#' comment lines.  */
3998 if (buf[0] == '\0' || buf[0] == '#')
/* All four conversions must succeed: start, end, the " ;" padding and the
   block name (everything up to a carriage return).  */
4001 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
4003 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
4006 blocks[numblocks].start = i1;
4007 blocks[numblocks].end = i2;
/* The name is copied to the heap because blockname is a local buffer.  */
4008 blocks[numblocks].name = strdup (blockname);
4009 /* It must be sorted. */
4010 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
/* Test against the capacity of blocks[].  */
4013 if (numblocks == 256)
4017 if (ferror (stream) || fclose (stream))
4019 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
4024 /* Return the smallest block index among the blocks for characters >= ch. */
/* Searches blocks[0..numblocks) by comparing blocks[mid].end with CH.  The
   search relies on blocks[] being in ascending order, which fill_blocks
   checks while reading.
   NOTE(review): the return type, the loop header, the updates of lo and hi
   and the return statement are not part of this listing.  */
4026 block_first_index (unsigned int ch)
4028 /* Binary search. */
4029 unsigned int lo = 0;
4030 unsigned int hi = numblocks;
4032 All blocks[i], i < lo, have blocks[i].end < ch,
4033 all blocks[i], i >= hi, have blocks[i].end >= ch. */
4036 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4037 if (blocks[mid].end < ch)
4045 /* Return the largest block index among the blocks for characters <= ch,
/* Searches blocks[0..numblocks) by comparing blocks[mid].start with CH.
   The search relies on blocks[] being in ascending order, which fill_blocks
   checks while reading.
   NOTE(review): the return type, the loop header, the updates of lo and hi
   and the return statement are not part of this listing.  */
4048 block_last_index (unsigned int ch)
4050 /* Binary search. */
4051 unsigned int lo = 0;
4052 unsigned int hi = numblocks;
4054 All blocks[i], i < lo, have blocks[i].start <= ch,
4055 all blocks[i], i >= hi, have blocks[i].start > ch. */
4058 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
4059 if (blocks[mid].start <= ch)
/* Write "unictype/blocks.h".  The file contains:
     - the array 'blocks' with one { start, end, name } entry per element of
       blocks[],
     - the array 'blocks_level1' with a pair of block indices (as returned
       by block_first_index and block_last_index) for every group of
       1 << shift code points below 'threshold',
     - the macros blocks_upper_first_index and blocks_upper_last_index,
       computed for the code points threshold and 0x10FFFF.
   NOTE(review): the return type, the local declarations and the braces of
   this function are not part of this listing.  */
4068 output_blocks (const char *version)
4070 const char *filename = "unictype/blocks.h";
4071 const unsigned int shift = 8; /* bits to shift away for array access */
4072 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
4077 stream = fopen (filename, "w");
4080 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4084 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4085 fprintf (stream, "/* Unicode blocks. */\n");
4086 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4089 fprintf (stream, "static const uc_block_t blocks[] =\n");
4090 fprintf (stream, "{\n");
4091 for (i = 0; i < numblocks; i++)
4093 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
4094 blocks[i].end, blocks[i].name);
/* No comma after the last initializer.  */
4095 if (i+1 < numblocks)
4096 fprintf (stream, ",");
4097 fprintf (stream, "\n");
4099 fprintf (stream, "};\n");
4100 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
4101 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
/* Two uint8_t values per group; this fits because fill_blocks tests
   numblocks against 256.  */
4102 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
4103 threshold >> shift);
4104 fprintf (stream, "{\n");
4105 for (i1 = 0; i1 < (threshold >> shift); i1++)
/* Index pair for the first and the last code point of group i1.  */
4107 unsigned int first_index = block_first_index (i1 << shift);
4108 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
4109 fprintf (stream, " %3d, %3d", first_index, last_index);
4110 if (i1+1 < (threshold >> shift))
4111 fprintf (stream, ",");
4112 fprintf (stream, "\n");
4114 fprintf (stream, "};\n");
4115 fprintf (stream, "#define blocks_upper_first_index %d\n",
4116 block_first_index (threshold));
4117 fprintf (stream, "#define blocks_upper_last_index %d\n",
4118 block_last_index (0x10FFFF));
4120 if (ferror (stream) || fclose (stream))
4122 fprintf (stderr, "error writing to '%s'\n", filename);
4127 /* ========================================================================= */
4129 /* C and Java syntax. */
/* Identifier syntax category of a character.  The four values are 0..3, so
   each one fits into the 2 bits per entry used by the packed level3 table
   that output_ident_category emits.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters of C source text:
   space, horizontal tab, new-line (LF or CR), vertical tab or form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4150 /* ISO C 99 section 6.4.2.1 and appendix D. */
/* Classify CH for use in C identifiers.  ASCII digits yield
   UC_IDENTIFIER_VALID; ASCII letters and '_' yield UC_IDENTIFIER_START.
   The long list of code point ranges that follows is grouped by script;
   the final two return statements yield UC_IDENTIFIER_START and
   UC_IDENTIFIER_INVALID.
   NOTE(review): the return type, the braces, the line opening the big
   condition and every test of a single code point are not part of this
   listing, so the ranges below are not the complete table.  */
4152 c_ident_category (unsigned int ch)
4154 /* Section 6.4.2.1. */
4155 if (ch >= '0' && ch <= '9')
4156 return UC_IDENTIFIER_VALID;
4157 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4158 return UC_IDENTIFIER_START;
/* Latin */
4164 || (ch >= 0x00C0 && ch <= 0x00D6)
4165 || (ch >= 0x00D8 && ch <= 0x00F6)
4166 || (ch >= 0x00F8 && ch <= 0x01F5)
4167 || (ch >= 0x01FA && ch <= 0x0217)
4168 || (ch >= 0x0250 && ch <= 0x02A8)
4169 || (ch >= 0x1E00 && ch <= 0x1E9B)
4170 || (ch >= 0x1EA0 && ch <= 0x1EF9)
/* Greek */
4174 || (ch >= 0x0388 && ch <= 0x038A)
4176 || (ch >= 0x038E && ch <= 0x03A1)
4177 || (ch >= 0x03A3 && ch <= 0x03CE)
4178 || (ch >= 0x03D0 && ch <= 0x03D6)
4183 || (ch >= 0x03E2 && ch <= 0x03F3)
4184 || (ch >= 0x1F00 && ch <= 0x1F15)
4185 || (ch >= 0x1F18 && ch <= 0x1F1D)
4186 || (ch >= 0x1F20 && ch <= 0x1F45)
4187 || (ch >= 0x1F48 && ch <= 0x1F4D)
4188 || (ch >= 0x1F50 && ch <= 0x1F57)
4192 || (ch >= 0x1F5F && ch <= 0x1F7D)
4193 || (ch >= 0x1F80 && ch <= 0x1FB4)
4194 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4195 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4196 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4197 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4198 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4199 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4200 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4201 || (ch >= 0x1FF6 && ch <= 0x1FFC)
/* Cyrillic */
4203 || (ch >= 0x0401 && ch <= 0x040C)
4204 || (ch >= 0x040E && ch <= 0x044F)
4205 || (ch >= 0x0451 && ch <= 0x045C)
4206 || (ch >= 0x045E && ch <= 0x0481)
4207 || (ch >= 0x0490 && ch <= 0x04C4)
4208 || (ch >= 0x04C7 && ch <= 0x04C8)
4209 || (ch >= 0x04CB && ch <= 0x04CC)
4210 || (ch >= 0x04D0 && ch <= 0x04EB)
4211 || (ch >= 0x04EE && ch <= 0x04F5)
4212 || (ch >= 0x04F8 && ch <= 0x04F9)
/* Armenian */
4214 || (ch >= 0x0531 && ch <= 0x0556)
4215 || (ch >= 0x0561 && ch <= 0x0587)
/* Hebrew */
4217 || (ch >= 0x05B0 && ch <= 0x05B9)
4218 || (ch >= 0x05BB && ch <= 0x05BD)
4220 || (ch >= 0x05C1 && ch <= 0x05C2)
4221 || (ch >= 0x05D0 && ch <= 0x05EA)
4222 || (ch >= 0x05F0 && ch <= 0x05F2)
/* Arabic */
4224 || (ch >= 0x0621 && ch <= 0x063A)
4225 || (ch >= 0x0640 && ch <= 0x0652)
4226 || (ch >= 0x0670 && ch <= 0x06B7)
4227 || (ch >= 0x06BA && ch <= 0x06BE)
4228 || (ch >= 0x06C0 && ch <= 0x06CE)
4229 || (ch >= 0x06D0 && ch <= 0x06DC)
4230 || (ch >= 0x06E5 && ch <= 0x06E8)
4231 || (ch >= 0x06EA && ch <= 0x06ED)
/* Devanagari */
4233 || (ch >= 0x0901 && ch <= 0x0903)
4234 || (ch >= 0x0905 && ch <= 0x0939)
4235 || (ch >= 0x093E && ch <= 0x094D)
4236 || (ch >= 0x0950 && ch <= 0x0952)
4237 || (ch >= 0x0958 && ch <= 0x0963)
/* Bengali */
4239 || (ch >= 0x0981 && ch <= 0x0983)
4240 || (ch >= 0x0985 && ch <= 0x098C)
4241 || (ch >= 0x098F && ch <= 0x0990)
4242 || (ch >= 0x0993 && ch <= 0x09A8)
4243 || (ch >= 0x09AA && ch <= 0x09B0)
4245 || (ch >= 0x09B6 && ch <= 0x09B9)
4246 || (ch >= 0x09BE && ch <= 0x09C4)
4247 || (ch >= 0x09C7 && ch <= 0x09C8)
4248 || (ch >= 0x09CB && ch <= 0x09CD)
4249 || (ch >= 0x09DC && ch <= 0x09DD)
4250 || (ch >= 0x09DF && ch <= 0x09E3)
4251 || (ch >= 0x09F0 && ch <= 0x09F1)
/* Gurmukhi */
4254 || (ch >= 0x0A05 && ch <= 0x0A0A)
4255 || (ch >= 0x0A0F && ch <= 0x0A10)
4256 || (ch >= 0x0A13 && ch <= 0x0A28)
4257 || (ch >= 0x0A2A && ch <= 0x0A30)
4258 || (ch >= 0x0A32 && ch <= 0x0A33)
4259 || (ch >= 0x0A35 && ch <= 0x0A36)
4260 || (ch >= 0x0A38 && ch <= 0x0A39)
4261 || (ch >= 0x0A3E && ch <= 0x0A42)
4262 || (ch >= 0x0A47 && ch <= 0x0A48)
4263 || (ch >= 0x0A4B && ch <= 0x0A4D)
4264 || (ch >= 0x0A59 && ch <= 0x0A5C)
/* Gujarati */
4268 || (ch >= 0x0A81 && ch <= 0x0A83)
4269 || (ch >= 0x0A85 && ch <= 0x0A8B)
4271 || (ch >= 0x0A8F && ch <= 0x0A91)
4272 || (ch >= 0x0A93 && ch <= 0x0AA8)
4273 || (ch >= 0x0AAA && ch <= 0x0AB0)
4274 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4275 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4276 || (ch >= 0x0ABD && ch <= 0x0AC5)
4277 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4278 || (ch >= 0x0ACB && ch <= 0x0ACD)
/* Oriya */
4282 || (ch >= 0x0B01 && ch <= 0x0B03)
4283 || (ch >= 0x0B05 && ch <= 0x0B0C)
4284 || (ch >= 0x0B0F && ch <= 0x0B10)
4285 || (ch >= 0x0B13 && ch <= 0x0B28)
4286 || (ch >= 0x0B2A && ch <= 0x0B30)
4287 || (ch >= 0x0B32 && ch <= 0x0B33)
4288 || (ch >= 0x0B36 && ch <= 0x0B39)
4289 || (ch >= 0x0B3E && ch <= 0x0B43)
4290 || (ch >= 0x0B47 && ch <= 0x0B48)
4291 || (ch >= 0x0B4B && ch <= 0x0B4D)
4292 || (ch >= 0x0B5C && ch <= 0x0B5D)
4293 || (ch >= 0x0B5F && ch <= 0x0B61)
/* Tamil */
4295 || (ch >= 0x0B82 && ch <= 0x0B83)
4296 || (ch >= 0x0B85 && ch <= 0x0B8A)
4297 || (ch >= 0x0B8E && ch <= 0x0B90)
4298 || (ch >= 0x0B92 && ch <= 0x0B95)
4299 || (ch >= 0x0B99 && ch <= 0x0B9A)
4301 || (ch >= 0x0B9E && ch <= 0x0B9F)
4302 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4303 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4304 || (ch >= 0x0BAE && ch <= 0x0BB5)
4305 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4306 || (ch >= 0x0BBE && ch <= 0x0BC2)
4307 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4308 || (ch >= 0x0BCA && ch <= 0x0BCD)
/* Telugu */
4310 || (ch >= 0x0C01 && ch <= 0x0C03)
4311 || (ch >= 0x0C05 && ch <= 0x0C0C)
4312 || (ch >= 0x0C0E && ch <= 0x0C10)
4313 || (ch >= 0x0C12 && ch <= 0x0C28)
4314 || (ch >= 0x0C2A && ch <= 0x0C33)
4315 || (ch >= 0x0C35 && ch <= 0x0C39)
4316 || (ch >= 0x0C3E && ch <= 0x0C44)
4317 || (ch >= 0x0C46 && ch <= 0x0C48)
4318 || (ch >= 0x0C4A && ch <= 0x0C4D)
4319 || (ch >= 0x0C60 && ch <= 0x0C61)
/* Kannada */
4321 || (ch >= 0x0C82 && ch <= 0x0C83)
4322 || (ch >= 0x0C85 && ch <= 0x0C8C)
4323 || (ch >= 0x0C8E && ch <= 0x0C90)
4324 || (ch >= 0x0C92 && ch <= 0x0CA8)
4325 || (ch >= 0x0CAA && ch <= 0x0CB3)
4326 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4327 || (ch >= 0x0CBE && ch <= 0x0CC4)
4328 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4329 || (ch >= 0x0CCA && ch <= 0x0CCD)
4331 || (ch >= 0x0CE0 && ch <= 0x0CE1)
/* Malayalam */
4333 || (ch >= 0x0D02 && ch <= 0x0D03)
4334 || (ch >= 0x0D05 && ch <= 0x0D0C)
4335 || (ch >= 0x0D0E && ch <= 0x0D10)
4336 || (ch >= 0x0D12 && ch <= 0x0D28)
4337 || (ch >= 0x0D2A && ch <= 0x0D39)
4338 || (ch >= 0x0D3E && ch <= 0x0D43)
4339 || (ch >= 0x0D46 && ch <= 0x0D48)
4340 || (ch >= 0x0D4A && ch <= 0x0D4D)
4341 || (ch >= 0x0D60 && ch <= 0x0D61)
/* Thai */
4343 || (ch >= 0x0E01 && ch <= 0x0E3A)
4344 || (ch >= 0x0E40 && ch <= 0x0E5B)
/* Lao */
4346 || (ch >= 0x0E81 && ch <= 0x0E82)
4348 || (ch >= 0x0E87 && ch <= 0x0E88)
4351 || (ch >= 0x0E94 && ch <= 0x0E97)
4352 || (ch >= 0x0E99 && ch <= 0x0E9F)
4353 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4356 || (ch >= 0x0EAA && ch <= 0x0EAB)
4357 || (ch >= 0x0EAD && ch <= 0x0EAE)
4358 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4359 || (ch >= 0x0EBB && ch <= 0x0EBD)
4360 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4362 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4363 || (ch >= 0x0EDC && ch <= 0x0EDD)
/* Tibetan */
4366 || (ch >= 0x0F18 && ch <= 0x0F19)
4370 || (ch >= 0x0F3E && ch <= 0x0F47)
4371 || (ch >= 0x0F49 && ch <= 0x0F69)
4372 || (ch >= 0x0F71 && ch <= 0x0F84)
4373 || (ch >= 0x0F86 && ch <= 0x0F8B)
4374 || (ch >= 0x0F90 && ch <= 0x0F95)
4376 || (ch >= 0x0F99 && ch <= 0x0FAD)
4377 || (ch >= 0x0FB1 && ch <= 0x0FB7)
/* Georgian */
4380 || (ch >= 0x10A0 && ch <= 0x10C5)
4381 || (ch >= 0x10D0 && ch <= 0x10F6)
/* Hiragana */
4383 || (ch >= 0x3041 && ch <= 0x3093)
4384 || (ch >= 0x309B && ch <= 0x309C)
/* Katakana */
4386 || (ch >= 0x30A1 && ch <= 0x30F6)
4387 || (ch >= 0x30FB && ch <= 0x30FC)
/* Bopomofo */
4389 || (ch >= 0x3105 && ch <= 0x312C)
4390 /* CJK Unified Ideographs */
4391 || (ch >= 0x4E00 && ch <= 0x9FA5)
/* Hangul */
4393 || (ch >= 0xAC00 && ch <= 0xD7A3)
/* Digits */
4395 || (ch >= 0x0660 && ch <= 0x0669)
4396 || (ch >= 0x06F0 && ch <= 0x06F9)
4397 || (ch >= 0x0966 && ch <= 0x096F)
4398 || (ch >= 0x09E6 && ch <= 0x09EF)
4399 || (ch >= 0x0A66 && ch <= 0x0A6F)
4400 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4401 || (ch >= 0x0B66 && ch <= 0x0B6F)
4402 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4403 || (ch >= 0x0C66 && ch <= 0x0C6F)
4404 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4405 || (ch >= 0x0D66 && ch <= 0x0D6F)
4406 || (ch >= 0x0E50 && ch <= 0x0E59)
4407 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4408 || (ch >= 0x0F20 && ch <= 0x0F33)
4409 /* Special characters */
4412 || (ch >= 0x02B0 && ch <= 0x02B8)
4414 || (ch >= 0x02BD && ch <= 0x02C1)
4415 || (ch >= 0x02D0 && ch <= 0x02D1)
4416 || (ch >= 0x02E0 && ch <= 0x02E4)
4422 || (ch >= 0x203F && ch <= 0x2040)
4425 || (ch >= 0x210A && ch <= 0x2113)
4427 || (ch >= 0x2118 && ch <= 0x211D)
4431 || (ch >= 0x212A && ch <= 0x2131)
4432 || (ch >= 0x2133 && ch <= 0x2138)
4433 || (ch >= 0x2160 && ch <= 0x2182)
4434 || (ch >= 0x3005 && ch <= 0x3007)
4435 || (ch >= 0x3021 && ch <= 0x3029)
4437 return UC_IDENTIFIER_START;
4438 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is one of the white-space characters of Java source
   text: space, horizontal tab, form feed, LF or CR.  Unlike
   is_c_whitespace, the vertical tab is not included.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4450 /* The Java Language Specification, 3rd edition, §3.8.
4451 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4452 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
/* Classify CH for use in Java identifiers.  The tests are tried in order:
   letters, letter numbers, currency symbols and connector punctuation give
   UC_IDENTIFIER_START; decimal digits and combining/non-spacing marks give
   UC_IDENTIFIER_VALID; three ranges of control characters and the format
   characters give UC_IDENTIFIER_IGNORABLE; everything else is
   UC_IDENTIFIER_INVALID.
   NOTE(review): the return type, the braces and the lines closing each
   condition are not part of this listing.  */
4454 java_ident_category (unsigned int ch)
4456 /* FIXME: Check this against Sun's JDK implementation. */
4457 if (is_category_L (ch) /* = Character.isLetter(ch) */
4458 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4459 || is_category_Sc (ch) /* currency symbol */
4460 || is_category_Pc (ch) /* connector punctuation */
4462 return UC_IDENTIFIER_START;
4463 if (is_category_Nd (ch) /* digit */
4464 || is_category_Mc (ch) /* combining mark */
4465 || is_category_Mn (ch) /* non-spacing mark */
4467 return UC_IDENTIFIER_VALID;
/* CH is unsigned, so the lower bound 0x0000 is always satisfied.  */
4468 if ((ch >= 0x0000 && ch <= 0x0008)
4469 || (ch >= 0x000E && ch <= 0x001B)
4470 || (ch >= 0x007F && ch <= 0x009F)
4471 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4473 return UC_IDENTIFIER_IGNORABLE;
4474 return UC_IDENTIFIER_INVALID;
4477 /* Construction of sparse 3-level tables. */
/* Parameters of the table type used by output_ident_category: the type is
   named identsyntax_table, holds uint8_t elements and uses
   UC_IDENTIFIER_INVALID for entries that were never added.  Allocation goes
   through plain malloc and realloc.
   NOTE(review): the line that instantiates the table code from these
   macros is not part of this listing.  */
4478 #define TABLE identsyntax_table
4479 #define ELEMENT uint8_t
4480 #define DEFAULT UC_IDENTIFIER_INVALID
4481 #define xmalloc malloc
4482 #define xrealloc realloc
4485 /* Output an identifier syntax categorization in a three-level bitmap. */
/* Write to FILENAME a C initializer, named NAME, of a three-level table
   that holds PREDICATE (ch) for every code point ch in 0..0x10FFFF.
   Code points whose value is UC_IDENTIFIER_INVALID are not added to the
   table.  The output consists of five identsyntax_header_N macros, the
   level1 and level2 index arrays (with -1 printed in some cases; the
   deciding condition is not visible here) and the level3 array, in which
   eight 2-bit values are packed into each 16-bit word.
   NOTE(review): the return type, the local declarations, the braces and
   several short lines of this function are not part of this listing.  */
4487 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4491 struct identsyntax_table t;
4492 unsigned int level1_offset, level2_offset, level3_offset;
4494 stream = fopen (filename, "w");
4497 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4501 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4502 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4503 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4508 identsyntax_table_init (&t);
4510 for (ch = 0; ch < 0x110000; ch++)
4512 int syntaxcode = predicate (ch);
4513 if (syntaxcode != UC_IDENTIFIER_INVALID)
4514 identsyntax_table_add (&t, ch, syntaxcode);
4517 identsyntax_table_finalize (&t);
4519 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the sums below: 5 header words, then level1, then
   level2, then level3.  */
4521 5 * sizeof (uint32_t);
4523 5 * sizeof (uint32_t)
4524 + t.level1_size * sizeof (uint32_t);
4526 5 * sizeof (uint32_t)
4527 + t.level1_size * sizeof (uint32_t)
4528 + (t.level2_size << t.q) * sizeof (uint32_t);
4530 for (i = 0; i < 5; i++)
4531 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4532 ((uint32_t *) t.result)[i]);
4533 fprintf (stream, "static const\n");
4534 fprintf (stream, "struct\n");
4535 fprintf (stream, " {\n");
4536 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4537 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* 2 bits per entry, 16 bits per array element.  */
4538 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4539 (1 << t.p) * 2 / 16);
4540 fprintf (stream, " }\n");
4541 fprintf (stream, "%s =\n", name);
4542 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; non-negative entries are printed
   as indices relative to the start of level2.  */
4543 fprintf (stream, " {");
4544 if (t.level1_size > 8)
4545 fprintf (stream, "\n ");
4546 for (i = 0; i < t.level1_size; i++)
4549 if (i > 0 && (i % 8) == 0)
4550 fprintf (stream, "\n ");
4551 offset = ((uint32_t *) (t.result + level1_offset))[i];
4553 fprintf (stream, " %5d", -1);
4555 fprintf (stream, " %5zu",
4556 (offset - level2_offset) / sizeof (uint32_t));
4557 if (i+1 < t.level1_size)
4558 fprintf (stream, ",");
4560 if (t.level1_size > 8)
4561 fprintf (stream, "\n ");
4562 fprintf (stream, " },\n");
/* Level 2: same layout; non-negative entries are printed as indices
   relative to the start of level3.  */
4563 fprintf (stream, " {");
4564 if (t.level2_size << t.q > 8)
4565 fprintf (stream, "\n ");
4566 for (i = 0; i < t.level2_size << t.q; i++)
4569 if (i > 0 && (i % 8) == 0)
4570 fprintf (stream, "\n ");
4571 offset = ((uint32_t *) (t.result + level2_offset))[i];
4573 fprintf (stream, " %5d", -1);
4575 fprintf (stream, " %5zu",
4576 (offset - level3_offset) / sizeof (uint8_t));
4577 if (i+1 < t.level2_size << t.q)
4578 fprintf (stream, ",");
4580 if (t.level2_size << t.q > 8)
4581 fprintf (stream, "\n ");
4582 fprintf (stream, " },\n");
4583 /* Pack the level3 array. Each entry needs 2 bits only. */
4584 fprintf (stream, " {");
4585 if ((t.level3_size << t.p) * 2 / 16 > 8)
4586 fprintf (stream, "\n ");
4587 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4589 if (i > 0 && (i % 8) == 0)
4590 fprintf (stream, "\n ");
/* Entry 8*i+k of level3 occupies bits 2*k and 2*k+1 of output word i.  */
4591 fprintf (stream, " 0x%04x",
4592 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4593 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4594 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4595 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4596 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4597 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4598 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4599 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4600 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4601 fprintf (stream, ",");
4603 if ((t.level3_size << t.p) * 2 / 16 > 8)
4604 fprintf (stream, "\n ");
4605 fprintf (stream, " }\n");
4606 fprintf (stream, "};\n");
4608 if (ferror (stream) || fclose (stream))
4610 fprintf (stderr, "error writing to '%s'\n", filename);
/* Emit the language syntax outputs.  For each of the predicates
   is_c_whitespace and is_java_whitespace the PROPERTY macro produces three
   files: a debug listing, a test source and the property table header.
   The two identifier categorizations are then written through
   output_ident_category.
   NOTE(review): the return type, the braces and any line that undefines
   PROPERTY are not part of this listing.  */
4616 output_ident_properties (const char *version)
4618 #define PROPERTY(P) \
4619 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4620 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4621 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4622 PROPERTY(c_whitespace)
4623 PROPERTY(java_whitespace)
4626 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4627 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4630 /* ========================================================================= */
4632 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4633 glibc/localedata/locales/i18n file, generated by
4634 glibc/localedata/gen-unicode-ctype.c. */
4636 /* Character mappings. */
/* Upper case mapping of CH: the 'upper' field of its unicode_attributes
   entry, provided the entry is assigned (name != NULL) and the field is not
   NONE.
   NOTE(review): the return type, the braces and the fallback branch are not
   part of this listing.  */
4639 to_upper (unsigned int ch)
4641 if (unicode_attributes[ch].name != NULL
4642 && unicode_attributes[ch].upper != NONE)
4643 return unicode_attributes[ch].upper;
/* Lower case mapping of CH: the 'lower' field of its unicode_attributes
   entry, provided the entry is assigned (name != NULL) and the field is not
   NONE.
   NOTE(review): the return type, the braces and the fallback branch are not
   part of this listing.  */
4649 to_lower (unsigned int ch)
4651 if (unicode_attributes[ch].name != NULL
4652 && unicode_attributes[ch].lower != NONE)
4653 return unicode_attributes[ch].lower;
/* Title case mapping of CH: the 'title' field of its unicode_attributes
   entry, provided the entry is assigned (name != NULL) and the field is not
   NONE.
   NOTE(review): the return type, the braces and the fallback branch are not
   part of this listing.  */
4659 to_title (unsigned int ch)
4661 if (unicode_attributes[ch].name != NULL
4662 && unicode_attributes[ch].title != NONE)
4663 return unicode_attributes[ch].title;
4668 /* Character class properties. */
/* CH counts as upper case when its lower case mapping differs from CH.  */
4671 is_upper (unsigned int ch)
4673 return (to_lower (ch) != ch);
/* CH counts as lower case when its upper case mapping differs from CH.
   NOTE(review): the end of the return expression, which the comment about
   <U00DF> refers to, is not part of this listing.  */
4677 is_lower (unsigned int ch)
4679 return (to_upper (ch) != ch)
4680 /* <U00DF> is lowercase, but without simple to_upper mapping. */
/* Alphabetic class.  Only assigned characters (name != NULL) qualify.  The
   visible alternatives are: general category L* except U+0E2F and U+0E46,
   two Thai ranges, category Nl, category So characters whose name contains
   " LETTER ", and category Nd except the ASCII digits.
   NOTE(review): a few short alternatives and the return type and braces
   are not part of this listing.  */
4685 is_alpha (unsigned int ch)
4687 return (unicode_attributes[ch].name != NULL
4688 && ((unicode_attributes[ch].category[0] == 'L'
4689 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4690 <U0E2F>, <U0E46> should belong to is_punct. */
4691 && (ch != 0x0E2F) && (ch != 0x0E46))
4692 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4693 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4695 || (ch >= 0x0E34 && ch <= 0x0E3A)
4696 || (ch >= 0x0E47 && ch <= 0x0E4E)
4697 /* Avoid warning for <U0345>. */
4699 /* Avoid warnings for <U2160>..<U217F>. */
4700 || (unicode_attributes[ch].category[0] == 'N'
4701 && unicode_attributes[ch].category[1] == 'l')
4702 /* Avoid warnings for <U24B6>..<U24E9>. */
4703 || (unicode_attributes[ch].category[0] == 'S'
4704 && unicode_attributes[ch].category[1] == 'o'
4705 && strstr (unicode_attributes[ch].name, " LETTER ")
4707 /* Consider all the non-ASCII digits as alphabetic.
4708 ISO C 99 forbids us to have them in category "digit",
4709 but we want iswalnum to return true on them. */
4710 || (unicode_attributes[ch].category[0] == 'N'
4711 && unicode_attributes[ch].category[1] == 'd'
4712 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Digit class.  Two definitions appear below: one accepting every assigned
   character of general category Nd, and one accepting only the ASCII digits
   U+0030..U+0039, which the quoted ISO C 99 text asks for.
   NOTE(review): the preprocessor lines that select between the two
   definitions, the return type and the braces are not part of this
   listing.  */
4716 is_digit (unsigned int ch)
4719 return (unicode_attributes[ch].name != NULL
4720 && unicode_attributes[ch].category[0] == 'N'
4721 && unicode_attributes[ch].category[1] == 'd');
4722 /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
4723 a zero. Must add <0> in front of them by hand. */
4725 /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
4728 The iswdigit function tests for any wide character that corresponds
4729 to a decimal-digit character (as defined in 5.2.1).
4731 the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
4733 return (ch >= 0x0030 && ch <= 0x0039);
/* Return true if CH is one of the ASCII digits U+0030..U+0039, the only
   members of the "outdigit" class written by output_tables.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Alphanumeric class: the union of is_alpha and is_digit.  */
4744 is_alnum (unsigned int ch)
4746 return is_alpha (ch) || is_digit (ch);
/* Blank class: the horizontal tab, plus every assigned character of
   general category Zs whose decomposition field does not contain
   "<noBreak>".  */
4750 is_blank (unsigned int ch)
4752 return (ch == 0x0009 /* '\t' */
4753 /* Category Zs without mention of "<noBreak>" */
4754 || (unicode_attributes[ch].name != NULL
4755 && unicode_attributes[ch].category[0] == 'Z'
4756 && unicode_attributes[ch].category[1] == 's'
4757 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
/* Space class: the six ASCII white-space characters, plus assigned
   characters of general category Zl or Zp, plus category Zs subject to a
   test of the decomposition field.
   NOTE(review): the end of the return expression, the return type and the
   braces are not part of this listing.  */
4761 is_space (unsigned int ch)
4763 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4764 should treat it like a punctuation character, not like a space. */
4765 return (ch == 0x0020 /* ' ' */
4766 || ch == 0x000C /* '\f' */
4767 || ch == 0x000A /* '\n' */
4768 || ch == 0x000D /* '\r' */
4769 || ch == 0x0009 /* '\t' */
4770 || ch == 0x000B /* '\v' */
4771 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4772 || (unicode_attributes[ch].name != NULL
4773 && unicode_attributes[ch].category[0] == 'Z'
4774 && (unicode_attributes[ch].category[1] == 'l'
4775 || unicode_attributes[ch].category[1] == 'p'
4776 || (unicode_attributes[ch].category[1] == 's'
4777 && !strstr (unicode_attributes[ch].decomposition,
/* Control class: assigned characters whose name is exactly "<control>",
   plus the characters of general category Zl or Zp.  */
4782 is_cntrl (unsigned int ch)
4784 return (unicode_attributes[ch].name != NULL
4785 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4786 /* Categories Zl and Zp */
4787 || (unicode_attributes[ch].category[0] == 'Z'
4788 && (unicode_attributes[ch].category[1] == 'l'
4789 || unicode_attributes[ch].category[1] == 'p'))));
/* Hexadecimal digit class.  Two definitions appear below: one built on
   is_digit plus A..F and a..f, and one that accepts exactly the 22 ASCII
   characters 0..9, A..F and a..f, which the quoted ISO C 99 text asks for.
   NOTE(review): the preprocessor lines that select between the two
   definitions, the return type and the braces are not part of this
   listing.  */
4793 is_xdigit (unsigned int ch)
4796 return is_digit (ch)
4797 || (ch >= 0x0041 && ch <= 0x0046)
4798 || (ch >= 0x0061 && ch <= 0x0066);
4800 /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
4803 The iswxdigit function tests for any wide character that corresponds
4804 to a hexadecimal-digit character (as defined in 6.4.4.1).
4806 hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
4808 return (ch >= 0x0030 && ch <= 0x0039)
4809 || (ch >= 0x0041 && ch <= 0x0046)
4810 || (ch >= 0x0061 && ch <= 0x0066);
/* Graphic class: requires an assigned character whose name differs from
   "<control>" (strcmp returns non-zero).
   NOTE(review): the end of the return expression, the return type and the
   braces are not part of this listing.  */
4815 is_graph (unsigned int ch)
4817 return (unicode_attributes[ch].name != NULL
4818 && strcmp (unicode_attributes[ch].name, "<control>")
/* Printable class: assigned characters whose name differs from "<control>"
   and whose general category is neither Zl nor Zp.  The second test of
   name != NULL repeats the first one.  */
4823 is_print (unsigned int ch)
4825 return (unicode_attributes[ch].name != NULL
4826 && strcmp (unicode_attributes[ch].name, "<control>")
4827 /* Categories Zl and Zp */
4828 && !(unicode_attributes[ch].name != NULL
4829 && unicode_attributes[ch].category[0] == 'Z'
4830 && (unicode_attributes[ch].category[1] == 'l'
4831 || unicode_attributes[ch].category[1] == 'p')));
/* Punctuation class.  Two definitions appear below: one accepting the
   assigned characters of general category P*, and the traditional POSIX
   one, graphic characters that are neither alphabetic nor digits.
   NOTE(review): the preprocessor lines that select between the two
   definitions, the return type and the braces are not part of this
   listing.  */
4835 is_punct (unsigned int ch)
4838 return (unicode_attributes[ch].name != NULL
4839 && unicode_attributes[ch].category[0] == 'P');
4841 /* The traditional POSIX definition of punctuation is every graphic,
4842 non-alphanumeric character. */
4843 return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
4847 /* Output all properties. */
/* For a predicate is_P, the PROPERTY macro defined here writes three
   files: the debug listing "unictype/ctype_P.txt", the test source
   "../tests/unictype/test-ctype_P.c" and the table header
   "unictype/ctype_P.h".
   NOTE(review): the invocations of PROPERTY, the return type and the braces
   are not part of this listing.  */
4849 output_old_ctype (const char *version)
4851 #define PROPERTY(P) \
4852 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4853 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4854 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
/* Combining class: assigned characters of general category Mn, Mc or Me.  */
4873 is_combining (unsigned int ch)
4875 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4876 file. In 3.0.1 it was identical to the union of the general categories
4877 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4878 PropList.txt file, so we take the latter definition. */
4879 return (unicode_attributes[ch].name != NULL
4880 && unicode_attributes[ch].category[0] == 'M'
4881 && (unicode_attributes[ch].category[1] == 'n'
4882 || unicode_attributes[ch].category[1] == 'c'
4883 || unicode_attributes[ch].category[1] == 'e'));
/* Subset of is_combining: excludes the characters whose canonical
   combining class field is non-empty, does not start with '0' and parses
   (base 10) to a value of 200 or more.  */
4887 is_combining_level3 (unsigned int ch)
4889 return is_combining (ch)
4890 && !(unicode_attributes[ch].combining[0] != '\0'
4891 && unicode_attributes[ch].combining[0] != '0'
4892 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<UXXXX>" with four
   hexadecimal digits for code points below 0x10000, "<UXXXXXXXX>" with
   eight digits otherwise.
   The result points to a static buffer that the next call overwrites, so
   callers must copy it before calling ucs_symbol again.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* Bounded formatting: the output can never run past the end of buf.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
4905 /* Return the UCS symbol range string for a Unicode characters interval. */
/* Builds the range string in a static buffer from ucs_symbol (LOW) and
   ucs_symbol (HIGH).  Each ucs_symbol result is copied before ucs_symbol is
   called again; this matters because ucs_symbol returns a static buffer.
   NOTE(review): the return type, the braces, the statement between the two
   copies and the return statement are not part of this listing.  */
4907 ucs_symbol_range (unsigned int low, unsigned int high)
4909 static char buf[24+1];
4911 strcpy (buf, ucs_symbol (low));
4913 strcat (buf, ucs_symbol (high));
4917 /* Output a character class (= property) table. */
/* Write to STREAM the line(s) for the character class CLASSNAME.  FUNC is
   evaluated once for every code point 0..0x10FFFF and the results are kept
   in a local table; runs of consecutive members are then printed as a
   single symbol or as a symbol range, separated by ';'.  When an item would
   pass column max_column, the line is continued after "/" and a newline.
   NOTE(review): the table is a local array of 0x110000 bytes.
   NOTE(review): the return type, the braces and several short statements
   are not part of this listing.  */
4920 output_charclass (FILE *stream, const char *classname,
4921 bool (*func) (unsigned int))
4923 char table[0x110000];
4925 bool need_semicolon;
4926 const int max_column = 75;
4929 for (i = 0; i < 0x110000; i++)
4930 table[i] = (int) func (i);
4932 fprintf (stream, "%s ", classname);
4933 need_semicolon = false;
4935 for (i = 0; i < 0x110000; )
4941 unsigned int low, high;
/* Condition of the loop that advances i to the end of the current run.  */
4947 while (i < 0x110000 && table[i]);
4951 strcpy (buf, ucs_symbol (low));
4953 strcpy (buf, ucs_symbol_range (low, high));
4957 fprintf (stream, ";");
4961 if (column + strlen (buf) > max_column)
4963 fprintf (stream, "/\n ");
4967 fprintf (stream, "%s", buf);
4968 column += strlen (buf);
4969 need_semicolon = true;
4972 fprintf (stream, "\n");
4975 /* Output a character mapping table. */
/* Write to STREAM the line(s) for the character mapping MAPNAME.  A local
   table records, for every code point 0..0x10FFFF, whether FUNC maps it to
   a different code point.  The printed items are built from ucs_symbol (i)
   and ucs_symbol (func (i)), separated by ';' and wrapped after "/" and a
   newline when an item would pass column max_column.
   NOTE(review): the table is a local array of 0x110000 bytes.
   NOTE(review): the return type, the braces and several short statements
   are not part of this listing.  */
4978 output_charmap (FILE *stream, const char *mapname,
4979 unsigned int (*func) (unsigned int))
4981 char table[0x110000];
4983 bool need_semicolon;
4984 const int max_column = 75;
4987 for (i = 0; i < 0x110000; i++)
4988 table[i] = (func (i) != i);
4990 fprintf (stream, "%s ", mapname);
4991 need_semicolon = false;
4993 for (i = 0; i < 0x110000; i++)
/* ucs_symbol returns a static buffer, so each result is appended to buf
   before the next call.  */
4999 strcat (buf, ucs_symbol (i));
5001 strcat (buf, ucs_symbol (func (i)));
5006 fprintf (stream, ";");
5010 if (column + strlen (buf) > max_column)
5012 fprintf (stream, "/\n ");
5016 fprintf (stream, "%s", buf);
5017 column += strlen (buf);
5018 need_semicolon = true;
5020 fprintf (stream, "\n");
5023 /* Output the width table. */
/* Called by output_tables just before "END LC_CTYPE" is written.
   NOTE(review): no statement of this function's body is part of this
   listing.  */
5026 output_widthmap (FILE *stream)
5030 /* Output the tables to the given file. */
/* Write a locale definition file to FILENAME.  It consists of an
   LC_IDENTIFICATION section (title and revision carry VERSION, the date is
   the current day in UTC) and an LC_CTYPE section holding the character
   classes and the toupper/tolower/totitle maps.  Before the LC_CTYPE
   section is written, every code point 0..0x10FFFF is checked against the
   consistency rules between the classes; violations are only reported on
   stderr.
   NOTE(review): the return type, the local declarations, the braces and
   several short lines of this function are not part of this listing.  */
5033 output_tables (const char *filename, const char *version)
5038 stream = fopen (filename, "w");
5041 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5045 fprintf (stream, "escape_char /\n");
5046 fprintf (stream, "comment_char %%\n");
5047 fprintf (stream, "\n");
5048 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
5050 fprintf (stream, "\n");
5052 fprintf (stream, "LC_IDENTIFICATION\n");
5053 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
5054 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
5055 fprintf (stream, "address \"\"\n");
5056 fprintf (stream, "contact \"\"\n");
5057 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
5058 fprintf (stream, "tel \"\"\n");
5059 fprintf (stream, "fax \"\"\n");
5060 fprintf (stream, "language \"\"\n");
5061 fprintf (stream, "territory \"Earth\"\n");
5062 fprintf (stream, "revision \"%s\"\n", version);
/* The date is formatted as YYYY-MM-DD from the UTC breakdown of 'now'.  */
5067 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
5068 fprintf (stream, "date \"%s\"\n", date);
5070 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
5071 fprintf (stream, "END LC_IDENTIFICATION\n");
5072 fprintf (stream, "\n");
5074 /* Verifications. */
5075 for (ch = 0; ch < 0x110000; ch++)
5077 /* toupper restriction: "Only characters specified for the keywords
5078 lower and upper shall be specified." */
5079 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5081 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
5082 ucs_symbol (ch), ch, to_upper (ch));
5084 /* tolower restriction: "Only characters specified for the keywords
5085 lower and upper shall be specified." */
5086 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
5088 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
5089 ucs_symbol (ch), ch, to_lower (ch));
5091 /* alpha restriction: "Characters classified as either upper or lower
5092 shall automatically belong to this class." */
5093 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
5094 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
5096 /* alpha restriction: "No character specified for the keywords cntrl,
5097 digit, punct or space shall be specified." */
5098 if (is_alpha (ch) && is_cntrl (ch))
5099 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
5100 if (is_alpha (ch) && is_digit (ch))
5101 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
5102 if (is_alpha (ch) && is_punct (ch))
5103 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
5104 if (is_alpha (ch) && is_space (ch))
5105 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
5107 /* space restriction: "No character specified for the keywords upper,
5108 lower, alpha, digit, graph or xdigit shall be specified."
5109 upper, lower, alpha already checked above. */
5110 if (is_space (ch) && is_digit (ch))
5111 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
5112 if (is_space (ch) && is_graph (ch))
5113 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
5114 if (is_space (ch) && is_xdigit (ch))
5115 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
5117 /* cntrl restriction: "No character specified for the keywords upper,
5118 lower, alpha, digit, punct, graph, print or xdigit shall be
5119 specified." upper, lower, alpha already checked above. */
5120 if (is_cntrl (ch) && is_digit (ch))
5121 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5122 if (is_cntrl (ch) && is_punct (ch))
5123 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5124 if (is_cntrl (ch) && is_graph (ch))
5125 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5126 if (is_cntrl (ch) && is_print (ch))
5127 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5128 if (is_cntrl (ch) && is_xdigit (ch))
5129 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5131 /* punct restriction: "No character specified for the keywords upper,
5132 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5133 be specified." upper, lower, alpha, cntrl already checked above. */
5134 if (is_punct (ch) && is_digit (ch))
5135 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5136 if (is_punct (ch) && is_xdigit (ch))
5137 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5138 if (is_punct (ch) && (ch == 0x0020))
5139 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5141 /* graph restriction: "No character specified for the keyword cntrl
5142 shall be specified." Already checked above. */
5144 /* print restriction: "No character specified for the keyword cntrl
5145 shall be specified." Already checked above. */
5147 /* graph - print relation: differ only in the <space> character.
5148 How is this possible if there are more than one space character?!
5149 I think susv2/xbd/locale.html should speak of "space characters",
5150 not "space character". */
5151 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5153 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5154 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5156 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section proper: classes first, then the case maps.  */
5159 fprintf (stream, "LC_CTYPE\n");
5160 output_charclass (stream, "upper", is_upper);
5161 output_charclass (stream, "lower", is_lower);
5162 output_charclass (stream, "alpha", is_alpha);
5163 output_charclass (stream, "digit", is_digit);
5164 output_charclass (stream, "outdigit", is_outdigit);
5165 output_charclass (stream, "blank", is_blank);
5166 output_charclass (stream, "space", is_space);
5167 output_charclass (stream, "cntrl", is_cntrl);
5168 output_charclass (stream, "punct", is_punct);
5169 output_charclass (stream, "xdigit", is_xdigit);
5170 output_charclass (stream, "graph", is_graph);
5171 output_charclass (stream, "print", is_print);
5172 output_charclass (stream, "class \"combining\";", is_combining);
5173 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5174 output_charmap (stream, "toupper", to_upper);
5175 output_charmap (stream, "tolower", to_lower);
5176 output_charmap (stream, "map \"totitle\";", to_title);
5177 output_widthmap (stream);
5178 fprintf (stream, "END LC_CTYPE\n");
5180 if (ferror (stream) || fclose (stream))
5182 fprintf (stderr, "error writing to '%s'\n", filename);
5189 /* ========================================================================= */
5191 /* The width property from the EastAsianWidth.txt file.
5192 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
/* Indexed by code point, one entry for each of the 0x110000 code points.
   fill_width() first presets an entry to "N" when the code point has a name
   in unicode_attributes[] and to NULL otherwise, then overwrites entries
   with the values read from the file.  */
5193 const char * unicode_width[0x110000];
5195 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file. */
/* Fills unicode_width[] from the file named WIDTH_FILENAME.
   NOTE(review): this view of the function is missing lines (return type,
   local declarations, braces, the read loop header and several conditions),
   so the comments below describe only the statements that are visible.  */
5198 fill_width (const char *width_filename)
/* Scratch buffers for the three fields read from one input line.  */
5202 char field0[FIELDLEN];
5203 char field1[FIELDLEN];
5204 char field2[FIELDLEN];
/* Preset: a code point that has a name in unicode_attributes[] gets "N",
   every other code point gets NULL.  */
5207 for (i = 0; i < 0x110000; i++)
5208 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5210 stream = fopen (width_filename, "r");
5213 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Discards the rest of the current input line.  The condition that selects
   this path is not visible here -- presumably comment lines; TODO confirm.  */
5228 do c = getc (stream); while (c != EOF && c != '\n');
/* Reads three fields: up to ';', up to ' ', and up to the end of the line.
   n accumulates the three getfield() results.  */
5232 n = getfield (stream, field0, ';');
5233 n += getfield (stream, field1, ' ');
5234 n += getfield (stream, field2, '\n');
5239 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
/* field0 is a hexadecimal code point, optionally followed by ".." and the
   hexadecimal end of a range.  strtoul stops at the first non-hex character,
   so i is the start of the range in both cases.  */
5242 i = strtoul (field0, NULL, 16);
5243 if (strstr (field0, "..") != NULL)
5245 /* Deal with a range. */
5246 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
/* NOTE(review): the strdup result is stored without a NULL check, and no
   matching free is visible in this function.  */
5248 unicode_width[i] = strdup (field1);
5252 /* Single character line. */
5253 unicode_width[i] = strdup (field1);
/* A stream error or a failing fclose is reported as a read error.  */
5256 if (ferror (stream) || fclose (stream))
5258 fprintf (stderr, "error reading from '%s'\n", width_filename);
5263 /* ========================================================================= */
5265 /* Non-spacing attribute and width. */
5267 /* The non-spacing attribute table consists of:
5268 - Non-spacing characters; generated from PropList.txt or
5269 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
5270 - Format control characters; generated from
5271 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
5272 - Zero width characters; generated from
5273 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" */
/* Tests whether CH is treated as non-spacing.
   The result is true when CH has a name in unicode_attributes[] and at least
   one of the following holds:
     - its bidi category is UC_BIDI_NSM,
     - it is in general category Cc or Cf,
     - its name starts with "ZERO WIDTH " (first 11 characters compared).
   Code points without a name always yield false; the name test also guards
   the strncmp call against a NULL name.  */
5277 is_nonspacing (unsigned int ch)
5279 return (unicode_attributes[ch].name != NULL
5280 && (get_bidi_category (ch) == UC_BIDI_NSM
5281 || is_category_Cc (ch) || is_category_Cf (ch)
5282 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* Writes to the file named FILENAME two C array definitions,
   nonspacing_table_data[] and nonspacing_table_ind[], that describe the
   is_nonspacing() predicate over the code points 0..0x10FFFF, organized in
   blocks of 0x200 code points.
   NOTE(review): this view of the function is missing lines (return type,
   local declarations, braces and several statements), so the comments below
   describe only the statements that are visible.  */
5286 output_nonspacing_property (const char *filename)
/* One entry per block of 0x200 code points.  */
5289 int ind[0x110000 / 0x200];
5294 stream = fopen (filename, "w");
5297 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Pass 1: scan every block with is_nonspacing() and hand out consecutive
   data indices through next_ind.  The statements that set nontrivial and
   that handle blocks without an index are not visible here; presumably only
   nontrivial blocks receive an index -- TODO confirm.  */
5302 for (i = 0; i < 0x110000 / 0x200; i++)
5304 bool nontrivial = false;
5307 if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
5308 for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
5309 if (is_nonspacing (ch))
5315 ind[i] = next_ind++;
/* Pass 2: emit the data table, 64 bytes per block.  */
5320 fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
5323 for (i = 0; i < 0x110000 / 0x200; i++)
/* A block has data when its index is non-negative.  */
5325 bool nontrivial = (ind[i] >= 0);
5331 fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
/* j selects one of 8 output rows (0x40 code points each), k one of 8 bytes
   in the row, l one of the 8 code points covered by a byte.  */
5332 for (j = 0; j < 8; j++)
5336 fprintf (stream, " ");
5337 for (k = 0; k < 8; k++)
5340 unsigned char bits = 0;
5342 for (l = 0; l < 8; l++)
5344 unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
/* The statement that updates bits is not visible here; presumably it sets
   bit l when the test succeeds -- TODO confirm.  */
5346 if (is_nonspacing (ch))
/* Every byte is followed by ',' except the very last byte of the block that
   holds the highest data index, which is followed by ' '.  */
5349 fprintf (stream, " 0x%02x%c", bits,
5350 ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
5352 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5353 i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
5358 fprintf (stream, "};\n");
/* Round i_max up to a multiple of 8 so that the index table can be printed
   in full rows of 8 entries.  */
5360 i_max = ((i_max + 8 - 1) / 8) * 8;
5361 fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
/* Emit the index table, 8 entries per row; each row is followed by a comment
   giving the code point range covered by the row.  */
5366 for (j = 0; j < i_max / 8; j++)
5370 fprintf (stream, " ");
5371 for (k = 0; k < 8; k++)
5374 fprintf (stream, " %2d%c", ind[i],
5375 j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
5377 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5378 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
5381 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
5383 if (ferror (stream) || fclose (stream))
5385 fprintf (stderr, "error writing to '%s'\n", filename);
5390 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
/* Classifies the display width of CH.
   NOTE(review): the return statements (and the braces) of this function are
   not visible in this view, so the value produced by each test below cannot
   be stated here.  The visible order of the tests is:
     1. unassigned code points, classified by range only (private use area,
        CJK ideograph blocks and planes);
     2. Cc control characters below U+00A0;
     3. is_nonspacing();
     4. East Asian Width "W" or "F" from unicode_width[];
     5. East Asian Width "H" from unicode_width[];
     6. any remaining code point in U+00A1..U+FFFF.  */
5392 symbolic_width (unsigned int ch)
5394 /* Test for unassigned character. */
5395 if (is_property_unassigned_code_value (ch))
5397 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
5398 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
5400 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
5401 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
5402 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
5403 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
5404 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
5410 /* Test for non-spacing or control character. */
5411 if (is_category_Cc (ch) && ch < 0x00A0)
5413 if (is_nonspacing (ch))
5415 /* Test for double-width character. */
/* unicode_width[ch] may be NULL, hence the check before each strcmp.  */
5416 if (unicode_width[ch] != NULL
5417 && (strcmp (unicode_width[ch], "W") == 0
5418 || strcmp (unicode_width[ch], "F") == 0))
5420 /* Test for half-width character. */
5421 if (unicode_width[ch] != NULL
5422 && strcmp (unicode_width[ch], "H") == 0)
5425 /* In ancient CJK encodings, Cyrillic and most other characters are
5426 double-width as well. */
5427 if (ch >= 0x00A1 && ch < 0x10000)
/* Writes to the file named FILENAME the result of symbolic_width() for all
   code points 0..0x10FFFF, compressed into intervals of equal value.
   A one-element interval is written as "%04X\t\t%c\n", a longer interval as
   "%04X..%04X\t%c\n".
   NOTE(review): this view of the function is missing lines (return type,
   the stream declaration, braces, the statement that extends an interval
   and the initialization of interval_value), so the comments below describe
   only the statements that are visible.  */
5433 output_width_property_test (const char *filename)
5436 unsigned int interval_start, interval_end, ch;
/* Value of the interval currently being collected; 0 is tested below as
   "no interval".  It is compared before the first visible assignment, so it
   must be initialized on a line not visible here -- TODO confirm.  */
5437 char interval_value;
5439 stream = fopen (filename, "w");
5442 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5447 interval_start = interval_end = 0; /* avoid GCC warning */
5448 for (ch = 0; ch < 0x110000; ch++)
5450 char value = symbolic_width (ch);
5451 if (value != 0) /* skip Cc control characters and unassigned characters */
5453 if (value == interval_value)
5454 /* Extend the interval. */
5458 /* Terminate the interval. */
5459 if (interval_value != 0)
5461 if (interval_end == interval_start)
5462 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
5464 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
5466 /* Start a new interval. */
5467 interval_start = interval_end = ch;
5468 interval_value = value;
5472 /* Terminate the last interval. */
5473 if (interval_value != 0)
5475 if (interval_end == interval_start)
5476 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
5478 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
/* A stream error or a failing fclose is reported as a write error.  */
5481 if (ferror (stream) || fclose (stream))
5483 fprintf (stderr, "error writing to '%s'\n", filename);
5488 /* ========================================================================= */
5490 /* Line breaking classification.
5491 Updated for Unicode TR #14 revision 26. */
/* Line breaking classes.
   The numeric values are not in declaration order: the classes numbered
   0..24 come first numerically, and the classes numbered 25 and above are
   the ones resolved at run time (see the next comment).  get_lbp() uses each
   value as a bit position, setting (int64_t) 1 << LBP_xx in its result.
   Classes that appear only inside comments (CR, LF, NL, SG) are deliberately
   not defined; the reason is given next to each.
   NOTE(review): the line that opens this enumeration and the line that
   closes it are not visible in this view.  */
5495 /* Values >= 25 are resolved at run time. */
5496 LBP_BK = 25, /* mandatory break */
5497 /*LBP_CR, carriage return - not used here because it's a DOSism */
5498 /*LBP_LF, line feed - not used here because it's a DOSism */
5499 LBP_CM = 26, /* attached characters and combining marks */
5500 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
5501 /*LBP_SG, surrogates - not used here because they are not characters */
5502 LBP_WJ = 0, /* word joiner */
5503 LBP_ZW = 27, /* zero width space */
5504 LBP_GL = 1, /* non-breaking (glue) */
5505 LBP_SP = 28, /* space */
5506 LBP_B2 = 2, /* break opportunity before and after */
5507 LBP_BA = 3, /* break opportunity after */
5508 LBP_BB = 4, /* break opportunity before */
5509 LBP_HY = 5, /* hyphen */
5510 LBP_CB = 29, /* contingent break opportunity */
5511 LBP_CL = 6, /* closing punctuation */
5512 LBP_CP = 7, /* closing parenthesis */
5513 LBP_EX = 8, /* exclamation/interrogation */
5514 LBP_IN = 9, /* inseparable */
5515 LBP_NS = 10, /* non starter */
5516 LBP_OP = 11, /* opening punctuation */
5517 LBP_QU = 12, /* ambiguous quotation */
5518 LBP_IS = 13, /* infix separator (numeric) */
5519 LBP_NU = 14, /* numeric */
5520 LBP_PO = 15, /* postfix (numeric) */
5521 LBP_PR = 16, /* prefix (numeric) */
5522 LBP_SY = 17, /* symbols allowing breaks */
5523 LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
5524 LBP_AL = 18, /* ordinary alphabetic and symbol characters */
5525 LBP_H2 = 19, /* Hangul LV syllable */
5526 LBP_H3 = 20, /* Hangul LVT syllable */
5527 LBP_ID = 21, /* ideographic */
5528 LBP_JL = 22, /* Hangul L Jamo */
5529 LBP_JV = 23, /* Hangul V Jamo */
5530 LBP_JT = 24, /* Hangul T Jamo */
5531 LBP_SA = 31, /* complex context (South East Asian) */
5532 LBP_XX = 32 /* unknown */
5535 /* Returns the line breaking classification for ch, as a bit mask. */
5537 get_lbp (unsigned int ch)
5541 if (unicode_attributes[ch].name != NULL)
5543 /* mandatory break */
5544 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5545 || ch == 0x000C /* form feed */
5546 || ch == 0x000B /* line tabulation */
5547 || ch == 0x2028 /* LINE SEPARATOR */
5548 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5549 attr |= (int64_t) 1 << LBP_BK;
5551 if (ch == 0x2060 /* WORD JOINER */
5552 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5553 attr |= (int64_t) 1 << LBP_WJ;
5555 /* zero width space */
5556 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5557 attr |= (int64_t) 1 << LBP_ZW;
5559 /* non-breaking (glue) */
5560 if (ch == 0x00A0 /* NO-BREAK SPACE */
5561 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5562 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5563 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5564 || ch == 0x2007 /* FIGURE SPACE */
5565 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5566 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5567 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5568 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5569 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
5570 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5571 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
5572 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */)
5573 attr |= (int64_t) 1 << LBP_GL;
5576 if (ch == 0x0020 /* SPACE */)
5577 attr |= (int64_t) 1 << LBP_SP;
5579 /* break opportunity before and after */
5580 if (ch == 0x2014 /* EM DASH */)
5581 attr |= (int64_t) 1 << LBP_B2;
5583 /* break opportunity after */
5584 if (/* Breaking Spaces */
5585 ch == 0x1680 /* OGHAM SPACE MARK */
5586 || ch == 0x2000 /* EN QUAD */
5587 || ch == 0x2001 /* EM QUAD */
5588 || ch == 0x2002 /* EN SPACE */
5589 || ch == 0x2003 /* EM SPACE */
5590 || ch == 0x2004 /* THREE-PER-EM SPACE */
5591 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5592 || ch == 0x2006 /* SIX-PER-EM SPACE */
5593 || ch == 0x2008 /* PUNCTUATION SPACE */
5594 || ch == 0x2009 /* THIN SPACE */
5595 || ch == 0x200A /* HAIR SPACE */
5596 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5598 || ch == 0x0009 /* tab */
5599 /* Conditional Hyphens */
5600 || ch == 0x00AD /* SOFT HYPHEN */
5601 /* Breaking Hyphens */
5602 || ch == 0x058A /* ARMENIAN HYPHEN */
5603 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
5604 || ch == 0x2010 /* HYPHEN */
5605 || ch == 0x2012 /* FIGURE DASH */
5606 || ch == 0x2013 /* EN DASH */
5607 /* Visible Word Dividers */
5608 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5609 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5610 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5611 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5612 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5613 || ch == 0x2027 /* HYPHENATION POINT */
5614 || ch == 0x007C /* VERTICAL LINE */
5615 /* Historic Word Separators */
5616 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5617 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5618 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5619 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5620 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5621 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5622 || ch == 0x205A /* TWO DOT PUNCTUATION */
5623 || ch == 0x205B /* FOUR DOT MARK */
5624 || ch == 0x205D /* TRICOLON */
5625 || ch == 0x205E /* VERTICAL FOUR DOTS */
5626 || ch == 0x2E19 /* PALM BRANCH */
5627 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5628 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5629 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5630 || ch == 0x2E2D /* FIVE DOT MARK */
5631 || ch == 0x2E30 /* RING POINT */
5632 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
5633 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5634 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5635 || ch == 0x10102 /* AEGEAN CHECK MARK */
5636 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5637 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5638 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5639 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5641 || ch == 0x0964 /* DEVANAGARI DANDA */
5642 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5643 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5644 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5645 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5646 || ch == 0x104B /* MYANMAR SIGN SECTION */
5647 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5648 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5649 || ch == 0x17D4 /* KHMER SIGN KHAN */
5650 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5651 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5652 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5653 || ch == 0xA8CE /* SAURASHTRA DANDA */
5654 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5655 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5656 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5657 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5658 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5659 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5661 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5662 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5663 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5664 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5665 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5666 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5667 /* Other Terminating Punctuation */
5668 || ch == 0x1804 /* MONGOLIAN COLON */
5669 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5670 || ch == 0x1B5A /* BALINESE PANTI */
5671 || ch == 0x1B5B /* BALINESE PAMADA */
5672 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5673 || ch == 0x1B60 /* BALINESE PAMENENG */
5674 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5675 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5676 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5677 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5678 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5679 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5680 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5681 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5682 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5683 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5684 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5685 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5686 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5687 || ch == 0xA60D /* VAI COMMA */
5688 || ch == 0xA60F /* VAI QUESTION MARK */
5689 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5690 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5691 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5692 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5693 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5694 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5695 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5696 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5697 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5698 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
5699 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
5700 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
5701 || ch == 0xA6F3 /* BAMUM FULL STOP */
5702 || ch == 0xA6F4 /* BAMUM COLON */
5703 || ch == 0xA6F5 /* BAMUM COMMA */
5704 || ch == 0xA6F6 /* BAMUM SEMICOLON */
5705 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
5706 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
5707 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
5708 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
5709 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
5710 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
5711 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
5712 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
5713 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
5714 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
5715 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
5716 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
5717 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
5718 || ch == 0x11047 /* BRAHMI DANDA */
5719 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
5720 || ch == 0x110BE /* KAITHI SECTION MARK */
5721 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
5722 || ch == 0x110C0 /* KAITHI DANDA */
5723 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
5724 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5725 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5726 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5727 attr |= (int64_t) 1 << LBP_BA;
5729 /* break opportunity before */
5730 if (ch == 0x00B4 /* ACUTE ACCENT */
5731 || ch == 0x1FFD /* GREEK OXIA */
5732 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5733 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5734 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5735 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5736 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5737 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5738 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5739 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5740 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5741 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5742 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5743 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5744 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5745 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5746 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5747 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5748 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5749 attr |= (int64_t) 1 << LBP_BB;
5752 if (ch == 0x002D /* HYPHEN-MINUS */)
5753 attr |= (int64_t) 1 << LBP_HY;
5755 /* contingent break opportunity */
5756 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5757 attr |= (int64_t) 1 << LBP_CB;
5759 /* closing parenthesis */
5760 if (ch == 0x0029 /* RIGHT PARENTHESIS */
5761 || ch == 0x005D /* RIGHT SQUARE BRACKET */)
5762 attr |= (int64_t) 1 << LBP_CP;
5764 /* closing punctuation */
5765 if ((unicode_attributes[ch].category[0] == 'P'
5766 && unicode_attributes[ch].category[1] == 'e'
5767 && !(attr & ((int64_t) 1 << LBP_CP)))
5768 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5769 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5770 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5771 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5772 || ch == 0xFE50 /* SMALL COMMA */
5773 || ch == 0xFE52 /* SMALL FULL STOP */
5774 || ch == 0xFF0C /* FULLWIDTH COMMA */
5775 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5776 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5777 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
5778 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5779 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
5780 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
5781 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
5782 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
5783 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
5784 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
5785 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
5786 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */)
5787 attr |= (int64_t) 1 << LBP_CL;
5789 /* exclamation/interrogation */
5790 if (ch == 0x0021 /* EXCLAMATION MARK */
5791 || ch == 0x003F /* QUESTION MARK */
5792 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5793 || ch == 0x061B /* ARABIC SEMICOLON */
5794 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5795 || ch == 0x061F /* ARABIC QUESTION MARK */
5796 || ch == 0x06D4 /* ARABIC FULL STOP */
5797 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5798 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5799 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5800 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5801 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5802 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5803 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5804 || ch == 0x1802 /* MONGOLIAN COMMA */
5805 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5806 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5807 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5808 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5809 || ch == 0x1945 /* LIMBU QUESTION MARK */
5810 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5811 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5812 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5813 || ch == 0x2CFE /* COPTIC FULL STOP */
5814 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5815 || ch == 0xA60E /* VAI FULL STOP */
5816 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5817 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5818 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5819 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5820 || ch == 0xFE56 /* SMALL QUESTION MARK */
5821 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5822 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5823 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5824 attr |= (int64_t) 1 << LBP_EX;
5827 if (ch == 0x2024 /* ONE DOT LEADER */
5828 || ch == 0x2025 /* TWO DOT LEADER */
5829 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5830 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5831 attr |= (int64_t) 1 << LBP_IN;
5834 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5835 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5836 || ch == 0x203D /* INTERROBANG */
5837 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5838 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5839 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5840 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5841 || ch == 0x301C /* WAVE DASH */
5842 || ch == 0x303C /* MASU MARK */
5843 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5844 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5845 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5846 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5847 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5848 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5849 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5850 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5851 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5852 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5853 || ch == 0xA015 /* YI SYLLABLE WU */
5854 || ch == 0xFE54 /* SMALL SEMICOLON */
5855 || ch == 0xFE55 /* SMALL COLON */
5856 || ch == 0xFF1A /* FULLWIDTH COLON */
5857 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5858 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5859 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5860 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5861 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5862 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5863 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5864 attr |= (int64_t) 1 << LBP_NS;
5866 /* opening punctuation */
5867 if ((unicode_attributes[ch].category[0] == 'P'
5868 && unicode_attributes[ch].category[1] == 's')
5869 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5870 || ch == 0x00BF /* INVERTED QUESTION MARK */
5871 || ch == 0x2E18 /* INVERTED INTERROBANG */
5872 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5873 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
5874 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
5875 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
5876 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
5877 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
5878 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */)
5879 attr |= (int64_t) 1 << LBP_OP;
5881 /* ambiguous quotation */
5882 if ((unicode_attributes[ch].category[0] == 'P'
5883 && (unicode_attributes[ch].category[1] == 'f'
5884 || unicode_attributes[ch].category[1] == 'i'))
5885 || ch == 0x0022 /* QUOTATION MARK */
5886 || ch == 0x0027 /* APOSTROPHE */
5887 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5888 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5889 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5890 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5891 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5892 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5893 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5894 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5895 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5896 || ch == 0x2E0B /* RAISED SQUARE */)
5897 attr |= (int64_t) 1 << LBP_QU;
5899 /* infix separator (numeric) */
5900 if (ch == 0x002C /* COMMA */
5901 || ch == 0x002E /* FULL STOP */
5902 || ch == 0x003A /* COLON */
5903 || ch == 0x003B /* SEMICOLON */
5904 || ch == 0x037E /* GREEK QUESTION MARK */
5905 || ch == 0x0589 /* ARMENIAN FULL STOP */
5906 || ch == 0x060C /* ARABIC COMMA */
5907 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5908 || ch == 0x07F8 /* NKO COMMA */
5909 || ch == 0x2044 /* FRACTION SLASH */
5910 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5911 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5912 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5913 attr |= (int64_t) 1 << LBP_IS;
5916 if ((unicode_attributes[ch].category[0] == 'N'
5917 && unicode_attributes[ch].category[1] == 'd'
5918 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5919 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5920 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5921 attr |= (int64_t) 1 << LBP_NU;
5923 /* postfix (numeric) */
5924 if (ch == 0x0025 /* PERCENT SIGN */
5925 || ch == 0x00A2 /* CENT SIGN */
5926 || ch == 0x00B0 /* DEGREE SIGN */
5927 || ch == 0x060B /* AFGHANI SIGN */
5928 || ch == 0x066A /* ARABIC PERCENT SIGN */
5929 || ch == 0x2030 /* PER MILLE SIGN */
5930 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5931 || ch == 0x2032 /* PRIME */
5932 || ch == 0x2033 /* DOUBLE PRIME */
5933 || ch == 0x2034 /* TRIPLE PRIME */
5934 || ch == 0x2035 /* REVERSED PRIME */
5935 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5936 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5937 || ch == 0x20A7 /* PESETA SIGN */
5938 || ch == 0x2103 /* DEGREE CELSIUS */
5939 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5940 || ch == 0xFDFC /* RIAL SIGN */
5941 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5942 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5943 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5944 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5945 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5946 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5947 || ch == 0x09F2 /* BENGALI RUPEE MARK */
5948 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
5949 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
5950 || ch == 0x0D79 /* MALAYALAM DATE MARK */
5951 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
5952 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */)
5953 attr |= (int64_t) 1 << LBP_PO;
5955 /* prefix (numeric) */
5956 if ((unicode_attributes[ch].category[0] == 'S'
5957 && unicode_attributes[ch].category[1] == 'c')
5958 || ch == 0x002B /* PLUS SIGN */
5959 || ch == 0x005C /* REVERSE SOLIDUS */
5960 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5961 || ch == 0x2116 /* NUMERO SIGN */
5962 || ch == 0x2212 /* MINUS SIGN */
5963 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
5964 if (!(attr & ((int64_t) 1 << LBP_PO)))
5965 attr |= (int64_t) 1 << LBP_PR;
5967 /* symbols allowing breaks */
5968 if (ch == 0x002F /* SOLIDUS */)
5969 attr |= (int64_t) 1 << LBP_SY;
5971 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5972 attr |= (int64_t) 1 << LBP_H2;
5974 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5975 attr |= (int64_t) 1 << LBP_H3;
5977 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
5978 attr |= (int64_t) 1 << LBP_JL;
5980 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
5981 attr |= (int64_t) 1 << LBP_JV;
5983 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
5984 attr |= (int64_t) 1 << LBP_JT;
5986 /* complex context (South East Asian) */
5987 if (((unicode_attributes[ch].category[0] == 'C'
5988 && unicode_attributes[ch].category[1] == 'f')
5989 || (unicode_attributes[ch].category[0] == 'L'
5990 && (unicode_attributes[ch].category[1] == 'm'
5991 || unicode_attributes[ch].category[1] == 'o'))
5992 || (unicode_attributes[ch].category[0] == 'M'
5993 && (unicode_attributes[ch].category[1] == 'c'
5994 || unicode_attributes[ch].category[1] == 'n')
5995 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
5996 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5997 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5998 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5999 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
6000 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
6001 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
6002 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
6003 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
6004 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */)
6005 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
6006 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
6007 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
6008 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
6009 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
6010 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */))
6011 attr |= (int64_t) 1 << LBP_SA;
6013 /* attached characters and combining marks */
6014 if ((unicode_attributes[ch].category[0] == 'M'
6015 && (unicode_attributes[ch].category[1] == 'c'
6016 || unicode_attributes[ch].category[1] == 'e'
6017 || unicode_attributes[ch].category[1] == 'n'))
6018 || (unicode_attributes[ch].category[0] == 'C'
6019 && (unicode_attributes[ch].category[1] == 'c'
6020 || unicode_attributes[ch].category[1] == 'f')
6021 && ch != 0x110BD /* KAITHI NUMBER SIGN */))
6022 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
6023 attr |= (int64_t) 1 << LBP_CM;
6026 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
6027 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
6028 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
6029 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
6030 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
6031 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
6032 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
6033 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
6034 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
6035 || ch == 0xFE62 /* SMALL PLUS SIGN */
6036 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
6037 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
6038 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
6039 || ch == 0xFE66 /* SMALL EQUALS SIGN */
6040 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
6041 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
6042 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
6043 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
6044 || (ch >= 0x3000 && ch <= 0x33FF
6045 && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP))))
6046 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6047 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
6048 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
6049 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
6050 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
6051 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
6052 || ch == 0xFE45 /* SESAME DOT */
6053 || ch == 0xFE46 /* WHITE SESAME DOT */
6054 || ch == 0xFE49 /* DASHED OVERLINE */
6055 || ch == 0xFE4A /* CENTRELINE OVERLINE */
6056 || ch == 0xFE4B /* WAVY OVERLINE */
6057 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
6058 || ch == 0xFE4D /* DASHED LOW LINE */
6059 || ch == 0xFE4E /* CENTRELINE LOW LINE */
6060 || ch == 0xFE4F /* WAVY LOW LINE */
6061 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
6062 || ch == 0xFE58 /* SMALL EM DASH */
6063 || ch == 0xFE5F /* SMALL NUMBER SIGN */
6064 || ch == 0xFE60 /* SMALL AMPERSAND */
6065 || ch == 0xFE61 /* SMALL ASTERISK */
6066 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
6067 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
6068 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
6069 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
6070 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
6071 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
6072 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
6073 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
6074 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
6075 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
6076 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
6077 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
6078 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
6079 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
6080 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
6081 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
6082 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
6083 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
6084 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
6085 || ch == 0xFF5E /* FULLWIDTH TILDE */
6086 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
6087 || ch == 0xFFE3 /* FULLWIDTH MACRON */
6088 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
6089 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6090 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
6091 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
6092 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
6093 || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */
6094 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */)
6095 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
6097 /* ambiguous (ideograph) ? */
6098 if ((unicode_width[ch] != NULL
6099 && unicode_width[ch][0] == 'A'
6101 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6102 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
6103 attr |= (int64_t) 1 << LBP_AI;
6105 attr |= (int64_t) 1 << LBP_ID;
6108 /* ordinary alphabetic and symbol characters */
6109 if ((unicode_attributes[ch].category[0] == 'L'
6110 && (unicode_attributes[ch].category[1] == 'u'
6111 || unicode_attributes[ch].category[1] == 'l'
6112 || unicode_attributes[ch].category[1] == 't'
6113 || unicode_attributes[ch].category[1] == 'm'
6114 || unicode_attributes[ch].category[1] == 'o'))
6115 || (unicode_attributes[ch].category[0] == 'S'
6116 && (unicode_attributes[ch].category[1] == 'm'
6117 || unicode_attributes[ch].category[1] == 'k'
6118 || unicode_attributes[ch].category[1] == 'o'))
6119 || (unicode_attributes[ch].category[0] == 'N'
6120 && (unicode_attributes[ch].category[1] == 'l'
6121 || unicode_attributes[ch].category[1] == 'o'))
6122 || (unicode_attributes[ch].category[0] == 'P'
6123 && (unicode_attributes[ch].category[1] == 'c'
6124 || unicode_attributes[ch].category[1] == 'd'
6125 || unicode_attributes[ch].category[1] == 'o'))
6126 || ch == 0x0600 /* ARABIC NUMBER SIGN */
6127 || ch == 0x0601 /* ARABIC SIGN SANAH */
6128 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
6129 || ch == 0x0603 /* ARABIC SIGN SAFHA */
6130 || ch == 0x06DD /* ARABIC END OF AYAH */
6131 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
6132 || ch == 0x2061 /* FUNCTION APPLICATION */
6133 || ch == 0x2062 /* INVISIBLE TIMES */
6134 || ch == 0x2063 /* INVISIBLE SEPARATOR */
6135 || ch == 0x2064 /* INVISIBLE PLUS */
6136 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6137 || ch == 0x110BD /* KAITHI NUMBER SIGN */)
6138 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
6140 /* ambiguous (alphabetic) ? */
6141 if ((unicode_width[ch] != NULL
6142 && unicode_width[ch][0] == 'A'
6144 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
6145 && ch != 0x2022 /* BULLET */
6146 && ch != 0x203E /* OVERLINE */
6147 && ch != 0x2126 /* OHM SIGN */
6148 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
6149 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
6150 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
6151 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
6152 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
6153 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
6154 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
6155 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
6156 || ch == 0x00A7 /* SECTION SIGN */
6157 || ch == 0x00A8 /* DIAERESIS */
6158 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
6159 || ch == 0x00B2 /* SUPERSCRIPT TWO */
6160 || ch == 0x00B3 /* SUPERSCRIPT THREE */
6161 || ch == 0x00B6 /* PILCROW SIGN */
6162 || ch == 0x00B7 /* MIDDLE DOT */
6163 || ch == 0x00B8 /* CEDILLA */
6164 || ch == 0x00B9 /* SUPERSCRIPT ONE */
6165 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
6166 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
6167 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
6168 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
6169 || ch == 0x00D7 /* MULTIPLICATION SIGN */
6170 || ch == 0x00F7 /* DIVISION SIGN */
6171 || ch == 0x02C7 /* CARON */
6172 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
6173 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
6174 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
6175 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
6176 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
6177 || ch == 0x02D8 /* BREVE */
6178 || ch == 0x02D9 /* DOT ABOVE */
6179 || ch == 0x02DA /* RING ABOVE */
6180 || ch == 0x02DB /* OGONEK */
6181 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
6182 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
6183 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
6184 /* Extra characters for compatibility with Unicode LineBreak.txt. */
6185 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
6186 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
6187 || ch == 0x2616 /* WHITE SHOGI PIECE */
6188 || ch == 0x2617 /* BLACK SHOGI PIECE */)
6189 attr |= (int64_t) 1 << LBP_AI;
6191 attr |= (int64_t) 1 << LBP_AL;
6192 attr &= ~((int64_t) 1 << LBP_CM);
6197 /* Unassigned character. */
6198 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
6199 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
6200 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
6201 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
6202 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
6203 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6204 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
6205 Supplementary Ideographic Plane (Plane 2) outside of blocks */
6206 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
6207 attr |= (int64_t) 1 << LBP_ID;
6212 attr |= (int64_t) 1 << LBP_XX;
6217 /* Output the line breaking properties in a human readable format.
   For a code point i below 0x110000 whose get_lbp (i) mask is not just the
   lone LBP_XX bit, the code point is written to STREAM as "0x%04X",
   followed by the name of every LBP_* bit that is set in the mask. */
6219 debug_output_lbp (FILE *stream)
/* Visit the whole Unicode code space. */
6223 for (i = 0; i < 0x110000; i++)
6225 int64_t attr = get_lbp (i);
/* Code points whose mask is exactly the LBP_XX bit are filtered here. */
6226 if (attr != (int64_t) 1 << LBP_XX)
6228 fprintf (stream, "0x%04X", i);
/* PRINT_BIT appends " <bit name>" (the stringized macro argument) when
   that bit is set; several names appear if the mask has several bits. */
6229 #define PRINT_BIT(attr,bit) \
6230 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
6231 PRINT_BIT(attr,LBP_BK);
6232 PRINT_BIT(attr,LBP_CM);
6233 PRINT_BIT(attr,LBP_WJ);
6234 PRINT_BIT(attr,LBP_ZW);
6235 PRINT_BIT(attr,LBP_GL);
6236 PRINT_BIT(attr,LBP_SP);
6237 PRINT_BIT(attr,LBP_B2);
6238 PRINT_BIT(attr,LBP_BA);
6239 PRINT_BIT(attr,LBP_BB);
6240 PRINT_BIT(attr,LBP_HY);
6241 PRINT_BIT(attr,LBP_CB);
6242 PRINT_BIT(attr,LBP_CL);
6243 PRINT_BIT(attr,LBP_CP);
6244 PRINT_BIT(attr,LBP_EX);
6245 PRINT_BIT(attr,LBP_IN);
6246 PRINT_BIT(attr,LBP_NS);
6247 PRINT_BIT(attr,LBP_OP);
6248 PRINT_BIT(attr,LBP_QU);
6249 PRINT_BIT(attr,LBP_IS);
6250 PRINT_BIT(attr,LBP_NU);
6251 PRINT_BIT(attr,LBP_PO);
6252 PRINT_BIT(attr,LBP_PR);
6253 PRINT_BIT(attr,LBP_SY);
6254 PRINT_BIT(attr,LBP_AI);
6255 PRINT_BIT(attr,LBP_AL);
6256 PRINT_BIT(attr,LBP_H2);
6257 PRINT_BIT(attr,LBP_H3);
6258 PRINT_BIT(attr,LBP_ID);
6259 PRINT_BIT(attr,LBP_JL);
6260 PRINT_BIT(attr,LBP_JV);
6261 PRINT_BIT(attr,LBP_JT);
6262 PRINT_BIT(attr,LBP_SA);
6263 PRINT_BIT(attr,LBP_XX);
/* Terminate the line for this code point. */
6265 fprintf (stream, "\n");
/* Writes the debug_output_lbp () listing into the file named FILENAME,
   which is opened for writing.  Diagnostics go to stderr, both for a file
   that cannot be opened and for a write or close error. */
6271 debug_output_lbrk_tables (const char *filename)
6275 stream = fopen (filename, "w");
6278 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6282 debug_output_lbp (stream);
/* Because of the short-circuit ||, fclose () is only called when no error
   has been recorded on the stream so far. */
6284 if (ferror (stream) || fclose (stream))
6286 fprintf (stderr, "error writing to '%s'\n", filename);
6291 /* The line breaking property from the LineBreak.txt file.
   One entry per code point; each entry holds a single LBP_* value (not a
   bit mask). */
6292 int unicode_org_lbp[0x110000];
6294 /* Stores in unicode_org_lbp[] the line breaking property from the
6295 LineBreak.txt file.
   LINEBREAK_FILENAME is opened for reading.  Every entry is first set to
   LBP_XX; entries named by a data line are then overwritten with the value
   parsed from that line. */
6297 fill_org_lbp (const char *linebreak_filename)
6301 char field0[FIELDLEN];
6302 char field1[FIELDLEN];
6303 char field2[FIELDLEN];
/* Default for code points that no line of the file mentions. */
6306 for (i = 0; i < 0x110000; i++)
6307 unicode_org_lbp[i] = LBP_XX;
6309 stream = fopen (linebreak_filename, "r");
6312 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Discard the remaining characters of the current line. */
6328 do c = getc (stream); while (c != EOF && c != '\n');
/* field0: code point or range (up to ';'), field1: property name (up to
   ' '), field2: remainder of the line.  n sums the getfield () results. */
6332 n = getfield (stream, field0, ';');
6333 n += getfield (stream, field1, ' ');
6334 n += getfield (stream, field2, '\n');
6339 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* #bit + 4 is the stringized enumerator name advanced by four characters,
   i.e. without its "LBP_" prefix, so TRY(LBP_xx) matches the name "xx". */
6343 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Names handled explicitly rather than through TRY: LF, CR and NL are
   stored as LBP_BK, SG is stored as LBP_XX. */
6379 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
6380 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
6381 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
6382 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
6385 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
6386 field1, linebreak_filename, lineno);
/* The code point (or the start of the range) is hexadecimal.
   NOTE(review): no check that i (or j below) is < 0x110000 is visible in
   this excerpt before unicode_org_lbp[] is indexed -- confirm. */
6389 i = strtoul (field0, NULL, 16);
6390 if (strstr (field0, "..") != NULL)
6392 /* Deal with a range: the end point follows the "..". */
6393 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6395 unicode_org_lbp[i] = value;
6399 /* Single character line. */
6400 unicode_org_lbp[i] = value;
/* fclose () is skipped when a read error was already recorded. */
6403 if (ferror (stream) || fclose (stream))
6405 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
6410 /* Output the line breaking properties in a human readable format.
   This variant reads unicode_org_lbp[], i.e. the values taken from
   LineBreak.txt, and writes to STREAM the code point as "0x%04X" followed
   by the name of the stored LBP_* value. */
6412 debug_output_org_lbp (FILE *stream)
6416 for (i = 0; i < 0x110000; i++)
6418 int attr = unicode_org_lbp[i];
6421 fprintf (stream, "0x%04X", i);
/* Each table entry is one LBP_* value, not a bit mask, hence the equality
   test; at most one of the PRINT_BIT lines below prints a name. */
6422 #define PRINT_BIT(attr,bit) \
6423 if (attr == bit) fprintf (stream, " " #bit);
6424 PRINT_BIT(attr,LBP_BK);
6425 PRINT_BIT(attr,LBP_CM);
6426 PRINT_BIT(attr,LBP_WJ);
6427 PRINT_BIT(attr,LBP_ZW);
6428 PRINT_BIT(attr,LBP_GL);
6429 PRINT_BIT(attr,LBP_SP);
6430 PRINT_BIT(attr,LBP_B2);
6431 PRINT_BIT(attr,LBP_BA);
6432 PRINT_BIT(attr,LBP_BB);
6433 PRINT_BIT(attr,LBP_HY);
6434 PRINT_BIT(attr,LBP_CB);
6435 PRINT_BIT(attr,LBP_CL);
6436 PRINT_BIT(attr,LBP_CP);
6437 PRINT_BIT(attr,LBP_EX);
6438 PRINT_BIT(attr,LBP_IN);
6439 PRINT_BIT(attr,LBP_NS);
6440 PRINT_BIT(attr,LBP_OP);
6441 PRINT_BIT(attr,LBP_QU);
6442 PRINT_BIT(attr,LBP_IS);
6443 PRINT_BIT(attr,LBP_NU);
6444 PRINT_BIT(attr,LBP_PO);
6445 PRINT_BIT(attr,LBP_PR);
6446 PRINT_BIT(attr,LBP_SY);
6447 PRINT_BIT(attr,LBP_AI);
6448 PRINT_BIT(attr,LBP_AL);
6449 PRINT_BIT(attr,LBP_H2);
6450 PRINT_BIT(attr,LBP_H3);
6451 PRINT_BIT(attr,LBP_ID);
6452 PRINT_BIT(attr,LBP_JL);
6453 PRINT_BIT(attr,LBP_JV);
6454 PRINT_BIT(attr,LBP_JT);
6455 PRINT_BIT(attr,LBP_SA);
6456 PRINT_BIT(attr,LBP_XX);
/* Terminate the line for this code point. */
6458 fprintf (stream, "\n");
/* Writes the debug_output_org_lbp () listing into the file named FILENAME,
   which is opened for writing.  Diagnostics go to stderr, both for a file
   that cannot be opened and for a write or close error. */
6464 debug_output_org_lbrk_tables (const char *filename)
6468 stream = fopen (filename, "w");
6471 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6475 debug_output_org_lbp (stream);
/* Because of the short-circuit ||, fclose () is only called when no error
   has been recorded on the stream so far. */
6477 if (ferror (stream) || fclose (stream))
6479 fprintf (stderr, "error writing to '%s'\n", filename);
6484 /* Construction of sparse 3-level tables.
   The macros below parameterize the table code used by output_lbp ():
   the type is named lbp_table (with lbp_table_init, lbp_table_add and
   lbp_table_finalize), each element is one unsigned char, and LBP_XX is the
   default element value.  xmalloc/xrealloc are mapped to plain
   malloc/realloc.
   NOTE(review): presumably consumed by a template #include that is not
   visible in this excerpt -- confirm. */
6485 #define TABLE lbp_table
6486 #define ELEMENT unsigned char
6487 #define DEFAULT LBP_XX
6488 #define xmalloc malloc
6489 #define xrealloc realloc
/* Builds the 3-level line breaking property table from get_lbp () and
   writes it out as C source.
   STREAM1 receives the lbrkprop_header_N #defines, the lbrkprop_t typedef
   and the extern declaration of unilbrkprop.
   STREAM2 receives the initializer of unilbrkprop: the level1 and level2
   index arrays followed by the level3 array of LBP_* names. */
6493 output_lbp (FILE *stream1, FILE *stream2)
6497 unsigned int level1_offset, level2_offset, level3_offset;
6501 lbp_table_init (&t);
6503 for (i = 0; i < 0x110000; i++)
6505 int64_t attr = get_lbp (i);
6507 /* Now attr should contain exactly one bit.
   attr & (attr - 1) clears the lowest set bit, so a nonzero result means
   that more than one bit is set. */
6508 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Only values other than LBP_XX are added to the table. */
6511 if (attr != (int64_t) 1 << LBP_XX)
6513 unsigned int log2_attr;
/* Turn the one-bit mask back into its bit number. */
6514 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6516 lbp_table_add (&t, i, log2_attr);
6520 lbp_table_finalize (&t);
/* Byte offsets of the three levels inside t.result (presumably assigned to
   level1_offset, level2_offset and level3_offset; the left-hand sides are
   not visible in this excerpt): a header of 5 uint32_t words comes first,
   then level1_size words, then (level2_size << q) words. */
6523 5 * sizeof (uint32_t);
6525 5 * sizeof (uint32_t)
6526 + t.level1_size * sizeof (uint32_t);
6528 5 * sizeof (uint32_t)
6529 + t.level1_size * sizeof (uint32_t)
6530 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.
   NOTE(review): %d is used with a uint32_t argument here. */
6532 for (i = 0; i < 5; i++)
6533 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6534 ((uint32_t *) t.result)[i]);
6535 fprintf (stream1, "\n");
6536 fprintf (stream1, "typedef struct\n");
6537 fprintf (stream1, " {\n");
6538 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6539 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6540 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6541 fprintf (stream1, " }\n");
6542 fprintf (stream1, "lbrkprop_t;\n");
6543 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6545 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6546 fprintf (stream2, "{\n");
/* level1: eight entries per output row. */
6547 fprintf (stream2, " {");
6548 if (t.level1_size > 8)
6549 fprintf (stream2, "\n ");
6550 for (i = 0; i < t.level1_size; i++)
6553 if (i > 0 && (i % 8) == 0)
6554 fprintf (stream2, "\n ");
6555 offset = ((uint32_t *) (t.result + level1_offset))[i];
6557 fprintf (stream2, " %5d", -1);
/* Otherwise the byte offset is converted into an index into level2. */
6559 fprintf (stream2, " %5zu",
6560 (offset - level2_offset) / sizeof (uint32_t));
6561 if (i+1 < t.level1_size)
6562 fprintf (stream2, ",");
6564 if (t.level1_size > 8)
6565 fprintf (stream2, "\n ");
6566 fprintf (stream2, " },\n");
/* level2: eight entries per output row. */
6567 fprintf (stream2, " {");
6568 if (t.level2_size << t.q > 8)
6569 fprintf (stream2, "\n ");
6570 for (i = 0; i < t.level2_size << t.q; i++)
6573 if (i > 0 && (i % 8) == 0)
6574 fprintf (stream2, "\n ");
6575 offset = ((uint32_t *) (t.result + level2_offset))[i];
6577 fprintf (stream2, " %5d", -1);
/* Otherwise the byte offset is converted into an index into level3. */
6579 fprintf (stream2, " %5zu",
6580 (offset - level3_offset) / sizeof (unsigned char));
6581 if (i+1 < t.level2_size << t.q)
6582 fprintf (stream2, ",");
6584 if (t.level2_size << t.q > 8)
6585 fprintf (stream2, "\n ");
6586 fprintf (stream2, " },\n");
/* level3: the property values, printed by name, eight per output row. */
6587 fprintf (stream2, " {");
6588 if (t.level3_size << t.p > 8)
6589 fprintf (stream2, "\n ");
6590 for (i = 0; i < t.level3_size << t.p; i++)
6592 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6593 const char *value_string;
/* CASE maps an enumerator to its own spelling via stringification. */
6596 #define CASE(x) case x: value_string = #x; break;
6634 if (i > 0 && (i % 8) == 0)
6635 fprintf (stream2, "\n ");
6636 fprintf (stream2, " %s%s", value_string,
6637 (i+1 < t.level3_size << t.p ? "," : ""));
6639 if (t.level3_size << t.p > 8)
6640 fprintf (stream2, "\n ");
6641 fprintf (stream2, " }\n");
6642 fprintf (stream2, "};\n");
/* Writes the line breaking property tables for Unicode VERSION.
   FILENAME1 and FILENAME2 are both opened for writing and both receive the
   "generated automatically" banner and the license header.  output_lbp ()
   then writes to them as its stream1 and stream2 arguments, in that order.
   Diagnostics go to stderr, both for a file that cannot be opened and for
   a write or close error.
   The banner names the generator as gen-uni-tables, matching the program
   name in the file header and the banner of output_wbrk_tables (). */
6646 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6648 const char *filenames[2];
6652 filenames[0] = filename1;
6653 filenames[1] = filename2;
/* Open both output files before anything is written. */
6655 for (i = 0; i < 2; i++)
6657 streams[i] = fopen (filenames[i], "w");
6658 if (streams[i] == NULL)
6660 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical banner and license header in both files. */
6665 for (i = 0; i < 2; i++)
6667 FILE *stream = streams[i];
6669 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6670 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6671 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6673 fprintf (stream, "\n");
6675 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6676 still carries the GPL header), and it's gnulib-tool which replaces the
6677 GPL header with an LGPL header. */
6678 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6679 fprintf (stream, "\n");
6680 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6681 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6682 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6683 fprintf (stream, " (at your option) any later version.\n");
6684 fprintf (stream, "\n");
6685 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6686 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6687 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6688 fprintf (stream, " GNU General Public License for more details.\n");
6689 fprintf (stream, "\n");
6690 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6691 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6692 fprintf (stream, "\n");
/* streams[0] gets the declarations, streams[1] the table initializer. */
6695 output_lbp (streams[0], streams[1]);
6697 for (i = 0; i < 2; i++)
/* fclose () is skipped for a stream that already recorded an error. */
6699 if (ferror (streams[i]) || fclose (streams[i]))
6701 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6707 /* ========================================================================= */
6709 /* Word break property.
6710 Updated for Unicode TR #29 revision 17. */
6712 /* Possible values of the Word_Break property. */
6727 WBP_EXTENDNUMLET = 7
6730 /* Returns the word breaking property for ch, as a bit mask.
   Bit number WBP_xxx is set when CH qualifies for that Word_Break value;
   the tests below are independent of each other, so the mask is built by
   or-ing the individual bits. */
6732 get_wbp (unsigned int ch)
/* Test whether the character has a name entry in unicode_attributes[]. */
6736 if (unicode_attributes[ch].name != NULL)
6739 attr |= 1 << WBP_CR;
6742 attr |= 1 << WBP_LF;
/* Newline: U+000B, U+000C, U+2028, U+2029 (one further alternative of this
   condition is not visible in this excerpt). */
6744 if (ch == 0x000B || ch == 0x000C
6746 || ch == 0x2028 || ch == 0x2029)
6747 attr |= 1 << WBP_NEWLINE;
/* Extend: Grapheme_Extend property set, or general category Mc. */
6749 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6750 || (unicode_attributes[ch].category != NULL
6751 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6752 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200B, U+200C and U+200D. */
6754 if (unicode_attributes[ch].category != NULL
6755 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6756 && ch != 0x200B && ch != 0x200C && ch != 0x200D)
6757 attr |= 1 << WBP_FORMAT;
/* Katakana: script Katakana plus the explicitly listed code points. */
6759 if ((unicode_scripts[ch] < numscripts
6760 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6761 || (ch >= 0x3031 && ch <= 0x3035)
6762 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6764 attr |= 1 << WBP_KATAKANA;
/* ALetter: Alphabetic characters that are not Ideographic, not already
   Katakana, not in line breaking class SA, not of script Hiragana and not
   already Extend.  This test relies on the Katakana and Extend bits
   computed above. */
6766 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6768 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6769 && (attr & (1 << WBP_KATAKANA)) == 0
6770 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6771 && !(unicode_scripts[ch] < numscripts
6772 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6773 && (attr & (1 << WBP_EXTEND)) == 0)
6774 attr |= 1 << WBP_ALETTER;
6776 if (is_WBP_MIDNUMLET (ch))
6777 attr |= 1 << WBP_MIDNUMLET;
6779 if (is_WBP_MIDLETTER (ch))
6780 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line breaking class IS or one of the listed code points, with
   U+003A, U+FE13 and U+002E excluded. */
6782 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6783 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6785 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6786 attr |= 1 << WBP_MIDNUM;
/* Numeric: derived from line breaking class NU. */
6788 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6790 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc. */
6792 if (unicode_attributes[ch].category != NULL
6793 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6794 attr |= 1 << WBP_EXTENDNUMLET;
6799 attr |= 1 << WBP_OTHER;
6804 /* Output the word break property in a human readable format.
   For a code point i below 0x110000 whose get_wbp (i) mask is not just the
   lone WBP_OTHER bit, the code point is written to STREAM as "0x%04X",
   followed by the Word_Break name of every bit that is set in the mask. */
6806 debug_output_wbp (FILE *stream)
6810 for (i = 0; i < 0x110000; i++)
6812 int attr = get_wbp (i);
/* Code points whose mask is exactly the WBP_OTHER bit are filtered here. */
6813 if (attr != 1 << WBP_OTHER)
6815 fprintf (stream, "0x%04X", i);
/* The tests are independent, so several names can appear on one line. */
6816 if (attr & (1 << WBP_CR))
6817 fprintf (stream, " CR");
6818 if (attr & (1 << WBP_LF))
6819 fprintf (stream, " LF");
6820 if (attr & (1 << WBP_NEWLINE))
6821 fprintf (stream, " Newline");
6822 if (attr & (1 << WBP_EXTEND))
6823 fprintf (stream, " Extend");
6824 if (attr & (1 << WBP_FORMAT))
6825 fprintf (stream, " Format");
6826 if (attr & (1 << WBP_KATAKANA))
6827 fprintf (stream, " Katakana");
6828 if (attr & (1 << WBP_ALETTER))
6829 fprintf (stream, " ALetter");
6830 if (attr & (1 << WBP_MIDNUMLET))
6831 fprintf (stream, " MidNumLet");
6832 if (attr & (1 << WBP_MIDLETTER))
6833 fprintf (stream, " MidLetter");
6834 if (attr & (1 << WBP_MIDNUM))
6835 fprintf (stream, " MidNum");
6836 if (attr & (1 << WBP_NUMERIC))
6837 fprintf (stream, " Numeric");
6838 if (attr & (1 << WBP_EXTENDNUMLET))
6839 fprintf (stream, " ExtendNumLet");
6840 fprintf (stream, "\n");
/* Writes the debug_output_wbp () listing into the file named FILENAME,
   which is opened for writing.  Diagnostics go to stderr, both for a file
   that cannot be opened and for a write or close error. */
6846 debug_output_wbrk_tables (const char *filename)
6850 stream = fopen (filename, "w");
6853 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6857 debug_output_wbp (stream);
/* Because of the short-circuit ||, fclose () is only called when no error
   has been recorded on the stream so far. */
6859 if (ferror (stream) || fclose (stream))
6861 fprintf (stderr, "error writing to '%s'\n", filename);
6866 /* The word break property from the WordBreakProperty.txt file.
   One entry per code point; each entry holds a single WBP_* value (not a
   bit mask). */
6867 int unicode_org_wbp[0x110000];
6869 /* Stores in unicode_org_wbp[] the word break property from the
6870 WordBreakProperty.txt file.
   WORDBREAKPROPERTY_FILENAME is opened for reading.  Every entry is first
   set to WBP_OTHER; entries covered by a data line are then overwritten
   with the value named on that line. */
6872 fill_org_wbp (const char *wordbreakproperty_filename)
/* Default for code points that no line of the file mentions. */
6877 for (i = 0; i < 0x110000; i++)
6878 unicode_org_wbp[i] = WBP_OTHER;
6880 stream = fopen (wordbreakproperty_filename, "r");
6883 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6890 unsigned int i1, i2;
6891 char padding[200+1];
6892 char propname[200+1];
/* Read one line, at most 200 characters of it; the trailing "\n" in the
   format also consumes the whitespace that follows. */
6895 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Detect empty lines and '#' comment lines. */
6898 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form "XXXX..YYYY ; Name" first, then the single code point
   form "XXXX ; Name".
   NOTE(review): %[ ;] and %[^ ] carry no field width; they fit into the
   201-byte buffers only if buf holds at most 200 characters -- buf's
   declaration is not visible in this excerpt, confirm. */
6901 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6903 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6905 fprintf (stderr, "parse error in '%s'\n",
6906 wordbreakproperty_filename);
6911 #define PROP(name,value) \
6912 if (strcmp (propname, name) == 0) propvalue = value; else
/* Each PROP ends in a dangling "else", so the uses below form one
   if / else-if chain whose final else-part is the statement that follows
   the last PROP. */
6915 PROP ("Newline", WBP_NEWLINE)
6916 PROP ("Extend", WBP_EXTEND)
6917 PROP ("Format", WBP_FORMAT)
6918 PROP ("Katakana", WBP_KATAKANA)
6919 PROP ("ALetter", WBP_ALETTER)
6920 PROP ("MidNumLet", WBP_MIDNUMLET)
6921 PROP ("MidLetter", WBP_MIDLETTER)
6922 PROP ("MidNum", WBP_MIDNUM)
6923 PROP ("Numeric", WBP_NUMERIC)
6924 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6927 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6928 wordbreakproperty_filename);
/* Sanity check of the range: ordered and inside the code space. */
6931 if (!(i1 <= i2 && i2 < 0x110000))
6934 for (i = i1; i <= i2; i++)
6935 unicode_org_wbp[i] = propvalue;
/* fclose () is skipped when a read error was already recorded. */
6938 if (ferror (stream) || fclose (stream))
6940 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6945 /* Output the word break property in a human readable format.
   This variant reads unicode_org_wbp[], i.e. the values taken from
   WordBreakProperty.txt.  For every code point whose value is not
   WBP_OTHER, it writes to STREAM the code point as "0x%04X" followed by
   the name of the value. */
6947 debug_output_org_wbp (FILE *stream)
6951 for (i = 0; i < 0x110000; i++)
6953 int propvalue = unicode_org_wbp[i];
6954 if (propvalue != WBP_OTHER)
6956 fprintf (stream, "0x%04X", i);
6957 #define PROP(name,value) \
6958 if (propvalue == value) fprintf (stream, " " name); else
/* Each PROP ends in a dangling "else": the uses below form one
   if / else-if chain, and the " ??" output after the last PROP is its
   final else-part, reached for a value that none of them names. */
6961 PROP ("Newline", WBP_NEWLINE)
6962 PROP ("Extend", WBP_EXTEND)
6963 PROP ("Format", WBP_FORMAT)
6964 PROP ("Katakana", WBP_KATAKANA)
6965 PROP ("ALetter", WBP_ALETTER)
6966 PROP ("MidNumLet", WBP_MIDNUMLET)
6967 PROP ("MidLetter", WBP_MIDLETTER)
6968 PROP ("MidNum", WBP_MIDNUM)
6969 PROP ("Numeric", WBP_NUMERIC)
6970 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6972 fprintf (stream, " ??");
6973 fprintf (stream, "\n");
/* Writes the debug_output_org_wbp () listing into the file named FILENAME,
   which is opened for writing.  Diagnostics go to stderr, both for a file
   that cannot be opened and for a write or close error. */
6979 debug_output_org_wbrk_tables (const char *filename)
6983 stream = fopen (filename, "w");
6986 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6990 debug_output_org_wbp (stream);
/* Because of the short-circuit ||, fclose () is only called when no error
   has been recorded on the stream so far. */
6992 if (ferror (stream) || fclose (stream))
6994 fprintf (stderr, "error writing to '%s'\n", filename);
6999 /* Construction of sparse 3-level tables.
   The macros below parameterize the table code used by output_wbp ():
   the type is named wbp_table (with wbp_table_init, wbp_table_add and
   wbp_table_finalize), each element is one unsigned char, and WBP_OTHER is
   the default element value.  xmalloc/xrealloc are mapped to plain
   malloc/realloc.
   NOTE(review): presumably consumed by a template #include that is not
   visible in this excerpt -- confirm. */
7000 #define TABLE wbp_table
7001 #define ELEMENT unsigned char
7002 #define DEFAULT WBP_OTHER
7003 #define xmalloc malloc
7004 #define xrealloc realloc
/* Builds the 3-level word break property table from get_wbp () and writes
   it to STREAM as C source: the wbrkprop_header_N #defines, the wbrkprop_t
   typedef and the static initialized table uniwbrkprop (level1 and level2
   index arrays followed by the level3 array of WBP_* names). */
7008 output_wbp (FILE *stream)
7012 unsigned int level1_offset, level2_offset, level3_offset;
7016 wbp_table_init (&t);
7018 for (i = 0; i < 0x110000; i++)
7020 int attr = get_wbp (i);
7022 /* Now attr should contain exactly one bit.
   attr & (attr - 1) clears the lowest set bit, so a nonzero result means
   that more than one bit is set. */
7023 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Only values other than WBP_OTHER are added to the table. */
7026 if (attr != 1 << WBP_OTHER)
7028 unsigned int log2_attr;
/* Turn the one-bit mask back into its bit number. */
7029 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
7031 wbp_table_add (&t, i, log2_attr);
7035 wbp_table_finalize (&t);
/* Byte offsets of the three levels inside t.result (presumably assigned to
   level1_offset, level2_offset and level3_offset; the left-hand sides are
   not visible in this excerpt): a header of 5 uint32_t words comes first,
   then level1_size words, then (level2_size << q) words. */
7038 5 * sizeof (uint32_t);
7040 5 * sizeof (uint32_t)
7041 + t.level1_size * sizeof (uint32_t);
7043 5 * sizeof (uint32_t)
7044 + t.level1_size * sizeof (uint32_t)
7045 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.
   NOTE(review): %d is used with a uint32_t argument here. */
7047 for (i = 0; i < 5; i++)
7048 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
7049 ((uint32_t *) t.result)[i]);
7050 fprintf (stream, "\n");
7051 fprintf (stream, "typedef struct\n");
7052 fprintf (stream, " {\n");
7053 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7054 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
7055 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
7056 fprintf (stream, " }\n");
7057 fprintf (stream, "wbrkprop_t;\n");
7058 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
7059 fprintf (stream, "{\n");
/* level1: eight entries per output row. */
7060 fprintf (stream, " {");
7061 if (t.level1_size > 8)
7062 fprintf (stream, "\n ");
7063 for (i = 0; i < t.level1_size; i++)
7066 if (i > 0 && (i % 8) == 0)
7067 fprintf (stream, "\n ");
7068 offset = ((uint32_t *) (t.result + level1_offset))[i];
7070 fprintf (stream, " %5d", -1);
/* Otherwise the byte offset is converted into an index into level2. */
7072 fprintf (stream, " %5zu",
7073 (offset - level2_offset) / sizeof (uint32_t));
7074 if (i+1 < t.level1_size)
7075 fprintf (stream, ",");
7077 if (t.level1_size > 8)
7078 fprintf (stream, "\n ");
7079 fprintf (stream, " },\n");
/* level2: eight entries per output row. */
7080 fprintf (stream, " {");
7081 if (t.level2_size << t.q > 8)
7082 fprintf (stream, "\n ");
7083 for (i = 0; i < t.level2_size << t.q; i++)
7086 if (i > 0 && (i % 8) == 0)
7087 fprintf (stream, "\n ");
7088 offset = ((uint32_t *) (t.result + level2_offset))[i];
7090 fprintf (stream, " %5d", -1);
/* Otherwise the byte offset is converted into an index into level3. */
7092 fprintf (stream, " %5zu",
7093 (offset - level3_offset) / sizeof (unsigned char));
7094 if (i+1 < t.level2_size << t.q)
7095 fprintf (stream, ",");
7097 if (t.level2_size << t.q > 8)
7098 fprintf (stream, "\n ");
7099 fprintf (stream, " },\n");
/* level3: the property values, printed by name; only four per output row
   here, unlike the eight per row used for the index levels. */
7100 fprintf (stream, " {");
7101 if (t.level3_size << t.p > 4)
7102 fprintf (stream, "\n ");
7103 for (i = 0; i < t.level3_size << t.p; i++)
7105 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
7106 const char *value_string;
/* CASE maps an enumerator to its own spelling via stringification. */
7109 #define CASE(x) case x: value_string = #x; break;
7118 CASE(WBP_MIDNUMLET);
7119 CASE(WBP_MIDLETTER);
7122 CASE(WBP_EXTENDNUMLET);
7127 if (i > 0 && (i % 4) == 0)
7128 fprintf (stream, "\n ");
7129 fprintf (stream, " %s%s", value_string,
7130 (i+1 < t.level3_size << t.p ? "," : ""));
7132 if (t.level3_size << t.p > 4)
7133 fprintf (stream, "\n ");
7134 fprintf (stream, " }\n");
7135 fprintf (stream, "};\n");
/* Writes the word break property table for Unicode VERSION into the file
   named FILENAME: first the "generated automatically" banner and the
   license header, then the table produced by output_wbp ().
   Diagnostics go to stderr, both for a file that cannot be opened and for
   a write or close error.
   The banner describes the contents as word breaking properties; it used
   to say "Line breaking properties", copied from output_lbrk_tables (). */
7139 output_wbrk_tables (const char *filename, const char *version)
7143 stream = fopen (filename, "w");
7146 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7150 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7151 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
7152 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7154 fprintf (stream, "\n");
7156 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7157 still carries the GPL header), and it's gnulib-tool which replaces the
7158 GPL header with an LGPL header. */
7159 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
7160 fprintf (stream, "\n");
7161 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7162 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7163 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7164 fprintf (stream, " (at your option) any later version.\n");
7165 fprintf (stream, "\n");
7166 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7167 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7168 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7169 fprintf (stream, " GNU General Public License for more details.\n");
7170 fprintf (stream, "\n");
7171 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7172 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7173 fprintf (stream, "\n");
7175 output_wbp (stream);
/* fclose () is skipped when a write error was already recorded. */
7177 if (ferror (stream) || fclose (stream))
7179 fprintf (stderr, "error writing to '%s'\n", filename);
7184 /* ========================================================================= */
7186 /* Grapheme break property.
7187 Updated for Unicode TR #29 revision 17. */
7189 /* Possible values of the Grapheme_Cluster_Break property. */
7198 GBP_SPACINGMARK = 6,
7206 /* Construction of sparse 3-level tables. */
/* Parameters of the table construction: the table type is named gbp_table,
   each element is an unsigned char, and GBP_OTHER is the default element
   value.  xmalloc and xrealloc are mapped onto plain malloc and realloc. */
7207 #define TABLE gbp_table
7208 #define ELEMENT unsigned char
7209 #define DEFAULT GBP_OTHER
7210 #define xmalloc malloc
7211 #define xrealloc realloc
7214 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
/* One slot per code point in the range 0 .. 0x10FFFF. */
7215 int unicode_org_gbp[0x110000];
7217 /* Output the unit test data for the grapheme break property. */
/* FILENAME is the file to create.  After a generated-file banner and the
   license text, the values in unicode_org_gbp[] are written as
   "{ 0x<code>, <GBP_NAME> }" entries separated by ",\n".  Failure to open
   or to write the file is reported on stderr. */
7219 output_gbp_test (const char *filename)
7225 stream = fopen (filename, "w");
7228 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7232 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7233 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
7234 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
7235 fprintf (stream, "\n");
7236 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7237 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7238 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7239 fprintf (stream, " (at your option) any later version.\n");
7240 fprintf (stream, "\n");
7241 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7242 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7243 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7244 fprintf (stream, " GNU General Public License for more details.\n");
7245 fprintf (stream, "\n");
7246 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7247 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7248 fprintf (stream, "\n");
/* Walk all code points; the inner while condition looks for the following
   code point having the same value, so presumably one entry is written per
   run of equal values (the loop body is not visible here - confirm). */
7251 for (ch = 0; ch < 0x110000; ch++)
7253 int gbp = unicode_org_gbp[ch];
7254 const char *gbp_string;
7256 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
/* CASE turns an enumerator into its own spelling via the # operator. */
7261 #define CASE(x) case x: gbp_string = #x; break;
7268 CASE (GBP_SPACINGMARK)
7280 fprintf (stream, ",\n");
/* The code point written is ch + 1, i.e. one past the current position. */
7281 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
7285 fprintf (stream, "\n");
7287 if (ferror (stream) || fclose (stream))
7289 fprintf (stderr, "error writing to '%s'\n", filename);
7294 /* Output the per-character grapheme break property table. */
/* FILENAME is the file to create.  The values of unicode_org_gbp[] for all
   code points are put into a gbp_table, which is then written as a C
   structure named unigbrkprop with three arrays (level1, level2, level3),
   preceded by five gbrkprop_header_N constants.  VERSION is presumably the
   Unicode version substituted into the banner (the argument line of that
   fprintf is not visible here).  Failure to open or to write the file is
   reported on stderr. */
7296 output_gbp_table (const char *filename, const char *version)
7301 unsigned int level1_offset, level2_offset, level3_offset;
7303 stream = fopen (filename, "w");
7306 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7310 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7311 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
7312 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7317 gbp_table_init (&t);
7319 for (ch = 0; ch < 0x110000; ch++)
7320 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
7322 gbp_table_finalize (&t);
7324 /* Offsets in t.result, in memory of this process. */
7326 5 * sizeof (uint32_t);
7328 5 * sizeof (uint32_t)
7329 + t.level1_size * sizeof (uint32_t);
7331 5 * sizeof (uint32_t)
7332 + t.level1_size * sizeof (uint32_t)
7333 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result are exported as #define
   constants.  NOTE(review): they are printed with %d although the argument
   is a uint32_t - confirm the values always fit in an int. */
7335 for (i = 0; i < 5; i++)
7336 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
7337 ((uint32_t *) t.result)[i]);
/* Declaration of the emitted structure.  level3 is declared with half as
   many bytes as there are level3 entries. */
7338 fprintf (stream, "static const\n");
7339 fprintf (stream, "struct\n");
7340 fprintf (stream, " {\n");
7341 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7342 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7343 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
7344 t.level3_size, t.p);
7345 fprintf (stream, " }\n");
7346 fprintf (stream, "unigbrkprop =\n");
7347 fprintf (stream, "{\n");
7348 fprintf (stream, " {");
7349 if (t.level1_size > 8)
7350 fprintf (stream, "\n ");
7351 for (i = 0; i < t.level1_size; i++)
7354 if (i > 0 && (i % 8) == 0)
7355 fprintf (stream, "\n ");
7356 offset = ((uint32_t *) (t.result + level1_offset))[i];
7358 fprintf (stream, " %5d", -1);
7360 fprintf (stream, " %5zu",
7361 (offset - level2_offset) / sizeof (uint32_t));
7362 if (i+1 < t.level1_size)
7363 fprintf (stream, ",");
7365 if (t.level1_size > 8)
7366 fprintf (stream, "\n ");
7367 fprintf (stream, " },\n");
7368 fprintf (stream, " {");
7369 if (t.level2_size << t.q > 8)
7370 fprintf (stream, "\n ");
7371 for (i = 0; i < t.level2_size << t.q; i++)
7374 if (i > 0 && (i % 8) == 0)
7375 fprintf (stream, "\n ");
7376 offset = ((uint32_t *) (t.result + level2_offset))[i];
7378 fprintf (stream, " %5d", -1);
7380 fprintf (stream, " %5zu",
7381 (offset - level3_offset) / sizeof (uint8_t) / 2);
7382 if (i+1 < t.level2_size << t.q)
7383 fprintf (stream, ",");
7385 if (t.level2_size << t.q > 8)
7386 fprintf (stream, "\n ");
7387 fprintf (stream, " },\n");
7388 fprintf (stream, " {");
7389 if (t.level3_size << t.p > 8)
7390 fprintf (stream, "\n ");
7391 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
/* Two consecutive level3 entries share one output byte: the even-indexed
   entry goes into the low nibble, the odd-indexed one into the high
   nibble. */
7393 unsigned char *p = (unsigned char *) (t.result + level3_offset);
7394 unsigned char value0 = p[i * 2];
7395 unsigned char value1 = p[i * 2 + 1];
7396 if (i > 0 && (i % 8) == 0)
7397 fprintf (stream, "\n ");
7398 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
7399 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
7401 if (t.level3_size << t.p > 8)
7402 fprintf (stream, "\n ");
7403 fprintf (stream, " }\n");
7404 fprintf (stream, "};\n");
7406 if (ferror (stream) || fclose (stream))
7408 fprintf (stderr, "error writing to '%s'\n", filename);
7413 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
7414 GraphemeBreakProperty.txt file. */
/* GRAPHEMEBREAKPROPERTY_FILENAME is the path of that file.  Lines are read
   into a buffer of at most 200 characters; empty lines and lines starting
   with '#' are tested for separately from data lines.  Open, parse and read
   failures are reported on stderr. */
7416 fill_org_gbp (const char *graphemebreakproperty_filename)
/* Every code point starts out as GBP_OTHER. */
7422 for (i = 0; i < 0x110000; i++)
7423 unicode_org_gbp[i] = GBP_OTHER;
7425 stream = fopen (graphemebreakproperty_filename, "r");
7428 fprintf (stderr, "error during fopen of '%s'\n",
7429 graphemebreakproperty_filename);
7436 unsigned int i1, i2;
7437 char padding[200+1];
7438 char propname[200+1];
7442 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7445 if (buf[0] == '\0' || buf[0] == '#')
/* A data line is first tried as a range (two hex numbers joined by ..),
   then as a single hex code point; in both forms a run of blanks and
   semicolons separates the code point(s) from the property name. */
7448 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7450 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7452 fprintf (stderr, "parse error in '%s'\n",
7453 graphemebreakproperty_filename);
7458 #define PROP(name,value) \
7459 if (strcmp (propname, name) == 0) propvalue = value; else
7462 PROP ("Control", GBP_CONTROL)
7463 PROP ("Extend", GBP_EXTEND)
7464 PROP ("Prepend", GBP_PREPEND)
7465 PROP ("SpacingMark", GBP_SPACINGMARK)
7470 PROP ("LVT", GBP_LVT)
7473 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
7474 graphemebreakproperty_filename, lineno);
/* Guard against inverted ranges and code points beyond 0x10FFFF before
   indexing unicode_org_gbp[]. */
7477 if (!(i1 <= i2 && i2 < 0x110000))
7480 for (i = i1; i <= i2; i++)
7481 unicode_org_gbp[i] = propvalue;
7483 if (ferror (stream) || fclose (stream))
7485 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
7490 /* ========================================================================= */
7492 /* Composition and decomposition.
7493 Updated for Unicode TR #15 revision 33.
The UC_DECOMP_* constants listed below name the decomposition types; each
one except UC_DECOMP_CANONICAL corresponds to an angle-bracket tag that is
quoted in the comment next to it.
NOTE(review): the comment that introduces MAX_DECOMP_LENGTH further down has
no visible closing line in this listing - confirm that it is terminated
before the define. */
7495 /* Maximum number of characters into which a single Unicode character can be
7497 #define MAX_DECOMP_LENGTH 18
7501 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
7502 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
7503 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
7504 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
7505 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
7506 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
7507 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
7508 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
7509 UC_DECOMP_SUPER, /* <super> A superscript form. */
7510 UC_DECOMP_SUB, /* <sub> A subscript form. */
7511 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
7512 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
7513 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
7514 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
7515 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
7516 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
7517 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
7520 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
7521 decompositions). Return the type, or -1 for none. */
/* CH is the code point; its decomposition field is taken from
   unicode_attributes[CH].  A leading angle-bracket tag selects one of the
   non-canonical UC_DECOMP_* types; without a tag the type is
   UC_DECOMP_CANONICAL.  The hexadecimal code points that follow are stored
   into DECOMPOSED, at most MAX_DECOMP_LENGTH of them.  LENGTHP presumably
   receives the number of code points stored (the assignment is not visible
   here - confirm). */
7523 get_decomposition (unsigned int ch,
7524 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
7526 const char *decomposition = unicode_attributes[ch].decomposition;
7528 if (decomposition != NULL && decomposition[0] != '\0')
7530 int type = UC_DECOMP_CANONICAL;
7531 unsigned int length;
7534 if (decomposition[0] == '<')
7539 rangle = strchr (decomposition + 1, '>');
7542 typelen = rangle + 1 - decomposition;
7543 #define TYPE(t1,t2) \
7544 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
7547 TYPE ("<font>", UC_DECOMP_FONT)
7548 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
7549 TYPE ("<initial>", UC_DECOMP_INITIAL)
7550 TYPE ("<medial>", UC_DECOMP_MEDIAL)
7551 TYPE ("<final>", UC_DECOMP_FINAL)
7552 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
7553 TYPE ("<circle>", UC_DECOMP_CIRCLE)
7554 TYPE ("<super>", UC_DECOMP_SUPER)
7555 TYPE ("<sub>", UC_DECOMP_SUB)
7556 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
7557 TYPE ("<wide>", UC_DECOMP_WIDE)
7558 TYPE ("<narrow>", UC_DECOMP_NARROW)
7559 TYPE ("<small>", UC_DECOMP_SMALL)
7560 TYPE ("<square>", UC_DECOMP_SQUARE)
7561 TYPE ("<fraction>", UC_DECOMP_FRACTION)
7562 TYPE ("<compat>", UC_DECOMP_COMPAT)
/* Print only the tag itself: the precision limits the output to the first
   typelen characters of the field.  (A bare field width would pad instead
   and print the whole field.) */
7564 fprintf (stderr, "unknown decomposition type %.*s\n", (int)typelen, decomposition);
7568 decomposition = rangle + 1;
7569 if (decomposition[0] == ' ')
/* Parse the blank-separated hexadecimal code points. */
7572 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
7574 decomposed[length] = strtoul (decomposition, &endptr, 16);
7575 if (endptr == decomposition)
7577 decomposition = endptr;
7578 if (decomposition[0] == ' ')
7581 if (*decomposition != '\0')
7582 /* MAX_DECOMP_LENGTH is too small. */
7592 /* Construction of sparse 3-level tables. */
/* Parameters of the table construction: the table type is named
   decomp_table, each element is a uint16_t, and the default element value
   is (uint16_t)(-1), i.e. all bits set.  xmalloc and xrealloc are mapped
   onto plain malloc and realloc. */
7593 #define TABLE decomp_table
7594 #define ELEMENT uint16_t
7595 #define DEFAULT (uint16_t)(-1)
7596 #define xmalloc malloc
7597 #define xrealloc realloc
/* Write the decomposition tables.  STREAM1 receives the declarations (the
   extern declaration of gl_uninorm_decomp_chars_table, the decomp_header_N
   constants, the decomp_index_table_t typedef and the extern declaration of
   gl_uninorm_decomp_index_table); STREAM2 receives the definitions of both
   tables.  The decompositions come from get_decomposition, called for every
   code point. */
7601 output_decomposition (FILE *stream1, FILE *stream2)
7603 struct decomp_table t;
7604 unsigned int level1_offset, level2_offset, level3_offset;
7605 unsigned int offset;
7611 decomp_table_init (&t);
7613 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
7614 fprintf (stream1, "\n");
7615 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
7618 for (ch = 0; ch < 0x110000; ch++)
7620 unsigned int length;
7621 unsigned int decomposed[MAX_DECOMP_LENGTH];
7622 int type = get_decomposition (ch, &length, decomposed);
/* An index entry keeps the offset in its low 15 bits; bit 15 is set for
   every decomposition type other than UC_DECOMP_CANONICAL. */
7626 if (!(offset < (1 << 15)))
7628 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
7630 /* Produce length 3-bytes entries. */
7632 /* We would need a special representation of zero-length entries. */
7634 for (i = 0; i < length; i++)
7637 fprintf (stream2, ",");
7638 if ((offset % 4) == 0)
7639 fprintf (stream2, "\n ");
/* Each code point must fit in 18 bits, because the first of the three
   output bytes also carries a continuation flag (bit 23, set when further
   code points follow) and, for the first code point only, the type shifted
   left by 18. */
7640 if (!(decomposed[i] < (1 << 18)))
7642 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
7643 (((i+1 < length ? (1 << 23) : 0)
7644 | (i == 0 ? (type << 18) : 0)
7645 | decomposed[i]) >> 16) & 0xff,
7646 (decomposed[i] >> 8) & 0xff,
7647 decomposed[i] & 0xff);
7653 fprintf (stream2, "\n};\n");
7654 fprintf (stream2, "\n");
7656 decomp_table_finalize (&t);
7659 5 * sizeof (uint32_t);
7661 5 * sizeof (uint32_t)
7662 + t.level1_size * sizeof (uint32_t);
7664 5 * sizeof (uint32_t)
7665 + t.level1_size * sizeof (uint32_t)
7666 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result are exported as #define
   constants. */
7668 for (i = 0; i < 5; i++)
7669 fprintf (stream1, "#define decomp_header_%d %d\n", i,
7670 ((uint32_t *) t.result)[i]);
7671 fprintf (stream1, "\n");
7672 fprintf (stream1, "typedef struct\n");
7673 fprintf (stream1, " {\n");
7674 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7675 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7676 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
7677 fprintf (stream1, " }\n");
7678 fprintf (stream1, "decomp_index_table_t;\n");
7679 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
7680 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
7681 fprintf (stream2, "{\n");
7682 fprintf (stream2, " {");
7683 if (t.level1_size > 8)
7684 fprintf (stream2, "\n ");
7685 for (i = 0; i < t.level1_size; i++)
7688 if (i > 0 && (i % 8) == 0)
7689 fprintf (stream2, "\n ");
7690 offset = ((uint32_t *) (t.result + level1_offset))[i];
7692 fprintf (stream2, " %5d", -1);
7694 fprintf (stream2, " %5zu",
7695 (offset - level2_offset) / sizeof (uint32_t));
7696 if (i+1 < t.level1_size)
7697 fprintf (stream2, ",");
7699 if (t.level1_size > 8)
7700 fprintf (stream2, "\n ");
7701 fprintf (stream2, " },\n");
7702 fprintf (stream2, " {");
7703 if (t.level2_size << t.q > 8)
7704 fprintf (stream2, "\n ");
7705 for (i = 0; i < t.level2_size << t.q; i++)
7708 if (i > 0 && (i % 8) == 0)
7709 fprintf (stream2, "\n ");
7710 offset = ((uint32_t *) (t.result + level2_offset))[i];
7712 fprintf (stream2, " %5d", -1);
7714 fprintf (stream2, " %5zu",
7715 (offset - level3_offset) / sizeof (uint16_t));
7716 if (i+1 < t.level2_size << t.q)
7717 fprintf (stream2, ",");
7719 if (t.level2_size << t.q > 8)
7720 fprintf (stream2, "\n ");
7721 fprintf (stream2, " },\n");
7722 fprintf (stream2, " {");
7723 if (t.level3_size << t.p > 8)
7724 fprintf (stream2, "\n ");
7725 for (i = 0; i < t.level3_size << t.p; i++)
/* The default element (all bits set) is written as -1. */
7727 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
7728 if (i > 0 && (i % 8) == 0)
7729 fprintf (stream2, "\n ");
7730 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
7731 if (i+1 < t.level3_size << t.p)
7732 fprintf (stream2, ",");
7734 if (t.level3_size << t.p > 8)
7735 fprintf (stream2, "\n ");
7736 fprintf (stream2, " }\n");
7737 fprintf (stream2, "};\n");
/* Create FILENAME1 and FILENAME2, put the generated-file banner on both,
   let output_decomposition fill them (the first file as its stream1, the
   second as its stream2), then check both for write errors.  VERSION is
   presumably the Unicode version substituted into the banner (the argument
   line of that fprintf is not visible here).  Failures are reported on
   stderr. */
7741 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
7743 const char *filenames[2];
7747 filenames[0] = filename1;
7748 filenames[1] = filename2;
7750 for (i = 0; i < 2; i++)
7752 streams[i] = fopen (filenames[i], "w");
7753 if (streams[i] == NULL)
7755 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
7760 for (i = 0; i < 2; i++)
7762 FILE *stream = streams[i];
7764 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7765 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
7766 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7768 fprintf (stream, "\n");
7771 output_decomposition (streams[0], streams[1]);
7773 for (i = 0; i < 2; i++)
7775 if (ferror (streams[i]) || fclose (streams[i]))
7777 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7783 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
/* One flag per code point in the range 0 .. 0x10FFFF; 1 means excluded. */
7784 char unicode_composition_exclusions[0x110000];
/* Fill unicode_composition_exclusions[] from COMPOSITIONEXCLUSIONS_FILENAME.
   All flags are cleared first; then each data line is scanned for a leading
   hexadecimal code point, whose flag is set to 1.  Empty lines and lines
   starting with '#' are tested for separately from data lines.  Open, parse
   and read failures are reported on stderr. */
7787 fill_composition_exclusions (const char *compositionexclusions_filename)
7792 stream = fopen (compositionexclusions_filename, "r");
7795 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
7799 for (i = 0; i < 0x110000; i++)
7800 unicode_composition_exclusions[i] = 0;
7807 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7810 if (buf[0] == '\0' || buf[0] == '#')
7813 if (sscanf (buf, "%X", &i) != 1)
7815 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
/* Guard against code points beyond 0x10FFFF before indexing the array. */
7818 if (!(i < 0x110000))
7821 unicode_composition_exclusions[i] = 1;
7824 if (ferror (stream) || fclose (stream))
7826 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
/* Write a tab-separated listing of canonical compositions to FILENAME.
   Every code point is run through get_decomposition; canonical
   decompositions whose first part has combining class "0" and whose
   composed character is not flagged in unicode_composition_exclusions[]
   are candidates.  Each output line has three hexadecimal fields followed
   by a string field; the last visible argument is the combining class of
   the second part.  Failure to open or to write the file is reported on
   stderr. */
7832 debug_output_composition_tables (const char *filename)
7837 stream = fopen (filename, "w");
7840 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7844 for (ch = 0; ch < 0x110000; ch++)
7846 unsigned int length;
7847 unsigned int decomposed[MAX_DECOMP_LENGTH];
7848 int type = get_decomposition (ch, &length, decomposed);
7850 if (type == UC_DECOMP_CANONICAL
7851 /* Consider only binary decompositions.
7852 Exclude singleton decompositions. */
7855 unsigned int code1 = decomposed[0];
7856 unsigned int code2 = decomposed[1];
7857 unsigned int combined = ch;
7859 /* Exclude decompositions where the first part is not a starter,
7860 i.e. is not of canonical combining class 0. */
7861 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7862 /* Exclude characters listed in CompositionExclusions.txt. */
7863 && !unicode_composition_exclusions[combined])
7865 /* The combined character must now also be a starter.
7867 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7870 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7874 unicode_attributes[code2].combining);
7879 if (ferror (stream) || fclose (stream))
7881 fprintf (stderr, "error writing to '%s'\n", filename);
/* Write the canonical composition rules to FILENAME as input for gperf.
   The file starts with a generated-file banner and a GPL notice, then the
   gperf declarations, then one keyword line per composition: a six-byte
   key (three bytes of code1, three bytes of code2, most significant byte
   first) followed by a hexadecimal value.  VERSION is presumably the
   Unicode version substituted into the banner (the argument line of that
   fprintf is not visible here).  Failure to open or to write the file is
   reported on stderr. */
7887 output_composition_tables (const char *filename, const char *version)
7892 stream = fopen (filename, "w");
7895 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7899 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7900 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7901 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7903 fprintf (stream, "\n");
7905 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7906 still carries the GPL header), and it's gnulib-tool which replaces the
7907 GPL header with an LGPL header. */
7908 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7909 fprintf (stream, "\n");
7910 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7911 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7912 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7913 fprintf (stream, " (at your option) any later version.\n");
7914 fprintf (stream, "\n");
7915 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7916 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7917 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7918 fprintf (stream, " GNU General Public License for more details.\n");
7919 fprintf (stream, "\n");
7920 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7921 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7922 fprintf (stream, "\n");
7924 /* The composition table is a set of mappings (code1, code2) -> combined,
7926 367 values for code1 (from 0x003C to 0x30FD),
7927 54 values for code2 (from 0x0300 to 0x309A).
7928 For a fixed code1, there are from 1 to 19 possible values for code2.
7929 For a fixed code2, there are from 1 to 117 possible values for code1.
7930 This is a very sparse matrix.
7932 We want an O(1) hash lookup.
7934 We could implement the hash lookup by mapping (code1, code2) to a linear
7935 combination mul1*code1 + mul2*code2, which is then used as an index into
7936 a 3-level table. But this leads to a table of size 37 KB.
7938 We use gperf to implement the hash lookup, giving it the 928 sets of
7939 6 bytes (code1, code2) as input. gperf generates a hash table of size
7940 1527, which is quite good (60% filled). It requires an auxiliary table
7941 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
/* The gperf declarations: the key is the six-byte codes member of
   struct composition_rule. */
7943 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
7944 fprintf (stream, "%%struct-type\n");
7945 fprintf (stream, "%%language=ANSI-C\n");
7946 fprintf (stream, "%%define slot-name codes\n");
7947 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7948 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7949 fprintf (stream, "%%compare-lengths\n");
7950 fprintf (stream, "%%compare-strncmp\n");
7951 fprintf (stream, "%%readonly-tables\n");
7952 fprintf (stream, "%%omit-struct-type\n");
7953 fprintf (stream, "%%%%\n");
7955 for (ch = 0; ch < 0x110000; ch++)
7957 unsigned int length;
7958 unsigned int decomposed[MAX_DECOMP_LENGTH];
7959 int type = get_decomposition (ch, &length, decomposed);
7961 if (type == UC_DECOMP_CANONICAL
7962 /* Consider only binary decompositions.
7963 Exclude singleton decompositions. */
7966 unsigned int code1 = decomposed[0];
7967 unsigned int code2 = decomposed[1];
7968 unsigned int combined = ch;
7970 /* Exclude decompositions where the first part is not a starter,
7971 i.e. is not of canonical combining class 0. */
7972 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7973 /* Exclude characters listed in CompositionExclusions.txt. */
7974 && !unicode_composition_exclusions[combined])
7976 /* The combined character must now also be a starter.
7978 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7981 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7982 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
7983 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
7989 if (ferror (stream) || fclose (stream))
7991 fprintf (stderr, "error writing to '%s'\n", filename);
7996 /* ========================================================================= */
7998 /* Output the test for a simple character mapping table to the given file. */
/* FILENAME is the file to create.  FUNC is the mapping under test; it is
   called for every code point and its results are written as
   "{ 0x<code>, 0x<value> }" entries.  FUNCTION_NAME is the name that the
   generated MAP(c) macro expands to.  The entries sit between includes of
   test-mapping-part1.h and test-mapping-part2.h.  VERSION is presumably the
   Unicode version substituted into the banner (the argument line of that
   fprintf is not visible here).  Failure to open or to write the file is
   reported on stderr.
   NOTE(review): the banner names gen-case.c as the generator, whereas other
   banners in this file name gen-uni-tables - confirm which is intended. */
8001 output_simple_mapping_test (const char *filename,
8002 const char *function_name,
8003 unsigned int (*func) (unsigned int),
8004 const char *version)
8010 stream = fopen (filename, "w");
8013 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8017 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8018 fprintf (stream, "/* Test the Unicode character mapping functions.\n");
8019 fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
8020 fprintf (stream, "\n");
8021 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
8022 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
8023 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
8024 fprintf (stream, " (at your option) any later version.\n");
8025 fprintf (stream, "\n");
8026 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
8027 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
8028 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
8029 fprintf (stream, " GNU General Public License for more details.\n");
8030 fprintf (stream, "\n");
8031 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
8032 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
8033 fprintf (stream, "\n");
8034 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
8036 fprintf (stream, "\n");
8037 fprintf (stream, "#include \"test-mapping-part1.h\"\n");
8038 fprintf (stream, "\n");
8041 for (ch = 0; ch < 0x110000; ch++)
8043 unsigned int value = func (ch);
8048 fprintf (stream, ",\n");
8049 fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
8054 fprintf (stream, "\n");
8056 fprintf (stream, "\n");
8057 fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
8058 fprintf (stream, "#include \"test-mapping-part2.h\"\n");
8060 if (ferror (stream) || fclose (stream))
8062 fprintf (stderr, "error writing to '%s'\n", filename);
8067 /* Construction of sparse 3-level tables. */
/* Parameters of the table construction: the table type is named
   mapping_table and each element is an int32_t.  xmalloc and xrealloc are
   mapped onto plain malloc and realloc. */
8068 #define TABLE mapping_table
8069 #define ELEMENT int32_t
8071 #define xmalloc malloc
8072 #define xrealloc realloc
8075 /* Output a simple character mapping table to the given file. */
/* FILENAME is the file to create.  FUNC is the mapping; it is called for
   every code point and the signed difference between its result and the
   code point is put into a mapping_table.  The table is written as a C
   structure named u_mapping with three arrays (level1, level2, level3),
   preceded by five mapping_header_N constants.  VERSION is presumably the
   Unicode version substituted into the banner (the argument line of that
   fprintf is not visible here).  Failure to open or to write the file is
   reported on stderr.
   NOTE(review): the banner names gen-case.c as the generator, whereas other
   banners in this file name gen-uni-tables - confirm which is intended. */
8078 output_simple_mapping (const char *filename,
8079 unsigned int (*func) (unsigned int),
8080 const char *version)
8084 struct mapping_table t;
8085 unsigned int level1_offset, level2_offset, level3_offset;
8087 stream = fopen (filename, "w");
8090 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8094 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8095 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
8096 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
8101 mapping_table_init (&t);
8103 for (ch = 0; ch < 0x110000; ch++)
/* Store the offset from the code point to its image, not the image. */
8105 int value = (int) func (ch) - (int) ch;
8107 mapping_table_add (&t, ch, value);
8110 mapping_table_finalize (&t);
8112 /* Offsets in t.result, in memory of this process. */
8114 5 * sizeof (uint32_t);
8116 5 * sizeof (uint32_t)
8117 + t.level1_size * sizeof (uint32_t);
8119 5 * sizeof (uint32_t)
8120 + t.level1_size * sizeof (uint32_t)
8121 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result are exported as #define
   constants. */
8123 for (i = 0; i < 5; i++)
8124 fprintf (stream, "#define mapping_header_%d %d\n", i,
8125 ((uint32_t *) t.result)[i]);
8126 fprintf (stream, "static const\n");
8127 fprintf (stream, "struct\n");
8128 fprintf (stream, " {\n");
8129 fprintf (stream, " int level1[%zu];\n", t.level1_size);
8130 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
8131 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
8132 fprintf (stream, " }\n");
8133 fprintf (stream, "u_mapping =\n");
8134 fprintf (stream, "{\n");
8135 fprintf (stream, " {");
8136 if (t.level1_size > 8)
8137 fprintf (stream, "\n ");
8138 for (i = 0; i < t.level1_size; i++)
8141 if (i > 0 && (i % 8) == 0)
8142 fprintf (stream, "\n ");
8143 offset = ((uint32_t *) (t.result + level1_offset))[i];
8145 fprintf (stream, " %5d", -1);
8147 fprintf (stream, " %5zu",
8148 (offset - level2_offset) / sizeof (uint32_t));
8149 if (i+1 < t.level1_size)
8150 fprintf (stream, ",");
8152 if (t.level1_size > 8)
8153 fprintf (stream, "\n ");
8154 fprintf (stream, " },\n");
8155 fprintf (stream, " {");
8156 if (t.level2_size << t.q > 8)
8157 fprintf (stream, "\n ");
8158 for (i = 0; i < t.level2_size << t.q; i++)
8161 if (i > 0 && (i % 8) == 0)
8162 fprintf (stream, "\n ");
8163 offset = ((uint32_t *) (t.result + level2_offset))[i];
8165 fprintf (stream, " %5d", -1);
8167 fprintf (stream, " %5zu",
8168 (offset - level3_offset) / sizeof (int32_t));
8169 if (i+1 < t.level2_size << t.q)
8170 fprintf (stream, ",");
8172 if (t.level2_size << t.q > 8)
8173 fprintf (stream, "\n ");
8174 fprintf (stream, " },\n");
8175 fprintf (stream, " {");
8176 if (t.level3_size << t.p > 8)
8177 fprintf (stream, "\n ");
8178 for (i = 0; i < t.level3_size << t.p; i++)
8180 if (i > 0 && (i % 8) == 0)
8181 fprintf (stream, "\n ");
8182 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
8183 if (i+1 < t.level3_size << t.p)
8184 fprintf (stream, ",");
8186 if (t.level3_size << t.p > 8)
8187 fprintf (stream, "\n ");
8188 fprintf (stream, " }\n");
8189 fprintf (stream, "};\n");
8191 if (ferror (stream) || fclose (stream))
8193 fprintf (stderr, "error writing to '%s'\n", filename);
8198 /* ========================================================================= */
8200 /* A special casing context.
8201 A context is negated through x -> -x. */
8206 SCC_AFTER_SOFT_DOTTED,
8212 /* A special casing rule. */
/* Each mapping holds up to three code points.  fill_casing_rules zero-fills
   the lower, title and upper mappings before parsing, so unused trailing
   slots are 0.  casefold_mapping is not set in the visible part of
   fill_casing_rules - presumably it is filled in elsewhere (confirm). */
8213 struct special_casing_rule
8216 unsigned int lower_mapping[3];
8217 unsigned int title_mapping[3];
8218 unsigned int upper_mapping[3];
8219 unsigned int casefold_mapping[3];
8220 const char *language;
8224 /* The special casing rules. */
/* casing_rules is a growable array of pointers to rules: num_casing_rules
   slots are in use, allocated_casing_rules slots are allocated. */
8225 struct special_casing_rule **casing_rules;
8226 unsigned int num_casing_rules;
8227 unsigned int allocated_casing_rules;
/* Append NEW_RULE to casing_rules.  When the array is full, its capacity is
   doubled, with a minimum of 16 slots.  The pointer itself is stored, not a
   copy of the rule.
   NOTE(review): no check of the realloc result is visible here - confirm
   whether allocation failure is handled. */
8230 add_casing_rule (struct special_casing_rule *new_rule)
8232 if (num_casing_rules == allocated_casing_rules)
8234 allocated_casing_rules = 2 * allocated_casing_rules;
8235 if (allocated_casing_rules < 16)
8236 allocated_casing_rules = 16;
8238 (struct special_casing_rule **)
8239 realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *));
8241 casing_rules[num_casing_rules++] = new_rule;
8244 /* Stores in casing_rules the special casing rules found in
8245 specialcasing_filename. */
/* The rule list is reset to empty first.  Each data line is parsed as: a
   hexadecimal code point, then the lower, title and upper mappings (each up
   to three hexadecimal code points, each field ended by ';'), then
   optionally a language and a context.  A parsed rule is allocated with
   malloc and handed to add_casing_rule.  Empty lines and lines starting
   with '#' are tested for separately from data lines.  Open, parse and read
   failures are reported on stderr. */
8247 fill_casing_rules (const char *specialcasing_filename)
8251 stream = fopen (specialcasing_filename, "r");
8254 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
8258 casing_rules = NULL;
8259 num_casing_rules = 0;
8260 allocated_casing_rules = 0;
8270 unsigned int lower_mapping[3];
8271 unsigned int title_mapping[3];
8272 unsigned int upper_mapping[3];
8276 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8279 if (buf[0] == '\0' || buf[0] == '#')
8284 code = strtoul (scanptr, &endptr, 16);
8285 if (endptr == scanptr)
8287 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8291 if (*scanptr != ';')
8293 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8298 /* Scan lower mapping. */
8299 for (i = 0; i < 3; i++)
8300 lower_mapping[i] = 0;
8301 for (i = 0; i < 3; i++)
8303 while (*scanptr == ' ')
8305 if (*scanptr == ';')
8307 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
8308 if (endptr == scanptr)
8310 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8315 if (*scanptr != ';')
8317 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8322 /* Scan title mapping. */
8323 for (i = 0; i < 3; i++)
8324 title_mapping[i] = 0;
8325 for (i = 0; i < 3; i++)
8327 while (*scanptr == ' ')
8329 if (*scanptr == ';')
8331 title_mapping[i] = strtoul (scanptr, &endptr, 16);
8332 if (endptr == scanptr)
8334 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8339 if (*scanptr != ';')
8341 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8346 /* Scan upper mapping. */
8347 for (i = 0; i < 3; i++)
8348 upper_mapping[i] = 0;
8349 for (i = 0; i < 3; i++)
8351 while (*scanptr == ' ')
8353 if (*scanptr == ';')
8355 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
8356 if (endptr == scanptr)
8358 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8363 if (*scanptr != ';')
8365 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8370 /* Scan language and context. */
8372 context = SCC_ALWAYS;
8373 while (*scanptr == ' ')
8375 if (*scanptr != '\0' && *scanptr != '#')
8377 const char *word_begin = scanptr;
8378 const char *word_end;
8380 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
8384 while (*scanptr == ' ')
/* A first word of exactly two characters is taken as the language and
   copied into freshly allocated storage; the context, if any, is then the
   following word. */
8387 if (word_end - word_begin == 2)
8389 language = (char *) malloc ((word_end - word_begin) + 1);
8390 memcpy (language, word_begin, 2);
8391 language[word_end - word_begin] = '\0';
8392 word_begin = word_end = NULL;
8394 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
8396 word_begin = scanptr;
8397 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
8403 if (word_end > word_begin)
8405 bool negate = false;
/* A leading Not_ marks a negated context. */
8407 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
8412 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
8413 context = SCC_FINAL_SIGMA;
8414 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
8415 context = SCC_AFTER_SOFT_DOTTED;
8416 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
8417 context = SCC_MORE_ABOVE;
8418 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
8419 context = SCC_BEFORE_DOT;
8420 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
8421 context = SCC_AFTER_I;
8424 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
/* Negation is stored as the arithmetic negative of the context value. */
8428 context = - context;
8431 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
8433 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8438 /* Store the rule. */
8440 struct special_casing_rule *new_rule =
8441 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8442 new_rule->code = code;
8443 new_rule->language = language;
8444 new_rule->context = context;
8445 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
8446 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
8447 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
8449 add_casing_rule (new_rule);
8453 if (ferror (stream) || fclose (stream))
8455 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* A casefolding rule.  */
struct casefold_rule
{
  unsigned int code;        /* Code point the rule applies to.  */
  unsigned int mapping[3];  /* Folded code points, padded with zeros.  */
  const char *language;     /* Language the rule is limited to, or NULL.  */
};

/* The casefolding rules.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;

/* Stores in casefolding_rules the case folding rules found in
   casefolding_filename.

   Empty lines and lines starting with '#' are skipped.  Every other line
   is expected to have the form
     <code>; <type>; <mapping>; ...
   where <code> is a hexadecimal number, <type> is one of the letters
   'C', 'F', 'S', 'T', and <mapping> consists of up to 3 hexadecimal
   numbers separated by spaces.
   Rules of type 'S' are dropped.  A rule of type 'T' is stored twice,
   once with language "tr" and once with language "az".  All other rules
   are stored once, with language NULL.

   Any failure (the file cannot be opened or read, a line cannot be
   parsed, memory is exhausted) is reported on stderr and terminates the
   program with exit status 1.  */
static void
fill_casefolding_rules (const char *casefolding_filename)
{
  FILE *stream;

  stream = fopen (casefolding_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
      exit (1);
    }

  casefolding_rules = NULL;
  num_casefolding_rules = 0;
  allocated_casefolding_rules = 0;

  for (;;)
    {
      char buf[200+1];
      char *scanptr;
      char *endptr;
      int i;

      unsigned int code;
      char type;
      unsigned int mapping[3];

      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Scan code.  */
      scanptr = buf;
      code = strtoul (scanptr, &endptr, 16);
      if (endptr == scanptr)
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr = endptr;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan type.  */
      while (*scanptr == ' ')
        scanptr++;

      switch (*scanptr)
        {
        case 'C': case 'F': case 'S': case 'T':
          type = *scanptr;
          break;
        default:
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan casefold mapping.  */
      for (i = 0; i < 3; i++)
        mapping[i] = 0;
      for (i = 0; i < 3; i++)
        {
          while (*scanptr == ' ')
            scanptr++;
          if (*scanptr == ';')
            break;
          mapping[i] = strtoul (scanptr, &endptr, 16);
          if (endptr == scanptr)
            {
              fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
              exit (1);
            }
          scanptr = endptr;
        }
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
      if (type != 'S')
        {
          const char * const *languages;
          unsigned int languages_count;
          unsigned int l;

          /* Type 'T' indicates that the rule is applicable to Turkish
             languages only.  */
          if (type == 'T')
            {
              static const char * const turkish_languages[] = { "tr", "az" };
              languages = turkish_languages;
              languages_count = 2;
            }
          else
            {
              static const char * const all_languages[] = { NULL };
              languages = all_languages;
              languages_count = 1;
            }

          for (l = 0; l < languages_count; l++)
            {
              /* Store a new rule.  */
              struct casefold_rule *new_rule =
                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));

              if (new_rule == NULL)
                {
                  fprintf (stderr, "out of memory\n");
                  exit (1);
                }
              new_rule->code = code;
              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
              new_rule->language = languages[l];

              if (num_casefolding_rules == allocated_casefolding_rules)
                {
                  /* Grow the table geometrically, starting at 16 entries.
                     The old table pointer is only replaced once the
                     reallocation is known to have succeeded.  */
                  unsigned int new_allocated = 2 * allocated_casefolding_rules;
                  struct casefold_rule **new_rules;

                  if (new_allocated < 16)
                    new_allocated = 16;
                  new_rules =
                    (struct casefold_rule **)
                    realloc (casefolding_rules,
                             new_allocated * sizeof (struct casefold_rule *));
                  if (new_rules == NULL)
                    {
                      fprintf (stderr, "out of memory\n");
                      exit (1);
                    }
                  casefolding_rules = new_rules;
                  allocated_casefolding_rules = new_allocated;
                }
              casefolding_rules[num_casefolding_rules++] = new_rule;
            }
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
      exit (1);
    }
}
/* Casefold mapping, when it maps to a single character.  */
unsigned int unicode_casefold[0x110000];

/* Returns the single character casefold mapping of CH, as recorded in
   unicode_casefold[].
   A value of CH that lies outside of the table (CH >= 0x110000) is returned
   unchanged, instead of being used as an index past the end of the
   table.  */
static unsigned int
to_casefold (unsigned int ch)
{
  if (!(ch < sizeof (unicode_casefold) / sizeof (unicode_casefold[0])))
    return ch;
  return unicode_casefold[ch];
}
8630 /* Redistribute the casefolding_rules:
8631 - Rules that map to a single character, language independently, are stored
8632 in unicode_casefold.
8633 - Other rules are merged into casing_rules. */
8635 redistribute_casefolding_rules (void)
8637 unsigned int ch, i, j;
8639 /* Fill unicode_casefold[]. */
8640 for (ch = 0; ch < 0x110000; ch++)
8641 unicode_casefold[ch] = ch;
8642 for (i = 0; i < num_casefolding_rules; i++)
8644 struct casefold_rule *cfrule = casefolding_rules[i];
8646 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
8649 if (!(ch < 0x110000))
8651 unicode_casefold[ch] = cfrule->mapping[0];
8655 /* Extend the special casing rules by filling in their casefold_mapping[]
8657 for (j = 0; j < num_casing_rules; j++)
8659 struct special_casing_rule *rule = casing_rules[j];
8662 rule->casefold_mapping[0] = to_casefold (rule->code);
8663 for (k = 1; k < 3; k++)
8664 rule->casefold_mapping[k] = 0;
8667 /* Now merge the other casefolding rules into casing_rules. */
8668 for (i = 0; i < num_casefolding_rules; i++)
8670 struct casefold_rule *cfrule = casefolding_rules[i];
8672 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
8674 /* Find a rule that applies to the same code, same language, and it
8675 has context SCC_ALWAYS. At the same time, update all rules that
8676 have the same code and same or more specific language. */
8677 struct special_casing_rule *found_rule = NULL;
8679 for (j = 0; j < num_casing_rules; j++)
8681 struct special_casing_rule *rule = casing_rules[j];
8683 if (rule->code == cfrule->code
8684 && (cfrule->language == NULL
8685 || (rule->language != NULL
8686 && strcmp (rule->language, cfrule->language) == 0)))
8688 memcpy (rule->casefold_mapping, cfrule->mapping,
8689 sizeof (rule->casefold_mapping));
8691 if ((cfrule->language == NULL
8692 ? rule->language == NULL
8693 : rule->language != NULL
8694 && strcmp (rule->language, cfrule->language) == 0)
8695 && rule->context == SCC_ALWAYS)
8703 if (found_rule == NULL)
8705 /* Create a new rule. */
8706 struct special_casing_rule *new_rule =
8707 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8709 /* Try to find a rule that applies to the same code, no language
8710 restriction, and with context SCC_ALWAYS. */
8711 for (j = 0; j < num_casing_rules; j++)
8713 struct special_casing_rule *rule = casing_rules[j];
8715 if (rule->code == cfrule->code
8716 && rule->context == SCC_ALWAYS
8717 && rule->language == NULL)
8725 new_rule->code = cfrule->code;
8726 new_rule->language = cfrule->language;
8727 new_rule->context = SCC_ALWAYS;
8728 if (found_rule != NULL)
8730 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
8731 sizeof (new_rule->lower_mapping));
8732 memcpy (new_rule->title_mapping, found_rule->title_mapping,
8733 sizeof (new_rule->title_mapping));
8734 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
8735 sizeof (new_rule->upper_mapping));
8741 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8742 for (k = 1; k < 3; k++)
8743 new_rule->lower_mapping[k] = 0;
8744 new_rule->title_mapping[0] = to_title (cfrule->code);
8745 for (k = 1; k < 3; k++)
8746 new_rule->title_mapping[k] = 0;
8747 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8748 for (k = 1; k < 3; k++)
8749 new_rule->upper_mapping[k] = 0;
8751 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8752 sizeof (new_rule->casefold_mapping));
8754 add_casing_rule (new_rule);
8761 compare_casing_rules (const void *a, const void *b)
8763 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8764 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8765 unsigned int a_code = a_rule->code;
8766 unsigned int b_code = b_rule->code;
8768 if (a_code < b_code)
8770 if (a_code > b_code)
8773 /* Sort the more specific rules before the more general ones. */
8774 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8775 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8779 sort_casing_rules (void)
8781 /* Sort the rules 1. by code, 2. by specificity. */
8782 if (num_casing_rules > 1)
8783 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8784 compare_casing_rules);
8787 /* Output the special casing rules. */
8789 output_casing_rules (const char *filename, const char *version)
8795 stream = fopen (filename, "w");
8798 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8802 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8803 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8804 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8806 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8807 fprintf (stream, "%%struct-type\n");
8808 fprintf (stream, "%%language=ANSI-C\n");
8809 fprintf (stream, "%%define slot-name code\n");
8810 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8811 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8812 fprintf (stream, "%%compare-lengths\n");
8813 fprintf (stream, "%%compare-strncmp\n");
8814 fprintf (stream, "%%readonly-tables\n");
8815 fprintf (stream, "%%omit-struct-type\n");
8816 fprintf (stream, "%%%%\n");
8819 for (i = 0; i < num_casing_rules; i++)
8821 struct special_casing_rule *rule = casing_rules[i];
8824 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8829 if (!(rule->code < 0x10000))
8831 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8835 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8836 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8838 fprintf (stream, "%d, ",
8839 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8841 context = rule->context;
8844 fprintf (stream, "-");
8845 context = - context;
8848 fprintf (stream, " ");
8852 fprintf (stream, "SCC_ALWAYS ");
8854 case SCC_FINAL_SIGMA:
8855 fprintf (stream, "SCC_FINAL_SIGMA ");
8857 case SCC_AFTER_SOFT_DOTTED:
8858 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8860 case SCC_MORE_ABOVE:
8861 fprintf (stream, "SCC_MORE_ABOVE ");
8863 case SCC_BEFORE_DOT:
8864 fprintf (stream, "SCC_BEFORE_DOT ");
8867 fprintf (stream, "SCC_AFTER_I ");
8872 fprintf (stream, ", ");
8874 if (rule->language != NULL)
8876 if (strlen (rule->language) != 2)
8878 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8881 fprintf (stream, "{ '\\0', '\\0' }, ");
8883 fprintf (stream, "{ ");
8884 for (j = 0; j < 3; j++)
8887 fprintf (stream, ", ");
8888 if (!(rule->upper_mapping[j] < 0x10000))
8890 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8893 if (rule->upper_mapping[j] != 0)
8894 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8896 fprintf (stream, " 0");
8898 fprintf (stream, " }, { ");
8899 for (j = 0; j < 3; j++)
8902 fprintf (stream, ", ");
8903 if (!(rule->lower_mapping[j] < 0x10000))
8905 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8908 if (rule->lower_mapping[j] != 0)
8909 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8911 fprintf (stream, " 0");
8913 fprintf (stream, " }, { ");
8914 for (j = 0; j < 3; j++)
8917 fprintf (stream, ", ");
8918 if (!(rule->title_mapping[j] < 0x10000))
8920 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8923 if (rule->title_mapping[j] != 0)
8924 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8926 fprintf (stream, " 0");
8928 fprintf (stream, " }, { ");
8929 for (j = 0; j < 3; j++)
8932 fprintf (stream, ", ");
8933 if (!(rule->casefold_mapping[j] < 0x10000))
8935 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8938 if (rule->casefold_mapping[j] != 0)
8939 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8941 fprintf (stream, " 0");
8943 fprintf (stream, " }\n");
8946 if (ferror (stream) || fclose (stream))
8948 fprintf (stderr, "error writing to '%s'\n", filename);
8953 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
/* Returns true if CH is "cased" in the sense of this definition.  The three
   conditions are tested in the order in which the definition lists them.  */
static bool
is_cased (unsigned int ch)
{
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
8967 /* Quoting the Unicode standard:
8968 Definition: A character is defined to be "case-ignorable" if it has the
8969 value MidLetter {or the value MidNumLet} for the Word_Break property or
8970 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8971 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8972 The text marked in braces was added in Unicode 5.1.0, see
8973 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8974 Definition of case-ignorable". */
8975 /* Since this predicate is only used for the "Before C" and "After C"
8976 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8977 This simplifies the evaluation of the regular expressions
8978 \p{cased} (\p{case-ignorable})* C
8980 C (\p{case-ignorable})* \p{cased}
8983 is_case_ignorable (unsigned int ch)
8985 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8986 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8987 || is_category_Mn (ch)
8988 || is_category_Me (ch)
8989 || is_category_Cf (ch)
8990 || is_category_Lm (ch)
8991 || is_category_Sk (ch))
8995 /* ------------------------------------------------------------------------- */
8997 /* Output all case related properties. */
8999 output_casing_properties (const char *version)
9001 #define PROPERTY(FN,P) \
9002 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
9003 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
9004 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
9005 PROPERTY(cased, cased)
9006 PROPERTY(ignorable, case_ignorable)
9010 /* ========================================================================= */
9013 main (int argc, char * argv[])
9015 const char *unicodedata_filename;
9016 const char *proplist_filename;
9017 const char *derivedproplist_filename;
9018 const char *scripts_filename;
9019 const char *blocks_filename;
9020 const char *proplist30_filename;
9021 const char *eastasianwidth_filename;
9022 const char *linebreak_filename;
9023 const char *wordbreakproperty_filename;
9024 const char *graphemebreakproperty_filename;
9025 const char *compositionexclusions_filename;
9026 const char *specialcasing_filename;
9027 const char *casefolding_filename;
9028 const char *version;
9032 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
9037 unicodedata_filename = argv[1];
9038 proplist_filename = argv[2];
9039 derivedproplist_filename = argv[3];
9040 scripts_filename = argv[4];
9041 blocks_filename = argv[5];
9042 proplist30_filename = argv[6];
9043 eastasianwidth_filename = argv[7];
9044 linebreak_filename = argv[8];
9045 wordbreakproperty_filename = argv[9];
9046 graphemebreakproperty_filename = argv[10];
9047 compositionexclusions_filename = argv[11];
9048 specialcasing_filename = argv[12];
9049 casefolding_filename = argv[13];
9052 fill_attributes (unicodedata_filename);
9053 clear_properties ();
9054 fill_properties (proplist_filename);
9055 fill_properties (derivedproplist_filename);
9056 fill_properties30 (proplist30_filename);
9057 fill_scripts (scripts_filename);
9058 fill_blocks (blocks_filename);
9059 fill_width (eastasianwidth_filename);
9060 fill_org_lbp (linebreak_filename);
9061 fill_org_wbp (wordbreakproperty_filename);
9062 fill_org_gbp (graphemebreakproperty_filename);
9063 fill_composition_exclusions (compositionexclusions_filename);
9064 fill_casing_rules (specialcasing_filename);
9065 fill_casefolding_rules (casefolding_filename);
9066 redistribute_casefolding_rules ();
9067 sort_casing_rules ();
9069 output_categories (version);
9070 output_category ("unictype/categ_of.h", version);
9071 output_combclass ("unictype/combining.h", version);
9072 output_bidi_category ("unictype/bidi_of.h", version);
9073 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
9074 output_decimal_digit ("unictype/decdigit.h", version);
9075 output_digit_test ("../tests/unictype/test-digit.h", version);
9076 output_digit ("unictype/digit.h", version);
9077 output_numeric_test ("../tests/unictype/test-numeric.h", version);
9078 output_numeric ("unictype/numeric.h", version);
9079 output_mirror ("unictype/mirror.h", version);
9080 output_properties (version);
9081 output_scripts (version);
9082 output_scripts_byname (version);
9083 output_blocks (version);
9084 output_ident_properties (version);
9085 output_nonspacing_property ("uniwidth/width.c.part");
9086 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
9087 output_old_ctype (version);
9089 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
9090 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
9091 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
9093 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
9094 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
9095 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
9097 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
9098 output_gbp_table ("unigbrk/gbrkprop.h", version);
9100 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
9101 debug_output_composition_tables ("uninorm/composition.txt");
9102 output_composition_tables ("uninorm/composition-table.gperf", version);
9104 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
9105 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
9106 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
9107 output_simple_mapping ("unicase/toupper.h", to_upper, version);
9108 output_simple_mapping ("unicase/tolower.h", to_lower, version);
9109 output_simple_mapping ("unicase/totitle.h", to_title, version);
9110 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
9111 output_casing_rules ("unicase/special-casing-table.gperf", version);
9112 output_casing_properties (version);
9118 * For Emacs M-x compile
9120 * compile-command: "
9121 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
9123 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \
9124 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \
9125 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \
9126 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \
9127 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \
9128 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
9129 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \
9130 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \
9131 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \
9132 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
9133 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \
9134 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \
9135 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \
9137 && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
9138 && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt