1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/PropList.txt \
23 /usr/local/share/Unidata/DerivedCoreProperties.txt \
24 /usr/local/share/Unidata/Scripts.txt \
25 /usr/local/share/Unidata/Blocks.txt \
26 /usr/local/share/Unidata/PropList-3.0.1.txt \
27 /usr/local/share/Unidata/EastAsianWidth.txt \
28 /usr/local/share/Unidata/LineBreak.txt \
29 /usr/local/share/Unidata/WordBreakProperty.txt \
30 /usr/local/share/Unidata/GraphemeBreakProperty.txt \
31 /usr/local/share/Unidata/CompositionExclusions.txt \
32 /usr/local/share/Unidata/SpecialCasing.txt \
33 /usr/local/share/Unidata/CaseFolding.txt \
44 /* ========================================================================= */
46 /* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   String members hold one textual field each; the three case mappings
   are stored as code points.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
68 /* Missing fields are represented with "" for strings, and NONE for
70 #define NONE (~(unsigned int)0)
72 /* The entire contents of the UnicodeData.txt file. */
73 struct unicode_attribute unicode_attributes [0x110000];
75 /* Stores in unicode_attributes[i] the values from the given fields. */
77 fill_attribute (unsigned int i,
78 const char *field1, const char *field2,
79 const char *field3, const char *field4,
80 const char *field5, const char *field6,
81 const char *field7, const char *field8,
82 const char *field9, const char *field10,
83 const char *field11, const char *field12,
84 const char *field13, const char *field14)
86 struct unicode_attribute * uni;
90 fprintf (stderr, "index too large\n");
93 if (strcmp (field2, "Cs") == 0)
94 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
96 uni = &unicode_attributes[i];
97 /* Copy the strings. */
98 uni->name = strdup (field1);
99 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
100 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
101 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
102 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
103 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
104 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
105 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
106 uni->mirrored = (field9[0] == 'Y');
107 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
108 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
109 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
110 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
111 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.  */
/* NOTE(review): the line defining FIELDLEN is not present in this listing;
   120 is assumed here and must be confirmed against the original.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   or read error before DELIM was seen).  BUFFER is NUL-terminated in both
   cases.  Exits with status 1 if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer, keeping room for the terminating NUL.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate unconditionally so that the caller never sees stale bytes.  */
  *buffer = '\0';

  return (c == EOF ? 0 : 1);
}
149 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
152 fill_attributes (const char *unicodedata_filename)
156 char field0[FIELDLEN];
157 char field1[FIELDLEN];
158 char field2[FIELDLEN];
159 char field3[FIELDLEN];
160 char field4[FIELDLEN];
161 char field5[FIELDLEN];
162 char field6[FIELDLEN];
163 char field7[FIELDLEN];
164 char field8[FIELDLEN];
165 char field9[FIELDLEN];
166 char field10[FIELDLEN];
167 char field11[FIELDLEN];
168 char field12[FIELDLEN];
169 char field13[FIELDLEN];
170 char field14[FIELDLEN];
173 for (i = 0; i < 0x110000; i++)
174 unicode_attributes[i].name = NULL;
176 stream = fopen (unicodedata_filename, "r");
179 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 n = getfield (stream, field0, ';');
189 n += getfield (stream, field1, ';');
190 n += getfield (stream, field2, ';');
191 n += getfield (stream, field3, ';');
192 n += getfield (stream, field4, ';');
193 n += getfield (stream, field5, ';');
194 n += getfield (stream, field6, ';');
195 n += getfield (stream, field7, ';');
196 n += getfield (stream, field8, ';');
197 n += getfield (stream, field9, ';');
198 n += getfield (stream, field10, ';');
199 n += getfield (stream, field11, ';');
200 n += getfield (stream, field12, ';');
201 n += getfield (stream, field13, ';');
202 n += getfield (stream, field14, '\n');
207 fprintf (stderr, "short line in '%s':%d\n",
208 unicodedata_filename, lineno);
211 i = strtoul (field0, NULL, 16);
213 && strlen (field1) >= 9
214 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
216 /* Deal with a range. */
218 n = getfield (stream, field0, ';');
219 n += getfield (stream, field1, ';');
220 n += getfield (stream, field2, ';');
221 n += getfield (stream, field3, ';');
222 n += getfield (stream, field4, ';');
223 n += getfield (stream, field5, ';');
224 n += getfield (stream, field6, ';');
225 n += getfield (stream, field7, ';');
226 n += getfield (stream, field8, ';');
227 n += getfield (stream, field9, ';');
228 n += getfield (stream, field10, ';');
229 n += getfield (stream, field11, ';');
230 n += getfield (stream, field12, ';');
231 n += getfield (stream, field13, ';');
232 n += getfield (stream, field14, '\n');
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
239 if (!(field1[0] == '<'
240 && strlen (field1) >= 8
241 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
247 field1[strlen (field1) - 7] = '\0';
248 j = strtoul (field0, NULL, 16);
250 fill_attribute (i, field1+1, field2, field3, field4, field5,
251 field6, field7, field8, field9, field10,
252 field11, field12, field13, field14);
256 /* Single character line */
257 fill_attribute (i, field1, field2, field3, field4, field5,
258 field6, field7, field8, field9, field10,
259 field11, field12, field13, field14);
262 if (ferror (stream) || fclose (stream))
264 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
269 /* ========================================================================= */
271 /* General category. */
/* See Unicode 3.0 book, section 4.5.  */
276 is_category_L (unsigned int ch)
278 return (unicode_attributes[ch].name != NULL
279 && unicode_attributes[ch].category[0] == 'L');
283 is_category_Lu (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'u');
291 is_category_Ll (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 'l');
299 is_category_Lt (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 't');
307 is_category_Lm (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'm');
315 is_category_Lo (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'L'
319 && unicode_attributes[ch].category[1] == 'o');
323 is_category_M (unsigned int ch)
325 return (unicode_attributes[ch].name != NULL
326 && unicode_attributes[ch].category[0] == 'M');
330 is_category_Mn (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'n');
338 is_category_Mc (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'c');
346 is_category_Me (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'M'
350 && unicode_attributes[ch].category[1] == 'e');
354 is_category_N (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'N');
361 is_category_Nd (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'd');
369 is_category_Nl (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'l');
377 is_category_No (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'N'
381 && unicode_attributes[ch].category[1] == 'o');
385 is_category_P (unsigned int ch)
387 return (unicode_attributes[ch].name != NULL
388 && unicode_attributes[ch].category[0] == 'P');
392 is_category_Pc (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'c');
400 is_category_Pd (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 'd');
408 is_category_Ps (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 's');
416 is_category_Pe (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'e');
424 is_category_Pi (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'i');
432 is_category_Pf (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'f');
440 is_category_Po (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'P'
444 && unicode_attributes[ch].category[1] == 'o');
448 is_category_S (unsigned int ch)
450 return (unicode_attributes[ch].name != NULL
451 && unicode_attributes[ch].category[0] == 'S');
455 is_category_Sm (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'm');
463 is_category_Sc (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'c');
471 is_category_Sk (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'k');
479 is_category_So (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'S'
483 && unicode_attributes[ch].category[1] == 'o');
487 is_category_Z (unsigned int ch)
489 return (unicode_attributes[ch].name != NULL
490 && unicode_attributes[ch].category[0] == 'Z');
494 is_category_Zs (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 's');
502 is_category_Zl (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'l');
510 is_category_Zp (unsigned int ch)
512 return (unicode_attributes[ch].name != NULL
513 && unicode_attributes[ch].category[0] == 'Z'
514 && unicode_attributes[ch].category[1] == 'p');
518 is_category_C (unsigned int ch)
520 return (unicode_attributes[ch].name == NULL
521 || unicode_attributes[ch].category[0] == 'C');
525 is_category_Cc (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'c');
533 is_category_Cf (unsigned int ch)
535 return (unicode_attributes[ch].name != NULL
536 && unicode_attributes[ch].category[0] == 'C'
537 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the range 0xD800 .. 0xDFFF.  This is decided
   from the code point alone, not from the UnicodeData table.  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
547 is_category_Co (unsigned int ch)
549 return (unicode_attributes[ch].name != NULL
550 && unicode_attributes[ch].category[0] == 'C'
551 && unicode_attributes[ch].category[1] == 'o');
555 is_category_Cn (unsigned int ch)
557 return (unicode_attributes[ch].name == NULL
558 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   (0 .. 0x10FFFF) for which PREDICATE is true: "0xFIRST..0xLAST" for a run
   of several code points, "0xCODE" for a single one.
   Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source fragment consisting of a license banner,
   an include of "test-predicate-part1.h", one "{ first, last }" entry per
   maximal run of code points (0 .. 0x10FFFF) for which PREDICATE is true,
   a definition of the macro PREDICATE(c) as EXPRESSION, and an include of
   "test-predicate-part2.h".
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* Entries are separated by ",\n"; the last one is followed by "\n" only.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* NOTE(review): this listing omits lines of the original function (the
   embedded numbers skip), among them the return type, braces, local
   declarations and several guards.  The comments added below describe
   only what the visible lines do.  */
669 /* Construction of sparse 3-level tables. */
/* Parameters consumed by "3levelbit.h": the table is named predicate_table
   and allocates through plain malloc/realloc.  */
670 #define TABLE predicate_table
671 #define xmalloc malloc
672 #define xrealloc realloc
673 #include "3levelbit.h"
675 /* Output a boolean property in a three-level bitmap. */
/* Writes to FILENAME a C initializer named NAME that holds the table.
   COMMENT and VERSION only appear in the banner of the generated file.  */
677 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
681 struct predicate_table t;
682 unsigned int level1_offset, level2_offset, level3_offset;
684 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen; the test guarding it is not visible.  */
687 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Banner of the generated file.  */
691 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
692 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
693 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table.  NOTE(review): any setup of t.p / t.q before the init
   call, and the condition under which a code point is added (presumably
   PREDICATE), are not visible here.  */
698 predicate_table_init (&t);
700 for (ch = 0; ch < 0x110000; ch++)
702 predicate_table_add (&t, ch);
704 predicate_table_finalize (&t);
706 /* Offsets in t.result, in memory of this process. */
/* Three byte offsets follow (their left-hand sides are not visible,
   presumably level1_offset, level2_offset, level3_offset): t.result
   starts with 5 uint32_t header words, followed by level1_size words,
   followed by (level2_size << q) words.  */
708 5 * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t);
713 5 * sizeof (uint32_t)
714 + t.level1_size * sizeof (uint32_t)
715 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words are emitted as header_<i> macros.  */
717 for (i = 0; i < 5; i++)
719 fprintf (stream, "#define header_%d %d\n", i,
720 ((uint32_t *) t.result)[i]);
/* Declaration of the generated struct; array sizes come from the table.  */
722 fprintf (stream, "static const\n");
723 fprintf (stream, "struct\n");
724 fprintf (stream, " {\n");
725 fprintf (stream, " int header[1];\n");
726 fprintf (stream, " int level1[%zu];\n", t.level1_size);
727 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
728 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
729 fprintf (stream, " }\n");
730 fprintf (stream, "%s =\n", name);
731 fprintf (stream, "{\n");
/* The single header member is initialized from header word 1.  */
732 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
/* level1: one entry per output line ((i % 1) == 0 is always true).
   Two alternative formats are visible, a literal -1 and an index
   expression; the condition choosing between them is not visible.  */
733 fprintf (stream, " {");
734 if (t.level1_size > 1)
735 fprintf (stream, "\n ");
736 for (i = 0; i < t.level1_size; i++)
739 if (i > 0 && (i % 1) == 0)
740 fprintf (stream, "\n ");
741 offset = ((uint32_t *) (t.result + level1_offset))[i];
743 fprintf (stream, " %5d", -1);
745 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
746 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
747 if (i+1 < t.level1_size)
748 fprintf (stream, ",");
750 if (t.level1_size > 1)
751 fprintf (stream, "\n ");
752 fprintf (stream, " },\n");
/* level2: same layout as level1, with level3-relative index expressions.  */
753 fprintf (stream, " {");
754 if (t.level2_size << t.q > 1)
755 fprintf (stream, "\n ");
756 for (i = 0; i < t.level2_size << t.q; i++)
759 if (i > 0 && (i % 1) == 0)
760 fprintf (stream, "\n ");
761 offset = ((uint32_t *) (t.result + level2_offset))[i];
763 fprintf (stream, " %5d", -1);
765 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
766 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
767 if (i+1 < t.level2_size << t.q)
768 fprintf (stream, ",");
770 if (t.level2_size << t.q > 1)
771 fprintf (stream, "\n ");
772 fprintf (stream, " },\n");
/* level3: the raw 32-bit words, four per output line, in hexadecimal.  */
773 fprintf (stream, " {");
774 if (t.level3_size << t.p > 4)
775 fprintf (stream, "\n ");
776 for (i = 0; i < t.level3_size << t.p; i++)
778 if (i > 0 && (i % 4) == 0)
779 fprintf (stream, "\n ");
780 fprintf (stream, " 0x%08X",
781 ((uint32_t *) (t.result + level3_offset))[i]);
782 if (i+1 < t.level3_size << t.p)
783 fprintf (stream, ",");
785 if (t.level3_size << t.p > 4)
786 fprintf (stream, "\n ");
787 fprintf (stream, " }\n");
788 fprintf (stream, "};\n");
/* Any write error, including one reported by fclose, is diagnosed.  */
790 if (ferror (stream) || fclose (stream))
792 fprintf (stderr, "error writing to '%s'\n", filename);
797 /* Output all categories. */
/* NOTE(review): the invocations of CATEGORY and the end of the function
   are not visible in this listing.  */
799 output_categories (const char *version)
/* CATEGORY(C) produces three files for the predicate is_category_C:
   a readable dump "unictype/categ_C.txt", a unit test
   "../tests/unictype/test-categ_C.c" checking
   uc_is_general_category (c, UC_CATEGORY_C), and the bitmap table
   "unictype/categ_C.h" named u_categ_C.  */
801 #define CATEGORY(C) \
802 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
803 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
804 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks of the general categories.  Every two-letter category owns
   exactly one bit; a one-letter category is the union of the bits of the
   two-letter categories that start with that letter.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};

/* Returns the UC_CATEGORY_MASK_* value for CATEGORY_NAME, which must be a
   one- or two-letter general category name such as "L" or "Lu".
   Calls abort () for any other string.  */
static int
general_category_byname (const char *category_name)
{
  /* Only names of length 1 or 2 can be valid.  */
  if (category_name[0] != '\0'
      && (category_name[1] == '\0' || category_name[2] == '\0'))
    switch (category_name[0])
      {
      case 'L':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_L;
          case 'u': return UC_CATEGORY_MASK_Lu;
          case 'l': return UC_CATEGORY_MASK_Ll;
          case 't': return UC_CATEGORY_MASK_Lt;
          case 'm': return UC_CATEGORY_MASK_Lm;
          case 'o': return UC_CATEGORY_MASK_Lo;
          }
        break;
      case 'M':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_M;
          case 'n': return UC_CATEGORY_MASK_Mn;
          case 'c': return UC_CATEGORY_MASK_Mc;
          case 'e': return UC_CATEGORY_MASK_Me;
          }
        break;
      case 'N':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_N;
          case 'd': return UC_CATEGORY_MASK_Nd;
          case 'l': return UC_CATEGORY_MASK_Nl;
          case 'o': return UC_CATEGORY_MASK_No;
          }
        break;
      case 'P':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_P;
          case 'c': return UC_CATEGORY_MASK_Pc;
          case 'd': return UC_CATEGORY_MASK_Pd;
          case 's': return UC_CATEGORY_MASK_Ps;
          case 'e': return UC_CATEGORY_MASK_Pe;
          case 'i': return UC_CATEGORY_MASK_Pi;
          case 'f': return UC_CATEGORY_MASK_Pf;
          case 'o': return UC_CATEGORY_MASK_Po;
          }
        break;
      case 'S':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_S;
          case 'm': return UC_CATEGORY_MASK_Sm;
          case 'c': return UC_CATEGORY_MASK_Sc;
          case 'k': return UC_CATEGORY_MASK_Sk;
          case 'o': return UC_CATEGORY_MASK_So;
          }
        break;
      case 'Z':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_Z;
          case 's': return UC_CATEGORY_MASK_Zs;
          case 'l': return UC_CATEGORY_MASK_Zl;
          case 'p': return UC_CATEGORY_MASK_Zp;
          }
        break;
      case 'C':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_C;
          case 'c': return UC_CATEGORY_MASK_Cc;
          case 'f': return UC_CATEGORY_MASK_Cf;
          case 's': return UC_CATEGORY_MASK_Cs;
          case 'o': return UC_CATEGORY_MASK_Co;
          case 'n': return UC_CATEGORY_MASK_Cn;
          }
        break;
      }
  /* Invalid category name.  */
  abort ();
}
/* NOTE(review): this listing omits lines of the original (the embedded
   numbers skip), among them the include that instantiates the table,
   the return type, braces, local declarations and several guards.  The
   comments added below describe only what the visible lines do.  */
970 /* Construction of sparse 3-level tables. */
/* Table parameters: elements are bytes, and the default element 29 is the
   bit index of UC_CATEGORY_MASK_Cn.  */
971 #define TABLE category_table
972 #define ELEMENT uint8_t
973 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
974 #define xmalloc malloc
975 #define xrealloc realloc
978 /* Output the per-character category table. */
/* Writes to FILENAME a C initializer named u_category.  VERSION only
   appears in the banner of the generated file.  */
980 output_category (const char *filename, const char *version)
984 struct category_table t;
985 unsigned int level1_offset, level2_offset, level3_offset;
986 uint16_t *level3_packed;
988 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen; the test guarding it is not visible.  */
991 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Banner of the generated file.  */
995 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
996 fprintf (stream, "/* Categories of Unicode characters. */\n");
997 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1002 category_table_init (&t);
/* For every code point, determine its single-bit category mask and store
   the index of that bit.  The visible branches cover the surrogate range
   and code points with a UnicodeData entry; the remaining branch is not
   visible.  */
1004 for (ch = 0; ch < 0x110000; ch++)
1007 unsigned int log2_value;
1009 if (is_category_Cs (ch))
1010 value = UC_CATEGORY_MASK_Cs;
1011 else if (unicode_attributes[ch].name != NULL)
1012 value = general_category_byname (unicode_attributes[ch].category);
1016 /* Now value should contain exactly one bit. */
/* value & (value - 1) is nonzero exactly when more than one bit is set.  */
1017 if (value == 0 || ((value & (value - 1)) != 0))
/* Count how many right shifts reduce value to 1: the bit index.  */
1020 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1022 category_table_add (&t, ch, log2_value);
1025 category_table_finalize (&t);
1027 /* Offsets in t.result, in memory of this process. */
/* Three byte offsets follow (their left-hand sides are not visible,
   presumably level1_offset, level2_offset, level3_offset): t.result
   starts with 5 uint32_t header words, followed by level1_size words,
   followed by (level2_size << q) words.  */
1029 5 * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t);
1034 5 * sizeof (uint32_t)
1035 + t.level1_size * sizeof (uint32_t)
1036 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words are emitted as category_header_<i> macros.  */
1038 for (i = 0; i < 5; i++)
1039 fprintf (stream, "#define category_header_%d %d\n", i,
1040 ((uint32_t *) t.result)[i]);
/* Declaration of the generated struct.  level3 is declared with the size
   of the packed array built further below.  */
1041 fprintf (stream, "static const\n");
1042 fprintf (stream, "struct\n");
1043 fprintf (stream, " {\n");
1044 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1045 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1046 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1047 (1 << t.p) * 5 / 16);
1048 fprintf (stream, " }\n");
1049 fprintf (stream, "u_category =\n");
1050 fprintf (stream, "{\n");
/* level1: eight entries per output line.  Two alternative formats are
   visible, a literal -1 and an index into level2; the condition choosing
   between them is not visible.  */
1051 fprintf (stream, " {");
1052 if (t.level1_size > 8)
1053 fprintf (stream, "\n ");
1054 for (i = 0; i < t.level1_size; i++)
1057 if (i > 0 && (i % 8) == 0)
1058 fprintf (stream, "\n ");
1059 offset = ((uint32_t *) (t.result + level1_offset))[i];
1061 fprintf (stream, " %5d", -1);
1063 fprintf (stream, " %5zu",
1064 (offset - level2_offset) / sizeof (uint32_t));
1065 if (i+1 < t.level1_size)
1066 fprintf (stream, ",");
1068 if (t.level1_size > 8)
1069 fprintf (stream, "\n ");
1070 fprintf (stream, " },\n");
/* level2: same layout; indices count level3 elements (bytes).  */
1071 fprintf (stream, " {");
1072 if (t.level2_size << t.q > 8)
1073 fprintf (stream, "\n ");
1074 for (i = 0; i < t.level2_size << t.q; i++)
1077 if (i > 0 && (i % 8) == 0)
1078 fprintf (stream, "\n ");
1079 offset = ((uint32_t *) (t.result + level2_offset))[i];
1081 fprintf (stream, " %5d", -1);
1083 fprintf (stream, " %5zu",
1084 (offset - level3_offset) / sizeof (uint8_t));
1085 if (i+1 < t.level2_size << t.q)
1086 fprintf (stream, ",");
1088 if (t.level2_size << t.q > 8)
1089 fprintf (stream, "\n ");
1090 fprintf (stream, " },\n");
1091 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1092 not 32-bit units, in order to make the lookup function easier. */
/* The array is zero-initialized and has one unit more than the packed
   data needs, so that unit j+1 below always exists.
   NOTE(review): the result of calloc is not checked in the visible lines.  */
1095 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*5 .. i*5+4 of the packed stream: j is the
   16-bit unit holding its first bit, k the bit position inside it.
   An entry may straddle two units, so units j and j+1 are combined into
   one 32-bit word, updated, and written back.  */
1096 for (i = 0; i < t.level3_size << t.p; i++)
1098 unsigned int j = (i * 5) / 16;
1099 unsigned int k = (i * 5) % 16;
1100 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1101 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1102 level3_packed[j] = value & 0xffff;
1103 level3_packed[j+1] = value >> 16;
/* Emit the packed units, eight per output line, in hexadecimal.  */
1105 fprintf (stream, " {");
1106 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1107 fprintf (stream, "\n ");
1108 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1110 if (i > 0 && (i % 8) == 0)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " 0x%04x", level3_packed[i]);
1113 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1114 fprintf (stream, ",");
1116 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1117 fprintf (stream, "\n ");
1118 fprintf (stream, " }\n");
1119 free (level3_packed);
1120 fprintf (stream, "};\n");
/* Any write error, including one reported by fclose, is diagnosed.  */
1122 if (ferror (stream) || fclose (stream))
1124 fprintf (stderr, "error writing to '%s'\n", filename);
1129 /* ========================================================================= */
1131 /* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2.  */
/* NOTE(review): this listing omits lines of the original (the embedded
   numbers skip), among them one table parameter, the include that
   instantiates the table, the return type, braces, local declarations and
   several guards.  The comments added below describe only what the
   visible lines do.  */
1135 /* Construction of sparse 3-level tables. */
/* Table parameters: the table is named combclass_table and its elements
   are bytes.  */
1136 #define TABLE combclass_table
1137 #define ELEMENT uint8_t
1139 #define xmalloc malloc
1140 #define xrealloc realloc
1143 /* Output the per-character combining class table. */
/* Writes to FILENAME a C initializer named u_combclass.  VERSION only
   appears in the banner of the generated file.  */
1145 output_combclass (const char *filename, const char *version)
1149 struct combclass_table t;
1150 unsigned int level1_offset, level2_offset, level3_offset;
1152 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen; the test guarding it is not visible.  */
1155 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Banner of the generated file.  */
1159 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1160 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1161 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1166 combclass_table_init (&t);
/* Only code points with a UnicodeData entry are added.  The combining
   class field is parsed as a decimal number and checked against the
   range 0..255; what happens when the check fails is not visible.  */
1168 for (ch = 0; ch < 0x110000; ch++)
1169 if (unicode_attributes[ch].name != NULL)
1171 int value = atoi (unicode_attributes[ch].combining);
1172 if (!(value >= 0 && value <= 255))
1174 combclass_table_add (&t, ch, value);
1177 combclass_table_finalize (&t);
1179 /* Offsets in t.result, in memory of this process. */
/* Three byte offsets follow (their left-hand sides are not visible,
   presumably level1_offset, level2_offset, level3_offset): t.result
   starts with 5 uint32_t header words, followed by level1_size words,
   followed by (level2_size << q) words.  */
1181 5 * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t);
1186 5 * sizeof (uint32_t)
1187 + t.level1_size * sizeof (uint32_t)
1188 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words are emitted as combclass_header_<i> macros.  */
1190 for (i = 0; i < 5; i++)
1191 fprintf (stream, "#define combclass_header_%d %d\n", i,
1192 ((uint32_t *) t.result)[i]);
/* Declaration of the generated struct; array sizes come from the table.  */
1193 fprintf (stream, "static const\n");
1194 fprintf (stream, "struct\n");
1195 fprintf (stream, " {\n");
1196 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1197 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1198 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1199 fprintf (stream, " }\n");
1200 fprintf (stream, "u_combclass =\n");
1201 fprintf (stream, "{\n");
/* level1: eight entries per output line.  Two alternative formats are
   visible, a literal -1 and an index into level2; the condition choosing
   between them is not visible.  */
1202 fprintf (stream, " {");
1203 if (t.level1_size > 8)
1204 fprintf (stream, "\n ");
1205 for (i = 0; i < t.level1_size; i++)
1208 if (i > 0 && (i % 8) == 0)
1209 fprintf (stream, "\n ");
1210 offset = ((uint32_t *) (t.result + level1_offset))[i];
1212 fprintf (stream, " %5d", -1);
1214 fprintf (stream, " %5zu",
1215 (offset - level2_offset) / sizeof (uint32_t));
1216 if (i+1 < t.level1_size)
1217 fprintf (stream, ",");
1219 if (t.level1_size > 8)
1220 fprintf (stream, "\n ");
1221 fprintf (stream, " },\n");
/* level2: same layout; indices count level3 elements (bytes).  */
1222 fprintf (stream, " {");
1223 if (t.level2_size << t.q > 8)
1224 fprintf (stream, "\n ");
1225 for (i = 0; i < t.level2_size << t.q; i++)
1228 if (i > 0 && (i % 8) == 0)
1229 fprintf (stream, "\n ");
1230 offset = ((uint32_t *) (t.result + level2_offset))[i];
1232 fprintf (stream, " %5d", -1);
1234 fprintf (stream, " %5zu",
1235 (offset - level3_offset) / sizeof (uint8_t));
1236 if (i+1 < t.level2_size << t.q)
1237 fprintf (stream, ",");
1239 if (t.level2_size << t.q > 8)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " },\n");
/* level3: the byte values, eight per output line, in decimal.  */
1242 fprintf (stream, " {");
1243 if (t.level3_size << t.p > 8)
1244 fprintf (stream, "\n ");
1245 for (i = 0; i < t.level3_size << t.p; i++)
1247 if (i > 0 && (i % 8) == 0)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1250 if (i+1 < t.level3_size << t.p)
1251 fprintf (stream, ",");
1253 if (t.level3_size << t.p > 8)
1254 fprintf (stream, "\n ");
1255 fprintf (stream, " }\n");
1256 fprintf (stream, "};\n");
/* Any write error, including one reported by fclose, is diagnosed.  */
1258 if (ferror (stream) || fclose (stream))
1260 fprintf (stderr, "error writing to '%s'\n", filename);
1265 /* ========================================================================= */
1267 /* Bidirectional category. */
1268 /* See Unicode 3.0 book, section 4.3. */
/* Bidirectional category identifiers, each annotated with its Unicode name.
   Nineteen identifiers are listed here; output_bidi_category stores each
   category in 5 bits.
   NOTE(review): the enum header is not visible in this excerpt, so the
   numbering presumably starts at 0 with UC_BIDI_L -- confirm.  */
1273 UC_BIDI_L, /* Left-to-Right */
1274 UC_BIDI_LRE, /* Left-to-Right Embedding */
1275 UC_BIDI_LRO, /* Left-to-Right Override */
1276 UC_BIDI_R, /* Right-to-Left */
1277 UC_BIDI_AL, /* Right-to-Left Arabic */
1278 UC_BIDI_RLE, /* Right-to-Left Embedding */
1279 UC_BIDI_RLO, /* Right-to-Left Override */
1280 UC_BIDI_PDF, /* Pop Directional Format */
1281 UC_BIDI_EN, /* European Number */
1282 UC_BIDI_ES, /* European Number Separator */
1283 UC_BIDI_ET, /* European Number Terminator */
1284 UC_BIDI_AN, /* Arabic Number */
1285 UC_BIDI_CS, /* Common Number Separator */
1286 UC_BIDI_NSM, /* Non-Spacing Mark */
1287 UC_BIDI_BN, /* Boundary Neutral */
1288 UC_BIDI_B, /* Paragraph Separator */
1289 UC_BIDI_S, /* Segment Separator */
1290 UC_BIDI_WS, /* Whitespace */
1291 UC_BIDI_ON /* Other Neutral */
/* Decodes the bidi category name CATEGORY_NAME by switching on its
   characters one position at a time.  Every leaf test checks that the
   string terminates ('\0') immediately after the characters matched so
   far, so only exact names of one, two or three characters are accepted.
   NOTE(review): the case labels and the return statements are not visible
   in this excerpt; presumably each leaf returns the matching UC_BIDI_*
   identifier -- confirm against the full file.  */
1295 bidi_category_byname (const char *category_name)
/* Dispatch on the first character of the name.  */
1297 switch (category_name[0])
1300 switch (category_name[1])
1303 if (category_name[2] == '\0')
1307 if (category_name[2] == '\0')
1313 switch (category_name[1])
1318 if (category_name[2] == '\0')
1324 switch (category_name[1])
1327 if (category_name[2] == '\0')
1333 switch (category_name[1])
1336 if (category_name[2] == '\0')
1340 if (category_name[2] == '\0')
1344 if (category_name[2] == '\0')
1350 switch (category_name[1])
/* Three-character names need one more level of dispatch.  */
1355 switch (category_name[2])
1358 if (category_name[3] == '\0')
1362 if (category_name[3] == '\0')
1370 switch (category_name[1])
1373 switch (category_name[2])
1376 if (category_name[3] == '\0')
1384 switch (category_name[1])
1387 if (category_name[2] == '\0')
1393 switch (category_name[1])
1396 switch (category_name[2])
1399 if (category_name[3] == '\0')
1407 switch (category_name[1])
1412 switch (category_name[2])
1415 if (category_name[3] == '\0')
1419 if (category_name[3] == '\0')
/* A one-character name: the string must end right after it.  */
1427 if (category_name[1] == '\0')
1431 switch (category_name[1])
1434 if (category_name[2] == '\0')
/* NOTE(review): what happens for an unrecognized name is on lines not
   visible here.  */
1440 /* Invalid bidi category name. */
/* Returns the bidi category of code point CH.  When unicode_attributes[CH]
   has a name, the category is decoded from its bidi field with
   bidi_category_byname.  Otherwise the code point is treated as unassigned
   and is classified by the hard-coded ranges below.
   NOTE(review): the return statement of each range test is not visible in
   this excerpt -- confirm which category each group yields.  */
1445 get_bidi_category (unsigned int ch)
1447 if (unicode_attributes[ch].name != NULL)
1448 return bidi_category_byname (unicode_attributes[ch].bidi);
1451 /* The bidi category of unassigned characters depends on the range.
1452 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges.  */
1453 if ((ch >= 0x0590 && ch <= 0x05FF)
1454 || (ch >= 0x07FB && ch <= 0x08FF)
1455 || (ch >= 0xFB37 && ch <= 0xFB45)
1456 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges.  */
1458 else if ((ch >= 0x0600 && ch <= 0x07BF)
1459 || (ch >= 0x2064 && ch <= 0x2069)
1460 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1461 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group of ranges.  The two (ch & 0xFFFF) tests match the last two
   code points of every 64K plane.  */
1463 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1464 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1465 || (ch & 0xFFFF) == 0xFFFE
1466 || (ch & 0xFFFF) == 0xFFFF
1467 || (ch >= 0xE0000 && ch <= 0xE0FFF))
/* Macro settings for the bidi category table.  The bidi_category_table
   name matches the struct and the bidi_category_table_init/_add/_finalize
   calls used by output_bidi_category; entries are uint8_t and DEFAULT is
   UC_BIDI_L.  xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   excerpt; presumably a table template included right after them.  */
1474 /* Construction of sparse 3-level tables. */
1475 #define TABLE bidi_category_table
1476 #define ELEMENT uint8_t
1477 #define DEFAULT UC_BIDI_L
1478 #define xmalloc malloc
1479 #define xrealloc realloc
/* Writes the generated bidi category lookup table to FILENAME: a banner
   comment, five bidi_category_header_N macros, and a static struct
   u_bidi_category with the arrays level1, level2 and a bit-packed level3.
   Failures to open or write the file are reported on stderr.
   NOTE(review): declarations, braces and other short lines of this
   function are not visible in this excerpt.  VERSION is presumably the
   argument of the %s in the banner -- the argument line is not shown.  */
1482 /* Output the per-character bidi category table. */
1484 output_bidi_category (const char *filename, const char *version)
1488 struct bidi_category_table t;
1489 unsigned int level1_offset, level2_offset, level3_offset;
1490 uint16_t *level3_packed;
1492 stream = fopen (filename, "w");
1495 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1499 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1500 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1501 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1506 bidi_category_table_init (&t);
/* Record the category of every code point, U+0000 through U+10FFFF.  */
1508 for (ch = 0; ch < 0x110000; ch++)
1510 int value = get_bidi_category (ch);
1512 bidi_category_table_add (&t, ch, value);
1515 bidi_category_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: a header of
   five 32-bit words, then level1_size 32-bit words, then
   (level2_size << q) 32-bit words.  The assignment targets are on lines
   not visible here; presumably level1_offset, level2_offset and
   level3_offset in that order.  */
1517 /* Offsets in t.result, in memory of this process. */
1519 5 * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t);
1524 5 * sizeof (uint32_t)
1525 + t.level1_size * sizeof (uint32_t)
1526 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five 32-bit words of t.result as macros.  */
1528 for (i = 0; i < 5; i++)
1529 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1530 ((uint32_t *) t.result)[i]);
1531 fprintf (stream, "static const\n");
1532 fprintf (stream, "struct\n");
1533 fprintf (stream, " {\n");
1534 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1535 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units because its entries are packed at
   5 bits each (see the packing loop further down).  */
1536 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1537 (1 << t.p) * 5 / 16);
1538 fprintf (stream, " }\n");
1539 fprintf (stream, "u_bidi_category =\n");
1540 fprintf (stream, "{\n");
/* level1 array, printed eight values per row.  */
1541 fprintf (stream, " {");
1542 if (t.level1_size > 8)
1543 fprintf (stream, "\n ");
1544 for (i = 0; i < t.level1_size; i++)
1547 if (i > 0 && (i % 8) == 0)
1548 fprintf (stream, "\n ");
1549 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the byte offset converted to an index into level2 is
   printed; the condition choosing between them is not visible here.  */
1551 fprintf (stream, " %5d", -1);
1553 fprintf (stream, " %5zu",
1554 (offset - level2_offset) / sizeof (uint32_t));
1555 if (i+1 < t.level1_size)
1556 fprintf (stream, ",");
1558 if (t.level1_size > 8)
1559 fprintf (stream, "\n ");
1560 fprintf (stream, " },\n");
/* level2 array, same layout; offsets become indices into the unpacked
   level3 entries (one byte each).  */
1561 fprintf (stream, " {");
1562 if (t.level2_size << t.q > 8)
1563 fprintf (stream, "\n ");
1564 for (i = 0; i < t.level2_size << t.q; i++)
1567 if (i > 0 && (i % 8) == 0)
1568 fprintf (stream, "\n ");
1569 offset = ((uint32_t *) (t.result + level2_offset))[i];
1571 fprintf (stream, " %5d", -1);
1573 fprintf (stream, " %5zu",
1574 (offset - level3_offset) / sizeof (uint8_t));
1575 if (i+1 < t.level2_size << t.q)
1576 fprintf (stream, ",");
1578 if (t.level2_size << t.q > 8)
1579 fprintf (stream, "\n ");
1580 fprintf (stream, " },\n");
1581 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1582 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the calloc result is used by the loop on the next line
   without a NULL check.  */
1585 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1586 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i starts at bit 5*i of the packed stream: j is the 16-bit unit
   holding that bit and k is the bit position inside the unit.  */
1588 unsigned int j = (i * 5) / 16;
1589 unsigned int k = (i * 5) % 16;
1590 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* Treat units j and j+1 as one 32-bit window so that an entry crossing a
   unit boundary is ORed into both halves.  */
1591 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1592 level3_packed[j] = value & 0xffff;
1593 level3_packed[j+1] = value >> 16;
/* Packed level3 array, eight 16-bit values per row.  */
1595 fprintf (stream, " {");
1596 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1597 fprintf (stream, "\n ");
1598 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1600 if (i > 0 && (i % 8) == 0)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " 0x%04x", level3_packed[i]);
1603 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1604 fprintf (stream, ",");
1606 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1607 fprintf (stream, "\n ");
1608 fprintf (stream, " }\n");
1609 free (level3_packed);
1610 fprintf (stream, "};\n");
/* fclose is only attempted when no stream error was recorded.  */
1612 if (ferror (stream) || fclose (stream))
1614 fprintf (stderr, "error writing to '%s'\n", filename);
1619 /* ========================================================================= */
1621 /* Decimal digit value. */
1622 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of code point CH, parsed with atoi from
   the decdigit field of unicode_attributes[CH], when the character has a
   name and that field is not empty.
   NOTE(review): the fallback return is not visible in this excerpt;
   callers accept values >= -1, so it is presumably -1 -- confirm.  */
1625 get_decdigit_value (unsigned int ch)
1627 if (unicode_attributes[ch].name != NULL
1628 && unicode_attributes[ch].decdigit[0] != '\0')
1629 return atoi (unicode_attributes[ch].decdigit);
/* Macro settings for the decimal digit table.  The decdigit_table name
   matches the struct and the decdigit_table_init/_add/_finalize calls used
   by output_decimal_digit and output_digit; entries are uint8_t.  xmalloc
   and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the DEFAULT setting and the code that consumes these
   macros are not visible in this excerpt.  */
1633 /* Construction of sparse 3-level tables. */
1634 #define TABLE decdigit_table
1635 #define ELEMENT uint8_t
1637 #define xmalloc malloc
1638 #define xrealloc realloc
/* Writes to FILENAME the data for the decimal digit unit test: a banner
   comment followed by entries of the form { 0xXXXX, value }, separated by
   ",\n".  Failures to open or write the file are reported on stderr.
   NOTE(review): the conditions deciding which code points get an entry and
   when the separator is printed are on lines not visible here.  */
1641 /* Output the unit test for the per-character decimal digit value table. */
1643 output_decimal_digit_test (const char *filename, const char *version)
1649 stream = fopen (filename, "w");
1652 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1656 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1657 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1658 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1662 for (ch = 0; ch < 0x110000; ch++)
1664 int value = get_decdigit_value (ch);
/* Sanity check: the value must lie in -1..9.  The action taken on failure
   is not visible here.  */
1666 if (!(value >= -1 && value < 10))
1672 fprintf (stream, ",\n");
1673 fprintf (stream, " { 0x%04X, %d }", ch, value);
1678 fprintf (stream, "\n");
1680 if (ferror (stream) || fclose (stream))
1682 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes the generated decimal digit lookup table to FILENAME: a banner
   comment, five decdigit_header_N macros, and a static struct u_decdigit
   with the arrays level1, level2 and a nibble-packed level3.  Each table
   entry is 1 + get_decdigit_value (ch).  Failures to open or write the
   file are reported on stderr.
   NOTE(review): declarations, braces and other short lines of this
   function are not visible in this excerpt.  */
1687 /* Output the per-character decimal digit value table. */
1689 output_decimal_digit (const char *filename, const char *version)
1693 struct decdigit_table t;
1694 unsigned int level1_offset, level2_offset, level3_offset;
1696 stream = fopen (filename, "w");
1699 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1703 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1704 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1705 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1710 decdigit_table_init (&t);
1712 for (ch = 0; ch < 0x110000; ch++)
/* The value is shifted up by one before it is stored.  */
1714 int value = 1 + get_decdigit_value (ch);
/* Sanity check: the stored value must lie in 0..10, which fits in the
   4 bits per entry used below.  */
1716 if (!(value >= 0 && value <= 10))
1719 decdigit_table_add (&t, ch, value);
1722 decdigit_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: a header of
   five 32-bit words, then level1_size words, then (level2_size << q)
   words.  The assignment targets are on lines not visible here.  */
1724 /* Offsets in t.result, in memory of this process. */
1726 5 * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t);
1731 5 * sizeof (uint32_t)
1732 + t.level1_size * sizeof (uint32_t)
1733 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five 32-bit words of t.result as macros.  */
1735 for (i = 0; i < 5; i++)
1736 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1737 ((uint32_t *) t.result)[i]);
1738 fprintf (stream, "static const\n");
1739 fprintf (stream, "struct\n");
1740 fprintf (stream, " {\n");
1741 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1742 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the shift count passed for %d is on a line not visible
   here.  */
1743 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1745 fprintf (stream, " }\n");
1746 fprintf (stream, "u_decdigit =\n");
1747 fprintf (stream, "{\n");
/* level1 array, printed eight values per row.  */
1748 fprintf (stream, " {");
1749 if (t.level1_size > 8)
1750 fprintf (stream, "\n ");
1751 for (i = 0; i < t.level1_size; i++)
1754 if (i > 0 && (i % 8) == 0)
1755 fprintf (stream, "\n ");
1756 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the byte offset converted to an index into level2 is
   printed; the condition choosing between them is not visible here.  */
1758 fprintf (stream, " %5d", -1);
1760 fprintf (stream, " %5zu",
1761 (offset - level2_offset) / sizeof (uint32_t));
1762 if (i+1 < t.level1_size)
1763 fprintf (stream, ",");
1765 if (t.level1_size > 8)
1766 fprintf (stream, "\n ");
1767 fprintf (stream, " },\n");
/* level2 array, same layout; offsets become indices into the unpacked
   level3 entries (one byte each).  */
1768 fprintf (stream, " {");
1769 if (t.level2_size << t.q > 8)
1770 fprintf (stream, "\n ");
1771 for (i = 0; i < t.level2_size << t.q; i++)
1774 if (i > 0 && (i % 8) == 0)
1775 fprintf (stream, "\n ");
1776 offset = ((uint32_t *) (t.result + level2_offset))[i];
1778 fprintf (stream, " %5d", -1);
1780 fprintf (stream, " %5zu",
1781 (offset - level3_offset) / sizeof (uint8_t));
1782 if (i+1 < t.level2_size << t.q)
1783 fprintf (stream, ",");
1785 if (t.level2_size << t.q > 8)
1786 fprintf (stream, "\n ");
1787 fprintf (stream, " },\n");
1788 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two consecutive entries share one output byte, so the loop runs over
   half as many bytes: level3_size << (p - 1).  */
1789 fprintf (stream, " {");
1790 if (t.level3_size << (t.p - 1) > 8)
1791 fprintf (stream, "\n ");
1792 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1794 if (i > 0 && (i % 8) == 0)
1795 fprintf (stream, "\n ");
/* Entry 2*i goes into the low nibble, entry 2*i+1 into the high nibble.  */
1796 fprintf (stream, " 0x%02x",
1797 ((uint8_t *) (t.result + level3_offset))[2*i]
1798 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1799 if (i+1 < t.level3_size << (t.p - 1))
1800 fprintf (stream, ",");
1802 if (t.level3_size << (t.p - 1) > 8)
1803 fprintf (stream, "\n ");
1804 fprintf (stream, " }\n");
1805 fprintf (stream, "};\n");
/* fclose is only attempted when no stream error was recorded.  */
1807 if (ferror (stream) || fclose (stream))
1809 fprintf (stderr, "error writing to '%s'\n", filename);
1814 /* ========================================================================= */
1817 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of code point CH, parsed with atoi from the
   digit field of unicode_attributes[CH], when the character has a name and
   that field is not empty.
   NOTE(review): the fallback return is not visible in this excerpt;
   callers accept values >= -1, so it is presumably -1 -- confirm.  */
1820 get_digit_value (unsigned int ch)
1822 if (unicode_attributes[ch].name != NULL
1823 && unicode_attributes[ch].digit[0] != '\0')
1824 return atoi (unicode_attributes[ch].digit);
/* Writes to FILENAME the data for the digit value unit test: a banner
   comment followed by entries of the form { 0xXXXX, value }, separated by
   ",\n".  Failures to open or write the file are reported on stderr.
   NOTE(review): the conditions deciding which code points get an entry and
   when the separator is printed are on lines not visible here.  */
1828 /* Output the unit test for the per-character digit value table. */
1830 output_digit_test (const char *filename, const char *version)
1836 stream = fopen (filename, "w");
1839 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1843 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1844 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1845 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1849 for (ch = 0; ch < 0x110000; ch++)
1851 int value = get_digit_value (ch);
/* Sanity check: the value must lie in -1..9.  The action taken on failure
   is not visible here.  */
1853 if (!(value >= -1 && value < 10))
1859 fprintf (stream, ",\n");
1860 fprintf (stream, " { 0x%04X, %d }", ch, value);
1865 fprintf (stream, "\n");
1867 if (ferror (stream) || fclose (stream))
1869 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes the generated digit value lookup table to FILENAME: a banner
   comment, five digit_header_N macros, and a static struct u_digit with
   the arrays level1, level2 and a nibble-packed level3.  Each table entry
   is 1 + get_digit_value (ch).  The table is built with the decdigit_table
   type and functions, which this function shares with
   output_decimal_digit.  Failures to open or write the file are reported
   on stderr.
   NOTE(review): declarations, braces and other short lines of this
   function are not visible in this excerpt.  */
1874 /* Output the per-character digit value table. */
1876 output_digit (const char *filename, const char *version)
1880 struct decdigit_table t;
1881 unsigned int level1_offset, level2_offset, level3_offset;
1883 stream = fopen (filename, "w");
1886 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1890 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1891 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1892 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1897 decdigit_table_init (&t);
1899 for (ch = 0; ch < 0x110000; ch++)
/* The value is shifted up by one before it is stored.  */
1901 int value = 1 + get_digit_value (ch);
/* Sanity check: the stored value must lie in 0..10, which fits in the
   4 bits per entry used below.  */
1903 if (!(value >= 0 && value <= 10))
1906 decdigit_table_add (&t, ch, value);
1909 decdigit_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: a header of
   five 32-bit words, then level1_size words, then (level2_size << q)
   words.  The assignment targets are on lines not visible here.  */
1911 /* Offsets in t.result, in memory of this process. */
1913 5 * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t);
1918 5 * sizeof (uint32_t)
1919 + t.level1_size * sizeof (uint32_t)
1920 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five 32-bit words of t.result as macros.  */
1922 for (i = 0; i < 5; i++)
1923 fprintf (stream, "#define digit_header_%d %d\n", i,
1924 ((uint32_t *) t.result)[i]);
1925 fprintf (stream, "static const\n");
1926 fprintf (stream, "struct\n");
1927 fprintf (stream, " {\n");
1928 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1929 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the shift count passed for %d is on a line not visible
   here.  */
1930 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1932 fprintf (stream, " }\n");
1933 fprintf (stream, "u_digit =\n");
1934 fprintf (stream, "{\n");
/* level1 array, printed eight values per row.  */
1935 fprintf (stream, " {");
1936 if (t.level1_size > 8)
1937 fprintf (stream, "\n ");
1938 for (i = 0; i < t.level1_size; i++)
1941 if (i > 0 && (i % 8) == 0)
1942 fprintf (stream, "\n ");
1943 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the byte offset converted to an index into level2 is
   printed; the condition choosing between them is not visible here.  */
1945 fprintf (stream, " %5d", -1);
1947 fprintf (stream, " %5zu",
1948 (offset - level2_offset) / sizeof (uint32_t));
1949 if (i+1 < t.level1_size)
1950 fprintf (stream, ",");
1952 if (t.level1_size > 8)
1953 fprintf (stream, "\n ");
1954 fprintf (stream, " },\n");
/* level2 array, same layout; offsets become indices into the unpacked
   level3 entries (one byte each).  */
1955 fprintf (stream, " {");
1956 if (t.level2_size << t.q > 8)
1957 fprintf (stream, "\n ");
1958 for (i = 0; i < t.level2_size << t.q; i++)
1961 if (i > 0 && (i % 8) == 0)
1962 fprintf (stream, "\n ");
1963 offset = ((uint32_t *) (t.result + level2_offset))[i];
1965 fprintf (stream, " %5d", -1);
1967 fprintf (stream, " %5zu",
1968 (offset - level3_offset) / sizeof (uint8_t));
1969 if (i+1 < t.level2_size << t.q)
1970 fprintf (stream, ",");
1972 if (t.level2_size << t.q > 8)
1973 fprintf (stream, "\n ");
1974 fprintf (stream, " },\n");
1975 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two consecutive entries share one output byte, so the loop runs over
   half as many bytes: level3_size << (p - 1).  */
1976 fprintf (stream, " {");
1977 if (t.level3_size << (t.p - 1) > 8)
1978 fprintf (stream, "\n ");
1979 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1981 if (i > 0 && (i % 8) == 0)
1982 fprintf (stream, "\n ");
/* Entry 2*i goes into the low nibble, entry 2*i+1 into the high nibble.  */
1983 fprintf (stream, " 0x%02x",
1984 ((uint8_t *) (t.result + level3_offset))[2*i]
1985 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1986 if (i+1 < t.level3_size << (t.p - 1))
1987 fprintf (stream, ",");
1989 if (t.level3_size << (t.p - 1) > 8)
1990 fprintf (stream, "\n ");
1991 fprintf (stream, " }\n");
1992 fprintf (stream, "};\n");
/* fclose is only attempted when no stream error was recorded.  */
1994 if (ferror (stream) || fclose (stream))
1996 fprintf (stderr, "error writing to '%s'\n", filename);
2001 /* ========================================================================= */
2003 /* Numeric value. */
2004 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator/denominator.  get_numeric_value
   uses 0/0 for a character that has no numeric value.  */
2006 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of code point CH as a fraction.  When the
   character has a name and a non-empty numeric field, the numerator is the
   integer at the start of the field and the denominator is the integer
   after the '/', or 1 when the field contains no '/'.  Otherwise both
   members are set to 0.  Parsing uses atoi, so malformed text is not
   diagnosed.  */
2008 static uc_fraction_t
2009 get_numeric_value (unsigned int ch)
2011 uc_fraction_t value;
2013 if (unicode_attributes[ch].name != NULL
2014 && unicode_attributes[ch].numeric[0] != '\0')
2016 const char *str = unicode_attributes[ch].numeric;
2017 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the '/', so this yields the numerator only.  */
2018 value.numerator = atoi (str);
2019 if (strchr (str, '/') != NULL)
2020 value.denominator = atoi (strchr (str, '/') + 1);
2022 value.denominator = 1;
/* No numeric value: 0/0 serves as the "none" marker.  */
2026 value.numerator = 0;
2027 value.denominator = 0;
/* Writes to FILENAME the data for the numeric value unit test: a banner
   comment followed by one { 0xXXXX, numerator, denominator } entry for
   each code point whose numeric value is not 0/0.  Failures to open or
   write the file are reported on stderr.
   NOTE(review): the condition guarding the ",\n" separator is on a line
   not visible here.  */
2032 /* Output the unit test for the per-character numeric value table. */
2034 output_numeric_test (const char *filename, const char *version)
2040 stream = fopen (filename, "w");
2043 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2047 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2048 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2049 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2053 for (ch = 0; ch < 0x110000; ch++)
2055 uc_fraction_t value = get_numeric_value (ch);
/* Skip characters without a numeric value (0/0).  */
2057 if (value.numerator != 0 || value.denominator != 0)
2060 fprintf (stream, ",\n");
2061 fprintf (stream, " { 0x%04X, %d, %d }",
2062 ch, value.numerator, value.denominator);
2067 fprintf (stream, "\n");
2069 if (ferror (stream) || fclose (stream))
2071 fprintf (stderr, "error writing to '%s'\n", filename);
/* Macro settings for the numeric value table.  The numeric_table name
   matches the struct and the numeric_table_init/_add/_finalize calls used
   by output_numeric; entries are uint8_t.  xmalloc and xrealloc are mapped
   to plain malloc and realloc.
   NOTE(review): the DEFAULT setting and the code that consumes these
   macros are not visible in this excerpt.  */
2076 /* Construction of sparse 3-level tables. */
2077 #define TABLE numeric_table
2078 #define ELEMENT uint8_t
2080 #define xmalloc malloc
2081 #define xrealloc realloc
/* Writes the generated numeric value lookup table to FILENAME.  The output
   has two parts: the array u_numeric_values listing every distinct
   fraction that occurs, and the struct u_numeric whose entries are indices
   into that array, packed at 7 bits each.  Five numeric_header_N macros
   are emitted before the struct.  Failures to open or write the file are
   reported on stderr.
   NOTE(review): declarations, braces and other short lines of this
   function are not visible in this excerpt.  */
2084 /* Output the per-character numeric value table. */
2086 output_numeric (const char *filename, const char *version)
/* At most 128 distinct fractions are supported, matching the 7-bit index
   stored per character.  */
2089 uc_fraction_t fractions[128];
2090 unsigned int nfractions;
2091 unsigned int ch, i, j;
2092 struct numeric_table t;
2093 unsigned int level1_offset, level2_offset, level3_offset;
2094 uint16_t *level3_packed;
2096 stream = fopen (filename, "w");
2099 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2103 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2104 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2105 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2108 /* Create table of occurring fractions. */
2110 for (ch = 0; ch < 0x110000; ch++)
2112 uc_fraction_t value = get_numeric_value (ch);
/* Look for the value among the fractions collected so far.  */
2114 for (i = 0; i < nfractions; i++)
2115 if (value.numerator == fractions[i].numerator
2116 && value.denominator == fractions[i].denominator)
/* Not found: insert it, keeping the array ordered.  */
2118 if (i == nfractions)
/* Capacity check; the action taken when the array is full is not visible
   here.  */
2120 if (nfractions == 128)
/* Find the insertion point: ordered by denominator first, then by
   numerator.  */
2122 for (i = 0; i < nfractions; i++)
2123 if (value.denominator < fractions[i].denominator
2124 || (value.denominator == fractions[i].denominator
2125 && value.numerator < fractions[i].numerator))
/* Shift the tail up by one slot to make room at index i.  */
2127 for (j = nfractions; j > i; j--)
2128 fractions[j] = fractions[j - 1];
2129 fractions[i] = value;
2134 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2136 fprintf (stream, "{\n");
2137 for (i = 0; i < nfractions; i++)
2139 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2140 fractions[i].denominator);
2141 if (i+1 < nfractions)
2142 fprintf (stream, ",");
2143 fprintf (stream, "\n");
2145 fprintf (stream, "};\n");
2149 numeric_table_init (&t);
/* Second pass: store, for each code point, the index of its fraction in
   the fractions array.  */
2151 for (ch = 0; ch < 0x110000; ch++)
2153 uc_fraction_t value = get_numeric_value (ch);
2155 for (i = 0; i < nfractions; i++)
2156 if (value.numerator == fractions[i].numerator
2157 && value.denominator == fractions[i].denominator)
/* Every value was collected in the first pass, so reaching the end of the
   array here would be an internal error; the handling is not visible.  */
2159 if (i == nfractions)
2162 numeric_table_add (&t, ch, i);
2165 numeric_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: a header of
   five 32-bit words, then level1_size words, then (level2_size << q)
   words.  The assignment targets are on lines not visible here.  */
2167 /* Offsets in t.result, in memory of this process. */
2169 5 * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t);
2174 5 * sizeof (uint32_t)
2175 + t.level1_size * sizeof (uint32_t)
2176 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five 32-bit words of t.result as macros.  */
2178 for (i = 0; i < 5; i++)
2179 fprintf (stream, "#define numeric_header_%d %d\n", i,
2180 ((uint32_t *) t.result)[i]);
2181 fprintf (stream, "static const\n");
2182 fprintf (stream, "struct\n");
2183 fprintf (stream, " {\n");
2184 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2185 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units because its entries are packed at
   7 bits each (see the packing loop further down).  */
2186 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2187 (1 << t.p) * 7 / 16);
2188 fprintf (stream, " }\n");
2189 fprintf (stream, "u_numeric =\n");
2190 fprintf (stream, "{\n");
/* level1 array, printed eight values per row.  */
2191 fprintf (stream, " {");
2192 if (t.level1_size > 8)
2193 fprintf (stream, "\n ");
2194 for (i = 0; i < t.level1_size; i++)
2197 if (i > 0 && (i % 8) == 0)
2198 fprintf (stream, "\n ");
2199 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the byte offset converted to an index into level2 is
   printed; the condition choosing between them is not visible here.  */
2201 fprintf (stream, " %5d", -1);
2203 fprintf (stream, " %5zu",
2204 (offset - level2_offset) / sizeof (uint32_t));
2205 if (i+1 < t.level1_size)
2206 fprintf (stream, ",");
2208 if (t.level1_size > 8)
2209 fprintf (stream, "\n ");
2210 fprintf (stream, " },\n");
/* level2 array, same layout; offsets become indices into the unpacked
   level3 entries (one byte each).  */
2211 fprintf (stream, " {");
2212 if (t.level2_size << t.q > 8)
2213 fprintf (stream, "\n ");
2214 for (i = 0; i < t.level2_size << t.q; i++)
2217 if (i > 0 && (i % 8) == 0)
2218 fprintf (stream, "\n ");
2219 offset = ((uint32_t *) (t.result + level2_offset))[i];
2221 fprintf (stream, " %5d", -1);
2223 fprintf (stream, " %5zu",
2224 (offset - level3_offset) / sizeof (uint8_t));
2225 if (i+1 < t.level2_size << t.q)
2226 fprintf (stream, ",");
2228 if (t.level2_size << t.q > 8)
2229 fprintf (stream, "\n ");
2230 fprintf (stream, " },\n");
2231 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2232 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the calloc result is used by the loop on the next line
   without a NULL check.  */
2235 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2236 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i starts at bit 7*i of the packed stream: j is the 16-bit unit
   holding that bit and k is the bit position inside the unit.  This j is
   local to the loop body and shadows the function-level j.  */
2238 unsigned int j = (i * 7) / 16;
2239 unsigned int k = (i * 7) % 16;
2240 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* Treat units j and j+1 as one 32-bit window so that an entry crossing a
   unit boundary is ORed into both halves.  */
2241 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2242 level3_packed[j] = value & 0xffff;
2243 level3_packed[j+1] = value >> 16;
/* Packed level3 array, eight 16-bit values per row.  */
2245 fprintf (stream, " {");
2246 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2247 fprintf (stream, "\n ");
2248 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2250 if (i > 0 && (i % 8) == 0)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " 0x%04x", level3_packed[i]);
2253 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2254 fprintf (stream, ",");
2256 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2257 fprintf (stream, "\n ");
2258 fprintf (stream, " }\n");
2259 free (level3_packed);
2260 fprintf (stream, "};\n");
/* fclose is only attempted when no stream error was recorded.  */
2262 if (ferror (stream) || fclose (stream))
2264 fprintf (stderr, "error writing to '%s'\n", filename);
2269 /* ========================================================================= */
2272 /* See Unicode 3.0 book, section 4.7. */
2275 /* List of mirrored character pairs. This is a subset of the characters
2276 having the BidiMirrored property. */
/* Each row holds two code points.  get_mirror_value searches both columns,
   so a pair needs to be listed only once.
   NOTE(review): the initializer is not visible in this excerpt.  */
2277 static unsigned int mirror_pairs[][2] =
/* Computes the mirror table entry for code point CH.  CH counts as
   mirrored when it has a name and its mirrored attribute is set.  Its
   partner is looked up in mirror_pairs in either column; 0xfffd marks
   "no partner found".  The visible return yields the signed distance from
   CH to its partner.
   NOTE(review): the conditions that combine the mirrored flag with the
   lookup result, and the other return statements, are on lines not
   visible here.  */
2334 get_mirror_value (unsigned int ch)
2337 unsigned int mirror_char;
2340 mirrored = (unicode_attributes[ch].name != NULL
2341 && unicode_attributes[ch].mirrored);
/* Sentinel meaning that no pair containing CH has been found.  */
2342 mirror_char = 0xfffd;
2343 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
/* CH is the first member of the pair: the partner is the second.  */
2344 if (ch == mirror_pairs[i][0])
2346 mirror_char = mirror_pairs[i][1];
/* CH is the second member of the pair: the partner is the first.  */
2349 else if (ch == mirror_pairs[i][1])
2351 mirror_char = mirror_pairs[i][0];
/* The entry is stored as a delta relative to CH, not as a code point.  */
2355 return (int) mirror_char - (int) ch;
2358 if (mirror_char != 0xfffd)
/* Macro settings for the mirror table.  The mirror_table name matches the
   struct and the mirror_table_init/_add/_finalize calls used by
   output_mirror; entries are int32_t because they hold signed deltas.
   xmalloc and xrealloc are mapped to plain malloc and realloc.
   NOTE(review): the DEFAULT setting and the code that consumes these
   macros are not visible in this excerpt.  */
2364 /* Construction of sparse 3-level tables. */
2365 #define TABLE mirror_table
2366 #define ELEMENT int32_t
2368 #define xmalloc malloc
2369 #define xrealloc realloc
/* Writes the generated mirror lookup table to FILENAME: a banner comment,
   five mirror_header_N macros, and a static struct u_mirror with the
   arrays level1, level2 and level3.  Each level3 entry is the value of
   get_mirror_value for one code point, stored unpacked as an int.
   Failures to open or write the file are reported on stderr.
   NOTE(review): declarations, braces and other short lines of this
   function are not visible in this excerpt.  */
2372 /* Output the per-character mirror table. */
2374 output_mirror (const char *filename, const char *version)
2378 struct mirror_table t;
2379 unsigned int level1_offset, level2_offset, level3_offset;
2381 stream = fopen (filename, "w");
2384 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2388 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2389 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2390 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2395 mirror_table_init (&t);
/* Record the mirror value of every code point, U+0000 through U+10FFFF.  */
2397 for (ch = 0; ch < 0x110000; ch++)
2399 int value = get_mirror_value (ch);
2401 mirror_table_add (&t, ch, value);
2404 mirror_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: a header of
   five 32-bit words, then level1_size words, then (level2_size << q)
   words.  The assignment targets are on lines not visible here.  */
2406 /* Offsets in t.result, in memory of this process. */
2408 5 * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t);
2413 5 * sizeof (uint32_t)
2414 + t.level1_size * sizeof (uint32_t)
2415 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five 32-bit words of t.result as macros.  */
2417 for (i = 0; i < 5; i++)
2418 fprintf (stream, "#define mirror_header_%d %d\n", i,
2419 ((uint32_t *) t.result)[i]);
2420 fprintf (stream, "static const\n");
2421 fprintf (stream, "struct\n");
2422 fprintf (stream, " {\n");
2423 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2424 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2425 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2426 fprintf (stream, " }\n");
2427 fprintf (stream, "u_mirror =\n");
2428 fprintf (stream, "{\n");
/* level1 array, printed eight values per row.  */
2429 fprintf (stream, " {");
2430 if (t.level1_size > 8)
2431 fprintf (stream, "\n ");
2432 for (i = 0; i < t.level1_size; i++)
2435 if (i > 0 && (i % 8) == 0)
2436 fprintf (stream, "\n ");
2437 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the byte offset converted to an index into level2 is
   printed; the condition choosing between them is not visible here.  */
2439 fprintf (stream, " %5d", -1);
2441 fprintf (stream, " %5zu",
2442 (offset - level2_offset) / sizeof (uint32_t));
2443 if (i+1 < t.level1_size)
2444 fprintf (stream, ",");
2446 if (t.level1_size > 8)
2447 fprintf (stream, "\n ");
2448 fprintf (stream, " },\n");
/* level2 array, same layout; offsets become indices into level3, whose
   elements are int32_t.  */
2449 fprintf (stream, " {");
2450 if (t.level2_size << t.q > 8)
2451 fprintf (stream, "\n ");
2452 for (i = 0; i < t.level2_size << t.q; i++)
2455 if (i > 0 && (i % 8) == 0)
2456 fprintf (stream, "\n ");
2457 offset = ((uint32_t *) (t.result + level2_offset))[i];
2459 fprintf (stream, " %5d", -1);
2461 fprintf (stream, " %5zu",
2462 (offset - level3_offset) / sizeof (int32_t));
2463 if (i+1 < t.level2_size << t.q)
2464 fprintf (stream, ",");
2466 if (t.level2_size << t.q > 8)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " },\n");
/* level3 array: the signed deltas themselves, eight per row.  */
2469 fprintf (stream, " {");
2470 if (t.level3_size << t.p > 8)
2471 fprintf (stream, "\n ");
2472 for (i = 0; i < t.level3_size << t.p; i++)
2474 if (i > 0 && (i % 8) == 0)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2477 if (i+1 < t.level3_size << t.p)
2478 fprintf (stream, ",");
2480 if (t.level3_size << t.p > 8)
2481 fprintf (stream, "\n ");
2482 fprintf (stream, " }\n");
2483 fprintf (stream, "};\n");
/* fclose is only attempted when no stream error was recorded.  */
2485 if (ferror (stream) || fclose (stream))
2487 fprintf (stderr, "error writing to '%s'\n", filename);
2492 /* ========================================================================= */
2496 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  fill_properties uses an identifier as a bit
   position: it ORs (1ULL << identifier) into unicode_properties[] for each
   code point that has the property.
   NOTE(review): the enum header and several identifiers are not visible in
   this excerpt.  */
2505 PROP_QUOTATION_MARK,
2506 PROP_TERMINAL_PUNCTUATION,
2509 PROP_ASCII_HEX_DIGIT,
2510 PROP_OTHER_ALPHABETIC,
2514 PROP_OTHER_LOWERCASE,
2515 PROP_OTHER_UPPERCASE,
2516 PROP_NONCHARACTER_CODE_POINT,
2517 PROP_OTHER_GRAPHEME_EXTEND,
2518 PROP_IDS_BINARY_OPERATOR,
2519 PROP_IDS_TRINARY_OPERATOR,
2521 PROP_UNIFIED_IDEOGRAPH,
2522 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2525 PROP_LOGICAL_ORDER_EXCEPTION,
2526 PROP_OTHER_ID_START,
2527 PROP_OTHER_ID_CONTINUE,
2529 PROP_VARIATION_SELECTOR,
2530 PROP_PATTERN_WHITE_SPACE,
2531 PROP_PATTERN_SYNTAX,
2532 /* DerivedCoreProperties.txt */
2541 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2542 PROP_GRAPHEME_EXTEND,
/* Property bit set for every code point, indexed by code point.  Bit N is
   set by fill_properties when the code point has the property whose
   identifier is N; clear_properties resets all entries to 0.  */
2546 unsigned long long unicode_properties[0x110000];
/* Resets unicode_properties[] so that no code point has any property.  */
2549 clear_properties (void)
2553 for (i = 0; i < 0x110000; i++)
2554 unicode_properties[i] = 0;
/* Reads PROPLIST_FILENAME line by line.  Empty lines and lines starting
   with '#' are recognized separately.  Other lines must have the form
   "XXXX..YYYY ; Name" or "XXXX ; Name"; the name is translated to a
   property identifier and the corresponding bit is set in
   unicode_properties[] for every code point of the range.  Problems are
   reported on stderr.  Existing bits are kept, so the function can be
   called for several files in turn.
   NOTE(review): the actions taken after each error message, and the
   handling of end of input, are on lines not visible here.  */
2557 /* Stores in unicode_properties[] the properties from the
2558 PropList.txt or DerivedCoreProperties.txt file. */
2560 fill_properties (const char *proplist_filename)
2565 stream = fopen (proplist_filename, "r");
2568 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2575 unsigned int i1, i2;
2576 char padding[200+1];
2577 char propname[200+1];
2578 unsigned int propvalue;
/* Read one line of at most 200 characters; the trailing "\n" in the
   format also skips following whitespace.  */
2580 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty and comment lines carry no data.  */
2583 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  The %[
   conversions have no width, but buf holds at most 200 characters.  */
2586 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2588 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2590 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* PROP expands to one link of an if/else chain that compares propname
   with a property name and selects the matching identifier.  */
2595 #define PROP(name,value) \
2596 if (strcmp (propname, name) == 0) propvalue = value; else
2598 PROP ("White_Space", PROP_WHITE_SPACE)
2599 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2600 PROP ("Join_Control", PROP_JOIN_CONTROL)
2601 PROP ("Dash", PROP_DASH)
2602 PROP ("Hyphen", PROP_HYPHEN)
2603 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2604 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2605 PROP ("Other_Math", PROP_OTHER_MATH)
2606 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2607 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2608 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2609 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2610 PROP ("Diacritic", PROP_DIACRITIC)
2611 PROP ("Extender", PROP_EXTENDER)
2612 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2613 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2614 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2615 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2616 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2617 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2618 PROP ("Radical", PROP_RADICAL)
2619 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2620 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2621 PROP ("Deprecated", PROP_DEPRECATED)
2622 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2623 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2624 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2625 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2626 PROP ("STerm", PROP_STERM)
2627 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2628 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2629 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2630 /* DerivedCoreProperties.txt */
2631 PROP ("Math", PROP_MATH)
2632 PROP ("Alphabetic", PROP_ALPHABETIC)
2633 PROP ("Lowercase", PROP_LOWERCASE)
2634 PROP ("Uppercase", PROP_UPPERCASE)
2635 PROP ("ID_Start", PROP_ID_START)
2636 PROP ("ID_Continue", PROP_ID_CONTINUE)
2637 PROP ("XID_Start", PROP_XID_START)
2638 PROP ("XID_Continue", PROP_XID_CONTINUE)
2639 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2640 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2641 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2642 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
/* End of the chain: the name matched none of the known properties.  */
2645 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must stay inside the Unicode code
   space.  */
2649 if (!(i1 <= i2 && i2 < 0x110000))
/* Set the property bit for every code point of the range.  */
2652 for (i = i1; i <= i2; i++)
2653 unicode_properties[i] |= 1ULL << propvalue;
/* fclose is only attempted when no stream error was recorded.  */
2656 if (ferror (stream) || fclose (stream))
2658 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2663 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2666 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2672 for (i = 0; i < 0x110000; i++)
2675 stream = fopen (proplist_filename, "r");
2678 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2682 /* Search for the "Property dump for: ..." line. */
2685 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2687 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2691 while (strstr (buf, property_name) == NULL);
2695 unsigned int i1, i2;
2697 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2701 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2703 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2705 fprintf (stderr, "parse error in property in '%s'\n",
2710 else if (strlen (buf) >= 4)
2712 if (sscanf (buf, "%4X", &i1) < 1)
2714 fprintf (stderr, "parse error in property in '%s'\n",
2722 fprintf (stderr, "parse error in property in '%s'\n",
2726 if (!(i1 <= i2 && i2 < 0x110000))
2728 for (i = i1; i <= i2; i++)
2731 if (ferror (stream) || fclose (stream))
2733 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2738 /* Properties from Unicode 3.0 PropList.txt file. */
2740 /* The paired punctuation property from the PropList.txt file. */
2741 char unicode_pairedpunctuation[0x110000];
2743 /* The left of pair property from the PropList.txt file. */
2744 char unicode_leftofpair[0x110000];
2747 fill_properties30 (const char *proplist30_filename)
2749 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2750 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2753 /* ------------------------------------------------------------------------- */
2755 /* See PropList.txt, UCD.html. */
2757 is_property_white_space (unsigned int ch)
2759 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2762 /* See Unicode 3.0 book, section 4.10,
2763 PropList.txt, UCD.html,
2764 DerivedCoreProperties.txt, UCD.html. */
2766 is_property_alphabetic (unsigned int ch)
2770 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2771 /* For some reason, the following are listed as having property
2772 Alphabetic but not as having property Other_Alphabetic. */
2773 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2774 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2775 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2776 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2777 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2778 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2779 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2780 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2781 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2782 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2783 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2784 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2786 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2788 if (result1 != result2)
2793 /* See PropList.txt, UCD.html. */
2795 is_property_other_alphabetic (unsigned int ch)
2797 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2800 /* See PropList.txt, UCD.html. */
2802 is_property_not_a_character (unsigned int ch)
2804 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2807 /* See PropList.txt, UCD.html,
2808 DerivedCoreProperties.txt, UCD.html. */
2810 is_property_default_ignorable_code_point (unsigned int ch)
2813 (is_category_Cf (ch)
2814 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2815 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2816 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2817 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2819 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2821 if (result1 != result2)
2826 /* See PropList.txt, UCD.html. */
2828 is_property_other_default_ignorable_code_point (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2833 /* See PropList.txt, UCD.html. */
2835 is_property_deprecated (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2840 /* See PropList.txt, UCD.html. */
2842 is_property_logical_order_exception (unsigned int ch)
2844 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2847 /* See PropList.txt, UCD.html. */
2849 is_property_variation_selector (unsigned int ch)
2851 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Return true if CH lies in one of the three hard-coded private use ranges:
   U+E000..U+F8FF, U+F0000..U+FFFFD, U+100000..U+10FFFD.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  return (ch >= 0xE000 && ch <= 0xF8FF)
         || (ch >= 0xF0000 && ch <= 0xFFFFD)
         || (ch >= 0x100000 && ch <= 0x10FFFD);
}
/* Return true if is_category_Cn (CH) holds and CH is not flagged as a
   noncharacter by is_property_not_a_character.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  return is_category_Cn (ch) && !is_property_not_a_character (ch);
}
2871 /* See PropList.txt, UCD.html,
2872 DerivedCoreProperties.txt, UCD.html. */
2874 is_property_uppercase (unsigned int ch)
2878 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2880 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2882 if (result1 != result2)
2887 /* See PropList.txt, UCD.html. */
2889 is_property_other_uppercase (unsigned int ch)
2891 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2894 /* See PropList.txt, UCD.html,
2895 DerivedCoreProperties.txt, UCD.html. */
2897 is_property_lowercase (unsigned int ch)
2901 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2903 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2905 if (result1 != result2)
2910 /* See PropList.txt, UCD.html. */
2912 is_property_other_lowercase (unsigned int ch)
2914 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Return the result of is_category_Lt (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  return is_category_Lt (ch);
}
2924 /* See PropList.txt, UCD.html. */
2926 is_property_soft_dotted (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2931 /* See DerivedCoreProperties.txt, UCD.html. */
2933 is_property_id_start (unsigned int ch)
2935 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2938 /* See PropList.txt, UCD.html. */
2940 is_property_other_id_start (unsigned int ch)
2942 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2945 /* See DerivedCoreProperties.txt, UCD.html. */
2947 is_property_id_continue (unsigned int ch)
2949 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2952 /* See PropList.txt, UCD.html. */
2954 is_property_other_id_continue (unsigned int ch)
2956 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2959 /* See DerivedCoreProperties.txt, UCD.html. */
2961 is_property_xid_start (unsigned int ch)
2963 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2966 /* See DerivedCoreProperties.txt, UCD.html. */
2968 is_property_xid_continue (unsigned int ch)
2970 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2973 /* See PropList.txt, UCD.html. */
2975 is_property_pattern_white_space (unsigned int ch)
2977 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2980 /* See PropList.txt, UCD.html. */
2982 is_property_pattern_syntax (unsigned int ch)
2984 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2987 /* See PropList.txt, UCD.html. */
2989 is_property_join_control (unsigned int ch)
2991 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2994 /* See DerivedCoreProperties.txt, UCD.html. */
2996 is_property_grapheme_base (unsigned int ch)
2998 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3001 /* See DerivedCoreProperties.txt, UCD.html. */
3003 is_property_grapheme_extend (unsigned int ch)
3005 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3008 /* See PropList.txt, UCD.html. */
3010 is_property_other_grapheme_extend (unsigned int ch)
3012 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3015 /* See DerivedCoreProperties.txt, UCD.html. */
3017 is_property_grapheme_link (unsigned int ch)
3019 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3022 /* See PropList.txt, UCD.html. */
3024 is_property_bidi_control (unsigned int ch)
3026 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3029 /* See PropList-3.0.1.txt. */
3031 is_property_bidi_left_to_right (unsigned int ch)
3033 return (get_bidi_category (ch) == UC_BIDI_L);
3036 /* See PropList-3.0.1.txt. */
3038 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3040 return (get_bidi_category (ch) == UC_BIDI_R);
3043 /* See PropList-3.0.1.txt. */
3045 is_property_bidi_arabic_right_to_left (unsigned int ch)
3047 return (get_bidi_category (ch) == UC_BIDI_AL);
3050 /* See PropList-3.0.1.txt. */
3052 is_property_bidi_european_digit (unsigned int ch)
3054 return (get_bidi_category (ch) == UC_BIDI_EN);
3057 /* See PropList-3.0.1.txt. */
3059 is_property_bidi_eur_num_separator (unsigned int ch)
3061 return (get_bidi_category (ch) == UC_BIDI_ES);
3064 /* See PropList-3.0.1.txt. */
3066 is_property_bidi_eur_num_terminator (unsigned int ch)
3068 return (get_bidi_category (ch) == UC_BIDI_ET);
3071 /* See PropList-3.0.1.txt. */
3073 is_property_bidi_arabic_digit (unsigned int ch)
3075 return (get_bidi_category (ch) == UC_BIDI_AN);
3078 /* See PropList-3.0.1.txt. */
3080 is_property_bidi_common_separator (unsigned int ch)
3082 return (get_bidi_category (ch) == UC_BIDI_CS);
3085 /* See PropList-3.0.1.txt. */
3087 is_property_bidi_block_separator (unsigned int ch)
3089 return (get_bidi_category (ch) == UC_BIDI_B);
3092 /* See PropList-3.0.1.txt. */
3094 is_property_bidi_segment_separator (unsigned int ch)
3096 return (get_bidi_category (ch) == UC_BIDI_S);
3099 /* See PropList-3.0.1.txt. */
3101 is_property_bidi_whitespace (unsigned int ch)
3103 return (get_bidi_category (ch) == UC_BIDI_WS);
3106 /* See PropList-3.0.1.txt. */
3108 is_property_bidi_non_spacing_mark (unsigned int ch)
3110 return (get_bidi_category (ch) == UC_BIDI_NSM);
3113 /* See PropList-3.0.1.txt. */
3115 is_property_bidi_boundary_neutral (unsigned int ch)
3117 return (get_bidi_category (ch) == UC_BIDI_BN);
3120 /* See PropList-3.0.1.txt. */
3122 is_property_bidi_pdf (unsigned int ch)
3124 return (get_bidi_category (ch) == UC_BIDI_PDF);
3127 /* See PropList-3.0.1.txt. */
3129 is_property_bidi_embedding_or_override (unsigned int ch)
3131 int category = get_bidi_category (ch);
3132 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3133 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3136 /* See PropList-3.0.1.txt. */
3138 is_property_bidi_other_neutral (unsigned int ch)
3140 return (get_bidi_category (ch) == UC_BIDI_ON);
3143 /* See PropList.txt, UCD.html. */
3145 is_property_hex_digit (unsigned int ch)
3147 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3150 /* See PropList.txt, UCD.html. */
3152 is_property_ascii_hex_digit (unsigned int ch)
3154 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3157 /* See Unicode 3.0 book, section 4.10,
3158 PropList.txt, UCD.html. */
3160 is_property_ideographic (unsigned int ch)
3162 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3165 /* See PropList.txt, UCD.html. */
3167 is_property_unified_ideograph (unsigned int ch)
3169 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3172 /* See PropList.txt, UCD.html. */
3174 is_property_radical (unsigned int ch)
3176 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3179 /* See PropList.txt, UCD.html. */
3181 is_property_ids_binary_operator (unsigned int ch)
3183 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3186 /* See PropList.txt, UCD.html. */
3188 is_property_ids_trinary_operator (unsigned int ch)
3190 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3193 /* See PropList-3.0.1.txt. */
3195 is_property_zero_width (unsigned int ch)
3197 return is_category_Cf (ch)
3198 || (unicode_attributes[ch].name != NULL
3199 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Return the result of is_category_Zs (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  return is_category_Zs (ch);
}
/* Return true if CH is one of the hard-coded non-breaking characters
   listed below.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* NOTE(review): the original comment here ("This is exactly the set of
     characters having line breaking ...") is cut off; presumably it names
     the line breaking class GL (glue) -- confirm against LineBreak.txt.  */
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3233 /* See PropList-3.0.1.txt. */
3235 is_property_iso_control (unsigned int ch)
3238 (unicode_attributes[ch].name != NULL
3239 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3241 is_category_Cc (ch);
3243 if (result1 != result2)
3248 /* See PropList-3.0.1.txt. */
3250 is_property_format_control (unsigned int ch)
3252 return (is_category_Cf (ch)
3253 && get_bidi_category (ch) == UC_BIDI_BN
3254 && !is_property_join_control (ch)
3258 /* See PropList.txt, UCD.html. */
3260 is_property_dash (unsigned int ch)
3262 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3265 /* See PropList.txt, UCD.html. */
3267 is_property_hyphen (unsigned int ch)
3269 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Return the result of is_category_P (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  return is_category_P (ch);
}
/* Return the result of is_category_Zl (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  return is_category_Zl (ch);
}
/* Return the result of is_category_Zp (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  return is_category_Zp (ch);
}
3293 /* See PropList.txt, UCD.html. */
3295 is_property_quotation_mark (unsigned int ch)
3297 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3300 /* See PropList.txt, UCD.html. */
3302 is_property_sentence_terminal (unsigned int ch)
3304 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3307 /* See PropList.txt, UCD.html. */
3309 is_property_terminal_punctuation (unsigned int ch)
3311 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Return the result of is_category_Sc (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  return is_category_Sc (ch);
}
3321 /* See Unicode 3.0 book, section 4.9,
3322 PropList.txt, UCD.html,
3323 DerivedCoreProperties.txt, UCD.html. */
3325 is_property_math (unsigned int ch)
3329 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3331 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3333 if (result1 != result2)
3338 /* See PropList.txt, UCD.html. */
3340 is_property_other_math (unsigned int ch)
3342 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3345 /* See PropList-3.0.1.txt. */
3347 is_property_paired_punctuation (unsigned int ch)
3349 return unicode_pairedpunctuation[ch];
3352 /* See PropList-3.0.1.txt. */
3354 is_property_left_of_pair (unsigned int ch)
3356 return unicode_leftofpair[ch];
3359 /* See PropList-3.0.1.txt. */
3361 is_property_combining (unsigned int ch)
3363 return (unicode_attributes[ch].name != NULL
3364 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3365 || is_category_Mc (ch)
3366 || is_category_Me (ch)
3367 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* Return true if CH has a character name and its bidi category is
   UC_BIDI_NSM.  Disabled; kept for reference only.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  return (unicode_attributes[ch].name != NULL
          && get_bidi_category (ch) == UC_BIDI_NSM);
}
#endif
3380 /* See PropList-3.0.1.txt. */
3382 is_property_composite (unsigned int ch)
3384 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3385 logical in some sense. */
3386 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3388 if (unicode_attributes[ch].name != NULL
3389 && unicode_attributes[ch].decomposition != NULL)
3391 /* Test whether the decomposition contains more than one character,
3392 and the first is not a space. */
3393 const char *decomp = unicode_attributes[ch].decomposition;
3394 if (decomp[0] == '<')
3396 decomp = strchr (decomp, '>') + 1;
3397 if (decomp[0] == ' ')
3400 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Return the result of is_category_Nd (CH).
   See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  return is_category_Nd (ch);
}
3412 /* See PropList-3.0.1.txt. */
3414 is_property_numeric (unsigned int ch)
3416 return ((get_numeric_value (ch)).denominator > 0)
3417 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3418 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3421 /* See PropList.txt, UCD.html. */
3423 is_property_diacritic (unsigned int ch)
3425 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3428 /* See PropList.txt, UCD.html. */
3430 is_property_extender (unsigned int ch)
3432 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3435 /* See PropList-3.0.1.txt. */
3437 is_property_ignorable_control (unsigned int ch)
3439 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3440 || is_category_Cf (ch))
3444 /* ------------------------------------------------------------------------- */
3446 /* Output all properties. */
3448 output_properties (const char *version)
3450 #define PROPERTY(P) \
3451 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3452 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3453 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3454 PROPERTY(white_space)
3455 PROPERTY(alphabetic)
3456 PROPERTY(other_alphabetic)
3457 PROPERTY(not_a_character)
3458 PROPERTY(default_ignorable_code_point)
3459 PROPERTY(other_default_ignorable_code_point)
3460 PROPERTY(deprecated)
3461 PROPERTY(logical_order_exception)
3462 PROPERTY(variation_selector)
3463 PROPERTY(private_use)
3464 PROPERTY(unassigned_code_value)
3466 PROPERTY(other_uppercase)
3468 PROPERTY(other_lowercase)
3470 PROPERTY(soft_dotted)
3472 PROPERTY(other_id_start)
3473 PROPERTY(id_continue)
3474 PROPERTY(other_id_continue)
3476 PROPERTY(xid_continue)
3477 PROPERTY(pattern_white_space)
3478 PROPERTY(pattern_syntax)
3479 PROPERTY(join_control)
3480 PROPERTY(grapheme_base)
3481 PROPERTY(grapheme_extend)
3482 PROPERTY(other_grapheme_extend)
3483 PROPERTY(grapheme_link)
3484 PROPERTY(bidi_control)
3485 PROPERTY(bidi_left_to_right)
3486 PROPERTY(bidi_hebrew_right_to_left)
3487 PROPERTY(bidi_arabic_right_to_left)
3488 PROPERTY(bidi_european_digit)
3489 PROPERTY(bidi_eur_num_separator)
3490 PROPERTY(bidi_eur_num_terminator)
3491 PROPERTY(bidi_arabic_digit)
3492 PROPERTY(bidi_common_separator)
3493 PROPERTY(bidi_block_separator)
3494 PROPERTY(bidi_segment_separator)
3495 PROPERTY(bidi_whitespace)
3496 PROPERTY(bidi_non_spacing_mark)
3497 PROPERTY(bidi_boundary_neutral)
3499 PROPERTY(bidi_embedding_or_override)
3500 PROPERTY(bidi_other_neutral)
3502 PROPERTY(ascii_hex_digit)
3503 PROPERTY(ideographic)
3504 PROPERTY(unified_ideograph)
3506 PROPERTY(ids_binary_operator)
3507 PROPERTY(ids_trinary_operator)
3508 PROPERTY(zero_width)
3511 PROPERTY(iso_control)
3512 PROPERTY(format_control)
3515 PROPERTY(punctuation)
3516 PROPERTY(line_separator)
3517 PROPERTY(paragraph_separator)
3518 PROPERTY(quotation_mark)
3519 PROPERTY(sentence_terminal)
3520 PROPERTY(terminal_punctuation)
3521 PROPERTY(currency_symbol)
3523 PROPERTY(other_math)
3524 PROPERTY(paired_punctuation)
3525 PROPERTY(left_of_pair)
3528 PROPERTY(decimal_digit)
3532 PROPERTY(ignorable_control)
3536 /* ========================================================================= */
3540 static const char *scripts[256];
3541 static unsigned int numscripts;
3543 static uint8_t unicode_scripts[0x110000];
3546 fill_scripts (const char *scripts_filename)
3551 stream = fopen (scripts_filename, "r");
3554 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
3560 for (i = 0; i < 0x110000; i++)
3561 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3566 unsigned int i1, i2;
3567 char padding[200+1];
3568 char scriptname[200+1];
3571 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3574 if (buf[0] == '\0' || buf[0] == '#')
3577 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3579 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3581 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
3591 for (script = numscripts - 1; script >= 0; script--)
3592 if (strcmp (scripts[script], scriptname) == 0)
3596 scripts[numscripts] = strdup (scriptname);
3597 script = numscripts;
3599 if (numscripts == 256)
3603 for (i = i1; i <= i2; i++)
3605 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3606 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3607 unicode_scripts[i] = script;
3611 if (ferror (stream) || fclose (stream))
3613 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3618 /* Construction of sparse 3-level tables. */
3619 #define TABLE script_table
3620 #define ELEMENT uint8_t
3621 #define DEFAULT (uint8_t)~(uint8_t)0
3622 #define xmalloc malloc
3623 #define xrealloc realloc
3627 output_scripts (const char *version)
3629 const char *filename = "unictype/scripts.h";
3631 unsigned int ch, s, i;
3632 struct script_table t;
3633 unsigned int level1_offset, level2_offset, level3_offset;
3637 const char *lowercase_name;
3640 scriptinfo_t scriptinfo[256];
3642 stream = fopen (filename, "w");
3645 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3649 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3650 fprintf (stream, "/* Unicode scripts. */\n");
3651 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3654 for (s = 0; s < numscripts; s++)
3656 char *lcp = strdup (scripts[s]);
3659 for (cp = lcp; *cp != '\0'; cp++)
3660 if (*cp >= 'A' && *cp <= 'Z')
3663 scriptinfo[s].lowercase_name = lcp;
3666 for (s = 0; s < numscripts; s++)
3668 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3669 scriptinfo[s].lowercase_name);
3670 fprintf (stream, "{\n");
3672 for (ch = 0; ch < 0x110000; ch++)
3673 if (unicode_scripts[ch] == s)
3679 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3684 fprintf (stream, ",\n");
3686 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3688 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3692 fprintf (stream, "\n");
3693 fprintf (stream, "};\n");
3696 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3697 fprintf (stream, "{\n");
3698 for (s = 0; s < numscripts; s++)
3700 fprintf (stream, " {\n");
3701 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3702 scriptinfo[s].lowercase_name);
3703 fprintf (stream, " script_%s_intervals,\n",
3704 scriptinfo[s].lowercase_name);
3705 fprintf (stream, " \"%s\"\n", scripts[s]);
3706 fprintf (stream, " }");
3707 if (s+1 < numscripts)
3708 fprintf (stream, ",");
3709 fprintf (stream, "\n");
3711 fprintf (stream, "};\n");
3715 script_table_init (&t);
3717 for (ch = 0; ch < 0x110000; ch++)
3719 unsigned int s = unicode_scripts[ch];
3720 if (s != (uint8_t)~(uint8_t)0)
3721 script_table_add (&t, ch, s);
3724 script_table_finalize (&t);
3726 /* Offsets in t.result, in memory of this process. */
3728 5 * sizeof (uint32_t);
3730 5 * sizeof (uint32_t)
3731 + t.level1_size * sizeof (uint32_t);
3733 5 * sizeof (uint32_t)
3734 + t.level1_size * sizeof (uint32_t)
3735 + (t.level2_size << t.q) * sizeof (uint32_t);
3737 for (i = 0; i < 5; i++)
3738 fprintf (stream, "#define script_header_%d %d\n", i,
3739 ((uint32_t *) t.result)[i]);
3740 fprintf (stream, "static const\n");
3741 fprintf (stream, "struct\n");
3742 fprintf (stream, " {\n");
3743 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3744 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3745 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3746 fprintf (stream, " }\n");
3747 fprintf (stream, "u_script =\n");
3748 fprintf (stream, "{\n");
3749 fprintf (stream, " {");
3750 if (t.level1_size > 8)
3751 fprintf (stream, "\n ");
3752 for (i = 0; i < t.level1_size; i++)
3755 if (i > 0 && (i % 8) == 0)
3756 fprintf (stream, "\n ");
3757 offset = ((uint32_t *) (t.result + level1_offset))[i];
3759 fprintf (stream, " %5d", -1);
3761 fprintf (stream, " %5zu",
3762 (offset - level2_offset) / sizeof (uint32_t));
3763 if (i+1 < t.level1_size)
3764 fprintf (stream, ",");
3766 if (t.level1_size > 8)
3767 fprintf (stream, "\n ");
3768 fprintf (stream, " },\n");
3769 fprintf (stream, " {");
3770 if (t.level2_size << t.q > 8)
3771 fprintf (stream, "\n ");
3772 for (i = 0; i < t.level2_size << t.q; i++)
3775 if (i > 0 && (i % 8) == 0)
3776 fprintf (stream, "\n ");
3777 offset = ((uint32_t *) (t.result + level2_offset))[i];
3779 fprintf (stream, " %5d", -1);
3781 fprintf (stream, " %5zu",
3782 (offset - level3_offset) / sizeof (uint8_t));
3783 if (i+1 < t.level2_size << t.q)
3784 fprintf (stream, ",");
3786 if (t.level2_size << t.q > 8)
3787 fprintf (stream, "\n ");
3788 fprintf (stream, " },\n");
3789 fprintf (stream, " {");
3790 if (t.level3_size << t.p > 8)
3791 fprintf (stream, "\n ");
3792 for (i = 0; i < t.level3_size << t.p; i++)
3794 if (i > 0 && (i % 8) == 0)
3795 fprintf (stream, "\n ");
3796 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3797 if (i+1 < t.level3_size << t.p)
3798 fprintf (stream, ",");
3800 if (t.level3_size << t.p > 8)
3801 fprintf (stream, "\n ");
3802 fprintf (stream, " }\n");
3803 fprintf (stream, "};\n");
3805 if (ferror (stream) || fclose (stream))
3807 fprintf (stderr, "error writing to '%s'\n", filename);
3813 output_scripts_byname (const char *version)
3815 const char *filename = "unictype/scripts_byname.gperf";
3819 stream = fopen (filename, "w");
3822 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3826 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3827 fprintf (stream, "/* Unicode scripts. */\n");
3828 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3830 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3831 fprintf (stream, "%%struct-type\n");
3832 fprintf (stream, "%%language=ANSI-C\n");
3833 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3834 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3835 fprintf (stream, "%%readonly-tables\n");
3836 fprintf (stream, "%%global-table\n");
3837 fprintf (stream, "%%define word-array-name script_names\n");
3838 fprintf (stream, "%%%%\n");
3839 for (s = 0; s < numscripts; s++)
3840 fprintf (stream, "%s, %u\n", scripts[s], s);
3842 if (ferror (stream) || fclose (stream))
3844 fprintf (stderr, "error writing to '%s'\n", filename);
3849 /* ========================================================================= */
3853 typedef struct { unsigned int start; unsigned int end; const char *name; }
3855 static block_t blocks[256];
3856 static unsigned int numblocks;
3859 fill_blocks (const char *blocks_filename)
3863 stream = fopen (blocks_filename, "r");
3866 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3873 unsigned int i1, i2;
3874 char padding[200+1];
3875 char blockname[200+1];
3877 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3880 if (buf[0] == '\0' || buf[0] == '#')
3883 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3885 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3888 blocks[numblocks].start = i1;
3889 blocks[numblocks].end = i2;
3890 blocks[numblocks].name = strdup (blockname);
3891 /* It must be sorted. */
3892 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3895 if (numblocks == 256)
3899 if (ferror (stream) || fclose (stream))
3901 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3906 /* Return the smallest block index among the blocks for characters >= ch. */
3908 block_first_index (unsigned int ch)
3910 /* Binary search. */
3911 unsigned int lo = 0;
3912 unsigned int hi = numblocks;
3914 All blocks[i], i < lo, have blocks[i].end < ch,
3915 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3918 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3919 if (blocks[mid].end < ch)
3927 /* Return the largest block index among the blocks for characters <= ch,
3930 block_last_index (unsigned int ch)
3932 /* Binary search. */
3933 unsigned int lo = 0;
3934 unsigned int hi = numblocks;
3936 All blocks[i], i < lo, have blocks[i].start <= ch,
3937 all blocks[i], i >= hi, have blocks[i].start > ch. */
3940 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3941 if (blocks[mid].start <= ch)
3950 output_blocks (const char *version)
3952 const char *filename = "unictype/blocks.h";
3953 const unsigned int shift = 8; /* bits to shift away for array access */
3954 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3959 stream = fopen (filename, "w");
3962 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3966 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3967 fprintf (stream, "/* Unicode blocks. */\n");
3968 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3971 fprintf (stream, "static const uc_block_t blocks[] =\n");
3972 fprintf (stream, "{\n");
3973 for (i = 0; i < numblocks; i++)
3975 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3976 blocks[i].end, blocks[i].name);
3977 if (i+1 < numblocks)
3978 fprintf (stream, ",");
3979 fprintf (stream, "\n");
3981 fprintf (stream, "};\n");
3982 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3983 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3984 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3985 threshold >> shift);
3986 fprintf (stream, "{\n");
3987 for (i1 = 0; i1 < (threshold >> shift); i1++)
3989 unsigned int first_index = block_first_index (i1 << shift);
3990 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3991 fprintf (stream, " %3d, %3d", first_index, last_index);
3992 if (i1+1 < (threshold >> shift))
3993 fprintf (stream, ",");
3994 fprintf (stream, "\n");
3996 fprintf (stream, "};\n");
3997 fprintf (stream, "#define blocks_upper_first_index %d\n",
3998 block_first_index (threshold));
3999 fprintf (stream, "#define blocks_upper_last_index %d\n",
4000 block_last_index (0x10FFFF));
4002 if (ferror (stream) || fclose (stream))
4004 fprintf (stderr, "error writing to '%s'\n", filename);
4009 /* ========================================================================= */
4011 /* C and Java syntax. */
/* Syntactic role of a character inside an identifier.  The numeric values
   0..3 are what the generated identifier-syntax tables store.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters of C source text:
   space, horizontal tab, new-line (LF or CR), vertical tab, form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
4032 /* ISO C 99 section 6.4.2.1 and appendix D. */
4034 c_ident_category (unsigned int ch)
4036 /* Section 6.4.2.1. */
4037 if (ch >= '0' && ch <= '9')
4038 return UC_IDENTIFIER_VALID;
4039 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4040 return UC_IDENTIFIER_START;
4046 || (ch >= 0x00C0 && ch <= 0x00D6)
4047 || (ch >= 0x00D8 && ch <= 0x00F6)
4048 || (ch >= 0x00F8 && ch <= 0x01F5)
4049 || (ch >= 0x01FA && ch <= 0x0217)
4050 || (ch >= 0x0250 && ch <= 0x02A8)
4051 || (ch >= 0x1E00 && ch <= 0x1E9B)
4052 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4056 || (ch >= 0x0388 && ch <= 0x038A)
4058 || (ch >= 0x038E && ch <= 0x03A1)
4059 || (ch >= 0x03A3 && ch <= 0x03CE)
4060 || (ch >= 0x03D0 && ch <= 0x03D6)
4065 || (ch >= 0x03E2 && ch <= 0x03F3)
4066 || (ch >= 0x1F00 && ch <= 0x1F15)
4067 || (ch >= 0x1F18 && ch <= 0x1F1D)
4068 || (ch >= 0x1F20 && ch <= 0x1F45)
4069 || (ch >= 0x1F48 && ch <= 0x1F4D)
4070 || (ch >= 0x1F50 && ch <= 0x1F57)
4074 || (ch >= 0x1F5F && ch <= 0x1F7D)
4075 || (ch >= 0x1F80 && ch <= 0x1FB4)
4076 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4077 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4078 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4079 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4080 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4081 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4082 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4083 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4085 || (ch >= 0x0401 && ch <= 0x040C)
4086 || (ch >= 0x040E && ch <= 0x044F)
4087 || (ch >= 0x0451 && ch <= 0x045C)
4088 || (ch >= 0x045E && ch <= 0x0481)
4089 || (ch >= 0x0490 && ch <= 0x04C4)
4090 || (ch >= 0x04C7 && ch <= 0x04C8)
4091 || (ch >= 0x04CB && ch <= 0x04CC)
4092 || (ch >= 0x04D0 && ch <= 0x04EB)
4093 || (ch >= 0x04EE && ch <= 0x04F5)
4094 || (ch >= 0x04F8 && ch <= 0x04F9)
4096 || (ch >= 0x0531 && ch <= 0x0556)
4097 || (ch >= 0x0561 && ch <= 0x0587)
4099 || (ch >= 0x05B0 && ch <= 0x05B9)
4100 || (ch >= 0x05BB && ch <= 0x05BD)
4102 || (ch >= 0x05C1 && ch <= 0x05C2)
4103 || (ch >= 0x05D0 && ch <= 0x05EA)
4104 || (ch >= 0x05F0 && ch <= 0x05F2)
4106 || (ch >= 0x0621 && ch <= 0x063A)
4107 || (ch >= 0x0640 && ch <= 0x0652)
4108 || (ch >= 0x0670 && ch <= 0x06B7)
4109 || (ch >= 0x06BA && ch <= 0x06BE)
4110 || (ch >= 0x06C0 && ch <= 0x06CE)
4111 || (ch >= 0x06D0 && ch <= 0x06DC)
4112 || (ch >= 0x06E5 && ch <= 0x06E8)
4113 || (ch >= 0x06EA && ch <= 0x06ED)
4115 || (ch >= 0x0901 && ch <= 0x0903)
4116 || (ch >= 0x0905 && ch <= 0x0939)
4117 || (ch >= 0x093E && ch <= 0x094D)
4118 || (ch >= 0x0950 && ch <= 0x0952)
4119 || (ch >= 0x0958 && ch <= 0x0963)
4121 || (ch >= 0x0981 && ch <= 0x0983)
4122 || (ch >= 0x0985 && ch <= 0x098C)
4123 || (ch >= 0x098F && ch <= 0x0990)
4124 || (ch >= 0x0993 && ch <= 0x09A8)
4125 || (ch >= 0x09AA && ch <= 0x09B0)
4127 || (ch >= 0x09B6 && ch <= 0x09B9)
4128 || (ch >= 0x09BE && ch <= 0x09C4)
4129 || (ch >= 0x09C7 && ch <= 0x09C8)
4130 || (ch >= 0x09CB && ch <= 0x09CD)
4131 || (ch >= 0x09DC && ch <= 0x09DD)
4132 || (ch >= 0x09DF && ch <= 0x09E3)
4133 || (ch >= 0x09F0 && ch <= 0x09F1)
4136 || (ch >= 0x0A05 && ch <= 0x0A0A)
4137 || (ch >= 0x0A0F && ch <= 0x0A10)
4138 || (ch >= 0x0A13 && ch <= 0x0A28)
4139 || (ch >= 0x0A2A && ch <= 0x0A30)
4140 || (ch >= 0x0A32 && ch <= 0x0A33)
4141 || (ch >= 0x0A35 && ch <= 0x0A36)
4142 || (ch >= 0x0A38 && ch <= 0x0A39)
4143 || (ch >= 0x0A3E && ch <= 0x0A42)
4144 || (ch >= 0x0A47 && ch <= 0x0A48)
4145 || (ch >= 0x0A4B && ch <= 0x0A4D)
4146 || (ch >= 0x0A59 && ch <= 0x0A5C)
4150 || (ch >= 0x0A81 && ch <= 0x0A83)
4151 || (ch >= 0x0A85 && ch <= 0x0A8B)
4153 || (ch >= 0x0A8F && ch <= 0x0A91)
4154 || (ch >= 0x0A93 && ch <= 0x0AA8)
4155 || (ch >= 0x0AAA && ch <= 0x0AB0)
4156 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4157 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4158 || (ch >= 0x0ABD && ch <= 0x0AC5)
4159 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4160 || (ch >= 0x0ACB && ch <= 0x0ACD)
4164 || (ch >= 0x0B01 && ch <= 0x0B03)
4165 || (ch >= 0x0B05 && ch <= 0x0B0C)
4166 || (ch >= 0x0B0F && ch <= 0x0B10)
4167 || (ch >= 0x0B13 && ch <= 0x0B28)
4168 || (ch >= 0x0B2A && ch <= 0x0B30)
4169 || (ch >= 0x0B32 && ch <= 0x0B33)
4170 || (ch >= 0x0B36 && ch <= 0x0B39)
4171 || (ch >= 0x0B3E && ch <= 0x0B43)
4172 || (ch >= 0x0B47 && ch <= 0x0B48)
4173 || (ch >= 0x0B4B && ch <= 0x0B4D)
4174 || (ch >= 0x0B5C && ch <= 0x0B5D)
4175 || (ch >= 0x0B5F && ch <= 0x0B61)
4177 || (ch >= 0x0B82 && ch <= 0x0B83)
4178 || (ch >= 0x0B85 && ch <= 0x0B8A)
4179 || (ch >= 0x0B8E && ch <= 0x0B90)
4180 || (ch >= 0x0B92 && ch <= 0x0B95)
4181 || (ch >= 0x0B99 && ch <= 0x0B9A)
4183 || (ch >= 0x0B9E && ch <= 0x0B9F)
4184 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4185 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4186 || (ch >= 0x0BAE && ch <= 0x0BB5)
4187 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4188 || (ch >= 0x0BBE && ch <= 0x0BC2)
4189 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4190 || (ch >= 0x0BCA && ch <= 0x0BCD)
4192 || (ch >= 0x0C01 && ch <= 0x0C03)
4193 || (ch >= 0x0C05 && ch <= 0x0C0C)
4194 || (ch >= 0x0C0E && ch <= 0x0C10)
4195 || (ch >= 0x0C12 && ch <= 0x0C28)
4196 || (ch >= 0x0C2A && ch <= 0x0C33)
4197 || (ch >= 0x0C35 && ch <= 0x0C39)
4198 || (ch >= 0x0C3E && ch <= 0x0C44)
4199 || (ch >= 0x0C46 && ch <= 0x0C48)
4200 || (ch >= 0x0C4A && ch <= 0x0C4D)
4201 || (ch >= 0x0C60 && ch <= 0x0C61)
4203 || (ch >= 0x0C82 && ch <= 0x0C83)
4204 || (ch >= 0x0C85 && ch <= 0x0C8C)
4205 || (ch >= 0x0C8E && ch <= 0x0C90)
4206 || (ch >= 0x0C92 && ch <= 0x0CA8)
4207 || (ch >= 0x0CAA && ch <= 0x0CB3)
4208 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4209 || (ch >= 0x0CBE && ch <= 0x0CC4)
4210 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4211 || (ch >= 0x0CCA && ch <= 0x0CCD)
4213 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4215 || (ch >= 0x0D02 && ch <= 0x0D03)
4216 || (ch >= 0x0D05 && ch <= 0x0D0C)
4217 || (ch >= 0x0D0E && ch <= 0x0D10)
4218 || (ch >= 0x0D12 && ch <= 0x0D28)
4219 || (ch >= 0x0D2A && ch <= 0x0D39)
4220 || (ch >= 0x0D3E && ch <= 0x0D43)
4221 || (ch >= 0x0D46 && ch <= 0x0D48)
4222 || (ch >= 0x0D4A && ch <= 0x0D4D)
4223 || (ch >= 0x0D60 && ch <= 0x0D61)
4225 || (ch >= 0x0E01 && ch <= 0x0E3A)
4226 || (ch >= 0x0E40 && ch <= 0x0E5B)
4228 || (ch >= 0x0E81 && ch <= 0x0E82)
4230 || (ch >= 0x0E87 && ch <= 0x0E88)
4233 || (ch >= 0x0E94 && ch <= 0x0E97)
4234 || (ch >= 0x0E99 && ch <= 0x0E9F)
4235 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4238 || (ch >= 0x0EAA && ch <= 0x0EAB)
4239 || (ch >= 0x0EAD && ch <= 0x0EAE)
4240 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4241 || (ch >= 0x0EBB && ch <= 0x0EBD)
4242 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4244 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4245 || (ch >= 0x0EDC && ch <= 0x0EDD)
4248 || (ch >= 0x0F18 && ch <= 0x0F19)
4252 || (ch >= 0x0F3E && ch <= 0x0F47)
4253 || (ch >= 0x0F49 && ch <= 0x0F69)
4254 || (ch >= 0x0F71 && ch <= 0x0F84)
4255 || (ch >= 0x0F86 && ch <= 0x0F8B)
4256 || (ch >= 0x0F90 && ch <= 0x0F95)
4258 || (ch >= 0x0F99 && ch <= 0x0FAD)
4259 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4262 || (ch >= 0x10A0 && ch <= 0x10C5)
4263 || (ch >= 0x10D0 && ch <= 0x10F6)
4265 || (ch >= 0x3041 && ch <= 0x3093)
4266 || (ch >= 0x309B && ch <= 0x309C)
4268 || (ch >= 0x30A1 && ch <= 0x30F6)
4269 || (ch >= 0x30FB && ch <= 0x30FC)
4271 || (ch >= 0x3105 && ch <= 0x312C)
4272 /* CJK Unified Ideographs */
4273 || (ch >= 0x4E00 && ch <= 0x9FA5)
4275 || (ch >= 0xAC00 && ch <= 0xD7A3)
4277 || (ch >= 0x0660 && ch <= 0x0669)
4278 || (ch >= 0x06F0 && ch <= 0x06F9)
4279 || (ch >= 0x0966 && ch <= 0x096F)
4280 || (ch >= 0x09E6 && ch <= 0x09EF)
4281 || (ch >= 0x0A66 && ch <= 0x0A6F)
4282 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4283 || (ch >= 0x0B66 && ch <= 0x0B6F)
4284 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4285 || (ch >= 0x0C66 && ch <= 0x0C6F)
4286 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4287 || (ch >= 0x0D66 && ch <= 0x0D6F)
4288 || (ch >= 0x0E50 && ch <= 0x0E59)
4289 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4290 || (ch >= 0x0F20 && ch <= 0x0F33)
4291 /* Special characters */
4294 || (ch >= 0x02B0 && ch <= 0x02B8)
4296 || (ch >= 0x02BD && ch <= 0x02C1)
4297 || (ch >= 0x02D0 && ch <= 0x02D1)
4298 || (ch >= 0x02E0 && ch <= 0x02E4)
4304 || (ch >= 0x203F && ch <= 0x2040)
4307 || (ch >= 0x210A && ch <= 0x2113)
4309 || (ch >= 0x2118 && ch <= 0x211D)
4313 || (ch >= 0x212A && ch <= 0x2131)
4314 || (ch >= 0x2133 && ch <= 0x2138)
4315 || (ch >= 0x2160 && ch <= 0x2182)
4316 || (ch >= 0x3005 && ch <= 0x3007)
4317 || (ch >= 0x3021 && ch <= 0x3029)
4319 return UC_IDENTIFIER_START;
4320 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is white space in Java source text: space, horizontal
   tab, form feed, LF or CR.  Unlike in C, vertical tab is not included.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
4332 /* The Java Language Specification, 3rd edition, §3.8.
4333 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4334 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4336 java_ident_category (unsigned int ch)
4338 /* FIXME: Check this against Sun's JDK implementation. */
4339 if (is_category_L (ch) /* = Character.isLetter(ch) */
4340 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4341 || is_category_Sc (ch) /* currency symbol */
4342 || is_category_Pc (ch) /* connector punctuation */
4344 return UC_IDENTIFIER_START;
4345 if (is_category_Nd (ch) /* digit */
4346 || is_category_Mc (ch) /* combining mark */
4347 || is_category_Mn (ch) /* non-spacing mark */
4349 return UC_IDENTIFIER_VALID;
4350 if ((ch >= 0x0000 && ch <= 0x0008)
4351 || (ch >= 0x000E && ch <= 0x001B)
4352 || (ch >= 0x007F && ch <= 0x009F)
4353 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4355 return UC_IDENTIFIER_IGNORABLE;
4356 return UC_IDENTIFIER_INVALID;
/* Construction of sparse 3-level tables.
   The macros below select the table type name, its element type, the
   value reported for entries that were never added, and the allocation
   functions to use.
   NOTE(review): the identsyntax_table_* functions used further down are
   presumably generated by a template header that must be included right
   after these definitions; no such #include line is present here --
   confirm against the upstream file.  */
#define TABLE identsyntax_table
#define ELEMENT uint8_t
#define DEFAULT UC_IDENTIFIER_INVALID
#define xmalloc malloc
#define xrealloc realloc
4367 /* Output an identifier syntax categorization in a three-level bitmap.
   FILENAME is the file to create, PREDICATE maps a code point to one of the
   UC_IDENTIFIER_* values, NAME is the identifier of the emitted table
   object and VERSION the Unicode version string for the file header.
   Every code point below 0x110000 whose category differs from
   UC_IDENTIFIER_INVALID is entered into an identsyntax_table; the table is
   then printed as five header macros plus a struct holding the level1,
   level2 and (bit-packed) level3 arrays.
   NOTE(review): the inner line numbers of this listing have gaps, so
   declarations, braces, conditions and the table parameters are missing
   from it -- restore them from the upstream file before compiling.  */
4369 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4373 struct identsyntax_table t;
4374 unsigned int level1_offset, level2_offset, level3_offset;
4376 stream = fopen (filename, "w");
4379 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4383 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4384 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4385 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4390 identsyntax_table_init (&t);
/* Classify every code point; only non-default entries are stored.  */
4392 for (ch = 0; ch < 0x110000; ch++)
4394 int syntaxcode = predicate (ch);
4395 if (syntaxcode != UC_IDENTIFIER_INVALID)
4396 identsyntax_table_add (&t, ch, syntaxcode);
4399 identsyntax_table_finalize (&t);
4401 /* Offsets in t.result, in memory of this process. */
/* The three expressions below skip a header of five uint32_t words, then
   additionally the level1 array, then additionally the level2 array.
   NOTE(review): their assignment targets are not shown in this listing;
   presumably level1_offset, level2_offset and level3_offset -- confirm.  */
4403 5 * sizeof (uint32_t);
4405 5 * sizeof (uint32_t)
4406 + t.level1_size * sizeof (uint32_t);
4408 5 * sizeof (uint32_t)
4409 + t.level1_size * sizeof (uint32_t)
4410 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become header macros.  */
4412 for (i = 0; i < 5; i++)
4413 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4414 ((uint32_t *) t.result)[i]);
4415 fprintf (stream, "static const\n");
4416 fprintf (stream, "struct\n");
4417 fprintf (stream, " {\n");
4418 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4419 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4420 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4421 (1 << t.p) * 2 / 16);
4422 fprintf (stream, " }\n");
4423 fprintf (stream, "%s =\n", name);
4424 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry is printed either as
   -1 or as its offset relative to level2_offset, in uint32_t units.  */
4425 fprintf (stream, " {");
4426 if (t.level1_size > 8)
4427 fprintf (stream, "\n ");
4428 for (i = 0; i < t.level1_size; i++)
4431 if (i > 0 && (i % 8) == 0)
4432 fprintf (stream, "\n ");
4433 offset = ((uint32_t *) (t.result + level1_offset))[i];
4435 fprintf (stream, " %5d", -1);
4437 fprintf (stream, " %5zu",
4438 (offset - level2_offset) / sizeof (uint32_t));
4439 if (i+1 < t.level1_size)
4440 fprintf (stream, ",");
4442 if (t.level1_size > 8)
4443 fprintf (stream, "\n ");
4444 fprintf (stream, " },\n");
/* Level 2: same layout; offsets are relative to level3_offset, in bytes.  */
4445 fprintf (stream, " {");
4446 if (t.level2_size << t.q > 8)
4447 fprintf (stream, "\n ");
4448 for (i = 0; i < t.level2_size << t.q; i++)
4451 if (i > 0 && (i % 8) == 0)
4452 fprintf (stream, "\n ");
4453 offset = ((uint32_t *) (t.result + level2_offset))[i];
4455 fprintf (stream, " %5d", -1);
4457 fprintf (stream, " %5zu",
4458 (offset - level3_offset) / sizeof (uint8_t));
4459 if (i+1 < t.level2_size << t.q)
4460 fprintf (stream, ",");
4462 if (t.level2_size << t.q > 8)
4463 fprintf (stream, "\n ");
4464 fprintf (stream, " },\n");
4465 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Eight consecutive one-byte entries are combined into one 16-bit word,
   entry k of the group occupying bits 2*k and 2*k+1.  */
4466 fprintf (stream, " {");
4467 if ((t.level3_size << t.p) * 2 / 16 > 8)
4468 fprintf (stream, "\n ");
4469 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4471 if (i > 0 && (i % 8) == 0)
4472 fprintf (stream, "\n ");
4473 fprintf (stream, " 0x%04x",
4474 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4478 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4479 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4480 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4481 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4482 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4483 fprintf (stream, ",");
4485 if ((t.level3_size << t.p) * 2 / 16 > 8)
4486 fprintf (stream, "\n ");
4487 fprintf (stream, " }\n");
4488 fprintf (stream, "};\n");
/* A failed write or close is reported on stderr.  */
4490 if (ferror (stream) || fclose (stream))
4492 fprintf (stderr, "error writing to '%s'\n", filename);
4498 output_ident_properties (const char *version)
4500 #define PROPERTY(P) \
4501 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4502 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4503 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4504 PROPERTY(c_whitespace)
4505 PROPERTY(java_whitespace)
4508 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4509 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4512 /* ========================================================================= */
4514 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
4515 glibc/localedata/locales/i18n file, generated by
4516 glibc/localedata/gen-unicode-ctype.c. */
4518 /* Character mappings. */
4521 to_upper (unsigned int ch)
4523 if (unicode_attributes[ch].name != NULL
4524 && unicode_attributes[ch].upper != NONE)
4525 return unicode_attributes[ch].upper;
4531 to_lower (unsigned int ch)
4533 if (unicode_attributes[ch].name != NULL
4534 && unicode_attributes[ch].lower != NONE)
4535 return unicode_attributes[ch].lower;
4541 to_title (unsigned int ch)
4543 if (unicode_attributes[ch].name != NULL
4544 && unicode_attributes[ch].title != NONE)
4545 return unicode_attributes[ch].title;
4550 /* Character class properties. */
/* Return true if CH is an uppercase character, defined here as a character
   that to_lower maps to a different character.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Return true if CH is a lowercase character, defined here as a character
   that to_upper maps to a different character, plus U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
4567 is_alpha (unsigned int ch)
4569 return (unicode_attributes[ch].name != NULL
4570 && ((unicode_attributes[ch].category[0] == 'L'
4571 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4572 <U0E2F>, <U0E46> should belong to is_punct. */
4573 && (ch != 0x0E2F) && (ch != 0x0E46))
4574 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4575 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4577 || (ch >= 0x0E34 && ch <= 0x0E3A)
4578 || (ch >= 0x0E47 && ch <= 0x0E4E)
4579 /* Avoid warning for <U0345>. */
4581 /* Avoid warnings for <U2160>..<U217F>. */
4582 || (unicode_attributes[ch].category[0] == 'N'
4583 && unicode_attributes[ch].category[1] == 'l')
4584 /* Avoid warnings for <U24B6>..<U24E9>. */
4585 || (unicode_attributes[ch].category[0] == 'S'
4586 && unicode_attributes[ch].category[1] == 'o'
4587 && strstr (unicode_attributes[ch].name, " LETTER ")
4589 /* Consider all the non-ASCII digits as alphabetic.
4590 ISO C 99 forbids us to have them in category "digit",
4591 but we want iswalnum to return true on them. */
4592 || (unicode_attributes[ch].category[0] == 'N'
4593 && unicode_attributes[ch].category[1] == 'd'
4594 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is a digit in the <wctype.h> sense, i.e. one of the
   ten ASCII digits.  The definition through general category Nd is kept
   for reference only and is disabled.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH belongs to the "outdigit" class, which consists of
   the ten ASCII digits.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH is alphabetic (is_alpha) or a digit (is_digit).  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4632 is_blank (unsigned int ch)
4634 return (ch == 0x0009 /* '\t' */
4635 /* Category Zs without mention of "<noBreak>" */
4636 || (unicode_attributes[ch].name != NULL
4637 && unicode_attributes[ch].category[0] == 'Z'
4638 && unicode_attributes[ch].category[1] == 's'
4639 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4643 is_space (unsigned int ch)
4645 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4646 should treat it like a punctuation character, not like a space. */
4647 return (ch == 0x0020 /* ' ' */
4648 || ch == 0x000C /* '\f' */
4649 || ch == 0x000A /* '\n' */
4650 || ch == 0x000D /* '\r' */
4651 || ch == 0x0009 /* '\t' */
4652 || ch == 0x000B /* '\v' */
4653 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4654 || (unicode_attributes[ch].name != NULL
4655 && unicode_attributes[ch].category[0] == 'Z'
4656 && (unicode_attributes[ch].category[1] == 'l'
4657 || unicode_attributes[ch].category[1] == 'p'
4658 || (unicode_attributes[ch].category[1] == 's'
4659 && !strstr (unicode_attributes[ch].decomposition,
4664 is_cntrl (unsigned int ch)
4666 return (unicode_attributes[ch].name != NULL
4667 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4668 /* Categories Zl and Zp */
4669 || (unicode_attributes[ch].category[0] == 'Z'
4670 && (unicode_attributes[ch].category[1] == 'l'
4671 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is a hexadecimal digit in the <wctype.h> sense, i.e.
   one of 0..9, A..F, a..f.  The definition on top of is_digit is kept for
   reference only and is disabled.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
4697 is_graph (unsigned int ch)
4699 return (unicode_attributes[ch].name != NULL
4700 && strcmp (unicode_attributes[ch].name, "<control>")
4705 is_print (unsigned int ch)
4707 return (unicode_attributes[ch].name != NULL
4708 && strcmp (unicode_attributes[ch].name, "<control>")
4709 /* Categories Zl and Zp */
4710 && !(unicode_attributes[ch].name != NULL
4711 && unicode_attributes[ch].category[0] == 'Z'
4712 && (unicode_attributes[ch].category[1] == 'l'
4713 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH is a punctuation character, defined here in the
   traditional POSIX way as a graphic character that is neither alphabetic
   nor a digit.  The definition through general category P* is kept for
   reference only and is disabled.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
4729 /* Output all properties.
   VERSION is the Unicode version string for the file headers.
   The PROPERTY macro produces, for a predicate is_P, a debugging dump
   "unictype/ctype_P.txt", a unit test "../tests/unictype/test-ctype_P.c"
   and a table header "unictype/ctype_P.h".
   NOTE(review): the inner line numbers jump from 4736 to 4755 after this
   macro, so the list of PROPERTY(...) invocations and the end of the
   function are missing from this listing -- restore them from the upstream
   file.  */
4731 output_old_ctype (const char *version)
4733 #define PROPERTY(P) \
4734 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4735 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4736 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4755 is_combining (unsigned int ch)
4757 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4758 file. In 3.0.1 it was identical to the union of the general categories
4759 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4760 PropList.txt file, so we take the latter definition. */
4761 return (unicode_attributes[ch].name != NULL
4762 && unicode_attributes[ch].category[0] == 'M'
4763 && (unicode_attributes[ch].category[1] == 'n'
4764 || unicode_attributes[ch].category[1] == 'c'
4765 || unicode_attributes[ch].category[1] == 'e'));
4769 is_combining_level3 (unsigned int ch)
4771 return is_combining (ch)
4772 && !(unicode_attributes[ch].combining[0] != '\0'
4773 && unicode_attributes[ch].combining[0] != '0'
4774 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with
   four uppercase hexadecimal digits for I below 0x10000, "<Uxxxxxxxx>"
   with eight digits otherwise.
   The result lives in a static buffer that is overwritten by the next
   call; copy it if it must survive.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* snprintf keeps the write inside buf whatever the value of I.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval:
   the symbol of LOW, the separator "..", and the symbol of HIGH, each
   symbol written as "<Uxxxx>" below 0x10000 and "<Uxxxxxxxx>" otherwise.
   The result lives in a static buffer that is overwritten by the next
   call; copy it if it must survive.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  int used;

  /* Both symbols are formatted directly into buf, with explicit bounds.  */
  used = snprintf (buf, sizeof (buf),
                   (low < 0x10000 ? "<U%04X>.." : "<U%08X>.."), low);
  if (used > 0 && (unsigned int) used < sizeof (buf))
    snprintf (buf + used, sizeof (buf) - (unsigned int) used,
              (high < 0x10000 ? "<U%04X>" : "<U%08X>"), high);
  return buf;
}
4799 /* Output a character class (= property) table.
   Writes CLASSNAME to STREAM, followed by the code points below 0x110000
   for which FUNC returns true.  Maximal runs of consecutive members are
   collected into LOW..HIGH, entries are separated by ";", and a line is
   continued with "/" and a newline before it would pass max_column.
   NOTE(review): the inner line numbers of this listing have gaps, so the
   declarations of i/column/buf, the run detection around the "while", the
   conditions in front of several statements and the column bookkeeping are
   missing -- restore them from the upstream file.  */
4802 output_charclass (FILE *stream, const char *classname,
4803 bool (*func) (unsigned int))
/* One byte per code point: an automatic array of about 1.1 MB.  */
4805 char table[0x110000];
4807 bool need_semicolon;
4808 const int max_column = 75;
4811 for (i = 0; i < 0x110000; i++)
4812 table[i] = (int) func (i);
4814 fprintf (stream, "%s ", classname);
4815 need_semicolon = false;
/* No increment in the loop header: i is advanced inside the body.  */
4817 for (i = 0; i < 0x110000; )
4823 unsigned int low, high;
4829 while (i < 0x110000 && table[i]);
/* A single symbol or a symbol range is copied into buf.  */
4833 strcpy (buf, ucs_symbol (low));
4835 strcpy (buf, ucs_symbol_range (low, high));
4839 fprintf (stream, ";");
4843 if (column + strlen (buf) > max_column)
4845 fprintf (stream, "/\n ");
4849 fprintf (stream, "%s", buf);
4850 column += strlen (buf);
4851 need_semicolon = true;
4854 fprintf (stream, "\n");
4857 /* Output a character mapping table.
   Writes MAPNAME to STREAM, followed by one entry for every code point I
   below 0x110000 with FUNC (I) != I.  Each entry contains the UCS symbols
   of I and of FUNC (I); entries are separated by ";" and a line is
   continued with "/" and a newline before it would pass max_column.
   NOTE(review): the inner line numbers of this listing have gaps, so the
   declarations of i/column/buf, the test of table[i], the literal pieces
   put into buf around the two symbols and the column bookkeeping are
   missing -- restore them from the upstream file.  */
4860 output_charmap (FILE *stream, const char *mapname,
4861 unsigned int (*func) (unsigned int))
/* One byte per code point: an automatic array of about 1.1 MB.  */
4863 char table[0x110000];
4865 bool need_semicolon;
4866 const int max_column = 75;
/* table[i] is nonzero exactly for the code points that FUNC changes.  */
4869 for (i = 0; i < 0x110000; i++)
4870 table[i] = (func (i) != i);
4872 fprintf (stream, "%s ", mapname);
4873 need_semicolon = false;
4875 for (i = 0; i < 0x110000; i++)
4881 strcat (buf, ucs_symbol (i));
4883 strcat (buf, ucs_symbol (func (i)));
4888 fprintf (stream, ";");
4892 if (column + strlen (buf) > max_column)
4894 fprintf (stream, "/\n ");
4898 fprintf (stream, "%s", buf);
4899 column += strlen (buf);
4900 need_semicolon = true;
4902 fprintf (stream, "\n");
/* Output the width table.
   Currently nothing is written; the function only exists so that
   output_tables has a place to call.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream; /* intentionally unused */
}
4912 /* Output the tables to the given file.
   FILENAME is the locale definition file to create, VERSION the Unicode
   version string.  The file receives an LC_IDENTIFICATION section and an
   LC_CTYPE section with the character classes and character mappings.
   Before the LC_CTYPE section is written, every code point below 0x110000
   is checked against the restrictions quoted in the comments below; each
   violation is reported on stderr.
   NOTE(review): the inner line numbers of this listing have gaps, so
   declarations, braces, the fopen check, the "fprintf (stderr," heads of
   some diagnostics and the code that obtains the current time are missing
   -- restore them from the upstream file.  */
4915 output_tables (const char *filename, const char *version)
4920 stream = fopen (filename, "w");
4923 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4927 fprintf (stream, "escape_char /\n");
4928 fprintf (stream, "comment_char %%\n");
4929 fprintf (stream, "\n");
4930 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4932 fprintf (stream, "\n");
4934 fprintf (stream, "LC_IDENTIFICATION\n");
4935 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4936 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4937 fprintf (stream, "address \"\"\n");
4938 fprintf (stream, "contact \"\"\n");
4939 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4940 fprintf (stream, "tel \"\"\n");
4941 fprintf (stream, "fax \"\"\n");
4942 fprintf (stream, "language \"\"\n");
4943 fprintf (stream, "territory \"Earth\"\n");
4944 fprintf (stream, "revision \"%s\"\n", version);
/* The date line holds the time value "now", formatted in UTC.  */
4949 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4950 fprintf (stream, "date \"%s\"\n", date);
4952 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4953 fprintf (stream, "END LC_IDENTIFICATION\n");
4954 fprintf (stream, "\n");
4956 /* Verifications. */
4957 for (ch = 0; ch < 0x110000; ch++)
4959 /* toupper restriction: "Only characters specified for the keywords
4960 lower and upper shall be specified." */
4961 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4963 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4964 ucs_symbol (ch), ch, to_upper (ch));
4966 /* tolower restriction: "Only characters specified for the keywords
4967 lower and upper shall be specified." */
4968 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4970 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4971 ucs_symbol (ch), ch, to_lower (ch));
4973 /* alpha restriction: "Characters classified as either upper or lower
4974 shall automatically belong to this class." */
4975 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4976 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4978 /* alpha restriction: "No character specified for the keywords cntrl,
4979 digit, punct or space shall be specified." */
4980 if (is_alpha (ch) && is_cntrl (ch))
4981 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4982 if (is_alpha (ch) && is_digit (ch))
4983 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4984 if (is_alpha (ch) && is_punct (ch))
4985 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4986 if (is_alpha (ch) && is_space (ch))
4987 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4989 /* space restriction: "No character specified for the keywords upper,
4990 lower, alpha, digit, graph or xdigit shall be specified."
4991 upper, lower, alpha already checked above. */
4992 if (is_space (ch) && is_digit (ch))
4993 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4994 if (is_space (ch) && is_graph (ch))
4995 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4996 if (is_space (ch) && is_xdigit (ch))
4997 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4999 /* cntrl restriction: "No character specified for the keywords upper,
5000 lower, alpha, digit, punct, graph, print or xdigit shall be
5001 specified." upper, lower, alpha already checked above. */
5002 if (is_cntrl (ch) && is_digit (ch))
5003 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5004 if (is_cntrl (ch) && is_punct (ch))
5005 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5006 if (is_cntrl (ch) && is_graph (ch))
5007 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5008 if (is_cntrl (ch) && is_print (ch))
5009 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5010 if (is_cntrl (ch) && is_xdigit (ch))
5011 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5013 /* punct restriction: "No character specified for the keywords upper,
5014 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5015 be specified." upper, lower, alpha, cntrl already checked above. */
5016 if (is_punct (ch) && is_digit (ch))
5017 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5018 if (is_punct (ch) && is_xdigit (ch))
5019 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5020 if (is_punct (ch) && (ch == 0x0020))
5021 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5023 /* graph restriction: "No character specified for the keyword cntrl
5024 shall be specified." Already checked above. */
5026 /* print restriction: "No character specified for the keyword cntrl
5027 shall be specified." Already checked above. */
5029 /* graph - print relation: differ only in the <space> character.
5030 How is this possible if there are more than one space character?!
5031 I think susv2/xbd/locale.html should speak of "space characters",
5032 not "space character". */
5033 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5035 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5036 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5038 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section: twelve standard classes, two extra classes, the
   three case mappings, then the (width) map.  */
5041 fprintf (stream, "LC_CTYPE\n");
5042 output_charclass (stream, "upper", is_upper);
5043 output_charclass (stream, "lower", is_lower);
5044 output_charclass (stream, "alpha", is_alpha);
5045 output_charclass (stream, "digit", is_digit);
5046 output_charclass (stream, "outdigit", is_outdigit);
5047 output_charclass (stream, "blank", is_blank);
5048 output_charclass (stream, "space", is_space);
5049 output_charclass (stream, "cntrl", is_cntrl);
5050 output_charclass (stream, "punct", is_punct);
5051 output_charclass (stream, "xdigit", is_xdigit);
5052 output_charclass (stream, "graph", is_graph);
5053 output_charclass (stream, "print", is_print);
5054 output_charclass (stream, "class \"combining\";", is_combining);
5055 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5056 output_charmap (stream, "toupper", to_upper);
5057 output_charmap (stream, "tolower", to_lower);
5058 output_charmap (stream, "map \"totitle\";", to_title);
5059 output_widthmap (stream);
5060 fprintf (stream, "END LC_CTYPE\n");
/* A failed write or close is reported on stderr.  */
5062 if (ferror (stream) || fclose (stream))
5064 fprintf (stderr, "error writing to '%s'\n", filename);
5071 /* ========================================================================= */
/* The width property from the EastAsianWidth.txt file.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   Indexed by code point; filled in by fill_width.  */
const char * unicode_width[0x110000];
5077 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5080 fill_width (const char *width_filename)
5084 char field0[FIELDLEN];
5085 char field1[FIELDLEN];
5086 char field2[FIELDLEN];
5089 for (i = 0; i < 0x110000; i++)
5090 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5092 stream = fopen (width_filename, "r");
5095 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5110 do c = getc (stream); while (c != EOF && c != '\n');
5114 n = getfield (stream, field0, ';');
5115 n += getfield (stream, field1, ' ');
5116 n += getfield (stream, field2, '\n');
5121 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5124 i = strtoul (field0, NULL, 16);
5125 if (strstr (field0, "..") != NULL)
5127 /* Deal with a range. */
5128 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5130 unicode_width[i] = strdup (field1);
5134 /* Single character line. */
5135 unicode_width[i] = strdup (field1);
5138 if (ferror (stream) || fclose (stream))
5140 fprintf (stderr, "error reading from '%s'\n", width_filename);
5145 /* ========================================================================= */
5147 /* Non-spacing attribute and width. */
/* is_nonspacing (ch): evaluates to true when ch has a name in
   unicode_attributes[] and at least one of the following holds: its bidi
   category is UC_BIDI_NSM, it is in general category Cc or Cf, or its name
   starts with the 11 characters "ZERO WIDTH ".
   Note that the Cc test is part of the code below although the list in the
   comment that follows mentions only non-spacing, Cf and zero width
   characters. */
5149 /* The non-spacing attribute table consists of:
5150 - Non-spacing characters; generated from PropList.txt or
5151 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
5152 - Format control characters; generated from
5153 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
5154 - Zero width characters; generated from
5155 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
5159 is_nonspacing (unsigned int ch)
5161 return (unicode_attributes[ch].name != NULL
5162 && (get_bidi_category (ch) == UC_BIDI_NSM
5163 || is_category_Cc (ch) || is_category_Cf (ch)
5164 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0));
/* output_nonspacing_property (filename): write to the file named filename
   the C source text of two tables that describe is_nonspacing ().
   The code point space is cut into 0x110000 / 0x200 blocks of 0x200 code
   points. Blocks that contain at least one non-spacing character are
   numbered consecutively in ind[]; the block at 0xe0000 is deliberately left
   out of this scan.
   - nonspacing_table_data receives, for every numbered block, 8 rows of 8
     bytes; each byte covers 8 consecutive code points.
   - nonspacing_table_ind receives the ind[] values, 8 per row, for a number
     of blocks rounded up to a multiple of 8.
   Failures to open or write the file are reported on stderr. */
5168 output_nonspacing_property (const char *filename)
5171 int ind[0x110000 / 0x200];
5176 stream = fopen (filename, "w");
5179 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* First pass: give an index to every block that needs a bitmap. */
5184 for (i = 0; i < 0x110000 / 0x200; i++)
5186 bool nontrivial = false;
5189 if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
5190 for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
5191 if (is_nonspacing (ch))
5197 ind[i] = next_ind++;
5202 fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
/* Second pass: emit the bitmap of every block that received an index. */
5205 for (i = 0; i < 0x110000 / 0x200; i++)
5207 bool nontrivial = (ind[i] >= 0);
5213 fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
5214 for (j = 0; j < 8; j++)
5218 fprintf (stream, " ");
5219 for (k = 0; k < 8; k++)
5222 unsigned char bits = 0;
5224 for (l = 0; l < 8; l++)
5226 unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;
5228 if (is_nonspacing (ch))
/* The separator is a space instead of a comma after the very last byte. */
5231 fprintf (stream, " 0x%02x%c", bits,
5232 ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
5234 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5235 i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
5240 fprintf (stream, "};\n");
/* Round i_max up to a multiple of 8 so that the index table has full rows. */
5242 i_max = ((i_max + 8 - 1) / 8) * 8;
5243 fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
5248 for (j = 0; j < i_max / 8; j++)
5252 fprintf (stream, " ");
5253 for (k = 0; k < 8; k++)
5256 fprintf (stream, " %2d%c", ind[i],
5257 j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ',');
5259 fprintf (stream, " /* 0x%04x-0x%04x */\n",
5260 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
5263 fprintf (stream, "};\n");
5265 if (ferror (stream) || fclose (stream))
5267 fprintf (stderr, "error writing to '%s'\n", filename);
5272 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'.
The tests below are made in this order: unassigned code value (with the
private use range 0xE000..0xF8FF and the ranges 0x20000..0x2FFFD and
0x30000..0x3FFFD singled out, following Unicode TR#11), Cc control
character below 0x00A0, non-spacing character, East Asian width "W" or "F",
East Asian width "H", and finally the range 0x00A1..0xFFFF.
NOTE(review): which of the five results each test selects is decided by
return statements that are not restated in this comment -- check them
before relying on a particular mapping. */
5274 symbolic_width (unsigned int ch)
5276 /* Test for unassigned character. */
5277 if (is_property_unassigned_code_value (ch))
5279 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
5280 if (ch >= 0xE000 && ch <= 0xF8FF)
5282 if ((ch >= 0x20000 && ch <= 0x2FFFD) || (ch >= 0x30000 && ch <= 0x3FFFD))
5288 /* Test for non-spacing or control character. */
5289 if (is_category_Cc (ch) && ch < 0x00A0)
5291 if (is_nonspacing (ch))
5293 /* Test for double-width character. */
5294 if (unicode_width[ch] != NULL
5295 && (strcmp (unicode_width[ch], "W") == 0
5296 || strcmp (unicode_width[ch], "F") == 0))
5298 /* Test for half-width character. */
5299 if (unicode_width[ch] != NULL
5300 && strcmp (unicode_width[ch], "H") == 0)
5303 /* In ancient CJK encodings, Cyrillic and most other characters are
5304 double-width as well. */
5305 if (ch >= 0x00A1 && ch < 0x10000)
/* output_width_property_test (filename): write to the file named filename a
   listing of symbolic_width () over the code points 0 .. 0x10FFFF.
   Code points for which symbolic_width () returns 0 are skipped. The others
   are collected into intervals of equal value, and each interval becomes one
   output line: CODE, two tabs, VALUE for a single code point, or
   FIRST..LAST, one tab, VALUE for a range. Code points are printed with at
   least 4 uppercase hexadecimal digits and VALUE as a single character.
   Failures to open or write the file are reported on stderr. */
5311 output_width_property_test (const char *filename)
5314 unsigned int interval_start, interval_end, ch;
5315 char interval_value;
5317 stream = fopen (filename, "w");
5320 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5325 interval_start = interval_end = 0; /* avoid GCC warning */
5326 for (ch = 0; ch < 0x110000; ch++)
5328 char value = symbolic_width (ch);
5329 if (value != 0) /* skip Cc control characters and unassigned characters */
5331 if (value == interval_value)
5332 /* Extend the interval. */
5336 /* Terminate the interval. */
5337 if (interval_value != 0)
5339 if (interval_end == interval_start)
5340 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
5342 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
5344 /* Start a new interval. */
5345 interval_start = interval_end = ch;
5346 interval_value = value;
5350 /* Terminate the last interval. */
5351 if (interval_value != 0)
5353 if (interval_end == interval_start)
5354 fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value);
5356 fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value);
5359 if (ferror (stream) || fclose (stream))
5361 fprintf (stderr, "error writing to '%s'\n", filename);
5366 /* ========================================================================= */
5368 /* Line breaking classification. */
/* Line breaking classes. get_lbp () uses each enumerator as a bit position
   in an int64_t mask. The classes numbered 0 to 23 are WJ, GL, B2, BA, BB,
   HY, CL, EX, IN, NS, OP, QU, IS, NU, PO, PR, SY, AL, H2, H3, ID, JL, JV, JT;
   the classes numbered 24 to 31 are BK, CM, ZW, SP, CB, AI, SA, XX.
   CR, LF, NL and SG appear only as comments and have no enumerator. */
5372 /* Values >= 24 are resolved at run time. */
5373 LBP_BK = 24, /* mandatory break */
5374 /*LBP_CR, carriage return - not used here because it's a DOSism */
5375 /*LBP_LF, line feed - not used here because it's a DOSism */
5376 LBP_CM = 25, /* attached characters and combining marks */
5377 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
5378 /*LBP_SG, surrogates - not used here because they are not characters */
5379 LBP_WJ = 0, /* word joiner */
5380 LBP_ZW = 26, /* zero width space */
5381 LBP_GL = 1, /* non-breaking (glue) */
5382 LBP_SP = 27, /* space */
5383 LBP_B2 = 2, /* break opportunity before and after */
5384 LBP_BA = 3, /* break opportunity after */
5385 LBP_BB = 4, /* break opportunity before */
5386 LBP_HY = 5, /* hyphen */
5387 LBP_CB = 28, /* contingent break opportunity */
5388 LBP_CL = 6, /* closing punctuation */
5389 LBP_EX = 7, /* exclamation/interrogation */
5390 LBP_IN = 8, /* inseparable */
5391 LBP_NS = 9, /* non starter */
5392 LBP_OP = 10, /* opening punctuation */
5393 LBP_QU = 11, /* ambiguous quotation */
5394 LBP_IS = 12, /* infix separator (numeric) */
5395 LBP_NU = 13, /* numeric */
5396 LBP_PO = 14, /* postfix (numeric) */
5397 LBP_PR = 15, /* prefix (numeric) */
5398 LBP_SY = 16, /* symbols allowing breaks */
5399 LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
5400 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
5401 LBP_H2 = 18, /* Hangul LV syllable */
5402 LBP_H3 = 19, /* Hangul LVT syllable */
5403 LBP_ID = 20, /* ideographic */
5404 LBP_JL = 21, /* Hangul L Jamo */
5405 LBP_JV = 22, /* Hangul V Jamo */
5406 LBP_JT = 23, /* Hangul T Jamo */
5407 LBP_SA = 30, /* complex context (South East Asian) */
5408 LBP_XX = 31 /* unknown */
/* get_lbp (ch): compute the set of line breaking classes of ch.
   The result is an int64_t in which bit LBP_xx is set for every class xx
   that applies. The classification tests are made only for code points
   that have a name in unicode_attributes[].
   The order of the tests matters, because some classes are granted only
   when earlier ones are absent:
   - PR is set only if PO is not set;
   - CM is set only if none of BK, BA, GL, SA, WJ, ZW is set;
   - ID or AI is set only if neither NS nor CM is set;
   - AL or AI is set only if none of the classes in the long mask near the
     end is set; the CM bit is cleared right after that choice.
   LBP_XX is set last; presumably only when nothing else matched, but the
   guarding condition should be checked -- TODO confirm. */
5411 /* Returns the line breaking classification for ch, as a bit mask. */
5413 get_lbp (unsigned int ch)
5417 if (unicode_attributes[ch].name != NULL)
5419 /* mandatory break */
5420 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5421 || ch == 0x000C /* form feed */
5422 || ch == 0x000B /* line tabulation */
5423 || ch == 0x2028 /* LINE SEPARATOR */
5424 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5425 attr |= (int64_t) 1 << LBP_BK;
/* word joiner */
5427 if (ch == 0x2060 /* WORD JOINER */
5428 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5429 attr |= (int64_t) 1 << LBP_WJ;
5431 /* zero width space */
5432 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5433 attr |= (int64_t) 1 << LBP_ZW;
5435 /* non-breaking (glue) */
5436 if (ch == 0x00A0 /* NO-BREAK SPACE */
5437 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5438 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5439 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5440 || ch == 0x2007 /* FIGURE SPACE */
5441 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5442 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5443 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5444 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5445 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5446 attr |= (int64_t) 1 << LBP_GL;
/* space */
5449 if (ch == 0x0020 /* SPACE */)
5450 attr |= (int64_t) 1 << LBP_SP;
5452 /* break opportunity before and after */
5453 if (ch == 0x2014 /* EM DASH */)
5454 attr |= (int64_t) 1 << LBP_B2;
5456 /* break opportunity after */
5457 if (ch == 0x1680 /* OGHAM SPACE MARK */
5458 || ch == 0x2000 /* EN QUAD */
5459 || ch == 0x2001 /* EM QUAD */
5460 || ch == 0x2002 /* EN SPACE */
5461 || ch == 0x2003 /* EM SPACE */
5462 || ch == 0x2004 /* THREE-PER-EM SPACE */
5463 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5464 || ch == 0x2006 /* SIX-PER-EM SPACE */
5465 || ch == 0x2008 /* PUNCTUATION SPACE */
5466 || ch == 0x2009 /* THIN SPACE */
5467 || ch == 0x200A /* HAIR SPACE */
5468 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5469 || ch == 0x0009 /* tab */
5470 || ch == 0x00AD /* SOFT HYPHEN */
5471 || ch == 0x058A /* ARMENIAN HYPHEN */
5472 || ch == 0x2010 /* HYPHEN */
5473 || ch == 0x2012 /* FIGURE DASH */
5474 || ch == 0x2013 /* EN DASH */
5475 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5476 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5477 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5478 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5479 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5480 || ch == 0x2027 /* HYPHENATION POINT */
5481 || ch == 0x007C /* VERTICAL LINE */
5482 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5483 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5484 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5485 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5486 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5487 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5488 || ch == 0x205A /* TWO DOT PUNCTUATION */
5489 || ch == 0x205B /* FOUR DOT MARK */
5490 || ch == 0x205D /* TRICOLON */
5491 || ch == 0x205E /* VERTICAL FOUR DOTS */
5492 || ch == 0x2E19 /* PALM BRANCH */
5493 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5494 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5495 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5496 || ch == 0x2E2D /* FIVE DOT MARK */
5497 || ch == 0x2E30 /* RING POINT */
5498 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5499 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5500 || ch == 0x10102 /* AEGEAN CHECK MARK */
5501 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5502 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5503 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5504 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5505 || ch == 0x0964 /* DEVANAGARI DANDA */
5506 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5507 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5508 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5509 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5510 || ch == 0x104B /* MYANMAR SIGN SECTION */
5511 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5512 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5513 || ch == 0x17D4 /* KHMER SIGN KHAN */
5514 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5515 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5516 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5517 || ch == 0xA8CE /* SAURASHTRA DANDA */
5518 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5519 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5520 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5521 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5522 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5523 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5524 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5525 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5526 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5527 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5528 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5529 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5530 || ch == 0x1804 /* MONGOLIAN COLON */
5531 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5532 || ch == 0x1B5A /* BALINESE PANTI */
5533 || ch == 0x1B5B /* BALINESE PAMADA */
5534 || ch == 0x1B5C /* BALINESE WINDU */
5535 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5536 || ch == 0x1B60 /* BALINESE PAMENENG */
5537 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5538 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5539 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5540 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5541 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5542 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5543 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5544 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5545 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5546 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5547 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5548 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5549 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5550 || ch == 0xA60D /* VAI COMMA */
5551 || ch == 0xA60F /* VAI QUESTION MARK */
5552 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5553 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5554 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5555 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5556 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5557 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5558 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5559 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5560 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5561 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5562 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5563 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5564 attr |= (int64_t) 1 << LBP_BA;
5566 /* break opportunity before */
5567 if (ch == 0x00B4 /* ACUTE ACCENT */
5568 || ch == 0x1FFD /* GREEK OXIA */
5569 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5570 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5571 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5572 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5573 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5574 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5575 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5576 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5577 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5578 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5579 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5580 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5581 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5582 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5583 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5584 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5585 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5586 attr |= (int64_t) 1 << LBP_BB;
/* hyphen */
5589 if (ch == 0x002D /* HYPHEN-MINUS */)
5590 attr |= (int64_t) 1 << LBP_HY;
5592 /* contingent break opportunity */
5593 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5594 attr |= (int64_t) 1 << LBP_CB;
5596 /* closing punctuation */
5597 if ((unicode_attributes[ch].category[0] == 'P'
5598 && unicode_attributes[ch].category[1] == 'e')
5599 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5600 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5601 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5602 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5603 || ch == 0xFE50 /* SMALL COMMA */
5604 || ch == 0xFE52 /* SMALL FULL STOP */
5605 || ch == 0xFF0C /* FULLWIDTH COMMA */
5606 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5607 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5608 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5609 attr |= (int64_t) 1 << LBP_CL;
5611 /* exclamation/interrogation */
5612 if (ch == 0x0021 /* EXCLAMATION MARK */
5613 || ch == 0x003F /* QUESTION MARK */
5614 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5615 || ch == 0x061B /* ARABIC SEMICOLON */
5616 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5617 || ch == 0x061F /* ARABIC QUESTION MARK */
5618 || ch == 0x06D4 /* ARABIC FULL STOP */
5619 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5620 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5621 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5622 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5623 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5624 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5625 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5626 || ch == 0x1802 /* MONGOLIAN COMMA */
5627 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5628 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5629 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5630 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5631 || ch == 0x1945 /* LIMBU QUESTION MARK */
5632 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5633 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5634 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5635 || ch == 0x2CFE /* COPTIC FULL STOP */
5636 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5637 || ch == 0xA60E /* VAI FULL STOP */
5638 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5639 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5640 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5641 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5642 || ch == 0xFE56 /* SMALL QUESTION MARK */
5643 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5644 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5645 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5646 attr |= (int64_t) 1 << LBP_EX;
/* inseparable */
5649 if (ch == 0x2024 /* ONE DOT LEADER */
5650 || ch == 0x2025 /* TWO DOT LEADER */
5651 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5652 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5653 attr |= (int64_t) 1 << LBP_IN;
/* non starter */
5656 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5657 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5658 || ch == 0x203D /* INTERROBANG */
5659 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5660 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5661 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5662 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5663 || ch == 0x301C /* WAVE DASH */
5664 || ch == 0x303C /* MASU MARK */
5665 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5666 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5667 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5668 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5669 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5670 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5671 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5672 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5673 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5674 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5675 || ch == 0xA015 /* YI SYLLABLE WU */
5676 || ch == 0xFE54 /* SMALL SEMICOLON */
5677 || ch == 0xFE55 /* SMALL COLON */
5678 || ch == 0xFF1A /* FULLWIDTH COLON */
5679 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5680 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5681 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5682 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5683 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5684 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5685 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5686 attr |= (int64_t) 1 << LBP_NS;
5688 /* opening punctuation */
5689 if ((unicode_attributes[ch].category[0] == 'P'
5690 && unicode_attributes[ch].category[1] == 's')
5691 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5692 || ch == 0x00BF /* INVERTED QUESTION MARK */
5693 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5694 attr |= (int64_t) 1 << LBP_OP;
5696 /* ambiguous quotation */
5697 if ((unicode_attributes[ch].category[0] == 'P'
5698 && (unicode_attributes[ch].category[1] == 'f'
5699 || unicode_attributes[ch].category[1] == 'i'))
5700 || ch == 0x0022 /* QUOTATION MARK */
5701 || ch == 0x0027 /* APOSTROPHE */
5702 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5703 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5704 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5705 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5706 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5707 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5708 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5709 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5710 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5711 || ch == 0x2E0B /* RAISED SQUARE */)
5712 attr |= (int64_t) 1 << LBP_QU;
5714 /* infix separator (numeric) */
5715 if (ch == 0x002C /* COMMA */
5716 || ch == 0x002E /* FULL STOP */
5717 || ch == 0x003A /* COLON */
5718 || ch == 0x003B /* SEMICOLON */
5719 || ch == 0x037E /* GREEK QUESTION MARK */
5720 || ch == 0x0589 /* ARMENIAN FULL STOP */
5721 || ch == 0x060C /* ARABIC COMMA */
5722 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5723 || ch == 0x07F8 /* NKO COMMA */
5724 || ch == 0x2044 /* FRACTION SLASH */
5725 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5726 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5727 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5728 attr |= (int64_t) 1 << LBP_IS;
/* numeric */
5731 if ((unicode_attributes[ch].category[0] == 'N'
5732 && unicode_attributes[ch].category[1] == 'd'
5733 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5734 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5735 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5736 attr |= (int64_t) 1 << LBP_NU;
5738 /* postfix (numeric) */
5739 if (ch == 0x0025 /* PERCENT SIGN */
5740 || ch == 0x00A2 /* CENT SIGN */
5741 || ch == 0x00B0 /* DEGREE SIGN */
5742 || ch == 0x060B /* AFGHANI SIGN */
5743 || ch == 0x066A /* ARABIC PERCENT SIGN */
5744 || ch == 0x2030 /* PER MILLE SIGN */
5745 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5746 || ch == 0x2032 /* PRIME */
5747 || ch == 0x2033 /* DOUBLE PRIME */
5748 || ch == 0x2034 /* TRIPLE PRIME */
5749 || ch == 0x2035 /* REVERSED PRIME */
5750 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5751 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5752 || ch == 0x20A7 /* PESETA SIGN */
5753 || ch == 0x2103 /* DEGREE CELSIUS */
5754 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5755 || ch == 0xFDFC /* RIAL SIGN */
5756 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5757 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5758 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5759 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5760 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5761 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5762 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5763 attr |= (int64_t) 1 << LBP_PO;
5765 /* prefix (numeric) */
5766 if ((unicode_attributes[ch].category[0] == 'S'
5767 && unicode_attributes[ch].category[1] == 'c')
5768 || ch == 0x002B /* PLUS SIGN */
5769 || ch == 0x005C /* REVERSE SOLIDUS */
5770 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5771 || ch == 0x2116 /* NUMERO SIGN */
5772 || ch == 0x2212 /* MINUS SIGN */
5773 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* PO takes precedence: PR is granted only when PO was not set above. */
5774 if (!(attr & ((int64_t) 1 << LBP_PO)))
5775 attr |= (int64_t) 1 << LBP_PR;
5777 /* symbols allowing breaks */
5778 if (ch == 0x002F /* SOLIDUS */)
5779 attr |= (int64_t) 1 << LBP_SY;
/* Hangul LV syllable */
5781 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5782 attr |= (int64_t) 1 << LBP_H2;
/* Hangul LVT syllable */
5784 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5785 attr |= (int64_t) 1 << LBP_H3;
/* Hangul L Jamo */
5787 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5788 attr |= (int64_t) 1 << LBP_JL;
/* Hangul V Jamo */
5790 if (ch >= 0x1160 && ch <= 0x11A2)
5791 attr |= (int64_t) 1 << LBP_JV;
/* Hangul T Jamo */
5793 if (ch >= 0x11A8 && ch <= 0x11F9)
5794 attr |= (int64_t) 1 << LBP_JT;
5796 /* complex context (South East Asian) */
5797 if (((unicode_attributes[ch].category[0] == 'C'
5798 && unicode_attributes[ch].category[1] == 'f')
5799 || (unicode_attributes[ch].category[0] == 'L'
5800 && (unicode_attributes[ch].category[1] == 'm'
5801 || unicode_attributes[ch].category[1] == 'o'))
5802 || (unicode_attributes[ch].category[0] == 'M'
5803 && (unicode_attributes[ch].category[1] == 'c'
5804 || unicode_attributes[ch].category[1] == 'n'))
5805 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5806 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5807 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5808 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5809 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5810 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5811 || (ch >= 0x1000 && ch <= 0x109F)
5812 || (ch >= 0x1780 && ch <= 0x17FF)
5813 || (ch >= 0x1950 && ch <= 0x19DF)))
5814 attr |= (int64_t) 1 << LBP_SA;
5816 /* attached characters and combining marks */
5817 if ((unicode_attributes[ch].category[0] == 'M'
5818 && (unicode_attributes[ch].category[1] == 'c'
5819 || unicode_attributes[ch].category[1] == 'e'
5820 || unicode_attributes[ch].category[1] == 'n'))
5821 || (unicode_attributes[ch].category[0] == 'C'
5822 && (unicode_attributes[ch].category[1] == 'c'
5823 || unicode_attributes[ch].category[1] == 'f')))
/* CM is granted only when none of BK, BA, GL, SA, WJ, ZW was set above. */
5824 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW))))
5825 attr |= (int64_t) 1 << LBP_CM;
/* ideographic */
5828 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5829 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5830 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5831 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5832 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5833 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5834 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5835 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5836 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5837 || ch == 0xFE62 /* SMALL PLUS SIGN */
5838 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5839 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5840 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5841 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5842 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5843 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5844 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5845 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5846 || (ch >= 0x3000 && ch <= 0x33FF
5847 && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL))))
5848 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5849 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5850 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5851 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5852 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5853 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5854 || ch == 0xFE45 /* SESAME DOT */
5855 || ch == 0xFE46 /* WHITE SESAME DOT */
5856 || ch == 0xFE49 /* DASHED OVERLINE */
5857 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5858 || ch == 0xFE4B /* WAVY OVERLINE */
5859 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5860 || ch == 0xFE4D /* DASHED LOW LINE */
5861 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5862 || ch == 0xFE4F /* WAVY LOW LINE */
5863 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5864 || ch == 0xFE58 /* SMALL EM DASH */
5865 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5866 || ch == 0xFE60 /* SMALL AMPERSAND */
5867 || ch == 0xFE61 /* SMALL ASTERISK */
5868 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5869 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5870 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5871 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5872 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5873 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5874 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5875 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5876 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5877 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5878 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5879 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5880 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5881 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5882 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5883 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5884 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5885 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5886 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5887 || ch == 0xFF5E /* FULLWIDTH TILDE */
5888 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5889 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5890 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* NS and CM take precedence over the ideographic classes. */
5891 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM))))
5893 /* ambiguous (ideograph) ? */
5894 if ((unicode_width[ch] != NULL
5895 && unicode_width[ch][0] == 'A'
5897 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5898 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5899 attr |= (int64_t) 1 << LBP_AI;
5901 attr |= (int64_t) 1 << LBP_ID;
5904 /* ordinary alphabetic and symbol characters */
5905 if ((unicode_attributes[ch].category[0] == 'L'
5906 && (unicode_attributes[ch].category[1] == 'u'
5907 || unicode_attributes[ch].category[1] == 'l'
5908 || unicode_attributes[ch].category[1] == 't'
5909 || unicode_attributes[ch].category[1] == 'm'
5910 || unicode_attributes[ch].category[1] == 'o'))
5911 || (unicode_attributes[ch].category[0] == 'S'
5912 && (unicode_attributes[ch].category[1] == 'm'
5913 || unicode_attributes[ch].category[1] == 'k'
5914 || unicode_attributes[ch].category[1] == 'o'))
5915 || (unicode_attributes[ch].category[0] == 'N'
5916 && (unicode_attributes[ch].category[1] == 'l'
5917 || unicode_attributes[ch].category[1] == 'o'))
5918 || (unicode_attributes[ch].category[0] == 'P'
5919 && (unicode_attributes[ch].category[1] == 'c'
5920 || unicode_attributes[ch].category[1] == 'd'
5921 || unicode_attributes[ch].category[1] == 'o'))
5922 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5923 || ch == 0x0601 /* ARABIC SIGN SANAH */
5924 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5925 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5926 || ch == 0x06DD /* ARABIC END OF AYAH */
5927 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5928 || ch == 0x2061 /* FUNCTION APPLICATION */
5929 || ch == 0x2062 /* INVISIBLE TIMES */
5930 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5931 || ch == 0x2064 /* INVISIBLE PLUS */)
/* AL or AI is a fallback: it is granted only when none of the classes
   named in the following mask has been set by the tests above. */
5932 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
5934 /* ambiguous (alphabetic) ? */
5935 if ((unicode_width[ch] != NULL
5936 && unicode_width[ch][0] == 'A'
5938 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5939 && ch != 0x2022 /* BULLET */
5940 && ch != 0x203E /* OVERLINE */
5941 && ch != 0x2126 /* OHM SIGN */
5942 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5943 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5944 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5945 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5946 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5947 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5948 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5949 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5950 || ch == 0x00A7 /* SECTION SIGN */
5951 || ch == 0x00A8 /* DIAERESIS */
5952 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5953 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5954 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5955 || ch == 0x00B6 /* PILCROW SIGN */
5956 || ch == 0x00B7 /* MIDDLE DOT */
5957 || ch == 0x00B8 /* CEDILLA */
5958 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5959 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5960 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5961 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5962 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5963 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5964 || ch == 0x00F7 /* DIVISION SIGN */
5965 || ch == 0x02C7 /* CARON */
5966 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5967 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5968 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5969 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5970 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5971 || ch == 0x02D8 /* BREVE */
5972 || ch == 0x02D9 /* DOT ABOVE */
5973 || ch == 0x02DA /* RING ABOVE */
5974 || ch == 0x02DB /* OGONEK */
5975 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5976 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5977 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5978 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5979 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5980 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5981 || ch == 0x2616 /* WHITE SHOGI PIECE */
5982 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5983 attr |= (int64_t) 1 << LBP_AI;
5985 attr |= (int64_t) 1 << LBP_AL;
/* A character classified as AL or AI must not also carry CM. */
5986 attr &= ~((int64_t) 1 << LBP_CM);
5992 attr |= (int64_t) 1 << LBP_XX;
5997 /* Output the line breaking properties in a human readable format. */
5999 debug_output_lbp (FILE *stream)
6003 for (i = 0; i < 0x110000; i++)
6005 int64_t attr = get_lbp (i);
6006 if (attr != (int64_t) 1 << LBP_XX)
6008 fprintf (stream, "0x%04X", i);
6009 #define PRINT_BIT(attr,bit) \
6010 if (attr & (1 << bit)) fprintf (stream, " " #bit);
6011 PRINT_BIT(attr,LBP_BK);
6012 PRINT_BIT(attr,LBP_CM);
6013 PRINT_BIT(attr,LBP_WJ);
6014 PRINT_BIT(attr,LBP_ZW);
6015 PRINT_BIT(attr,LBP_GL);
6016 PRINT_BIT(attr,LBP_SP);
6017 PRINT_BIT(attr,LBP_B2);
6018 PRINT_BIT(attr,LBP_BA);
6019 PRINT_BIT(attr,LBP_BB);
6020 PRINT_BIT(attr,LBP_HY);
6021 PRINT_BIT(attr,LBP_CB);
6022 PRINT_BIT(attr,LBP_CL);
6023 PRINT_BIT(attr,LBP_EX);
6024 PRINT_BIT(attr,LBP_IN);
6025 PRINT_BIT(attr,LBP_NS);
6026 PRINT_BIT(attr,LBP_OP);
6027 PRINT_BIT(attr,LBP_QU);
6028 PRINT_BIT(attr,LBP_IS);
6029 PRINT_BIT(attr,LBP_NU);
6030 PRINT_BIT(attr,LBP_PO);
6031 PRINT_BIT(attr,LBP_PR);
6032 PRINT_BIT(attr,LBP_SY);
6033 PRINT_BIT(attr,LBP_AI);
6034 PRINT_BIT(attr,LBP_AL);
6035 PRINT_BIT(attr,LBP_H2);
6036 PRINT_BIT(attr,LBP_H3);
6037 PRINT_BIT(attr,LBP_ID);
6038 PRINT_BIT(attr,LBP_JL);
6039 PRINT_BIT(attr,LBP_JV);
6040 PRINT_BIT(attr,LBP_JT);
6041 PRINT_BIT(attr,LBP_SA);
6042 PRINT_BIT(attr,LBP_XX);
6044 fprintf (stream, "\n");
/* Dump the computed line breaking properties to the file FILENAME by
   handing a freshly opened stream to debug_output_lbp.  A failure to open
   or to write the file is reported on stderr and ends the program.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* Closing flushes the stream, so a failing fclose is a write error as
     well.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6070 /* The line breaking property from the LineBreak.txt file. */
6071 int unicode_org_lbp[0x110000];
6073 /* Stores in unicode_org_lbp[] the line breaking property from the
6074 LineBreak.txt file. */
6076 fill_org_lbp (const char *linebreak_filename)
6080 char field0[FIELDLEN];
6081 char field1[FIELDLEN];
6082 char field2[FIELDLEN];
6085 for (i = 0; i < 0x110000; i++)
6086 unicode_org_lbp[i] = LBP_XX;
6088 stream = fopen (linebreak_filename, "r");
6091 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
6107 do c = getc (stream); while (c != EOF && c != '\n');
6111 n = getfield (stream, field0, ';');
6112 n += getfield (stream, field1, ' ');
6113 n += getfield (stream, field2, '\n');
6118 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
6122 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
6157 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
6158 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
6159 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
6160 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
6163 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
6164 field1, linebreak_filename, lineno);
6167 i = strtoul (field0, NULL, 16);
6168 if (strstr (field0, "..") != NULL)
6170 /* Deal with a range. */
6171 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6173 unicode_org_lbp[i] = value;
6177 /* Single character line. */
6178 unicode_org_lbp[i] = value;
6181 if (ferror (stream) || fclose (stream))
6183 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
6188 /* Output the line breaking properties in a human readable format. */
6190 debug_output_org_lbp (FILE *stream)
6194 for (i = 0; i < 0x110000; i++)
6196 int attr = unicode_org_lbp[i];
6199 fprintf (stream, "0x%04X", i);
6200 #define PRINT_BIT(attr,bit) \
6201 if (attr == bit) fprintf (stream, " " #bit);
6202 PRINT_BIT(attr,LBP_BK);
6203 PRINT_BIT(attr,LBP_CM);
6204 PRINT_BIT(attr,LBP_WJ);
6205 PRINT_BIT(attr,LBP_ZW);
6206 PRINT_BIT(attr,LBP_GL);
6207 PRINT_BIT(attr,LBP_SP);
6208 PRINT_BIT(attr,LBP_B2);
6209 PRINT_BIT(attr,LBP_BA);
6210 PRINT_BIT(attr,LBP_BB);
6211 PRINT_BIT(attr,LBP_HY);
6212 PRINT_BIT(attr,LBP_CB);
6213 PRINT_BIT(attr,LBP_CL);
6214 PRINT_BIT(attr,LBP_EX);
6215 PRINT_BIT(attr,LBP_IN);
6216 PRINT_BIT(attr,LBP_NS);
6217 PRINT_BIT(attr,LBP_OP);
6218 PRINT_BIT(attr,LBP_QU);
6219 PRINT_BIT(attr,LBP_IS);
6220 PRINT_BIT(attr,LBP_NU);
6221 PRINT_BIT(attr,LBP_PO);
6222 PRINT_BIT(attr,LBP_PR);
6223 PRINT_BIT(attr,LBP_SY);
6224 PRINT_BIT(attr,LBP_AI);
6225 PRINT_BIT(attr,LBP_AL);
6226 PRINT_BIT(attr,LBP_H2);
6227 PRINT_BIT(attr,LBP_H3);
6228 PRINT_BIT(attr,LBP_ID);
6229 PRINT_BIT(attr,LBP_JL);
6230 PRINT_BIT(attr,LBP_JV);
6231 PRINT_BIT(attr,LBP_JT);
6232 PRINT_BIT(attr,LBP_SA);
6233 PRINT_BIT(attr,LBP_XX);
6235 fprintf (stream, "\n");
/* Dump the line breaking properties held in unicode_org_lbp[] to the file
   FILENAME by handing a freshly opened stream to debug_output_org_lbp.
   A failure to open or to write the file is reported on stderr and ends
   the program.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* Closing flushes the stream, so a failing fclose is a write error as
     well.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6261 /* Construction of sparse 3-level tables. */
6262 #define TABLE lbp_table
6263 #define ELEMENT unsigned char
6264 #define DEFAULT LBP_XX
6265 #define xmalloc malloc
6266 #define xrealloc realloc
6270 output_lbp (FILE *stream1, FILE *stream2)
6274 unsigned int level1_offset, level2_offset, level3_offset;
6278 lbp_table_init (&t);
6280 for (i = 0; i < 0x110000; i++)
6282 int64_t attr = get_lbp (i);
6284 /* Now attr should contain exactly one bit. */
6285 if (attr == 0 || ((attr & (attr - 1)) != 0))
6288 if (attr != (int64_t) 1 << LBP_XX)
6290 unsigned int log2_attr;
6291 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6293 lbp_table_add (&t, i, log2_attr);
6297 lbp_table_finalize (&t);
6300 5 * sizeof (uint32_t);
6302 5 * sizeof (uint32_t)
6303 + t.level1_size * sizeof (uint32_t);
6305 5 * sizeof (uint32_t)
6306 + t.level1_size * sizeof (uint32_t)
6307 + (t.level2_size << t.q) * sizeof (uint32_t);
6309 for (i = 0; i < 5; i++)
6310 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6311 ((uint32_t *) t.result)[i]);
6312 fprintf (stream1, "\n");
6313 fprintf (stream1, "typedef struct\n");
6314 fprintf (stream1, " {\n");
6315 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6316 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6317 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6318 fprintf (stream1, " }\n");
6319 fprintf (stream1, "lbrkprop_t;\n");
6320 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6322 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6323 fprintf (stream2, "{\n");
6324 fprintf (stream2, " {");
6325 if (t.level1_size > 8)
6326 fprintf (stream2, "\n ");
6327 for (i = 0; i < t.level1_size; i++)
6330 if (i > 0 && (i % 8) == 0)
6331 fprintf (stream2, "\n ");
6332 offset = ((uint32_t *) (t.result + level1_offset))[i];
6334 fprintf (stream2, " %5d", -1);
6336 fprintf (stream2, " %5zu",
6337 (offset - level2_offset) / sizeof (uint32_t));
6338 if (i+1 < t.level1_size)
6339 fprintf (stream2, ",");
6341 if (t.level1_size > 8)
6342 fprintf (stream2, "\n ");
6343 fprintf (stream2, " },\n");
6344 fprintf (stream2, " {");
6345 if (t.level2_size << t.q > 8)
6346 fprintf (stream2, "\n ");
6347 for (i = 0; i < t.level2_size << t.q; i++)
6350 if (i > 0 && (i % 8) == 0)
6351 fprintf (stream2, "\n ");
6352 offset = ((uint32_t *) (t.result + level2_offset))[i];
6354 fprintf (stream2, " %5d", -1);
6356 fprintf (stream2, " %5zu",
6357 (offset - level3_offset) / sizeof (unsigned char));
6358 if (i+1 < t.level2_size << t.q)
6359 fprintf (stream2, ",");
6361 if (t.level2_size << t.q > 8)
6362 fprintf (stream2, "\n ");
6363 fprintf (stream2, " },\n");
6364 fprintf (stream2, " {");
6365 if (t.level3_size << t.p > 8)
6366 fprintf (stream2, "\n ");
6367 for (i = 0; i < t.level3_size << t.p; i++)
6369 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6370 const char *value_string;
6373 #define CASE(x) case x: value_string = #x; break;
6410 if (i > 0 && (i % 8) == 0)
6411 fprintf (stream2, "\n ");
6412 fprintf (stream2, " %s%s", value_string,
6413 (i+1 < t.level3_size << t.p ? "," : ""));
6415 if (t.level3_size << t.p > 8)
6416 fprintf (stream2, "\n ");
6417 fprintf (stream2, " }\n");
6418 fprintf (stream2, "};\n");
/* Write the line breaking property tables.
   FILENAME1 and FILENAME2 are both opened for writing and both receive the
   same banner (naming the Unicode VERSION) and license header; the two
   streams are then passed, in that order, to output_lbp.  A failure to
   open or to write either file is reported on stderr and ends the
   program.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  size_t i;

  filenames[0] = filename1;
  filenames[1] = filename2;

  for (i = 0; i < 2; i++)
    {
      streams[i] = fopen (filenames[i], "w");
      if (streams[i] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
          exit (1);
        }
    }

  for (i = 0; i < 2; i++)
    {
      FILE *stream = streams[i];

      fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
      fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
      /* Name the generator the way the other generated files do; the old
         "gen-lbrk" name no longer matches this program.  */
      fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
               version);
      fprintf (stream, "\n");

      /* Put a GPL header on it. The gnulib module is under LGPL (although it
         still carries the GPL header), and it's gnulib-tool which replaces the
         GPL header with an LGPL header. */
      fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
      fprintf (stream, "\n");
      fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
      fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
      fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
      fprintf (stream, " (at your option) any later version.\n");
      fprintf (stream, "\n");
      fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
      fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
      fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
      fprintf (stream, " GNU General Public License for more details.\n");
      fprintf (stream, "\n");
      fprintf (stream, " You should have received a copy of the GNU General Public License\n");
      fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
      fprintf (stream, "\n");
    }

  output_lbp (streams[0], streams[1]);

  for (i = 0; i < 2; i++)
    {
      /* fclose flushes, so its failure counts as a write error too.  */
      if (ferror (streams[i]) || fclose (streams[i]))
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[i]);
          exit (1);
        }
    }
}
6483 /* ========================================================================= */
6485 /* Word break property. */
6487 /* Possible values of the Word_Break property. */
6502 WBP_EXTENDNUMLET = 7
6505 /* Returns the word breaking property for ch, as a bit mask. */
6507 get_wbp (unsigned int ch)
6511 if (unicode_attributes[ch].name != NULL)
6514 attr |= 1 << WBP_CR;
6517 attr |= 1 << WBP_LF;
6519 if (ch == 0x000B || ch == 0x000C
6521 || ch == 0x2028 || ch == 0x2029)
6522 attr |= 1 << WBP_NEWLINE;
6524 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6525 || (unicode_attributes[ch].category != NULL
6526 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6527 attr |= 1 << WBP_EXTEND;
6529 if (unicode_attributes[ch].category != NULL
6530 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6531 && ch != 0x200C && ch != 0x200D)
6532 attr |= 1 << WBP_FORMAT;
6534 if ((unicode_scripts[ch] < numscripts
6535 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6536 || (ch >= 0x3031 && ch <= 0x3035)
6537 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6539 attr |= 1 << WBP_KATAKANA;
6541 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6543 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6544 && (attr & (1 << WBP_KATAKANA)) == 0
6545 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6546 && !(unicode_scripts[ch] < numscripts
6547 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6548 && (attr & (1 << WBP_EXTEND)) == 0)
6549 attr |= 1 << WBP_ALETTER;
6551 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6552 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6553 attr |= 1 << WBP_MIDNUMLET;
6555 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6556 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6557 attr |= 1 << WBP_MIDLETTER;
6559 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6560 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6562 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6563 attr |= 1 << WBP_MIDNUM;
6565 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6567 attr |= 1 << WBP_NUMERIC;
6569 if (unicode_attributes[ch].category != NULL
6570 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6571 attr |= 1 << WBP_EXTENDNUMLET;
6576 attr |= 1 << WBP_OTHER;
6581 /* Output the word break property in a human readable format. */
6583 debug_output_wbp (FILE *stream)
6587 for (i = 0; i < 0x110000; i++)
6589 int attr = get_wbp (i);
6590 if (attr != 1 << WBP_OTHER)
6592 fprintf (stream, "0x%04X", i);
6593 if (attr & (1 << WBP_CR))
6594 fprintf (stream, " CR");
6595 if (attr & (1 << WBP_LF))
6596 fprintf (stream, " LF");
6597 if (attr & (1 << WBP_NEWLINE))
6598 fprintf (stream, " Newline");
6599 if (attr & (1 << WBP_EXTEND))
6600 fprintf (stream, " Extend");
6601 if (attr & (1 << WBP_FORMAT))
6602 fprintf (stream, " Format");
6603 if (attr & (1 << WBP_KATAKANA))
6604 fprintf (stream, " Katakana");
6605 if (attr & (1 << WBP_ALETTER))
6606 fprintf (stream, " ALetter");
6607 if (attr & (1 << WBP_MIDNUMLET))
6608 fprintf (stream, " MidNumLet");
6609 if (attr & (1 << WBP_MIDLETTER))
6610 fprintf (stream, " MidLetter");
6611 if (attr & (1 << WBP_MIDNUM))
6612 fprintf (stream, " MidNum");
6613 if (attr & (1 << WBP_NUMERIC))
6614 fprintf (stream, " Numeric");
6615 if (attr & (1 << WBP_EXTENDNUMLET))
6616 fprintf (stream, " ExtendNumLet");
6617 fprintf (stream, "\n");
/* Dump the computed word break properties to the file FILENAME by handing
   a freshly opened stream to debug_output_wbp.  A failure to open or to
   write the file is reported on stderr and ends the program.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (out);

  /* Closing flushes the stream, so a failing fclose is a write error as
     well.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6643 /* The word break property from the WordBreakProperty.txt file. */
6644 int unicode_org_wbp[0x110000];
6646 /* Stores in unicode_org_wbp[] the word break property from the
6647 WordBreakProperty.txt file. */
6649 fill_org_wbp (const char *wordbreakproperty_filename)
6654 for (i = 0; i < 0x110000; i++)
6655 unicode_org_wbp[i] = WBP_OTHER;
6657 stream = fopen (wordbreakproperty_filename, "r");
6660 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6667 unsigned int i1, i2;
6668 char padding[200+1];
6669 char propname[200+1];
6672 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
6675 if (buf[0] == '\0' || buf[0] == '#')
6678 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6680 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6682 fprintf (stderr, "parse error in '%s'\n",
6683 wordbreakproperty_filename);
6688 #define PROP(name,value) \
6689 if (strcmp (propname, name) == 0) propvalue = value; else
6692 PROP ("Newline", WBP_NEWLINE)
6693 PROP ("Extend", WBP_EXTEND)
6694 PROP ("Format", WBP_FORMAT)
6695 PROP ("Katakana", WBP_KATAKANA)
6696 PROP ("ALetter", WBP_ALETTER)
6697 PROP ("MidNumLet", WBP_MIDNUMLET)
6698 PROP ("MidLetter", WBP_MIDLETTER)
6699 PROP ("MidNum", WBP_MIDNUM)
6700 PROP ("Numeric", WBP_NUMERIC)
6701 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6704 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6705 wordbreakproperty_filename);
6708 if (!(i1 <= i2 && i2 < 0x110000))
6711 for (i = i1; i <= i2; i++)
6712 unicode_org_wbp[i] = propvalue;
6715 if (ferror (stream) || fclose (stream))
6717 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6722 /* Output the word break property in a human readable format. */
6724 debug_output_org_wbp (FILE *stream)
6728 for (i = 0; i < 0x110000; i++)
6730 int propvalue = unicode_org_wbp[i];
6731 if (propvalue != WBP_OTHER)
6733 fprintf (stream, "0x%04X", i);
6734 #define PROP(name,value) \
6735 if (propvalue == value) fprintf (stream, " " name); else
6738 PROP ("Newline", WBP_NEWLINE)
6739 PROP ("Extend", WBP_EXTEND)
6740 PROP ("Format", WBP_FORMAT)
6741 PROP ("Katakana", WBP_KATAKANA)
6742 PROP ("ALetter", WBP_ALETTER)
6743 PROP ("MidNumLet", WBP_MIDNUMLET)
6744 PROP ("MidLetter", WBP_MIDLETTER)
6745 PROP ("MidNum", WBP_MIDNUM)
6746 PROP ("Numeric", WBP_NUMERIC)
6747 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6749 fprintf (stream, " ??");
6750 fprintf (stream, "\n");
/* Dump the word break properties held in unicode_org_wbp[] to the file
   FILENAME by handing a freshly opened stream to debug_output_org_wbp.
   A failure to open or to write the file is reported on stderr and ends
   the program.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (!out)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (out);

  /* Closing flushes the stream, so a failing fclose is a write error as
     well.  */
  if (ferror (out) != 0 || fclose (out) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6776 /* Construction of sparse 3-level tables. */
6777 #define TABLE wbp_table
6778 #define ELEMENT unsigned char
6779 #define DEFAULT WBP_OTHER
6780 #define xmalloc malloc
6781 #define xrealloc realloc
6785 output_wbp (FILE *stream)
6789 unsigned int level1_offset, level2_offset, level3_offset;
6793 wbp_table_init (&t);
6795 for (i = 0; i < 0x110000; i++)
6797 int attr = get_wbp (i);
6799 /* Now attr should contain exactly one bit. */
6800 if (attr == 0 || ((attr & (attr - 1)) != 0))
6803 if (attr != 1 << WBP_OTHER)
6805 unsigned int log2_attr;
6806 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6808 wbp_table_add (&t, i, log2_attr);
6812 wbp_table_finalize (&t);
6815 5 * sizeof (uint32_t);
6817 5 * sizeof (uint32_t)
6818 + t.level1_size * sizeof (uint32_t);
6820 5 * sizeof (uint32_t)
6821 + t.level1_size * sizeof (uint32_t)
6822 + (t.level2_size << t.q) * sizeof (uint32_t);
6824 for (i = 0; i < 5; i++)
6825 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6826 ((uint32_t *) t.result)[i]);
6827 fprintf (stream, "\n");
6828 fprintf (stream, "typedef struct\n");
6829 fprintf (stream, " {\n");
6830 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6831 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6832 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6833 fprintf (stream, " }\n");
6834 fprintf (stream, "wbrkprop_t;\n");
6835 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6836 fprintf (stream, "{\n");
6837 fprintf (stream, " {");
6838 if (t.level1_size > 8)
6839 fprintf (stream, "\n ");
6840 for (i = 0; i < t.level1_size; i++)
6843 if (i > 0 && (i % 8) == 0)
6844 fprintf (stream, "\n ");
6845 offset = ((uint32_t *) (t.result + level1_offset))[i];
6847 fprintf (stream, " %5d", -1);
6849 fprintf (stream, " %5zu",
6850 (offset - level2_offset) / sizeof (uint32_t));
6851 if (i+1 < t.level1_size)
6852 fprintf (stream, ",");
6854 if (t.level1_size > 8)
6855 fprintf (stream, "\n ");
6856 fprintf (stream, " },\n");
6857 fprintf (stream, " {");
6858 if (t.level2_size << t.q > 8)
6859 fprintf (stream, "\n ");
6860 for (i = 0; i < t.level2_size << t.q; i++)
6863 if (i > 0 && (i % 8) == 0)
6864 fprintf (stream, "\n ");
6865 offset = ((uint32_t *) (t.result + level2_offset))[i];
6867 fprintf (stream, " %5d", -1);
6869 fprintf (stream, " %5zu",
6870 (offset - level3_offset) / sizeof (unsigned char));
6871 if (i+1 < t.level2_size << t.q)
6872 fprintf (stream, ",");
6874 if (t.level2_size << t.q > 8)
6875 fprintf (stream, "\n ");
6876 fprintf (stream, " },\n");
6877 fprintf (stream, " {");
6878 if (t.level3_size << t.p > 4)
6879 fprintf (stream, "\n ");
6880 for (i = 0; i < t.level3_size << t.p; i++)
6882 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6883 const char *value_string;
6886 #define CASE(x) case x: value_string = #x; break;
6895 CASE(WBP_MIDNUMLET);
6896 CASE(WBP_MIDLETTER);
6899 CASE(WBP_EXTENDNUMLET);
6904 if (i > 0 && (i % 4) == 0)
6905 fprintf (stream, "\n ");
6906 fprintf (stream, " %s%s", value_string,
6907 (i+1 < t.level3_size << t.p ? "," : ""));
6909 if (t.level3_size << t.p > 4)
6910 fprintf (stream, "\n ");
6911 fprintf (stream, " }\n");
6912 fprintf (stream, "};\n");
/* Write the word break property table to the file FILENAME.
   The file starts with a "generated file" banner that names the Unicode
   VERSION, followed by a license header; the table itself is produced by
   output_wbp.  A failure to open or to write the file is reported on
   stderr and ends the program.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  /* This file carries the word break table, so the banner must not repeat
     the wording used for the line breaking tables.  */
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Put a GPL header on it. The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header. */
  fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");

  output_wbp (stream);

  /* fclose flushes, so its failure counts as a write error too.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
6961 /* ========================================================================= */
6963 /* Grapheme break property. */
6965 /* Possible values of the Grapheme_Cluster_Break property. */
6974 GBP_SPACINGMARK = 6,
6982 /* Construction of sparse 3-level tables. */
6983 #define TABLE gbp_table
6984 #define ELEMENT unsigned char
6985 #define DEFAULT GBP_OTHER
6986 #define xmalloc malloc
6987 #define xrealloc realloc
6990 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
6991 int unicode_org_gbp[0x110000];
6993 /* Output the unit test data for the grapheme break property. */
6995 output_gbp_test (const char *filename)
7001 stream = fopen (filename, "w");
7004 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7008 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7009 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
7010 fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n");
7011 fprintf (stream, "\n");
7012 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7013 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7014 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7015 fprintf (stream, " (at your option) any later version.\n");
7016 fprintf (stream, "\n");
7017 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7018 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7019 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7020 fprintf (stream, " GNU General Public License for more details.\n");
7021 fprintf (stream, "\n");
7022 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7023 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7024 fprintf (stream, "\n");
7027 for (ch = 0; ch < 0x110000; ch++)
7029 int gbp = unicode_org_gbp[ch];
7030 const char *gbp_string;
7032 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
7037 #define CASE(x) case x: gbp_string = #x; break;
7044 CASE (GBP_SPACINGMARK)
7056 fprintf (stream, ",\n");
7057 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
7061 fprintf (stream, "\n");
7063 if (ferror (stream) || fclose (stream))
7065 fprintf (stderr, "error writing to '%s'\n", filename);
7070 /* Output the per-character grapheme break property table. */
7072 output_gbp_table (const char *filename, const char *version)
7077 unsigned int level1_offset, level2_offset, level3_offset;
7079 stream = fopen (filename, "w");
7082 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7086 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7087 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
7088 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7093 gbp_table_init (&t);
7095 for (ch = 0; ch < 0x110000; ch++)
7096 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
7098 gbp_table_finalize (&t);
7100 /* Offsets in t.result, in memory of this process. */
7102 5 * sizeof (uint32_t);
7104 5 * sizeof (uint32_t)
7105 + t.level1_size * sizeof (uint32_t);
7107 5 * sizeof (uint32_t)
7108 + t.level1_size * sizeof (uint32_t)
7109 + (t.level2_size << t.q) * sizeof (uint32_t);
7111 for (i = 0; i < 5; i++)
7112 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
7113 ((uint32_t *) t.result)[i]);
7114 fprintf (stream, "static const\n");
7115 fprintf (stream, "struct\n");
7116 fprintf (stream, " {\n");
7117 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7118 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7119 fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n",
7120 t.level3_size, t.p);
7121 fprintf (stream, " }\n");
7122 fprintf (stream, "unigbrkprop =\n");
7123 fprintf (stream, "{\n");
7124 fprintf (stream, " {");
7125 if (t.level1_size > 8)
7126 fprintf (stream, "\n ");
7127 for (i = 0; i < t.level1_size; i++)
7130 if (i > 0 && (i % 8) == 0)
7131 fprintf (stream, "\n ");
7132 offset = ((uint32_t *) (t.result + level1_offset))[i];
7134 fprintf (stream, " %5d", -1);
7136 fprintf (stream, " %5zu",
7137 (offset - level2_offset) / sizeof (uint32_t));
7138 if (i+1 < t.level1_size)
7139 fprintf (stream, ",");
7141 if (t.level1_size > 8)
7142 fprintf (stream, "\n ");
7143 fprintf (stream, " },\n");
7144 fprintf (stream, " {");
7145 if (t.level2_size << t.q > 8)
7146 fprintf (stream, "\n ");
7147 for (i = 0; i < t.level2_size << t.q; i++)
7150 if (i > 0 && (i % 8) == 0)
7151 fprintf (stream, "\n ");
7152 offset = ((uint32_t *) (t.result + level2_offset))[i];
7154 fprintf (stream, " %5d", -1);
7156 fprintf (stream, " %5zu",
7157 (offset - level3_offset) / sizeof (uint8_t) / 2);
7158 if (i+1 < t.level2_size << t.q)
7159 fprintf (stream, ",");
7161 if (t.level2_size << t.q > 8)
7162 fprintf (stream, "\n ");
7163 fprintf (stream, " },\n");
7164 fprintf (stream, " {");
7165 if (t.level3_size << t.p > 8)
7166 fprintf (stream, "\n ");
7167 for (i = 0; i < (t.level3_size << t.p) / 2; i++)
7169 unsigned char *p = (unsigned char *) (t.result + level3_offset);
7170 unsigned char value0 = p[i * 2];
7171 unsigned char value1 = p[i * 2 + 1];
7172 if (i > 0 && (i % 8) == 0)
7173 fprintf (stream, "\n ");
7174 fprintf (stream, " 0x%02x%s", (value1 << 4) + value0,
7175 (i+1 < (t.level3_size << t.p) / 2 ? "," : ""));
7177 if (t.level3_size << t.p > 8)
7178 fprintf (stream, "\n ");
7179 fprintf (stream, " }\n");
7180 fprintf (stream, "};\n");
7182 if (ferror (stream) || fclose (stream))
7184 fprintf (stderr, "error writing to '%s'\n", filename);
7189 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
7190 GraphemeBreakProperty.txt file. */
7192 fill_org_gbp (const char *graphemebreakproperty_filename)
7198 for (i = 0; i < 0x110000; i++)
7199 unicode_org_gbp[i] = GBP_OTHER;
7201 stream = fopen (graphemebreakproperty_filename, "r");
7204 fprintf (stderr, "error during fopen of '%s'\n",
7205 graphemebreakproperty_filename);
7212 unsigned int i1, i2;
7213 char padding[200+1];
7214 char propname[200+1];
7218 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7221 if (buf[0] == '\0' || buf[0] == '#')
7224 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
7226 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
7228 fprintf (stderr, "parse error in '%s'\n",
7229 graphemebreakproperty_filename);
7234 #define PROP(name,value) \
7235 if (strcmp (propname, name) == 0) propvalue = value; else
7238 PROP ("Control", GBP_CONTROL)
7239 PROP ("Extend", GBP_EXTEND)
7240 PROP ("Prepend", GBP_PREPEND)
7241 PROP ("SpacingMark", GBP_SPACINGMARK)
7246 PROP ("LVT", GBP_LVT)
7249 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
7250 graphemebreakproperty_filename, lineno);
7253 if (!(i1 <= i2 && i2 < 0x110000))
7256 for (i = i1; i <= i2; i++)
7257 unicode_org_gbp[i] = propvalue;
7259 if (ferror (stream) || fclose (stream))
7261 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
7266 /* ========================================================================= */
7268 /* Maximum number of characters into which a single Unicode character can be
7270 #define MAX_DECOMP_LENGTH 18
7274 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
7275 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
7276 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
7277 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
7278 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
7279 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
7280 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
7281 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
7282 UC_DECOMP_SUPER, /* <super> A superscript form. */
7283 UC_DECOMP_SUB, /* <sub> A subscript form. */
7284 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
7285 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
7286 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
7287 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
7288 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
7289 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
7290 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
7293 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
7294 decompositions). Return the type, or -1 for none. */
7296 get_decomposition (unsigned int ch,
7297 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
7299 const char *decomposition = unicode_attributes[ch].decomposition;
7301 if (decomposition != NULL && decomposition[0] != '\0')
7303 int type = UC_DECOMP_CANONICAL;
7304 unsigned int length;
7307 if (decomposition[0] == '<')
7312 rangle = strchr (decomposition + 1, '>');
7315 typelen = rangle + 1 - decomposition;
7316 #define TYPE(t1,t2) \
7317 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
7320 TYPE ("<font>", UC_DECOMP_FONT)
7321 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
7322 TYPE ("<initial>", UC_DECOMP_INITIAL)
7323 TYPE ("<medial>", UC_DECOMP_MEDIAL)
7324 TYPE ("<final>", UC_DECOMP_FINAL)
7325 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
7326 TYPE ("<circle>", UC_DECOMP_CIRCLE)
7327 TYPE ("<super>", UC_DECOMP_SUPER)
7328 TYPE ("<sub>", UC_DECOMP_SUB)
7329 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
7330 TYPE ("<wide>", UC_DECOMP_WIDE)
7331 TYPE ("<narrow>", UC_DECOMP_NARROW)
7332 TYPE ("<small>", UC_DECOMP_SMALL)
7333 TYPE ("<square>", UC_DECOMP_SQUARE)
7334 TYPE ("<fraction>", UC_DECOMP_FRACTION)
7335 TYPE ("<compat>", UC_DECOMP_COMPAT)
7337 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
7341 decomposition = rangle + 1;
7342 if (decomposition[0] == ' ')
7345 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
7347 decomposed[length] = strtoul (decomposition, &endptr, 16);
7348 if (endptr == decomposition)
7350 decomposition = endptr;
7351 if (decomposition[0] == ' ')
7354 if (*decomposition != '\0')
7355 /* MAX_DECOMP_LENGTH is too small. */
7365 /* Construction of sparse 3-level tables. */
7366 #define TABLE decomp_table
7367 #define ELEMENT uint16_t
7368 #define DEFAULT (uint16_t)(-1)
7369 #define xmalloc malloc
7370 #define xrealloc realloc
7374 output_decomposition (FILE *stream1, FILE *stream2)
7376 struct decomp_table t;
7377 unsigned int level1_offset, level2_offset, level3_offset;
7378 unsigned int offset;
7384 decomp_table_init (&t);
7386 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
7387 fprintf (stream1, "\n");
7388 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
7391 for (ch = 0; ch < 0x110000; ch++)
7393 unsigned int length;
7394 unsigned int decomposed[MAX_DECOMP_LENGTH];
7395 int type = get_decomposition (ch, &length, decomposed);
7399 if (!(offset < (1 << 15)))
7401 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
7403 /* Produce length 3-bytes entries. */
7405 /* We would need a special representation of zero-length entries. */
7407 for (i = 0; i < length; i++)
7410 fprintf (stream2, ",");
7411 if ((offset % 4) == 0)
7412 fprintf (stream2, "\n ");
7413 if (!(decomposed[i] < (1 << 18)))
7415 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
7416 (((i+1 < length ? (1 << 23) : 0)
7417 | (i == 0 ? (type << 18) : 0)
7418 | decomposed[i]) >> 16) & 0xff,
7419 (decomposed[i] >> 8) & 0xff,
7420 decomposed[i] & 0xff);
7426 fprintf (stream2, "\n};\n");
7427 fprintf (stream2, "\n");
7429 decomp_table_finalize (&t);
7432 5 * sizeof (uint32_t);
7434 5 * sizeof (uint32_t)
7435 + t.level1_size * sizeof (uint32_t);
7437 5 * sizeof (uint32_t)
7438 + t.level1_size * sizeof (uint32_t)
7439 + (t.level2_size << t.q) * sizeof (uint32_t);
7441 for (i = 0; i < 5; i++)
7442 fprintf (stream1, "#define decomp_header_%d %d\n", i,
7443 ((uint32_t *) t.result)[i]);
7444 fprintf (stream1, "\n");
7445 fprintf (stream1, "typedef struct\n");
7446 fprintf (stream1, " {\n");
7447 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
7448 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
7449 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
7450 fprintf (stream1, " }\n");
7451 fprintf (stream1, "decomp_index_table_t;\n");
7452 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
7453 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
7454 fprintf (stream2, "{\n");
7455 fprintf (stream2, " {");
7456 if (t.level1_size > 8)
7457 fprintf (stream2, "\n ");
7458 for (i = 0; i < t.level1_size; i++)
7461 if (i > 0 && (i % 8) == 0)
7462 fprintf (stream2, "\n ");
7463 offset = ((uint32_t *) (t.result + level1_offset))[i];
7465 fprintf (stream2, " %5d", -1);
7467 fprintf (stream2, " %5zu",
7468 (offset - level2_offset) / sizeof (uint32_t));
7469 if (i+1 < t.level1_size)
7470 fprintf (stream2, ",");
7472 if (t.level1_size > 8)
7473 fprintf (stream2, "\n ");
7474 fprintf (stream2, " },\n");
7475 fprintf (stream2, " {");
7476 if (t.level2_size << t.q > 8)
7477 fprintf (stream2, "\n ");
7478 for (i = 0; i < t.level2_size << t.q; i++)
7481 if (i > 0 && (i % 8) == 0)
7482 fprintf (stream2, "\n ");
7483 offset = ((uint32_t *) (t.result + level2_offset))[i];
7485 fprintf (stream2, " %5d", -1);
7487 fprintf (stream2, " %5zu",
7488 (offset - level3_offset) / sizeof (uint16_t));
7489 if (i+1 < t.level2_size << t.q)
7490 fprintf (stream2, ",");
7492 if (t.level2_size << t.q > 8)
7493 fprintf (stream2, "\n ");
7494 fprintf (stream2, " },\n");
7495 fprintf (stream2, " {");
7496 if (t.level3_size << t.p > 8)
7497 fprintf (stream2, "\n ");
7498 for (i = 0; i < t.level3_size << t.p; i++)
7500 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
7501 if (i > 0 && (i % 8) == 0)
7502 fprintf (stream2, "\n ");
7503 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
7504 if (i+1 < t.level3_size << t.p)
7505 fprintf (stream2, ",");
7507 if (t.level3_size << t.p > 8)
7508 fprintf (stream2, "\n ");
7509 fprintf (stream2, " }\n");
7510 fprintf (stream2, "};\n");
/* Create the two decomposition table files.
   FILENAME1 receives the declarations, FILENAME2 the definitions (see
   output_decomposition).  VERSION is the Unicode version, mentioned in the
   banner of both files.
   Exit with an error message if a file cannot be opened or written.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  size_t i;

  filenames[0] = filename1;
  filenames[1] = filename2;

  for (i = 0; i < 2; i++)
    {
      streams[i] = fopen (filenames[i], "w");
      if (streams[i] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
          exit (1);
        }
    }

  for (i = 0; i < 2; i++)
    {
      FILE *stream = streams[i];

      fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
      fprintf (stream, "/* Decomposition of Unicode characters. */\n");
      fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fprintf (stream, "\n");
    }

  output_decomposition (streams[0], streams[1]);

  for (i = 0; i < 2; i++)
    {
      if (ferror (streams[i]) || fclose (streams[i]))
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[i]);
          exit (1);
        }
    }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file. */
char unicode_composition_exclusions[0x110000];

/* Fill unicode_composition_exclusions[] from the file named
   COMPOSITIONEXCLUSIONS_FILENAME.
   Each line that is neither empty nor a '#' comment must start with a
   hexadecimal code point; whatever follows the number is ignored.
   Exit with an error message if the file cannot be read or a line cannot be
   parsed; abort if a code point is not below 0x110000.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *stream;
  char buf[200+1];

  stream = fopen (compositionexclusions_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  memset (unicode_composition_exclusions, 0, sizeof (unicode_composition_exclusions));

  /* Read line by line with fgets: an fscanf ("%200[^\n]\n") loop stops at
     once when the file begins with an empty line, and it treats the tail of
     an over-long line as a line of its own.  */
  while (fgets (buf, sizeof (buf), stream) != NULL)
    {
      const char *line = buf;
      unsigned int code;

      /* If the line did not fit into buf, drop its remainder.  */
      if (strchr (buf, '\n') == NULL)
        {
          int c;

          while ((c = getc (stream)) != EOF && c != '\n')
            ;
        }

      while (*line == ' ' || *line == '\t' || *line == '\r')
        line++;
      if (*line == '\0' || *line == '\n' || *line == '#')
        continue;

      if (sscanf (line, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      if (!(code < 0x110000))
        abort ();

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
7605 debug_output_composition_tables (const char *filename)
7610 stream = fopen (filename, "w");
7613 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7617 for (ch = 0; ch < 0x110000; ch++)
7619 unsigned int length;
7620 unsigned int decomposed[MAX_DECOMP_LENGTH];
7621 int type = get_decomposition (ch, &length, decomposed);
7623 if (type == UC_DECOMP_CANONICAL
7624 /* Consider only binary decompositions.
7625 Exclude singleton decompositions. */
7628 unsigned int code1 = decomposed[0];
7629 unsigned int code2 = decomposed[1];
7630 unsigned int combined = ch;
7632 /* Exclude decompositions where the first part is not a starter,
7633 i.e. is not of canonical combining class 0. */
7634 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7635 /* Exclude characters listed in CompositionExclusions.txt. */
7636 && !unicode_composition_exclusions[combined])
7638 /* The combined character must now also be a starter.
7640 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7643 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7647 unicode_attributes[code2].combining);
7652 if (ferror (stream) || fclose (stream))
7654 fprintf (stderr, "error writing to '%s'\n", filename);
7660 output_composition_tables (const char *filename, const char *version)
7665 stream = fopen (filename, "w");
7668 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7672 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7673 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7674 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7676 fprintf (stream, "\n");
7678 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7679 still carries the GPL header), and it's gnulib-tool which replaces the
7680 GPL header with an LGPL header. */
7681 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7682 fprintf (stream, "\n");
7683 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7684 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7685 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7686 fprintf (stream, " (at your option) any later version.\n");
7687 fprintf (stream, "\n");
7688 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7689 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7690 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7691 fprintf (stream, " GNU General Public License for more details.\n");
7692 fprintf (stream, "\n");
7693 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7694 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7695 fprintf (stream, "\n");
7697 /* The composition table is a set of mappings (code1, code2) -> combined,
7699 367 values for code1 (from 0x003C to 0x30FD),
7700 54 values for code2 (from 0x0300 to 0x309A).
7701 For a fixed code1, there are from 1 to 19 possible values for code2.
7702 For a fixed code2, there are from 1 to 117 possible values for code1.
7703 This is a very sparse matrix.
7705 We want an O(1) hash lookup.
7707 We could implement the hash lookup by mapping (code1, code2) to a linear
7708 combination mul1*code1 + mul2*code2, which is then used as an index into
7709 a 3-level table. But this leads to a table of size 37 KB.
7711 We use gperf to implement the hash lookup, giving it the 928 sets of
7712 4 bytes (code1, code2) as input. gperf generates a hash table of size
7713 1527, which is quite good (60% filled). It requires an auxiliary table
7714 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
7716 fprintf (stream, "struct composition_rule { char codes[4]; };\n");
7717 fprintf (stream, "%%struct-type\n");
7718 fprintf (stream, "%%language=ANSI-C\n");
7719 fprintf (stream, "%%define slot-name codes\n");
7720 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7721 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7722 fprintf (stream, "%%compare-lengths\n");
7723 fprintf (stream, "%%compare-strncmp\n");
7724 fprintf (stream, "%%readonly-tables\n");
7725 fprintf (stream, "%%omit-struct-type\n");
7726 fprintf (stream, "%%%%\n");
7728 for (ch = 0; ch < 0x110000; ch++)
7730 unsigned int length;
7731 unsigned int decomposed[MAX_DECOMP_LENGTH];
7732 int type = get_decomposition (ch, &length, decomposed);
7734 if (type == UC_DECOMP_CANONICAL
7735 /* Consider only binary decompositions.
7736 Exclude singleton decompositions. */
7739 unsigned int code1 = decomposed[0];
7740 unsigned int code2 = decomposed[1];
7741 unsigned int combined = ch;
7743 /* Exclude decompositions where the first part is not a starter,
7744 i.e. is not of canonical combining class 0. */
7745 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7746 /* Exclude characters listed in CompositionExclusions.txt. */
7747 && !unicode_composition_exclusions[combined])
7749 /* The combined character must now also be a starter.
7751 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7754 if (!(code1 < 0x10000))
7756 if (!(code2 < 0x10000))
7758 if (!(combined < 0x10000))
7761 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7762 (code1 >> 8) & 0xff, code1 & 0xff,
7763 (code2 >> 8) & 0xff, code2 & 0xff,
7769 if (ferror (stream) || fclose (stream))
7771 fprintf (stderr, "error writing to '%s'\n", filename);
7776 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the file to create.
   FUNCTION_NAME is the name of the mapping function under test; the
   generated file defines MAP(c) as a call to it.
   FUNC computes the expected mapping; one { character, value } pair is
   written for each character below 0x110000 that FUNC does not map to
   itself.
   VERSION is the Unicode version, mentioned in the banner.
   Exit with an error message if the file cannot be opened or written.  */

static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character mapping functions.\n");
  fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  /* Name the generator as the other banners in this file do.  */
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-mapping-part1.h\"\n");
  fprintf (stream, "\n");

  /* The pairs are separated by ",\n"; the last one gets a plain "\n".  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int value = func (ch);

      if (value != ch)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fprintf (stream, "#include \"test-mapping-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7847 /* Construction of sparse 3-level tables. */
7848 #define TABLE mapping_table
7849 #define ELEMENT int32_t
7851 #define xmalloc malloc
7852 #define xrealloc realloc
7855 /* Output a simple character mapping table to the given file. */
7858 output_simple_mapping (const char *filename,
7859 unsigned int (*func) (unsigned int),
7860 const char *version)
7864 struct mapping_table t;
7865 unsigned int level1_offset, level2_offset, level3_offset;
7867 stream = fopen (filename, "w");
7870 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7874 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7875 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
7876 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
7881 mapping_table_init (&t);
7883 for (ch = 0; ch < 0x110000; ch++)
7885 int value = (int) func (ch) - (int) ch;
7887 mapping_table_add (&t, ch, value);
7890 mapping_table_finalize (&t);
7892 /* Offsets in t.result, in memory of this process. */
7894 5 * sizeof (uint32_t);
7896 5 * sizeof (uint32_t)
7897 + t.level1_size * sizeof (uint32_t);
7899 5 * sizeof (uint32_t)
7900 + t.level1_size * sizeof (uint32_t)
7901 + (t.level2_size << t.q) * sizeof (uint32_t);
7903 for (i = 0; i < 5; i++)
7904 fprintf (stream, "#define mapping_header_%d %d\n", i,
7905 ((uint32_t *) t.result)[i]);
7906 fprintf (stream, "static const\n");
7907 fprintf (stream, "struct\n");
7908 fprintf (stream, " {\n");
7909 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7910 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7911 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
7912 fprintf (stream, " }\n");
7913 fprintf (stream, "u_mapping =\n");
7914 fprintf (stream, "{\n");
7915 fprintf (stream, " {");
7916 if (t.level1_size > 8)
7917 fprintf (stream, "\n ");
7918 for (i = 0; i < t.level1_size; i++)
7921 if (i > 0 && (i % 8) == 0)
7922 fprintf (stream, "\n ");
7923 offset = ((uint32_t *) (t.result + level1_offset))[i];
7925 fprintf (stream, " %5d", -1);
7927 fprintf (stream, " %5zu",
7928 (offset - level2_offset) / sizeof (uint32_t));
7929 if (i+1 < t.level1_size)
7930 fprintf (stream, ",");
7932 if (t.level1_size > 8)
7933 fprintf (stream, "\n ");
7934 fprintf (stream, " },\n");
7935 fprintf (stream, " {");
7936 if (t.level2_size << t.q > 8)
7937 fprintf (stream, "\n ");
7938 for (i = 0; i < t.level2_size << t.q; i++)
7941 if (i > 0 && (i % 8) == 0)
7942 fprintf (stream, "\n ");
7943 offset = ((uint32_t *) (t.result + level2_offset))[i];
7945 fprintf (stream, " %5d", -1);
7947 fprintf (stream, " %5zu",
7948 (offset - level3_offset) / sizeof (int32_t));
7949 if (i+1 < t.level2_size << t.q)
7950 fprintf (stream, ",");
7952 if (t.level2_size << t.q > 8)
7953 fprintf (stream, "\n ");
7954 fprintf (stream, " },\n");
7955 fprintf (stream, " {");
7956 if (t.level3_size << t.p > 8)
7957 fprintf (stream, "\n ");
7958 for (i = 0; i < t.level3_size << t.p; i++)
7960 if (i > 0 && (i % 8) == 0)
7961 fprintf (stream, "\n ");
7962 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
7963 if (i+1 < t.level3_size << t.p)
7964 fprintf (stream, ",");
7966 if (t.level3_size << t.p > 8)
7967 fprintf (stream, "\n ");
7968 fprintf (stream, " }\n");
7969 fprintf (stream, "};\n");
7971 if (ferror (stream) || fclose (stream))
7973 fprintf (stderr, "error writing to '%s'\n", filename);
7978 /* ========================================================================= */
/* A special casing context.
   A context is negated through x -> -x.
   SCC_ALWAYS, the unconditional context, is its own negation.  */
enum
{
  SCC_ALWAYS,
  SCC_FINAL_SIGMA,
  SCC_AFTER_SOFT_DOTTED,
  SCC_MORE_ABOVE,
  SCC_BEFORE_DOT,
  SCC_AFTER_I
};
/* A special casing rule.  */
struct special_casing_rule
{
  /* The character the rule applies to.  */
  unsigned int code;
  /* The mappings, each up to 3 characters long and padded with zeros.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* The 2-letter language code the rule is restricted to, or NULL.  */
  const char *language;
  /* One of the SCC_* values, or the negative of one.  */
  int context;
};
/* The special casing rules.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Append NEW_RULE to casing_rules.  Only the pointer is stored; the rule
   itself is not copied.  The array grows by doubling, starting at 16
   elements.  Exit with an error message if memory is exhausted.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "memory exhausted\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
8024 /* Stores in casing_rules the special casing rules found in
8025 specialcasing_filename. */
8027 fill_casing_rules (const char *specialcasing_filename)
8031 stream = fopen (specialcasing_filename, "r");
8034 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
8038 casing_rules = NULL;
8039 num_casing_rules = 0;
8040 allocated_casing_rules = 0;
8050 unsigned int lower_mapping[3];
8051 unsigned int title_mapping[3];
8052 unsigned int upper_mapping[3];
8056 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
8059 if (buf[0] == '\0' || buf[0] == '#')
8064 code = strtoul (scanptr, &endptr, 16);
8065 if (endptr == scanptr)
8067 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8071 if (*scanptr != ';')
8073 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8078 /* Scan lower mapping. */
8079 for (i = 0; i < 3; i++)
8080 lower_mapping[i] = 0;
8081 for (i = 0; i < 3; i++)
8083 while (*scanptr == ' ')
8085 if (*scanptr == ';')
8087 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
8088 if (endptr == scanptr)
8090 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8095 if (*scanptr != ';')
8097 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8102 /* Scan title mapping. */
8103 for (i = 0; i < 3; i++)
8104 title_mapping[i] = 0;
8105 for (i = 0; i < 3; i++)
8107 while (*scanptr == ' ')
8109 if (*scanptr == ';')
8111 title_mapping[i] = strtoul (scanptr, &endptr, 16);
8112 if (endptr == scanptr)
8114 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8119 if (*scanptr != ';')
8121 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8126 /* Scan upper mapping. */
8127 for (i = 0; i < 3; i++)
8128 upper_mapping[i] = 0;
8129 for (i = 0; i < 3; i++)
8131 while (*scanptr == ' ')
8133 if (*scanptr == ';')
8135 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
8136 if (endptr == scanptr)
8138 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8143 if (*scanptr != ';')
8145 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8150 /* Scan language and context. */
8152 context = SCC_ALWAYS;
8153 while (*scanptr == ' ')
8155 if (*scanptr != '\0' && *scanptr != '#')
8157 const char *word_begin = scanptr;
8158 const char *word_end;
8160 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
8164 while (*scanptr == ' ')
8167 if (word_end - word_begin == 2)
8169 language = (char *) malloc ((word_end - word_begin) + 1);
8170 memcpy (language, word_begin, 2);
8171 language[word_end - word_begin] = '\0';
8172 word_begin = word_end = NULL;
8174 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
8176 word_begin = scanptr;
8177 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
8183 if (word_end > word_begin)
8185 bool negate = false;
8187 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
8192 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
8193 context = SCC_FINAL_SIGMA;
8194 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
8195 context = SCC_AFTER_SOFT_DOTTED;
8196 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
8197 context = SCC_MORE_ABOVE;
8198 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
8199 context = SCC_BEFORE_DOT;
8200 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
8201 context = SCC_AFTER_I;
8204 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
8208 context = - context;
8211 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
8213 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
8218 /* Store the rule. */
8220 struct special_casing_rule *new_rule =
8221 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8222 new_rule->code = code;
8223 new_rule->language = language;
8224 new_rule->context = context;
8225 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
8226 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
8227 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
8229 add_casing_rule (new_rule);
8233 if (ferror (stream) || fclose (stream))
8235 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* A casefolding rule.  */
struct casefold_rule
{
  /* The character the rule applies to.  */
  unsigned int code;
  /* The mapping, up to 3 characters long and padded with zeros.  */
  unsigned int mapping[3];
  /* The 2-letter language code the rule is restricted to, or NULL.  */
  const char *language;
};

/* The casefolding rules.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;

/* Stores in casefolding_rules the case folding rules found in
   casefolding_filename.
   Each line that is neither empty nor a '#' comment has the form
     code; type; mapping;
   where type is one of C, F, S, T.  Rules of type S are dropped; a rule of
   type T is stored twice, once for each of the languages "tr" and "az";
   every other rule is stored once, with language NULL.
   Exit with an error message if the file cannot be read or parsed, or if
   memory is exhausted.  */
static void
fill_casefolding_rules (const char *casefolding_filename)
{
  FILE *stream;

  stream = fopen (casefolding_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
      exit (1);
    }

  casefolding_rules = NULL;
  num_casefolding_rules = 0;
  allocated_casefolding_rules = 0;

  for (;;)
    {
      char buf[200+1];
      char *scanptr;
      char *endptr;
      unsigned int i;

      unsigned int code;
      char type;
      unsigned int mapping[3];

      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* Scan code.  */
      scanptr = buf;
      code = strtoul (scanptr, &endptr, 16);
      if (endptr == scanptr)
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr = endptr;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan type.  */
      while (*scanptr == ' ')
        scanptr++;

      switch (*scanptr)
        {
        case 'C': case 'F': case 'S': case 'T':
          type = *scanptr;
          break;
        default:
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Scan casefold mapping.  */
      for (i = 0; i < 3; i++)
        mapping[i] = 0;
      for (i = 0; i < 3; i++)
        {
          while (*scanptr == ' ')
            scanptr++;
          if (*scanptr == ';')
            break;
          mapping[i] = strtoul (scanptr, &endptr, 16);
          if (endptr == scanptr)
            {
              fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
              exit (1);
            }
          scanptr = endptr;
        }
      if (*scanptr != ';')
        {
          fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
          exit (1);
        }
      scanptr++;

      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
      if (type != 'S')
        {
          const char * const *languages;
          unsigned int languages_count;

          /* Type 'T' indicates that the rule is applicable to Turkish
             languages only.  */
          if (type == 'T')
            {
              static const char * const turkish_languages[] = { "tr", "az" };
              languages = turkish_languages;
              languages_count = 2;
            }
          else
            {
              static const char * const all_languages[] = { NULL };
              languages = all_languages;
              languages_count = 1;
            }

          for (i = 0; i < languages_count; i++)
            {
              /* Store a new rule.  */
              struct casefold_rule *new_rule =
                (struct casefold_rule *) malloc (sizeof (struct casefold_rule));

              if (new_rule == NULL)
                {
                  fprintf (stderr, "memory exhausted\n");
                  exit (1);
                }
              new_rule->code = code;
              memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
              new_rule->language = languages[i];

              if (num_casefolding_rules == allocated_casefolding_rules)
                {
                  unsigned int new_allocated = 2 * allocated_casefolding_rules;
                  struct casefold_rule **new_rules;

                  if (new_allocated < 16)
                    new_allocated = 16;
                  new_rules =
                    (struct casefold_rule **)
                    realloc (casefolding_rules,
                             new_allocated * sizeof (struct casefold_rule *));
                  if (new_rules == NULL)
                    {
                      fprintf (stderr, "memory exhausted\n");
                      exit (1);
                    }
                  casefolding_rules = new_rules;
                  allocated_casefolding_rules = new_allocated;
                }
              casefolding_rules[num_casefolding_rules++] = new_rule;
            }
        }
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
      exit (1);
    }
}
/* Casefold mapping, when it maps to a single character.  */
unsigned int unicode_casefold[0x110000];

/* Return the entry of unicode_casefold[] for CH.
   CH must be below 0x110000; no range check is done.  */
static unsigned int
to_casefold (unsigned int ch)
{
  return unicode_casefold[ch];
}
8410 /* Redistribute the casefolding_rules:
8411 - Rules that map to a single character, language independently, are stored
8412 in unicode_casefold.
8413 - Other rules are merged into casing_rules. */
8415 redistribute_casefolding_rules (void)
8417 unsigned int ch, i, j;
8419 /* Fill unicode_casefold[]. */
8420 for (ch = 0; ch < 0x110000; ch++)
8421 unicode_casefold[ch] = ch;
8422 for (i = 0; i < num_casefolding_rules; i++)
8424 struct casefold_rule *cfrule = casefolding_rules[i];
8426 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
8429 if (!(ch < 0x110000))
8431 unicode_casefold[ch] = cfrule->mapping[0];
8435 /* Extend the special casing rules by filling in their casefold_mapping[]
8437 for (j = 0; j < num_casing_rules; j++)
8439 struct special_casing_rule *rule = casing_rules[j];
8442 rule->casefold_mapping[0] = to_casefold (rule->code);
8443 for (k = 1; k < 3; k++)
8444 rule->casefold_mapping[k] = 0;
8447 /* Now merge the other casefolding rules into casing_rules. */
8448 for (i = 0; i < num_casefolding_rules; i++)
8450 struct casefold_rule *cfrule = casefolding_rules[i];
8452 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
8454 /* Find a rule that applies to the same code, same language, and it
8455 has context SCC_ALWAYS. At the same time, update all rules that
8456 have the same code and same or more specific language. */
8457 struct special_casing_rule *found_rule = NULL;
8459 for (j = 0; j < num_casing_rules; j++)
8461 struct special_casing_rule *rule = casing_rules[j];
8463 if (rule->code == cfrule->code
8464 && (cfrule->language == NULL
8465 || (rule->language != NULL
8466 && strcmp (rule->language, cfrule->language) == 0)))
8468 memcpy (rule->casefold_mapping, cfrule->mapping,
8469 sizeof (rule->casefold_mapping));
8471 if ((cfrule->language == NULL
8472 ? rule->language == NULL
8473 : rule->language != NULL
8474 && strcmp (rule->language, cfrule->language) == 0)
8475 && rule->context == SCC_ALWAYS)
8483 if (found_rule == NULL)
8485 /* Create a new rule. */
8486 struct special_casing_rule *new_rule =
8487 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
8489 /* Try to find a rule that applies to the same code, no language
8490 restriction, and with context SCC_ALWAYS. */
8491 for (j = 0; j < num_casing_rules; j++)
8493 struct special_casing_rule *rule = casing_rules[j];
8495 if (rule->code == cfrule->code
8496 && rule->context == SCC_ALWAYS
8497 && rule->language == NULL)
8505 new_rule->code = cfrule->code;
8506 new_rule->language = cfrule->language;
8507 new_rule->context = SCC_ALWAYS;
8508 if (found_rule != NULL)
8510 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
8511 sizeof (new_rule->lower_mapping));
8512 memcpy (new_rule->title_mapping, found_rule->title_mapping,
8513 sizeof (new_rule->title_mapping));
8514 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
8515 sizeof (new_rule->upper_mapping));
8521 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8522 for (k = 1; k < 3; k++)
8523 new_rule->lower_mapping[k] = 0;
8524 new_rule->title_mapping[0] = to_title (cfrule->code);
8525 for (k = 1; k < 3; k++)
8526 new_rule->title_mapping[k] = 0;
8527 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8528 for (k = 1; k < 3; k++)
8529 new_rule->upper_mapping[k] = 0;
8531 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8532 sizeof (new_rule->casefold_mapping));
8534 add_casing_rule (new_rule);
8541 compare_casing_rules (const void *a, const void *b)
8543 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8544 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8545 unsigned int a_code = a_rule->code;
8546 unsigned int b_code = b_rule->code;
8548 if (a_code < b_code)
8550 if (a_code > b_code)
8553 /* Sort the more specific rules before the more general ones. */
8554 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8555 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8559 sort_casing_rules (void)
8561 /* Sort the rules 1. by code, 2. by specificity. */
8562 if (num_casing_rules > 1)
8563 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8564 compare_casing_rules);
8567 /* Output the special casing rules. */
8569 output_casing_rules (const char *filename, const char *version)
8575 stream = fopen (filename, "w");
8578 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8582 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8583 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8584 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8586 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8587 fprintf (stream, "%%struct-type\n");
8588 fprintf (stream, "%%language=ANSI-C\n");
8589 fprintf (stream, "%%define slot-name code\n");
8590 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8591 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8592 fprintf (stream, "%%compare-lengths\n");
8593 fprintf (stream, "%%compare-strncmp\n");
8594 fprintf (stream, "%%readonly-tables\n");
8595 fprintf (stream, "%%omit-struct-type\n");
8596 fprintf (stream, "%%%%\n");
8599 for (i = 0; i < num_casing_rules; i++)
8601 struct special_casing_rule *rule = casing_rules[i];
8604 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8609 if (!(rule->code < 0x10000))
8611 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8615 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8616 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8618 fprintf (stream, "%d, ",
8619 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8621 context = rule->context;
8624 fprintf (stream, "-");
8625 context = - context;
8628 fprintf (stream, " ");
8632 fprintf (stream, "SCC_ALWAYS ");
8634 case SCC_FINAL_SIGMA:
8635 fprintf (stream, "SCC_FINAL_SIGMA ");
8637 case SCC_AFTER_SOFT_DOTTED:
8638 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8640 case SCC_MORE_ABOVE:
8641 fprintf (stream, "SCC_MORE_ABOVE ");
8643 case SCC_BEFORE_DOT:
8644 fprintf (stream, "SCC_BEFORE_DOT ");
8647 fprintf (stream, "SCC_AFTER_I ");
8652 fprintf (stream, ", ");
8654 if (rule->language != NULL)
8656 if (strlen (rule->language) != 2)
8658 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8661 fprintf (stream, "{ '\\0', '\\0' }, ");
8663 fprintf (stream, "{ ");
8664 for (j = 0; j < 3; j++)
8667 fprintf (stream, ", ");
8668 if (!(rule->upper_mapping[j] < 0x10000))
8670 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8673 if (rule->upper_mapping[j] != 0)
8674 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8676 fprintf (stream, " 0");
8678 fprintf (stream, " }, { ");
8679 for (j = 0; j < 3; j++)
8682 fprintf (stream, ", ");
8683 if (!(rule->lower_mapping[j] < 0x10000))
8685 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8688 if (rule->lower_mapping[j] != 0)
8689 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8691 fprintf (stream, " 0");
8693 fprintf (stream, " }, { ");
8694 for (j = 0; j < 3; j++)
8697 fprintf (stream, ", ");
8698 if (!(rule->title_mapping[j] < 0x10000))
8700 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8703 if (rule->title_mapping[j] != 0)
8704 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8706 fprintf (stream, " 0");
8708 fprintf (stream, " }, { ");
8709 for (j = 0; j < 3; j++)
8712 fprintf (stream, ", ");
8713 if (!(rule->casefold_mapping[j] < 0x10000))
8715 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8718 if (rule->casefold_mapping[j] != 0)
8719 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8721 fprintf (stream, " 0");
8723 fprintf (stream, " }\n");
8726 if (ferror (stream) || fclose (stream))
8728 fprintf (stderr, "error writing to '%s'\n", filename);
8733 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
/* Return true if CH is "cased" in the sense quoted above, i.e. if any of
   is_property_lowercase, is_property_uppercase or is_category_Lt holds
   for it.  */
static bool
is_cased (unsigned int ch)
{
  return (is_property_lowercase (ch)
          || is_property_uppercase (ch)
          || is_category_Lt (ch));
}
8747 /* Quoting the Unicode standard:
8748 Definition: A character is defined to be "case-ignorable" if it has the
8749 value MidLetter {or the value MidNumLet} for the Word_Break property or
8750 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8751 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8752 The text marked in braces was added in Unicode 5.1.0, see
8753 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8754 Definition of case-ignorable". */
8755 /* Since this predicate is only used for the "Before C" and "After C"
8756 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8757 This simplifies the evaluation of the regular expressions
8758 \p{cased} (\p{case-ignorable})* C
8760 C (\p{case-ignorable})* \p{cased}
8763 is_case_ignorable (unsigned int ch)
8765 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8766 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8767 || is_category_Mn (ch)
8768 || is_category_Me (ch)
8769 || is_category_Cf (ch)
8770 || is_category_Lm (ch)
8771 || is_category_Sk (ch))
8775 /* ------------------------------------------------------------------------- */
8777 /* Output all case related properties. */
8779 output_casing_properties (const char *version)
8781 #define PROPERTY(FN,P) \
8782 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
8783 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
8784 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
8785 PROPERTY(cased, cased)
8786 PROPERTY(ignorable, case_ignorable)
8790 /* ========================================================================= */
8793 main (int argc, char * argv[])
8795 const char *unicodedata_filename;
8796 const char *proplist_filename;
8797 const char *derivedproplist_filename;
8798 const char *scripts_filename;
8799 const char *blocks_filename;
8800 const char *proplist30_filename;
8801 const char *eastasianwidth_filename;
8802 const char *linebreak_filename;
8803 const char *wordbreakproperty_filename;
8804 const char *graphemebreakproperty_filename;
8805 const char *compositionexclusions_filename;
8806 const char *specialcasing_filename;
8807 const char *casefolding_filename;
8808 const char *version;
8812 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
8817 unicodedata_filename = argv[1];
8818 proplist_filename = argv[2];
8819 derivedproplist_filename = argv[3];
8820 scripts_filename = argv[4];
8821 blocks_filename = argv[5];
8822 proplist30_filename = argv[6];
8823 eastasianwidth_filename = argv[7];
8824 linebreak_filename = argv[8];
8825 wordbreakproperty_filename = argv[9];
8826 graphemebreakproperty_filename = argv[10];
8827 compositionexclusions_filename = argv[11];
8828 specialcasing_filename = argv[12];
8829 casefolding_filename = argv[13];
8832 fill_attributes (unicodedata_filename);
8833 clear_properties ();
8834 fill_properties (proplist_filename);
8835 fill_properties (derivedproplist_filename);
8836 fill_properties30 (proplist30_filename);
8837 fill_scripts (scripts_filename);
8838 fill_blocks (blocks_filename);
8839 fill_width (eastasianwidth_filename);
8840 fill_org_lbp (linebreak_filename);
8841 fill_org_wbp (wordbreakproperty_filename);
8842 fill_org_gbp (graphemebreakproperty_filename);
8843 fill_composition_exclusions (compositionexclusions_filename);
8844 fill_casing_rules (specialcasing_filename);
8845 fill_casefolding_rules (casefolding_filename);
8846 redistribute_casefolding_rules ();
8847 sort_casing_rules ();
8849 output_categories (version);
8850 output_category ("unictype/categ_of.h", version);
8851 output_combclass ("unictype/combining.h", version);
8852 output_bidi_category ("unictype/bidi_of.h", version);
8853 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
8854 output_decimal_digit ("unictype/decdigit.h", version);
8855 output_digit_test ("../tests/unictype/test-digit.h", version);
8856 output_digit ("unictype/digit.h", version);
8857 output_numeric_test ("../tests/unictype/test-numeric.h", version);
8858 output_numeric ("unictype/numeric.h", version);
8859 output_mirror ("unictype/mirror.h", version);
8860 output_properties (version);
8861 output_scripts (version);
8862 output_scripts_byname (version);
8863 output_blocks (version);
8864 output_ident_properties (version);
8865 output_nonspacing_property ("uniwidth/width.c.part");
8866 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
8867 output_old_ctype (version);
8869 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
8870 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
8871 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
8873 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
8874 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
8875 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
8877 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
8878 output_gbp_table ("unigbrk/gbrkprop.h", version);
8880 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
8881 debug_output_composition_tables ("uninorm/composition.txt");
8882 output_composition_tables ("uninorm/composition-table.gperf", version);
8884 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
8885 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
8886 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
8887 output_simple_mapping ("unicase/toupper.h", to_upper, version);
8888 output_simple_mapping ("unicase/tolower.h", to_lower, version);
8889 output_simple_mapping ("unicase/totitle.h", to_title, version);
8890 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
8891 output_casing_rules ("unicase/special-casing-table.gperf", version);
8892 output_casing_properties (version);
8898 * For Emacs M-x compile
8900 * compile-command: "
8901 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
8903 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
8904 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
8905 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
8906 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
8907 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
8908 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
8909 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
8910 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
8911 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
8912 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/GraphemeBreakProperty.txt \
8913 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
8914 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
8915 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
8917 && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \
8918 && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt