1 /* Generate Unicode conforming character classification tables from a UnicodeData file.
3 Copyright (C) 2000-2002, 2007-2009 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
20 $ gen-ctype /usr/local/share/Unidata/UnicodeData.txt \
21 /usr/local/share/Unidata/PropList.txt \
22 /usr/local/share/Unidata/DerivedCoreProperties.txt \
23 /usr/local/share/Unidata/Scripts.txt \
24 /usr/local/share/Unidata/Blocks.txt \
25 /usr/local/share/Unidata/PropList-3.0.1.txt \
   ... (the remaining arguments are not shown in this copy)
 */
36 /* ========================================================================= */
38 /* Reading UnicodeData.txt. */
41 /* This structure represents one line in the UnicodeData.txt file. */
/* fill_attribute populates it: string members receive either a strdup copy
   of the field or the literal "" when the field is empty (name is always
   copied), and upper/lower/title receive NONE when their field is empty.
   Elsewhere in this file a NULL name is treated as "no entry stored".  */
42 struct unicode_attribute
44 const char *name; /* Character name */
45 const char *category; /* General category */
46 const char *combining; /* Canonical combining class */
47 const char *bidi; /* Bidirectional category */
48 const char *decomposition; /* Character decomposition mapping */
49 const char *decdigit; /* Decimal digit value */
50 const char *digit; /* Digit value */
51 const char *numeric; /* Numeric value */
52 bool mirrored; /* mirrored */
53 const char *oldname; /* Old Unicode 1.0 name */
54 const char *comment; /* Comment */
55 unsigned int upper; /* Uppercase mapping */
56 unsigned int lower; /* Lowercase mapping */
57 unsigned int title; /* Titlecase mapping */
60 /* Missing fields are represented with "" for strings, and NONE for characters. */
/* All-ones unsigned value; fill_attribute stores it in the upper, lower and
   title members when the corresponding field is empty.  */
62 #define NONE (~(unsigned int)0)
64 /* The entire contents of the UnicodeData.txt file. */
/* One slot per value in 0..0x10FFFF; fill_attribute and the is_category_*
   predicates index it directly by character code.  */
65 struct unicode_attribute unicode_attributes [0x110000];
67 /* Stores in unicode_attributes[i] the values from the given fields. */
/* FIELD1..FIELD14 are the textual fields that follow the code value on a
   UnicodeData.txt line (fill_attributes passes them in file order).
   NOTE(review): no NULL check on the strdup results is visible here.
   NOTE(review): several short lines of this function are missing from this
   view (the test in front of the "index too large" message and the statement
   executed for "Cs" entries) -- confirm against the full source.  */
69 fill_attribute (unsigned int i,
70 const char *field1, const char *field2,
71 const char *field3, const char *field4,
72 const char *field5, const char *field6,
73 const char *field7, const char *field8,
74 const char *field9, const char *field10,
75 const char *field11, const char *field12,
76 const char *field13, const char *field14)
78 struct unicode_attribute * uni;
82 fprintf (stderr, "index too large\n");
85 if (strcmp (field2, "Cs") == 0)
86 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
88 uni = &unicode_attributes[i];
89 /* Copy the strings. */
/* The name is always duplicated; every other string field is duplicated only
   when non-empty and otherwise points at the literal "".  */
90 uni->name = strdup (field1);
91 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
92 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
93 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
94 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
95 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
96 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
97 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
/* mirrored is true exactly when field9 starts with 'Y'.  */
98 uni->mirrored = (field9[0] == 'Y');
99 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
100 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
/* Case mappings are parsed as hexadecimal; an empty field yields NONE.  */
101 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
102 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
103 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
106 /* Maximum length of a field in the UnicodeData.txt file. */
109 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
110 Reads up to (but excluding) DELIM.
111 Returns 1 when a field was successfully read, otherwise 0. */
/* NOTE(review): the declarations of c and count, the CR test, the store into
   BUFFER and the return statements are missing from this view; only the
   visible statements are described below.  */
113 getfield (FILE *stream, char *buffer, int delim)
/* Consume characters one at a time until end of file or DELIM.  */
118 for (; (c = getc (stream)), (c != EOF && c != delim); )
120 /* The original unicode.org UnicodeData.txt file happens to have
121 CR/LF line terminators. Silently convert to LF. */
125 /* Put c into the buffer. */
/* count is incremented first; reaching FIELDLEN - 1 triggers the diagnostic
   below (what follows the message is not visible -- TODO confirm).  */
126 if (++count >= FIELDLEN - 1)
128 fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
141 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. */
/* Reads UNICODEDATA_FILENAME and records each line through fill_attribute.
   Each line is split into 15 fields with getfield; the first field is the
   code value in hexadecimal.
   NOTE(review): the loop header, local declarations, the tests on n, the
   exit calls and the loop that walks a First/Last range are missing from
   this view -- comments below cover only the visible statements.  */
144 fill_attributes (const char *unicodedata_filename)
148 char field0[FIELDLEN];
149 char field1[FIELDLEN];
150 char field2[FIELDLEN];
151 char field3[FIELDLEN];
152 char field4[FIELDLEN];
153 char field5[FIELDLEN];
154 char field6[FIELDLEN];
155 char field7[FIELDLEN];
156 char field8[FIELDLEN];
157 char field9[FIELDLEN];
158 char field10[FIELDLEN];
159 char field11[FIELDLEN];
160 char field12[FIELDLEN];
161 char field13[FIELDLEN];
162 char field14[FIELDLEN];
/* Mark every slot as empty; a NULL name is what the predicates test for.  */
165 for (i = 0; i < 0x110000; i++)
166 unicode_attributes[i].name = NULL;
168 stream = fopen (unicodedata_filename, "r");
171 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
/* Read one line: fields 0..13 end at ';', field 14 ends at the newline.
   n accumulates the getfield results, i.e. the number of fields read.  */
180 n = getfield (stream, field0, ';');
181 n += getfield (stream, field1, ';');
182 n += getfield (stream, field2, ';');
183 n += getfield (stream, field3, ';');
184 n += getfield (stream, field4, ';');
185 n += getfield (stream, field5, ';');
186 n += getfield (stream, field6, ';');
187 n += getfield (stream, field7, ';');
188 n += getfield (stream, field8, ';');
189 n += getfield (stream, field9, ';');
190 n += getfield (stream, field10, ';');
191 n += getfield (stream, field11, ';');
192 n += getfield (stream, field12, ';');
193 n += getfield (stream, field13, ';');
194 n += getfield (stream, field14, '\n');
199 fprintf (stderr, "short line in '%s':%d\n",
200 unicodedata_filename, lineno);
/* i is the code value of this line.  */
203 i = strtoul (field0, NULL, 16);
/* A name ending in ", First>" opens a range whose end is on the next line.  */
205 && strlen (field1) >= 9
206 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
208 /* Deal with a range. */
/* Read the following line into the same buffers; it must be the range end.  */
210 n = getfield (stream, field0, ';');
211 n += getfield (stream, field1, ';');
212 n += getfield (stream, field2, ';');
213 n += getfield (stream, field3, ';');
214 n += getfield (stream, field4, ';');
215 n += getfield (stream, field5, ';');
216 n += getfield (stream, field6, ';');
217 n += getfield (stream, field7, ';');
218 n += getfield (stream, field8, ';');
219 n += getfield (stream, field9, ';');
220 n += getfield (stream, field10, ';');
221 n += getfield (stream, field11, ';');
222 n += getfield (stream, field12, ';');
223 n += getfield (stream, field13, ';');
224 n += getfield (stream, field14, '\n');
227 fprintf (stderr, "missing end range in '%s':%d\n",
228 unicodedata_filename, lineno);
/* The second line's name must be of the form "<..., Last>".  */
231 if (!(field1[0] == '<'
232 && strlen (field1) >= 8
233 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
235 fprintf (stderr, "missing end range in '%s':%d\n",
236 unicodedata_filename, lineno);
/* Cut off the ", Last>" suffix; j is the code value of the range end.  */
239 field1[strlen (field1) - 7] = '\0';
240 j = strtoul (field0, NULL, 16);
/* field1+1 skips the leading '<' of the range name.  Presumably this call
   sits inside a loop from the range start to j -- the loop line itself is
   not visible here.  */
242 fill_attribute (i, field1+1, field2, field3, field4, field5,
243 field6, field7, field8, field9, field10,
244 field11, field12, field13, field14);
248 /* Single character line */
249 fill_attribute (i, field1, field2, field3, field4, field5,
250 field6, field7, field8, field9, field10,
251 field11, field12, field13, field14);
/* A read error or a failing fclose is reported with the message below.  */
254 if (ferror (stream) || fclose (stream))
256 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
261 /* ========================================================================= */
263 /* General category. */
264 /* See Unicode 3.0 book, section 4.5. */
/* Holds when CH has a stored entry (non-NULL name) whose category string
   begins with 'L', i.e. any of the L* categories.  */
268 is_category_L (unsigned int ch)
270 return (unicode_attributes[ch].name != NULL
271 && unicode_attributes[ch].category[0] == 'L');
/* Holds when CH has a stored entry whose category string begins with "Lu".  */
275 is_category_Lu (unsigned int ch)
277 return (unicode_attributes[ch].name != NULL
278 && unicode_attributes[ch].category[0] == 'L'
279 && unicode_attributes[ch].category[1] == 'u');
/* Holds when CH has a stored entry whose category string begins with "Ll".  */
283 is_category_Ll (unsigned int ch)
285 return (unicode_attributes[ch].name != NULL
286 && unicode_attributes[ch].category[0] == 'L'
287 && unicode_attributes[ch].category[1] == 'l');
/* Holds when CH has a stored entry whose category string begins with "Lt".  */
291 is_category_Lt (unsigned int ch)
293 return (unicode_attributes[ch].name != NULL
294 && unicode_attributes[ch].category[0] == 'L'
295 && unicode_attributes[ch].category[1] == 't');
/* Holds when CH has a stored entry whose category string begins with "Lm".  */
299 is_category_Lm (unsigned int ch)
301 return (unicode_attributes[ch].name != NULL
302 && unicode_attributes[ch].category[0] == 'L'
303 && unicode_attributes[ch].category[1] == 'm');
/* Holds when CH has a stored entry whose category string begins with "Lo".  */
307 is_category_Lo (unsigned int ch)
309 return (unicode_attributes[ch].name != NULL
310 && unicode_attributes[ch].category[0] == 'L'
311 && unicode_attributes[ch].category[1] == 'o');
/* Holds when CH has a stored entry whose category string begins with 'M',
   i.e. any of the M* categories.  */
315 is_category_M (unsigned int ch)
317 return (unicode_attributes[ch].name != NULL
318 && unicode_attributes[ch].category[0] == 'M');
/* Holds when CH has a stored entry whose category string begins with "Mn".  */
322 is_category_Mn (unsigned int ch)
324 return (unicode_attributes[ch].name != NULL
325 && unicode_attributes[ch].category[0] == 'M'
326 && unicode_attributes[ch].category[1] == 'n');
/* Holds when CH has a stored entry whose category string begins with "Mc".  */
330 is_category_Mc (unsigned int ch)
332 return (unicode_attributes[ch].name != NULL
333 && unicode_attributes[ch].category[0] == 'M'
334 && unicode_attributes[ch].category[1] == 'c');
/* Holds when CH has a stored entry whose category string begins with "Me".  */
338 is_category_Me (unsigned int ch)
340 return (unicode_attributes[ch].name != NULL
341 && unicode_attributes[ch].category[0] == 'M'
342 && unicode_attributes[ch].category[1] == 'e');
/* Holds when CH has a stored entry whose category string begins with 'N',
   i.e. any of the N* categories.  */
346 is_category_N (unsigned int ch)
348 return (unicode_attributes[ch].name != NULL
349 && unicode_attributes[ch].category[0] == 'N');
/* Holds when CH has a stored entry whose category string begins with "Nd".  */
353 is_category_Nd (unsigned int ch)
355 return (unicode_attributes[ch].name != NULL
356 && unicode_attributes[ch].category[0] == 'N'
357 && unicode_attributes[ch].category[1] == 'd');
/* Holds when CH has a stored entry whose category string begins with "Nl".  */
361 is_category_Nl (unsigned int ch)
363 return (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'N'
365 && unicode_attributes[ch].category[1] == 'l');
/* Holds when CH has a stored entry whose category string begins with "No".  */
369 is_category_No (unsigned int ch)
371 return (unicode_attributes[ch].name != NULL
372 && unicode_attributes[ch].category[0] == 'N'
373 && unicode_attributes[ch].category[1] == 'o');
/* Holds when CH has a stored entry whose category string begins with 'P',
   i.e. any of the P* categories.  */
377 is_category_P (unsigned int ch)
379 return (unicode_attributes[ch].name != NULL
380 && unicode_attributes[ch].category[0] == 'P');
/* Holds when CH has a stored entry whose category string begins with "Pc".  */
384 is_category_Pc (unsigned int ch)
386 return (unicode_attributes[ch].name != NULL
387 && unicode_attributes[ch].category[0] == 'P'
388 && unicode_attributes[ch].category[1] == 'c');
/* Holds when CH has a stored entry whose category string begins with "Pd".  */
392 is_category_Pd (unsigned int ch)
394 return (unicode_attributes[ch].name != NULL
395 && unicode_attributes[ch].category[0] == 'P'
396 && unicode_attributes[ch].category[1] == 'd');
/* Holds when CH has a stored entry whose category string begins with "Ps".  */
400 is_category_Ps (unsigned int ch)
402 return (unicode_attributes[ch].name != NULL
403 && unicode_attributes[ch].category[0] == 'P'
404 && unicode_attributes[ch].category[1] == 's');
/* Holds when CH has a stored entry whose category string begins with "Pe".  */
408 is_category_Pe (unsigned int ch)
410 return (unicode_attributes[ch].name != NULL
411 && unicode_attributes[ch].category[0] == 'P'
412 && unicode_attributes[ch].category[1] == 'e');
/* Holds when CH has a stored entry whose category string begins with "Pi".  */
416 is_category_Pi (unsigned int ch)
418 return (unicode_attributes[ch].name != NULL
419 && unicode_attributes[ch].category[0] == 'P'
420 && unicode_attributes[ch].category[1] == 'i');
/* Holds when CH has a stored entry whose category string begins with "Pf".  */
424 is_category_Pf (unsigned int ch)
426 return (unicode_attributes[ch].name != NULL
427 && unicode_attributes[ch].category[0] == 'P'
428 && unicode_attributes[ch].category[1] == 'f');
/* Holds when CH has a stored entry whose category string begins with "Po".  */
432 is_category_Po (unsigned int ch)
434 return (unicode_attributes[ch].name != NULL
435 && unicode_attributes[ch].category[0] == 'P'
436 && unicode_attributes[ch].category[1] == 'o');
/* Holds when CH has a stored entry whose category string begins with 'S',
   i.e. any of the S* categories.  */
440 is_category_S (unsigned int ch)
442 return (unicode_attributes[ch].name != NULL
443 && unicode_attributes[ch].category[0] == 'S');
/* Holds when CH has a stored entry whose category string begins with "Sm".  */
447 is_category_Sm (unsigned int ch)
449 return (unicode_attributes[ch].name != NULL
450 && unicode_attributes[ch].category[0] == 'S'
451 && unicode_attributes[ch].category[1] == 'm');
/* Holds when CH has a stored entry whose category string begins with "Sc".  */
455 is_category_Sc (unsigned int ch)
457 return (unicode_attributes[ch].name != NULL
458 && unicode_attributes[ch].category[0] == 'S'
459 && unicode_attributes[ch].category[1] == 'c');
/* Holds when CH has a stored entry whose category string begins with "Sk".  */
463 is_category_Sk (unsigned int ch)
465 return (unicode_attributes[ch].name != NULL
466 && unicode_attributes[ch].category[0] == 'S'
467 && unicode_attributes[ch].category[1] == 'k');
/* Holds when CH has a stored entry whose category string begins with "So".  */
471 is_category_So (unsigned int ch)
473 return (unicode_attributes[ch].name != NULL
474 && unicode_attributes[ch].category[0] == 'S'
475 && unicode_attributes[ch].category[1] == 'o');
/* Holds when CH has a stored entry whose category string begins with 'Z',
   i.e. any of the Z* categories.  */
479 is_category_Z (unsigned int ch)
481 return (unicode_attributes[ch].name != NULL
482 && unicode_attributes[ch].category[0] == 'Z');
/* Holds when CH has a stored entry whose category string begins with "Zs".  */
486 is_category_Zs (unsigned int ch)
488 return (unicode_attributes[ch].name != NULL
489 && unicode_attributes[ch].category[0] == 'Z'
490 && unicode_attributes[ch].category[1] == 's');
/* Holds when CH has a stored entry whose category string begins with "Zl".  */
494 is_category_Zl (unsigned int ch)
496 return (unicode_attributes[ch].name != NULL
497 && unicode_attributes[ch].category[0] == 'Z'
498 && unicode_attributes[ch].category[1] == 'l');
/* Holds when CH has a stored entry whose category string begins with "Zp".  */
502 is_category_Zp (unsigned int ch)
504 return (unicode_attributes[ch].name != NULL
505 && unicode_attributes[ch].category[0] == 'Z'
506 && unicode_attributes[ch].category[1] == 'p');
/* Holds when CH has no stored entry at all (NULL name), or has an entry
   whose category string begins with 'C'.  Unlike the other single-letter
   predicates, a missing entry counts as a match here.  */
510 is_category_C (unsigned int ch)
512 return (unicode_attributes[ch].name == NULL
513 || unicode_attributes[ch].category[0] == 'C');
/* Holds when CH has a stored entry whose category string begins with "Cc".  */
517 is_category_Cc (unsigned int ch)
519 return (unicode_attributes[ch].name != NULL
520 && unicode_attributes[ch].category[0] == 'C'
521 && unicode_attributes[ch].category[1] == 'c');
/* Holds when CH has a stored entry whose category string begins with "Cf".  */
525 is_category_Cf (unsigned int ch)
527 return (unicode_attributes[ch].name != NULL
528 && unicode_attributes[ch].category[0] == 'C'
529 && unicode_attributes[ch].category[1] == 'f');
/* Holds when CH lies in 0xD800..0xDFFF.  Decided from the code value alone,
   without consulting unicode_attributes (fill_attribute's own comment says
   that "Cs" lines are ignored when the table is filled).  */
533 is_category_Cs (unsigned int ch)
535 return (ch >= 0xd800 && ch < 0xe000);
/* Holds when CH has a stored entry whose category string begins with "Co".  */
539 is_category_Co (unsigned int ch)
541 return (unicode_attributes[ch].name != NULL
542 && unicode_attributes[ch].category[0] == 'C'
543 && unicode_attributes[ch].category[1] == 'o');
/* Holds when CH has no stored entry (NULL name) and is outside the
   0xD800..0xDFFF range that is_category_Cs covers.  */
547 is_category_Cn (unsigned int ch)
549 return (unicode_attributes[ch].name == NULL
550 && !(ch >= 0xd800 && ch < 0xe000));
553 /* Output a boolean property in a human readable format. */
/* Writes to FILENAME the values in 0..0x10FFFF accepted by PREDICATE, as
   "0xXXXX" lines or "0xXXXX..0xXXXX" range lines.
   NOTE(review): the tests calling PREDICATE on ch, the updates of `last`,
   the #else/#endif lines and the error exits are missing from this view.  */
555 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
560 stream = fopen (filename, "w");
563 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Disabled variant: one output line per value.  */
567 #if 0 /* This yields huge text output. */
568 for (ch = 0; ch < 0x110000; ch++)
571 fprintf (stream, "0x%04X\n", ch);
/* Variant with ranges: consecutive accepted values are merged.  */
574 for (ch = 0; ch < 0x110000; ch++)
577 unsigned int first = ch;
/* Extend the run while the next value is still in range and accepted.  */
580 while (ch + 1 < 0x110000 && predicate (ch + 1))
584 fprintf (stream, "0x%04X..0x%04X\n", first, last);
586 fprintf (stream, "0x%04X\n", ch);
/* A write error or a failing fclose is reported with the message below.  */
590 if (ferror (stream) || fclose (stream))
592 fprintf (stderr, "error writing to '%s'\n", filename);
597 /* Output the unit test for a boolean property. */
/* Writes to FILENAME a generated C test: a licence header, an include of
   "test-predicate-part1.h", one "{ first, last }" pair per run of values
   accepted by PREDICATE, a definition of PREDICATE(c) as EXPRESSION, and an
   include of "test-predicate-part2.h".
   NOTE(review): the tests calling PREDICATE on ch, the bookkeeping that
   decides when ",\n" is emitted, and the error exits are missing from this
   view.  */
599 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
605 stream = fopen (filename, "w");
608 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* Fixed header of the generated file.  */
612 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
613 fprintf (stream, "/* Test the Unicode character type functions.\n");
614 fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
615 fprintf (stream, "\n");
616 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
617 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
618 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
619 fprintf (stream, " (at your option) any later version.\n");
620 fprintf (stream, "\n");
621 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
622 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
623 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
624 fprintf (stream, " GNU General Public License for more details.\n");
625 fprintf (stream, "\n");
626 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
627 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
628 fprintf (stream, "\n");
629 fprintf (stream, "#include \"test-predicate-part1.h\"\n");
630 fprintf (stream, "\n");
/* Emit the accepted values as [first, last] intervals.  */
633 for (ch = 0; ch < 0x110000; ch++)
636 unsigned int first = ch;
/* Extend the run while the next value is still in range and accepted.  */
639 while (ch + 1 < 0x110000 && predicate (ch + 1))
643 fprintf (stream, ",\n");
644 fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
648 fprintf (stream, "\n");
650 fprintf (stream, "\n");
651 fprintf (stream, "#define PREDICATE(c) %s\n", expression);
652 fprintf (stream, "#include \"test-predicate-part2.h\"\n");
/* A write error or a failing fclose is reported with the message below.  */
654 if (ferror (stream) || fclose (stream))
656 fprintf (stderr, "error writing to '%s'\n", filename);
661 /* Construction of sparse 3-level tables. */
/* Parameters set before including 3levelbit.h.  The code below uses
   struct predicate_table and predicate_table_init/add/finalize, presumably
   generated by that header from TABLE -- confirm in 3levelbit.h.
   xmalloc/xrealloc are mapped straight to malloc/realloc here.  */
662 #define TABLE predicate_table
663 #define xmalloc malloc
664 #define xrealloc realloc
665 #include "3levelbit.h"
667 /* Output a boolean property in a three-level bitmap. */
/* Writes to FILENAME a C fragment that defines a static const struct named
   NAME with header, level1, level2 and level3 arrays taken from the table
   built for PREDICATE.  COMMENT is echoed in the generated header comment;
   VERSION is presumably the argument of the "for Unicode %s" line (its
   argument line is not visible here).
   NOTE(review): local declarations, the test guarding predicate_table_add,
   the left-hand sides of the offset assignments, the tests choosing between
   the "-1" and the computed entries, and the error exits are missing from
   this view.
   NOTE(review): "%5zd" is given size_t-typed arguments and "%d" is given
   uint32_t values; "%zu" and PRIu32 would match the types -- confirm before
   changing, since it affects generated output only on unusual platforms.  */
669 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
673 struct predicate_table t;
674 unsigned int level1_offset, level2_offset, level3_offset;
676 stream = fopen (filename, "w");
679 fprintf (stderr, "cannot open '%s' for writing\n", filename);
683 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
684 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
685 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table over all values 0..0x10FFFF.  */
690 predicate_table_init (&t);
692 for (ch = 0; ch < 0x110000; ch++)
694 predicate_table_add (&t, ch);
696 predicate_table_finalize (&t);
698 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past the 5-word header, past
   level1, and past level2 (level2_size << q words) respectively.  */
700 5 * sizeof (uint32_t);
702 5 * sizeof (uint32_t)
703 + t.level1_size * sizeof (uint32_t);
705 5 * sizeof (uint32_t)
706 + t.level1_size * sizeof (uint32_t)
707 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Expose the five header words of t.result as header_0..header_4 macros.  */
709 for (i = 0; i < 5; i++)
711 fprintf (stream, "#define header_%d %d\n", i,
712 ((uint32_t *) t.result)[i]);
/* Declaration of the generated struct.  */
714 fprintf (stream, "static const\n");
715 fprintf (stream, "struct\n");
716 fprintf (stream, " {\n");
717 fprintf (stream, " int header[1];\n");
718 fprintf (stream, " int level1[%zu];\n", t.level1_size);
719 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
720 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
721 fprintf (stream, " }\n");
722 fprintf (stream, "%s =\n", name);
723 fprintf (stream, "{\n");
/* The single header element is word 1 of t.result.  */
724 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
/* level1 entries.  */
725 fprintf (stream, " {");
726 if (t.level1_size > 1)
727 fprintf (stream, "\n ");
728 for (i = 0; i < t.level1_size; i++)
/* i % 1 is always 0, so every entry after the first starts a new line.  */
731 if (i > 0 && (i % 1) == 0)
732 fprintf (stream, "\n ");
733 offset = ((uint32_t *) (t.result + level1_offset))[i];
735 fprintf (stream, " %5d", -1);
/* Otherwise the entry is emitted as a C expression built from the number of
   words before level2 and the entry's word index within level2.  */
737 fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd",
738 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
739 if (i+1 < t.level1_size)
740 fprintf (stream, ",");
742 if (t.level1_size > 1)
743 fprintf (stream, "\n ");
744 fprintf (stream, " },\n");
/* level2 entries, same layout rules as level1.  */
745 fprintf (stream, " {");
746 if (t.level2_size << t.q > 1)
747 fprintf (stream, "\n ");
748 for (i = 0; i < t.level2_size << t.q; i++)
751 if (i > 0 && (i % 1) == 0)
752 fprintf (stream, "\n ");
753 offset = ((uint32_t *) (t.result + level2_offset))[i];
755 fprintf (stream, " %5d", -1);
757 fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd",
758 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
759 if (i+1 < t.level2_size << t.q)
760 fprintf (stream, ",");
762 if (t.level2_size << t.q > 1)
763 fprintf (stream, "\n ");
764 fprintf (stream, " },\n");
/* level3 words in hexadecimal, four per output line.  */
765 fprintf (stream, " {");
766 if (t.level3_size << t.p > 4)
767 fprintf (stream, "\n ");
768 for (i = 0; i < t.level3_size << t.p; i++)
770 if (i > 0 && (i % 4) == 0)
771 fprintf (stream, "\n ");
772 fprintf (stream, " 0x%08X",
773 ((uint32_t *) (t.result + level3_offset))[i]);
774 if (i+1 < t.level3_size << t.p)
775 fprintf (stream, ",");
777 if (t.level3_size << t.p > 4)
778 fprintf (stream, "\n ");
779 fprintf (stream, " }\n");
780 fprintf (stream, "};\n");
/* A write error or a failing fclose is reported with the message below.  */
782 if (ferror (stream) || fclose (stream))
784 fprintf (stderr, "error writing to '%s'\n", filename);
789 /* Output all categories. */
/* NOTE(review): only the helper macro is visible in this view; the
   CATEGORY(...) invocations and the end of the function are missing.  */
791 output_categories (const char *version)
/* CATEGORY(C) produces three files for category C from is_category_C:
   "categ_C.txt" (readable list), "test-categ_C.c" (unit test) and
   "categ_C.h" (bitmap table named u_categ_C).  */
793 #define CATEGORY(C) \
794 debug_output_predicate ("categ_" #C ".txt", is_category_ ## C); \
795 output_predicate_test ("test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
796 output_predicate ("categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for general categories.  Each two-letter category owns one bit
   (bits 0..29 in the order listed); each single-letter mask is the bitwise
   OR of the masks of its two-letter categories.
   NOTE(review): the line opening this enumeration is not visible here.  */
839 UC_CATEGORY_MASK_L = 0x0000001f,
840 UC_CATEGORY_MASK_Lu = 0x00000001,
841 UC_CATEGORY_MASK_Ll = 0x00000002,
842 UC_CATEGORY_MASK_Lt = 0x00000004,
843 UC_CATEGORY_MASK_Lm = 0x00000008,
844 UC_CATEGORY_MASK_Lo = 0x00000010,
845 UC_CATEGORY_MASK_M = 0x000000e0,
846 UC_CATEGORY_MASK_Mn = 0x00000020,
847 UC_CATEGORY_MASK_Mc = 0x00000040,
848 UC_CATEGORY_MASK_Me = 0x00000080,
849 UC_CATEGORY_MASK_N = 0x00000700,
850 UC_CATEGORY_MASK_Nd = 0x00000100,
851 UC_CATEGORY_MASK_Nl = 0x00000200,
852 UC_CATEGORY_MASK_No = 0x00000400,
853 UC_CATEGORY_MASK_P = 0x0003f800,
854 UC_CATEGORY_MASK_Pc = 0x00000800,
855 UC_CATEGORY_MASK_Pd = 0x00001000,
856 UC_CATEGORY_MASK_Ps = 0x00002000,
857 UC_CATEGORY_MASK_Pe = 0x00004000,
858 UC_CATEGORY_MASK_Pi = 0x00008000,
859 UC_CATEGORY_MASK_Pf = 0x00010000,
860 UC_CATEGORY_MASK_Po = 0x00020000,
861 UC_CATEGORY_MASK_S = 0x003c0000,
862 UC_CATEGORY_MASK_Sm = 0x00040000,
863 UC_CATEGORY_MASK_Sc = 0x00080000,
864 UC_CATEGORY_MASK_Sk = 0x00100000,
865 UC_CATEGORY_MASK_So = 0x00200000,
866 UC_CATEGORY_MASK_Z = 0x01c00000,
867 UC_CATEGORY_MASK_Zs = 0x00400000,
868 UC_CATEGORY_MASK_Zl = 0x00800000,
869 UC_CATEGORY_MASK_Zp = 0x01000000,
870 UC_CATEGORY_MASK_C = 0x3e000000,
871 UC_CATEGORY_MASK_Cc = 0x02000000,
872 UC_CATEGORY_MASK_Cf = 0x04000000,
873 UC_CATEGORY_MASK_Cs = 0x08000000,
874 UC_CATEGORY_MASK_Co = 0x10000000,
875 UC_CATEGORY_MASK_Cn = 0x20000000
/* Maps a general category name of one or two characters to its
   UC_CATEGORY_MASK_* value: a lone first letter yields the combined mask,
   a first letter plus a second letter yields the single-bit mask.
   NOTE(review): the `case` labels of the outer switch, the breaks, and the
   statement reached for an invalid name are missing from this view; the
   grouping below is inferred from the masks each inner switch returns.  */
879 general_category_byname (const char *category_name)
/* Only non-empty names of length 1 or 2 are looked up.  */
881 if (category_name[0] != '\0'
882 && (category_name[1] == '\0' || category_name[2] == '\0'))
883 switch (category_name[0])
/* L* names.  */
886 switch (category_name[1])
888 case '\0': return UC_CATEGORY_MASK_L;
889 case 'u': return UC_CATEGORY_MASK_Lu;
890 case 'l': return UC_CATEGORY_MASK_Ll;
891 case 't': return UC_CATEGORY_MASK_Lt;
892 case 'm': return UC_CATEGORY_MASK_Lm;
893 case 'o': return UC_CATEGORY_MASK_Lo;
/* M* names.  */
897 switch (category_name[1])
899 case '\0': return UC_CATEGORY_MASK_M;
900 case 'n': return UC_CATEGORY_MASK_Mn;
901 case 'c': return UC_CATEGORY_MASK_Mc;
902 case 'e': return UC_CATEGORY_MASK_Me;
/* N* names.  */
906 switch (category_name[1])
908 case '\0': return UC_CATEGORY_MASK_N;
909 case 'd': return UC_CATEGORY_MASK_Nd;
910 case 'l': return UC_CATEGORY_MASK_Nl;
911 case 'o': return UC_CATEGORY_MASK_No;
/* P* names.  */
915 switch (category_name[1])
917 case '\0': return UC_CATEGORY_MASK_P;
918 case 'c': return UC_CATEGORY_MASK_Pc;
919 case 'd': return UC_CATEGORY_MASK_Pd;
920 case 's': return UC_CATEGORY_MASK_Ps;
921 case 'e': return UC_CATEGORY_MASK_Pe;
922 case 'i': return UC_CATEGORY_MASK_Pi;
923 case 'f': return UC_CATEGORY_MASK_Pf;
924 case 'o': return UC_CATEGORY_MASK_Po;
/* S* names.  */
928 switch (category_name[1])
930 case '\0': return UC_CATEGORY_MASK_S;
931 case 'm': return UC_CATEGORY_MASK_Sm;
932 case 'c': return UC_CATEGORY_MASK_Sc;
933 case 'k': return UC_CATEGORY_MASK_Sk;
934 case 'o': return UC_CATEGORY_MASK_So;
/* Z* names.  */
938 switch (category_name[1])
940 case '\0': return UC_CATEGORY_MASK_Z;
941 case 's': return UC_CATEGORY_MASK_Zs;
942 case 'l': return UC_CATEGORY_MASK_Zl;
943 case 'p': return UC_CATEGORY_MASK_Zp;
/* C* names.  */
947 switch (category_name[1])
949 case '\0': return UC_CATEGORY_MASK_C;
950 case 'c': return UC_CATEGORY_MASK_Cc;
951 case 'f': return UC_CATEGORY_MASK_Cf;
952 case 's': return UC_CATEGORY_MASK_Cs;
953 case 'o': return UC_CATEGORY_MASK_Co;
954 case 'n': return UC_CATEGORY_MASK_Cn;
958 /* Invalid category name. */
962 /* Construction of sparse 3-level tables. */
/* Parameters for the table template that provides struct category_table and
   category_table_init/add/finalize as used by output_category.
   NOTE(review): the #include that consumes these macros is not visible in
   this view.  DEFAULT is the bit index of UC_CATEGORY_MASK_Cn
   (0x20000000 == 1 << 29); elements are stored as bit indices, not masks.  */
963 #define TABLE category_table
964 #define ELEMENT uint8_t
965 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
966 #define xmalloc malloc
967 #define xrealloc realloc
970 /* Output the per-character category table. */
/* Writes to FILENAME a C fragment defining the static const struct
   u_category with level1, level2 and a bit-packed level3 array.  Each
   character's category is stored as the bit index of its single-bit
   UC_CATEGORY_MASK_* value.  VERSION is presumably the argument of the
   "for Unicode %s" line (its argument line is not visible here).
   NOTE(review): local declarations, the fallback assignment for characters
   without an entry, the action taken when `value` is not a single bit, the
   tests choosing the "-1" entries, the left-hand sides of the offset and
   level3_packed assignments, and the error exits are missing from this view.
   NOTE(review): no NULL check on the calloc result is visible.  */
972 output_category (const char *filename, const char *version)
976 struct category_table t;
977 unsigned int level1_offset, level2_offset, level3_offset;
978 uint16_t *level3_packed;
980 stream = fopen (filename, "w");
983 fprintf (stderr, "cannot open '%s' for writing\n", filename);
987 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
988 fprintf (stream, "/* Categories of Unicode characters. */\n");
989 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
994 category_table_init (&t);
996 for (ch = 0; ch < 0x110000; ch++)
999 unsigned int log2_value;
/* Values in the surrogate range get the Cs mask from the code value alone;
   characters with an entry are looked up by their category string.  */
1001 if (is_category_Cs (ch))
1002 value = UC_CATEGORY_MASK_Cs;
1003 else if (unicode_attributes[ch].name != NULL)
1004 value = general_category_byname (unicode_attributes[ch].category);
1008 /* Now value should contain exactly one bit. */
1009 if (value == 0 || ((value & (value - 1)) != 0))
/* Count the shifts needed to reduce value to 1: the index of its set bit.  */
1012 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1014 category_table_add (&t, ch, log2_value);
1017 category_table_finalize (&t);
1019 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past the 5-word header, past
   level1, and past level2 (level2_size << q words) respectively.  */
1021 5 * sizeof (uint32_t);
1023 5 * sizeof (uint32_t)
1024 + t.level1_size * sizeof (uint32_t);
1026 5 * sizeof (uint32_t)
1027 + t.level1_size * sizeof (uint32_t)
1028 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Expose the five header words as category_header_0..4 macros.  */
1030 for (i = 0; i < 5; i++)
1031 fprintf (stream, "#define category_header_%d %d\n", i,
1032 ((uint32_t *) t.result)[i]);
1033 fprintf (stream, "static const\n");
1034 fprintf (stream, "struct\n");
1035 fprintf (stream, " {\n");
1036 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1037 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units: 5 bits per entry, plus one spare unit.  */
1038 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1039 (1 << t.p) * 5 / 16);
1040 fprintf (stream, " }\n");
1041 fprintf (stream, "u_category =\n");
1042 fprintf (stream, "{\n");
/* level1 entries, eight per output line.  */
1043 fprintf (stream, " {");
1044 if (t.level1_size > 8)
1045 fprintf (stream, "\n ");
1046 for (i = 0; i < t.level1_size; i++)
1049 if (i > 0 && (i % 8) == 0)
1050 fprintf (stream, "\n ");
1051 offset = ((uint32_t *) (t.result + level1_offset))[i];
1053 fprintf (stream, " %5d", -1);
/* Otherwise emit the entry's word index within level2.  */
1055 fprintf (stream, " %5zd",
1056 (offset - level2_offset) / sizeof (uint32_t));
1057 if (i+1 < t.level1_size)
1058 fprintf (stream, ",");
1060 if (t.level1_size > 8)
1061 fprintf (stream, "\n ");
1062 fprintf (stream, " },\n");
/* level2 entries, eight per output line.  */
1063 fprintf (stream, " {");
1064 if (t.level2_size << t.q > 8)
1065 fprintf (stream, "\n ");
1066 for (i = 0; i < t.level2_size << t.q; i++)
1069 if (i > 0 && (i % 8) == 0)
1070 fprintf (stream, "\n ");
1071 offset = ((uint32_t *) (t.result + level2_offset))[i];
1073 fprintf (stream, " %5d", -1);
/* Otherwise emit the entry's element index within the unpacked level3.  */
1075 fprintf (stream, " %5zd",
1076 (offset - level3_offset) / sizeof (uint8_t));
1077 if (i+1 < t.level2_size << t.q)
1078 fprintf (stream, ",");
1080 if (t.level2_size << t.q > 8)
1081 fprintf (stream, "\n ");
1082 fprintf (stream, " },\n");
1083 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1084 not 32-bit units, in order to make the lookup function easier. */
/* Zero-filled buffer with one extra unit, since the loop below also touches
   level3_packed[j+1].  */
1087 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1088 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits k..k+4 of the 32-bit window formed by units j, j+1.  */
1090 unsigned int j = (i * 5) / 16;
1091 unsigned int k = (i * 5) % 16;
1092 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
/* OR the entry into the window, then write both halves back.  */
1093 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1094 level3_packed[j] = value & 0xffff;
1095 level3_packed[j+1] = value >> 16;
/* Packed level3 units in hexadecimal, eight per output line.  */
1097 fprintf (stream, " {");
1098 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1099 fprintf (stream, "\n ");
1100 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1102 if (i > 0 && (i % 8) == 0)
1103 fprintf (stream, "\n ");
1104 fprintf (stream, " 0x%04x", level3_packed[i]);
1105 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1106 fprintf (stream, ",");
1108 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1109 fprintf (stream, "\n ");
1110 fprintf (stream, " }\n");
1111 free (level3_packed);
1112 fprintf (stream, "};\n");
/* A write error or a failing fclose is reported with the message below.  */
1114 if (ferror (stream) || fclose (stream))
1116 fprintf (stderr, "error writing to '%s'\n", filename);
1121 /* ========================================================================= */
1123 /* Canonical combining class. */
1124 /* See Unicode 3.0 book, section 4.2. */
1127 /* Construction of sparse 3-level tables. */
/* Parameters for the table template that provides struct combclass_table and
   combclass_table_init/add/finalize as used by output_combclass.
   NOTE(review): the DEFAULT definition and the #include that consumes these
   macros are not visible in this view.  */
1128 #define TABLE combclass_table
1129 #define ELEMENT uint8_t
1131 #define xmalloc malloc
1132 #define xrealloc realloc
1135 /* Output the per-character combining class table. */
/* Writes to FILENAME a C fragment defining the static const struct
   u_combclass with level1, level2 and level3 arrays; level3 holds one
   unsigned char per character, parsed from the `combining` string of each
   stored entry.  VERSION is presumably the argument of the "for Unicode %s"
   line (its argument line is not visible here).
   NOTE(review): local declarations, the action taken when the parsed value is
   outside 0..255, the tests choosing the "-1" entries, the left-hand sides
   of the offset assignments and the error exits are missing from this view.
   NOTE(review): atoi gives no error indication for non-numeric text.  */
1137 output_combclass (const char *filename, const char *version)
1141 struct combclass_table t;
1142 unsigned int level1_offset, level2_offset, level3_offset;
1144 stream = fopen (filename, "w");
1147 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1151 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1152 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1153 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1158 combclass_table_init (&t);
/* Only characters with a stored entry are added to the table.  */
1160 for (ch = 0; ch < 0x110000; ch++)
1161 if (unicode_attributes[ch].name != NULL)
1163 int value = atoi (unicode_attributes[ch].combining);
/* The value must fit the uint8_t element type.  */
1164 if (!(value >= 0 && value <= 255))
1166 combclass_table_add (&t, ch, value);
1169 combclass_table_finalize (&t);
1171 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: past the 5-word header, past
   level1, and past level2 (level2_size << q words) respectively.  */
1173 5 * sizeof (uint32_t);
1175 5 * sizeof (uint32_t)
1176 + t.level1_size * sizeof (uint32_t);
1178 5 * sizeof (uint32_t)
1179 + t.level1_size * sizeof (uint32_t)
1180 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Expose the five header words as combclass_header_0..4 macros.  */
1182 for (i = 0; i < 5; i++)
1183 fprintf (stream, "#define combclass_header_%d %d\n", i,
1184 ((uint32_t *) t.result)[i]);
1185 fprintf (stream, "static const\n");
1186 fprintf (stream, "struct\n");
1187 fprintf (stream, " {\n");
1188 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1189 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1190 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1191 fprintf (stream, " }\n");
1192 fprintf (stream, "u_combclass =\n");
1193 fprintf (stream, "{\n");
/* level1 entries, eight per output line.  */
1194 fprintf (stream, " {");
1195 if (t.level1_size > 8)
1196 fprintf (stream, "\n ");
1197 for (i = 0; i < t.level1_size; i++)
1200 if (i > 0 && (i % 8) == 0)
1201 fprintf (stream, "\n ");
1202 offset = ((uint32_t *) (t.result + level1_offset))[i];
1204 fprintf (stream, " %5d", -1);
/* Otherwise emit the entry's word index within level2.  */
1206 fprintf (stream, " %5zd",
1207 (offset - level2_offset) / sizeof (uint32_t));
1208 if (i+1 < t.level1_size)
1209 fprintf (stream, ",");
1211 if (t.level1_size > 8)
1212 fprintf (stream, "\n ");
1213 fprintf (stream, " },\n");
/* level2 entries, eight per output line.  */
1214 fprintf (stream, " {");
1215 if (t.level2_size << t.q > 8)
1216 fprintf (stream, "\n ");
1217 for (i = 0; i < t.level2_size << t.q; i++)
1220 if (i > 0 && (i % 8) == 0)
1221 fprintf (stream, "\n ");
1222 offset = ((uint32_t *) (t.result + level2_offset))[i];
1224 fprintf (stream, " %5d", -1);
/* Otherwise emit the entry's element index within level3.  */
1226 fprintf (stream, " %5zd",
1227 (offset - level3_offset) / sizeof (uint8_t));
1228 if (i+1 < t.level2_size << t.q)
1229 fprintf (stream, ",");
1231 if (t.level2_size << t.q > 8)
1232 fprintf (stream, "\n ");
1233 fprintf (stream, " },\n");
/* level3 bytes in decimal, eight per output line.  */
1234 fprintf (stream, " {");
1235 if (t.level3_size << t.p > 8)
1236 fprintf (stream, "\n ");
1237 for (i = 0; i < t.level3_size << t.p; i++)
1239 if (i > 0 && (i % 8) == 0)
1240 fprintf (stream, "\n ");
1241 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1242 if (i+1 < t.level3_size << t.p)
1243 fprintf (stream, ",");
1245 if (t.level3_size << t.p > 8)
1246 fprintf (stream, "\n ");
1247 fprintf (stream, " }\n");
1248 fprintf (stream, "};\n");
/* A write error or a failing fclose is reported with the message below.  */
1250 if (ferror (stream) || fclose (stream))
1252 fprintf (stderr, "error writing to '%s'\n", filename);
1257 /* ========================================================================= */
/* Bidirectional category.  */
/* See Unicode 3.0 book, section 4.3.  */

/* Bidi categories.  The enumerators are numbered consecutively from 0;
   these numeric values are what bidi_category_byname returns.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON   /* Other Neutral */
};

/* Return the UC_BIDI_* value whose abbreviation equals CATEGORY_NAME
   (e.g. "AL" -> UC_BIDI_AL).  CATEGORY_NAME must be a NUL-terminated
   string.  Aborts if it is not one of the known abbreviations.  */
static int
bidi_category_byname (const char *category_name)
{
  /* Indexed by enumerator, so the table cannot get out of step with the
     enum even if entries are reordered.  */
  static const char *const names[] =
    {
      [UC_BIDI_L] = "L",
      [UC_BIDI_LRE] = "LRE",
      [UC_BIDI_LRO] = "LRO",
      [UC_BIDI_R] = "R",
      [UC_BIDI_AL] = "AL",
      [UC_BIDI_RLE] = "RLE",
      [UC_BIDI_RLO] = "RLO",
      [UC_BIDI_PDF] = "PDF",
      [UC_BIDI_EN] = "EN",
      [UC_BIDI_ES] = "ES",
      [UC_BIDI_ET] = "ET",
      [UC_BIDI_AN] = "AN",
      [UC_BIDI_CS] = "CS",
      [UC_BIDI_NSM] = "NSM",
      [UC_BIDI_BN] = "BN",
      [UC_BIDI_B] = "B",
      [UC_BIDI_S] = "S",
      [UC_BIDI_WS] = "WS",
      [UC_BIDI_ON] = "ON"
    };
  unsigned int k;

  for (k = 0; k < sizeof (names) / sizeof (names[0]); k++)
    if (strcmp (category_name, names[k]) == 0)
      return (int) k;

  /* Invalid bidi category name.  */
  abort ();
}
1437 get_bidi_category (unsigned int ch)
1439 if (unicode_attributes[ch].name != NULL)
1440 return bidi_category_byname (unicode_attributes[ch].bidi);
1443 /* The bidi category of unassigned characters depends on the range.
1444 See UTR #9 and DerivedBidiClass.txt. */
1445 if ((ch >= 0x0590 && ch <= 0x05FF)
1446 || (ch >= 0x07FB && ch <= 0x08FF)
1447 || (ch >= 0xFB37 && ch <= 0xFB45)
1448 || (ch >= 0x10800 && ch <= 0x10FFF))
1450 else if ((ch >= 0x0600 && ch <= 0x07BF)
1451 || (ch >= 0x2064 && ch <= 0x2069)
1452 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1453 || (ch >= 0xFDFE && ch <= 0xFEFE))
1455 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1456 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1457 || (ch & 0xFFFF) == 0xFFFE
1458 || (ch & 0xFFFF) == 0xFFFF
1459 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1466 /* Construction of sparse 3-level tables. */
1467 #define TABLE bidi_category_table
1468 #define ELEMENT uint8_t
1469 #define DEFAULT UC_BIDI_L
1470 #define xmalloc malloc
1471 #define xrealloc realloc
1474 /* Output the per-character bidi category table. */
1476 output_bidi_category (const char *filename, const char *version)
1480 struct bidi_category_table t;
1481 unsigned int level1_offset, level2_offset, level3_offset;
1482 uint16_t *level3_packed;
1484 stream = fopen (filename, "w");
1487 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1491 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1492 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1493 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1498 bidi_category_table_init (&t);
1500 for (ch = 0; ch < 0x110000; ch++)
1502 int value = get_bidi_category (ch);
1504 bidi_category_table_add (&t, ch, value);
1507 bidi_category_table_finalize (&t);
1509 /* Offsets in t.result, in memory of this process. */
1511 5 * sizeof (uint32_t);
1513 5 * sizeof (uint32_t)
1514 + t.level1_size * sizeof (uint32_t);
1516 5 * sizeof (uint32_t)
1517 + t.level1_size * sizeof (uint32_t)
1518 + (t.level2_size << t.q) * sizeof (uint32_t);
1520 for (i = 0; i < 5; i++)
1521 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1522 ((uint32_t *) t.result)[i]);
1523 fprintf (stream, "static const\n");
1524 fprintf (stream, "struct\n");
1525 fprintf (stream, " {\n");
1526 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1527 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1528 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1529 (1 << t.p) * 5 / 16);
1530 fprintf (stream, " }\n");
1531 fprintf (stream, "u_bidi_category =\n");
1532 fprintf (stream, "{\n");
1533 fprintf (stream, " {");
1534 if (t.level1_size > 8)
1535 fprintf (stream, "\n ");
1536 for (i = 0; i < t.level1_size; i++)
1539 if (i > 0 && (i % 8) == 0)
1540 fprintf (stream, "\n ");
1541 offset = ((uint32_t *) (t.result + level1_offset))[i];
1543 fprintf (stream, " %5d", -1);
1545 fprintf (stream, " %5zd",
1546 (offset - level2_offset) / sizeof (uint32_t));
1547 if (i+1 < t.level1_size)
1548 fprintf (stream, ",");
1550 if (t.level1_size > 8)
1551 fprintf (stream, "\n ");
1552 fprintf (stream, " },\n");
1553 fprintf (stream, " {");
1554 if (t.level2_size << t.q > 8)
1555 fprintf (stream, "\n ");
1556 for (i = 0; i < t.level2_size << t.q; i++)
1559 if (i > 0 && (i % 8) == 0)
1560 fprintf (stream, "\n ");
1561 offset = ((uint32_t *) (t.result + level2_offset))[i];
1563 fprintf (stream, " %5d", -1);
1565 fprintf (stream, " %5zd",
1566 (offset - level3_offset) / sizeof (uint8_t));
1567 if (i+1 < t.level2_size << t.q)
1568 fprintf (stream, ",");
1570 if (t.level2_size << t.q > 8)
1571 fprintf (stream, "\n ");
1572 fprintf (stream, " },\n");
1573 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1574 not 32-bit units, in order to make the lookup function easier. */
1577 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1578 for (i = 0; i < t.level3_size << t.p; i++)
1580 unsigned int j = (i * 5) / 16;
1581 unsigned int k = (i * 5) % 16;
1582 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1583 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1584 level3_packed[j] = value & 0xffff;
1585 level3_packed[j+1] = value >> 16;
1587 fprintf (stream, " {");
1588 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1589 fprintf (stream, "\n ");
1590 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1592 if (i > 0 && (i % 8) == 0)
1593 fprintf (stream, "\n ");
1594 fprintf (stream, " 0x%04x", level3_packed[i]);
1595 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1596 fprintf (stream, ",");
1598 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1599 fprintf (stream, "\n ");
1600 fprintf (stream, " }\n");
1601 free (level3_packed);
1602 fprintf (stream, "};\n");
1604 if (ferror (stream) || fclose (stream))
1606 fprintf (stderr, "error writing to '%s'\n", filename);
1611 /* ========================================================================= */
1613 /* Decimal digit value. */
1614 /* See Unicode 3.0 book, section 4.6. */
1617 get_decdigit_value (unsigned int ch)
1619 if (unicode_attributes[ch].name != NULL
1620 && unicode_attributes[ch].decdigit[0] != '\0')
1621 return atoi (unicode_attributes[ch].decdigit);
1625 /* Construction of sparse 3-level tables. */
1626 #define TABLE decdigit_table
1627 #define ELEMENT uint8_t
1629 #define xmalloc malloc
1630 #define xrealloc realloc
/* Output the unit test for the per-character decimal digit value table.
   Writes one "{ code point, value }" line for every character that has a
   decimal digit value.  FILENAME is the file to create; VERSION is the
   Unicode version string quoted in the header comment.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
           version);

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_decdigit_value (ch);

      if (!(value >= -1 && value < 10))
        abort ();

      if (value >= 0)
        {
          /* The separator is emitted before each entry but the first, so
             the list never ends with a trailing comma.  */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1679 /* Output the per-character decimal digit value table. */
1681 output_decimal_digit (const char *filename, const char *version)
1685 struct decdigit_table t;
1686 unsigned int level1_offset, level2_offset, level3_offset;
1688 stream = fopen (filename, "w");
1691 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1695 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1696 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1697 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1702 decdigit_table_init (&t);
1704 for (ch = 0; ch < 0x110000; ch++)
1706 int value = 1 + get_decdigit_value (ch);
1708 if (!(value >= 0 && value <= 10))
1711 decdigit_table_add (&t, ch, value);
1714 decdigit_table_finalize (&t);
1716 /* Offsets in t.result, in memory of this process. */
1718 5 * sizeof (uint32_t);
1720 5 * sizeof (uint32_t)
1721 + t.level1_size * sizeof (uint32_t);
1723 5 * sizeof (uint32_t)
1724 + t.level1_size * sizeof (uint32_t)
1725 + (t.level2_size << t.q) * sizeof (uint32_t);
1727 for (i = 0; i < 5; i++)
1728 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1729 ((uint32_t *) t.result)[i]);
1730 fprintf (stream, "static const\n");
1731 fprintf (stream, "struct\n");
1732 fprintf (stream, " {\n");
1733 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1734 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1735 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1737 fprintf (stream, " }\n");
1738 fprintf (stream, "u_decdigit =\n");
1739 fprintf (stream, "{\n");
1740 fprintf (stream, " {");
1741 if (t.level1_size > 8)
1742 fprintf (stream, "\n ");
1743 for (i = 0; i < t.level1_size; i++)
1746 if (i > 0 && (i % 8) == 0)
1747 fprintf (stream, "\n ");
1748 offset = ((uint32_t *) (t.result + level1_offset))[i];
1750 fprintf (stream, " %5d", -1);
1752 fprintf (stream, " %5zd",
1753 (offset - level2_offset) / sizeof (uint32_t));
1754 if (i+1 < t.level1_size)
1755 fprintf (stream, ",");
1757 if (t.level1_size > 8)
1758 fprintf (stream, "\n ");
1759 fprintf (stream, " },\n");
1760 fprintf (stream, " {");
1761 if (t.level2_size << t.q > 8)
1762 fprintf (stream, "\n ");
1763 for (i = 0; i < t.level2_size << t.q; i++)
1766 if (i > 0 && (i % 8) == 0)
1767 fprintf (stream, "\n ");
1768 offset = ((uint32_t *) (t.result + level2_offset))[i];
1770 fprintf (stream, " %5d", -1);
1772 fprintf (stream, " %5zd",
1773 (offset - level3_offset) / sizeof (uint8_t));
1774 if (i+1 < t.level2_size << t.q)
1775 fprintf (stream, ",");
1777 if (t.level2_size << t.q > 8)
1778 fprintf (stream, "\n ");
1779 fprintf (stream, " },\n");
1780 /* Pack the level3 array. Each entry needs 4 bits only. */
1781 fprintf (stream, " {");
1782 if (t.level3_size << (t.p - 1) > 8)
1783 fprintf (stream, "\n ");
1784 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1786 if (i > 0 && (i % 8) == 0)
1787 fprintf (stream, "\n ");
1788 fprintf (stream, " 0x%02x",
1789 ((uint8_t *) (t.result + level3_offset))[2*i]
1790 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1791 if (i+1 < t.level3_size << (t.p - 1))
1792 fprintf (stream, ",");
1794 if (t.level3_size << (t.p - 1) > 8)
1795 fprintf (stream, "\n ");
1796 fprintf (stream, " }\n");
1797 fprintf (stream, "};\n");
1799 if (ferror (stream) || fclose (stream))
1801 fprintf (stderr, "error writing to '%s'\n", filename);
1806 /* ========================================================================= */
1809 /* See Unicode 3.0 book, section 4.6. */
1812 get_digit_value (unsigned int ch)
1814 if (unicode_attributes[ch].name != NULL
1815 && unicode_attributes[ch].digit[0] != '\0')
1816 return atoi (unicode_attributes[ch].digit);
/* Output the unit test for the per-character digit value table.
   Writes one "{ code point, value }" line for every character that has a
   digit value.  FILENAME is the file to create; VERSION is the Unicode
   version string quoted in the header comment.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
           version);

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_digit_value (ch);

      if (!(value >= -1 && value < 10))
        abort ();

      if (value >= 0)
        {
          /* The separator is emitted before each entry but the first, so
             the list never ends with a trailing comma.  */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1866 /* Output the per-character digit value table. */
1868 output_digit (const char *filename, const char *version)
1872 struct decdigit_table t;
1873 unsigned int level1_offset, level2_offset, level3_offset;
1875 stream = fopen (filename, "w");
1878 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1882 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1883 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1884 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1889 decdigit_table_init (&t);
1891 for (ch = 0; ch < 0x110000; ch++)
1893 int value = 1 + get_digit_value (ch);
1895 if (!(value >= 0 && value <= 10))
1898 decdigit_table_add (&t, ch, value);
1901 decdigit_table_finalize (&t);
1903 /* Offsets in t.result, in memory of this process. */
1905 5 * sizeof (uint32_t);
1907 5 * sizeof (uint32_t)
1908 + t.level1_size * sizeof (uint32_t);
1910 5 * sizeof (uint32_t)
1911 + t.level1_size * sizeof (uint32_t)
1912 + (t.level2_size << t.q) * sizeof (uint32_t);
1914 for (i = 0; i < 5; i++)
1915 fprintf (stream, "#define digit_header_%d %d\n", i,
1916 ((uint32_t *) t.result)[i]);
1917 fprintf (stream, "static const\n");
1918 fprintf (stream, "struct\n");
1919 fprintf (stream, " {\n");
1920 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1921 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1922 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1924 fprintf (stream, " }\n");
1925 fprintf (stream, "u_digit =\n");
1926 fprintf (stream, "{\n");
1927 fprintf (stream, " {");
1928 if (t.level1_size > 8)
1929 fprintf (stream, "\n ");
1930 for (i = 0; i < t.level1_size; i++)
1933 if (i > 0 && (i % 8) == 0)
1934 fprintf (stream, "\n ");
1935 offset = ((uint32_t *) (t.result + level1_offset))[i];
1937 fprintf (stream, " %5d", -1);
1939 fprintf (stream, " %5zd",
1940 (offset - level2_offset) / sizeof (uint32_t));
1941 if (i+1 < t.level1_size)
1942 fprintf (stream, ",");
1944 if (t.level1_size > 8)
1945 fprintf (stream, "\n ");
1946 fprintf (stream, " },\n");
1947 fprintf (stream, " {");
1948 if (t.level2_size << t.q > 8)
1949 fprintf (stream, "\n ");
1950 for (i = 0; i < t.level2_size << t.q; i++)
1953 if (i > 0 && (i % 8) == 0)
1954 fprintf (stream, "\n ");
1955 offset = ((uint32_t *) (t.result + level2_offset))[i];
1957 fprintf (stream, " %5d", -1);
1959 fprintf (stream, " %5zd",
1960 (offset - level3_offset) / sizeof (uint8_t));
1961 if (i+1 < t.level2_size << t.q)
1962 fprintf (stream, ",");
1964 if (t.level2_size << t.q > 8)
1965 fprintf (stream, "\n ");
1966 fprintf (stream, " },\n");
1967 /* Pack the level3 array. Each entry needs 4 bits only. */
1968 fprintf (stream, " {");
1969 if (t.level3_size << (t.p - 1) > 8)
1970 fprintf (stream, "\n ");
1971 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1973 if (i > 0 && (i % 8) == 0)
1974 fprintf (stream, "\n ");
1975 fprintf (stream, " 0x%02x",
1976 ((uint8_t *) (t.result + level3_offset))[2*i]
1977 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1978 if (i+1 < t.level3_size << (t.p - 1))
1979 fprintf (stream, ",");
1981 if (t.level3_size << (t.p - 1) > 8)
1982 fprintf (stream, "\n ");
1983 fprintf (stream, " }\n");
1984 fprintf (stream, "};\n");
1986 if (ferror (stream) || fclose (stream))
1988 fprintf (stderr, "error writing to '%s'\n", filename);
1993 /* ========================================================================= */
1995 /* Numeric value. */
1996 /* See Unicode 3.0 book, section 4.6. */
1998 typedef struct { int numerator; int denominator; } uc_fraction_t;
2000 static uc_fraction_t
2001 get_numeric_value (unsigned int ch)
2003 uc_fraction_t value;
2005 if (unicode_attributes[ch].name != NULL
2006 && unicode_attributes[ch].numeric[0] != '\0')
2008 const char *str = unicode_attributes[ch].numeric;
2009 /* str is of the form "integer" or "integer/posinteger". */
2010 value.numerator = atoi (str);
2011 if (strchr (str, '/') != NULL)
2012 value.denominator = atoi (strchr (str, '/') + 1);
2014 value.denominator = 1;
2018 value.numerator = 0;
2019 value.denominator = 0;
2024 /* Output the unit test for the per-character numeric value table. */
2026 output_numeric_test (const char *filename, const char *version)
2032 stream = fopen (filename, "w");
2035 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2039 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2040 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2041 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2045 for (ch = 0; ch < 0x110000; ch++)
2047 uc_fraction_t value = get_numeric_value (ch);
2049 if (value.numerator != 0 || value.denominator != 0)
2052 fprintf (stream, ",\n");
2053 fprintf (stream, " { 0x%04X, %d, %d }",
2054 ch, value.numerator, value.denominator);
2059 fprintf (stream, "\n");
2061 if (ferror (stream) || fclose (stream))
2063 fprintf (stderr, "error writing to '%s'\n", filename);
2068 /* Construction of sparse 3-level tables. */
2069 #define TABLE numeric_table
2070 #define ELEMENT uint8_t
2072 #define xmalloc malloc
2073 #define xrealloc realloc
2076 /* Output the per-character numeric value table. */
2078 output_numeric (const char *filename, const char *version)
2081 uc_fraction_t fractions[128];
2082 unsigned int nfractions;
2083 unsigned int ch, i, j;
2084 struct numeric_table t;
2085 unsigned int level1_offset, level2_offset, level3_offset;
2086 uint16_t *level3_packed;
2088 stream = fopen (filename, "w");
2091 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2095 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2096 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2097 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2100 /* Create table of occurring fractions. */
2102 for (ch = 0; ch < 0x110000; ch++)
2104 uc_fraction_t value = get_numeric_value (ch);
2106 for (i = 0; i < nfractions; i++)
2107 if (value.numerator == fractions[i].numerator
2108 && value.denominator == fractions[i].denominator)
2110 if (i == nfractions)
2112 if (nfractions == 128)
2114 for (i = 0; i < nfractions; i++)
2115 if (value.denominator < fractions[i].denominator
2116 || (value.denominator == fractions[i].denominator
2117 && value.numerator < fractions[i].numerator))
2119 for (j = nfractions; j > i; j--)
2120 fractions[j] = fractions[j - 1];
2121 fractions[i] = value;
2126 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2128 fprintf (stream, "{\n");
2129 for (i = 0; i < nfractions; i++)
2131 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2132 fractions[i].denominator);
2133 if (i+1 < nfractions)
2134 fprintf (stream, ",");
2135 fprintf (stream, "\n");
2137 fprintf (stream, "};\n");
2141 numeric_table_init (&t);
2143 for (ch = 0; ch < 0x110000; ch++)
2145 uc_fraction_t value = get_numeric_value (ch);
2147 for (i = 0; i < nfractions; i++)
2148 if (value.numerator == fractions[i].numerator
2149 && value.denominator == fractions[i].denominator)
2151 if (i == nfractions)
2154 numeric_table_add (&t, ch, i);
2157 numeric_table_finalize (&t);
2159 /* Offsets in t.result, in memory of this process. */
2161 5 * sizeof (uint32_t);
2163 5 * sizeof (uint32_t)
2164 + t.level1_size * sizeof (uint32_t);
2166 5 * sizeof (uint32_t)
2167 + t.level1_size * sizeof (uint32_t)
2168 + (t.level2_size << t.q) * sizeof (uint32_t);
2170 for (i = 0; i < 5; i++)
2171 fprintf (stream, "#define numeric_header_%d %d\n", i,
2172 ((uint32_t *) t.result)[i]);
2173 fprintf (stream, "static const\n");
2174 fprintf (stream, "struct\n");
2175 fprintf (stream, " {\n");
2176 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2177 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2178 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2179 (1 << t.p) * 7 / 16);
2180 fprintf (stream, " }\n");
2181 fprintf (stream, "u_numeric =\n");
2182 fprintf (stream, "{\n");
2183 fprintf (stream, " {");
2184 if (t.level1_size > 8)
2185 fprintf (stream, "\n ");
2186 for (i = 0; i < t.level1_size; i++)
2189 if (i > 0 && (i % 8) == 0)
2190 fprintf (stream, "\n ");
2191 offset = ((uint32_t *) (t.result + level1_offset))[i];
2193 fprintf (stream, " %5d", -1);
2195 fprintf (stream, " %5zd",
2196 (offset - level2_offset) / sizeof (uint32_t));
2197 if (i+1 < t.level1_size)
2198 fprintf (stream, ",");
2200 if (t.level1_size > 8)
2201 fprintf (stream, "\n ");
2202 fprintf (stream, " },\n");
2203 fprintf (stream, " {");
2204 if (t.level2_size << t.q > 8)
2205 fprintf (stream, "\n ");
2206 for (i = 0; i < t.level2_size << t.q; i++)
2209 if (i > 0 && (i % 8) == 0)
2210 fprintf (stream, "\n ");
2211 offset = ((uint32_t *) (t.result + level2_offset))[i];
2213 fprintf (stream, " %5d", -1);
2215 fprintf (stream, " %5zd",
2216 (offset - level3_offset) / sizeof (uint8_t));
2217 if (i+1 < t.level2_size << t.q)
2218 fprintf (stream, ",");
2220 if (t.level2_size << t.q > 8)
2221 fprintf (stream, "\n ");
2222 fprintf (stream, " },\n");
2223 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2224 not 32-bit units, in order to make the lookup function easier. */
2227 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2228 for (i = 0; i < t.level3_size << t.p; i++)
2230 unsigned int j = (i * 7) / 16;
2231 unsigned int k = (i * 7) % 16;
2232 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2233 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2234 level3_packed[j] = value & 0xffff;
2235 level3_packed[j+1] = value >> 16;
2237 fprintf (stream, " {");
2238 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2239 fprintf (stream, "\n ");
2240 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2242 if (i > 0 && (i % 8) == 0)
2243 fprintf (stream, "\n ");
2244 fprintf (stream, " 0x%04x", level3_packed[i]);
2245 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2246 fprintf (stream, ",");
2248 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2249 fprintf (stream, "\n ");
2250 fprintf (stream, " }\n");
2251 free (level3_packed);
2252 fprintf (stream, "};\n");
2254 if (ferror (stream) || fclose (stream))
2256 fprintf (stderr, "error writing to '%s'\n", filename);
2261 /* ========================================================================= */
/* See Unicode 3.0 book, section 4.7.  */
2267 /* List of mirrored character pairs. This is a subset of the characters
2268 having the BidiMirrored property. */
2269 static unsigned int mirror_pairs[][2] =
2326 get_mirror_value (unsigned int ch)
2329 unsigned int mirror_char;
2332 mirrored = (unicode_attributes[ch].name != NULL
2333 && unicode_attributes[ch].mirrored);
2334 mirror_char = 0xfffd;
2335 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2336 if (ch == mirror_pairs[i][0])
2338 mirror_char = mirror_pairs[i][1];
2341 else if (ch == mirror_pairs[i][1])
2343 mirror_char = mirror_pairs[i][0];
2347 return (int) mirror_char - (int) ch;
2350 if (mirror_char != 0xfffd)
2356 /* Construction of sparse 3-level tables. */
2357 #define TABLE mirror_table
2358 #define ELEMENT int32_t
2360 #define xmalloc malloc
2361 #define xrealloc realloc
2364 /* Output the per-character mirror table. */
2366 output_mirror (const char *filename, const char *version)
2370 struct mirror_table t;
2371 unsigned int level1_offset, level2_offset, level3_offset;
2373 stream = fopen (filename, "w");
2376 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2380 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2381 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2382 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2387 mirror_table_init (&t);
2389 for (ch = 0; ch < 0x110000; ch++)
2391 int value = get_mirror_value (ch);
2393 mirror_table_add (&t, ch, value);
2396 mirror_table_finalize (&t);
2398 /* Offsets in t.result, in memory of this process. */
2400 5 * sizeof (uint32_t);
2402 5 * sizeof (uint32_t)
2403 + t.level1_size * sizeof (uint32_t);
2405 5 * sizeof (uint32_t)
2406 + t.level1_size * sizeof (uint32_t)
2407 + (t.level2_size << t.q) * sizeof (uint32_t);
2409 for (i = 0; i < 5; i++)
2410 fprintf (stream, "#define mirror_header_%d %d\n", i,
2411 ((uint32_t *) t.result)[i]);
2412 fprintf (stream, "static const\n");
2413 fprintf (stream, "struct\n");
2414 fprintf (stream, " {\n");
2415 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2416 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2417 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2418 fprintf (stream, " }\n");
2419 fprintf (stream, "u_mirror =\n");
2420 fprintf (stream, "{\n");
2421 fprintf (stream, " {");
2422 if (t.level1_size > 8)
2423 fprintf (stream, "\n ");
2424 for (i = 0; i < t.level1_size; i++)
2427 if (i > 0 && (i % 8) == 0)
2428 fprintf (stream, "\n ");
2429 offset = ((uint32_t *) (t.result + level1_offset))[i];
2431 fprintf (stream, " %5d", -1);
2433 fprintf (stream, " %5zd",
2434 (offset - level2_offset) / sizeof (uint32_t));
2435 if (i+1 < t.level1_size)
2436 fprintf (stream, ",");
2438 if (t.level1_size > 8)
2439 fprintf (stream, "\n ");
2440 fprintf (stream, " },\n");
2441 fprintf (stream, " {");
2442 if (t.level2_size << t.q > 8)
2443 fprintf (stream, "\n ");
2444 for (i = 0; i < t.level2_size << t.q; i++)
2447 if (i > 0 && (i % 8) == 0)
2448 fprintf (stream, "\n ");
2449 offset = ((uint32_t *) (t.result + level2_offset))[i];
2451 fprintf (stream, " %5d", -1);
2453 fprintf (stream, " %5zd",
2454 (offset - level3_offset) / sizeof (int32_t));
2455 if (i+1 < t.level2_size << t.q)
2456 fprintf (stream, ",");
2458 if (t.level2_size << t.q > 8)
2459 fprintf (stream, "\n ");
2460 fprintf (stream, " },\n");
2461 fprintf (stream, " {");
2462 if (t.level3_size << t.p > 8)
2463 fprintf (stream, "\n ");
2464 for (i = 0; i < t.level3_size << t.p; i++)
2466 if (i > 0 && (i % 8) == 0)
2467 fprintf (stream, "\n ");
2468 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2469 if (i+1 < t.level3_size << t.p)
2470 fprintf (stream, ",");
2472 if (t.level3_size << t.p > 8)
2473 fprintf (stream, "\n ");
2474 fprintf (stream, " }\n");
2475 fprintf (stream, "};\n");
2477 if (ferror (stream) || fclose (stream))
2479 fprintf (stderr, "error writing to '%s'\n", filename);
2484 /* ========================================================================= */
2488 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property numbers.  Each value is used as a bit position in a 64-bit
   mask, so there must be fewer than 64 enumerators.  */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,
  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,
  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,
  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_STERM,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,
  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,
  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK
};
/* For each code point, a bit mask of the properties it has; bit N is set
   when the code point has the property numbered N.  */
unsigned long long unicode_properties[0x110000];

/* Reset unicode_properties[] so that no code point has any property.  */
static void
clear_properties (void)
{
  memset (unicode_properties, 0, sizeof (unicode_properties));
}
2549 /* Stores in unicode_properties[] the properties from the
2550 PropList.txt or DerivedCoreProperties.txt file. */
2552 fill_properties (const char *proplist_filename)
2557 stream = fopen (proplist_filename, "r");
2560 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2567 unsigned int i1, i2;
2568 char padding[200+1];
2569 char propname[200+1];
2570 unsigned int propvalue;
2572 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2575 if (buf[0] == '\0' || buf[0] == '#')
2578 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2580 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2582 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2587 #define PROP(name,value) \
2588 if (strcmp (propname, name) == 0) propvalue = value; else
2590 PROP ("White_Space", PROP_WHITE_SPACE)
2591 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2592 PROP ("Join_Control", PROP_JOIN_CONTROL)
2593 PROP ("Dash", PROP_DASH)
2594 PROP ("Hyphen", PROP_HYPHEN)
2595 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2596 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2597 PROP ("Other_Math", PROP_OTHER_MATH)
2598 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2599 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2600 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2601 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2602 PROP ("Diacritic", PROP_DIACRITIC)
2603 PROP ("Extender", PROP_EXTENDER)
2604 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2605 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2606 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2607 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2608 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2609 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2610 PROP ("Radical", PROP_RADICAL)
2611 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2612 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2613 PROP ("Deprecated", PROP_DEPRECATED)
2614 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2615 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2616 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2617 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2618 PROP ("STerm", PROP_STERM)
2619 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2620 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2621 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2622 /* DerivedCoreProperties.txt */
2623 PROP ("Math", PROP_MATH)
2624 PROP ("Alphabetic", PROP_ALPHABETIC)
2625 PROP ("Lowercase", PROP_LOWERCASE)
2626 PROP ("Uppercase", PROP_UPPERCASE)
2627 PROP ("ID_Start", PROP_ID_START)
2628 PROP ("ID_Continue", PROP_ID_CONTINUE)
2629 PROP ("XID_Start", PROP_XID_START)
2630 PROP ("XID_Continue", PROP_XID_CONTINUE)
2631 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2632 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2633 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2634 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2637 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
2641 if (!(i1 <= i2 && i2 < 0x110000))
2644 for (i = i1; i <= i2; i++)
2645 unicode_properties[i] |= 1ULL << propvalue;
2648 if (ferror (stream) || fclose (stream))
2650 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.
   ARRAY receives 1 for every code point listed and 0 for all others.
   PROPLIST_FILENAME is the file to read.  PROPERTY_NAME is a substring
   identifying the header line of the wanted section; the lines after it
   must start with "XXXX" or "XXXX..YYYY" (4 hexadecimal digits each).
   Exits with status 1 on open, parse or read errors or if no line
   contains PROPERTY_NAME; aborts if a range is reversed.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  unsigned int i;
  FILE *stream;
  char buf[100+1];

  for (i = 0; i < 0x110000; i++)
    array[i] = 0;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  do
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
    }
  while (strstr (buf, property_name) == NULL);

  for (;;)
    {
      unsigned int i1, i2;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        break;
      /* NOTE(review): a line starting with '*' is taken as the end of the
         section, following the upstream generator; confirm against the
         3.0 file layout.  */
      if (buf[0] == '*')
        break;
      if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
        {
          if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
        }
      else if (strlen (buf) >= 4)
        {
          if (sscanf (buf, "%4X", &i1) < 1)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
          /* Single code point.  */
          i2 = i1;
        }
      else
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (!(i1 <= i2 && i2 < 0x110000))
        abort ();
      for (i = i1; i <= i2; i++)
        array[i] = 1;
    }
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Both arrays below have one element per code point (0 .. 0x10FFFF) and are
   handed to fill_property30 by fill_properties30.  */
2730 /* Properties from Unicode 3.0 PropList.txt file. */
2732 /* The paired punctuation property from the PropList.txt file. */
2733 char unicode_pairedpunctuation[0x110000];
2735 /* The left of pair property from the PropList.txt file. */
2736 char unicode_leftofpair[0x110000];
/* Loads the two Unicode 3.0 properties from PROPLIST30_FILENAME: one
   fill_property30 call per property, selected by the parenthesized section
   name that is searched for in the file.  */
2739 fill_properties30 (const char *proplist30_filename)
2741 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2742 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2745 /* ------------------------------------------------------------------------- */
/* Nonzero iff the PROP_WHITE_SPACE bit is set in unicode_properties[CH].  */
2747 /* See PropList.txt, UCD.html. */
2749 is_property_white_space (unsigned int ch)
2751 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
/* Alphabetic property of CH.  A derived value (result1) is built from,
   among other terms, the PROP_OTHER_ALPHABETIC bit and the explicit code
   point ranges listed below; it is compared with the PROP_ALPHABETIC bit
   of unicode_properties[CH] (result2) as a consistency check.  */
2754 /* See Unicode 3.0 book, section 4.10,
2755 PropList.txt, UCD.html,
2756 DerivedCoreProperties.txt, UCD.html. */
2758 is_property_alphabetic (unsigned int ch)
2762 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2763 /* For some reason, the following are listed as having property
2764 Alphabetic but not as having property Other_Alphabetic. */
2765 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2766 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2767 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2768 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2769 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2770 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2771 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2772 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2773 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2774 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2775 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2777 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2779 if (result1 != result2)
/* Nonzero iff the PROP_OTHER_ALPHABETIC bit is set in
   unicode_properties[CH].  */
2784 /* See PropList.txt, UCD.html. */
2786 is_property_other_alphabetic (unsigned int ch)
2788 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
/* Nonzero iff the PROP_NONCHARACTER_CODE_POINT bit is set in
   unicode_properties[CH].  */
2791 /* See PropList.txt, UCD.html. */
2793 is_property_not_a_character (unsigned int ch)
2795 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
/* Default_Ignorable_Code_Point property of CH.  The derived value combines
   category Cf (minus the annotation characters U+FFF9..U+FFFB), categories
   Cc and Cs (minus white space), Other_Default_Ignorable_Code_Point,
   Variation_Selector and the noncharacters.  It is compared with the
   PROP_DEFAULT_IGNORABLE_CODE_POINT bit (result2) as a consistency check.  */
2798 /* See PropList.txt, UCD.html,
2799 DerivedCoreProperties.txt, UCD.html. */
2801 is_property_default_ignorable_code_point (unsigned int ch)
2804 (is_category_Cf (ch)
2805 && !(ch >= 0xFFF9 && ch <= 0xFFFB)) /* Annotations */
2806 || ((is_category_Cc (ch) || is_category_Cs (ch))
2807 && !is_property_white_space (ch))
2808 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2809 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0)
2810 || is_property_not_a_character (ch);
2812 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2814 if (result1 != result2)
/* Nonzero iff the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit is set in
   unicode_properties[CH].  */
2819 /* See PropList.txt, UCD.html. */
2821 is_property_other_default_ignorable_code_point (unsigned int ch)
2823 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* Nonzero iff the PROP_DEPRECATED bit is set in unicode_properties[CH].  */
2826 /* See PropList.txt, UCD.html. */
2828 is_property_deprecated (unsigned int ch)
2830 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
/* Nonzero iff the PROP_LOGICAL_ORDER_EXCEPTION bit is set in
   unicode_properties[CH].  */
2833 /* See PropList.txt, UCD.html. */
2835 is_property_logical_order_exception (unsigned int ch)
2837 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
/* Nonzero iff the PROP_VARIATION_SELECTOR bit is set in
   unicode_properties[CH].  */
2840 /* See PropList.txt, UCD.html. */
2842 is_property_variation_selector (unsigned int ch)
2844 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Nonzero iff CH lies in one of three hard-coded private use ranges:
   U+E000..U+F8FF, U+F0000..U+FFFFD or U+100000..U+10FFFD.  No table lookup
   is involved.  */
2847 /* See PropList-3.0.1.txt. */
2849 is_property_private_use (unsigned int ch)
2851 /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */
2852 return (ch >= 0xE000 && ch <= 0xF8FF)
2853 || (ch >= 0xF0000 && ch <= 0xFFFFD)
2854 || (ch >= 0x100000 && ch <= 0x10FFFD);
/* Nonzero iff CH has general category Cn and is not a noncharacter
   (see is_property_not_a_character).  */
2857 /* See PropList-3.0.1.txt. */
2859 is_property_unassigned_code_value (unsigned int ch)
2861 return (is_category_Cn (ch) && !is_property_not_a_character (ch));
/* Uppercase property of CH.  A derived value (result1), one of whose terms
   is the PROP_OTHER_UPPERCASE bit, is compared with the PROP_UPPERCASE bit
   of unicode_properties[CH] (result2) as a consistency check.  */
2864 /* See PropList.txt, UCD.html,
2865 DerivedCoreProperties.txt, UCD.html. */
2867 is_property_uppercase (unsigned int ch)
2871 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2873 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2875 if (result1 != result2)
/* Nonzero iff the PROP_OTHER_UPPERCASE bit is set in
   unicode_properties[CH].  */
2880 /* See PropList.txt, UCD.html. */
2882 is_property_other_uppercase (unsigned int ch)
2884 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* Lowercase property of CH.  A derived value (result1), one of whose terms
   is the PROP_OTHER_LOWERCASE bit, is compared with the PROP_LOWERCASE bit
   of unicode_properties[CH] (result2) as a consistency check.  */
2887 /* See PropList.txt, UCD.html,
2888 DerivedCoreProperties.txt, UCD.html. */
2890 is_property_lowercase (unsigned int ch)
2894 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2896 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2898 if (result1 != result2)
/* Nonzero iff the PROP_OTHER_LOWERCASE bit is set in
   unicode_properties[CH].  */
2903 /* See PropList.txt, UCD.html. */
2905 is_property_other_lowercase (unsigned int ch)
2907 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Titlecase is defined purely by general category: delegates to
   is_category_Lt (CH).  */
2910 /* See PropList-3.0.1.txt. */
2912 is_property_titlecase (unsigned int ch)
2914 return is_category_Lt (ch);
/* Nonzero iff the PROP_SOFT_DOTTED bit is set in unicode_properties[CH].  */
2917 /* See PropList.txt, UCD.html. */
2919 is_property_soft_dotted (unsigned int ch)
2921 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
/* Nonzero iff the PROP_ID_START bit is set in unicode_properties[CH].  */
2924 /* See DerivedCoreProperties.txt, UCD.html. */
2926 is_property_id_start (unsigned int ch)
2928 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
/* Nonzero iff the PROP_OTHER_ID_START bit is set in
   unicode_properties[CH].  */
2931 /* See PropList.txt, UCD.html. */
2933 is_property_other_id_start (unsigned int ch)
2935 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
/* Nonzero iff the PROP_ID_CONTINUE bit is set in unicode_properties[CH].  */
2938 /* See DerivedCoreProperties.txt, UCD.html. */
2940 is_property_id_continue (unsigned int ch)
2942 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
/* Nonzero iff the PROP_OTHER_ID_CONTINUE bit is set in
   unicode_properties[CH].  */
2945 /* See PropList.txt, UCD.html. */
2947 is_property_other_id_continue (unsigned int ch)
2949 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
/* Nonzero iff the PROP_XID_START bit is set in unicode_properties[CH].  */
2952 /* See DerivedCoreProperties.txt, UCD.html. */
2954 is_property_xid_start (unsigned int ch)
2956 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
/* Nonzero iff the PROP_XID_CONTINUE bit is set in unicode_properties[CH].  */
2959 /* See DerivedCoreProperties.txt, UCD.html. */
2961 is_property_xid_continue (unsigned int ch)
2963 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
/* Nonzero iff the PROP_PATTERN_WHITE_SPACE bit is set in
   unicode_properties[CH].  */
2966 /* See PropList.txt, UCD.html. */
2968 is_property_pattern_white_space (unsigned int ch)
2970 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
/* Nonzero iff the PROP_PATTERN_SYNTAX bit is set in
   unicode_properties[CH].  */
2973 /* See PropList.txt, UCD.html. */
2975 is_property_pattern_syntax (unsigned int ch)
2977 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
/* Nonzero iff the PROP_JOIN_CONTROL bit is set in unicode_properties[CH].  */
2980 /* See PropList.txt, UCD.html. */
2982 is_property_join_control (unsigned int ch)
2984 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
/* Nonzero iff the PROP_GRAPHEME_BASE bit is set in
   unicode_properties[CH].  */
2987 /* See DerivedCoreProperties.txt, UCD.html. */
2989 is_property_grapheme_base (unsigned int ch)
2991 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
/* Nonzero iff the PROP_GRAPHEME_EXTEND bit is set in
   unicode_properties[CH].  */
2994 /* See DerivedCoreProperties.txt, UCD.html. */
2996 is_property_grapheme_extend (unsigned int ch)
2998 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
/* Nonzero iff the PROP_OTHER_GRAPHEME_EXTEND bit is set in
   unicode_properties[CH].  */
3001 /* See PropList.txt, UCD.html. */
3003 is_property_other_grapheme_extend (unsigned int ch)
3005 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
/* Nonzero iff the PROP_GRAPHEME_LINK bit is set in
   unicode_properties[CH].  */
3008 /* See DerivedCoreProperties.txt, UCD.html. */
3010 is_property_grapheme_link (unsigned int ch)
3012 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
/* Nonzero iff the PROP_BIDI_CONTROL bit is set in unicode_properties[CH].  */
3015 /* See PropList.txt, UCD.html. */
3017 is_property_bidi_control (unsigned int ch)
3019 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_L.  */
3022 /* See PropList-3.0.1.txt. */
3024 is_property_bidi_left_to_right (unsigned int ch)
3026 return (get_bidi_category (ch) == UC_BIDI_L);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_R.  */
3029 /* See PropList-3.0.1.txt. */
3031 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3033 return (get_bidi_category (ch) == UC_BIDI_R);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_AL.  */
3036 /* See PropList-3.0.1.txt. */
3038 is_property_bidi_arabic_right_to_left (unsigned int ch)
3040 return (get_bidi_category (ch) == UC_BIDI_AL);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_EN.  */
3043 /* See PropList-3.0.1.txt. */
3045 is_property_bidi_european_digit (unsigned int ch)
3047 return (get_bidi_category (ch) == UC_BIDI_EN);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_ES.  */
3050 /* See PropList-3.0.1.txt. */
3052 is_property_bidi_eur_num_separator (unsigned int ch)
3054 return (get_bidi_category (ch) == UC_BIDI_ES);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_ET.  */
3057 /* See PropList-3.0.1.txt. */
3059 is_property_bidi_eur_num_terminator (unsigned int ch)
3061 return (get_bidi_category (ch) == UC_BIDI_ET);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_AN.  */
3064 /* See PropList-3.0.1.txt. */
3066 is_property_bidi_arabic_digit (unsigned int ch)
3068 return (get_bidi_category (ch) == UC_BIDI_AN);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_CS.  */
3071 /* See PropList-3.0.1.txt. */
3073 is_property_bidi_common_separator (unsigned int ch)
3075 return (get_bidi_category (ch) == UC_BIDI_CS);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_B.  */
3078 /* See PropList-3.0.1.txt. */
3080 is_property_bidi_block_separator (unsigned int ch)
3082 return (get_bidi_category (ch) == UC_BIDI_B);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_S.  */
3085 /* See PropList-3.0.1.txt. */
3087 is_property_bidi_segment_separator (unsigned int ch)
3089 return (get_bidi_category (ch) == UC_BIDI_S);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_WS.  */
3092 /* See PropList-3.0.1.txt. */
3094 is_property_bidi_whitespace (unsigned int ch)
3096 return (get_bidi_category (ch) == UC_BIDI_WS);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_NSM.  */
3099 /* See PropList-3.0.1.txt. */
3101 is_property_bidi_non_spacing_mark (unsigned int ch)
3103 return (get_bidi_category (ch) == UC_BIDI_NSM);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_BN.  */
3106 /* See PropList-3.0.1.txt. */
3108 is_property_bidi_boundary_neutral (unsigned int ch)
3110 return (get_bidi_category (ch) == UC_BIDI_BN);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_PDF.  */
3113 /* See PropList-3.0.1.txt. */
3115 is_property_bidi_pdf (unsigned int ch)
3117 return (get_bidi_category (ch) == UC_BIDI_PDF);
/* Nonzero iff the bidi category of CH is one of the four embedding or
   override categories: LRE, LRO, RLE or RLO.  The category is looked up
   once and compared against each of them.  */
3120 /* See PropList-3.0.1.txt. */
3122 is_property_bidi_embedding_or_override (unsigned int ch)
3124 int category = get_bidi_category (ch);
3125 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3126 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
/* Nonzero iff get_bidi_category (CH) is UC_BIDI_ON.  */
3129 /* See PropList-3.0.1.txt. */
3131 is_property_bidi_other_neutral (unsigned int ch)
3133 return (get_bidi_category (ch) == UC_BIDI_ON);
/* Nonzero iff the PROP_HEX_DIGIT bit is set in unicode_properties[CH].  */
3136 /* See PropList.txt, UCD.html. */
3138 is_property_hex_digit (unsigned int ch)
3140 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
/* Nonzero iff the PROP_ASCII_HEX_DIGIT bit is set in
   unicode_properties[CH].  */
3143 /* See PropList.txt, UCD.html. */
3145 is_property_ascii_hex_digit (unsigned int ch)
3147 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
/* Nonzero iff the PROP_IDEOGRAPHIC bit is set in unicode_properties[CH].  */
3150 /* See Unicode 3.0 book, section 4.10,
3151 PropList.txt, UCD.html. */
3153 is_property_ideographic (unsigned int ch)
3155 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
/* Nonzero iff the PROP_UNIFIED_IDEOGRAPH bit is set in
   unicode_properties[CH].  */
3158 /* See PropList.txt, UCD.html. */
3160 is_property_unified_ideograph (unsigned int ch)
3162 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
/* Nonzero iff the PROP_RADICAL bit is set in unicode_properties[CH].  */
3165 /* See PropList.txt, UCD.html. */
3167 is_property_radical (unsigned int ch)
3169 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
/* Nonzero iff the PROP_IDS_BINARY_OPERATOR bit is set in
   unicode_properties[CH].  */
3172 /* See PropList.txt, UCD.html. */
3174 is_property_ids_binary_operator (unsigned int ch)
3176 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
/* Nonzero iff the PROP_IDS_TRINARY_OPERATOR bit is set in
   unicode_properties[CH].  */
3179 /* See PropList.txt, UCD.html. */
3181 is_property_ids_trinary_operator (unsigned int ch)
3183 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
/* Nonzero iff CH has category Cf, or CH has a name in unicode_attributes[]
   and that name contains the substring "ZERO WIDTH".  */
3186 /* See PropList-3.0.1.txt. */
3188 is_property_zero_width (unsigned int ch)
3190 return is_category_Cf (ch)
3191 || (unicode_attributes[ch].name != NULL
3192 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Space is defined purely by general category: delegates to
   is_category_Zs (CH).  */
3195 /* See PropList-3.0.1.txt. */
3197 is_property_space (unsigned int ch)
3199 return is_category_Zs (ch);
/* Nonzero iff CH is one of the sixteen individually listed code points
   below (no-break spaces, double combining marks, Tibetan marks, etc.).
   The list is hard-coded; no table lookup is involved.  */
3202 /* See PropList-3.0.1.txt. */
3204 is_property_non_break (unsigned int ch)
3206 /* This is exactly the set of characters having line breaking
3208 return (ch == 0x00A0 /* NO-BREAK SPACE */
3209 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
3210 || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
3211 || ch == 0x035D /* COMBINING DOUBLE BREVE */
3212 || ch == 0x035E /* COMBINING DOUBLE MACRON */
3213 || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
3214 || ch == 0x0360 /* COMBINING DOUBLE TILDE */
3215 || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
3216 || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
3217 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
3218 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
3219 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
3220 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
3221 || ch == 0x2007 /* FIGURE SPACE */
3222 || ch == 0x2011 /* NON-BREAKING HYPHEN */
3223 || ch == 0x202F /* NARROW NO-BREAK SPACE */);
/* ISO control property of CH, computed two ways that are then compared:
   the character name in unicode_attributes[] equals "<control>", and
   is_category_Cc (CH).  */
3226 /* See PropList-3.0.1.txt. */
3228 is_property_iso_control (unsigned int ch)
3231 (unicode_attributes[ch].name != NULL
3232 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3234 is_category_Cc (ch);
3236 if (result1 != result2)
/* Format control property of CH.  The condition requires, among its terms,
   category Cf, bidi category BN, and that CH is not a join control.  */
3241 /* See PropList-3.0.1.txt. */
3243 is_property_format_control (unsigned int ch)
3245 return (is_category_Cf (ch)
3246 && get_bidi_category (ch) == UC_BIDI_BN
3247 && !is_property_join_control (ch)
/* Nonzero iff the PROP_DASH bit is set in unicode_properties[CH].  */
3251 /* See PropList.txt, UCD.html. */
3253 is_property_dash (unsigned int ch)
3255 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
/* Nonzero iff the PROP_HYPHEN bit is set in unicode_properties[CH].  */
3258 /* See PropList.txt, UCD.html. */
3260 is_property_hyphen (unsigned int ch)
3262 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Punctuation is defined purely by general category: delegates to
   is_category_P (CH).  */
3265 /* See PropList-3.0.1.txt. */
3267 is_property_punctuation (unsigned int ch)
3269 return is_category_P (ch);
/* Line separator is defined purely by general category: delegates to
   is_category_Zl (CH).  */
3272 /* See PropList-3.0.1.txt. */
3274 is_property_line_separator (unsigned int ch)
3276 return is_category_Zl (ch);
/* Paragraph separator is defined purely by general category: delegates to
   is_category_Zp (CH).  */
3279 /* See PropList-3.0.1.txt. */
3281 is_property_paragraph_separator (unsigned int ch)
3283 return is_category_Zp (ch);
/* Nonzero iff the PROP_QUOTATION_MARK bit is set in
   unicode_properties[CH].  */
3286 /* See PropList.txt, UCD.html. */
3288 is_property_quotation_mark (unsigned int ch)
3290 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
/* Nonzero iff the PROP_STERM bit is set in unicode_properties[CH].  The
   bit corresponds to the property spelled "STerm" in the property file.  */
3293 /* See PropList.txt, UCD.html. */
3295 is_property_sentence_terminal (unsigned int ch)
3297 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
/* Nonzero iff the PROP_TERMINAL_PUNCTUATION bit is set in
   unicode_properties[CH].  */
3300 /* See PropList.txt, UCD.html. */
3302 is_property_terminal_punctuation (unsigned int ch)
3304 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Currency symbol is defined purely by general category: delegates to
   is_category_Sc (CH).  */
3307 /* See PropList-3.0.1.txt. */
3309 is_property_currency_symbol (unsigned int ch)
3311 return is_category_Sc (ch);
/* Math property of CH.  A derived value (result1), one of whose terms is
   the PROP_OTHER_MATH bit, is compared with the PROP_MATH bit of
   unicode_properties[CH] (result2) as a consistency check.  */
3314 /* See Unicode 3.0 book, section 4.9,
3315 PropList.txt, UCD.html,
3316 DerivedCoreProperties.txt, UCD.html. */
3318 is_property_math (unsigned int ch)
3322 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3324 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3326 if (result1 != result2)
/* Nonzero iff the PROP_OTHER_MATH bit is set in unicode_properties[CH].  */
3331 /* See PropList.txt, UCD.html. */
3333 is_property_other_math (unsigned int ch)
3335 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
/* Returns the entry for CH from the unicode_pairedpunctuation[] array,
   which fill_properties30 hands to fill_property30.  */
3338 /* See PropList-3.0.1.txt. */
3340 is_property_paired_punctuation (unsigned int ch)
3342 return unicode_pairedpunctuation[ch];
/* Returns the entry for CH from the unicode_leftofpair[] array, which
   fill_properties30 hands to fill_property30.  */
3345 /* See PropList-3.0.1.txt. */
3347 is_property_left_of_pair (unsigned int ch)
3349 return unicode_leftofpair[ch];
/* Nonzero iff CH has a name in unicode_attributes[] and either its
   canonical combining class string differs from "0" or its general
   category is Mc, Me or Mn.  */
3352 /* See PropList-3.0.1.txt. */
3354 is_property_combining (unsigned int ch)
3356 return (unicode_attributes[ch].name != NULL
3357 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3358 || is_category_Mc (ch)
3359 || is_category_Me (ch)
3360 || is_category_Mn (ch)));
/* Compiled out by the #if 0 below.  The disabled definition requires a
   name in unicode_attributes[] and bidi category NSM.  */
3363 #if 0 /* same as is_property_bidi_non_spacing_mark */
3364 /* See PropList-3.0.1.txt. */
3366 is_property_non_spacing (unsigned int ch)
3368 return (unicode_attributes[ch].name != NULL
3369 && get_bidi_category (ch) == UC_BIDI_NSM);
/* Composite property of CH.  The precomposed Hangul syllables are
   special-cased by code point range.  For any other character that has a
   name and a decomposition mapping, a leading "<tag>" in the mapping is
   skipped, and the character counts as composite when the remaining mapping
   has more than one element and does not start with U+0020.  */
3373 /* See PropList-3.0.1.txt. */
3375 is_property_composite (unsigned int ch)
3377 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3378 logical in some sense. */
/* The last precomposed Hangul syllable is U+D7A3; U+D7A4 is unassigned and
   must not be included in the range.  */
3379 if (ch >= 0xAC00 && ch <= 0xD7A3) /* Hangul Syllables */
3381 if (unicode_attributes[ch].name != NULL
3382 && unicode_attributes[ch].decomposition != NULL)
3384 /* Test whether the decomposition contains more than one character,
3385 and the first is not a space. */
3386 const char *decomp = unicode_attributes[ch].decomposition;
3387 if (decomp[0] == '<')
/* NOTE(review): assumes every "<tag" has a closing '>'; strchr would
   return NULL otherwise -- confirm against the input data.  */
3389 decomp = strchr (decomp, '>') + 1;
3390 if (decomp[0] == ' ')
3393 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
/* Decimal digit is defined purely by general category: delegates to
   is_category_Nd (CH).  */
3398 /* See PropList-3.0.1.txt. */
3400 is_property_decimal_digit (unsigned int ch)
3402 return is_category_Nd (ch);
/* Nonzero iff get_numeric_value (CH) has a positive denominator, or CH is
   one of the two explicitly listed code points U+09F8 and U+2183.  */
3405 /* See PropList-3.0.1.txt. */
3407 is_property_numeric (unsigned int ch)
3409 return ((get_numeric_value (ch)).denominator > 0)
3410 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3411 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
/* Nonzero iff the PROP_DIACRITIC bit is set in unicode_properties[CH].  */
3414 /* See PropList.txt, UCD.html. */
3416 is_property_diacritic (unsigned int ch)
3418 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
/* Nonzero iff the PROP_EXTENDER bit is set in unicode_properties[CH].  */
3421 /* See PropList.txt, UCD.html. */
3423 is_property_extender (unsigned int ch)
3425 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
/* Ignorable control property of CH.  The condition starts from: category
   Cc with bidi category BN, or category Cf.  */
3428 /* See PropList-3.0.1.txt. */
3430 is_property_ignorable_control (unsigned int ch)
3432 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3433 || is_category_Cf (ch))
3437 /* ------------------------------------------------------------------------- */
/* For every property P listed below, the PROPERTY macro makes three calls
   driven by the predicate is_property_P:
     - debug_output_predicate  with file name "pr_P.txt",
     - output_predicate_test   with file name "test-pr_P.c" and the test
       expression "uc_is_property_P (c)",
     - output_predicate        with file name "pr_P.h", table name
       "u_property_P", the comment "Properties" and VERSION.  */
3439 /* Output all properties. */
3441 output_properties (const char *version)
3443 #define PROPERTY(P) \
3444 debug_output_predicate ("pr_" #P ".txt", is_property_ ## P); \
3445 output_predicate_test ("test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3446 output_predicate ("pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3447 PROPERTY(white_space)
3448 PROPERTY(alphabetic)
3449 PROPERTY(other_alphabetic)
3450 PROPERTY(not_a_character)
3451 PROPERTY(default_ignorable_code_point)
3452 PROPERTY(other_default_ignorable_code_point)
3453 PROPERTY(deprecated)
3454 PROPERTY(logical_order_exception)
3455 PROPERTY(variation_selector)
3456 PROPERTY(private_use)
3457 PROPERTY(unassigned_code_value)
3459 PROPERTY(other_uppercase)
3461 PROPERTY(other_lowercase)
3463 PROPERTY(soft_dotted)
3465 PROPERTY(other_id_start)
3466 PROPERTY(id_continue)
3467 PROPERTY(other_id_continue)
3469 PROPERTY(xid_continue)
3470 PROPERTY(pattern_white_space)
3471 PROPERTY(pattern_syntax)
3472 PROPERTY(join_control)
3473 PROPERTY(grapheme_base)
3474 PROPERTY(grapheme_extend)
3475 PROPERTY(other_grapheme_extend)
3476 PROPERTY(grapheme_link)
3477 PROPERTY(bidi_control)
3478 PROPERTY(bidi_left_to_right)
3479 PROPERTY(bidi_hebrew_right_to_left)
3480 PROPERTY(bidi_arabic_right_to_left)
3481 PROPERTY(bidi_european_digit)
3482 PROPERTY(bidi_eur_num_separator)
3483 PROPERTY(bidi_eur_num_terminator)
3484 PROPERTY(bidi_arabic_digit)
3485 PROPERTY(bidi_common_separator)
3486 PROPERTY(bidi_block_separator)
3487 PROPERTY(bidi_segment_separator)
3488 PROPERTY(bidi_whitespace)
3489 PROPERTY(bidi_non_spacing_mark)
3490 PROPERTY(bidi_boundary_neutral)
3492 PROPERTY(bidi_embedding_or_override)
3493 PROPERTY(bidi_other_neutral)
3495 PROPERTY(ascii_hex_digit)
3496 PROPERTY(ideographic)
3497 PROPERTY(unified_ideograph)
3499 PROPERTY(ids_binary_operator)
3500 PROPERTY(ids_trinary_operator)
3501 PROPERTY(zero_width)
3504 PROPERTY(iso_control)
3505 PROPERTY(format_control)
3508 PROPERTY(punctuation)
3509 PROPERTY(line_separator)
3510 PROPERTY(paragraph_separator)
3511 PROPERTY(quotation_mark)
3512 PROPERTY(sentence_terminal)
3513 PROPERTY(terminal_punctuation)
3514 PROPERTY(currency_symbol)
3516 PROPERTY(other_math)
3517 PROPERTY(paired_punctuation)
3518 PROPERTY(left_of_pair)
3521 PROPERTY(decimal_digit)
3525 PROPERTY(ignorable_control)
3529 /* ========================================================================= */
/* Script names registered by fill_scripts; a script's index in this array
   is the value stored per code point in unicode_scripts[].  */
3533 static const char *scripts[256];
/* Number of entries of scripts[] in use.  */
3534 static unsigned int numscripts;
/* Script index of each code point (0 .. 0x10FFFF).  fill_scripts presets
   every element to (uint8_t)~0, i.e. 0xFF, which output_scripts treats as
   "no script".  */
3536 static uint8_t unicode_scripts[0x110000];
/* Reads SCRIPTS_FILENAME and fills unicode_scripts[] with the script index
   of every code point listed there; script names are collected in
   scripts[].  Lines are read with a limit of 200 characters; empty lines
   and lines starting with '#' are recognized separately.  Problems are
   reported on stderr.  */
3539 fill_scripts (const char *scripts_filename)
3544 stream = fopen (scripts_filename, "r");
3547 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Preset every code point to 0xFF, the "no script" marker.  */
3553 for (i = 0; i < 0x110000; i++)
3554 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3559 unsigned int i1, i2;
3560 char padding[200+1];
3561 char scriptname[200+1];
3564 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3567 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form "XXXX..YYYY ; Name" first, then the single code point
   form "XXXX ; Name".  The name ends at the first space.  */
3570 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3572 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3574 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the scripts registered so far, newest first.
   NOTE(review): numscripts is unsigned, so numscripts - 1 wraps around when
   no script is registered yet; the loop relies on 'script' being a signed
   type -- confirm its declaration.  */
3584 for (script = numscripts - 1; script >= 0; script--)
3585 if (strcmp (scripts[script], scriptname) == 0)
/* NOTE(review): the strdup result is stored without a NULL check.  */
3589 scripts[numscripts] = strdup (scriptname);
3590 script = numscripts;
/* Script indices are stored in a uint8_t with 0xFF reserved as "no script",
   and scripts[] has 256 slots, hence the limit checked here.  */
3592 if (numscripts == 256)
3596 for (i = i1; i <= i2; i++)
/* A code point that already carries a script index is reported, then
   overwritten with the new one.  */
3598 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3599 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3600 unicode_scripts[i] = script;
3604 if (ferror (stream) || fclose (stream))
3606 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
/* Parameters for the table type used by output_scripts (struct
   script_table with script_table_init / _add / _finalize): the type name,
   the element type, and the value that stands for "no entry" (0xFF, the
   same marker fill_scripts uses).
   NOTE(review): xmalloc and xrealloc are mapped to plain malloc and
   realloc, which return NULL on failure instead of aborting -- confirm that
   the table code copes with that.  */
3611 /* Construction of sparse 3-level tables. */
3612 #define TABLE script_table
3613 #define ELEMENT uint8_t
3614 #define DEFAULT (uint8_t)~(uint8_t)0
3615 #define xmalloc malloc
3616 #define xrealloc realloc
/* Writes the generated header "scripts.h" for Unicode VERSION.  The file
   contains, in this order:
     - one uc_interval_t array per script, listing the code point intervals
       that belong to it,
     - the scripts[] array of uc_script_t entries (interval count, interval
       array, script name),
     - five script_header_N macros taken from the start of the table data,
     - the u_script structure holding the three levels of the lookup table
       built from unicode_scripts[].
   Counts and sizes are unsigned, so they are printed with unsigned
   conversion specifiers (%u, %zu).  Errors are reported on stderr.  */
3620 output_scripts (const char *version)
3622 const char *filename = "scripts.h";
3624 unsigned int ch, s, i;
3625 struct script_table t;
3626 unsigned int level1_offset, level2_offset, level3_offset;
3630 const char *lowercase_name;
3633 scriptinfo_t scriptinfo[256];
3635 stream = fopen (filename, "w");
3638 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3642 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3643 fprintf (stream, "/* Unicode scripts. */\n");
3644 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Derive from each script name a copy used in the generated C identifiers
   (stored as lowercase_name).
   NOTE(review): the strdup result is used without a NULL check.  */
3647 for (s = 0; s < numscripts; s++)
3649 char *lcp = strdup (scripts[s]);
3652 for (cp = lcp; *cp != '\0'; cp++)
3653 if (*cp >= 'A' && *cp <= 'Z')
3656 scriptinfo[s].lowercase_name = lcp;
/* Emit the interval array of each script: maximal runs of consecutive code
   points whose unicode_scripts[] entry equals the script index.  */
3659 for (s = 0; s < numscripts; s++)
3661 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3662 scriptinfo[s].lowercase_name);
3663 fprintf (stream, "{\n");
3665 for (ch = 0; ch < 0x110000; ch++)
3666 if (unicode_scripts[ch] == s)
3672 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3677 fprintf (stream, ",\n");
3679 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3681 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3685 fprintf (stream, "\n");
3686 fprintf (stream, "};\n");
/* numscripts is an unsigned int, hence %u.  */
3689 fprintf (stream, "static const uc_script_t scripts[%u] =\n", numscripts);
3690 fprintf (stream, "{\n");
3691 for (s = 0; s < numscripts; s++)
3693 fprintf (stream, " {\n");
3694 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3695 scriptinfo[s].lowercase_name);
3696 fprintf (stream, " script_%s_intervals,\n",
3697 scriptinfo[s].lowercase_name);
3698 fprintf (stream, " \"%s\"\n", scripts[s]);
3699 fprintf (stream, " }");
3700 if (s+1 < numscripts)
3701 fprintf (stream, ",");
3702 fprintf (stream, "\n");
3704 fprintf (stream, "};\n");
/* Build the lookup table from every code point that has a script.  */
3708 script_table_init (&t);
3710 for (ch = 0; ch < 0x110000; ch++)
3712 unsigned int s = unicode_scripts[ch];
3713 if (s != (uint8_t)~(uint8_t)0)
3714 script_table_add (&t, ch, s);
3717 script_table_finalize (&t);
3719 /* Offsets in t.result, in memory of this process. */
3721 5 * sizeof (uint32_t);
3723 5 * sizeof (uint32_t)
3724 + t.level1_size * sizeof (uint32_t);
3726 5 * sizeof (uint32_t)
3727 + t.level1_size * sizeof (uint32_t)
3728 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five 32-bit words of t.result become the header macros; both
   the index and the word are unsigned, hence %u.  */
3730 for (i = 0; i < 5; i++)
3731 fprintf (stream, "#define script_header_%u %u\n", i,
3732 ((uint32_t *) t.result)[i]);
3733 fprintf (stream, "static const\n");
3734 fprintf (stream, "struct\n");
3735 fprintf (stream, " {\n");
3736 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3737 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3738 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3739 fprintf (stream, " }\n");
3740 fprintf (stream, "u_script =\n");
3741 fprintf (stream, "{\n");
/* Level 1, eight entries per output line.  Offsets are rebased to the
   start of level 2 and expressed in 32-bit words.  */
3742 fprintf (stream, " {");
3743 if (t.level1_size > 8)
3744 fprintf (stream, "\n ");
3745 for (i = 0; i < t.level1_size; i++)
3748 if (i > 0 && (i % 8) == 0)
3749 fprintf (stream, "\n ");
3750 offset = ((uint32_t *) (t.result + level1_offset))[i];
3752 fprintf (stream, " %5d", -1);
/* Dividing by sizeof yields a size_t, hence %zu.  */
3754 fprintf (stream, " %5zu",
3755 (offset - level2_offset) / sizeof (uint32_t));
3756 if (i+1 < t.level1_size)
3757 fprintf (stream, ",");
3759 if (t.level1_size > 8)
3760 fprintf (stream, "\n ");
3761 fprintf (stream, " },\n");
/* Level 2, same layout.  Offsets are rebased to the start of level 3 and
   expressed in bytes.  */
3762 fprintf (stream, " {");
3763 if (t.level2_size << t.q > 8)
3764 fprintf (stream, "\n ");
3765 for (i = 0; i < t.level2_size << t.q; i++)
3768 if (i > 0 && (i % 8) == 0)
3769 fprintf (stream, "\n ");
3770 offset = ((uint32_t *) (t.result + level2_offset))[i];
3772 fprintf (stream, " %5d", -1);
3774 fprintf (stream, " %5zu",
3775 (offset - level3_offset) / sizeof (uint8_t));
3776 if (i+1 < t.level2_size << t.q)
3777 fprintf (stream, ",");
3779 if (t.level2_size << t.q > 8)
3780 fprintf (stream, "\n ");
3781 fprintf (stream, " },\n");
/* Level 3: the script indices themselves, one byte each.  */
3782 fprintf (stream, " {");
3783 if (t.level3_size << t.p > 8)
3784 fprintf (stream, "\n ");
3785 for (i = 0; i < t.level3_size << t.p; i++)
3787 if (i > 0 && (i % 8) == 0)
3788 fprintf (stream, "\n ");
3789 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3790 if (i+1 < t.level3_size << t.p)
3791 fprintf (stream, ",");
3793 if (t.level3_size << t.p > 8)
3794 fprintf (stream, "\n ");
3795 fprintf (stream, " }\n");
3796 fprintf (stream, "};\n");
3798 if (ferror (stream) || fclose (stream))
3800 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes "scripts_byname.gperf", a gperf input file for Unicode VERSION.
   The declarations section defines struct named_script and sets the gperf
   options (hash function scripts_hash, lookup function uc_script_lookup,
   word array script_names).  After the "%%" separator follows one
   "name, index" line per entry of scripts[].  Errors are reported on
   stderr.  */
3806 output_scripts_byname (const char *version)
3808 const char *filename = "scripts_byname.gperf";
3812 stream = fopen (filename, "w");
3815 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3819 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3820 fprintf (stream, "/* Unicode scripts. */\n");
3821 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3823 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
/* In the format strings below, "%%" produces a single '%' in the output.  */
3824 fprintf (stream, "%%struct-type\n");
3825 fprintf (stream, "%%language=ANSI-C\n");
3826 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3827 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3828 fprintf (stream, "%%readonly-tables\n");
3829 fprintf (stream, "%%global-table\n");
3830 fprintf (stream, "%%define word-array-name script_names\n");
3831 fprintf (stream, "%%%%\n");
3832 for (s = 0; s < numscripts; s++)
3833 fprintf (stream, "%s, %u\n", scripts[s], s);
3835 if (ferror (stream) || fclose (stream))
3837 fprintf (stderr, "error writing to '%s'\n", filename);
3842 /* ========================================================================= */
/* One block: its first code point, its last code point, and its name.  */
3846 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* The blocks read by fill_blocks, in file order; fill_blocks checks that
   each block starts after the end of the previous one.  */
3848 static block_t blocks[256];
/* Number of entries of blocks[] in use.  */
3849 static unsigned int numblocks;
/* Reads BLOCKS_FILENAME and appends one entry to blocks[] per line of the
   form "XXXX..YYYY; Name".  Lines are read with a limit of 200 characters;
   empty lines and lines starting with '#' are recognized separately.
   Problems are reported on stderr.  */
3852 fill_blocks (const char *blocks_filename)
3856 stream = fopen (blocks_filename, "r");
3859 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3866 unsigned int i1, i2;
3867 char padding[200+1];
3868 char blockname[200+1];
3870 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3873 if (buf[0] == '\0' || buf[0] == '#')
/* The name is everything after the " ;" padding, up to a carriage return
   or the end of the line; unlike script names it may contain spaces.  */
3876 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3878 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3881 blocks[numblocks].start = i1;
3882 blocks[numblocks].end = i2;
/* NOTE(review): the strdup result is stored without a NULL check.  */
3883 blocks[numblocks].name = strdup (blockname);
3884 /* It must be sorted. */
3885 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
/* blocks[] has 256 slots, hence the limit checked here.  */
3888 if (numblocks == 256)
3892 if (ferror (stream) || fclose (stream))
3894 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
/* Searches blocks[0 .. numblocks-1], which fill_blocks keeps sorted, by
   repeatedly comparing CH with the end of the middle block of the current
   [lo, hi) interval.  */
3899 /* Return the smallest block index among the blocks for characters >= ch. */
3901 block_first_index (unsigned int ch)
3903 /* Binary search. */
3904 unsigned int lo = 0;
3905 unsigned int hi = numblocks;
3907 All blocks[i], i < lo, have blocks[i].end < ch,
3908 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3911 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3912 if (blocks[mid].end < ch)
/* Searches blocks[0 .. numblocks-1], which fill_blocks keeps sorted, by
   repeatedly comparing CH with the start of the middle block of the current
   [lo, hi) interval.  */
3920 /* Return the largest block index among the blocks for characters <= ch,
3923 block_last_index (unsigned int ch)
3925 /* Binary search. */
3926 unsigned int lo = 0;
3927 unsigned int hi = numblocks;
3929 All blocks[i], i < lo, have blocks[i].start <= ch,
3930 all blocks[i], i >= hi, have blocks[i].start > ch. */
3933 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3934 if (blocks[mid].start <= ch)
/* Writes the generated header "blocks.h" for Unicode VERSION.  The file
   contains:
     - the blocks[] array (start, end, name of every block),
     - blocks_level1[], holding for each group of 1 << shift code points
       below the threshold a pair of values obtained from block_first_index
       and block_last_index,
     - blocks_upper_first_index / blocks_upper_last_index, the same pair
       for the code points from the threshold up to 0x10FFFF.
   Errors are reported on stderr.  */
3943 output_blocks (const char *version)
3945 const char *filename = "blocks.h";
3946 const unsigned int shift = 8; /* bits to shift away for array access */
3947 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3952 stream = fopen (filename, "w");
3955 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3959 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3960 fprintf (stream, "/* Unicode blocks. */\n");
3961 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3964 fprintf (stream, "static const uc_block_t blocks[] =\n");
3965 fprintf (stream, "{\n");
3966 for (i = 0; i < numblocks; i++)
3968 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3969 blocks[i].end, blocks[i].name);
3970 if (i+1 < numblocks)
3971 fprintf (stream, ",");
3972 fprintf (stream, "\n");
3974 fprintf (stream, "};\n");
/* shift and threshold are unsigned int, hence %u.  */
3975 fprintf (stream, "#define blocks_level1_shift %u\n", shift);
3976 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3977 fprintf (stream, "static const uint8_t blocks_level1[%u * 2] =\n",
3978 threshold >> shift);
3979 fprintf (stream, "{\n");
3980 for (i1 = 0; i1 < (threshold >> shift); i1++)
/* The pair covers the code points i1 << shift .. ((i1 + 1) << shift) - 1.
   NOTE(review): the remaining %d conversions in this function print
   unsigned int variables and helper results whose return types are declared
   elsewhere; they are left unchanged -- confirm the types before switching
   them to %u.  */
3982 unsigned int first_index = block_first_index (i1 << shift);
3983 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3984 fprintf (stream, " %3d, %3d", first_index, last_index);
3985 if (i1+1 < (threshold >> shift))
3986 fprintf (stream, ",");
3987 fprintf (stream, "\n");
3989 fprintf (stream, "};\n");
3990 fprintf (stream, "#define blocks_upper_first_index %d\n",
3991 block_first_index (threshold));
3992 fprintf (stream, "#define blocks_upper_last_index %d\n",
3993 block_last_index (0x10FFFF));
3995 if (ferror (stream) || fclose (stream))
3997 fprintf (stderr, "error writing to '%s'\n", filename);
4002 /* ========================================================================= */
4004 /* C and Java syntax. */
/* Classification of a character with respect to identifier syntax; these
   are the values returned by c_ident_category.  */
4008 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4009 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4010 UC_IDENTIFIER_INVALID, /* not valid */
4011 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* Nonzero iff CH is one of the six C white-space characters: space,
   horizontal tab, new-line ('\n' or '\r'), vertical tab, form-feed.  */
4014 /* ISO C 99 section 6.4.(3). */
4016 is_c_whitespace (unsigned int ch)
4018 return (ch == ' ' /* space */
4019 || ch == '\t' /* horizontal tab */
4020 || ch == '\n' || ch == '\r' /* new-line */
4021 || ch == '\v' /* vertical tab */
4022 || ch == '\f'); /* form-feed */
4025 /* ISO C 99 section 6.4.2.1 and appendix D. */
/* Classify CH for use in a C identifier.
   ASCII digits yield UC_IDENTIFIER_VALID; ASCII letters and '_' yield
   UC_IDENTIFIER_START; code points matched by the long condition below
   yield UC_IDENTIFIER_START; everything else yields UC_IDENTIFIER_INVALID.
   NOTE(review): this listing has gaps in its embedded line numbering.  The
   line that opens the long condition and (presumably) the tests on single
   code points are not shown here; the script labels added below are derived
   from the Unicode block each range lies in.  Confirm against the full
   source before relying on the exact set. */
4027 c_ident_category (unsigned int ch)
4029 /* Section 6.4.2.1. */
4030 if (ch >= '0' && ch <= '9')
4031 return UC_IDENTIFIER_VALID;
4032 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4033 return UC_IDENTIFIER_START;
/* Latin (including Latin Extended Additional) */
4039 || (ch >= 0x00C0 && ch <= 0x00D6)
4040 || (ch >= 0x00D8 && ch <= 0x00F6)
4041 || (ch >= 0x00F8 && ch <= 0x01F5)
4042 || (ch >= 0x01FA && ch <= 0x0217)
4043 || (ch >= 0x0250 && ch <= 0x02A8)
4044 || (ch >= 0x1E00 && ch <= 0x1E9B)
4045 || (ch >= 0x1EA0 && ch <= 0x1EF9)
/* Greek (including Greek Extended) */
4049 || (ch >= 0x0388 && ch <= 0x038A)
4051 || (ch >= 0x038E && ch <= 0x03A1)
4052 || (ch >= 0x03A3 && ch <= 0x03CE)
4053 || (ch >= 0x03D0 && ch <= 0x03D6)
4058 || (ch >= 0x03E2 && ch <= 0x03F3)
4059 || (ch >= 0x1F00 && ch <= 0x1F15)
4060 || (ch >= 0x1F18 && ch <= 0x1F1D)
4061 || (ch >= 0x1F20 && ch <= 0x1F45)
4062 || (ch >= 0x1F48 && ch <= 0x1F4D)
4063 || (ch >= 0x1F50 && ch <= 0x1F57)
4067 || (ch >= 0x1F5F && ch <= 0x1F7D)
4068 || (ch >= 0x1F80 && ch <= 0x1FB4)
4069 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4070 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4071 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4072 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4073 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4074 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4075 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4076 || (ch >= 0x1FF6 && ch <= 0x1FFC)
/* Cyrillic */
4078 || (ch >= 0x0401 && ch <= 0x040C)
4079 || (ch >= 0x040E && ch <= 0x044F)
4080 || (ch >= 0x0451 && ch <= 0x045C)
4081 || (ch >= 0x045E && ch <= 0x0481)
4082 || (ch >= 0x0490 && ch <= 0x04C4)
4083 || (ch >= 0x04C7 && ch <= 0x04C8)
4084 || (ch >= 0x04CB && ch <= 0x04CC)
4085 || (ch >= 0x04D0 && ch <= 0x04EB)
4086 || (ch >= 0x04EE && ch <= 0x04F5)
4087 || (ch >= 0x04F8 && ch <= 0x04F9)
/* Armenian */
4089 || (ch >= 0x0531 && ch <= 0x0556)
4090 || (ch >= 0x0561 && ch <= 0x0587)
/* Hebrew */
4092 || (ch >= 0x05B0 && ch <= 0x05B9)
4093 || (ch >= 0x05BB && ch <= 0x05BD)
4095 || (ch >= 0x05C1 && ch <= 0x05C2)
4096 || (ch >= 0x05D0 && ch <= 0x05EA)
4097 || (ch >= 0x05F0 && ch <= 0x05F2)
/* Arabic */
4099 || (ch >= 0x0621 && ch <= 0x063A)
4100 || (ch >= 0x0640 && ch <= 0x0652)
4101 || (ch >= 0x0670 && ch <= 0x06B7)
4102 || (ch >= 0x06BA && ch <= 0x06BE)
4103 || (ch >= 0x06C0 && ch <= 0x06CE)
4104 || (ch >= 0x06D0 && ch <= 0x06DC)
4105 || (ch >= 0x06E5 && ch <= 0x06E8)
4106 || (ch >= 0x06EA && ch <= 0x06ED)
/* Devanagari */
4108 || (ch >= 0x0901 && ch <= 0x0903)
4109 || (ch >= 0x0905 && ch <= 0x0939)
4110 || (ch >= 0x093E && ch <= 0x094D)
4111 || (ch >= 0x0950 && ch <= 0x0952)
4112 || (ch >= 0x0958 && ch <= 0x0963)
/* Bengali */
4114 || (ch >= 0x0981 && ch <= 0x0983)
4115 || (ch >= 0x0985 && ch <= 0x098C)
4116 || (ch >= 0x098F && ch <= 0x0990)
4117 || (ch >= 0x0993 && ch <= 0x09A8)
4118 || (ch >= 0x09AA && ch <= 0x09B0)
4120 || (ch >= 0x09B6 && ch <= 0x09B9)
4121 || (ch >= 0x09BE && ch <= 0x09C4)
4122 || (ch >= 0x09C7 && ch <= 0x09C8)
4123 || (ch >= 0x09CB && ch <= 0x09CD)
4124 || (ch >= 0x09DC && ch <= 0x09DD)
4125 || (ch >= 0x09DF && ch <= 0x09E3)
4126 || (ch >= 0x09F0 && ch <= 0x09F1)
/* Gurmukhi */
4129 || (ch >= 0x0A05 && ch <= 0x0A0A)
4130 || (ch >= 0x0A0F && ch <= 0x0A10)
4131 || (ch >= 0x0A13 && ch <= 0x0A28)
4132 || (ch >= 0x0A2A && ch <= 0x0A30)
4133 || (ch >= 0x0A32 && ch <= 0x0A33)
4134 || (ch >= 0x0A35 && ch <= 0x0A36)
4135 || (ch >= 0x0A38 && ch <= 0x0A39)
4136 || (ch >= 0x0A3E && ch <= 0x0A42)
4137 || (ch >= 0x0A47 && ch <= 0x0A48)
4138 || (ch >= 0x0A4B && ch <= 0x0A4D)
4139 || (ch >= 0x0A59 && ch <= 0x0A5C)
/* Gujarati */
4143 || (ch >= 0x0A81 && ch <= 0x0A83)
4144 || (ch >= 0x0A85 && ch <= 0x0A8B)
4146 || (ch >= 0x0A8F && ch <= 0x0A91)
4147 || (ch >= 0x0A93 && ch <= 0x0AA8)
4148 || (ch >= 0x0AAA && ch <= 0x0AB0)
4149 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4150 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4151 || (ch >= 0x0ABD && ch <= 0x0AC5)
4152 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4153 || (ch >= 0x0ACB && ch <= 0x0ACD)
/* Oriya */
4157 || (ch >= 0x0B01 && ch <= 0x0B03)
4158 || (ch >= 0x0B05 && ch <= 0x0B0C)
4159 || (ch >= 0x0B0F && ch <= 0x0B10)
4160 || (ch >= 0x0B13 && ch <= 0x0B28)
4161 || (ch >= 0x0B2A && ch <= 0x0B30)
4162 || (ch >= 0x0B32 && ch <= 0x0B33)
4163 || (ch >= 0x0B36 && ch <= 0x0B39)
4164 || (ch >= 0x0B3E && ch <= 0x0B43)
4165 || (ch >= 0x0B47 && ch <= 0x0B48)
4166 || (ch >= 0x0B4B && ch <= 0x0B4D)
4167 || (ch >= 0x0B5C && ch <= 0x0B5D)
4168 || (ch >= 0x0B5F && ch <= 0x0B61)
/* Tamil */
4170 || (ch >= 0x0B82 && ch <= 0x0B83)
4171 || (ch >= 0x0B85 && ch <= 0x0B8A)
4172 || (ch >= 0x0B8E && ch <= 0x0B90)
4173 || (ch >= 0x0B92 && ch <= 0x0B95)
4174 || (ch >= 0x0B99 && ch <= 0x0B9A)
4176 || (ch >= 0x0B9E && ch <= 0x0B9F)
4177 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4178 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4179 || (ch >= 0x0BAE && ch <= 0x0BB5)
4180 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4181 || (ch >= 0x0BBE && ch <= 0x0BC2)
4182 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4183 || (ch >= 0x0BCA && ch <= 0x0BCD)
/* Telugu */
4185 || (ch >= 0x0C01 && ch <= 0x0C03)
4186 || (ch >= 0x0C05 && ch <= 0x0C0C)
4187 || (ch >= 0x0C0E && ch <= 0x0C10)
4188 || (ch >= 0x0C12 && ch <= 0x0C28)
4189 || (ch >= 0x0C2A && ch <= 0x0C33)
4190 || (ch >= 0x0C35 && ch <= 0x0C39)
4191 || (ch >= 0x0C3E && ch <= 0x0C44)
4192 || (ch >= 0x0C46 && ch <= 0x0C48)
4193 || (ch >= 0x0C4A && ch <= 0x0C4D)
4194 || (ch >= 0x0C60 && ch <= 0x0C61)
/* Kannada */
4196 || (ch >= 0x0C82 && ch <= 0x0C83)
4197 || (ch >= 0x0C85 && ch <= 0x0C8C)
4198 || (ch >= 0x0C8E && ch <= 0x0C90)
4199 || (ch >= 0x0C92 && ch <= 0x0CA8)
4200 || (ch >= 0x0CAA && ch <= 0x0CB3)
4201 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4202 || (ch >= 0x0CBE && ch <= 0x0CC4)
4203 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4204 || (ch >= 0x0CCA && ch <= 0x0CCD)
4206 || (ch >= 0x0CE0 && ch <= 0x0CE1)
/* Malayalam */
4208 || (ch >= 0x0D02 && ch <= 0x0D03)
4209 || (ch >= 0x0D05 && ch <= 0x0D0C)
4210 || (ch >= 0x0D0E && ch <= 0x0D10)
4211 || (ch >= 0x0D12 && ch <= 0x0D28)
4212 || (ch >= 0x0D2A && ch <= 0x0D39)
4213 || (ch >= 0x0D3E && ch <= 0x0D43)
4214 || (ch >= 0x0D46 && ch <= 0x0D48)
4215 || (ch >= 0x0D4A && ch <= 0x0D4D)
4216 || (ch >= 0x0D60 && ch <= 0x0D61)
/* Thai */
4218 || (ch >= 0x0E01 && ch <= 0x0E3A)
4219 || (ch >= 0x0E40 && ch <= 0x0E5B)
/* Lao */
4221 || (ch >= 0x0E81 && ch <= 0x0E82)
4223 || (ch >= 0x0E87 && ch <= 0x0E88)
4226 || (ch >= 0x0E94 && ch <= 0x0E97)
4227 || (ch >= 0x0E99 && ch <= 0x0E9F)
4228 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4231 || (ch >= 0x0EAA && ch <= 0x0EAB)
4232 || (ch >= 0x0EAD && ch <= 0x0EAE)
4233 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4234 || (ch >= 0x0EBB && ch <= 0x0EBD)
4235 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4237 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4238 || (ch >= 0x0EDC && ch <= 0x0EDD)
/* Tibetan */
4241 || (ch >= 0x0F18 && ch <= 0x0F19)
4245 || (ch >= 0x0F3E && ch <= 0x0F47)
4246 || (ch >= 0x0F49 && ch <= 0x0F69)
4247 || (ch >= 0x0F71 && ch <= 0x0F84)
4248 || (ch >= 0x0F86 && ch <= 0x0F8B)
4249 || (ch >= 0x0F90 && ch <= 0x0F95)
4251 || (ch >= 0x0F99 && ch <= 0x0FAD)
4252 || (ch >= 0x0FB1 && ch <= 0x0FB7)
/* Georgian */
4255 || (ch >= 0x10A0 && ch <= 0x10C5)
4256 || (ch >= 0x10D0 && ch <= 0x10F6)
/* Hiragana */
4258 || (ch >= 0x3041 && ch <= 0x3093)
4259 || (ch >= 0x309B && ch <= 0x309C)
/* Katakana */
4261 || (ch >= 0x30A1 && ch <= 0x30F6)
4262 || (ch >= 0x30FB && ch <= 0x30FC)
/* Bopomofo */
4264 || (ch >= 0x3105 && ch <= 0x312C)
4265 /* CJK Unified Ideographs */
4266 || (ch >= 0x4E00 && ch <= 0x9FA5)
/* Hangul syllables */
4268 || (ch >= 0xAC00 && ch <= 0xD7A3)
/* Digits of the scripts above.  Note that, as listed, these also lead to
   UC_IDENTIFIER_START rather than UC_IDENTIFIER_VALID. */
4270 || (ch >= 0x0660 && ch <= 0x0669)
4271 || (ch >= 0x06F0 && ch <= 0x06F9)
4272 || (ch >= 0x0966 && ch <= 0x096F)
4273 || (ch >= 0x09E6 && ch <= 0x09EF)
4274 || (ch >= 0x0A66 && ch <= 0x0A6F)
4275 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4276 || (ch >= 0x0B66 && ch <= 0x0B6F)
4277 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4278 || (ch >= 0x0C66 && ch <= 0x0C6F)
4279 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4280 || (ch >= 0x0D66 && ch <= 0x0D6F)
4281 || (ch >= 0x0E50 && ch <= 0x0E59)
4282 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4283 || (ch >= 0x0F20 && ch <= 0x0F33)
4284 /* Special characters */
4287 || (ch >= 0x02B0 && ch <= 0x02B8)
4289 || (ch >= 0x02BD && ch <= 0x02C1)
4290 || (ch >= 0x02D0 && ch <= 0x02D1)
4291 || (ch >= 0x02E0 && ch <= 0x02E4)
4297 || (ch >= 0x203F && ch <= 0x2040)
4300 || (ch >= 0x210A && ch <= 0x2113)
4302 || (ch >= 0x2118 && ch <= 0x211D)
4306 || (ch >= 0x212A && ch <= 0x2131)
4307 || (ch >= 0x2133 && ch <= 0x2138)
4308 || (ch >= 0x2160 && ch <= 0x2182)
4309 || (ch >= 0x3005 && ch <= 0x3007)
4310 || (ch >= 0x3021 && ch <= 0x3029)
4312 return UC_IDENTIFIER_START;
/* Nothing matched: not usable in a C identifier. */
4313 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710

   Return true if CH is white space in Java source text: space, horizontal
   tab, form-feed, or one of the line terminators LF and CR.  Unlike
   is_c_whitespace, the vertical tab is not included. */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4325 /* The Java Language Specification, 3rd edition, §3.8.
4326 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4327 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4329 java_ident_category (unsigned int ch)
4331 /* FIXME: Check this against Sun's JDK implementation. */
4332 if (is_category_L (ch) /* = Character.isLetter(ch) */
4333 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4334 || is_category_Sc (ch) /* currency symbol */
4335 || is_category_Pc (ch) /* connector punctuation */
4337 return UC_IDENTIFIER_START;
4338 if (is_category_Nd (ch) /* digit */
4339 || is_category_Mc (ch) /* combining mark */
4340 || is_category_Mn (ch) /* non-spacing mark */
4342 return UC_IDENTIFIER_VALID;
4343 if ((ch >= 0x0000 && ch <= 0x0008)
4344 || (ch >= 0x000E && ch <= 0x001B)
4345 || (ch >= 0x007F && ch <= 0x009F)
4346 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4348 return UC_IDENTIFIER_IGNORABLE;
4349 return UC_IDENTIFIER_INVALID;
4352 /* Construction of sparse 3-level tables. */
/* Parameters of the table type used by output_ident_category: the type is
   named identsyntax_table, its elements are uint8_t, and entries that were
   never added read as UC_IDENTIFIER_INVALID.  Allocation is mapped straight
   onto malloc/realloc.
   NOTE(review): the directive that consumes these macros is not part of
   this listing; presumably a template header is included right after
   them. */
4353 #define TABLE identsyntax_table
4354 #define ELEMENT uint8_t
4355 #define DEFAULT UC_IDENTIFIER_INVALID
4356 #define xmalloc malloc
4357 #define xrealloc realloc
4360 /* Output an identifier syntax categorization in a three-level bitmap. */
/* FILENAME is the file to create, PREDICATE maps a code point to one of the
   UC_IDENTIFIER_* values, NAME is the identifier given to the emitted C
   object, and VERSION is the Unicode version quoted in the file header.
   NOTE(review): this listing has gaps in its embedded line numbering;
   several declarations, braces, assignments and conditions of this function
   are not shown.  The comments below describe only the visible lines. */
4362 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4366 struct identsyntax_table t;
4367 unsigned int level1_offset, level2_offset, level3_offset;
4369 stream = fopen (filename, "w");
4372 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4376 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4377 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4378 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4383 identsyntax_table_init (&t);
/* Record every code point whose category differs from the table default. */
4385 for (ch = 0; ch < 0x110000; ch++)
4387 int syntaxcode = predicate (ch);
4388 if (syntaxcode != UC_IDENTIFIER_INVALID)
4389 identsyntax_table_add (&t, ch, syntaxcode);
4392 identsyntax_table_finalize (&t);
4394 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: a header of five uint32_t,
   then level1_size uint32_t entries, then (level2_size << q) uint32_t
   entries.  Presumably they are assigned to level1_offset, level2_offset
   and level3_offset respectively (the assignment lines are not shown). */
4396 5 * sizeof (uint32_t);
4398 5 * sizeof (uint32_t)
4399 + t.level1_size * sizeof (uint32_t);
4401 5 * sizeof (uint32_t)
4402 + t.level1_size * sizeof (uint32_t)
4403 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants. */
4405 for (i = 0; i < 5; i++)
4406 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4407 ((uint32_t *) t.result)[i]);
/* Emit the declaration of the struct that holds the three levels. */
4408 fprintf (stream, "static const\n");
4409 fprintf (stream, "struct\n");
4410 fprintf (stream, " {\n");
4411 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4412 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4413 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4414 (1 << t.p) * 2 / 16);
4415 fprintf (stream, " }\n");
4416 fprintf (stream, "%s =\n", name);
4417 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; each entry is either -1 or the
   level-2 position expressed in uint32_t units relative to level2_offset. */
4418 fprintf (stream, " {");
4419 if (t.level1_size > 8)
4420 fprintf (stream, "\n ");
4421 for (i = 0; i < t.level1_size; i++)
4424 if (i > 0 && (i % 8) == 0)
4425 fprintf (stream, "\n ");
4426 offset = ((uint32_t *) (t.result + level1_offset))[i];
4428 fprintf (stream, " %5d", -1);
4430 fprintf (stream, " %5zd",
4431 (offset - level2_offset) / sizeof (uint32_t));
4432 if (i+1 < t.level1_size)
4433 fprintf (stream, ",");
4435 if (t.level1_size > 8)
4436 fprintf (stream, "\n ");
4437 fprintf (stream, " },\n");
/* Level 2: same layout; entries are -1 or the level-3 position in uint8_t
   units relative to level3_offset. */
4438 fprintf (stream, " {");
4439 if (t.level2_size << t.q > 8)
4440 fprintf (stream, "\n ");
4441 for (i = 0; i < t.level2_size << t.q; i++)
4444 if (i > 0 && (i % 8) == 0)
4445 fprintf (stream, "\n ");
4446 offset = ((uint32_t *) (t.result + level2_offset))[i];
4448 fprintf (stream, " %5d", -1);
4450 fprintf (stream, " %5zd",
4451 (offset - level3_offset) / sizeof (uint8_t));
4452 if (i+1 < t.level2_size << t.q)
4453 fprintf (stream, ",");
4455 if (t.level2_size << t.q > 8)
4456 fprintf (stream, "\n ");
4457 fprintf (stream, " },\n");
4458 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Eight consecutive one-byte entries are combined into one 16-bit word,
   entry 8*i+k occupying bits 2*k and 2*k+1. */
4459 fprintf (stream, " {");
4460 if ((t.level3_size << t.p) * 2 / 16 > 8)
4461 fprintf (stream, "\n ");
4462 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4464 if (i > 0 && (i % 8) == 0)
4465 fprintf (stream, "\n ");
4466 fprintf (stream, " 0x%04x",
4467 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4468 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4469 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4470 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4471 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4475 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4476 fprintf (stream, ",");
4478 if ((t.level3_size << t.p) * 2 / 16 > 8)
4479 fprintf (stream, "\n ");
4480 fprintf (stream, " }\n");
4481 fprintf (stream, "};\n");
/* A write error or a failing fclose is reported on stderr. */
4483 if (ferror (stream) || fclose (stream))
4485 fprintf (stderr, "error writing to '%s'\n", filename);
/* Output the language-syntax properties for Unicode version VERSION.
   For each whitespace predicate P, PROPERTY(P) writes three files:
   "sy_P.txt" (debug dump), "test-sy_P.c" (test data) and "sy_P.h" (the
   table, emitted under the name u_P).  The identifier categories for C and
   Java are then written to "sy_c_ident.h" and "sy_java_ident.h".
   NOTE(review): this listing has gaps in its embedded line numbering; the
   return type, the braces and any cleanup of the PROPERTY macro are not
   shown. */
4491 output_ident_properties (const char *version)
4493 #define PROPERTY(P) \
4494 debug_output_predicate ("sy_" #P ".txt", is_ ## P); \
4495 output_predicate_test ("test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4496 output_predicate ("sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4497 PROPERTY(c_whitespace)
4498 PROPERTY(java_whitespace)
/* Identifier categories are multi-valued, so they use the 2-bit table
   writer instead of the predicate writer. */
4501 output_ident_category ("sy_c_ident.h", c_ident_category, "u_c_ident", version);
4502 output_ident_category ("sy_java_ident.h", java_ident_category, "u_java_ident", version);
4505 /* ========================================================================= */
4507 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4508 glibc/localedata/locales/i18n file, generated by
4509 glibc/localedata/gen-unicode-ctype.c. */
4511 /* Character mappings. */
4514 to_upper (unsigned int ch)
4516 if (unicode_attributes[ch].name != NULL
4517 && unicode_attributes[ch].upper != NONE)
4518 return unicode_attributes[ch].upper;
4524 to_lower (unsigned int ch)
4526 if (unicode_attributes[ch].name != NULL
4527 && unicode_attributes[ch].lower != NONE)
4528 return unicode_attributes[ch].lower;
4534 to_title (unsigned int ch)
4536 if (unicode_attributes[ch].name != NULL
4537 && unicode_attributes[ch].title != NONE)
4538 return unicode_attributes[ch].title;
4543 /* Character class properties. */
/* A character counts as uppercase exactly when it has a lowercase mapping
   different from itself.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* A character counts as lowercase when it has an uppercase mapping
   different from itself, plus U+00DF as a special case.
   NOTE(review): the U+00DF term was restored from the comment that precedes
   it; confirm against the full source.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
4560 is_alpha (unsigned int ch)
4562 return (unicode_attributes[ch].name != NULL
4563 && ((unicode_attributes[ch].category[0] == 'L'
4564 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4565 <U0E2F>, <U0E46> should belong to is_punct. */
4566 && (ch != 0x0E2F) && (ch != 0x0E46))
4567 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4568 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4570 || (ch >= 0x0E34 && ch <= 0x0E3A)
4571 || (ch >= 0x0E47 && ch <= 0x0E4E)
4572 /* Avoid warning for <U0345>. */
4574 /* Avoid warnings for <U2160>..<U217F>. */
4575 || (unicode_attributes[ch].category[0] == 'N'
4576 && unicode_attributes[ch].category[1] == 'l')
4577 /* Avoid warnings for <U24B6>..<U24E9>. */
4578 || (unicode_attributes[ch].category[0] == 'S'
4579 && unicode_attributes[ch].category[1] == 'o'
4580 && strstr (unicode_attributes[ch].name, " LETTER ")
4582 /* Consider all the non-ASCII digits as alphabetic.
4583 ISO C 99 forbids us to have them in category "digit",
4584 but we want iswalnum to return true on them. */
4585 || (unicode_attributes[ch].category[0] == 'N'
4586 && unicode_attributes[ch].category[1] == 'd'
4587 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH belongs to the "digit" class, i.e. is one of the ASCII
   digits U+0030..U+0039.  The category-based alternative (all of Nd) is
   kept under "#if 0" for reference only.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Return true if CH belongs to the "outdigit" class: the ASCII digits
   U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH belongs to the "alnum" class: the union of the "alpha"
   and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4625 is_blank (unsigned int ch)
4627 return (ch == 0x0009 /* '\t' */
4628 /* Category Zs without mention of "<noBreak>" */
4629 || (unicode_attributes[ch].name != NULL
4630 && unicode_attributes[ch].category[0] == 'Z'
4631 && unicode_attributes[ch].category[1] == 's'
4632 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4636 is_space (unsigned int ch)
4638 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4639 should treat it like a punctuation character, not like a space. */
4640 return (ch == 0x0020 /* ' ' */
4641 || ch == 0x000C /* '\f' */
4642 || ch == 0x000A /* '\n' */
4643 || ch == 0x000D /* '\r' */
4644 || ch == 0x0009 /* '\t' */
4645 || ch == 0x000B /* '\v' */
4646 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4647 || (unicode_attributes[ch].name != NULL
4648 && unicode_attributes[ch].category[0] == 'Z'
4649 && (unicode_attributes[ch].category[1] == 'l'
4650 || unicode_attributes[ch].category[1] == 'p'
4651 || (unicode_attributes[ch].category[1] == 's'
4652 && !strstr (unicode_attributes[ch].decomposition,
4657 is_cntrl (unsigned int ch)
4659 return (unicode_attributes[ch].name != NULL
4660 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4661 /* Categories Zl and Zp */
4662 || (unicode_attributes[ch].category[0] == 'Z'
4663 && (unicode_attributes[ch].category[1] == 'l'
4664 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH belongs to the "xdigit" class, i.e. is one of the ASCII
   hexadecimal digits 0-9, A-F, a-f.  The is_digit-based alternative is kept
   under "#if 0" for reference only.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
/* Predicate for the "graph" class.  The visible part requires CH to be an
   assigned character whose name is not "<control>".
   NOTE(review): the embedded numbering shows one line missing after the
   strcmp term (the expression is not closed in this listing).  Presumably
   it excludes the "space" class, since output_tables reports any character
   that is both space and graph -- confirm against the full source. */
4690 is_graph (unsigned int ch)
4692 return (unicode_attributes[ch].name != NULL
4693 && strcmp (unicode_attributes[ch].name, "<control>")
4698 is_print (unsigned int ch)
4700 return (unicode_attributes[ch].name != NULL
4701 && strcmp (unicode_attributes[ch].name, "<control>")
4702 /* Categories Zl and Zp */
4703 && !(unicode_attributes[ch].name != NULL
4704 && unicode_attributes[ch].category[0] == 'Z'
4705 && (unicode_attributes[ch].category[1] == 'l'
4706 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH belongs to the "punct" class: every "graph" character
   that is neither "alpha" nor "digit".  The category-based alternative
   (general category P*) is kept under "#if 0" for reference only.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
4722 /* Output all properties. */
/* For a predicate is_P, PROPERTY(P) writes "ctype_P.txt" (debug dump),
   "test-ctype_P.c" (test data) and "ctype_P.h" (the table, emitted under
   the name u_is_P) for Unicode version VERSION.
   NOTE(review): this listing has gaps in its embedded line numbering; the
   PROPERTY(...) invocations and the end of the function are not shown, so
   which classes are actually written cannot be told from here. */
4724 output_old_ctype (const char *version)
4726 #define PROPERTY(P) \
4727 debug_output_predicate ("ctype_" #P ".txt", is_ ## P); \
4728 output_predicate_test ("test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4729 output_predicate ("ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4748 is_combining (unsigned int ch)
4750 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4751 file. In 3.0.1 it was identical to the union of the general categories
4752 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4753 PropList.txt file, so we take the latter definition. */
4754 return (unicode_attributes[ch].name != NULL
4755 && unicode_attributes[ch].category[0] == 'M'
4756 && (unicode_attributes[ch].category[1] == 'n'
4757 || unicode_attributes[ch].category[1] == 'c'
4758 || unicode_attributes[ch].category[1] == 'e'));
4762 is_combining_level3 (unsigned int ch)
4764 return is_combining (ch)
4765 && !(unicode_attributes[ch].combining[0] != '\0'
4766 && unicode_attributes[ch].combining[0] != '0'
4767 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
   hex digits for I below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites, so
   copy it before calling again.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* Bounded formatting: the longest form is 11 characters plus the NUL.  */
  snprintf (buf, sizeof (buf), (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}
/* Return the UCS symbol range string for an interval of Unicode characters:
   the symbol of LOW, "..", then the symbol of HIGH.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  int len;

  /* ucs_symbol returns a shared static buffer, so each symbol is copied
     into BUF before the next one is produced.  Two symbols of at most 11
     characters plus ".." fit exactly into 24 characters.  */
  len = snprintf (buf, sizeof (buf), "%s..", ucs_symbol (low));
  snprintf (buf + len, sizeof (buf) - len, "%s", ucs_symbol (high));
  return buf;
}
4792 /* Output a character class (= property) table. */
/* Writes to STREAM a line that starts with CLASSNAME and lists the code
   points for which FUNC returns true, as single symbols or "low..high"
   ranges separated by ';'.  When the next item would pass column 75 the
   line is continued with the sequence "/" newline.
   NOTE(review): this listing has gaps in its embedded line numbering; the
   loop that delimits each run and several braces/conditions are not shown.
   Also note that `table` is a 0x110000-byte automatic array. */
4795 output_charclass (FILE *stream, const char *classname,
4796 bool (*func) (unsigned int))
4798 char table[0x110000];
4800 bool need_semicolon;
4801 const int max_column = 75;
/* Evaluate the predicate once per code point. */
4804 for (i = 0; i < 0x110000; i++)
4805 table[i] = (int) func (i);
4807 fprintf (stream, "%s ", classname);
4808 need_semicolon = false;
/* The index is advanced inside the loop body, one run at a time. */
4810 for (i = 0; i < 0x110000; )
4816 unsigned int low, high;
4822 while (i < 0x110000 && table[i]);
/* A run is printed either as a single symbol or as a range. */
4826 strcpy (buf, ucs_symbol (low));
4828 strcpy (buf, ucs_symbol_range (low, high));
4832 fprintf (stream, ";");
4836 if (column + strlen (buf) > max_column)
4838 fprintf (stream, "/\n ");
4842 fprintf (stream, "%s", buf);
4843 column += strlen (buf);
4844 need_semicolon = true;
4847 fprintf (stream, "\n");
4850 /* Output a character mapping table. */
/* Writes to STREAM a line that starts with MAPNAME and lists, for every
   code point I with FUNC (I) != I, an item built from the symbol of I and
   the symbol of FUNC (I); items are separated by ';' and the line is
   continued with "/" newline when the next item would pass column 75.
   NOTE(review): this listing has gaps in its embedded line numbering; the
   punctuation placed around the two symbols and several braces/conditions
   are not shown.  Also note that `table` is a 0x110000-byte automatic
   array. */
4853 output_charmap (FILE *stream, const char *mapname,
4854 unsigned int (*func) (unsigned int))
4856 char table[0x110000];
4858 bool need_semicolon;
4859 const int max_column = 75;
/* Mark the code points that FUNC does not map to themselves. */
4862 for (i = 0; i < 0x110000; i++)
4863 table[i] = (func (i) != i);
4865 fprintf (stream, "%s ", mapname);
4866 need_semicolon = false;
4868 for (i = 0; i < 0x110000; i++)
4874 strcat (buf, ucs_symbol (i));
4876 strcat (buf, ucs_symbol (func (i)));
4881 fprintf (stream, ";");
4885 if (column + strlen (buf) > max_column)
4887 fprintf (stream, "/\n ");
4891 fprintf (stream, "%s", buf);
4892 column += strlen (buf);
4893 need_semicolon = true;
4895 fprintf (stream, "\n");
4898 /* Output the width table. */
/* NOTE(review): no statements of this function appear in this listing, so
   what (if anything) it writes to STREAM cannot be told from here. */
4901 output_widthmap (FILE *stream)
4905 /* Output the tables to the given file. */
/* Writes to FILENAME a locale-definition style file for Unicode VERSION: an
   LC_IDENTIFICATION section, then an LC_CTYPE section with the character
   classes and the toupper/tolower/totitle maps.  Before the LC_CTYPE
   section, every code point is checked against the class restrictions
   quoted below; each violation is reported on stderr.
   NOTE(review): this listing has gaps in its embedded line numbering;
   declarations, braces, and some fprintf openers/arguments are not shown. */
4908 output_tables (const char *filename, const char *version)
4913 stream = fopen (filename, "w");
4916 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4920 fprintf (stream, "escape_char /\n");
4921 fprintf (stream, "comment_char %%\n");
4922 fprintf (stream, "\n");
4923 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4925 fprintf (stream, "\n");
4927 fprintf (stream, "LC_IDENTIFICATION\n");
4928 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4929 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4930 fprintf (stream, "address \"\"\n");
4931 fprintf (stream, "contact \"\"\n");
4932 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4933 fprintf (stream, "tel \"\"\n");
4934 fprintf (stream, "fax \"\"\n");
4935 fprintf (stream, "language \"\"\n");
4936 fprintf (stream, "territory \"Earth\"\n");
4937 fprintf (stream, "revision \"%s\"\n", version);
/* The date field is the generation time, formatted in UTC. */
4942 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4943 fprintf (stream, "date \"%s\"\n", date);
4945 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4946 fprintf (stream, "END LC_IDENTIFICATION\n");
4947 fprintf (stream, "\n");
4949 /* Verifications. */
4950 for (ch = 0; ch < 0x110000; ch++)
4952 /* toupper restriction: "Only characters specified for the keywords
4953 lower and upper shall be specified." */
4954 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4956 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4957 ucs_symbol (ch), ch, to_upper (ch));
4959 /* tolower restriction: "Only characters specified for the keywords
4960 lower and upper shall be specified." */
4961 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4963 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4964 ucs_symbol (ch), ch, to_lower (ch));
4966 /* alpha restriction: "Characters classified as either upper or lower
4967 shall automatically belong to this class." */
4968 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4969 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4971 /* alpha restriction: "No character specified for the keywords cntrl,
4972 digit, punct or space shall be specified." */
4973 if (is_alpha (ch) && is_cntrl (ch))
4974 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4975 if (is_alpha (ch) && is_digit (ch))
4976 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4977 if (is_alpha (ch) && is_punct (ch))
4978 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4979 if (is_alpha (ch) && is_space (ch))
4980 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4982 /* space restriction: "No character specified for the keywords upper,
4983 lower, alpha, digit, graph or xdigit shall be specified."
4984 upper, lower, alpha already checked above. */
4985 if (is_space (ch) && is_digit (ch))
4986 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4987 if (is_space (ch) && is_graph (ch))
4988 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4989 if (is_space (ch) && is_xdigit (ch))
4990 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4992 /* cntrl restriction: "No character specified for the keywords upper,
4993 lower, alpha, digit, punct, graph, print or xdigit shall be
4994 specified." upper, lower, alpha already checked above. */
4995 if (is_cntrl (ch) && is_digit (ch))
4996 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
4997 if (is_cntrl (ch) && is_punct (ch))
4998 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
4999 if (is_cntrl (ch) && is_graph (ch))
5000 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5001 if (is_cntrl (ch) && is_print (ch))
5002 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5003 if (is_cntrl (ch) && is_xdigit (ch))
5004 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5006 /* punct restriction: "No character specified for the keywords upper,
5007 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5008 be specified." upper, lower, alpha, cntrl already checked above. */
5009 if (is_punct (ch) && is_digit (ch))
5010 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5011 if (is_punct (ch) && is_xdigit (ch))
5012 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5013 if (is_punct (ch) && (ch == 0x0020))
5014 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5016 /* graph restriction: "No character specified for the keyword cntrl
5017 shall be specified." Already checked above. */
5019 /* print restriction: "No character specified for the keyword cntrl
5020 shall be specified." Already checked above. */
5022 /* graph - print relation: differ only in the <space> character.
5023 How is this possible if there are more than one space character?!
5024 I think susv2/xbd/locale.html should speak of "space characters",
5025 not "space character". */
/* Note the asymmetry: the first test accepts any is_space character, the
   second only U+0020. */
5026 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5028 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5029 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5031 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section: classes first, then the case maps. */
5034 fprintf (stream, "LC_CTYPE\n");
5035 output_charclass (stream, "upper", is_upper);
5036 output_charclass (stream, "lower", is_lower);
5037 output_charclass (stream, "alpha", is_alpha);
5038 output_charclass (stream, "digit", is_digit);
5039 output_charclass (stream, "outdigit", is_outdigit);
5040 output_charclass (stream, "blank", is_blank);
5041 output_charclass (stream, "space", is_space);
5042 output_charclass (stream, "cntrl", is_cntrl);
5043 output_charclass (stream, "punct", is_punct);
5044 output_charclass (stream, "xdigit", is_xdigit);
5045 output_charclass (stream, "graph", is_graph);
5046 output_charclass (stream, "print", is_print);
5047 output_charclass (stream, "class \"combining\";", is_combining);
5048 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5049 output_charmap (stream, "toupper", to_upper);
5050 output_charmap (stream, "tolower", to_lower);
5051 output_charmap (stream, "map \"totitle\";", to_title);
5052 output_widthmap (stream);
5053 fprintf (stream, "END LC_CTYPE\n");
/* A write error or a failing fclose is reported on stderr. */
5055 if (ferror (stream) || fclose (stream))
5057 fprintf (stderr, "error writing to '%s'\n", filename);
/* Program entry point.  The first six command-line arguments are the paths
   of UnicodeData.txt, PropList.txt, DerivedCoreProperties.txt, Scripts.txt,
   Blocks.txt and PropList-3.0.1.txt, in that order; per the usage message a
   seventh argument gives the Unicode version.  The input files are loaded
   first, then every generated file is written.
   NOTE(review): this listing has gaps in its embedded line numbering; the
   argument-count check, the assignment of `version` and the return
   statement are not shown. */
5065 main (int argc, char * argv[])
5067 const char *unicodedata_filename;
5068 const char *proplist_filename;
5069 const char *derivedproplist_filename;
5070 const char *scripts_filename;
5071 const char *blocks_filename;
5072 const char *proplist30_filename;
5073 const char *version;
5077 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt version\n",
5082 unicodedata_filename = argv[1];
5083 proplist_filename = argv[2];
5084 derivedproplist_filename = argv[3];
5085 scripts_filename = argv[4];
5086 blocks_filename = argv[5];
5087 proplist30_filename = argv[6];
/* Load phase: properties are cleared once, then filled from both property
   files and from the 3.0.1 property list. */
5090 fill_attributes (unicodedata_filename);
5091 clear_properties ();
5092 fill_properties (proplist_filename);
5093 fill_properties (derivedproplist_filename);
5094 fill_properties30 (proplist30_filename);
5095 fill_scripts (scripts_filename);
5096 fill_blocks (blocks_filename);
/* Output phase.  output_tables is not among the calls listed here. */
5098 output_categories (version);
5099 output_category ("categ_of.h", version);
5100 output_combclass ("combining.h", version);
5101 output_bidi_category ("bidi_of.h", version);
5102 output_decimal_digit_test ("test-decdigit.h", version);
5103 output_decimal_digit ("decdigit.h", version);
5104 output_digit_test ("test-digit.h", version);
5105 output_digit ("digit.h", version);
5106 output_numeric_test ("test-numeric.h", version);
5107 output_numeric ("numeric.h", version);
5108 output_mirror ("mirror.h", version);
5109 output_properties (version);
5110 output_scripts (version);
5111 output_scripts_byname (version);
5112 output_blocks (version);
5113 output_ident_properties (version);
5114 output_old_ctype (version);
5120 * For Emacs M-x compile
5122 * compile-command: "
5123 gcc -O -Wall gen-ctype.c -o gen-ctype && \
5125 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/UnicodeData.txt \
5126 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/PropList.txt \
5127 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/DerivedCoreProperties.txt \
5128 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Scripts.txt \
5129 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Blocks.txt \
5130 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \