1 /* Generate Unicode conforming character classification tables and
2 Line Break Properties tables from a UnicodeData file.
3 Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      <further arguments not shown>
   NOTE(review): the example command line breaks off after
   WordBreakProperty.txt in this copy of the file; confirm the remaining
   arguments against a pristine copy.  */

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* ========================================================================= */

/* Reading UnicodeData.txt.  */


/* This structure represents one line in the UnicodeData.txt file.
   String members point either to the shared literal "" or to their own
   copy of the field text; integer members hold NONE when the field is
   empty.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   integers.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, indexed by code point.  */
struct unicode_attribute unicode_attributes [0x110000];
70 /* Stores in unicode_attributes[i] the values from the given fields. */
72 fill_attribute (unsigned int i,
73 const char *field1, const char *field2,
74 const char *field3, const char *field4,
75 const char *field5, const char *field6,
76 const char *field7, const char *field8,
77 const char *field9, const char *field10,
78 const char *field11, const char *field12,
79 const char *field13, const char *field14)
81 struct unicode_attribute * uni;
85 fprintf (stderr, "index too large\n");
88 if (strcmp (field2, "Cs") == 0)
89 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
91 uni = &unicode_attributes[i];
92 /* Copy the strings. */
93 uni->name = strdup (field1);
94 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
95 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
96 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
97 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
98 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
99 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
100 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
101 uni->mirrored = (field9[0] == 'Y');
102 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
103 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
104 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
105 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
106 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the defining line was missing from this copy of the file;
   120 is assumed here - confirm against a pristine copy.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   was reached before DELIM; BUFFER is then left unterminated).
   Terminates the program when the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
144 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
147 fill_attributes (const char *unicodedata_filename)
151 char field0[FIELDLEN];
152 char field1[FIELDLEN];
153 char field2[FIELDLEN];
154 char field3[FIELDLEN];
155 char field4[FIELDLEN];
156 char field5[FIELDLEN];
157 char field6[FIELDLEN];
158 char field7[FIELDLEN];
159 char field8[FIELDLEN];
160 char field9[FIELDLEN];
161 char field10[FIELDLEN];
162 char field11[FIELDLEN];
163 char field12[FIELDLEN];
164 char field13[FIELDLEN];
165 char field14[FIELDLEN];
168 for (i = 0; i < 0x110000; i++)
169 unicode_attributes[i].name = NULL;
171 stream = fopen (unicodedata_filename, "r");
174 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
183 n = getfield (stream, field0, ';');
184 n += getfield (stream, field1, ';');
185 n += getfield (stream, field2, ';');
186 n += getfield (stream, field3, ';');
187 n += getfield (stream, field4, ';');
188 n += getfield (stream, field5, ';');
189 n += getfield (stream, field6, ';');
190 n += getfield (stream, field7, ';');
191 n += getfield (stream, field8, ';');
192 n += getfield (stream, field9, ';');
193 n += getfield (stream, field10, ';');
194 n += getfield (stream, field11, ';');
195 n += getfield (stream, field12, ';');
196 n += getfield (stream, field13, ';');
197 n += getfield (stream, field14, '\n');
202 fprintf (stderr, "short line in '%s':%d\n",
203 unicodedata_filename, lineno);
206 i = strtoul (field0, NULL, 16);
208 && strlen (field1) >= 9
209 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
211 /* Deal with a range. */
213 n = getfield (stream, field0, ';');
214 n += getfield (stream, field1, ';');
215 n += getfield (stream, field2, ';');
216 n += getfield (stream, field3, ';');
217 n += getfield (stream, field4, ';');
218 n += getfield (stream, field5, ';');
219 n += getfield (stream, field6, ';');
220 n += getfield (stream, field7, ';');
221 n += getfield (stream, field8, ';');
222 n += getfield (stream, field9, ';');
223 n += getfield (stream, field10, ';');
224 n += getfield (stream, field11, ';');
225 n += getfield (stream, field12, ';');
226 n += getfield (stream, field13, ';');
227 n += getfield (stream, field14, '\n');
230 fprintf (stderr, "missing end range in '%s':%d\n",
231 unicodedata_filename, lineno);
234 if (!(field1[0] == '<'
235 && strlen (field1) >= 8
236 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
238 fprintf (stderr, "missing end range in '%s':%d\n",
239 unicodedata_filename, lineno);
242 field1[strlen (field1) - 7] = '\0';
243 j = strtoul (field0, NULL, 16);
245 fill_attribute (i, field1+1, field2, field3, field4, field5,
246 field6, field7, field8, field9, field10,
247 field11, field12, field13, field14);
251 /* Single character line */
252 fill_attribute (i, field1, field2, field3, field4, field5,
253 field6, field7, field8, field9, field10,
254 field11, field12, field13, field14);
257 if (ferror (stream) || fclose (stream))
259 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
264 /* ========================================================================= */
266 /* General category. */
267 /* See Unicode 3.0 book, section 4.5,
271 is_category_L (unsigned int ch)
273 return (unicode_attributes[ch].name != NULL
274 && unicode_attributes[ch].category[0] == 'L');
278 is_category_Lu (unsigned int ch)
280 return (unicode_attributes[ch].name != NULL
281 && unicode_attributes[ch].category[0] == 'L'
282 && unicode_attributes[ch].category[1] == 'u');
286 is_category_Ll (unsigned int ch)
288 return (unicode_attributes[ch].name != NULL
289 && unicode_attributes[ch].category[0] == 'L'
290 && unicode_attributes[ch].category[1] == 'l');
294 is_category_Lt (unsigned int ch)
296 return (unicode_attributes[ch].name != NULL
297 && unicode_attributes[ch].category[0] == 'L'
298 && unicode_attributes[ch].category[1] == 't');
302 is_category_Lm (unsigned int ch)
304 return (unicode_attributes[ch].name != NULL
305 && unicode_attributes[ch].category[0] == 'L'
306 && unicode_attributes[ch].category[1] == 'm');
310 is_category_Lo (unsigned int ch)
312 return (unicode_attributes[ch].name != NULL
313 && unicode_attributes[ch].category[0] == 'L'
314 && unicode_attributes[ch].category[1] == 'o');
318 is_category_M (unsigned int ch)
320 return (unicode_attributes[ch].name != NULL
321 && unicode_attributes[ch].category[0] == 'M');
325 is_category_Mn (unsigned int ch)
327 return (unicode_attributes[ch].name != NULL
328 && unicode_attributes[ch].category[0] == 'M'
329 && unicode_attributes[ch].category[1] == 'n');
333 is_category_Mc (unsigned int ch)
335 return (unicode_attributes[ch].name != NULL
336 && unicode_attributes[ch].category[0] == 'M'
337 && unicode_attributes[ch].category[1] == 'c');
341 is_category_Me (unsigned int ch)
343 return (unicode_attributes[ch].name != NULL
344 && unicode_attributes[ch].category[0] == 'M'
345 && unicode_attributes[ch].category[1] == 'e');
349 is_category_N (unsigned int ch)
351 return (unicode_attributes[ch].name != NULL
352 && unicode_attributes[ch].category[0] == 'N');
356 is_category_Nd (unsigned int ch)
358 return (unicode_attributes[ch].name != NULL
359 && unicode_attributes[ch].category[0] == 'N'
360 && unicode_attributes[ch].category[1] == 'd');
364 is_category_Nl (unsigned int ch)
366 return (unicode_attributes[ch].name != NULL
367 && unicode_attributes[ch].category[0] == 'N'
368 && unicode_attributes[ch].category[1] == 'l');
372 is_category_No (unsigned int ch)
374 return (unicode_attributes[ch].name != NULL
375 && unicode_attributes[ch].category[0] == 'N'
376 && unicode_attributes[ch].category[1] == 'o');
380 is_category_P (unsigned int ch)
382 return (unicode_attributes[ch].name != NULL
383 && unicode_attributes[ch].category[0] == 'P');
387 is_category_Pc (unsigned int ch)
389 return (unicode_attributes[ch].name != NULL
390 && unicode_attributes[ch].category[0] == 'P'
391 && unicode_attributes[ch].category[1] == 'c');
395 is_category_Pd (unsigned int ch)
397 return (unicode_attributes[ch].name != NULL
398 && unicode_attributes[ch].category[0] == 'P'
399 && unicode_attributes[ch].category[1] == 'd');
403 is_category_Ps (unsigned int ch)
405 return (unicode_attributes[ch].name != NULL
406 && unicode_attributes[ch].category[0] == 'P'
407 && unicode_attributes[ch].category[1] == 's');
411 is_category_Pe (unsigned int ch)
413 return (unicode_attributes[ch].name != NULL
414 && unicode_attributes[ch].category[0] == 'P'
415 && unicode_attributes[ch].category[1] == 'e');
419 is_category_Pi (unsigned int ch)
421 return (unicode_attributes[ch].name != NULL
422 && unicode_attributes[ch].category[0] == 'P'
423 && unicode_attributes[ch].category[1] == 'i');
427 is_category_Pf (unsigned int ch)
429 return (unicode_attributes[ch].name != NULL
430 && unicode_attributes[ch].category[0] == 'P'
431 && unicode_attributes[ch].category[1] == 'f');
435 is_category_Po (unsigned int ch)
437 return (unicode_attributes[ch].name != NULL
438 && unicode_attributes[ch].category[0] == 'P'
439 && unicode_attributes[ch].category[1] == 'o');
443 is_category_S (unsigned int ch)
445 return (unicode_attributes[ch].name != NULL
446 && unicode_attributes[ch].category[0] == 'S');
450 is_category_Sm (unsigned int ch)
452 return (unicode_attributes[ch].name != NULL
453 && unicode_attributes[ch].category[0] == 'S'
454 && unicode_attributes[ch].category[1] == 'm');
458 is_category_Sc (unsigned int ch)
460 return (unicode_attributes[ch].name != NULL
461 && unicode_attributes[ch].category[0] == 'S'
462 && unicode_attributes[ch].category[1] == 'c');
466 is_category_Sk (unsigned int ch)
468 return (unicode_attributes[ch].name != NULL
469 && unicode_attributes[ch].category[0] == 'S'
470 && unicode_attributes[ch].category[1] == 'k');
474 is_category_So (unsigned int ch)
476 return (unicode_attributes[ch].name != NULL
477 && unicode_attributes[ch].category[0] == 'S'
478 && unicode_attributes[ch].category[1] == 'o');
482 is_category_Z (unsigned int ch)
484 return (unicode_attributes[ch].name != NULL
485 && unicode_attributes[ch].category[0] == 'Z');
489 is_category_Zs (unsigned int ch)
491 return (unicode_attributes[ch].name != NULL
492 && unicode_attributes[ch].category[0] == 'Z'
493 && unicode_attributes[ch].category[1] == 's');
497 is_category_Zl (unsigned int ch)
499 return (unicode_attributes[ch].name != NULL
500 && unicode_attributes[ch].category[0] == 'Z'
501 && unicode_attributes[ch].category[1] == 'l');
505 is_category_Zp (unsigned int ch)
507 return (unicode_attributes[ch].name != NULL
508 && unicode_attributes[ch].category[0] == 'Z'
509 && unicode_attributes[ch].category[1] == 'p');
513 is_category_C (unsigned int ch)
515 return (unicode_attributes[ch].name == NULL
516 || unicode_attributes[ch].category[0] == 'C');
520 is_category_Cc (unsigned int ch)
522 return (unicode_attributes[ch].name != NULL
523 && unicode_attributes[ch].category[0] == 'C'
524 && unicode_attributes[ch].category[1] == 'c');
528 is_category_Cf (unsigned int ch)
530 return (unicode_attributes[ch].name != NULL
531 && unicode_attributes[ch].category[0] == 'C'
532 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH is a surrogate code point (0xD800..0xDFFF).  This is
   decided by range alone, not by a lookup in unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
542 is_category_Co (unsigned int ch)
544 return (unicode_attributes[ch].name != NULL
545 && unicode_attributes[ch].category[0] == 'C'
546 && unicode_attributes[ch].category[1] == 'o');
550 is_category_Cn (unsigned int ch)
552 return (unicode_attributes[ch].name == NULL
553 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   in 0..0x10FFFF for which PREDICATE is true: "0xFIRST..0xLAST" for a run
   of two or more code points, "0xCODE" for a single one.  (Listing every
   code point on its own line would yield huge text output.)
   Terminates the program if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate stays true.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a generated C source consisting of a license banner,
   an include of "test-predicate-part1.h", one "{ first, last }" initializer
   per maximal run of code points in 0..0x10FFFF for which PREDICATE is
   true (separated by ",\n"), a definition of the macro PREDICATE(c) as
   EXPRESSION, and an include of "test-predicate-part2.h".
   Terminates the program if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
         "/* Test the Unicode character type functions.\n"
         " Copyright (C) 2007 Free Software Foundation, Inc.\n"
         "\n", stream);
  fputs (" This program is free software: you can redistribute it and/or modify\n"
         " it under the terms of the GNU General Public License as published by\n"
         " the Free Software Foundation; either version 3 of the License, or\n"
         " (at your option) any later version.\n"
         "\n", stream);
  fputs (" This program is distributed in the hope that it will be useful,\n"
         " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
         " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
         " GNU General Public License for more details.\n"
         "\n", stream);
  fputs (" You should have received a copy of the GNU General Public License\n"
         " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"
         "\n"
         "#include \"test-predicate-part1.h\"\n"
         "\n", stream);

  /* The separator is written before every interval except the first, so
     that the last interval is not followed by a comma.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* NOTE(review): in this copy of the file every line starts with a stale
   line number, and a number of short lines (return type, braces, local
   declarations, conditions, assignment targets) are missing.  The comments
   added below describe only what the remaining lines show; restore the
   block from a pristine copy before compiling.  */
/* The table type `struct predicate_table' and the functions
   predicate_table_init, predicate_table_add and predicate_table_finalize
   used below are presumably generated by "3levelbit.h" from the TABLE
   macro, with malloc/realloc standing in for xmalloc/xrealloc - confirm
   against that header.  */
664 /* Construction of sparse 3-level tables. */
665 #define TABLE predicate_table
666 #define xmalloc malloc
667 #define xrealloc realloc
668 #include "3levelbit.h"
/* Writes to FILENAME a generated C header that defines a static const
   struct named NAME with the members header, level1, level2 and level3.
   COMMENT is inserted into the banner comment.  VERSION presumably fills
   the %s of the second banner line; the argument line of that fprintf is
   not visible here.  */
670 /* Output a boolean property in a three-level bitmap. */
672 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
676 struct predicate_table t;
677 unsigned int level1_offset, level2_offset, level3_offset;
679 stream = fopen (filename, "w");
682 fprintf (stderr, "cannot open '%s' for writing\n", filename);
686 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
687 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
688 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* NOTE(review): t.p and t.q are read further down, but no visible line
   assigns them; verify where they are set.  */
693 predicate_table_init (&t);
/* NOTE(review): the condition guarding the add below (presumably a call
   of PREDICATE on ch) is not visible here.  */
695 for (ch = 0; ch < 0x110000; ch++)
697 predicate_table_add (&t, ch);
699 predicate_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: five
   uint32_t words come first, followed by t.level1_size words, followed by
   (t.level2_size << t.q) words.  Their assignment targets are missing from
   this copy; presumably level1_offset, level2_offset and level3_offset in
   that order.  */
701 /* Offsets in t.result, in memory of this process. */
703 5 * sizeof (uint32_t);
705 5 * sizeof (uint32_t)
706 + t.level1_size * sizeof (uint32_t);
708 5 * sizeof (uint32_t)
709 + t.level1_size * sizeof (uint32_t)
710 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The leading words of t.result are emitted as header_N macros; word 1 is
   additionally stored in the struct's one-element header array below.  */
712 for (i = 0; i < 5; i++)
714 fprintf (stream, "#define header_%d %d\n", i,
715 ((uint32_t *) t.result)[i]);
717 fprintf (stream, "static const\n");
718 fprintf (stream, "struct\n");
719 fprintf (stream, " {\n");
720 fprintf (stream, " int header[1];\n");
721 fprintf (stream, " int level1[%zu];\n", t.level1_size);
722 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
723 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
724 fprintf (stream, " }\n");
725 fprintf (stream, "%s =\n", name);
726 fprintf (stream, "{\n");
727 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
728 fprintf (stream, " {");
729 if (t.level1_size > 1)
730 fprintf (stream, "\n ");
/* Level 1: (i % 1) == 0 is always true, so every entry after the first
   starts on a new line.  */
731 for (i = 0; i < t.level1_size; i++)
734 if (i > 0 && (i % 1) == 0)
735 fprintf (stream, "\n ");
736 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the test that selects between the -1 entry and the
   computed entry is not visible here.  The computed entry is written as
   a C expression: (1 + level1_size) ints converted to a count of shorts,
   plus the entry's word index within the level-2 data.  */
738 fprintf (stream, " %5d", -1);
740 fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd",
741 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
742 if (i+1 < t.level1_size)
743 fprintf (stream, ",");
745 if (t.level1_size > 1)
746 fprintf (stream, "\n ");
747 fprintf (stream, " },\n");
748 fprintf (stream, " {");
749 if (t.level2_size << t.q > 1)
750 fprintf (stream, "\n ");
/* Level 2: same layout as level 1, one entry per line.  */
751 for (i = 0; i < t.level2_size << t.q; i++)
754 if (i > 0 && (i % 1) == 0)
755 fprintf (stream, "\n ");
756 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* NOTE(review): as above, the selecting test is not visible.  The computed
   entry adds (1 + level1_size) ints, the level-2 entry count converted
   from shorts to ints, and the entry's word index within the level-3
   data.  */
758 fprintf (stream, " %5d", -1);
760 fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd",
761 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
762 if (i+1 < t.level2_size << t.q)
763 fprintf (stream, ",");
765 if (t.level2_size << t.q > 1)
766 fprintf (stream, "\n ");
767 fprintf (stream, " },\n");
768 fprintf (stream, " {");
769 if (t.level3_size << t.p > 4)
770 fprintf (stream, "\n ");
/* Level 3: the 32-bit words are printed in hexadecimal, four per line.  */
771 for (i = 0; i < t.level3_size << t.p; i++)
773 if (i > 0 && (i % 4) == 0)
774 fprintf (stream, "\n ");
775 fprintf (stream, " 0x%08X",
776 ((uint32_t *) (t.result + level3_offset))[i]);
777 if (i+1 < t.level3_size << t.p)
778 fprintf (stream, ",");
780 if (t.level3_size << t.p > 4)
781 fprintf (stream, "\n ");
782 fprintf (stream, " }\n");
783 fprintf (stream, "};\n");
/* fclose is called only when ferror reports no error on the stream.  */
785 if (ferror (stream) || fclose (stream))
787 fprintf (stderr, "error writing to '%s'\n", filename);
792 /* Output all categories. */
794 output_categories (const char *version)
796 #define CATEGORY(C) \
797 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
798 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
799 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Every two-letter category owns
   one bit; the mask of a one-letter category is the union of the masks of
   its two-letter subcategories.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
882 general_category_byname (const char *category_name)
884 if (category_name[0] != '\0'
885 && (category_name[1] == '\0' || category_name[2] == '\0'))
886 switch (category_name[0])
889 switch (category_name[1])
891 case '\0': return UC_CATEGORY_MASK_L;
892 case 'u': return UC_CATEGORY_MASK_Lu;
893 case 'l': return UC_CATEGORY_MASK_Ll;
894 case 't': return UC_CATEGORY_MASK_Lt;
895 case 'm': return UC_CATEGORY_MASK_Lm;
896 case 'o': return UC_CATEGORY_MASK_Lo;
900 switch (category_name[1])
902 case '\0': return UC_CATEGORY_MASK_M;
903 case 'n': return UC_CATEGORY_MASK_Mn;
904 case 'c': return UC_CATEGORY_MASK_Mc;
905 case 'e': return UC_CATEGORY_MASK_Me;
909 switch (category_name[1])
911 case '\0': return UC_CATEGORY_MASK_N;
912 case 'd': return UC_CATEGORY_MASK_Nd;
913 case 'l': return UC_CATEGORY_MASK_Nl;
914 case 'o': return UC_CATEGORY_MASK_No;
918 switch (category_name[1])
920 case '\0': return UC_CATEGORY_MASK_P;
921 case 'c': return UC_CATEGORY_MASK_Pc;
922 case 'd': return UC_CATEGORY_MASK_Pd;
923 case 's': return UC_CATEGORY_MASK_Ps;
924 case 'e': return UC_CATEGORY_MASK_Pe;
925 case 'i': return UC_CATEGORY_MASK_Pi;
926 case 'f': return UC_CATEGORY_MASK_Pf;
927 case 'o': return UC_CATEGORY_MASK_Po;
931 switch (category_name[1])
933 case '\0': return UC_CATEGORY_MASK_S;
934 case 'm': return UC_CATEGORY_MASK_Sm;
935 case 'c': return UC_CATEGORY_MASK_Sc;
936 case 'k': return UC_CATEGORY_MASK_Sk;
937 case 'o': return UC_CATEGORY_MASK_So;
941 switch (category_name[1])
943 case '\0': return UC_CATEGORY_MASK_Z;
944 case 's': return UC_CATEGORY_MASK_Zs;
945 case 'l': return UC_CATEGORY_MASK_Zl;
946 case 'p': return UC_CATEGORY_MASK_Zp;
950 switch (category_name[1])
952 case '\0': return UC_CATEGORY_MASK_C;
953 case 'c': return UC_CATEGORY_MASK_Cc;
954 case 'f': return UC_CATEGORY_MASK_Cf;
955 case 's': return UC_CATEGORY_MASK_Cs;
956 case 'o': return UC_CATEGORY_MASK_Co;
957 case 'n': return UC_CATEGORY_MASK_Cn;
961 /* Invalid category name. */
/* NOTE(review): in this copy of the file every line starts with a stale
   line number, and a number of short lines (return type, braces, local
   declarations, conditions, assignment targets, and possibly the #include
   that consumes the macros below) are missing.  The comments added below
   describe only what the remaining lines show; restore the block from a
   pristine copy before compiling.  */
/* Parameters for a table type `struct category_table' with uint8_t
   elements; category_table_init, category_table_add and
   category_table_finalize are used below.  DEFAULT is 29, the bit number
   of UC_CATEGORY_MASK_Cn.  */
965 /* Construction of sparse 3-level tables. */
966 #define TABLE category_table
967 #define ELEMENT uint8_t
968 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
969 #define xmalloc malloc
970 #define xrealloc realloc
/* Writes to FILENAME a generated C header that defines the static const
   struct u_category with the members level1, level2 and level3, where
   level3 holds one 5-bit category number per code point, packed into
   16-bit units.  VERSION presumably fills the %s of the second banner
   line; the argument line of that fprintf is not visible here.  */
973 /* Output the per-character category table. */
975 output_category (const char *filename, const char *version)
979 struct category_table t;
980 unsigned int level1_offset, level2_offset, level3_offset;
981 uint16_t *level3_packed;
983 stream = fopen (filename, "w");
986 fprintf (stderr, "cannot open '%s' for writing\n", filename);
990 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
991 fprintf (stream, "/* Categories of Unicode characters. */\n");
992 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* NOTE(review): t.p and t.q are read further down, but no visible line
   assigns them; verify where they are set.  */
997 category_table_init (&t);
/* For every code point, determine its category mask and store the bit
   number of that mask in the table.  Surrogates get the Cs mask by range;
   code points with an entry get the mask looked up from their category
   name.  NOTE(review): the branch for code points that are neither, and
   the action taken when the single-bit check fails, are not visible.  */
999 for (ch = 0; ch < 0x110000; ch++)
1002 unsigned int log2_value;
1004 if (is_category_Cs (ch))
1005 value = UC_CATEGORY_MASK_Cs;
1006 else if (unicode_attributes[ch].name != NULL)
1007 value = general_category_byname (unicode_attributes[ch].category);
1011 /* Now value should contain exactly one bit. */
1012 if (value == 0 || ((value & (value - 1)) != 0))
/* Count how often value can be halved before reaching 1, i.e. the bit
   number of its single set bit.  */
1015 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1017 category_table_add (&t, ch, log2_value);
1020 category_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: five
   uint32_t words come first, followed by t.level1_size words, followed by
   (t.level2_size << t.q) words.  Their assignment targets are missing from
   this copy; presumably level1_offset, level2_offset and level3_offset in
   that order.  */
1022 /* Offsets in t.result, in memory of this process. */
1024 5 * sizeof (uint32_t);
1026 5 * sizeof (uint32_t)
1027 + t.level1_size * sizeof (uint32_t);
1029 5 * sizeof (uint32_t)
1030 + t.level1_size * sizeof (uint32_t)
1031 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five leading words of t.result are emitted as category_header_N
   macros.  */
1033 for (i = 0; i < 5; i++)
1034 fprintf (stream, "#define category_header_%d %d\n", i,
1035 ((uint32_t *) t.result)[i]);
1036 fprintf (stream, "static const\n");
1037 fprintf (stream, "struct\n");
1038 fprintf (stream, " {\n");
1039 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1040 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 length is written as the product
   level3_size * ((1 << p) * 5 / 16), plus one spare unit.  */
1041 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1042 (1 << t.p) * 5 / 16);
1043 fprintf (stream, " }\n");
1044 fprintf (stream, "u_category =\n");
1045 fprintf (stream, "{\n");
1046 fprintf (stream, " {");
1047 if (t.level1_size > 8)
1048 fprintf (stream, "\n ");
/* Level 1, eight entries per line.  */
1049 for (i = 0; i < t.level1_size; i++)
1052 if (i > 0 && (i % 8) == 0)
1053 fprintf (stream, "\n ");
1054 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the test that selects between the -1 entry and the
   computed entry is not visible here.  The computed entry is the entry's
   word index within the level-2 data.  */
1056 fprintf (stream, " %5d", -1);
1058 fprintf (stream, " %5zd",
1059 (offset - level2_offset) / sizeof (uint32_t));
1060 if (i+1 < t.level1_size)
1061 fprintf (stream, ",");
1063 if (t.level1_size > 8)
1064 fprintf (stream, "\n ");
1065 fprintf (stream, " },\n");
1066 fprintf (stream, " {");
1067 if (t.level2_size << t.q > 8)
1068 fprintf (stream, "\n ");
/* Level 2, eight entries per line.  */
1069 for (i = 0; i < t.level2_size << t.q; i++)
1072 if (i > 0 && (i % 8) == 0)
1073 fprintf (stream, "\n ");
1074 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* NOTE(review): as above, the selecting test is not visible.  The computed
   entry is the entry's byte index within the level-3 data.  */
1076 fprintf (stream, " %5d", -1);
1078 fprintf (stream, " %5zd",
1079 (offset - level3_offset) / sizeof (uint8_t));
1080 if (i+1 < t.level2_size << t.q)
1081 fprintf (stream, ",");
1083 if (t.level2_size << t.q > 8)
1084 fprintf (stream, "\n ");
1085 fprintf (stream, " },\n");
1086 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1087 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the assignment target of this calloc call is missing
   (presumably level3_packed), and no visible line checks its result.  The
   extra unit leaves room for the j+1 access in the loop below.  */
1090 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*5 .. i*5+4 of the packed bit string: j is the
   16-bit unit in which it starts and k the bit position inside that unit.
   Units j and j+1 are combined into one 32-bit value so that an entry
   straddling a unit boundary can be OR-ed in with a single shift.  */
1091 for (i = 0; i < t.level3_size << t.p; i++)
1093 unsigned int j = (i * 5) / 16;
1094 unsigned int k = (i * 5) % 16;
1095 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1096 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1097 level3_packed[j] = value & 0xffff;
1098 level3_packed[j+1] = value >> 16;
1100 fprintf (stream, " {");
1101 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1102 fprintf (stream, "\n ");
/* Packed level 3, eight 16-bit units per line, in hexadecimal.  */
1103 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1105 if (i > 0 && (i % 8) == 0)
1106 fprintf (stream, "\n ");
1107 fprintf (stream, " 0x%04x", level3_packed[i]);
1108 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1109 fprintf (stream, ",");
1111 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1112 fprintf (stream, "\n ");
1113 fprintf (stream, " }\n");
1114 free (level3_packed);
1115 fprintf (stream, "};\n");
/* fclose is called only when ferror reports no error on the stream.  */
1117 if (ferror (stream) || fclose (stream))
1119 fprintf (stderr, "error writing to '%s'\n", filename);
/* NOTE(review): in this copy of the file every line starts with a stale
   line number, and a number of short lines (return type, braces, local
   declarations, conditions, assignment targets, and possibly a DEFAULT
   definition and the #include that consumes the macros below) are missing.
   The comments added below describe only what the remaining lines show;
   restore the block from a pristine copy before compiling.  Also note that
   the "See Unicode 3.0 book" comment below is not terminated on its own
   line; it ends at the close of the following comment line.  */
1124 /* ========================================================================= */
1126 /* Canonical combining class. */
1127 /* See Unicode 3.0 book, section 4.2,
1130 /* Construction of sparse 3-level tables. */
/* Parameters for a table type `struct combclass_table' with uint8_t
   elements; combclass_table_init, combclass_table_add and
   combclass_table_finalize are used below.  */
1131 #define TABLE combclass_table
1132 #define ELEMENT uint8_t
1134 #define xmalloc malloc
1135 #define xrealloc realloc
/* Writes to FILENAME a generated C header that defines the static const
   struct u_combclass with the members level1, level2 and level3, where
   level3 holds one byte per code point.  VERSION presumably fills the %s
   of the second banner line; the argument line of that fprintf is not
   visible here.  */
1138 /* Output the per-character combining class table. */
1140 output_combclass (const char *filename, const char *version)
1144 struct combclass_table t;
1145 unsigned int level1_offset, level2_offset, level3_offset;
1147 stream = fopen (filename, "w");
1150 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1154 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1155 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1156 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* NOTE(review): t.p and t.q are read further down, but no visible line
   assigns them; verify where they are set.  */
1161 combclass_table_init (&t);
/* Only code points that have an entry are added.  The combining class is
   parsed with atoi, which returns 0 for an empty or non-numeric string.
   NOTE(review): the action taken when the value is outside 0..255 is not
   visible here.  */
1163 for (ch = 0; ch < 0x110000; ch++)
1164 if (unicode_attributes[ch].name != NULL)
1166 int value = atoi (unicode_attributes[ch].combining);
1167 if (!(value >= 0 && value <= 255))
1169 combclass_table_add (&t, ch, value);
1172 combclass_table_finalize (&t);
/* The three expressions below are byte offsets into t.result: five
   uint32_t words come first, followed by t.level1_size words, followed by
   (t.level2_size << t.q) words.  Their assignment targets are missing from
   this copy; presumably level1_offset, level2_offset and level3_offset in
   that order.  */
1174 /* Offsets in t.result, in memory of this process. */
1176 5 * sizeof (uint32_t);
1178 5 * sizeof (uint32_t)
1179 + t.level1_size * sizeof (uint32_t);
1181 5 * sizeof (uint32_t)
1182 + t.level1_size * sizeof (uint32_t)
1183 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five leading words of t.result are emitted as combclass_header_N
   macros.  */
1185 for (i = 0; i < 5; i++)
1186 fprintf (stream, "#define combclass_header_%d %d\n", i,
1187 ((uint32_t *) t.result)[i]);
1188 fprintf (stream, "static const\n");
1189 fprintf (stream, "struct\n");
1190 fprintf (stream, " {\n");
1191 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1192 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1193 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1194 fprintf (stream, " }\n");
1195 fprintf (stream, "u_combclass =\n");
1196 fprintf (stream, "{\n");
1197 fprintf (stream, " {");
1198 if (t.level1_size > 8)
1199 fprintf (stream, "\n ");
/* Level 1, eight entries per line.  */
1200 for (i = 0; i < t.level1_size; i++)
1203 if (i > 0 && (i % 8) == 0)
1204 fprintf (stream, "\n ");
1205 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the test that selects between the -1 entry and the
   computed entry is not visible here.  The computed entry is the entry's
   word index within the level-2 data.  */
1207 fprintf (stream, " %5d", -1);
1209 fprintf (stream, " %5zd",
1210 (offset - level2_offset) / sizeof (uint32_t));
1211 if (i+1 < t.level1_size)
1212 fprintf (stream, ",");
1214 if (t.level1_size > 8)
1215 fprintf (stream, "\n ");
1216 fprintf (stream, " },\n");
1217 fprintf (stream, " {");
1218 if (t.level2_size << t.q > 8)
1219 fprintf (stream, "\n ");
/* Level 2, eight entries per line.  */
1220 for (i = 0; i < t.level2_size << t.q; i++)
1223 if (i > 0 && (i % 8) == 0)
1224 fprintf (stream, "\n ");
1225 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* NOTE(review): as above, the selecting test is not visible.  The computed
   entry is the entry's byte index within the level-3 data.  */
1227 fprintf (stream, " %5d", -1);
1229 fprintf (stream, " %5zd",
1230 (offset - level3_offset) / sizeof (uint8_t));
1231 if (i+1 < t.level2_size << t.q)
1232 fprintf (stream, ",");
1234 if (t.level2_size << t.q > 8)
1235 fprintf (stream, "\n ");
1236 fprintf (stream, " },\n");
1237 fprintf (stream, " {");
1238 if (t.level3_size << t.p > 8)
1239 fprintf (stream, "\n ");
/* Level 3: the combining class bytes in decimal, eight per line.  */
1240 for (i = 0; i < t.level3_size << t.p; i++)
1242 if (i > 0 && (i % 8) == 0)
1243 fprintf (stream, "\n ");
1244 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1245 if (i+1 < t.level3_size << t.p)
1246 fprintf (stream, ",");
1248 if (t.level3_size << t.p > 8)
1249 fprintf (stream, "\n ");
1250 fprintf (stream, " }\n");
1251 fprintf (stream, "};\n");
/* fclose is called only when ferror reports no error on the stream.  */
1253 if (ferror (stream) || fclose (stream))
1255 fprintf (stderr, "error writing to '%s'\n", filename);
1260 /* ========================================================================= */
1262 /* Bidirectional category. */
1263 /* See Unicode 3.0 book, section 4.3,
/* The 19 bidi category values.  output_bidi_category stores every table
   entry in 5 bits, so at most 32 enumerators can be represented.
   NOTE(review): no explicit values are given; presumably the enumerators are
   numbered consecutively starting at 0 -- confirm against the enum header.  */
1268 UC_BIDI_L, /* Left-to-Right */
1269 UC_BIDI_LRE, /* Left-to-Right Embedding */
1270 UC_BIDI_LRO, /* Left-to-Right Override */
1271 UC_BIDI_R, /* Right-to-Left */
1272 UC_BIDI_AL, /* Right-to-Left Arabic */
1273 UC_BIDI_RLE, /* Right-to-Left Embedding */
1274 UC_BIDI_RLO, /* Right-to-Left Override */
1275 UC_BIDI_PDF, /* Pop Directional Format */
1276 UC_BIDI_EN, /* European Number */
1277 UC_BIDI_ES, /* European Number Separator */
1278 UC_BIDI_ET, /* European Number Terminator */
1279 UC_BIDI_AN, /* Arabic Number */
1280 UC_BIDI_CS, /* Common Number Separator */
1281 UC_BIDI_NSM, /* Non-Spacing Mark */
1282 UC_BIDI_BN, /* Boundary Neutral */
1283 UC_BIDI_B, /* Paragraph Separator */
1284 UC_BIDI_S, /* Segment Separator */
1285 UC_BIDI_WS, /* Whitespace */
1286 UC_BIDI_ON /* Other Neutral */
/* Converts a bidi category name, such as the one stored in
   unicode_attributes[].bidi, into a bidi category value.
   The name is decoded one character at a time through nested switches on
   category_name[0], [1] and [2].  Every leaf additionally checks that the
   string ends right after the characters matched so far, so that a longer
   name with a valid prefix is not accepted by that leaf.
   NOTE(review): the result is presumably one of the UC_BIDI_* enumerators;
   the case labels and return statements should be confirmed.  */
1290 bidi_category_byname (const char *category_name)
/* Dispatch on the first character of the name.  */
1292 switch (category_name[0])
1295 switch (category_name[1])
/* Two-character names: the third byte must be the terminating NUL.  */
1298 if (category_name[2] == '\0')
1302 if (category_name[2] == '\0')
1308 switch (category_name[1])
1313 if (category_name[2] == '\0')
1319 switch (category_name[1])
1322 if (category_name[2] == '\0')
1328 switch (category_name[1])
1331 if (category_name[2] == '\0')
1335 if (category_name[2] == '\0')
1339 if (category_name[2] == '\0')
1345 switch (category_name[1])
/* Three-character names need one more level of dispatch; here the fourth
   byte must be the terminating NUL.  */
1350 switch (category_name[2])
1353 if (category_name[3] == '\0')
1357 if (category_name[3] == '\0')
1365 switch (category_name[1])
1368 switch (category_name[2])
1371 if (category_name[3] == '\0')
1379 switch (category_name[1])
1382 if (category_name[2] == '\0')
1388 switch (category_name[1])
1391 switch (category_name[2])
1394 if (category_name[3] == '\0')
1402 switch (category_name[1])
1407 switch (category_name[2])
1410 if (category_name[3] == '\0')
1414 if (category_name[3] == '\0')
/* A one-character name: the second byte must be the terminating NUL.  */
1422 if (category_name[1] == '\0')
1426 switch (category_name[1])
1429 if (category_name[2] == '\0')
/* Reached when no leaf above accepted the name.  */
1435 /* Invalid bidi category name. */
/* Returns the bidi category of the code point CH.
   An assigned character (one whose unicode_attributes[] entry has a name)
   gets the category recorded in its bidi field, converted through
   bidi_category_byname.  An unassigned code point gets a default that
   depends only on the range it falls into.  */
1440 get_bidi_category (unsigned int ch)
1442 if (unicode_attributes[ch].name != NULL)
1443 return bidi_category_byname (unicode_attributes[ch].bidi);
1446 /* The bidi category of unassigned characters depends on the range.
1447 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges.
   NOTE(review): presumably mapped to UC_BIDI_R -- confirm the return.  */
1448 if ((ch >= 0x0590 && ch <= 0x05FF)
1449 || (ch >= 0x07FB && ch <= 0x08FF)
1450 || (ch >= 0xFB37 && ch <= 0xFB45)
1451 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges.
   NOTE(review): presumably mapped to UC_BIDI_AL -- confirm the return.  */
1453 else if ((ch >= 0x0600 && ch <= 0x07BF)
1454 || (ch >= 0x2064 && ch <= 0x2069)
1455 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1456 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group of ranges.  The two masked tests match every code point
   whose low 16 bits are 0xFFFE or 0xFFFF, i.e. the last two code points of
   each 64K plane.
   NOTE(review): presumably mapped to UC_BIDI_BN -- confirm the return.  */
1458 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1459 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1460 || (ch & 0xFFFF) == 0xFFFE
1461 || (ch & 0xFFFF) == 0xFFFF
1462 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1469 /* Construction of sparse 3-level tables. */
/* Template parameters for the table type used by output_bidi_category:
   a struct named bidi_category_table with uint8_t elements, where code
   points that were never added read as UC_BIDI_L.
   The allocator hooks are mapped straight to malloc and realloc.
   NOTE(review): with this mapping an allocation failure is only detected if
   the table template itself checks the returned pointer -- confirm.  */
1470 #define TABLE bidi_category_table
1471 #define ELEMENT uint8_t
1472 #define DEFAULT UC_BIDI_L
1473 #define xmalloc malloc
1474 #define xrealloc realloc
1477 /* Output the per-character bidi category table. */
/* FILENAME is the C source file to create; VERSION is the Unicode version
   string that is copied into the banner comment of the generated file.
   The generated file contains five "#define bidi_category_header_N" lines
   followed by one static const struct, u_bidi_category, holding the three
   levels of the lookup table.  Errors while opening or writing FILENAME are
   reported on stderr.  */
1479 output_bidi_category (const char *filename, const char *version)
1483 struct bidi_category_table t;
1484 unsigned int level1_offset, level2_offset, level3_offset;
1485 uint16_t *level3_packed;
1487 stream = fopen (filename, "w");
1490 fprintf (stderr, "cannot open '%s' for writing\n", filename);
/* NOTE(review): the banner names gen-ctype.c, whereas the usage comment at
   the top of this file calls the program gen-uni-tables -- confirm which
   name is intended.  */
1494 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1495 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1496 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1501 bidi_category_table_init (&t);
/* Enter the category of every code point from 0 up to 0x10FFFF.  */
1503 for (ch = 0; ch < 0x110000; ch++)
1505 int value = get_bidi_category (ch);
1507 bidi_category_table_add (&t, ch, value);
1510 bidi_category_table_finalize (&t);
1512 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 array, then the level2 array, then the level3 array; the three
   expressions below are the byte offsets of these arrays.  */
1514 5 * sizeof (uint32_t);
1516 5 * sizeof (uint32_t)
1517 + t.level1_size * sizeof (uint32_t);
1519 5 * sizeof (uint32_t)
1520 + t.level1_size * sizeof (uint32_t)
1521 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
1523 for (i = 0; i < 5; i++)
1524 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1525 ((uint32_t *) t.result)[i]);
1526 fprintf (stream, "static const\n");
1527 fprintf (stream, "struct\n");
1528 fprintf (stream, " {\n");
1529 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1530 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 length, level3_size * ((1 << p) * 5 / 16) + 1, equals
   the number of units emitted further down, (level3_size << p) * 5 / 16 + 1,
   only when (1 << p) * 5 is a multiple of 16, i.e. when t.p >= 4.  */
1531 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1532 (1 << t.p) * 5 / 16);
1533 fprintf (stream, " }\n");
1534 fprintf (stream, "u_bidi_category =\n");
1535 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry of t.result is a byte
   offset; it is emitted as an index into level2 (in uint32_t units), or as
   -1.  NOTE(review): -1 presumably marks an entry without a level2 block;
   confirm the condition that selects it.  */
1536 fprintf (stream, " {");
1537 if (t.level1_size > 8)
1538 fprintf (stream, "\n ");
1539 for (i = 0; i < t.level1_size; i++)
1542 if (i > 0 && (i % 8) == 0)
1543 fprintf (stream, "\n ");
1544 offset = ((uint32_t *) (t.result + level1_offset))[i];
1546 fprintf (stream, " %5d", -1);
1548 fprintf (stream, " %5zd",
1549 (offset - level2_offset) / sizeof (uint32_t));
1550 if (i+1 < t.level1_size)
1551 fprintf (stream, ",");
1553 if (t.level1_size > 8)
1554 fprintf (stream, "\n ");
1555 fprintf (stream, " },\n");
/* Level 2: same layout; entries become indices into the unpacked level3
   array (in uint8_t units), or -1.  */
1556 fprintf (stream, " {");
1557 if (t.level2_size << t.q > 8)
1558 fprintf (stream, "\n ");
1559 for (i = 0; i < t.level2_size << t.q; i++)
1562 if (i > 0 && (i % 8) == 0)
1563 fprintf (stream, "\n ");
1564 offset = ((uint32_t *) (t.result + level2_offset))[i];
1566 fprintf (stream, " %5d", -1);
1568 fprintf (stream, " %5zd",
1569 (offset - level3_offset) / sizeof (uint8_t));
1570 if (i+1 < t.level2_size << t.q)
1571 fprintf (stream, ",");
1573 if (t.level2_size << t.q > 8)
1574 fprintf (stream, "\n ");
1575 fprintf (stream, " },\n");
1576 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1577 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-filled and one unit longer than the packed data, so
   that the access to unit j+1 below stays inside the allocation.
   NOTE(review): no NULL check of the calloc result is visible here.  */
1580 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1581 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 5*i .. 5*i+4 of the bit stream: j is the 16-bit
   unit containing its first bit, k the position of that bit in the unit.
   Units j and j+1 are combined into one 32-bit window, the entry is OR-ed
   in at bit k, and the window is split back into the two units; an entry
   that straddles a unit boundary thereby spills into unit j+1.  */
1583 unsigned int j = (i * 5) / 16;
1584 unsigned int k = (i * 5) % 16;
1585 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1586 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1587 level3_packed[j] = value & 0xffff;
1588 level3_packed[j+1] = value >> 16;
/* Level 3: the packed units, eight per output line, in hexadecimal.  */
1590 fprintf (stream, " {");
1591 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1592 fprintf (stream, "\n ");
1593 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1595 if (i > 0 && (i % 8) == 0)
1596 fprintf (stream, "\n ");
1597 fprintf (stream, " 0x%04x", level3_packed[i]);
1598 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1599 fprintf (stream, ",");
1601 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1602 fprintf (stream, "\n ");
1603 fprintf (stream, " }\n");
1604 free (level3_packed);
1605 fprintf (stream, "};\n");
/* A write error recorded on the stream or a failing fclose both count as
   failure to produce the file.  */
1607 if (ferror (stream) || fclose (stream))
1609 fprintf (stderr, "error writing to '%s'\n", filename);
1614 /* ========================================================================= */
1616 /* Decimal digit value. */
1617 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH.
   When CH is assigned (its unicode_attributes[] entry has a name) and its
   decdigit field is a non-empty string, that string is converted with atoi.
   NOTE(review): the value returned otherwise is presumably -1, since the
   callers accept results in the range -1..9 -- confirm.  */
1620 get_decdigit_value (unsigned int ch)
1622 if (unicode_attributes[ch].name != NULL
1623 && unicode_attributes[ch].decdigit[0] != '\0')
1624 return atoi (unicode_attributes[ch].decdigit);
1628 /* Construction of sparse 3-level tables. */
/* Template parameters for struct decdigit_table, a table of uint8_t
   elements.  The same table type is used by output_decimal_digit and by
   output_digit.  The allocator hooks are mapped straight to malloc and
   realloc.
   NOTE(review): the DEFAULT element value is not visible here -- confirm.  */
1629 #define TABLE decdigit_table
1630 #define ELEMENT uint8_t
1632 #define xmalloc malloc
1633 #define xrealloc realloc
1636 /* Output the unit test for the per-character decimal digit value table. */
/* FILENAME is the file to create; VERSION is the Unicode version string
   copied into the banner comment.  The body of the file is a comma-separated
   list of "{ code point, value }" initializers, one per line.  Errors while
   opening or writing FILENAME are reported on stderr.  */
1638 output_decimal_digit_test (const char *filename, const char *version)
1644 stream = fopen (filename, "w");
1647 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1651 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1652 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1653 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1657 for (ch = 0; ch < 0x110000; ch++)
1659 int value = get_decdigit_value (ch);
/* Sanity check: a decimal digit value must lie in the range -1..9.  */
1661 if (!(value >= -1 && value < 10))
/* The separator is written before an entry rather than after it, so that
   the list does not end in a trailing comma.
   NOTE(review): presumably only code points that have a digit value are
   listed, and the separator is skipped for the first of them -- confirm.  */
1667 fprintf (stream, ",\n");
1668 fprintf (stream, " { 0x%04X, %d }", ch, value);
1673 fprintf (stream, "\n");
1675 if (ferror (stream) || fclose (stream))
1677 fprintf (stderr, "error writing to '%s'\n", filename);
1682 /* Output the per-character decimal digit value table. */
/* FILENAME is the C source file to create; VERSION is the Unicode version
   string copied into the banner comment.  The generated file contains five
   "#define decdigit_header_N" lines followed by one static const struct,
   u_decdigit, holding the three levels of the lookup table.  Errors while
   opening or writing FILENAME are reported on stderr.  */
1684 output_decimal_digit (const char *filename, const char *version)
1688 struct decdigit_table t;
1689 unsigned int level1_offset, level2_offset, level3_offset;
1691 stream = fopen (filename, "w");
1694 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1698 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1699 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1700 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1705 decdigit_table_init (&t);
1707 for (ch = 0; ch < 0x110000; ch++)
/* The table stores the digit value plus one, which turns the accepted
   range -1..9 into 0..10, so that a stored value fits in 4 bits.  */
1709 int value = 1 + get_decdigit_value (ch);
1711 if (!(value >= 0 && value <= 10))
1714 decdigit_table_add (&t, ch, value);
1717 decdigit_table_finalize (&t);
1719 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 array, then the level2 array, then the level3 array; the three
   expressions below are the byte offsets of these arrays.  */
1721 5 * sizeof (uint32_t);
1723 5 * sizeof (uint32_t)
1724 + t.level1_size * sizeof (uint32_t);
1726 5 * sizeof (uint32_t)
1727 + t.level1_size * sizeof (uint32_t)
1728 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
1730 for (i = 0; i < 5; i++)
1731 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1732 ((uint32_t *) t.result)[i]);
1733 fprintf (stream, "static const\n");
1734 fprintf (stream, "struct\n");
1735 fprintf (stream, " {\n");
1736 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1737 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the shift count passed for %d is presumably t.p - 1, to
   match the number of bytes emitted for level3 below -- confirm.  */
1738 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1740 fprintf (stream, " }\n");
1741 fprintf (stream, "u_decdigit =\n");
1742 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry of t.result is a byte
   offset; it is emitted as an index into level2 (in uint32_t units), or as
   -1.  NOTE(review): -1 presumably marks an entry without a level2 block;
   confirm the condition that selects it.  */
1743 fprintf (stream, " {");
1744 if (t.level1_size > 8)
1745 fprintf (stream, "\n ");
1746 for (i = 0; i < t.level1_size; i++)
1749 if (i > 0 && (i % 8) == 0)
1750 fprintf (stream, "\n ");
1751 offset = ((uint32_t *) (t.result + level1_offset))[i];
1753 fprintf (stream, " %5d", -1);
1755 fprintf (stream, " %5zd",
1756 (offset - level2_offset) / sizeof (uint32_t));
1757 if (i+1 < t.level1_size)
1758 fprintf (stream, ",");
1760 if (t.level1_size > 8)
1761 fprintf (stream, "\n ");
1762 fprintf (stream, " },\n");
/* Level 2: same layout.  The indices count unpacked level3 elements; the
   nibble packing applied below is not reflected in them.  */
1763 fprintf (stream, " {");
1764 if (t.level2_size << t.q > 8)
1765 fprintf (stream, "\n ");
1766 for (i = 0; i < t.level2_size << t.q; i++)
1769 if (i > 0 && (i % 8) == 0)
1770 fprintf (stream, "\n ");
1771 offset = ((uint32_t *) (t.result + level2_offset))[i];
1773 fprintf (stream, " %5d", -1);
1775 fprintf (stream, " %5zd",
1776 (offset - level3_offset) / sizeof (uint8_t));
1777 if (i+1 < t.level2_size << t.q)
1778 fprintf (stream, ",");
1780 if (t.level2_size << t.q > 8)
1781 fprintf (stream, "\n ");
1782 fprintf (stream, " },\n");
1783 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two consecutive entries share one output byte: entry 2*i goes into the
   low nibble and entry 2*i+1 into the high nibble.  The byte count is
   therefore half the entry count, level3_size << (p - 1).  */
1784 fprintf (stream, " {");
1785 if (t.level3_size << (t.p - 1) > 8)
1786 fprintf (stream, "\n ");
1787 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1789 if (i > 0 && (i % 8) == 0)
1790 fprintf (stream, "\n ");
1791 fprintf (stream, " 0x%02x",
1792 ((uint8_t *) (t.result + level3_offset))[2*i]
1793 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1794 if (i+1 < t.level3_size << (t.p - 1))
1795 fprintf (stream, ",");
1797 if (t.level3_size << (t.p - 1) > 8)
1798 fprintf (stream, "\n ");
1799 fprintf (stream, " }\n");
1800 fprintf (stream, "};\n");
/* A write error recorded on the stream or a failing fclose both count as
   failure to produce the file.  */
1802 if (ferror (stream) || fclose (stream))
1804 fprintf (stderr, "error writing to '%s'\n", filename);
1809 /* ========================================================================= */
1812 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH.
   When CH is assigned (its unicode_attributes[] entry has a name) and its
   digit field is a non-empty string, that string is converted with atoi.
   NOTE(review): the value returned otherwise is presumably -1, since the
   callers accept results in the range -1..9 -- confirm.  */
1815 get_digit_value (unsigned int ch)
1817 if (unicode_attributes[ch].name != NULL
1818 && unicode_attributes[ch].digit[0] != '\0')
1819 return atoi (unicode_attributes[ch].digit);
1823 /* Output the unit test for the per-character digit value table. */
/* FILENAME is the file to create; VERSION is the Unicode version string
   copied into the banner comment.  The body of the file is a comma-separated
   list of "{ code point, value }" initializers, one per line.  Errors while
   opening or writing FILENAME are reported on stderr.  */
1825 output_digit_test (const char *filename, const char *version)
1831 stream = fopen (filename, "w");
1834 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1838 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1839 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1840 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1844 for (ch = 0; ch < 0x110000; ch++)
1846 int value = get_digit_value (ch);
/* Sanity check: a digit value must lie in the range -1..9.  */
1848 if (!(value >= -1 && value < 10))
/* The separator is written before an entry rather than after it, so that
   the list does not end in a trailing comma.
   NOTE(review): presumably only code points that have a digit value are
   listed, and the separator is skipped for the first of them -- confirm.  */
1854 fprintf (stream, ",\n");
1855 fprintf (stream, " { 0x%04X, %d }", ch, value);
1860 fprintf (stream, "\n");
1862 if (ferror (stream) || fclose (stream))
1864 fprintf (stderr, "error writing to '%s'\n", filename);
1869 /* Output the per-character digit value table. */
/* FILENAME is the C source file to create; VERSION is the Unicode version
   string copied into the banner comment.  The generated file contains five
   "#define digit_header_N" lines followed by one static const struct,
   u_digit, holding the three levels of the lookup table.  The table type is
   struct decdigit_table, shared with output_decimal_digit.  Errors while
   opening or writing FILENAME are reported on stderr.  */
1871 output_digit (const char *filename, const char *version)
1875 struct decdigit_table t;
1876 unsigned int level1_offset, level2_offset, level3_offset;
1878 stream = fopen (filename, "w");
1881 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1885 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1886 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1887 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1892 decdigit_table_init (&t);
1894 for (ch = 0; ch < 0x110000; ch++)
/* The table stores the digit value plus one, which turns the accepted
   range -1..9 into 0..10, so that a stored value fits in 4 bits.  */
1896 int value = 1 + get_digit_value (ch);
1898 if (!(value >= 0 && value <= 10))
1901 decdigit_table_add (&t, ch, value);
1904 decdigit_table_finalize (&t);
1906 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 array, then the level2 array, then the level3 array; the three
   expressions below are the byte offsets of these arrays.  */
1908 5 * sizeof (uint32_t);
1910 5 * sizeof (uint32_t)
1911 + t.level1_size * sizeof (uint32_t);
1913 5 * sizeof (uint32_t)
1914 + t.level1_size * sizeof (uint32_t)
1915 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
1917 for (i = 0; i < 5; i++)
1918 fprintf (stream, "#define digit_header_%d %d\n", i,
1919 ((uint32_t *) t.result)[i]);
1920 fprintf (stream, "static const\n");
1921 fprintf (stream, "struct\n");
1922 fprintf (stream, " {\n");
1923 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1924 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* NOTE(review): the shift count passed for %d is presumably t.p - 1, to
   match the number of bytes emitted for level3 below -- confirm.  */
1925 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1927 fprintf (stream, " }\n");
1928 fprintf (stream, "u_digit =\n");
1929 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry of t.result is a byte
   offset; it is emitted as an index into level2 (in uint32_t units), or as
   -1.  NOTE(review): -1 presumably marks an entry without a level2 block;
   confirm the condition that selects it.  */
1930 fprintf (stream, " {");
1931 if (t.level1_size > 8)
1932 fprintf (stream, "\n ");
1933 for (i = 0; i < t.level1_size; i++)
1936 if (i > 0 && (i % 8) == 0)
1937 fprintf (stream, "\n ");
1938 offset = ((uint32_t *) (t.result + level1_offset))[i];
1940 fprintf (stream, " %5d", -1);
1942 fprintf (stream, " %5zd",
1943 (offset - level2_offset) / sizeof (uint32_t));
1944 if (i+1 < t.level1_size)
1945 fprintf (stream, ",");
1947 if (t.level1_size > 8)
1948 fprintf (stream, "\n ");
1949 fprintf (stream, " },\n");
/* Level 2: same layout.  The indices count unpacked level3 elements; the
   nibble packing applied below is not reflected in them.  */
1950 fprintf (stream, " {");
1951 if (t.level2_size << t.q > 8)
1952 fprintf (stream, "\n ");
1953 for (i = 0; i < t.level2_size << t.q; i++)
1956 if (i > 0 && (i % 8) == 0)
1957 fprintf (stream, "\n ");
1958 offset = ((uint32_t *) (t.result + level2_offset))[i];
1960 fprintf (stream, " %5d", -1);
1962 fprintf (stream, " %5zd",
1963 (offset - level3_offset) / sizeof (uint8_t));
1964 if (i+1 < t.level2_size << t.q)
1965 fprintf (stream, ",");
1967 if (t.level2_size << t.q > 8)
1968 fprintf (stream, "\n ");
1969 fprintf (stream, " },\n");
1970 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two consecutive entries share one output byte: entry 2*i goes into the
   low nibble and entry 2*i+1 into the high nibble.  The byte count is
   therefore half the entry count, level3_size << (p - 1).  */
1971 fprintf (stream, " {");
1972 if (t.level3_size << (t.p - 1) > 8)
1973 fprintf (stream, "\n ");
1974 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1976 if (i > 0 && (i % 8) == 0)
1977 fprintf (stream, "\n ");
1978 fprintf (stream, " 0x%02x",
1979 ((uint8_t *) (t.result + level3_offset))[2*i]
1980 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1981 if (i+1 < t.level3_size << (t.p - 1))
1982 fprintf (stream, ",");
1984 if (t.level3_size << (t.p - 1) > 8)
1985 fprintf (stream, "\n ");
1986 fprintf (stream, " }\n");
1987 fprintf (stream, "};\n");
/* A write error recorded on the stream or a failing fclose both count as
   failure to produce the file.  */
1989 if (ferror (stream) || fclose (stream))
1991 fprintf (stderr, "error writing to '%s'\n", filename);
1996 /* ========================================================================= */
1998 /* Numeric value. */
1999 /* See Unicode 3.0 book, section 4.6. */
/* A rational number numerator/denominator.  get_numeric_value uses the
   pair { 0, 0 } to express that a code point has no numeric value.  */
2001 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of the code point CH as a fraction.
   When CH is assigned (its unicode_attributes[] entry has a name) and its
   numeric field is non-empty, the field is parsed; otherwise the result is
   { 0, 0 }.  */
2003 static uc_fraction_t
2004 get_numeric_value (unsigned int ch)
2006 uc_fraction_t value;
2008 if (unicode_attributes[ch].name != NULL
2009 && unicode_attributes[ch].numeric[0] != '\0')
2011 const char *str = unicode_attributes[ch].numeric;
2012 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops converting at the '/', so it yields the numerator for both
   forms.  */
2013 value.numerator = atoi (str);
/* With a '/', the denominator is the number that follows it; a plain
   integer gets the denominator 1.  */
2014 if (strchr (str, '/') != NULL)
2015 value.denominator = atoi (strchr (str, '/') + 1);
2017 value.denominator = 1;
/* No numeric value: a denominator of 0 cannot come from a well-formed
   "integer/posinteger" field, so { 0, 0 } is unambiguous.  */
2021 value.numerator = 0;
2022 value.denominator = 0;
2027 /* Output the unit test for the per-character numeric value table. */
/* FILENAME is the file to create; VERSION is the Unicode version string
   copied into the banner comment.  The body of the file is a comma-separated
   list of "{ code point, numerator, denominator }" initializers, one per
   line.  Errors while opening or writing FILENAME are reported on stderr.  */
2029 output_numeric_test (const char *filename, const char *version)
2035 stream = fopen (filename, "w");
2038 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2042 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2043 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2044 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2048 for (ch = 0; ch < 0x110000; ch++)
2050 uc_fraction_t value = get_numeric_value (ch);
/* Only code points that have a numeric value, i.e. anything other than
   { 0, 0 }, are listed.  */
2052 if (value.numerator != 0 || value.denominator != 0)
/* The separator is written before an entry rather than after it.
   NOTE(review): presumably it is skipped for the first entry -- confirm.  */
2055 fprintf (stream, ",\n");
2056 fprintf (stream, " { 0x%04X, %d, %d }",
2057 ch, value.numerator, value.denominator);
2062 fprintf (stream, "\n");
2064 if (ferror (stream) || fclose (stream))
2066 fprintf (stderr, "error writing to '%s'\n", filename);
2071 /* Construction of sparse 3-level tables. */
/* Template parameters for struct numeric_table, the table type used by
   output_numeric.  Its uint8_t elements are indices into the list of
   distinct fractions built by that function.  The allocator hooks are
   mapped straight to malloc and realloc.
   NOTE(review): the DEFAULT element value is not visible here -- confirm.  */
2072 #define TABLE numeric_table
2073 #define ELEMENT uint8_t
2075 #define xmalloc malloc
2076 #define xrealloc realloc
2079 /* Output the per-character numeric value table. */
/* FILENAME is the C source file to create; VERSION is the Unicode version
   string copied into the banner comment.  The generated file contains
     - the array u_numeric_values of all distinct fractions that occur,
     - five "#define numeric_header_N" lines,
     - one static const struct, u_numeric, whose three-level table maps a
       code point to an index into u_numeric_values.
   Errors while opening or writing FILENAME are reported on stderr.  */
2081 output_numeric (const char *filename, const char *version)
/* At most 128 distinct fractions are supported, so that an index fits in
   the 7 bits reserved per level3 entry.  */
2084 uc_fraction_t fractions[128];
2085 unsigned int nfractions;
2086 unsigned int ch, i, j;
2087 struct numeric_table t;
2088 unsigned int level1_offset, level2_offset, level3_offset;
2089 uint16_t *level3_packed;
2091 stream = fopen (filename, "w");
2094 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2098 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2099 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2100 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2103 /* Create table of occurring fractions. */
2105 for (ch = 0; ch < 0x110000; ch++)
2107 uc_fraction_t value = get_numeric_value (ch);
/* Linear search: is this fraction already in the list?  */
2109 for (i = 0; i < nfractions; i++)
2110 if (value.numerator == fractions[i].numerator
2111 && value.denominator == fractions[i].denominator)
/* Not found: insert it, keeping the list sorted by denominator first and
   by numerator second.  With this order the "no value" fraction { 0, 0 }
   precedes every fraction that has a positive denominator.  */
2113 if (i == nfractions)
/* The capacity check comes before the insertion, so fractions[128] is
   never written.  NOTE(review): presumably the program is aborted when the
   list is full -- confirm.  */
2115 if (nfractions == 128)
/* Find the first element that sorts after the new fraction ...  */
2117 for (i = 0; i < nfractions; i++)
2118 if (value.denominator < fractions[i].denominator
2119 || (value.denominator == fractions[i].denominator
2120 && value.numerator < fractions[i].numerator))
/* ... shift it and everything behind it up by one slot, and store the
   new fraction in the gap.  */
2122 for (j = nfractions; j > i; j--)
2123 fractions[j] = fractions[j - 1];
2124 fractions[i] = value;
/* NOTE(review): nfractions is unsigned but printed with %d.  */
2129 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2131 fprintf (stream, "{\n");
2132 for (i = 0; i < nfractions; i++)
2134 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2135 fractions[i].denominator);
2136 if (i+1 < nfractions)
2137 fprintf (stream, ",");
2138 fprintf (stream, "\n");
2140 fprintf (stream, "};\n");
2144 numeric_table_init (&t);
/* Second pass over all code points: store, for each one, the index of its
   fraction in the sorted list.  */
2146 for (ch = 0; ch < 0x110000; ch++)
2148 uc_fraction_t value = get_numeric_value (ch);
2150 for (i = 0; i < nfractions; i++)
2151 if (value.numerator == fractions[i].numerator
2152 && value.denominator == fractions[i].denominator)
/* Every fraction was entered by the first pass, so the search is expected
   to succeed.  */
2154 if (i == nfractions)
2157 numeric_table_add (&t, ch, i);
2160 numeric_table_finalize (&t);
2162 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 array, then the level2 array, then the level3 array; the three
   expressions below are the byte offsets of these arrays.  */
2164 5 * sizeof (uint32_t);
2166 5 * sizeof (uint32_t)
2167 + t.level1_size * sizeof (uint32_t);
2169 5 * sizeof (uint32_t)
2170 + t.level1_size * sizeof (uint32_t)
2171 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
2173 for (i = 0; i < 5; i++)
2174 fprintf (stream, "#define numeric_header_%d %d\n", i,
2175 ((uint32_t *) t.result)[i]);
2176 fprintf (stream, "static const\n");
2177 fprintf (stream, "struct\n");
2178 fprintf (stream, " {\n");
2179 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2180 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* The declared level3 length, level3_size * ((1 << p) * 7 / 16) + 1, equals
   the number of units emitted further down, (level3_size << p) * 7 / 16 + 1,
   only when (1 << p) * 7 is a multiple of 16, i.e. when t.p >= 4.  */
2181 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2182 (1 << t.p) * 7 / 16);
2183 fprintf (stream, " }\n");
2184 fprintf (stream, "u_numeric =\n");
2185 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry of t.result is a byte
   offset; it is emitted as an index into level2 (in uint32_t units), or as
   -1.  NOTE(review): -1 presumably marks an entry without a level2 block;
   confirm the condition that selects it.  */
2186 fprintf (stream, " {");
2187 if (t.level1_size > 8)
2188 fprintf (stream, "\n ");
2189 for (i = 0; i < t.level1_size; i++)
2192 if (i > 0 && (i % 8) == 0)
2193 fprintf (stream, "\n ");
2194 offset = ((uint32_t *) (t.result + level1_offset))[i];
2196 fprintf (stream, " %5d", -1);
2198 fprintf (stream, " %5zd",
2199 (offset - level2_offset) / sizeof (uint32_t));
2200 if (i+1 < t.level1_size)
2201 fprintf (stream, ",");
2203 if (t.level1_size > 8)
2204 fprintf (stream, "\n ");
2205 fprintf (stream, " },\n");
/* Level 2: same layout; entries become indices into the unpacked level3
   array (in uint8_t units), or -1.  */
2206 fprintf (stream, " {");
2207 if (t.level2_size << t.q > 8)
2208 fprintf (stream, "\n ");
2209 for (i = 0; i < t.level2_size << t.q; i++)
2212 if (i > 0 && (i % 8) == 0)
2213 fprintf (stream, "\n ");
2214 offset = ((uint32_t *) (t.result + level2_offset))[i];
2216 fprintf (stream, " %5d", -1);
2218 fprintf (stream, " %5zd",
2219 (offset - level3_offset) / sizeof (uint8_t));
2220 if (i+1 < t.level2_size << t.q)
2221 fprintf (stream, ",");
2223 if (t.level2_size << t.q > 8)
2224 fprintf (stream, "\n ");
2225 fprintf (stream, " },\n");
2226 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2227 not 32-bit units, in order to make the lookup function easier. */
/* The buffer is zero-filled and one unit longer than the packed data, so
   that the access to unit j+1 below stays inside the allocation.
   NOTE(review): no NULL check of the calloc result is visible here.  */
2230 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
2231 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 7*i .. 7*i+6 of the bit stream: j is the 16-bit
   unit containing its first bit, k the position of that bit in the unit.
   Units j and j+1 are combined into one 32-bit window, the entry is OR-ed
   in at bit k, and the window is split back into the two units; an entry
   that straddles a unit boundary thereby spills into unit j+1.  */
2233 unsigned int j = (i * 7) / 16;
2234 unsigned int k = (i * 7) % 16;
2235 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2236 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2237 level3_packed[j] = value & 0xffff;
2238 level3_packed[j+1] = value >> 16;
/* Level 3: the packed units, eight per output line, in hexadecimal.  */
2240 fprintf (stream, " {");
2241 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2242 fprintf (stream, "\n ");
2243 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2245 if (i > 0 && (i % 8) == 0)
2246 fprintf (stream, "\n ");
2247 fprintf (stream, " 0x%04x", level3_packed[i]);
2248 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2249 fprintf (stream, ",");
2251 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2252 fprintf (stream, "\n ");
2253 fprintf (stream, " }\n");
2254 free (level3_packed);
2255 fprintf (stream, "};\n");
/* A write error recorded on the stream or a failing fclose both count as
   failure to produce the file.  */
2257 if (ferror (stream) || fclose (stream))
2259 fprintf (stderr, "error writing to '%s'\n", filename);
2264 /* ========================================================================= */
2267 /* See Unicode 3.0 book, section 4.7,
2270 /* List of mirrored character pairs. This is a subset of the characters
2271 having the BidiMirrored property. */
/* Each row holds the two code points of one pair.  get_mirror_value
   compares a character against both columns, so a pair needs to be listed
   in one direction only.  */
2272 static unsigned int mirror_pairs[][2] =
/* Returns the mirror information of the code point CH, expressed as the
   signed distance from CH to its mirror image (mirror image minus CH).
   NOTE(review): the branches that choose between this distance and other
   results are only partly visible; presumably a character without the
   mirrored attribute yields 0 -- confirm.  */
2329 get_mirror_value (unsigned int ch)
2332 unsigned int mirror_char;
/* The mirrored attribute only counts for assigned characters.  */
2335 mirrored = (unicode_attributes[ch].name != NULL
2336 && unicode_attributes[ch].mirrored);
/* 0xfffd serves as the "no partner found" marker for the search below.  */
2337 mirror_char = 0xfffd;
/* Look CH up in either column of mirror_pairs; its partner is the entry
   in the other column of the same row.  */
2338 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2339 if (ch == mirror_pairs[i][0])
2341 mirror_char = mirror_pairs[i][1];
2344 else if (ch == mirror_pairs[i][1])
2346 mirror_char = mirror_pairs[i][0];
/* The casts make the subtraction signed, so that a partner with a smaller
   code point gives a negative distance.  */
2350 return (int) mirror_char - (int) ch;
/* NOTE(review): presumably a consistency check -- a partner was found in
   mirror_pairs although the character is not marked as mirrored.  */
2353 if (mirror_char != 0xfffd)
2359 /* Construction of sparse 3-level tables. */
/* Template parameters for struct mirror_table, the table type used by
   output_mirror.  The elements are int32_t because they hold signed
   distances between code points.  The allocator hooks are mapped straight
   to malloc and realloc.
   NOTE(review): the DEFAULT element value is not visible here -- confirm.  */
2360 #define TABLE mirror_table
2361 #define ELEMENT int32_t
2363 #define xmalloc malloc
2364 #define xrealloc realloc
2367 /* Output the per-character mirror table. */
/* FILENAME is the C source file to create; VERSION is the Unicode version
   string copied into the banner comment.  The generated file contains five
   "#define mirror_header_N" lines followed by one static const struct,
   u_mirror, holding the three levels of the lookup table.  Unlike the other
   tables, level3 is emitted unpacked, as an array of int.  Errors while
   opening or writing FILENAME are reported on stderr.  */
2369 output_mirror (const char *filename, const char *version)
2373 struct mirror_table t;
2374 unsigned int level1_offset, level2_offset, level3_offset;
2376 stream = fopen (filename, "w");
2379 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2383 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2384 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2385 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2390 mirror_table_init (&t);
/* Enter the mirror value of every code point from 0 up to 0x10FFFF.  */
2392 for (ch = 0; ch < 0x110000; ch++)
2394 int value = get_mirror_value (ch);
2396 mirror_table_add (&t, ch, value);
2399 mirror_table_finalize (&t);
2401 /* Offsets in t.result, in memory of this process. */
/* t.result starts with a header of five uint32_t words, followed by the
   level1 array, then the level2 array, then the level3 array; the three
   expressions below are the byte offsets of these arrays.  */
2403 5 * sizeof (uint32_t);
2405 5 * sizeof (uint32_t)
2406 + t.level1_size * sizeof (uint32_t);
2408 5 * sizeof (uint32_t)
2409 + t.level1_size * sizeof (uint32_t)
2410 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as preprocessor constants.  */
2412 for (i = 0; i < 5; i++)
2413 fprintf (stream, "#define mirror_header_%d %d\n", i,
2414 ((uint32_t *) t.result)[i]);
2415 fprintf (stream, "static const\n");
2416 fprintf (stream, "struct\n");
2417 fprintf (stream, " {\n");
2418 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2419 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2420 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2421 fprintf (stream, " }\n");
2422 fprintf (stream, "u_mirror =\n");
2423 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  An entry of t.result is a byte
   offset; it is emitted as an index into level2 (in uint32_t units), or as
   -1.  NOTE(review): -1 presumably marks an entry without a level2 block;
   confirm the condition that selects it.  */
2424 fprintf (stream, " {");
2425 if (t.level1_size > 8)
2426 fprintf (stream, "\n ");
2427 for (i = 0; i < t.level1_size; i++)
2430 if (i > 0 && (i % 8) == 0)
2431 fprintf (stream, "\n ");
2432 offset = ((uint32_t *) (t.result + level1_offset))[i];
2434 fprintf (stream, " %5d", -1);
2436 fprintf (stream, " %5zd",
2437 (offset - level2_offset) / sizeof (uint32_t));
2438 if (i+1 < t.level1_size)
2439 fprintf (stream, ",");
2441 if (t.level1_size > 8)
2442 fprintf (stream, "\n ");
2443 fprintf (stream, " },\n");
/* Level 2: same layout; entries become indices into level3, counted in
   int32_t elements, or -1.  */
2444 fprintf (stream, " {");
2445 if (t.level2_size << t.q > 8)
2446 fprintf (stream, "\n ");
2447 for (i = 0; i < t.level2_size << t.q; i++)
2450 if (i > 0 && (i % 8) == 0)
2451 fprintf (stream, "\n ");
2452 offset = ((uint32_t *) (t.result + level2_offset))[i];
2454 fprintf (stream, " %5d", -1);
2456 fprintf (stream, " %5zd",
2457 (offset - level3_offset) / sizeof (int32_t));
2458 if (i+1 < t.level2_size << t.q)
2459 fprintf (stream, ",");
2461 if (t.level2_size << t.q > 8)
2462 fprintf (stream, "\n ");
2463 fprintf (stream, " },\n");
/* Level 3: the signed distances themselves, eight per output line.  */
2464 fprintf (stream, " {");
2465 if (t.level3_size << t.p > 8)
2466 fprintf (stream, "\n ");
2467 for (i = 0; i < t.level3_size << t.p; i++)
2469 if (i > 0 && (i % 8) == 0)
2470 fprintf (stream, "\n ");
2471 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2472 if (i+1 < t.level3_size << t.p)
2473 fprintf (stream, ",");
2475 if (t.level3_size << t.p > 8)
2476 fprintf (stream, "\n ");
2477 fprintf (stream, " }\n");
2478 fprintf (stream, "};\n");
/* A write error recorded on the stream or a failing fclose both count as
   failure to produce the file.  */
2480 if (ferror (stream) || fclose (stream))
2482 fprintf (stderr, "error writing to '%s'\n", filename);
2487 /* ========================================================================= */
2491 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  Each enumerator is used as a bit position in the
   unsigned long long elements of unicode_properties[], in expressions of
   the form 1ULL << PROP_xxx.  For that shift to be well defined in a 64-bit
   unsigned long long, every enumerator value must stay below 64.  */
2500 PROP_QUOTATION_MARK,
2501 PROP_TERMINAL_PUNCTUATION,
2504 PROP_ASCII_HEX_DIGIT,
2505 PROP_OTHER_ALPHABETIC,
2509 PROP_OTHER_LOWERCASE,
2510 PROP_OTHER_UPPERCASE,
2511 PROP_NONCHARACTER_CODE_POINT,
2512 PROP_OTHER_GRAPHEME_EXTEND,
2513 PROP_IDS_BINARY_OPERATOR,
2514 PROP_IDS_TRINARY_OPERATOR,
2516 PROP_UNIFIED_IDEOGRAPH,
2517 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2520 PROP_LOGICAL_ORDER_EXCEPTION,
2521 PROP_OTHER_ID_START,
2522 PROP_OTHER_ID_CONTINUE,
2524 PROP_VARIATION_SELECTOR,
2525 PROP_PATTERN_WHITE_SPACE,
2526 PROP_PATTERN_SYNTAX,
2527 /* DerivedCoreProperties.txt */
2536 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2537 PROP_GRAPHEME_EXTEND,
/* One set of property bits per code point: bit PROP_xxx of
   unicode_properties[ch] is set when ch has that property.  Reset by
   clear_properties, filled by fill_properties.  */
2541 unsigned long long unicode_properties[0x110000];
/* Resets the property bits of all 0x110000 code points in
   unicode_properties[] to zero.  */
2544 clear_properties (void)
2548 for (i = 0; i < 0x110000; i++)
2549 unicode_properties[i] = 0;
2552 /* Stores in unicode_properties[] the properties from the
2553 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME names the file to read.  Bits are only ever added to
   unicode_properties[], so the function can be called once per input file.
   Failure to open the file, an unparsable line, an unknown property name
   and a read error are each reported on stderr.  */
2555 fill_properties (const char *proplist_filename)
2560 stream = fopen (proplist_filename, "r");
2563 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2570 unsigned int i1, i2;
2571 char padding[200+1];
2572 char propname[200+1];
2573 unsigned int propvalue;
/* Read one line of at most 200 characters, without its newline.  The "\n"
   at the end of the format is a whitespace directive: it consumes the
   newline and any further whitespace, which includes following blank lines
   and the leading blanks of the next line.  A return value below 1 means
   that nothing could be stored in buf.  */
2575 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Skip empty lines and comment lines.  */
2578 if (buf[0] == '\0' || buf[0] == '#')
/* A data line is either "XXXX..YYYY ; Name" (a range of code points) or
   "XXXX ; Name" (a single code point).  The run of blanks and semicolons is
   scanned into padding; the property name ends at the next blank.
   NOTE(review): in the single form i2 is presumably set to i1 -- confirm.  */
2581 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2583 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2585 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP line below expands to "if (name matches) propvalue = value;
   else", so the whole list forms a single if / else-if chain; its final
   else branch is the "unknown property" report further down.  */
2590 #define PROP(name,value) \
2591 if (strcmp (propname, name) == 0) propvalue = value; else
2593 PROP ("White_Space", PROP_WHITE_SPACE)
2594 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2595 PROP ("Join_Control", PROP_JOIN_CONTROL)
2596 PROP ("Dash", PROP_DASH)
2597 PROP ("Hyphen", PROP_HYPHEN)
2598 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2599 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2600 PROP ("Other_Math", PROP_OTHER_MATH)
2601 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2602 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2603 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2604 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2605 PROP ("Diacritic", PROP_DIACRITIC)
2606 PROP ("Extender", PROP_EXTENDER)
2607 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2608 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2609 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2610 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2611 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2612 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2613 PROP ("Radical", PROP_RADICAL)
2614 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2615 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2616 PROP ("Deprecated", PROP_DEPRECATED)
2617 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2618 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2619 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2620 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2621 PROP ("STerm", PROP_STERM)
2622 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2623 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2624 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2625 /* DerivedCoreProperties.txt */
2626 PROP ("Math", PROP_MATH)
2627 PROP ("Alphabetic", PROP_ALPHABETIC)
2628 PROP ("Lowercase", PROP_LOWERCASE)
2629 PROP ("Uppercase", PROP_UPPERCASE)
2630 PROP ("ID_Start", PROP_ID_START)
2631 PROP ("ID_Continue", PROP_ID_CONTINUE)
2632 PROP ("XID_Start", PROP_XID_START)
2633 PROP ("XID_Continue", PROP_XID_CONTINUE)
2634 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2635 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2636 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2637 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2640 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must not exceed the last code point,
   0x10FFFF; this also keeps the array accesses below in bounds.  */
2644 if (!(i1 <= i2 && i2 < 0x110000))
/* Set the property's bit for every code point of the range.  */
2647 for (i = i1; i <= i2; i++)
2648 unicode_properties[i] |= 1ULL << propvalue;
2651 if (ferror (stream) || fclose (stream))
2653 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2658 /* Stores in array the given property from the Unicode 3.0 PropList.txt
/* ARRAY receives one entry per code point.  PROPLIST_FILENAME names the
   file to read.  PROPERTY_NAME is a substring that identifies the header
   line of the wanted property dump inside that file.
   NOTE(review): presumably the array is cleared first and the entries of
   the listed code points are then set to a nonzero value -- the assignments
   should be confirmed.  */
2661 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2667 for (i = 0; i < 0x110000; i++)
2670 stream = fopen (proplist_filename, "r");
2673 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2677 /* Search for the "Property dump for: ..." line. */
/* Lines of at most 100 characters are read until one contains
   PROPERTY_NAME; reaching the end of the input first is an error.  */
2680 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2682 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2686 while (strstr (buf, property_name) == NULL);
2690 unsigned int i1, i2;
/* Read the lines that follow the header, one code point or range each.  */
2692 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A range line has the form "XXXX..YYYY": two dots directly after the
   first four characters.  Only four hexadecimal digits are read per code
   point.  */
2696 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2698 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2700 fprintf (stderr, "parse error in property in '%s'\n",
/* Any other line of at least four characters is read as a single code
   point.  NOTE(review): i2 is presumably set to i1 here -- confirm.  */
2705 else if (strlen (buf) >= 4)
2707 if (sscanf (buf, "%4X", &i1) < 1)
2709 fprintf (stderr, "parse error in property in '%s'\n",
2717 fprintf (stderr, "parse error in property in '%s'\n",
/* The range must be ordered and must not exceed the last code point,
   0x10FFFF.  */
2721 if (!(i1 <= i2 && i2 < 0x110000))
2723 for (i = i1; i <= i2; i++)
2726 if (ferror (stream) || fclose (stream))
2728 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2733 /* Properties from Unicode 3.0 PropList.txt file. */
2735 /* The paired punctuation property from the PropList.txt file. */
/* One entry per code point; passed to fill_property30 together with the
   "(Paired Punctuation)" section name by fill_properties30.  */
2736 char unicode_pairedpunctuation[0x110000];
2738 /* The left of pair property from the PropList.txt file. */
/* One entry per code point; passed to fill_property30 together with the
   "(Left of Pair)" section name by fill_properties30.  */
2739 char unicode_leftofpair[0x110000];
/* Loads the two Unicode 3.0 properties kept by this file from
   PROPLIST30_FILENAME: one fill_property30 call per destination table.
   The file is therefore opened and scanned twice.  */
2742 fill_properties30 (const char *proplist30_filename)
2744 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2745 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2748 /* ------------------------------------------------------------------------- */
2750 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_WHITE_SPACE of unicode_properties[ch].  CH is used as an
   array index without a range check.  */
2752 is_property_white_space (unsigned int ch)
2754 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2757 /* See Unicode 3.0 book, section 4.10,
2758 PropList.txt, UCD.html,
2759 DerivedCoreProperties.txt, UCD.html. */
/* Computes the Alphabetic property in two ways and compares them:
   result1 is an OR chain (visible part: the PROP_OTHER_ALPHABETIC bit plus
   the hard-coded ranges below), result2 is the PROP_ALPHABETIC bit.
   NOTE(review): the first operands of the chain, the declarations of
   result1/result2, the action taken on a mismatch and the return statement
   are not visible in this listing.  */
2761 is_property_alphabetic (unsigned int ch)
2765 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2766 /* For some reason, the following are listed as having property
2767 Alphabetic but not as having property Other_Alphabetic. */
2768 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2769 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2770 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2771 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2772 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2773 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2774 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2775 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2776 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2777 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2778 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2779 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
/* Second opinion: the value recorded under PROP_ALPHABETIC.  */
2781 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* Consistency check between the two computations.  */
2783 if (result1 != result2)
2788 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_ALPHABETIC of unicode_properties[ch]; no range check
   on CH.  */
2790 is_property_other_alphabetic (unsigned int ch)
2792 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2795 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_NONCHARACTER_CODE_POINT of unicode_properties[ch]; no range
   check on CH.  */
2797 is_property_not_a_character (unsigned int ch)
2799 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2802 /* See PropList.txt, UCD.html,
2803 DerivedCoreProperties.txt, UCD.html. */
/* Computes Default_Ignorable_Code_Point in two ways and compares them.
   Visible part of the first computation: category Cf minus the listed
   exceptions, or the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit, or the
   PROP_VARIATION_SELECTOR bit.  The second is the
   PROP_DEFAULT_IGNORABLE_CODE_POINT bit.
   NOTE(review): the declarations, the mismatch action and the return
   statement are not visible in this listing.  */
2805 is_property_default_ignorable_code_point (unsigned int ch)
2808 (is_category_Cf (ch)
2809 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
/* Further Cf characters excluded by code point.  */
2810 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2811 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2812 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2814 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* Consistency check between the two computations.  */
2816 if (result1 != result2)
2821 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT of unicode_properties[ch];
   no range check on CH.  */
2823 is_property_other_default_ignorable_code_point (unsigned int ch)
2825 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2828 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_DEPRECATED of unicode_properties[ch]; no range check on CH.  */
2830 is_property_deprecated (unsigned int ch)
2832 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2835 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_LOGICAL_ORDER_EXCEPTION of unicode_properties[ch]; no range
   check on CH.  */
2837 is_property_logical_order_exception (unsigned int ch)
2839 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2842 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_VARIATION_SELECTOR of unicode_properties[ch]; no range check
   on CH.  */
2844 is_property_variation_selector (unsigned int ch)
2846 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2849 /* See PropList-3.0.1.txt. */
/* Return true if CH lies in one of the three private use ranges:
   U+E000..U+F8FF, U+F0000..U+FFFFD or U+100000..U+10FFFD.
   The result depends on CH only; no table is consulted.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  /* The last two code points of each plane are excluded from the ranges.  */
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
2859 /* See PropList-3.0.1.txt. */
/* True for code points of category Cn that are not flagged as
   noncharacters by is_property_not_a_character.  */
2861 is_property_unassigned_code_value (unsigned int ch)
2863 return (is_category_Cn (ch) && !is_property_not_a_character (ch));
2866 /* See PropList.txt, UCD.html,
2867 DerivedCoreProperties.txt, UCD.html. */
/* Computes Uppercase in two ways and compares them: result1 ends with the
   PROP_OTHER_UPPERCASE bit, result2 is the PROP_UPPERCASE bit.
   NOTE(review): the first operand of result1, the declarations, the mismatch
   action and the return statement are not visible in this listing.  */
2869 is_property_uppercase (unsigned int ch)
2873 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2875 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
/* Consistency check between the two computations.  */
2877 if (result1 != result2)
2882 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_UPPERCASE of unicode_properties[ch]; no range check on
   CH.  */
2884 is_property_other_uppercase (unsigned int ch)
2886 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2889 /* See PropList.txt, UCD.html,
2890 DerivedCoreProperties.txt, UCD.html. */
/* Computes Lowercase in two ways and compares them: result1 ends with the
   PROP_OTHER_LOWERCASE bit, result2 is the PROP_LOWERCASE bit.
   NOTE(review): the first operand of result1, the declarations, the mismatch
   action and the return statement are not visible in this listing.  */
2892 is_property_lowercase (unsigned int ch)
2896 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2898 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
/* Consistency check between the two computations.  */
2900 if (result1 != result2)
2905 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_LOWERCASE of unicode_properties[ch]; no range check on
   CH.  */
2907 is_property_other_lowercase (unsigned int ch)
2909 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2912 /* See PropList-3.0.1.txt. */
/* Titlecase is defined here as general category Lt: the call is forwarded
   to is_category_Lt unchanged.  */
2914 is_property_titlecase (unsigned int ch)
2916 return is_category_Lt (ch);
2919 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_SOFT_DOTTED of unicode_properties[ch]; no range check on CH.  */
2921 is_property_soft_dotted (unsigned int ch)
2923 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2926 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_ID_START of unicode_properties[ch]; no range check on CH.  */
2928 is_property_id_start (unsigned int ch)
2930 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2933 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_ID_START of unicode_properties[ch]; no range check on
   CH.  */
2935 is_property_other_id_start (unsigned int ch)
2937 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2940 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_ID_CONTINUE of unicode_properties[ch]; no range check on CH.  */
2942 is_property_id_continue (unsigned int ch)
2944 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2947 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_ID_CONTINUE of unicode_properties[ch]; no range check
   on CH.  */
2949 is_property_other_id_continue (unsigned int ch)
2951 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2954 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_XID_START of unicode_properties[ch]; no range check on CH.  */
2956 is_property_xid_start (unsigned int ch)
2958 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2961 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_XID_CONTINUE of unicode_properties[ch]; no range check on CH.  */
2963 is_property_xid_continue (unsigned int ch)
2965 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2968 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_PATTERN_WHITE_SPACE of unicode_properties[ch]; no range check
   on CH.  */
2970 is_property_pattern_white_space (unsigned int ch)
2972 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2975 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_PATTERN_SYNTAX of unicode_properties[ch]; no range check on
   CH.  */
2977 is_property_pattern_syntax (unsigned int ch)
2979 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2982 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_JOIN_CONTROL of unicode_properties[ch]; no range check on CH.  */
2984 is_property_join_control (unsigned int ch)
2986 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2989 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_GRAPHEME_BASE of unicode_properties[ch]; no range check on
   CH.  */
2991 is_property_grapheme_base (unsigned int ch)
2993 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
2996 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_GRAPHEME_EXTEND of unicode_properties[ch]; no range check on
   CH.  */
2998 is_property_grapheme_extend (unsigned int ch)
3000 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3003 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_GRAPHEME_EXTEND of unicode_properties[ch]; no range
   check on CH.  */
3005 is_property_other_grapheme_extend (unsigned int ch)
3007 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3010 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests bit PROP_GRAPHEME_LINK of unicode_properties[ch]; no range check on
   CH.  */
3012 is_property_grapheme_link (unsigned int ch)
3014 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3017 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_BIDI_CONTROL of unicode_properties[ch]; no range check on CH.  */
3019 is_property_bidi_control (unsigned int ch)
3021 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3024 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_L.  */
3026 is_property_bidi_left_to_right (unsigned int ch)
3028 return (get_bidi_category (ch) == UC_BIDI_L);
3031 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_R.  */
3033 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3035 return (get_bidi_category (ch) == UC_BIDI_R);
3038 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_AL.  */
3040 is_property_bidi_arabic_right_to_left (unsigned int ch)
3042 return (get_bidi_category (ch) == UC_BIDI_AL);
3045 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_EN.  */
3047 is_property_bidi_european_digit (unsigned int ch)
3049 return (get_bidi_category (ch) == UC_BIDI_EN);
3052 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_ES.  */
3054 is_property_bidi_eur_num_separator (unsigned int ch)
3056 return (get_bidi_category (ch) == UC_BIDI_ES);
3059 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_ET.  */
3061 is_property_bidi_eur_num_terminator (unsigned int ch)
3063 return (get_bidi_category (ch) == UC_BIDI_ET);
3066 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_AN.  */
3068 is_property_bidi_arabic_digit (unsigned int ch)
3070 return (get_bidi_category (ch) == UC_BIDI_AN);
3073 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_CS.  */
3075 is_property_bidi_common_separator (unsigned int ch)
3077 return (get_bidi_category (ch) == UC_BIDI_CS);
3080 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_B.  */
3082 is_property_bidi_block_separator (unsigned int ch)
3084 return (get_bidi_category (ch) == UC_BIDI_B);
3087 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_S.  */
3089 is_property_bidi_segment_separator (unsigned int ch)
3091 return (get_bidi_category (ch) == UC_BIDI_S);
3094 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_WS.  */
3096 is_property_bidi_whitespace (unsigned int ch)
3098 return (get_bidi_category (ch) == UC_BIDI_WS);
3101 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_NSM.  */
3103 is_property_bidi_non_spacing_mark (unsigned int ch)
3105 return (get_bidi_category (ch) == UC_BIDI_NSM);
3108 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_BN.  */
3110 is_property_bidi_boundary_neutral (unsigned int ch)
3112 return (get_bidi_category (ch) == UC_BIDI_BN);
3115 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_PDF.  */
3117 is_property_bidi_pdf (unsigned int ch)
3119 return (get_bidi_category (ch) == UC_BIDI_PDF);
3122 /* See PropList-3.0.1.txt. */
/* True iff the bidi category of CH is one of the four embedding/override
   values LRE, LRO, RLE, RLO.  The category is looked up once and cached in
   a local.  */
3124 is_property_bidi_embedding_or_override (unsigned int ch)
3126 int category = get_bidi_category (ch);
3127 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3128 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3131 /* See PropList-3.0.1.txt. */
/* True iff get_bidi_category (ch) yields UC_BIDI_ON.  */
3133 is_property_bidi_other_neutral (unsigned int ch)
3135 return (get_bidi_category (ch) == UC_BIDI_ON);
3138 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_HEX_DIGIT of unicode_properties[ch]; no range check on CH.  */
3140 is_property_hex_digit (unsigned int ch)
3142 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3145 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_ASCII_HEX_DIGIT of unicode_properties[ch]; no range check on
   CH.  */
3147 is_property_ascii_hex_digit (unsigned int ch)
3149 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3152 /* See Unicode 3.0 book, section 4.10,
3153 PropList.txt, UCD.html. */
/* Tests bit PROP_IDEOGRAPHIC of unicode_properties[ch]; no range check on CH.  */
3155 is_property_ideographic (unsigned int ch)
3157 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3160 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_UNIFIED_IDEOGRAPH of unicode_properties[ch]; no range check
   on CH.  */
3162 is_property_unified_ideograph (unsigned int ch)
3164 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3167 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_RADICAL of unicode_properties[ch]; no range check on CH.  */
3169 is_property_radical (unsigned int ch)
3171 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3174 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_IDS_BINARY_OPERATOR of unicode_properties[ch]; no range check
   on CH.  */
3176 is_property_ids_binary_operator (unsigned int ch)
3178 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3181 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_IDS_TRINARY_OPERATOR of unicode_properties[ch]; no range
   check on CH.  */
3183 is_property_ids_trinary_operator (unsigned int ch)
3185 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3188 /* See PropList-3.0.1.txt. */
/* True for every character of category Cf, and additionally for any named
   character whose name contains the substring "ZERO WIDTH".  The name is
   checked for NULL before strstr is applied.  */
3190 is_property_zero_width (unsigned int ch)
3192 return is_category_Cf (ch)
3193 || (unicode_attributes[ch].name != NULL
3194 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3197 /* See PropList-3.0.1.txt. */
/* Space is defined here as general category Zs: the call is forwarded to
   is_category_Zs unchanged.  */
3199 is_property_space (unsigned int ch)
3201 return is_category_Zs (ch);
3204 /* See PropList-3.0.1.txt. */
/* Return true if CH is one of the characters this file treats as
   non-breaking.  The set is a fixed list of code points; no table is
   consulted, so the result depends on CH only.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3228 /* See PropList-3.0.1.txt. */
/* Computes the ISO control property in two ways and compares them: by the
   character name being exactly "<control>" (NULL names are rejected first),
   and by is_category_Cc.
   NOTE(review): the declarations of result1/result2, the mismatch action
   and the return statement are not visible in this listing.  */
3230 is_property_iso_control (unsigned int ch)
3233 (unicode_attributes[ch].name != NULL
3234 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3236 is_category_Cc (ch);
/* Consistency check between the two computations.  */
3238 if (result1 != result2)
3243 /* See PropList-3.0.1.txt. */
/* Visible conditions: category Cf, bidi category UC_BIDI_BN, and not a join
   control.
   NOTE(review): the expression is not closed in this listing, so at least
   one further operand or the closing parenthesis is missing; confirm
   against the complete file.  */
3245 is_property_format_control (unsigned int ch)
3247 return (is_category_Cf (ch)
3248 && get_bidi_category (ch) == UC_BIDI_BN
3249 && !is_property_join_control (ch)
3253 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_DASH of unicode_properties[ch]; no range check on CH.  */
3255 is_property_dash (unsigned int ch)
3257 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3260 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_HYPHEN of unicode_properties[ch]; no range check on CH.  */
3262 is_property_hyphen (unsigned int ch)
3264 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3267 /* See PropList-3.0.1.txt. */
/* Punctuation is defined here through is_category_P; the call is forwarded
   unchanged.  */
3269 is_property_punctuation (unsigned int ch)
3271 return is_category_P (ch);
3274 /* See PropList-3.0.1.txt. */
/* Line separator is defined here as general category Zl; the call is
   forwarded to is_category_Zl unchanged.  */
3276 is_property_line_separator (unsigned int ch)
3278 return is_category_Zl (ch);
3281 /* See PropList-3.0.1.txt. */
/* Paragraph separator is defined here as general category Zp; the call is
   forwarded to is_category_Zp unchanged.  */
3283 is_property_paragraph_separator (unsigned int ch)
3285 return is_category_Zp (ch);
3288 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_QUOTATION_MARK of unicode_properties[ch]; no range check on
   CH.  */
3290 is_property_quotation_mark (unsigned int ch)
3292 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3295 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_STERM of unicode_properties[ch]; no range check on CH.
   Note that the bit name (STERM) differs from the predicate name.  */
3297 is_property_sentence_terminal (unsigned int ch)
3299 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3302 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_TERMINAL_PUNCTUATION of unicode_properties[ch]; no range
   check on CH.  */
3304 is_property_terminal_punctuation (unsigned int ch)
3306 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3309 /* See PropList-3.0.1.txt. */
/* Currency symbol is defined here as general category Sc; the call is
   forwarded to is_category_Sc unchanged.  */
3311 is_property_currency_symbol (unsigned int ch)
3313 return is_category_Sc (ch);
3316 /* See Unicode 3.0 book, section 4.9,
3317 PropList.txt, UCD.html,
3318 DerivedCoreProperties.txt, UCD.html. */
/* Computes Math in two ways and compares them: result1 ends with the
   PROP_OTHER_MATH bit, result2 is the PROP_MATH bit.
   NOTE(review): the first operand of result1, the declarations, the mismatch
   action and the return statement are not visible in this listing.  */
3320 is_property_math (unsigned int ch)
3324 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3326 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
/* Consistency check between the two computations.  */
3328 if (result1 != result2)
3333 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_OTHER_MATH of unicode_properties[ch]; no range check on CH.  */
3335 is_property_other_math (unsigned int ch)
3337 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3340 /* See PropList-3.0.1.txt. */
/* Returns the entry for CH from the unicode_pairedpunctuation table (a char
   array); no range check on CH.  */
3342 is_property_paired_punctuation (unsigned int ch)
3344 return unicode_pairedpunctuation[ch];
3347 /* See PropList-3.0.1.txt. */
/* Returns the entry for CH from the unicode_leftofpair table (a char array);
   no range check on CH.  */
3349 is_property_left_of_pair (unsigned int ch)
3351 return unicode_leftofpair[ch];
3354 /* See PropList-3.0.1.txt. */
/* True for a named character that either has a combining class string
   different from "0" or belongs to one of the mark categories Mc, Me, Mn.
   Characters without a name (name == NULL) are always rejected, which also
   keeps strcmp from being applied to their combining field.  */
3356 is_property_combining (unsigned int ch)
3358 return (unicode_attributes[ch].name != NULL
3359 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3360 || is_category_Mc (ch)
3361 || is_category_Me (ch)
3362 || is_category_Mn (ch)));
/* Disabled code: the predicate below is compiled out by "#if 0" because, per
   the original remark, it duplicates is_property_bidi_non_spacing_mark.
   NOTE(review): the matching #endif is not visible in this listing.  */
3365 #if 0 /* same as is_property_bidi_non_spacing_mark */
3366 /* See PropList-3.0.1.txt. */
3368 is_property_non_spacing (unsigned int ch)
/* Named character whose bidi category is UC_BIDI_NSM.  */
3370 return (unicode_attributes[ch].name != NULL
3371 && get_bidi_category (ch) == UC_BIDI_NSM);
3375 /* See PropList-3.0.1.txt. */
3377 is_property_composite (unsigned int ch)
3379 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3380 logical in some sense. */
3381 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3383 if (unicode_attributes[ch].name != NULL
3384 && unicode_attributes[ch].decomposition != NULL)
3386 /* Test whether the decomposition contains more than one character,
3387 and the first is not a space. */
3388 const char *decomp = unicode_attributes[ch].decomposition;
3389 if (decomp[0] == '<')
3391 decomp = strchr (decomp, '>') + 1;
3392 if (decomp[0] == ' ')
3395 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3400 /* See PropList-3.0.1.txt. */
/* Decimal digit is defined here as general category Nd; the call is
   forwarded to is_category_Nd unchanged.  */
3402 is_property_decimal_digit (unsigned int ch)
3404 return is_category_Nd (ch);
3407 /* See PropList-3.0.1.txt. */
/* True when get_numeric_value (ch) reports a positive denominator, plus two
   code points that are added explicitly (U+09F8 and U+2183).  */
3409 is_property_numeric (unsigned int ch)
3411 return ((get_numeric_value (ch)).denominator > 0)
3412 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3413 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3416 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_DIACRITIC of unicode_properties[ch]; no range check on CH.  */
3418 is_property_diacritic (unsigned int ch)
3420 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3423 /* See PropList.txt, UCD.html. */
/* Tests bit PROP_EXTENDER of unicode_properties[ch]; no range check on CH.  */
3425 is_property_extender (unsigned int ch)
3427 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3430 /* See PropList-3.0.1.txt. */
/* Visible part: a character of category Cc whose bidi category is
   UC_BIDI_BN, or any character of category Cf.
   NOTE(review): the return statement is not terminated in this listing, so
   a trailing operand may be missing; confirm against the complete file.  */
3432 is_property_ignorable_control (unsigned int ch)
3434 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3435 || is_category_Cf (ch))
3439 /* ------------------------------------------------------------------------- */
3441 /* Output all properties. */
/* Emits the output files for every property predicate.  VERSION is passed
   through to output_predicate.
   NOTE(review): some PROPERTY(...) invocations (for example those for
   uppercase, lowercase, id_start) and the end of the function are not
   visible in this listing.  */
3443 output_properties (const char *version)
/* PROPERTY(P) expands to three calls for predicate is_property_P:
   a debug dump to unictype/pr_P.txt, a test to
   ../tests/unictype/test-pr_P.c, and the table unictype/pr_P.h whose
   identifier is u_property_P.  */
3445 #define PROPERTY(P) \
3446 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3447 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3448 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
/* The macro body already ends in ';', so the invocations below carry none.  */
3449 PROPERTY(white_space)
3450 PROPERTY(alphabetic)
3451 PROPERTY(other_alphabetic)
3452 PROPERTY(not_a_character)
3453 PROPERTY(default_ignorable_code_point)
3454 PROPERTY(other_default_ignorable_code_point)
3455 PROPERTY(deprecated)
3456 PROPERTY(logical_order_exception)
3457 PROPERTY(variation_selector)
3458 PROPERTY(private_use)
3459 PROPERTY(unassigned_code_value)
3461 PROPERTY(other_uppercase)
3463 PROPERTY(other_lowercase)
3465 PROPERTY(soft_dotted)
3467 PROPERTY(other_id_start)
3468 PROPERTY(id_continue)
3469 PROPERTY(other_id_continue)
3471 PROPERTY(xid_continue)
3472 PROPERTY(pattern_white_space)
3473 PROPERTY(pattern_syntax)
3474 PROPERTY(join_control)
3475 PROPERTY(grapheme_base)
3476 PROPERTY(grapheme_extend)
3477 PROPERTY(other_grapheme_extend)
3478 PROPERTY(grapheme_link)
3479 PROPERTY(bidi_control)
3480 PROPERTY(bidi_left_to_right)
3481 PROPERTY(bidi_hebrew_right_to_left)
3482 PROPERTY(bidi_arabic_right_to_left)
3483 PROPERTY(bidi_european_digit)
3484 PROPERTY(bidi_eur_num_separator)
3485 PROPERTY(bidi_eur_num_terminator)
3486 PROPERTY(bidi_arabic_digit)
3487 PROPERTY(bidi_common_separator)
3488 PROPERTY(bidi_block_separator)
3489 PROPERTY(bidi_segment_separator)
3490 PROPERTY(bidi_whitespace)
3491 PROPERTY(bidi_non_spacing_mark)
3492 PROPERTY(bidi_boundary_neutral)
3494 PROPERTY(bidi_embedding_or_override)
3495 PROPERTY(bidi_other_neutral)
3497 PROPERTY(ascii_hex_digit)
3498 PROPERTY(ideographic)
3499 PROPERTY(unified_ideograph)
3501 PROPERTY(ids_binary_operator)
3502 PROPERTY(ids_trinary_operator)
3503 PROPERTY(zero_width)
3506 PROPERTY(iso_control)
3507 PROPERTY(format_control)
3510 PROPERTY(punctuation)
3511 PROPERTY(line_separator)
3512 PROPERTY(paragraph_separator)
3513 PROPERTY(quotation_mark)
3514 PROPERTY(sentence_terminal)
3515 PROPERTY(terminal_punctuation)
3516 PROPERTY(currency_symbol)
3518 PROPERTY(other_math)
3519 PROPERTY(paired_punctuation)
3520 PROPERTY(left_of_pair)
3523 PROPERTY(decimal_digit)
3527 PROPERTY(ignorable_control)
3531 /* ========================================================================= */
/* Script names in order of first appearance; entries are strdup'ed by
   fill_scripts.  Capacity is 256 names.  */
3535 static const char *scripts[256];
/* Number of entries of scripts[] in use.  */
3536 static unsigned int numscripts;
/* Script index per code point.  fill_scripts initialises every entry to
   (uint8_t)~(uint8_t)0, which serves as the "no script" marker.  */
3538 static uint8_t unicode_scripts[0x110000];
/* Reads SCRIPTS_FILENAME and records, for each listed code point or range,
   the index of its script name in unicode_scripts[]; new names are appended
   to scripts[].  Errors are reported on stderr.
   NOTE(review): braces, several declarations and the statements following
   the visible conditions (exit, abort, break, increments) are missing from
   this listing; confirm the control flow against the complete file.  */
3541 fill_scripts (const char *scripts_filename)
3546 stream = fopen (scripts_filename, "r");
3549 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Mark every code point as belonging to no script.  */
3555 for (i = 0; i < 0x110000; i++)
3556 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3561 unsigned int i1, i2;
3562 char padding[200+1];
3563 char scriptname[200+1];
/* Read at most 200 characters of the next line.  */
3566 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
3569 if (buf[0] == '\0' || buf[0] == '#')
/* Try "XXXX..YYYY ; Name" first, then the single code point form.  */
3572 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3574 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3576 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the known scripts, newest first.  */
3586 for (script = numscripts - 1; script >= 0; script--)
3587 if (strcmp (scripts[script], scriptname) == 0)
/* Unknown name: append a private copy.  The strdup result is not checked.  */
3591 scripts[numscripts] = strdup (scriptname);
3592 script = numscripts;
/* Capacity check against the 256-entry scripts[] table.  */
3594 if (numscripts == 256)
3598 for (i = i1; i <= i2; i++)
/* A code point that already has a script is reported, then overwritten.  */
3600 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3601 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3602 unicode_scripts[i] = script;
3606 if (ferror (stream) || fclose (stream))
3608 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3613 /* Construction of sparse 3-level tables. */
/* Parameters for the 3-level table code: table type name, element type and
   the default element (all bits set, the same value fill_scripts uses for
   "no script").  NOTE(review): presumably consumed by an #include that is
   not visible in this listing.  */
3614 #define TABLE script_table
3615 #define ELEMENT uint8_t
3616 #define DEFAULT (uint8_t)~(uint8_t)0
/* xmalloc/xrealloc are mapped to plain malloc/realloc, so code using these
   names gets no out-of-memory check from them.  */
3617 #define xmalloc malloc
3618 #define xrealloc realloc
/* Writes unictype/scripts.h: for each script an interval array, then the
   scripts[] descriptor array, then a 3-level lookup table u_script built
   from unicode_scripts[].  VERSION is printed in the file header.
   NOTE(review): braces, some declarations and several short statements are
   missing from this listing; the comments below describe only what the
   visible lines show.  */
3622 output_scripts (const char *version)
3624 const char *filename = "unictype/scripts.h";
3626 unsigned int ch, s, i;
3627 struct script_table t;
3628 unsigned int level1_offset, level2_offset, level3_offset;
3632 const char *lowercase_name;
3635 scriptinfo_t scriptinfo[256];
3637 stream = fopen (filename, "w");
3640 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3644 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3645 fprintf (stream, "/* Unicode scripts. */\n");
3646 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Pass 1: derive a per-script name from a private copy of the script name;
   the loop visits its ASCII upper-case letters.  */
3649 for (s = 0; s < numscripts; s++)
3651 char *lcp = strdup (scripts[s]);
3654 for (cp = lcp; *cp != '\0'; cp++)
3655 if (*cp >= 'A' && *cp <= 'Z')
3658 scriptinfo[s].lowercase_name = lcp;
/* Pass 2: one interval array per script, found by scanning all code points
   for maximal runs that carry script index s.  */
3661 for (s = 0; s < numscripts; s++)
3663 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3664 scriptinfo[s].lowercase_name);
3665 fprintf (stream, "{\n");
3667 for (ch = 0; ch < 0x110000; ch++)
3668 if (unicode_scripts[ch] == s)
3674 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3679 fprintf (stream, ",\n");
/* Two output shapes: a single entry, or a start entry plus an end entry.  */
3681 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3683 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3687 fprintf (stream, "\n");
3688 fprintf (stream, "};\n");
/* The descriptor array: interval count, interval array, script name.  */
3691 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3692 fprintf (stream, "{\n");
3693 for (s = 0; s < numscripts; s++)
3695 fprintf (stream, " {\n");
3696 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3697 scriptinfo[s].lowercase_name);
3698 fprintf (stream, " script_%s_intervals,\n",
3699 scriptinfo[s].lowercase_name);
3700 fprintf (stream, " \"%s\"\n", scripts[s]);
3701 fprintf (stream, " }");
3702 if (s+1 < numscripts)
3703 fprintf (stream, ",");
3704 fprintf (stream, "\n");
3706 fprintf (stream, "};\n");
/* Build the 3-level table from every code point that has a script.  */
3710 script_table_init (&t);
3712 for (ch = 0; ch < 0x110000; ch++)
3714 unsigned int s = unicode_scripts[ch];
3715 if (s != (uint8_t)~(uint8_t)0)
3716 script_table_add (&t, ch, s);
3719 script_table_finalize (&t);
3721 /* Offsets in t.result, in memory of this process. */
/* Layout used below: a header of five uint32_t, then level 1, then level 2,
   then level 3.  */
3723 5 * sizeof (uint32_t);
3725 5 * sizeof (uint32_t)
3726 + t.level1_size * sizeof (uint32_t);
3728 5 * sizeof (uint32_t)
3729 + t.level1_size * sizeof (uint32_t)
3730 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words become preprocessor constants.  */
3732 for (i = 0; i < 5; i++)
3733 fprintf (stream, "#define script_header_%d %d\n", i,
3734 ((uint32_t *) t.result)[i]);
3735 fprintf (stream, "static const\n");
3736 fprintf (stream, "struct\n");
3737 fprintf (stream, " {\n");
3738 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3739 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3740 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3741 fprintf (stream, " }\n");
3742 fprintf (stream, "u_script =\n");
3743 fprintf (stream, "{\n");
/* Level 1: eight values per output line; either -1 or the entry's offset
   relative to the start of level 2, counted in uint32_t units.  */
3744 fprintf (stream, " {");
3745 if (t.level1_size > 8)
3746 fprintf (stream, "\n ");
3747 for (i = 0; i < t.level1_size; i++)
3750 if (i > 0 && (i % 8) == 0)
3751 fprintf (stream, "\n ");
3752 offset = ((uint32_t *) (t.result + level1_offset))[i];
3754 fprintf (stream, " %5d", -1);
3756 fprintf (stream, " %5zd",
3757 (offset - level2_offset) / sizeof (uint32_t));
3758 if (i+1 < t.level1_size)
3759 fprintf (stream, ",");
3761 if (t.level1_size > 8)
3762 fprintf (stream, "\n ");
3763 fprintf (stream, " },\n");
/* Level 2: same scheme, offsets relative to level 3 in uint8_t units.  */
3764 fprintf (stream, " {");
3765 if (t.level2_size << t.q > 8)
3766 fprintf (stream, "\n ");
3767 for (i = 0; i < t.level2_size << t.q; i++)
3770 if (i > 0 && (i % 8) == 0)
3771 fprintf (stream, "\n ");
3772 offset = ((uint32_t *) (t.result + level2_offset))[i];
3774 fprintf (stream, " %5d", -1);
3776 fprintf (stream, " %5zd",
3777 (offset - level3_offset) / sizeof (uint8_t));
3778 if (i+1 < t.level2_size << t.q)
3779 fprintf (stream, ",");
3781 if (t.level2_size << t.q > 8)
3782 fprintf (stream, "\n ");
3783 fprintf (stream, " },\n");
/* Level 3: the raw uint8_t script indices.  */
3784 fprintf (stream, " {");
3785 if (t.level3_size << t.p > 8)
3786 fprintf (stream, "\n ");
3787 for (i = 0; i < t.level3_size << t.p; i++)
3789 if (i > 0 && (i % 8) == 0)
3790 fprintf (stream, "\n ");
3791 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3792 if (i+1 < t.level3_size << t.p)
3793 fprintf (stream, ",");
3795 if (t.level3_size << t.p > 8)
3796 fprintf (stream, "\n ");
3797 fprintf (stream, " }\n");
3798 fprintf (stream, "};\n");
/* Both a write error and a failing fclose count as an error.  */
3800 if (ferror (stream) || fclose (stream))
3802 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes unictype/scripts_byname.gperf: a gperf input whose keyword section
   lists "name, index" for every entry of scripts[].  VERSION is printed in
   the file header.
   NOTE(review): braces, declarations and the statements following the
   error messages are not visible in this listing.  */
3808 output_scripts_byname (const char *version)
3810 const char *filename = "unictype/scripts_byname.gperf";
3814 stream = fopen (filename, "w");
3817 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3821 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3822 fprintf (stream, "/* Unicode scripts. */\n");
3823 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3825 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
/* gperf declarations; "%%" in the format string prints a single '%'.  */
3826 fprintf (stream, "%%struct-type\n");
3827 fprintf (stream, "%%language=ANSI-C\n");
3828 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3829 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3830 fprintf (stream, "%%readonly-tables\n");
3831 fprintf (stream, "%%global-table\n");
3832 fprintf (stream, "%%define word-array-name script_names\n");
/* "%%%%" prints the "%%" separator line that starts the keyword section.  */
3833 fprintf (stream, "%%%%\n");
3834 for (s = 0; s < numscripts; s++)
3835 fprintf (stream, "%s, %u\n", scripts[s], s);
3837 if (ferror (stream) || fclose (stream))
3839 fprintf (stderr, "error writing to '%s'\n", filename);
3844 /* ========================================================================= */
/* One block: first code point, last code point (inclusive, as used by the
   searches below) and name.  NOTE(review): the line carrying the typedef
   name is not visible in this listing.  */
3848 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* Blocks in file order; fill_blocks checks that they are sorted.  */
3850 static block_t blocks[256];
/* Number of entries of blocks[] in use.  */
3851 static unsigned int numblocks;
/* Reads BLOCKS_FILENAME and appends one blocks[] entry per
   "XXXX..YYYY; Name" line.  Errors are reported on stderr.
   NOTE(review): braces, the increment of numblocks and the statements
   following the visible conditions are missing from this listing.  */
3854 fill_blocks (const char *blocks_filename)
3858 stream = fopen (blocks_filename, "r");
3861 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3868 unsigned int i1, i2;
3869 char padding[200+1];
3870 char blockname[200+1];
/* Read at most 200 characters of the next line.  */
3872 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines carry no data.  */
3875 if (buf[0] == '\0' || buf[0] == '#')
/* The name runs to the end of the line, stopping before a carriage return.  */
3878 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3880 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3883 blocks[numblocks].start = i1;
3884 blocks[numblocks].end = i2;
/* Private copy of the name; the strdup result is not checked.  */
3885 blocks[numblocks].name = strdup (blockname);
3886 /* It must be sorted. */
/* Each block must start after the end of the previous one.  */
3887 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
/* Capacity check against the 256-entry blocks[] table.  */
3890 if (numblocks == 256)
3894 if (ferror (stream) || fclose (stream))
3896 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3901 /* Return the smallest block index among the blocks for characters >= ch. */
/* Binary search over blocks[] keyed on the .end field, starting with the
   half-open index range [0, numblocks).
   NOTE(review): the loop header, the updates of lo/hi and the return
   statement are not visible in this listing.  */
3903 block_first_index (unsigned int ch)
3905 /* Binary search. */
3906 unsigned int lo = 0;
3907 unsigned int hi = numblocks;
3909 All blocks[i], i < lo, have blocks[i].end < ch,
3910 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3913 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
/* Blocks ending before CH cannot be the answer.  */
3914 if (blocks[mid].end < ch)
3922 /* Return the largest block index among the blocks for characters <= ch,
/* Binary search over blocks[] keyed on the .start field, starting with the
   half-open index range [0, numblocks).
   NOTE(review): the loop header, the updates of lo/hi and the return
   statement are not visible in this listing.  */
3925 block_last_index (unsigned int ch)
3927 /* Binary search. */
3928 unsigned int lo = 0;
3929 unsigned int hi = numblocks;
3931 All blocks[i], i < lo, have blocks[i].start <= ch,
3932 all blocks[i], i >= hi, have blocks[i].start > ch. */
3935 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
/* Blocks starting at or before CH stay on the low side.  */
3936 if (blocks[mid].start <= ch)
/* Writes unictype/blocks.h: the blocks[] array, a first-level index with two
   entries (first and last block index) per group of 2^shift code points
   below the threshold, and two constants for the code points from the
   threshold upward.  VERSION is printed in the file header.
   NOTE(review): braces, declarations and the statements following the
   error messages are not visible in this listing.  */
3945 output_blocks (const char *version)
3947 const char *filename = "unictype/blocks.h";
3948 const unsigned int shift = 8; /* bits to shift away for array access */
3949 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3954 stream = fopen (filename, "w");
3957 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3961 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3962 fprintf (stream, "/* Unicode blocks. */\n");
3963 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3966 fprintf (stream, "static const uc_block_t blocks[] =\n");
3967 fprintf (stream, "{\n");
/* One initializer per block; no comma after the last one.  */
3968 for (i = 0; i < numblocks; i++)
3970 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3971 blocks[i].end, blocks[i].name);
3972 if (i+1 < numblocks)
3973 fprintf (stream, ",");
3974 fprintf (stream, "\n");
3976 fprintf (stream, "};\n");
3977 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3978 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
/* The index is declared as uint8_t, which fits because blocks[] holds at
   most 256 entries.  */
3979 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3980 threshold >> shift);
3981 fprintf (stream, "{\n");
3982 for (i1 = 0; i1 < (threshold >> shift); i1++)
/* Group i1 covers code points i1 << shift up to ((i1 + 1) << shift) - 1.  */
3984 unsigned int first_index = block_first_index (i1 << shift);
3985 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3986 fprintf (stream, " %3d, %3d", first_index, last_index);
3987 if (i1+1 < (threshold >> shift))
3988 fprintf (stream, ",");
3989 fprintf (stream, "\n");
3991 fprintf (stream, "};\n");
/* Index range for all code points from the threshold up to 0x10FFFF.  */
3992 fprintf (stream, "#define blocks_upper_first_index %d\n",
3993 block_first_index (threshold));
3994 fprintf (stream, "#define blocks_upper_last_index %d\n",
3995 block_last_index (0x10FFFF));
/* Both a write error and a failing fclose count as an error.  */
3997 if (ferror (stream) || fclose (stream))
3999 fprintf (stderr, "error writing to '%s'\n", filename);
4004 /* ========================================================================= */
4006 /* C and Java syntax. */
/* Classification of a character with respect to identifier syntax.
   c_ident_category returns UC_IDENTIFIER_VALID for ASCII digits and
   UC_IDENTIFIER_START for ASCII letters and '_'.
   NOTE(review): the enum header and closing lines are not visible in this
   listing.  */
4010 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4011 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4012 UC_IDENTIFIER_INVALID, /* not valid */
4013 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters recognized here:
   space, horizontal tab, new-line (LF or CR), vertical tab or form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
4027 /* ISO C 99 section 6.4.2.1 and appendix D. */
/* Classifies CH for use in a C identifier.  As far as the visible code
   shows: ASCII digits yield UC_IDENTIFIER_VALID; ASCII letters, '_' and
   every range of the long `||` chain yield UC_IDENTIFIER_START; anything
   else yields UC_IDENTIFIER_INVALID.
   NOTE(review): this listing appears to have lost lines -- no return type,
   no braces and no opening of the big condition are visible, the numbering
   at the start of the lines has gaps inside the chain, and each line begins
   with a stray number.  Confirm against a pristine copy before editing.  */
4029 c_ident_category (unsigned int ch)
4031 /* Section 6.4.2.1. */
4032 if (ch >= '0' && ch <= '9')
4033 return UC_IDENTIFIER_VALID;
4034 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4035 return UC_IDENTIFIER_START;
/* NOTE(review): the start of the condition that the following alternatives
   belong to is not visible here.  All of them lead to the
   UC_IDENTIFIER_START return near the end of the function.  */
4041 || (ch >= 0x00C0 && ch <= 0x00D6)
4042 || (ch >= 0x00D8 && ch <= 0x00F6)
4043 || (ch >= 0x00F8 && ch <= 0x01F5)
4044 || (ch >= 0x01FA && ch <= 0x0217)
4045 || (ch >= 0x0250 && ch <= 0x02A8)
4046 || (ch >= 0x1E00 && ch <= 0x1E9B)
4047 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4051 || (ch >= 0x0388 && ch <= 0x038A)
4053 || (ch >= 0x038E && ch <= 0x03A1)
4054 || (ch >= 0x03A3 && ch <= 0x03CE)
4055 || (ch >= 0x03D0 && ch <= 0x03D6)
4060 || (ch >= 0x03E2 && ch <= 0x03F3)
4061 || (ch >= 0x1F00 && ch <= 0x1F15)
4062 || (ch >= 0x1F18 && ch <= 0x1F1D)
4063 || (ch >= 0x1F20 && ch <= 0x1F45)
4064 || (ch >= 0x1F48 && ch <= 0x1F4D)
4065 || (ch >= 0x1F50 && ch <= 0x1F57)
4069 || (ch >= 0x1F5F && ch <= 0x1F7D)
4070 || (ch >= 0x1F80 && ch <= 0x1FB4)
4071 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4072 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4073 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4074 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4075 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4076 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4077 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4078 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4080 || (ch >= 0x0401 && ch <= 0x040C)
4081 || (ch >= 0x040E && ch <= 0x044F)
4082 || (ch >= 0x0451 && ch <= 0x045C)
4083 || (ch >= 0x045E && ch <= 0x0481)
4084 || (ch >= 0x0490 && ch <= 0x04C4)
4085 || (ch >= 0x04C7 && ch <= 0x04C8)
4086 || (ch >= 0x04CB && ch <= 0x04CC)
4087 || (ch >= 0x04D0 && ch <= 0x04EB)
4088 || (ch >= 0x04EE && ch <= 0x04F5)
4089 || (ch >= 0x04F8 && ch <= 0x04F9)
4091 || (ch >= 0x0531 && ch <= 0x0556)
4092 || (ch >= 0x0561 && ch <= 0x0587)
4094 || (ch >= 0x05B0 && ch <= 0x05B9)
4095 || (ch >= 0x05BB && ch <= 0x05BD)
4097 || (ch >= 0x05C1 && ch <= 0x05C2)
4098 || (ch >= 0x05D0 && ch <= 0x05EA)
4099 || (ch >= 0x05F0 && ch <= 0x05F2)
4101 || (ch >= 0x0621 && ch <= 0x063A)
4102 || (ch >= 0x0640 && ch <= 0x0652)
4103 || (ch >= 0x0670 && ch <= 0x06B7)
4104 || (ch >= 0x06BA && ch <= 0x06BE)
4105 || (ch >= 0x06C0 && ch <= 0x06CE)
4106 || (ch >= 0x06D0 && ch <= 0x06DC)
4107 || (ch >= 0x06E5 && ch <= 0x06E8)
4108 || (ch >= 0x06EA && ch <= 0x06ED)
4110 || (ch >= 0x0901 && ch <= 0x0903)
4111 || (ch >= 0x0905 && ch <= 0x0939)
4112 || (ch >= 0x093E && ch <= 0x094D)
4113 || (ch >= 0x0950 && ch <= 0x0952)
4114 || (ch >= 0x0958 && ch <= 0x0963)
4116 || (ch >= 0x0981 && ch <= 0x0983)
4117 || (ch >= 0x0985 && ch <= 0x098C)
4118 || (ch >= 0x098F && ch <= 0x0990)
4119 || (ch >= 0x0993 && ch <= 0x09A8)
4120 || (ch >= 0x09AA && ch <= 0x09B0)
4122 || (ch >= 0x09B6 && ch <= 0x09B9)
4123 || (ch >= 0x09BE && ch <= 0x09C4)
4124 || (ch >= 0x09C7 && ch <= 0x09C8)
4125 || (ch >= 0x09CB && ch <= 0x09CD)
4126 || (ch >= 0x09DC && ch <= 0x09DD)
4127 || (ch >= 0x09DF && ch <= 0x09E3)
4128 || (ch >= 0x09F0 && ch <= 0x09F1)
4131 || (ch >= 0x0A05 && ch <= 0x0A0A)
4132 || (ch >= 0x0A0F && ch <= 0x0A10)
4133 || (ch >= 0x0A13 && ch <= 0x0A28)
4134 || (ch >= 0x0A2A && ch <= 0x0A30)
4135 || (ch >= 0x0A32 && ch <= 0x0A33)
4136 || (ch >= 0x0A35 && ch <= 0x0A36)
4137 || (ch >= 0x0A38 && ch <= 0x0A39)
4138 || (ch >= 0x0A3E && ch <= 0x0A42)
4139 || (ch >= 0x0A47 && ch <= 0x0A48)
4140 || (ch >= 0x0A4B && ch <= 0x0A4D)
4141 || (ch >= 0x0A59 && ch <= 0x0A5C)
4145 || (ch >= 0x0A81 && ch <= 0x0A83)
4146 || (ch >= 0x0A85 && ch <= 0x0A8B)
4148 || (ch >= 0x0A8F && ch <= 0x0A91)
4149 || (ch >= 0x0A93 && ch <= 0x0AA8)
4150 || (ch >= 0x0AAA && ch <= 0x0AB0)
4151 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4152 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4153 || (ch >= 0x0ABD && ch <= 0x0AC5)
4154 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4155 || (ch >= 0x0ACB && ch <= 0x0ACD)
4159 || (ch >= 0x0B01 && ch <= 0x0B03)
4160 || (ch >= 0x0B05 && ch <= 0x0B0C)
4161 || (ch >= 0x0B0F && ch <= 0x0B10)
4162 || (ch >= 0x0B13 && ch <= 0x0B28)
4163 || (ch >= 0x0B2A && ch <= 0x0B30)
4164 || (ch >= 0x0B32 && ch <= 0x0B33)
4165 || (ch >= 0x0B36 && ch <= 0x0B39)
4166 || (ch >= 0x0B3E && ch <= 0x0B43)
4167 || (ch >= 0x0B47 && ch <= 0x0B48)
4168 || (ch >= 0x0B4B && ch <= 0x0B4D)
4169 || (ch >= 0x0B5C && ch <= 0x0B5D)
4170 || (ch >= 0x0B5F && ch <= 0x0B61)
4172 || (ch >= 0x0B82 && ch <= 0x0B83)
4173 || (ch >= 0x0B85 && ch <= 0x0B8A)
4174 || (ch >= 0x0B8E && ch <= 0x0B90)
4175 || (ch >= 0x0B92 && ch <= 0x0B95)
4176 || (ch >= 0x0B99 && ch <= 0x0B9A)
4178 || (ch >= 0x0B9E && ch <= 0x0B9F)
4179 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4180 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4181 || (ch >= 0x0BAE && ch <= 0x0BB5)
4182 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4183 || (ch >= 0x0BBE && ch <= 0x0BC2)
4184 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4185 || (ch >= 0x0BCA && ch <= 0x0BCD)
4187 || (ch >= 0x0C01 && ch <= 0x0C03)
4188 || (ch >= 0x0C05 && ch <= 0x0C0C)
4189 || (ch >= 0x0C0E && ch <= 0x0C10)
4190 || (ch >= 0x0C12 && ch <= 0x0C28)
4191 || (ch >= 0x0C2A && ch <= 0x0C33)
4192 || (ch >= 0x0C35 && ch <= 0x0C39)
4193 || (ch >= 0x0C3E && ch <= 0x0C44)
4194 || (ch >= 0x0C46 && ch <= 0x0C48)
4195 || (ch >= 0x0C4A && ch <= 0x0C4D)
4196 || (ch >= 0x0C60 && ch <= 0x0C61)
4198 || (ch >= 0x0C82 && ch <= 0x0C83)
4199 || (ch >= 0x0C85 && ch <= 0x0C8C)
4200 || (ch >= 0x0C8E && ch <= 0x0C90)
4201 || (ch >= 0x0C92 && ch <= 0x0CA8)
4202 || (ch >= 0x0CAA && ch <= 0x0CB3)
4203 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4204 || (ch >= 0x0CBE && ch <= 0x0CC4)
4205 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4206 || (ch >= 0x0CCA && ch <= 0x0CCD)
4208 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4210 || (ch >= 0x0D02 && ch <= 0x0D03)
4211 || (ch >= 0x0D05 && ch <= 0x0D0C)
4212 || (ch >= 0x0D0E && ch <= 0x0D10)
4213 || (ch >= 0x0D12 && ch <= 0x0D28)
4214 || (ch >= 0x0D2A && ch <= 0x0D39)
4215 || (ch >= 0x0D3E && ch <= 0x0D43)
4216 || (ch >= 0x0D46 && ch <= 0x0D48)
4217 || (ch >= 0x0D4A && ch <= 0x0D4D)
4218 || (ch >= 0x0D60 && ch <= 0x0D61)
4220 || (ch >= 0x0E01 && ch <= 0x0E3A)
4221 || (ch >= 0x0E40 && ch <= 0x0E5B)
4223 || (ch >= 0x0E81 && ch <= 0x0E82)
4225 || (ch >= 0x0E87 && ch <= 0x0E88)
4228 || (ch >= 0x0E94 && ch <= 0x0E97)
4229 || (ch >= 0x0E99 && ch <= 0x0E9F)
4230 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4233 || (ch >= 0x0EAA && ch <= 0x0EAB)
4234 || (ch >= 0x0EAD && ch <= 0x0EAE)
4235 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4236 || (ch >= 0x0EBB && ch <= 0x0EBD)
4237 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4239 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4240 || (ch >= 0x0EDC && ch <= 0x0EDD)
4243 || (ch >= 0x0F18 && ch <= 0x0F19)
4247 || (ch >= 0x0F3E && ch <= 0x0F47)
4248 || (ch >= 0x0F49 && ch <= 0x0F69)
4249 || (ch >= 0x0F71 && ch <= 0x0F84)
4250 || (ch >= 0x0F86 && ch <= 0x0F8B)
4251 || (ch >= 0x0F90 && ch <= 0x0F95)
4253 || (ch >= 0x0F99 && ch <= 0x0FAD)
4254 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4257 || (ch >= 0x10A0 && ch <= 0x10C5)
4258 || (ch >= 0x10D0 && ch <= 0x10F6)
4260 || (ch >= 0x3041 && ch <= 0x3093)
4261 || (ch >= 0x309B && ch <= 0x309C)
4263 || (ch >= 0x30A1 && ch <= 0x30F6)
4264 || (ch >= 0x30FB && ch <= 0x30FC)
4266 || (ch >= 0x3105 && ch <= 0x312C)
4267 /* CJK Unified Ideographs */
4268 || (ch >= 0x4E00 && ch <= 0x9FA5)
4270 || (ch >= 0xAC00 && ch <= 0xD7A3)
/* NOTE(review): the next group looks like the decimal-digit ranges of
   several scripts (verify against appendix D).  As written they are part
   of the same chain and therefore also yield UC_IDENTIFIER_START, not
   UC_IDENTIFIER_VALID -- confirm that this is intended.  */
4272 || (ch >= 0x0660 && ch <= 0x0669)
4273 || (ch >= 0x06F0 && ch <= 0x06F9)
4274 || (ch >= 0x0966 && ch <= 0x096F)
4275 || (ch >= 0x09E6 && ch <= 0x09EF)
4276 || (ch >= 0x0A66 && ch <= 0x0A6F)
4277 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4278 || (ch >= 0x0B66 && ch <= 0x0B6F)
4279 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4280 || (ch >= 0x0C66 && ch <= 0x0C6F)
4281 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4282 || (ch >= 0x0D66 && ch <= 0x0D6F)
4283 || (ch >= 0x0E50 && ch <= 0x0E59)
4284 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4285 || (ch >= 0x0F20 && ch <= 0x0F33)
4286 /* Special characters */
4289 || (ch >= 0x02B0 && ch <= 0x02B8)
4291 || (ch >= 0x02BD && ch <= 0x02C1)
4292 || (ch >= 0x02D0 && ch <= 0x02D1)
4293 || (ch >= 0x02E0 && ch <= 0x02E4)
4299 || (ch >= 0x203F && ch <= 0x2040)
4302 || (ch >= 0x210A && ch <= 0x2113)
4304 || (ch >= 0x2118 && ch <= 0x211D)
4308 || (ch >= 0x212A && ch <= 0x2131)
4309 || (ch >= 0x2133 && ch <= 0x2138)
4310 || (ch >= 0x2160 && ch <= 0x2182)
4311 || (ch >= 0x3005 && ch <= 0x3007)
4312 || (ch >= 0x3021 && ch <= 0x3029)
/* Any alternative above matched: CH may start an identifier.  */
4314 return UC_IDENTIFIER_START;
/* Not listed anywhere above.  */
4315 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Return true if CH is space, horizontal tab, form feed, LF or CR.
   Unlike is_c_whitespace, vertical tab is not accepted.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4327 /* The Java Language Specification, 3rd edition, §3.8.
4328 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4329 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
/* Classifies CH for use in a Java identifier.  The tests run in this
   order: is_category_L/Nl/Sc/Pc give UC_IDENTIFIER_START;
   is_category_Nd/Mc/Mn give UC_IDENTIFIER_VALID; the three code ranges
   below or is_category_Cf give UC_IDENTIFIER_IGNORABLE; everything else
   gives UC_IDENTIFIER_INVALID.
   NOTE(review): the return type, the braces and the closing parentheses
   of the three conditions are not visible in this listing.  */
4331 java_ident_category (unsigned int ch)
4333 /* FIXME: Check this against Sun's JDK implementation. */
4334 if (is_category_L (ch) /* = Character.isLetter(ch) */
4335 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4336 || is_category_Sc (ch) /* currency symbol */
4337 || is_category_Pc (ch) /* connector punctuation */
4339 return UC_IDENTIFIER_START;
4340 if (is_category_Nd (ch) /* digit */
4341 || is_category_Mc (ch) /* combining mark */
4342 || is_category_Mn (ch) /* non-spacing mark */
4344 return UC_IDENTIFIER_VALID;
/* ch is unsigned, so the `ch >= 0x0000` half of the first range is always
   true; only its upper bound has an effect.  */
4345 if ((ch >= 0x0000 && ch <= 0x0008)
4346 || (ch >= 0x000E && ch <= 0x001B)
4347 || (ch >= 0x007F && ch <= 0x009F)
4348 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4350 return UC_IDENTIFIER_IGNORABLE;
4351 return UC_IDENTIFIER_INVALID;
4354 /* Construction of sparse 3-level tables. */
/* NOTE(review): these macros look like the parameters of a generic table
   template that would be included right after them; no such #include is
   visible in this listing -- confirm.  output_ident_category relies on
   struct identsyntax_table and the identsyntax_table_init/_add/_finalize
   functions, which are presumably produced by that template.
   xmalloc and xrealloc are mapped to plain malloc and realloc here.  */
4355 #define TABLE identsyntax_table
4356 #define ELEMENT uint8_t
4357 #define DEFAULT UC_IDENTIFIER_INVALID
4358 #define xmalloc malloc
4359 #define xrealloc realloc
4362 /* Output an identifier syntax categorization in a three-level bitmap. */
/* FILENAME is opened for writing.  PREDICATE is called for every code
   point 0..0x10FFFF and its result is stored in the table unless it is
   UC_IDENTIFIER_INVALID (the table's default).  NAME becomes the name of
   the generated struct variable.  VERSION is presumably printed in the
   header comment (the argument line of that fprintf is not visible).
   NOTE(review): this listing has lost lines -- return type, braces, some
   declarations, the error exits and parts of several statements are not
   visible.  Confirm against a pristine copy before editing.  */
4364 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4368 struct identsyntax_table t;
4369 unsigned int level1_offset, level2_offset, level3_offset;
4371 stream = fopen (filename, "w");
4374 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4378 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4379 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4380 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4385 identsyntax_table_init (&t);
/* Record the category of every code point that is not INVALID.  */
4387 for (ch = 0; ch < 0x110000; ch++)
4389 int syntaxcode = predicate (ch);
4390 if (syntaxcode != UC_IDENTIFIER_INVALID)
4391 identsyntax_table_add (&t, ch, syntaxcode);
4394 identsyntax_table_finalize (&t);
4396 /* Offsets in t.result, in memory of this process. */
/* NOTE(review): the left-hand sides of the next three assignments are not
   visible; presumably level1_offset, level2_offset and level3_offset, in
   that order.  Each value is the byte size of everything that precedes
   the level: a header of 5 uint32_t, then level 1, then level 2.  */
4398 5 * sizeof (uint32_t);
4400 5 * sizeof (uint32_t)
4401 + t.level1_size * sizeof (uint32_t);
4403 5 * sizeof (uint32_t)
4404 + t.level1_size * sizeof (uint32_t)
4405 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five 32-bit words of t.result are emitted as header macros.  */
4407 for (i = 0; i < 5; i++)
4408 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4409 ((uint32_t *) t.result)[i]);
4410 fprintf (stream, "static const\n");
4411 fprintf (stream, "struct\n");
4412 fprintf (stream, " {\n");
4413 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4414 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4415 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4416 (1 << t.p) * 2 / 16);
4417 fprintf (stream, " }\n");
4418 fprintf (stream, "%s =\n", name);
4419 fprintf (stream, "{\n");
/* Level 1: eight entries per output line.  */
4420 fprintf (stream, " {");
4421 if (t.level1_size > 8)
4422 fprintf (stream, "\n ");
4423 for (i = 0; i < t.level1_size; i++)
4426 if (i > 0 && (i % 8) == 0)
4427 fprintf (stream, "\n ");
4428 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* NOTE(review): the condition that selects between printing -1 and
   printing the level-2 index is not visible here.  */
4430 fprintf (stream, " %5d", -1);
4432 fprintf (stream, " %5zd",
4433 (offset - level2_offset) / sizeof (uint32_t));
4434 if (i+1 < t.level1_size)
4435 fprintf (stream, ",");
4437 if (t.level1_size > 8)
4438 fprintf (stream, "\n ");
4439 fprintf (stream, " },\n");
/* Level 2: same layout; offsets are converted to level-3 byte indices.  */
4440 fprintf (stream, " {");
4441 if (t.level2_size << t.q > 8)
4442 fprintf (stream, "\n ");
4443 for (i = 0; i < t.level2_size << t.q; i++)
4446 if (i > 0 && (i % 8) == 0)
4447 fprintf (stream, "\n ");
4448 offset = ((uint32_t *) (t.result + level2_offset))[i];
4450 fprintf (stream, " %5d", -1);
4452 fprintf (stream, " %5zd",
4453 (offset - level3_offset) / sizeof (uint8_t));
4454 if (i+1 < t.level2_size << t.q)
4455 fprintf (stream, ",");
4457 if (t.level2_size << t.q > 8)
4458 fprintf (stream, "\n ");
4459 fprintf (stream, " },\n");
4460 /* Pack the level3 array. Each entry needs 2 bits only. */
/* Eight consecutive one-byte entries are combined into one 16-bit word,
   entry k occupying bits 2k and 2k+1.  */
4461 fprintf (stream, " {");
4462 if ((t.level3_size << t.p) * 2 / 16 > 8)
4463 fprintf (stream, "\n ");
4464 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4466 if (i > 0 && (i % 8) == 0)
4467 fprintf (stream, "\n ");
4468 fprintf (stream, " 0x%04x",
4469 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4470 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4471 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4477 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4478 fprintf (stream, ",");
4480 if ((t.level3_size << t.p) * 2 / 16 > 8)
4481 fprintf (stream, "\n ");
4482 fprintf (stream, " }\n");
4483 fprintf (stream, "};\n");
/* A write error noticed by ferror or by fclose is reported here.  */
4485 if (ferror (stream) || fclose (stream))
4487 fprintf (stderr, "error writing to '%s'\n", filename);
/* Emits the language-syntax property files for Unicode VERSION.  For
   c_whitespace and java_whitespace the PROPERTY macro calls
   debug_output_predicate, output_predicate_test and output_predicate with
   file names derived from the property name; afterwards
   output_ident_category writes the C and Java identifier tables.
   NOTE(review): return type and braces are not visible in this listing,
   and neither is an #undef of PROPERTY -- confirm, since
   output_old_ctype defines a macro of the same name.  */
4493 output_ident_properties (const char *version)
4495 #define PROPERTY(P) \
4496 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4497 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4498 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4499 PROPERTY(c_whitespace)
4500 PROPERTY(java_whitespace)
4503 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4504 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4507 /* ========================================================================= */
4509 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4510 glibc/localedata/locales/i18n file, generated by
4511 glibc/localedata/gen-unicode-ctype.c. */
4513 /* Character mappings. */
/* Uppercase mapping: returns the `upper` field of CH's unicode_attributes
   entry when the entry has a name and the field is not NONE.
   NOTE(review): return type, braces and the fallback for all other
   characters are not visible in this listing; callers compare the result
   with CH, so the fallback is presumably CH itself -- confirm.  */
4516 to_upper (unsigned int ch)
4518 if (unicode_attributes[ch].name != NULL
4519 && unicode_attributes[ch].upper != NONE)
4520 return unicode_attributes[ch].upper;
/* Lowercase mapping: returns the `lower` field of CH's unicode_attributes
   entry when the entry has a name and the field is not NONE.
   NOTE(review): return type, braces and the fallback for all other
   characters are not visible in this listing; callers compare the result
   with CH, so the fallback is presumably CH itself -- confirm.  */
4526 to_lower (unsigned int ch)
4528 if (unicode_attributes[ch].name != NULL
4529 && unicode_attributes[ch].lower != NONE)
4530 return unicode_attributes[ch].lower;
/* Titlecase mapping: returns the `title` field of CH's unicode_attributes
   entry when the entry has a name and the field is not NONE.
   NOTE(review): return type, braces and the fallback for all other
   characters are not visible in this listing; presumably CH itself is
   returned then -- confirm.  */
4536 to_title (unsigned int ch)
4538 if (unicode_attributes[ch].name != NULL
4539 && unicode_attributes[ch].title != NONE)
4540 return unicode_attributes[ch].title;
4545 /* Character class properties. */
/* CH counts as uppercase exactly when to_lower maps it to a different
   character.  NOTE(review): return type and braces are not visible.  */
4548 is_upper (unsigned int ch)
4550 return (to_lower (ch) != ch);
/* CH counts as lowercase when to_upper maps it to a different character.
   NOTE(review): the return expression has no terminating ';' in this
   listing; a further alternative after the comment below (presumably an
   explicit test for U+00DF) appears to be missing -- confirm.  */
4554 is_lower (unsigned int ch)
4556 return (to_upper (ch) != ch)
4557 /* <U00DF> is lowercase, but without simple to_upper mapping. */
/* Alphabetic test.  CH must have a name in unicode_attributes and satisfy
   one of the alternatives below: general category L* (minus two Thai
   exceptions), some explicitly listed Thai ranges, category Nl, category
   So with " LETTER " in the name, or category Nd outside U+0030..U+0039.
   NOTE(review): some alternatives are not visible in this listing (for
   instance the tests that the <U0E31> and <U0345> comments refer to, and
   the end of the "So" alternative); return type and braces are missing
   as well.  */
4562 is_alpha (unsigned int ch)
4564 return (unicode_attributes[ch].name != NULL
4565 && ((unicode_attributes[ch].category[0] == 'L'
4566 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4567 <U0E2F>, <U0E46> should belong to is_punct. */
4568 && (ch != 0x0E2F) && (ch != 0x0E46))
4569 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4570 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4572 || (ch >= 0x0E34 && ch <= 0x0E3A)
4573 || (ch >= 0x0E47 && ch <= 0x0E4E)
4574 /* Avoid warning for <U0345>. */
4576 /* Avoid warnings for <U2160>..<U217F>. */
4577 || (unicode_attributes[ch].category[0] == 'N'
4578 && unicode_attributes[ch].category[1] == 'l')
4579 /* Avoid warnings for <U24B6>..<U24E9>. */
4580 || (unicode_attributes[ch].category[0] == 'S'
4581 && unicode_attributes[ch].category[1] == 'o'
4582 && strstr (unicode_attributes[ch].name, " LETTER ")
4584 /* Consider all the non-ASCII digits as alphabetic.
4585 ISO C 99 forbids us to have them in category "digit",
4586 but we want iswalnum to return true on them. */
4587 || (unicode_attributes[ch].category[0] == 'N'
4588 && unicode_attributes[ch].category[1] == 'd'
4589 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is a digit in the ISO C sense, that is, one of the ten
   ASCII decimal digits U+0030..U+0039.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99 is
   stricter:
     The iswdigit function tests for any wide character that corresponds
     to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
     the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   A definition based on general category Nd is deliberately not used.
   Besides conflicting with the rule above, it would need manual care:
   U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without a zero, so
   a <0> would have to be added in front of them by hand.  Non-ASCII
   digits are classified as alphabetic by is_alpha instead.  */
static bool
is_digit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Return true if CH belongs to the "outdigit" class, which consists of
   exactly the ASCII digits U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Alphanumeric: CH satisfies is_alpha or is_digit.
   NOTE(review): return type and braces are not visible in this listing.  */
4621 is_alnum (unsigned int ch)
4623 return is_alpha (ch) || is_digit (ch);
/* Blank: the tab character, or a named character of category Zs whose
   decomposition field does not contain "<noBreak>".
   NOTE(review): return type and braces are not visible in this listing.  */
4627 is_blank (unsigned int ch)
4629 return (ch == 0x0009 /* '\t' */
4630 /* Category Zs without mention of "<noBreak>" */
4631 || (unicode_attributes[ch].name != NULL
4632 && unicode_attributes[ch].category[0] == 'Z'
4633 && unicode_attributes[ch].category[1] == 's'
4634 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
/* Space: one of the six ASCII white-space characters listed below, or a
   named character of category Zl, Zp, or Zs (for Zs a strstr test on the
   decomposition field is applied).
   NOTE(review): the end of the expression (the second strstr argument,
   presumably "<noBreak>" as in is_blank, and the closing parentheses) is
   not visible in this listing, nor are return type and braces.  */
4638 is_space (unsigned int ch)
4640 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4641 should treat it like a punctuation character, not like a space. */
4642 return (ch == 0x0020 /* ' ' */
4643 || ch == 0x000C /* '\f' */
4644 || ch == 0x000A /* '\n' */
4645 || ch == 0x000D /* '\r' */
4646 || ch == 0x0009 /* '\t' */
4647 || ch == 0x000B /* '\v' */
4648 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4649 || (unicode_attributes[ch].name != NULL
4650 && unicode_attributes[ch].category[0] == 'Z'
4651 && (unicode_attributes[ch].category[1] == 'l'
4652 || unicode_attributes[ch].category[1] == 'p'
4653 || (unicode_attributes[ch].category[1] == 's'
4654 && !strstr (unicode_attributes[ch].decomposition,
/* Control: a named character whose name is exactly "<control>", or whose
   general category is Zl or Zp.
   NOTE(review): return type and braces are not visible in this listing.  */
4659 is_cntrl (unsigned int ch)
4661 return (unicode_attributes[ch].name != NULL
4662 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4663 /* Categories Zl and Zp */
4664 || (unicode_attributes[ch].category[0] == 'Z'
4665 && (unicode_attributes[ch].category[1] == 'l'
4666 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is a hexadecimal digit in the ISO C sense: one of the
   ASCII characters 0-9, A-F or a-f.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 is
   stricter:
     The iswxdigit function tests for any wide character that corresponds
     to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
     hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   The ranges are spelled out here rather than delegated to is_digit, so
   that the result cannot widen if the digit class is ever redefined.  */
static bool
is_xdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
}
/* Graphic: CH has a name and that name is not "<control>" (strcmp returns
   non-zero).
   NOTE(review): the expression is not closed in this listing, so further
   conditions may follow; return type and braces are not visible either.  */
4692 is_graph (unsigned int ch)
4694 return (unicode_attributes[ch].name != NULL
4695 && strcmp (unicode_attributes[ch].name, "<control>")
/* Printable: CH has a name, the name is not "<control>", and the general
   category is neither Zl nor Zp.  The name != NULL test inside the
   negated group repeats the one at the top of the expression.
   NOTE(review): return type and braces are not visible in this listing.  */
4700 is_print (unsigned int ch)
4702 return (unicode_attributes[ch].name != NULL
4703 && strcmp (unicode_attributes[ch].name, "<control>")
4704 /* Categories Zl and Zp */
4705 && !(unicode_attributes[ch].name != NULL
4706 && unicode_attributes[ch].category[0] == 'Z'
4707 && (unicode_attributes[ch].category[1] == 'l'
4708 || unicode_attributes[ch].category[1] == 'p')));
/* Punctuation test.  Two return statements appear back to back: the first
   is based on general category P*, the second on the traditional POSIX
   definition (graphic and neither alphabetic nor digit).
   NOTE(review): as listed, the second return can never execute;
   presumably a preprocessor conditional that selects one of the two is
   missing from this listing -- confirm which one is meant to be active.
   Return type and braces are not visible either.  */
4712 is_punct (unsigned int ch)
4715 return (unicode_attributes[ch].name != NULL
4716 && unicode_attributes[ch].category[0] == 'P');
4718 /* The traditional POSIX definition of punctuation is every graphic,
4719 non-alphanumeric character. */
4720 return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
4724 /* Output all properties. */
/* For a property P, the PROPERTY macro below calls debug_output_predicate,
   output_predicate_test and output_predicate on is_P, with file names
   derived from P and with VERSION passed to output_predicate.
   NOTE(review): the PROPERTY(...) invocations, the return type and the
   braces are not visible in this listing.  */
4726 output_old_ctype (const char *version)
4728 #define PROPERTY(P) \
4729 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4730 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4731 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
/* Combining: a named character of general category Mn, Mc or Me.
   NOTE(review): return type and braces are not visible in this listing.  */
4750 is_combining (unsigned int ch)
4752 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4753 file. In 3.0.1 it was identical to the union of the general categories
4754 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4755 PropList.txt file, so we take the latter definition. */
4756 return (unicode_attributes[ch].name != NULL
4757 && unicode_attributes[ch].category[0] == 'M'
4758 && (unicode_attributes[ch].category[1] == 'n'
4759 || unicode_attributes[ch].category[1] == 'c'
4760 || unicode_attributes[ch].category[1] == 'e'));
/* A combining character (see is_combining) is excluded here when its
   `combining` field is non-empty, does not start with '0', and parses as
   a decimal number >= 200; all other combining characters qualify.
   NOTE(review): return type and braces are not visible in this listing.  */
4764 is_combining_level3 (unsigned int ch)
4766 return is_combining (ch)
4767 && !(unicode_attributes[ch].combining[0] != '\0'
4768 && unicode_attributes[ch].combining[0] != '0'
4769 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with
   four uppercase hex digits for I below 0x10000, "<Uxxxxxxxx>" with eight
   hex digits otherwise.
   The result points to a static buffer that the next call overwrites, so
   callers must copy it before calling again.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Longest result is "<U" + 8 hex digits + ">" = 11 characters.  */
  static char buf[11+1];

  if (i < 0x10000)
    snprintf (buf, sizeof (buf), "<U%04X>", i);
  else
    snprintf (buf, sizeof (buf), "<U%08X>", i);
  return buf;
}
4782 /* Return the UCS symbol range string for an interval of Unicode characters. */
/* The string is assembled in a static buffer: ucs_symbol (LOW) is copied
   first and ucs_symbol (HIGH) is appended at the end.  The two calls must
   stay separate statements, because ucs_symbol is used here as returning
   a string that has to be copied before the next call.
   NOTE(review): the separator appended between the two symbols and the
   return statement are not visible in this listing (the buffer size of
   24+1 leaves room for two characters between two 11-character symbols);
   return type and braces are missing as well.  */
4784 ucs_symbol_range (unsigned int low, unsigned int high)
4786 static char buf[24+1];
4788 strcpy (buf, ucs_symbol (low));
4790 strcat (buf, ucs_symbol (high));
4794 /* Output a character class (= property) table. */
/* Writes CLASSNAME followed by the code points for which FUNC returns
   true.  Runs of consecutive members are found with the `while` scan
   below and printed via ucs_symbol or ucs_symbol_range; entries are
   separated by ";" and the line is continued with "/\n " when adding an
   entry would exceed max_column.
   NOTE(review): this listing has lost lines -- return type, braces, the
   declarations of i, column and buf, and most of the run-detection loop
   are not visible.  table[] holds one byte per code point (0x110000
   bytes); whether it is an automatic variable cannot be confirmed from
   what is visible here.  */
4797 output_charclass (FILE *stream, const char *classname,
4798 bool (*func) (unsigned int))
4800 char table[0x110000];
4802 bool need_semicolon;
4803 const int max_column = 75;
/* Evaluate the predicate once per code point.  */
4806 for (i = 0; i < 0x110000; i++)
4807 table[i] = (int) func (i);
4809 fprintf (stream, "%s ", classname);
4810 need_semicolon = false;
/* The loop header has no increment; i is presumably advanced by the scan
   in the body.  */
4812 for (i = 0; i < 0x110000; )
4818 unsigned int low, high;
4824 while (i < 0x110000 && table[i]);
4828 strcpy (buf, ucs_symbol (low));
4830 strcpy (buf, ucs_symbol_range (low, high));
4834 fprintf (stream, ";");
4838 if (column + strlen (buf) > max_column)
4840 fprintf (stream, "/\n ");
4844 fprintf (stream, "%s", buf);
4845 column += strlen (buf);
4846 need_semicolon = true;
4849 fprintf (stream, "\n");
4852 /* Output a character mapping table. */
/* Writes MAPNAME followed by one entry for each code point that FUNC maps
   to a different code point.  An entry is built in buf from
   ucs_symbol (i) and ucs_symbol (func (i)); entries are separated by ";"
   and the line is continued with "/\n " when adding an entry would exceed
   max_column.
   NOTE(review): this listing has lost lines -- return type, braces, the
   declarations of i, column and buf, the test of table[i] and the
   punctuation placed around the two symbols are not visible.  */
4855 output_charmap (FILE *stream, const char *mapname,
4856 unsigned int (*func) (unsigned int))
4858 char table[0x110000];
4860 bool need_semicolon;
4861 const int max_column = 75;
/* table[i] is non-zero when code point i is changed by the mapping.  */
4864 for (i = 0; i < 0x110000; i++)
4865 table[i] = (func (i) != i);
4867 fprintf (stream, "%s ", mapname);
4868 need_semicolon = false;
4870 for (i = 0; i < 0x110000; i++)
4876 strcat (buf, ucs_symbol (i));
4878 strcat (buf, ucs_symbol (func (i)));
4883 fprintf (stream, ";");
4887 if (column + strlen (buf) > max_column)
4889 fprintf (stream, "/\n ");
4893 fprintf (stream, "%s", buf);
4894 column += strlen (buf);
4895 need_semicolon = true;
4897 fprintf (stream, "\n");
4900 /* Output the width table. */
/* Called by output_tables with the output stream, between the character
   maps and the "END LC_CTYPE" line.
   NOTE(review): no return type and no body are visible in this listing,
   so what (if anything) is written cannot be told from here.  */
4903 output_widthmap (FILE *stream)
4907 /* Output the tables to the given file. */
/* Writes a locale-definition style file to FILENAME: an LC_IDENTIFICATION
   section mentioning VERSION and the current date, then an LC_CTYPE
   section with the character classes and mappings defined by the is_* and
   to_* functions.  Before the LC_CTYPE section, every code point is
   checked against the consistency rules quoted in the comments below;
   violations are only reported on stderr.
   NOTE(review): this listing has lost lines -- return type, braces,
   declarations (stream, ch, now, date), the error exits and the
   `fprintf (stderr,` heads of several diagnostics are not visible.  */
4910 output_tables (const char *filename, const char *version)
4915 stream = fopen (filename, "w");
4918 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4922 fprintf (stream, "escape_char /\n");
4923 fprintf (stream, "comment_char %%\n");
4924 fprintf (stream, "\n");
4925 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4927 fprintf (stream, "\n");
4929 fprintf (stream, "LC_IDENTIFICATION\n");
4930 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4931 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4932 fprintf (stream, "address \"\"\n");
4933 fprintf (stream, "contact \"\"\n");
4934 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4935 fprintf (stream, "tel \"\"\n");
4936 fprintf (stream, "fax \"\"\n");
4937 fprintf (stream, "language \"\"\n");
4938 fprintf (stream, "territory \"Earth\"\n");
4939 fprintf (stream, "revision \"%s\"\n", version);
/* The date line uses the UTC calendar date (gmtime) of `now`.  */
4944 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4945 fprintf (stream, "date \"%s\"\n", date);
4947 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4948 fprintf (stream, "END LC_IDENTIFICATION\n");
4949 fprintf (stream, "\n");
4951 /* Verifications. */
4952 for (ch = 0; ch < 0x110000; ch++)
4954 /* toupper restriction: "Only characters specified for the keywords
4955 lower and upper shall be specified." */
4956 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4958 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4959 ucs_symbol (ch), ch, to_upper (ch));
4961 /* tolower restriction: "Only characters specified for the keywords
4962 lower and upper shall be specified." */
4963 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4965 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4966 ucs_symbol (ch), ch, to_lower (ch));
4968 /* alpha restriction: "Characters classified as either upper or lower
4969 shall automatically belong to this class." */
4970 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4971 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4973 /* alpha restriction: "No character specified for the keywords cntrl,
4974 digit, punct or space shall be specified." */
4975 if (is_alpha (ch) && is_cntrl (ch))
4976 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4977 if (is_alpha (ch) && is_digit (ch))
4978 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4979 if (is_alpha (ch) && is_punct (ch))
4980 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4981 if (is_alpha (ch) && is_space (ch))
4982 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4984 /* space restriction: "No character specified for the keywords upper,
4985 lower, alpha, digit, graph or xdigit shall be specified."
4986 upper, lower, alpha already checked above. */
4987 if (is_space (ch) && is_digit (ch))
4988 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4989 if (is_space (ch) && is_graph (ch))
4990 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4991 if (is_space (ch) && is_xdigit (ch))
4992 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4994 /* cntrl restriction: "No character specified for the keywords upper,
4995 lower, alpha, digit, punct, graph, print or xdigit shall be
4996 specified." upper, lower, alpha already checked above. */
4997 if (is_cntrl (ch) && is_digit (ch))
4998 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
4999 if (is_cntrl (ch) && is_punct (ch))
5000 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5001 if (is_cntrl (ch) && is_graph (ch))
5002 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5003 if (is_cntrl (ch) && is_print (ch))
5004 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5005 if (is_cntrl (ch) && is_xdigit (ch))
5006 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5008 /* punct restriction: "No character specified for the keywords upper,
5009 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5010 be specified." upper, lower, alpha, cntrl already checked above. */
5011 if (is_punct (ch) && is_digit (ch))
5012 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5013 if (is_punct (ch) && is_xdigit (ch))
5014 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5015 if (is_punct (ch) && (ch == 0x0020))
5016 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5018 /* graph restriction: "No character specified for the keyword cntrl
5019 shall be specified." Already checked above. */
5021 /* print restriction: "No character specified for the keyword cntrl
5022 shall be specified." Already checked above. */
5024 /* graph - print relation: differ only in the <space> character.
5025 How is this possible if there are more than one space character?!
5026 I think susv2/xbd/locale.html should speak of "space characters",
5027 not "space character". */
/* The first check accepts any is_space character as the difference, the
   second only U+0020.  */
5028 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5030 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5031 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5033 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section proper: classes first, then the mappings.  */
5036 fprintf (stream, "LC_CTYPE\n");
5037 output_charclass (stream, "upper", is_upper);
5038 output_charclass (stream, "lower", is_lower);
5039 output_charclass (stream, "alpha", is_alpha);
5040 output_charclass (stream, "digit", is_digit);
5041 output_charclass (stream, "outdigit", is_outdigit);
5042 output_charclass (stream, "blank", is_blank);
5043 output_charclass (stream, "space", is_space);
5044 output_charclass (stream, "cntrl", is_cntrl);
5045 output_charclass (stream, "punct", is_punct);
5046 output_charclass (stream, "xdigit", is_xdigit);
5047 output_charclass (stream, "graph", is_graph);
5048 output_charclass (stream, "print", is_print);
5049 output_charclass (stream, "class \"combining\";", is_combining);
5050 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5051 output_charmap (stream, "toupper", to_upper);
5052 output_charmap (stream, "tolower", to_lower);
5053 output_charmap (stream, "map \"totitle\";", to_title);
5054 output_widthmap (stream);
5055 fprintf (stream, "END LC_CTYPE\n");
/* A write error noticed by ferror or by fclose is reported here.  */
5057 if (ferror (stream) || fclose (stream))
5059 fprintf (stderr, "error writing to '%s'\n", filename);
5066 /* ========================================================================= */
5068 /* The width property from the EastAsianWidth.txt file.
5069 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
/* Indexed by code point (0..0x10FFFF); filled in by fill_width.  */
5070 const char * unicode_width[0x110000];
5072 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file. */
/* Reads WIDTH_FILENAME and fills unicode_width[].  Every code point first
   gets the default "N" if it has a name in unicode_attributes, NULL
   otherwise.  Each data line is then split into three fields ending at
   ';', ' ' and '\n'; the first field is a hex code point or a
   "first..last" range, and a copy of the second field is stored for it.
   NOTE(review): this listing has lost lines -- return type, braces,
   declarations, the read loop header, the error exits and the loop over
   the range are not visible.  Within what is visible there is no check
   that the parsed code points are below 0x110000, and the strdup results
   are stored without a NULL check; confirm whether that is handled in
   the missing lines.  */
5075 fill_width (const char *width_filename)
5079 char field0[FIELDLEN];
5080 char field1[FIELDLEN];
5081 char field2[FIELDLEN];
5084 for (i = 0; i < 0x110000; i++)
5085 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5087 stream = fopen (width_filename, "r");
5090 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
/* Skips the rest of the current line (presumably used for comment lines;
   the condition is not visible).  */
5105 do c = getc (stream); while (c != EOF && c != '\n');
5109 n = getfield (stream, field0, ';');
5110 n += getfield (stream, field1, ' ');
5111 n += getfield (stream, field2, '\n');
5116 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5119 i = strtoul (field0, NULL, 16);
5120 if (strstr (field0, "..") != NULL)
5122 /* Deal with a range. */
5123 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5125 unicode_width[i] = strdup (field1);
5129 /* Single character line. */
5130 unicode_width[i] = strdup (field1);
5133 if (ferror (stream) || fclose (stream))
5135 fprintf (stderr, "error reading from '%s'\n", width_filename);
5140 /* Line breaking classification. */
/* Line breaking classes.  get_lbp uses these values as bit positions
   (1 << LBP_xx) when it builds its result mask, so every value must be
   distinct and below the bit width of the mask type.
   NOTE(review): LBP_XX is 31; shifting a plain int 1 by 31 overflows a
   32-bit signed int, so users of the highest class should shift an
   unsigned or wider constant.  */
enum
{
  /* Values >= 24 are resolved at run time. */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
5183 /* Returns the line breaking classification for ch, as a bit mask. */
/* get_lbp: compute the line breaking classification of code point CH as a
   bit mask in which bit LBP_xx is set for every class the character matches.
   The tests use hard-coded code points, the general category letters in
   unicode_attributes[ch].category, substrings of the character name, and
   the East Asian width in unicode_width[ch].  Several classes are only set
   when certain earlier ones are absent, so the order of the tests matters.
   Only part of the function text is visible in this excerpt (declarations,
   braces, "else" lines and the return statement are missing).  */
5185 get_lbp (unsigned int ch)
/* Classification is attempted only for characters that have a name.  */
5189 if (unicode_attributes[ch].name != NULL)
5191 /* mandatory break */
5192 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5193 || ch == 0x000C /* form feed */
5194 || ch == 0x000B /* line tabulation */
5195 || ch == 0x2028 /* LINE SEPARATOR */
5196 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5197 attr |= 1 << LBP_BK;
/* word joiner */
5199 if (ch == 0x2060 /* WORD JOINER */
5200 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5201 attr |= 1 << LBP_WJ;
5203 /* zero width space */
5204 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5205 attr |= 1 << LBP_ZW;
5207 /* non-breaking (glue) */
5208 if (ch == 0x00A0 /* NO-BREAK SPACE */
5209 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5210 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5211 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5212 || ch == 0x2007 /* FIGURE SPACE */
5213 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5214 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5215 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5216 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5217 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5218 attr |= 1 << LBP_GL;
/* space */
5221 if (ch == 0x0020 /* SPACE */)
5222 attr |= 1 << LBP_SP;
5224 /* break opportunity before and after */
5225 if (ch == 0x2014 /* EM DASH */)
5226 attr |= 1 << LBP_B2;
5228 /* break opportunity after */
5229 if (ch == 0x1680 /* OGHAM SPACE MARK */
5230 || ch == 0x2000 /* EN QUAD */
5231 || ch == 0x2001 /* EM QUAD */
5232 || ch == 0x2002 /* EN SPACE */
5233 || ch == 0x2003 /* EM SPACE */
5234 || ch == 0x2004 /* THREE-PER-EM SPACE */
5235 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5236 || ch == 0x2006 /* SIX-PER-EM SPACE */
5237 || ch == 0x2008 /* PUNCTUATION SPACE */
5238 || ch == 0x2009 /* THIN SPACE */
5239 || ch == 0x200A /* HAIR SPACE */
5240 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5241 || ch == 0x0009 /* tab */
5242 || ch == 0x00AD /* SOFT HYPHEN */
5243 || ch == 0x058A /* ARMENIAN HYPHEN */
5244 || ch == 0x2010 /* HYPHEN */
5245 || ch == 0x2012 /* FIGURE DASH */
5246 || ch == 0x2013 /* EN DASH */
5247 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5248 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5249 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5250 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5251 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5252 || ch == 0x2027 /* HYPHENATION POINT */
5253 || ch == 0x007C /* VERTICAL LINE */
5254 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5255 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5256 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5257 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5258 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5259 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5260 || ch == 0x205A /* TWO DOT PUNCTUATION */
5261 || ch == 0x205B /* FOUR DOT MARK */
5262 || ch == 0x205D /* TRICOLON */
5263 || ch == 0x205E /* VERTICAL FOUR DOTS */
5264 || ch == 0x2E19 /* PALM BRANCH */
5265 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5266 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5267 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5268 || ch == 0x2E2D /* FIVE DOT MARK */
5269 || ch == 0x2E30 /* RING POINT */
5270 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5271 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5272 || ch == 0x10102 /* AEGEAN CHECK MARK */
5273 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5274 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5275 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5276 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5277 || ch == 0x0964 /* DEVANAGARI DANDA */
5278 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5279 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5280 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5281 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5282 || ch == 0x104B /* MYANMAR SIGN SECTION */
5283 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5284 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5285 || ch == 0x17D4 /* KHMER SIGN KHAN */
5286 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5287 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5288 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5289 || ch == 0xA8CE /* SAURASHTRA DANDA */
5290 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5291 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5292 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5293 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5294 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5295 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5296 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5297 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5298 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5299 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5300 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5301 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5302 || ch == 0x1804 /* MONGOLIAN COLON */
5303 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5304 || ch == 0x1B5A /* BALINESE PANTI */
5305 || ch == 0x1B5B /* BALINESE PAMADA */
5306 || ch == 0x1B5C /* BALINESE WINDU */
5307 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5308 || ch == 0x1B60 /* BALINESE PAMENENG */
5309 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5310 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5311 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5312 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5313 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5314 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5315 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5316 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5317 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5318 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5319 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5320 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5321 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5322 || ch == 0xA60D /* VAI COMMA */
5323 || ch == 0xA60F /* VAI QUESTION MARK */
5324 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5325 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5326 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5327 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5328 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5329 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5330 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5331 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5332 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5333 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5334 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5335 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5336 attr |= 1 << LBP_BA;
5338 /* break opportunity before */
5339 if (ch == 0x00B4 /* ACUTE ACCENT */
5340 || ch == 0x1FFD /* GREEK OXIA */
5341 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5342 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5343 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5344 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5345 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5346 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5347 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5348 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5349 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5350 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5351 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5352 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5353 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5354 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5355 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5356 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5357 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5358 attr |= 1 << LBP_BB;
/* hyphen */
5361 if (ch == 0x002D /* HYPHEN-MINUS */)
5362 attr |= 1 << LBP_HY;
5364 /* contingent break opportunity */
5365 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5366 attr |= 1 << LBP_CB;
5368 /* closing punctuation */
5369 if ((unicode_attributes[ch].category[0] == 'P'
5370 && unicode_attributes[ch].category[1] == 'e')
5371 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5372 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5373 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5374 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5375 || ch == 0xFE50 /* SMALL COMMA */
5376 || ch == 0xFE52 /* SMALL FULL STOP */
5377 || ch == 0xFF0C /* FULLWIDTH COMMA */
5378 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5379 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5380 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5381 attr |= 1 << LBP_CL;
5383 /* exclamation/interrogation */
5384 if (ch == 0x0021 /* EXCLAMATION MARK */
5385 || ch == 0x003F /* QUESTION MARK */
5386 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5387 || ch == 0x061B /* ARABIC SEMICOLON */
5388 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5389 || ch == 0x061F /* ARABIC QUESTION MARK */
5390 || ch == 0x06D4 /* ARABIC FULL STOP */
5391 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5392 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5393 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5394 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5395 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5396 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5397 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5398 || ch == 0x1802 /* MONGOLIAN COMMA */
5399 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5400 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5401 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5402 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5403 || ch == 0x1945 /* LIMBU QUESTION MARK */
5404 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5405 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5406 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5407 || ch == 0x2CFE /* COPTIC FULL STOP */
5408 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5410 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5412 || ch == 0xA60E /* VAI FULL STOP */
5413 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5414 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5415 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5416 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5417 || ch == 0xFE56 /* SMALL QUESTION MARK */
5418 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5419 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5420 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5421 attr |= 1 << LBP_EX;
/* inseparable */
5424 if (ch == 0x2024 /* ONE DOT LEADER */
5425 || ch == 0x2025 /* TWO DOT LEADER */
5426 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5427 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5428 attr |= 1 << LBP_IN;
/* non starter; besides the listed code points, this matches any character
   whose name contains "HIRAGANA LETTER SMALL " or "KATAKANA LETTER SMALL " */
5431 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5432 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5433 || ch == 0x203D /* INTERROBANG */
5434 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5435 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5436 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5437 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5438 || ch == 0x301C /* WAVE DASH */
5439 || ch == 0x303C /* MASU MARK */
5440 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5441 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5442 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5443 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5444 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5445 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5446 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5447 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5448 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5449 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5450 || ch == 0xA015 /* YI SYLLABLE WU */
5451 || ch == 0xFE54 /* SMALL SEMICOLON */
5452 || ch == 0xFE55 /* SMALL COLON */
5453 || ch == 0xFF1A /* FULLWIDTH COLON */
5454 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5455 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5456 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5457 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5458 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5459 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5460 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5461 attr |= 1 << LBP_NS;
5463 /* opening punctuation */
5464 if ((unicode_attributes[ch].category[0] == 'P'
5465 && unicode_attributes[ch].category[1] == 's')
5466 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5467 || ch == 0x00BF /* INVERTED QUESTION MARK */
5468 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5469 attr |= 1 << LBP_OP;
5471 /* ambiguous quotation */
5472 if ((unicode_attributes[ch].category[0] == 'P'
5473 && (unicode_attributes[ch].category[1] == 'f'
5474 || unicode_attributes[ch].category[1] == 'i'))
5475 || ch == 0x0022 /* QUOTATION MARK */
5476 || ch == 0x0027 /* APOSTROPHE */
5477 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5478 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5479 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5480 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5481 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5482 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5483 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5484 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5485 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5486 || ch == 0x2E0B /* RAISED SQUARE */)
5487 attr |= 1 << LBP_QU;
5489 /* infix separator (numeric) */
5490 if (ch == 0x002C /* COMMA */
5491 || ch == 0x002E /* FULL STOP */
5492 || ch == 0x003A /* COLON */
5493 || ch == 0x003B /* SEMICOLON */
5494 || ch == 0x037E /* GREEK QUESTION MARK */
5495 || ch == 0x0589 /* ARMENIAN FULL STOP */
5496 || ch == 0x060C /* ARABIC COMMA */
5497 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5498 || ch == 0x07F8 /* NKO COMMA */
5499 || ch == 0x2044 /* FRACTION SLASH */
5500 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5501 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5502 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5503 attr |= 1 << LBP_IS;
/* numeric: category Nd, except characters whose name contains "FULLWIDTH" */
5506 if ((unicode_attributes[ch].category[0] == 'N'
5507 && unicode_attributes[ch].category[1] == 'd'
5508 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5509 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5510 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5511 attr |= 1 << LBP_NU;
5513 /* postfix (numeric) */
5514 if (ch == 0x0025 /* PERCENT SIGN */
5515 || ch == 0x00A2 /* CENT SIGN */
5516 || ch == 0x00B0 /* DEGREE SIGN */
5517 || ch == 0x060B /* AFGHANI SIGN */
5518 || ch == 0x066A /* ARABIC PERCENT SIGN */
5519 || ch == 0x2030 /* PER MILLE SIGN */
5520 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5521 || ch == 0x2032 /* PRIME */
5522 || ch == 0x2033 /* DOUBLE PRIME */
5523 || ch == 0x2034 /* TRIPLE PRIME */
5524 || ch == 0x2035 /* REVERSED PRIME */
5525 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5526 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5527 || ch == 0x20A7 /* PESETA SIGN */
5528 || ch == 0x2103 /* DEGREE CELSIUS */
5529 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5530 || ch == 0xFDFC /* RIAL SIGN */
5531 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5532 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5533 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5534 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5535 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5536 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5537 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5538 attr |= 1 << LBP_PO;
5540 /* prefix (numeric) */
5541 if ((unicode_attributes[ch].category[0] == 'S'
5542 && unicode_attributes[ch].category[1] == 'c')
5543 || ch == 0x002B /* PLUS SIGN */
5544 || ch == 0x005C /* REVERSE SOLIDUS */
5545 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5546 || ch == 0x2116 /* NUMERO SIGN */
5547 || ch == 0x2212 /* MINUS SIGN */
5548 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* PR is set only if the character was not already classified as PO.  */
5549 if (!(attr & (1 << LBP_PO)))
5550 attr |= 1 << LBP_PR;
5552 /* symbols allowing breaks */
5553 if (ch == 0x002F /* SOLIDUS */)
5554 attr |= 1 << LBP_SY;
/* Hangul syllables 0xAC00..0xD7A3: every 28th code point, counted from
   0xAC00, is classified H2 (LV); all others in the range are H3 (LVT).  */
5556 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5557 attr |= 1 << LBP_H2;
5559 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5560 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5562 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5563 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5565 if (ch >= 0x1160 && ch <= 0x11A2)
5566 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5568 if (ch >= 0x11A8 && ch <= 0x11F9)
5569 attr |= 1 << LBP_JT;
5571 /* complex context (South East Asian) */
/* Requires both a matching category (Cf, Lm, Lo, Mc, Mn) or one of the four
   listed code points, and membership in one of the four code point ranges
   at the end of the condition.  */
5572 if (((unicode_attributes[ch].category[0] == 'C'
5573 && unicode_attributes[ch].category[1] == 'f')
5574 || (unicode_attributes[ch].category[0] == 'L'
5575 && (unicode_attributes[ch].category[1] == 'm'
5576 || unicode_attributes[ch].category[1] == 'o'))
5577 || (unicode_attributes[ch].category[0] == 'M'
5578 && (unicode_attributes[ch].category[1] == 'c'
5579 || unicode_attributes[ch].category[1] == 'n'))
5580 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5581 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5582 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5583 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5584 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5585 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5586 || (ch >= 0x1000 && ch <= 0x109F)
5587 || (ch >= 0x1780 && ch <= 0x17FF)
5588 || (ch >= 0x1950 && ch <= 0x19DF)))
5589 attr |= 1 << LBP_SA;
5591 /* attached characters and combining marks */
5592 if ((unicode_attributes[ch].category[0] == 'M'
5593 && (unicode_attributes[ch].category[1] == 'c'
5594 || unicode_attributes[ch].category[1] == 'e'
5595 || unicode_attributes[ch].category[1] == 'n'))
5596 || (unicode_attributes[ch].category[0] == 'C'
5597 && (unicode_attributes[ch].category[1] == 'c'
5598 || unicode_attributes[ch].category[1] == 'f')))
/* CM is set only if none of BK, BA, GL, SA, WJ, ZW has been set above.  */
5599 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5600 attr |= 1 << LBP_CM;
/* ideographic */
5603 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5604 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5605 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5606 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5607 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5608 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5609 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5610 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5611 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5612 || ch == 0xFE62 /* SMALL PLUS SIGN */
5613 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5614 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5615 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5616 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5617 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5618 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5619 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5620 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5621 || (ch >= 0x3000 && ch <= 0x33FF
5622 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5623 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5624 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5625 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5626 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5627 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5628 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5629 || ch == 0xFE45 /* SESAME DOT */
5630 || ch == 0xFE46 /* WHITE SESAME DOT */
5631 || ch == 0xFE49 /* DASHED OVERLINE */
5632 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5633 || ch == 0xFE4B /* WAVY OVERLINE */
5634 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5635 || ch == 0xFE4D /* DASHED LOW LINE */
5636 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5637 || ch == 0xFE4F /* WAVY LOW LINE */
5638 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5639 || ch == 0xFE58 /* SMALL EM DASH */
5640 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5641 || ch == 0xFE60 /* SMALL AMPERSAND */
5642 || ch == 0xFE61 /* SMALL ASTERISK */
5643 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5644 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5645 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5646 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5647 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5648 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5649 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5650 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5651 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5652 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5653 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5654 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5655 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5656 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5657 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5658 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5659 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5660 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5661 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5662 || ch == 0xFF5E /* FULLWIDTH TILDE */
5663 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5664 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5665 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* The ideographic classes are skipped for characters already marked NS or
   CM.  Below, AI is chosen for characters whose East Asian width starts
   with 'A' (plus a few listed code points) and ID is the other outcome;
   NOTE(review): part of the AI condition and the "else" between the two
   assignments are not visible in this excerpt.  */
5666 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5668 /* ambiguous (ideograph) ? */
5669 if ((unicode_width[ch] != NULL
5670 && unicode_width[ch][0] == 'A'
5672 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5673 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5674 attr |= 1 << LBP_AI;
5676 attr |= 1 << LBP_ID;
5679 /* ordinary alphabetic and symbol characters */
5680 if ((unicode_attributes[ch].category[0] == 'L'
5681 && (unicode_attributes[ch].category[1] == 'u'
5682 || unicode_attributes[ch].category[1] == 'l'
5683 || unicode_attributes[ch].category[1] == 't'
5684 || unicode_attributes[ch].category[1] == 'm'
5685 || unicode_attributes[ch].category[1] == 'o'))
5686 || (unicode_attributes[ch].category[0] == 'S'
5687 && (unicode_attributes[ch].category[1] == 'm'
5688 || unicode_attributes[ch].category[1] == 'k'
5689 || unicode_attributes[ch].category[1] == 'o'))
5690 || (unicode_attributes[ch].category[0] == 'N'
5691 && (unicode_attributes[ch].category[1] == 'l'
5692 || unicode_attributes[ch].category[1] == 'o'))
5693 || (unicode_attributes[ch].category[0] == 'P'
5694 && (unicode_attributes[ch].category[1] == 'c'
5695 || unicode_attributes[ch].category[1] == 'd'
5696 || unicode_attributes[ch].category[1] == 'o'))
5697 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5698 || ch == 0x0601 /* ARABIC SIGN SANAH */
5699 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5700 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5701 || ch == 0x06DD /* ARABIC END OF AYAH */
5702 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5703 || ch == 0x2061 /* FUNCTION APPLICATION */
5704 || ch == 0x2062 /* INVISIBLE TIMES */
5705 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5706 || ch == 0x2064 /* INVISIBLE PLUS */)
/* The alphabetic classes act as a fallback: they are skipped when any of
   the classes in the mask below has already been set.  BK, WJ, ZW, SP and
   CM are not part of that mask.  */
5707 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5709 /* ambiguous (alphabetic) ? */
5710 if ((unicode_width[ch] != NULL
5711 && unicode_width[ch][0] == 'A'
5713 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5714 && ch != 0x2022 /* BULLET */
5715 && ch != 0x203E /* OVERLINE */
5716 && ch != 0x2126 /* OHM SIGN */
5717 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5718 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5719 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5720 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5721 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5722 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5723 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5724 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5726 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5727 || ch == 0x00A7 /* SECTION SIGN */
5728 || ch == 0x00A8 /* DIAERESIS */
5729 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5730 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5731 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5732 || ch == 0x00B6 /* PILCROW SIGN */
5733 || ch == 0x00B7 /* MIDDLE DOT */
5734 || ch == 0x00B8 /* CEDILLA */
5735 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5736 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5737 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5738 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5739 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5740 || ch == 0x00BF /* INVERTED QUESTION MARK */
5741 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5742 || ch == 0x00F7 /* DIVISION SIGN */
5743 || ch == 0x02C7 /* CARON */
5744 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5745 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5746 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5747 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5748 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5749 || ch == 0x02D8 /* BREVE */
5750 || ch == 0x02D9 /* DOT ABOVE */
5751 || ch == 0x02DA /* RING ABOVE */
5752 || ch == 0x02DB /* OGONEK */
5753 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5755 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5756 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5757 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5758 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5759 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5760 || ch == 0x2616 /* WHITE SHOGI PIECE */
5761 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5762 attr |= 1 << LBP_AI;
5764 attr |= 1 << LBP_AL;
/* Drop a CM bit that an earlier test may have set.  */
5765 attr &= ~(1 << LBP_CM);
/* unknown.  NOTE(review): the condition guarding this assignment is not
   visible in this excerpt; presumably it applies only when no other class
   matched.  LBP_XX is 31, so this shifts into the sign bit where int is 32
   bits wide.  */
5771 attr |= 1 << LBP_XX;
5776 /* Output the line breaking properties in a human readable format. */
/* debug_output_lbp: write to STREAM one line per code point whose get_lbp
   mask differs from the lone LBP_XX bit.  The line holds the code point in
   hexadecimal followed by the name of every class bit set in the mask.
   Only part of the function text is visible in this excerpt.  */
5778 debug_output_lbp (FILE *stream)
5782 for (i = 0; i < 0x110000; i++)
5784 int attr = get_lbp (i);
/* Characters classified only as "unknown" are not listed.  */
5785 if (attr != 1 << LBP_XX)
5787 fprintf (stream, "0x%04X", i);
/* PRINT_BIT emits " LBP_xx" when bit LBP_xx is set in attr; since attr is a
   mask, more than one name can be printed for a character.  */
5788 #define PRINT_BIT(attr,bit) \
5789 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5790 PRINT_BIT(attr,LBP_BK);
5791 PRINT_BIT(attr,LBP_CM);
5792 PRINT_BIT(attr,LBP_WJ);
5793 PRINT_BIT(attr,LBP_ZW);
5794 PRINT_BIT(attr,LBP_GL);
5795 PRINT_BIT(attr,LBP_SP);
5796 PRINT_BIT(attr,LBP_B2);
5797 PRINT_BIT(attr,LBP_BA);
5798 PRINT_BIT(attr,LBP_BB);
5799 PRINT_BIT(attr,LBP_HY);
5800 PRINT_BIT(attr,LBP_CB);
5801 PRINT_BIT(attr,LBP_CL);
5802 PRINT_BIT(attr,LBP_EX);
5803 PRINT_BIT(attr,LBP_IN);
5804 PRINT_BIT(attr,LBP_NS);
5805 PRINT_BIT(attr,LBP_OP);
5806 PRINT_BIT(attr,LBP_QU);
5807 PRINT_BIT(attr,LBP_IS);
5808 PRINT_BIT(attr,LBP_NU);
5809 PRINT_BIT(attr,LBP_PO);
5810 PRINT_BIT(attr,LBP_PR);
5811 PRINT_BIT(attr,LBP_SY);
5812 PRINT_BIT(attr,LBP_AI);
5813 PRINT_BIT(attr,LBP_AL);
5814 PRINT_BIT(attr,LBP_H2);
5815 PRINT_BIT(attr,LBP_H3);
5816 PRINT_BIT(attr,LBP_ID);
5817 PRINT_BIT(attr,LBP_JL);
5818 PRINT_BIT(attr,LBP_JV);
5819 PRINT_BIT(attr,LBP_JT);
5820 PRINT_BIT(attr,LBP_SA);
5821 PRINT_BIT(attr,LBP_XX);
5823 fprintf (stream, "\n");
/* debug_output_lbrk_tables: open FILENAME for writing, dump the computed
   line breaking properties into it with debug_output_lbp, and report a
   write or close failure on stderr.  Only part of the function text is
   visible in this excerpt.  */
5829 debug_output_lbrk_tables (const char *filename)
5833 stream = fopen (filename, "w");
5836 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5840 debug_output_lbp (stream);
/* A stream error or a failing fclose is treated as a write error.  */
5842 if (ferror (stream) || fclose (stream))
5844 fprintf (stderr, "error writing to '%s'\n", filename);
5849 /* The line breaking property from the LineBreak.txt file. */
/* Indexed by code point; each entry holds a single LBP_xx value (not a bit
   mask) and is initialized to LBP_XX by fill_org_lbp.  */
5850 int unicode_org_lbp[0x110000];
5852 /* Stores in unicode_org_lbp[] the line breaking property from the
5853 LineBreak.txt file. */
/* fill_org_lbp: populate unicode_org_lbp[] from the file named
   LINEBREAK_FILENAME.  Only part of this function's text is visible in this
   excerpt (declarations, braces and several control-flow lines are
   missing), so the notes below describe just the statements that can be
   seen.  */
5855 fill_org_lbp (const char *linebreak_filename)
/* Buffers for the three fields read from each data line.  */
5859 char field0[FIELDLEN];
5860 char field1[FIELDLEN];
5861 char field2[FIELDLEN];
/* Every code point starts out as LBP_XX.  */
5864 for (i = 0; i < 0x110000; i++)
5865 unicode_org_lbp[i] = LBP_XX;
5867 stream = fopen (linebreak_filename, "r");
5870 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Discard the rest of the current line.  NOTE(review): presumably used to
   skip comment lines; the test that selects them is not visible here.  */
5886 do c = getc (stream); while (c != EOF && c != '\n');
/* A data line is read as three fields terminated by ';', ' ' and '\n'.
   field0 is the code point or range and field1 the property value; field2
   is not used by the visible code.  */
5890 n = getfield (stream, field0, ';');
5891 n += getfield (stream, field1, ' ');
5892 n += getfield (stream, field2, '\n');
5897 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY(LBP_xx) compares field1 with the enumerator's name minus its first
   four characters ("LBP_"): #bit is the stringified name and "+ 4" advances
   the pointer past the prefix.  */
5901 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property values without an enumerator of their own: LF, CR and NL are
   stored as LBP_BK, SG as LBP_XX.  */
5936 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5937 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5938 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5939 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5942 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5943 field1, linebreak_filename, lineno);
/* field0 is parsed as hexadecimal.  NOTE(review): in the visible code the
   parsed value is not range-checked before it indexes unicode_org_lbp[].  */
5946 i = strtoul (field0, NULL, 16);
5947 if (strstr (field0, "..") != NULL)
5949 /* Deal with a range. */
/* The upper bound is the hexadecimal number following the "..".  */
5950 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5952 unicode_org_lbp[i] = value;
5956 /* Single character line. */
5957 unicode_org_lbp[i] = value;
/* A read error on the stream or a failing fclose is reported as an error.  */
5960 if (ferror (stream) || fclose (stream))
5962 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5967 /* Output the line breaking properties read from LineBreak.txt (unicode_org_lbp[]) in a human readable format. */
/* debug_output_org_lbp: write the contents of unicode_org_lbp[] to STREAM,
   one line per listed code point: the code point in hexadecimal followed by
   the name of its class.  Only part of the function text is visible in this
   excerpt; the test that decides which code points are listed is not
   visible.  */
5969 debug_output_org_lbp (FILE *stream)
5973 for (i = 0; i < 0x110000; i++)
/* Here attr is a single LBP_xx value, not a bit mask.  */
5975 int attr = unicode_org_lbp[i];
5978 fprintf (stream, "0x%04X", i);
/* PRINT_BIT therefore compares for equality, so exactly one name matches.  */
5979 #define PRINT_BIT(attr,bit) \
5980 if (attr == bit) fprintf (stream, " " #bit);
5981 PRINT_BIT(attr,LBP_BK);
5982 PRINT_BIT(attr,LBP_CM);
5983 PRINT_BIT(attr,LBP_WJ);
5984 PRINT_BIT(attr,LBP_ZW);
5985 PRINT_BIT(attr,LBP_GL);
5986 PRINT_BIT(attr,LBP_SP);
5987 PRINT_BIT(attr,LBP_B2);
5988 PRINT_BIT(attr,LBP_BA);
5989 PRINT_BIT(attr,LBP_BB);
5990 PRINT_BIT(attr,LBP_HY);
5991 PRINT_BIT(attr,LBP_CB);
5992 PRINT_BIT(attr,LBP_CL);
5993 PRINT_BIT(attr,LBP_EX);
5994 PRINT_BIT(attr,LBP_IN);
5995 PRINT_BIT(attr,LBP_NS);
5996 PRINT_BIT(attr,LBP_OP);
5997 PRINT_BIT(attr,LBP_QU);
5998 PRINT_BIT(attr,LBP_IS);
5999 PRINT_BIT(attr,LBP_NU);
6000 PRINT_BIT(attr,LBP_PO);
6001 PRINT_BIT(attr,LBP_PR);
6002 PRINT_BIT(attr,LBP_SY);
6003 PRINT_BIT(attr,LBP_AI);
6004 PRINT_BIT(attr,LBP_AL);
6005 PRINT_BIT(attr,LBP_H2);
6006 PRINT_BIT(attr,LBP_H3);
6007 PRINT_BIT(attr,LBP_ID);
6008 PRINT_BIT(attr,LBP_JL);
6009 PRINT_BIT(attr,LBP_JV);
6010 PRINT_BIT(attr,LBP_JT);
6011 PRINT_BIT(attr,LBP_SA);
6012 PRINT_BIT(attr,LBP_XX);
6014 fprintf (stream, "\n");
/* Writes the debug listing produced by debug_output_org_lbp to the file
   FILENAME, which is opened for writing.  A message is printed to stderr
   when the file cannot be opened, or when ferror or fclose reports a
   problem after writing.
   NOTE(review): what follows each message (presumably an exit) is not
   visible in this excerpt -- confirm.  */
6020 debug_output_org_lbrk_tables (const char *filename)
6024 stream = fopen (filename, "w");
6027 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6031 debug_output_org_lbp (stream);
/* fclose is part of the check so that errors surfacing at close time are
   reported as well.  */
6033 if (ferror (stream) || fclose (stream))
6035 fprintf (stderr, "error writing to '%s'\n", filename);
6040 /* Construction of sparse 3-level tables. */
/* Parameter macros for the line break property table: TABLE, ELEMENT and
   DEFAULT are set to lbp_table, unsigned char and LBP_XX, and
   xmalloc/xrealloc are mapped onto plain malloc/realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   excerpt; presumably a table template is included right after -- confirm.  */
6041 #define TABLE lbp_table
6042 #define ELEMENT unsigned char
6043 #define DEFAULT LBP_XX
6044 #define xmalloc malloc
6045 #define xrealloc realloc
/* Builds the 3-level line break property table and emits it as C source.
   STREAM1 receives the lbrkprop_header_N defines, the lbrkprop_t typedef
   and the extern declaration of unilbrkprop; STREAM2 receives the
   definition of unilbrkprop with its three array initializers.  */
6049 output_lbp (FILE *stream1, FILE *stream2)
6053 unsigned int level1_offset, level2_offset, level3_offset;
6057 lbp_table_init (&t);
/* Fill the table from get_lbp for every code point.  */
6059 for (i = 0; i < 0x110000; i++)
6061 int attr = get_lbp (i);
6063 /* Now attr should contain exactly one bit. */
/* NOTE(review): the reaction to a violated check is not visible in this
   excerpt -- presumably the program aborts; confirm.  */
6064 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Entries equal to the default 1 << LBP_XX are not added.  */
6067 if (attr != 1 << LBP_XX)
6069 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index.  */
6070 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6072 lbp_table_add (&t, i, log2_attr);
6076 lbp_table_finalize (&t);
/* The three expressions below are byte offsets: past 5 uint32_t words, past
   the level1 entries, and past the level2 entries.
   NOTE(review): their assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset -- confirm.  */
6079 5 * sizeof (uint32_t);
6081 5 * sizeof (uint32_t)
6082 + t.level1_size * sizeof (uint32_t);
6084 5 * sizeof (uint32_t)
6085 + t.level1_size * sizeof (uint32_t)
6086 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as preprocessor defines.  */
6088 for (i = 0; i < 5; i++)
6089 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6090 ((uint32_t *) t.result)[i]);
6091 fprintf (stream1, "\n");
6092 fprintf (stream1, "typedef struct\n");
6093 fprintf (stream1, " {\n");
6094 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6095 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6096 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6097 fprintf (stream1, " }\n");
6098 fprintf (stream1, "lbrkprop_t;\n");
6099 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6101 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6102 fprintf (stream2, "{\n");
6103 fprintf (stream2, " {");
6104 if (t.level1_size > 8)
6105 fprintf (stream2, "\n ");
/* Level 1: a zero offset is printed as -1, any other offset as an index
   relative to level2_offset in uint32_t units; a line break is emitted
   every 8 entries.  */
6106 for (i = 0; i < t.level1_size; i++)
6109 if (i > 0 && (i % 8) == 0)
6110 fprintf (stream2, "\n ");
6111 offset = ((uint32_t *) (t.result + level1_offset))[i];
6112 fprintf (stream2, " %5zd%s",
6113 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
6114 (i+1 < t.level1_size ? "," : ""));
6116 if (t.level1_size > 8)
6117 fprintf (stream2, "\n ");
6118 fprintf (stream2, " },\n");
6119 fprintf (stream2, " {");
6120 if (t.level2_size << t.q > 8)
6121 fprintf (stream2, "\n ");
/* Level 2: same scheme, with indices relative to level3_offset in uint8_t
   units.  */
6122 for (i = 0; i < t.level2_size << t.q; i++)
6125 if (i > 0 && (i % 8) == 0)
6126 fprintf (stream2, "\n ");
6127 offset = ((uint32_t *) (t.result + level2_offset))[i];
6128 fprintf (stream2, " %5zd%s",
6129 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
6130 (i+1 < t.level2_size << t.q ? "," : ""));
6132 if (t.level2_size << t.q > 8)
6133 fprintf (stream2, "\n ");
6134 fprintf (stream2, " },\n");
6135 fprintf (stream2, " {");
6136 if (t.level3_size << t.p > 8)
6137 fprintf (stream2, "\n ");
/* Level 3: each stored byte is printed through value_string.
   NOTE(review): the switch that applies CASE is not visible in this
   excerpt; CASE itself stringizes the matched constant name.  */
6138 for (i = 0; i < t.level3_size << t.p; i++)
6140 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6141 const char *value_string;
6144 #define CASE(x) case x: value_string = #x; break;
6181 if (i > 0 && (i % 8) == 0)
6182 fprintf (stream2, "\n ");
6183 fprintf (stream2, " %s%s", value_string,
6184 (i+1 < t.level3_size << t.p ? "," : ""));
6186 if (t.level3_size << t.p > 8)
6187 fprintf (stream2, "\n ");
6188 fprintf (stream2, " }\n");
6189 fprintf (stream2, "};\n");
/* Creates the two generated line break table files.  FILENAME1 and
   FILENAME2 are opened for writing, each receives the "DO NOT EDIT" banner,
   a "Generated automatically by ... for Unicode <version>" line and the GPL
   notice, then output_lbp writes the table into both streams.  Open and
   write failures are reported on stderr.
   The generator named in the banner is gen-uni-tables, matching this
   program and the banner written by output_wbrk_tables.
   NOTE(review): the tail of the banner fprintf and the statements following
   each error message are not visible in this excerpt; presumably VERSION is
   the banner argument and errors lead to exit -- confirm.  */
6193 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6195 const char *filenames[2];
6199 filenames[0] = filename1;
6200 filenames[1] = filename2;
/* Open both output files before writing anything.  */
6202 for (i = 0; i < 2; i++)
6204 streams[i] = fopen (filenames[i], "w");
6205 if (streams[i] == NULL)
6207 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Write the identical banner and license header to both files.  */
6212 for (i = 0; i < 2; i++)
6214 FILE *stream = streams[i];
6216 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6217 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6218 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6220 fprintf (stream, "\n");
6222 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6223 still carries the GPL header), and it's gnulib-tool which replaces the
6224 GPL header with an LGPL header. */
6225 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6226 fprintf (stream, "\n");
6227 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6228 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6229 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6230 fprintf (stream, " (at your option) any later version.\n");
6231 fprintf (stream, "\n");
6232 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6233 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6234 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6235 fprintf (stream, " GNU General Public License for more details.\n");
6236 fprintf (stream, "\n");
6237 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6238 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6239 fprintf (stream, "\n");
6242 output_lbp (streams[0], streams[1]);
/* fclose is part of the check so that errors surfacing at close time are
   reported as well.  */
6244 for (i = 0; i < 2; i++)
6246 if (ferror (streams[i]) || fclose (streams[i]))
6248 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6254 /* ========================================================================= */
6256 /* Word break property. */
6258 /* Possible values of the Word_Break property. */
/* NOTE(review): only the last enumerator is visible in this excerpt.  The
   WBP_* values are stored as plain values in unicode_org_wbp[] and used as
   bit positions (1 << WBP_x) by get_wbp.  */
6273 WBP_EXTENDNUMLET = 7
6276 /* Returns the word breaking property for ch, as a bit mask. */
/* Each class contributes the bit 1 << WBP_<class>.  The classification is
   derived from unicode_attributes[], unicode_properties[], unicode_scripts[]
   and get_lbp, and is attempted only for code points whose name is not
   NULL.
   NOTE(review): several conditions (those guarding CR and LF, parts of the
   Newline, Katakana, ALetter, MidNum and Numeric tests, and the guard of the
   final WBP_OTHER fallback) are not visible in this excerpt.  */
6278 get_wbp (unsigned int ch)
6282 if (unicode_attributes[ch].name != NULL)
6285 attr |= 1 << WBP_CR;
6288 attr |= 1 << WBP_LF;
/* Newline: vertical tab, form feed, line and paragraph separators.  */
6290 if (ch == 0x000B || ch == 0x000C
6292 || ch == 0x2028 || ch == 0x2029)
6293 attr |= 1 << WBP_NEWLINE;
/* Extend: Grapheme_Extend property set, or general category Mc.  */
6295 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6296 || (unicode_attributes[ch].category != NULL
6297 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6298 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200C and U+200D.  */
6300 if (unicode_attributes[ch].category != NULL
6301 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6302 && ch != 0x200C && ch != 0x200D)
6303 attr |= 1 << WBP_FORMAT;
/* Katakana: script Katakana, plus the explicitly listed code points.  */
6305 if ((unicode_scripts[ch] < numscripts
6306 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6307 || (ch >= 0x3031 && ch <= 0x3035)
6308 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6310 attr |= 1 << WBP_KATAKANA;
/* ALetter: Alphabetic (or an alternative on a line not visible here), and
   none of Ideographic, Katakana, line break class SA, script Hiragana or
   Extend.  */
6312 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6314 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6315 && (attr & (1 << WBP_KATAKANA)) == 0
6316 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6317 && !(unicode_scripts[ch] < numscripts
6318 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6319 && (attr & (1 << WBP_EXTEND)) == 0)
6320 attr |= 1 << WBP_ALETTER;
/* MidNumLet and MidLetter are fixed lists of code points.  */
6322 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6323 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6324 attr |= 1 << WBP_MIDNUMLET;
6326 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6327 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6328 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line break class IS or a listed code point, excluding U+003A,
   U+FE13 and U+002E.  */
6330 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6331 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6333 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6334 attr |= 1 << WBP_MIDNUM;
/* Numeric: based on line break class NU.  */
6336 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6338 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
6340 if (unicode_attributes[ch].category != NULL
6341 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6342 attr |= 1 << WBP_EXTENDNUMLET;
/* Fallback class.  NOTE(review): presumably applied only when no other bit
   was set -- the guarding condition is not visible here; confirm.  */
6347 attr |= 1 << WBP_OTHER;
6352 /* Output the word break property in a human readable format. */
/* For every code point from 0 to 0x10FFFF whose get_wbp mask differs from
   1 << WBP_OTHER, writes "0x<hex code point>", one name per set bit and a
   newline to STREAM.  The tests are independent, so a code point carrying
   several bits gets several names on its line.  */
6354 debug_output_wbp (FILE *stream)
6358 for (i = 0; i < 0x110000; i++)
6360 int attr = get_wbp (i);
6361 if (attr != 1 << WBP_OTHER)
6363 fprintf (stream, "0x%04X", i);
6364 if (attr & (1 << WBP_CR))
6365 fprintf (stream, " CR");
6366 if (attr & (1 << WBP_LF))
6367 fprintf (stream, " LF");
6368 if (attr & (1 << WBP_NEWLINE))
6369 fprintf (stream, " Newline");
6370 if (attr & (1 << WBP_EXTEND))
6371 fprintf (stream, " Extend");
6372 if (attr & (1 << WBP_FORMAT))
6373 fprintf (stream, " Format");
6374 if (attr & (1 << WBP_KATAKANA))
6375 fprintf (stream, " Katakana");
6376 if (attr & (1 << WBP_ALETTER))
6377 fprintf (stream, " ALetter");
6378 if (attr & (1 << WBP_MIDNUMLET))
6379 fprintf (stream, " MidNumLet");
6380 if (attr & (1 << WBP_MIDLETTER))
6381 fprintf (stream, " MidLetter");
6382 if (attr & (1 << WBP_MIDNUM))
6383 fprintf (stream, " MidNum");
6384 if (attr & (1 << WBP_NUMERIC))
6385 fprintf (stream, " Numeric");
6386 if (attr & (1 << WBP_EXTENDNUMLET))
6387 fprintf (stream, " ExtendNumLet");
6388 fprintf (stream, "\n");
/* Writes the debug listing produced by debug_output_wbp to the file
   FILENAME, which is opened for writing.  A message is printed to stderr
   when the file cannot be opened, or when ferror or fclose reports a
   problem after writing.
   NOTE(review): what follows each message (presumably an exit) is not
   visible in this excerpt -- confirm.  */
6394 debug_output_wbrk_tables (const char *filename)
6398 stream = fopen (filename, "w");
6401 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6405 debug_output_wbp (stream);
6407 if (ferror (stream) || fclose (stream))
6409 fprintf (stderr, "error writing to '%s'\n", filename);
6414 /* The word break property from the WordBreakProperty.txt file. */
/* One entry per code point (0 .. 0x10FFFF).  Entries hold plain WBP_*
   values, not bit masks; fill_org_wbp initializes them to WBP_OTHER.  */
6415 int unicode_org_wbp[0x110000];
6417 /* Stores in unicode_org_wbp[] the word break property from the
6418 WordBreakProperty.txt file. */
/* Every entry is first reset to WBP_OTHER.  The file is then read line by
   line; empty lines and lines starting with '#' are tested for, the
   remaining lines are parsed as "XXXX..YYYY ; Name" or "XXXX ; Name", and
   the value for Name is stored for every code point of the range.
   NOTE(review): the loop header, the reactions to the individual checks
   (break/continue/exit/abort) and the handling of CR and LF names are not
   visible in this excerpt.  */
6420 fill_org_wbp (const char *wordbreakproperty_filename)
6425 for (i = 0; i < 0x110000; i++)
6426 unicode_org_wbp[i] = WBP_OTHER;
6428 stream = fopen (wordbreakproperty_filename, "r");
6431 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6438 unsigned int i1, i2;
6439 char padding[200+1];
6440 char propname[200+1];
/* Read at most 200 characters up to the newline; the trailing "\n" in the
   format skips the newline and any following whitespace.  */
6443 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
6446 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then fall back to the single code point form.  */
6449 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6451 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6453 fprintf (stderr, "parse error in '%s'\n",
6454 wordbreakproperty_filename);
/* PROP expands to "if (...) ...; else", so the invocations below form one
   if/else-if chain ending in the unknown-value diagnostic.  */
6459 #define PROP(name,value) \
6460 if (strcmp (propname, name) == 0) propvalue = value; else
6463 PROP ("Newline", WBP_NEWLINE)
6464 PROP ("Extend", WBP_EXTEND)
6465 PROP ("Format", WBP_FORMAT)
6466 PROP ("Katakana", WBP_KATAKANA)
6467 PROP ("ALetter", WBP_ALETTER)
6468 PROP ("MidNumLet", WBP_MIDNUMLET)
6469 PROP ("MidLetter", WBP_MIDLETTER)
6470 PROP ("MidNum", WBP_MIDNUM)
6471 PROP ("Numeric", WBP_NUMERIC)
6472 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6475 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6476 wordbreakproperty_filename);
/* Sanity check of the range before indexing unicode_org_wbp[].  */
6479 if (!(i1 <= i2 && i2 < 0x110000))
6482 for (i = i1; i <= i2; i++)
6483 unicode_org_wbp[i] = propvalue;
6486 if (ferror (stream) || fclose (stream))
6488 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6493 /* Output the word break property in a human readable format. */
/* This variant lists the values stored in unicode_org_wbp[].  For every
   code point from 0 to 0x10FFFF whose entry differs from WBP_OTHER it
   writes "0x<hex code point>", the property name and a newline to STREAM;
   " ??" is the final branch of the chain, for values without a name.  */
6495 debug_output_org_wbp (FILE *stream)
6499 for (i = 0; i < 0x110000; i++)
6501 int propvalue = unicode_org_wbp[i];
6502 if (propvalue != WBP_OTHER)
6504 fprintf (stream, "0x%04X", i);
/* PROP expands to "if (...) ...; else", so the invocations below form one
   if/else-if chain.  */
6505 #define PROP(name,value) \
6506 if (propvalue == value) fprintf (stream, " " name); else
6509 PROP ("Newline", WBP_NEWLINE)
6510 PROP ("Extend", WBP_EXTEND)
6511 PROP ("Format", WBP_FORMAT)
6512 PROP ("Katakana", WBP_KATAKANA)
6513 PROP ("ALetter", WBP_ALETTER)
6514 PROP ("MidNumLet", WBP_MIDNUMLET)
6515 PROP ("MidLetter", WBP_MIDLETTER)
6516 PROP ("MidNum", WBP_MIDNUM)
6517 PROP ("Numeric", WBP_NUMERIC)
6518 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6520 fprintf (stream, " ??");
6521 fprintf (stream, "\n");
/* Writes the debug listing produced by debug_output_org_wbp to the file
   FILENAME, which is opened for writing.  A message is printed to stderr
   when the file cannot be opened, or when ferror or fclose reports a
   problem after writing.
   NOTE(review): what follows each message (presumably an exit) is not
   visible in this excerpt -- confirm.  */
6527 debug_output_org_wbrk_tables (const char *filename)
6531 stream = fopen (filename, "w");
6534 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6538 debug_output_org_wbp (stream);
6540 if (ferror (stream) || fclose (stream))
6542 fprintf (stderr, "error writing to '%s'\n", filename);
6547 /* Construction of sparse 3-level tables. */
/* Parameter macros for the word break property table: TABLE, ELEMENT and
   DEFAULT are set to wbp_table, unsigned char and WBP_OTHER, and
   xmalloc/xrealloc are mapped onto plain malloc/realloc.
   NOTE(review): the code that consumes these macros is not visible in this
   excerpt; presumably a table template is included right after -- confirm.  */
6548 #define TABLE wbp_table
6549 #define ELEMENT unsigned char
6550 #define DEFAULT WBP_OTHER
6551 #define xmalloc malloc
6552 #define xrealloc realloc
/* Builds the 3-level word break property table and emits it as C source
   on STREAM: the wbrkprop_header_N defines, the wbrkprop_t typedef and the
   static definition of uniwbrkprop with its three array initializers.  */
6556 output_wbp (FILE *stream)
6560 unsigned int level1_offset, level2_offset, level3_offset;
6564 wbp_table_init (&t);
/* Fill the table from get_wbp for every code point.  */
6566 for (i = 0; i < 0x110000; i++)
6568 int attr = get_wbp (i);
6570 /* Now attr should contain exactly one bit. */
/* NOTE(review): the reaction to a violated check is not visible in this
   excerpt -- presumably the program aborts; confirm.  */
6571 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Entries equal to the default 1 << WBP_OTHER are not added.  */
6574 if (attr != 1 << WBP_OTHER)
6576 unsigned int log2_attr;
/* Convert the single-bit mask into its bit index.  */
6577 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6579 wbp_table_add (&t, i, log2_attr);
6583 wbp_table_finalize (&t);
/* The three expressions below are byte offsets: past 5 uint32_t words, past
   the level1 entries, and past the level2 entries.
   NOTE(review): their assignment targets are not visible in this excerpt;
   presumably level1_offset, level2_offset and level3_offset -- confirm.  */
6586 5 * sizeof (uint32_t);
6588 5 * sizeof (uint32_t)
6589 + t.level1_size * sizeof (uint32_t);
6591 5 * sizeof (uint32_t)
6592 + t.level1_size * sizeof (uint32_t)
6593 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as preprocessor defines.  */
6595 for (i = 0; i < 5; i++)
6596 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6597 ((uint32_t *) t.result)[i]);
6598 fprintf (stream, "\n");
6599 fprintf (stream, "typedef struct\n");
6600 fprintf (stream, " {\n");
6601 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6602 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6603 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6604 fprintf (stream, " }\n");
6605 fprintf (stream, "wbrkprop_t;\n");
6606 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6607 fprintf (stream, "{\n");
6608 fprintf (stream, " {");
6609 if (t.level1_size > 8)
6610 fprintf (stream, "\n ");
/* Level 1: a zero offset is printed as -1, any other offset as an index
   relative to level2_offset in uint32_t units; a line break is emitted
   every 8 entries.  */
6611 for (i = 0; i < t.level1_size; i++)
6614 if (i > 0 && (i % 8) == 0)
6615 fprintf (stream, "\n ");
6616 offset = ((uint32_t *) (t.result + level1_offset))[i];
6617 fprintf (stream, " %5zd%s",
6618 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
6619 (i+1 < t.level1_size ? "," : ""));
6621 if (t.level1_size > 8)
6622 fprintf (stream, "\n ");
6623 fprintf (stream, " },\n");
6624 fprintf (stream, " {");
6625 if (t.level2_size << t.q > 8)
6626 fprintf (stream, "\n ");
/* Level 2: same scheme, with indices relative to level3_offset in uint8_t
   units.  */
6627 for (i = 0; i < t.level2_size << t.q; i++)
6630 if (i > 0 && (i % 8) == 0)
6631 fprintf (stream, "\n ");
6632 offset = ((uint32_t *) (t.result + level2_offset))[i];
6633 fprintf (stream, " %5zd%s",
6634 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
6635 (i+1 < t.level2_size << t.q ? "," : ""));
6637 if (t.level2_size << t.q > 8)
6638 fprintf (stream, "\n ");
6639 fprintf (stream, " },\n");
6640 fprintf (stream, " {");
6641 if (t.level3_size << t.p > 4)
6642 fprintf (stream, "\n ");
/* Level 3: each stored byte is printed through value_string, with a line
   break every 4 entries.
   NOTE(review): the switch header and several CASE lines are not visible in
   this excerpt; CASE itself stringizes the matched constant name.  */
6643 for (i = 0; i < t.level3_size << t.p; i++)
6645 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6646 const char *value_string;
6649 #define CASE(x) case x: value_string = #x; break;
6658 CASE(WBP_MIDNUMLET);
6659 CASE(WBP_MIDLETTER);
6662 CASE(WBP_EXTENDNUMLET);
6667 if (i > 0 && (i % 4) == 0)
6668 fprintf (stream, "\n ");
6669 fprintf (stream, " %s%s", value_string,
6670 (i+1 < t.level3_size << t.p ? "," : ""));
6672 if (t.level3_size << t.p > 4)
6673 fprintf (stream, "\n ");
6674 fprintf (stream, " }\n");
6675 fprintf (stream, "};\n");
/* Creates the generated word break table file.  FILENAME is opened for
   writing and receives the "DO NOT EDIT" banner, a one-line description, a
   "Generated automatically by gen-uni-tables for Unicode <version>" line
   and the GPL notice; output_wbp then writes the table.  Open and write
   failures are reported on stderr.
   The description line says "Word breaking properties", since this file
   holds the Word_Break table rather than the line breaking table.
   NOTE(review): the tail of the banner fprintf and the statements following
   each error message are not visible in this excerpt; presumably VERSION is
   the banner argument and errors lead to exit -- confirm.  */
6679 output_wbrk_tables (const char *filename, const char *version)
6683 stream = fopen (filename, "w");
6686 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6690 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6691 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
6692 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6694 fprintf (stream, "\n");
6696 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6697 still carries the GPL header), and it's gnulib-tool which replaces the
6698 GPL header with an LGPL header. */
6699 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
6700 fprintf (stream, "\n");
6701 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6702 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6703 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6704 fprintf (stream, " (at your option) any later version.\n");
6705 fprintf (stream, "\n");
6706 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6707 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6708 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6709 fprintf (stream, " GNU General Public License for more details.\n");
6710 fprintf (stream, "\n");
6711 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6712 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6713 fprintf (stream, "\n");
6715 output_wbp (stream);
/* fclose is part of the check so that errors surfacing at close time are
   reported as well.  */
6717 if (ferror (stream) || fclose (stream))
6719 fprintf (stderr, "error writing to '%s'\n", filename);
6724 /* ========================================================================= */
/* Program entry point.  Takes the nine Unicode data file names from
   argv[1] .. argv[9], loads them into the in-memory tables through the
   fill_* functions, then writes all generated headers, test data and debug
   listings to fixed relative paths (unictype/, ../tests/unictype/, unilbrk/
   and uniwbrk/).
   NOTE(review): the argument count check guarding the usage message, the
   assignment of version and the return statement are not visible in this
   excerpt; the usage text lists "version" as the tenth argument.  */
6727 main (int argc, char * argv[])
6729 const char *unicodedata_filename;
6730 const char *proplist_filename;
6731 const char *derivedproplist_filename;
6732 const char *scripts_filename;
6733 const char *blocks_filename;
6734 const char *proplist30_filename;
6735 const char *eastasianwidth_filename;
6736 const char *linebreak_filename;
6737 const char *wordbreakproperty_filename;
6738 const char *version;
6742 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt version\n",
6747 unicodedata_filename = argv[1];
6748 proplist_filename = argv[2];
6749 derivedproplist_filename = argv[3];
6750 scripts_filename = argv[4];
6751 blocks_filename = argv[5];
6752 proplist30_filename = argv[6];
6753 eastasianwidth_filename = argv[7];
6754 linebreak_filename = argv[8];
6755 wordbreakproperty_filename = argv[9];
/* Load all input files.  fill_properties is called for both PropList.txt
   and DerivedCoreProperties.txt, after clear_properties.  */
6758 fill_attributes (unicodedata_filename);
6759 clear_properties ();
6760 fill_properties (proplist_filename);
6761 fill_properties (derivedproplist_filename);
6762 fill_properties30 (proplist30_filename);
6763 fill_scripts (scripts_filename);
6764 fill_blocks (blocks_filename);
6765 fill_width (eastasianwidth_filename);
6766 fill_org_lbp (linebreak_filename);
6767 fill_org_wbp (wordbreakproperty_filename);
/* Character classification outputs.  */
6769 output_categories (version);
6770 output_category ("unictype/categ_of.h", version);
6771 output_combclass ("unictype/combining.h", version);
6772 output_bidi_category ("unictype/bidi_of.h", version);
6773 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
6774 output_decimal_digit ("unictype/decdigit.h", version);
6775 output_digit_test ("../tests/unictype/test-digit.h", version);
6776 output_digit ("unictype/digit.h", version);
6777 output_numeric_test ("../tests/unictype/test-numeric.h", version);
6778 output_numeric ("unictype/numeric.h", version);
6779 output_mirror ("unictype/mirror.h", version);
6780 output_properties (version);
6781 output_scripts (version);
6782 output_scripts_byname (version);
6783 output_blocks (version);
6784 output_ident_properties (version);
6785 output_old_ctype (version);
/* Line break outputs: computed listing, listing of the values read from
   LineBreak.txt, and the generated tables.  */
6787 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
6788 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
6789 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
/* Word break outputs: computed listing, listing of the values read from
   WordBreakProperty.txt, and the generated table.  */
6791 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
6792 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
6793 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
6799 * For Emacs M-x compile
6801 * compile-command: "
6802 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
6804 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
6805 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
6806 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
6807 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
6808 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
6809 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
6810 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
6811 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
6812 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \