1 /* Generate Unicode conforming character classification tables and
2 Line Break Properties tables from a UnicodeData file.
3 Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
4 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example (the trailing arguments are not shown in this copy):
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt ...
 */
38 /* ========================================================================= */
40 /* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   String members hold "" when the field is empty; the three case mapping
   members hold NONE when the field is empty.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
/* Marker stored in the upper/lower/title members when the corresponding
   mapping field is empty: an unsigned int with all bits set.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, one slot per index
   below 0x110000.  A slot whose name member is NULL has not been filled.  */
struct unicode_attribute unicode_attributes [0x110000];
69 /* Stores in unicode_attributes[i] the values from the given fields. */
71 fill_attribute (unsigned int i,
72 const char *field1, const char *field2,
73 const char *field3, const char *field4,
74 const char *field5, const char *field6,
75 const char *field7, const char *field8,
76 const char *field9, const char *field10,
77 const char *field11, const char *field12,
78 const char *field13, const char *field14)
80 struct unicode_attribute * uni;
84 fprintf (stderr, "index too large\n");
87 if (strcmp (field2, "Cs") == 0)
88 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
90 uni = &unicode_attributes[i];
91 /* Copy the strings. */
92 uni->name = strdup (field1);
93 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
94 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
95 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
96 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
97 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
98 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
99 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
100 uni->mirrored = (field9[0] == 'Y');
101 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
102 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
103 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
104 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
105 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the defining line is not present in this copy of the file;
   120 is believed to be the upstream value - confirm.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are dropped.
   BUFFER is always NUL-terminated on return, also at end of input.
   Exits the program if the field does not fit.
   Returns 1 when a field was successfully read, otherwise 0.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  for (; (c = getc (stream)), (c != EOF && c != delim); )
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate unconditionally so the caller never sees stale bytes.  */
  *buffer = '\0';

  if (c == EOF)
    return 0;

  return 1;
}
143 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
146 fill_attributes (const char *unicodedata_filename)
150 char field0[FIELDLEN];
151 char field1[FIELDLEN];
152 char field2[FIELDLEN];
153 char field3[FIELDLEN];
154 char field4[FIELDLEN];
155 char field5[FIELDLEN];
156 char field6[FIELDLEN];
157 char field7[FIELDLEN];
158 char field8[FIELDLEN];
159 char field9[FIELDLEN];
160 char field10[FIELDLEN];
161 char field11[FIELDLEN];
162 char field12[FIELDLEN];
163 char field13[FIELDLEN];
164 char field14[FIELDLEN];
167 for (i = 0; i < 0x110000; i++)
168 unicode_attributes[i].name = NULL;
170 stream = fopen (unicodedata_filename, "r");
173 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
182 n = getfield (stream, field0, ';');
183 n += getfield (stream, field1, ';');
184 n += getfield (stream, field2, ';');
185 n += getfield (stream, field3, ';');
186 n += getfield (stream, field4, ';');
187 n += getfield (stream, field5, ';');
188 n += getfield (stream, field6, ';');
189 n += getfield (stream, field7, ';');
190 n += getfield (stream, field8, ';');
191 n += getfield (stream, field9, ';');
192 n += getfield (stream, field10, ';');
193 n += getfield (stream, field11, ';');
194 n += getfield (stream, field12, ';');
195 n += getfield (stream, field13, ';');
196 n += getfield (stream, field14, '\n');
201 fprintf (stderr, "short line in '%s':%d\n",
202 unicodedata_filename, lineno);
205 i = strtoul (field0, NULL, 16);
207 && strlen (field1) >= 9
208 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
210 /* Deal with a range. */
212 n = getfield (stream, field0, ';');
213 n += getfield (stream, field1, ';');
214 n += getfield (stream, field2, ';');
215 n += getfield (stream, field3, ';');
216 n += getfield (stream, field4, ';');
217 n += getfield (stream, field5, ';');
218 n += getfield (stream, field6, ';');
219 n += getfield (stream, field7, ';');
220 n += getfield (stream, field8, ';');
221 n += getfield (stream, field9, ';');
222 n += getfield (stream, field10, ';');
223 n += getfield (stream, field11, ';');
224 n += getfield (stream, field12, ';');
225 n += getfield (stream, field13, ';');
226 n += getfield (stream, field14, '\n');
229 fprintf (stderr, "missing end range in '%s':%d\n",
230 unicodedata_filename, lineno);
233 if (!(field1[0] == '<'
234 && strlen (field1) >= 8
235 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
237 fprintf (stderr, "missing end range in '%s':%d\n",
238 unicodedata_filename, lineno);
241 field1[strlen (field1) - 7] = '\0';
242 j = strtoul (field0, NULL, 16);
244 fill_attribute (i, field1+1, field2, field3, field4, field5,
245 field6, field7, field8, field9, field10,
246 field11, field12, field13, field14);
250 /* Single character line */
251 fill_attribute (i, field1, field2, field3, field4, field5,
252 field6, field7, field8, field9, field10,
253 field11, field12, field13, field14);
256 if (ferror (stream) || fclose (stream))
258 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
263 /* ========================================================================= */
/* General category.  */
/* See Unicode 3.0 book, section 4.5.  */
270 is_category_L (unsigned int ch)
272 return (unicode_attributes[ch].name != NULL
273 && unicode_attributes[ch].category[0] == 'L');
277 is_category_Lu (unsigned int ch)
279 return (unicode_attributes[ch].name != NULL
280 && unicode_attributes[ch].category[0] == 'L'
281 && unicode_attributes[ch].category[1] == 'u');
285 is_category_Ll (unsigned int ch)
287 return (unicode_attributes[ch].name != NULL
288 && unicode_attributes[ch].category[0] == 'L'
289 && unicode_attributes[ch].category[1] == 'l');
293 is_category_Lt (unsigned int ch)
295 return (unicode_attributes[ch].name != NULL
296 && unicode_attributes[ch].category[0] == 'L'
297 && unicode_attributes[ch].category[1] == 't');
301 is_category_Lm (unsigned int ch)
303 return (unicode_attributes[ch].name != NULL
304 && unicode_attributes[ch].category[0] == 'L'
305 && unicode_attributes[ch].category[1] == 'm');
309 is_category_Lo (unsigned int ch)
311 return (unicode_attributes[ch].name != NULL
312 && unicode_attributes[ch].category[0] == 'L'
313 && unicode_attributes[ch].category[1] == 'o');
317 is_category_M (unsigned int ch)
319 return (unicode_attributes[ch].name != NULL
320 && unicode_attributes[ch].category[0] == 'M');
324 is_category_Mn (unsigned int ch)
326 return (unicode_attributes[ch].name != NULL
327 && unicode_attributes[ch].category[0] == 'M'
328 && unicode_attributes[ch].category[1] == 'n');
332 is_category_Mc (unsigned int ch)
334 return (unicode_attributes[ch].name != NULL
335 && unicode_attributes[ch].category[0] == 'M'
336 && unicode_attributes[ch].category[1] == 'c');
340 is_category_Me (unsigned int ch)
342 return (unicode_attributes[ch].name != NULL
343 && unicode_attributes[ch].category[0] == 'M'
344 && unicode_attributes[ch].category[1] == 'e');
348 is_category_N (unsigned int ch)
350 return (unicode_attributes[ch].name != NULL
351 && unicode_attributes[ch].category[0] == 'N');
355 is_category_Nd (unsigned int ch)
357 return (unicode_attributes[ch].name != NULL
358 && unicode_attributes[ch].category[0] == 'N'
359 && unicode_attributes[ch].category[1] == 'd');
363 is_category_Nl (unsigned int ch)
365 return (unicode_attributes[ch].name != NULL
366 && unicode_attributes[ch].category[0] == 'N'
367 && unicode_attributes[ch].category[1] == 'l');
371 is_category_No (unsigned int ch)
373 return (unicode_attributes[ch].name != NULL
374 && unicode_attributes[ch].category[0] == 'N'
375 && unicode_attributes[ch].category[1] == 'o');
379 is_category_P (unsigned int ch)
381 return (unicode_attributes[ch].name != NULL
382 && unicode_attributes[ch].category[0] == 'P');
386 is_category_Pc (unsigned int ch)
388 return (unicode_attributes[ch].name != NULL
389 && unicode_attributes[ch].category[0] == 'P'
390 && unicode_attributes[ch].category[1] == 'c');
394 is_category_Pd (unsigned int ch)
396 return (unicode_attributes[ch].name != NULL
397 && unicode_attributes[ch].category[0] == 'P'
398 && unicode_attributes[ch].category[1] == 'd');
402 is_category_Ps (unsigned int ch)
404 return (unicode_attributes[ch].name != NULL
405 && unicode_attributes[ch].category[0] == 'P'
406 && unicode_attributes[ch].category[1] == 's');
410 is_category_Pe (unsigned int ch)
412 return (unicode_attributes[ch].name != NULL
413 && unicode_attributes[ch].category[0] == 'P'
414 && unicode_attributes[ch].category[1] == 'e');
418 is_category_Pi (unsigned int ch)
420 return (unicode_attributes[ch].name != NULL
421 && unicode_attributes[ch].category[0] == 'P'
422 && unicode_attributes[ch].category[1] == 'i');
426 is_category_Pf (unsigned int ch)
428 return (unicode_attributes[ch].name != NULL
429 && unicode_attributes[ch].category[0] == 'P'
430 && unicode_attributes[ch].category[1] == 'f');
434 is_category_Po (unsigned int ch)
436 return (unicode_attributes[ch].name != NULL
437 && unicode_attributes[ch].category[0] == 'P'
438 && unicode_attributes[ch].category[1] == 'o');
442 is_category_S (unsigned int ch)
444 return (unicode_attributes[ch].name != NULL
445 && unicode_attributes[ch].category[0] == 'S');
449 is_category_Sm (unsigned int ch)
451 return (unicode_attributes[ch].name != NULL
452 && unicode_attributes[ch].category[0] == 'S'
453 && unicode_attributes[ch].category[1] == 'm');
457 is_category_Sc (unsigned int ch)
459 return (unicode_attributes[ch].name != NULL
460 && unicode_attributes[ch].category[0] == 'S'
461 && unicode_attributes[ch].category[1] == 'c');
465 is_category_Sk (unsigned int ch)
467 return (unicode_attributes[ch].name != NULL
468 && unicode_attributes[ch].category[0] == 'S'
469 && unicode_attributes[ch].category[1] == 'k');
473 is_category_So (unsigned int ch)
475 return (unicode_attributes[ch].name != NULL
476 && unicode_attributes[ch].category[0] == 'S'
477 && unicode_attributes[ch].category[1] == 'o');
481 is_category_Z (unsigned int ch)
483 return (unicode_attributes[ch].name != NULL
484 && unicode_attributes[ch].category[0] == 'Z');
488 is_category_Zs (unsigned int ch)
490 return (unicode_attributes[ch].name != NULL
491 && unicode_attributes[ch].category[0] == 'Z'
492 && unicode_attributes[ch].category[1] == 's');
496 is_category_Zl (unsigned int ch)
498 return (unicode_attributes[ch].name != NULL
499 && unicode_attributes[ch].category[0] == 'Z'
500 && unicode_attributes[ch].category[1] == 'l');
504 is_category_Zp (unsigned int ch)
506 return (unicode_attributes[ch].name != NULL
507 && unicode_attributes[ch].category[0] == 'Z'
508 && unicode_attributes[ch].category[1] == 'p');
512 is_category_C (unsigned int ch)
514 return (unicode_attributes[ch].name == NULL
515 || unicode_attributes[ch].category[0] == 'C');
519 is_category_Cc (unsigned int ch)
521 return (unicode_attributes[ch].name != NULL
522 && unicode_attributes[ch].category[0] == 'C'
523 && unicode_attributes[ch].category[1] == 'c');
527 is_category_Cf (unsigned int ch)
529 return (unicode_attributes[ch].name != NULL
530 && unicode_attributes[ch].category[0] == 'C'
531 && unicode_attributes[ch].category[1] == 'f');
/* True when CH lies in the range 0xd800..0xdfff.  This is decided from the
   numeric range alone, without consulting unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
541 is_category_Co (unsigned int ch)
543 return (unicode_attributes[ch].name != NULL
544 && unicode_attributes[ch].category[0] == 'C'
545 && unicode_attributes[ch].category[1] == 'o');
549 is_category_Cn (unsigned int ch)
551 return (unicode_attributes[ch].name == NULL
552 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive values below
   0x110000 for which PREDICATE is true: "0xFIRST..0xLAST" for a run of
   two or more, "0xVALUE" for a single one.
   Exits with a message if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate keeps holding.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source consisting of a license header, an include
   of "test-predicate-part1.h", one "{ first, last }" entry per maximal run
   of values below 0x110000 for which PREDICATE is true, a definition of
   PREDICATE(c) as EXPRESSION, and an include of "test-predicate-part2.h".
   Exits with a message if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* Entries are separated by ",\n"; the separator is written before every
     entry except the first, so no trailing comma is produced.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.  */
/* Settings read by the header included below.  TABLE is the name prefix
   of what it provides: this file goes on to use struct predicate_table
   and predicate_table_init/_add/_finalize.  */
#define TABLE predicate_table
/* The x-prefixed allocator names are mapped straight to the C library
   ones, so no out-of-memory check is added by these macros.  */
#define xmalloc malloc
#define xrealloc realloc
#include "3levelbit.h"
669 /* Output a boolean property in a three-level bitmap. */
671 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
675 struct predicate_table t;
676 unsigned int level1_offset, level2_offset, level3_offset;
678 stream = fopen (filename, "w");
681 fprintf (stderr, "cannot open '%s' for writing\n", filename);
685 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
686 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
687 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
692 predicate_table_init (&t);
694 for (ch = 0; ch < 0x110000; ch++)
696 predicate_table_add (&t, ch);
698 predicate_table_finalize (&t);
700 /* Offsets in t.result, in memory of this process. */
702 5 * sizeof (uint32_t);
704 5 * sizeof (uint32_t)
705 + t.level1_size * sizeof (uint32_t);
707 5 * sizeof (uint32_t)
708 + t.level1_size * sizeof (uint32_t)
709 + (t.level2_size << t.q) * sizeof (uint32_t);
711 for (i = 0; i < 5; i++)
713 fprintf (stream, "#define header_%d %d\n", i,
714 ((uint32_t *) t.result)[i]);
716 fprintf (stream, "static const\n");
717 fprintf (stream, "struct\n");
718 fprintf (stream, " {\n");
719 fprintf (stream, " int header[1];\n");
720 fprintf (stream, " int level1[%zu];\n", t.level1_size);
721 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
722 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
723 fprintf (stream, " }\n");
724 fprintf (stream, "%s =\n", name);
725 fprintf (stream, "{\n");
726 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
727 fprintf (stream, " {");
728 if (t.level1_size > 1)
729 fprintf (stream, "\n ");
730 for (i = 0; i < t.level1_size; i++)
733 if (i > 0 && (i % 1) == 0)
734 fprintf (stream, "\n ");
735 offset = ((uint32_t *) (t.result + level1_offset))[i];
737 fprintf (stream, " %5d", -1);
739 fprintf (stream, " %5zd * sizeof (int) / sizeof (short) + %5zd",
740 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
741 if (i+1 < t.level1_size)
742 fprintf (stream, ",");
744 if (t.level1_size > 1)
745 fprintf (stream, "\n ");
746 fprintf (stream, " },\n");
747 fprintf (stream, " {");
748 if (t.level2_size << t.q > 1)
749 fprintf (stream, "\n ");
750 for (i = 0; i < t.level2_size << t.q; i++)
753 if (i > 0 && (i % 1) == 0)
754 fprintf (stream, "\n ");
755 offset = ((uint32_t *) (t.result + level2_offset))[i];
757 fprintf (stream, " %5d", -1);
759 fprintf (stream, " %5zd + %5zd * sizeof (short) / sizeof (int) + %5zd",
760 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
761 if (i+1 < t.level2_size << t.q)
762 fprintf (stream, ",");
764 if (t.level2_size << t.q > 1)
765 fprintf (stream, "\n ");
766 fprintf (stream, " },\n");
767 fprintf (stream, " {");
768 if (t.level3_size << t.p > 4)
769 fprintf (stream, "\n ");
770 for (i = 0; i < t.level3_size << t.p; i++)
772 if (i > 0 && (i % 4) == 0)
773 fprintf (stream, "\n ");
774 fprintf (stream, " 0x%08X",
775 ((uint32_t *) (t.result + level3_offset))[i]);
776 if (i+1 < t.level3_size << t.p)
777 fprintf (stream, ",");
779 if (t.level3_size << t.p > 4)
780 fprintf (stream, "\n ");
781 fprintf (stream, " }\n");
782 fprintf (stream, "};\n");
784 if (ferror (stream) || fclose (stream))
786 fprintf (stderr, "error writing to '%s'\n", filename);
791 /* Output all categories. */
793 output_categories (const char *version)
795 #define CATEGORY(C) \
796 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
797 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
798 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Every two-letter category owns
   one bit; every one-letter category is the union of the bits of the
   two-letter categories sharing its first letter.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
881 general_category_byname (const char *category_name)
883 if (category_name[0] != '\0'
884 && (category_name[1] == '\0' || category_name[2] == '\0'))
885 switch (category_name[0])
888 switch (category_name[1])
890 case '\0': return UC_CATEGORY_MASK_L;
891 case 'u': return UC_CATEGORY_MASK_Lu;
892 case 'l': return UC_CATEGORY_MASK_Ll;
893 case 't': return UC_CATEGORY_MASK_Lt;
894 case 'm': return UC_CATEGORY_MASK_Lm;
895 case 'o': return UC_CATEGORY_MASK_Lo;
899 switch (category_name[1])
901 case '\0': return UC_CATEGORY_MASK_M;
902 case 'n': return UC_CATEGORY_MASK_Mn;
903 case 'c': return UC_CATEGORY_MASK_Mc;
904 case 'e': return UC_CATEGORY_MASK_Me;
908 switch (category_name[1])
910 case '\0': return UC_CATEGORY_MASK_N;
911 case 'd': return UC_CATEGORY_MASK_Nd;
912 case 'l': return UC_CATEGORY_MASK_Nl;
913 case 'o': return UC_CATEGORY_MASK_No;
917 switch (category_name[1])
919 case '\0': return UC_CATEGORY_MASK_P;
920 case 'c': return UC_CATEGORY_MASK_Pc;
921 case 'd': return UC_CATEGORY_MASK_Pd;
922 case 's': return UC_CATEGORY_MASK_Ps;
923 case 'e': return UC_CATEGORY_MASK_Pe;
924 case 'i': return UC_CATEGORY_MASK_Pi;
925 case 'f': return UC_CATEGORY_MASK_Pf;
926 case 'o': return UC_CATEGORY_MASK_Po;
930 switch (category_name[1])
932 case '\0': return UC_CATEGORY_MASK_S;
933 case 'm': return UC_CATEGORY_MASK_Sm;
934 case 'c': return UC_CATEGORY_MASK_Sc;
935 case 'k': return UC_CATEGORY_MASK_Sk;
936 case 'o': return UC_CATEGORY_MASK_So;
940 switch (category_name[1])
942 case '\0': return UC_CATEGORY_MASK_Z;
943 case 's': return UC_CATEGORY_MASK_Zs;
944 case 'l': return UC_CATEGORY_MASK_Zl;
945 case 'p': return UC_CATEGORY_MASK_Zp;
949 switch (category_name[1])
951 case '\0': return UC_CATEGORY_MASK_C;
952 case 'c': return UC_CATEGORY_MASK_Cc;
953 case 'f': return UC_CATEGORY_MASK_Cf;
954 case 's': return UC_CATEGORY_MASK_Cs;
955 case 'o': return UC_CATEGORY_MASK_Co;
956 case 'n': return UC_CATEGORY_MASK_Cn;
960 /* Invalid category name. */
964 /* Construction of sparse 3-level tables. */
965 #define TABLE category_table
966 #define ELEMENT uint8_t
967 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
968 #define xmalloc malloc
969 #define xrealloc realloc
972 /* Output the per-character category table. */
974 output_category (const char *filename, const char *version)
978 struct category_table t;
979 unsigned int level1_offset, level2_offset, level3_offset;
980 uint16_t *level3_packed;
982 stream = fopen (filename, "w");
985 fprintf (stderr, "cannot open '%s' for writing\n", filename);
989 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
990 fprintf (stream, "/* Categories of Unicode characters. */\n");
991 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
996 category_table_init (&t);
998 for (ch = 0; ch < 0x110000; ch++)
1001 unsigned int log2_value;
1003 if (is_category_Cs (ch))
1004 value = UC_CATEGORY_MASK_Cs;
1005 else if (unicode_attributes[ch].name != NULL)
1006 value = general_category_byname (unicode_attributes[ch].category);
1010 /* Now value should contain exactly one bit. */
1011 if (value == 0 || ((value & (value - 1)) != 0))
1014 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1016 category_table_add (&t, ch, log2_value);
1019 category_table_finalize (&t);
1021 /* Offsets in t.result, in memory of this process. */
1023 5 * sizeof (uint32_t);
1025 5 * sizeof (uint32_t)
1026 + t.level1_size * sizeof (uint32_t);
1028 5 * sizeof (uint32_t)
1029 + t.level1_size * sizeof (uint32_t)
1030 + (t.level2_size << t.q) * sizeof (uint32_t);
1032 for (i = 0; i < 5; i++)
1033 fprintf (stream, "#define category_header_%d %d\n", i,
1034 ((uint32_t *) t.result)[i]);
1035 fprintf (stream, "static const\n");
1036 fprintf (stream, "struct\n");
1037 fprintf (stream, " {\n");
1038 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1039 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1040 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1041 (1 << t.p) * 5 / 16);
1042 fprintf (stream, " }\n");
1043 fprintf (stream, "u_category =\n");
1044 fprintf (stream, "{\n");
1045 fprintf (stream, " {");
1046 if (t.level1_size > 8)
1047 fprintf (stream, "\n ");
1048 for (i = 0; i < t.level1_size; i++)
1051 if (i > 0 && (i % 8) == 0)
1052 fprintf (stream, "\n ");
1053 offset = ((uint32_t *) (t.result + level1_offset))[i];
1055 fprintf (stream, " %5d", -1);
1057 fprintf (stream, " %5zd",
1058 (offset - level2_offset) / sizeof (uint32_t));
1059 if (i+1 < t.level1_size)
1060 fprintf (stream, ",");
1062 if (t.level1_size > 8)
1063 fprintf (stream, "\n ");
1064 fprintf (stream, " },\n");
1065 fprintf (stream, " {");
1066 if (t.level2_size << t.q > 8)
1067 fprintf (stream, "\n ");
1068 for (i = 0; i < t.level2_size << t.q; i++)
1071 if (i > 0 && (i % 8) == 0)
1072 fprintf (stream, "\n ");
1073 offset = ((uint32_t *) (t.result + level2_offset))[i];
1075 fprintf (stream, " %5d", -1);
1077 fprintf (stream, " %5zd",
1078 (offset - level3_offset) / sizeof (uint8_t));
1079 if (i+1 < t.level2_size << t.q)
1080 fprintf (stream, ",");
1082 if (t.level2_size << t.q > 8)
1083 fprintf (stream, "\n ");
1084 fprintf (stream, " },\n");
1085 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1086 not 32-bit units, in order to make the lookup function easier. */
1089 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1090 for (i = 0; i < t.level3_size << t.p; i++)
1092 unsigned int j = (i * 5) / 16;
1093 unsigned int k = (i * 5) % 16;
1094 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1095 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1096 level3_packed[j] = value & 0xffff;
1097 level3_packed[j+1] = value >> 16;
1099 fprintf (stream, " {");
1100 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1101 fprintf (stream, "\n ");
1102 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1104 if (i > 0 && (i % 8) == 0)
1105 fprintf (stream, "\n ");
1106 fprintf (stream, " 0x%04x", level3_packed[i]);
1107 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1108 fprintf (stream, ",");
1110 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1111 fprintf (stream, "\n ");
1112 fprintf (stream, " }\n");
1113 free (level3_packed);
1114 fprintf (stream, "};\n");
1116 if (ferror (stream) || fclose (stream))
1118 fprintf (stderr, "error writing to '%s'\n", filename);
1123 /* ========================================================================= */
/* Canonical combining class.  */
/* See Unicode 3.0 book, section 4.2.  */
1129 /* Construction of sparse 3-level tables. */
1130 #define TABLE combclass_table
1131 #define ELEMENT uint8_t
1133 #define xmalloc malloc
1134 #define xrealloc realloc
1137 /* Output the per-character combining class table. */
1139 output_combclass (const char *filename, const char *version)
1143 struct combclass_table t;
1144 unsigned int level1_offset, level2_offset, level3_offset;
1146 stream = fopen (filename, "w");
1149 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1153 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1154 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1155 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1160 combclass_table_init (&t);
1162 for (ch = 0; ch < 0x110000; ch++)
1163 if (unicode_attributes[ch].name != NULL)
1165 int value = atoi (unicode_attributes[ch].combining);
1166 if (!(value >= 0 && value <= 255))
1168 combclass_table_add (&t, ch, value);
1171 combclass_table_finalize (&t);
1173 /* Offsets in t.result, in memory of this process. */
1175 5 * sizeof (uint32_t);
1177 5 * sizeof (uint32_t)
1178 + t.level1_size * sizeof (uint32_t);
1180 5 * sizeof (uint32_t)
1181 + t.level1_size * sizeof (uint32_t)
1182 + (t.level2_size << t.q) * sizeof (uint32_t);
1184 for (i = 0; i < 5; i++)
1185 fprintf (stream, "#define combclass_header_%d %d\n", i,
1186 ((uint32_t *) t.result)[i]);
1187 fprintf (stream, "static const\n");
1188 fprintf (stream, "struct\n");
1189 fprintf (stream, " {\n");
1190 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1191 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1192 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1193 fprintf (stream, " }\n");
1194 fprintf (stream, "u_combclass =\n");
1195 fprintf (stream, "{\n");
1196 fprintf (stream, " {");
1197 if (t.level1_size > 8)
1198 fprintf (stream, "\n ");
1199 for (i = 0; i < t.level1_size; i++)
1202 if (i > 0 && (i % 8) == 0)
1203 fprintf (stream, "\n ");
1204 offset = ((uint32_t *) (t.result + level1_offset))[i];
1206 fprintf (stream, " %5d", -1);
1208 fprintf (stream, " %5zd",
1209 (offset - level2_offset) / sizeof (uint32_t));
1210 if (i+1 < t.level1_size)
1211 fprintf (stream, ",");
1213 if (t.level1_size > 8)
1214 fprintf (stream, "\n ");
1215 fprintf (stream, " },\n");
1216 fprintf (stream, " {");
1217 if (t.level2_size << t.q > 8)
1218 fprintf (stream, "\n ");
1219 for (i = 0; i < t.level2_size << t.q; i++)
1222 if (i > 0 && (i % 8) == 0)
1223 fprintf (stream, "\n ");
1224 offset = ((uint32_t *) (t.result + level2_offset))[i];
1226 fprintf (stream, " %5d", -1);
1228 fprintf (stream, " %5zd",
1229 (offset - level3_offset) / sizeof (uint8_t));
1230 if (i+1 < t.level2_size << t.q)
1231 fprintf (stream, ",");
1233 if (t.level2_size << t.q > 8)
1234 fprintf (stream, "\n ");
1235 fprintf (stream, " },\n");
1236 fprintf (stream, " {");
1237 if (t.level3_size << t.p > 8)
1238 fprintf (stream, "\n ");
1239 for (i = 0; i < t.level3_size << t.p; i++)
1241 if (i > 0 && (i % 8) == 0)
1242 fprintf (stream, "\n ");
1243 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1244 if (i+1 < t.level3_size << t.p)
1245 fprintf (stream, ",");
1247 if (t.level3_size << t.p > 8)
1248 fprintf (stream, "\n ");
1249 fprintf (stream, " }\n");
1250 fprintf (stream, "};\n");
1252 if (ferror (stream) || fclose (stream))
1254 fprintf (stderr, "error writing to '%s'\n", filename);
1259 /* ========================================================================= */
1261 /* Bidirectional category. */
1262 /* See Unicode 3.0 book, section 4.3. */
/* Bidi category constants.  Nineteen values are listed, so each one fits
   in the 5-bit slots used when output_bidi_category packs its level3
   array.  UC_BIDI_L also serves as DEFAULT for the bidi category table.
   NOTE(review): the enum header is not visible here; presumably the
   numbering starts at 0 with UC_BIDI_L -- confirm.  */
1267 UC_BIDI_L, /* Left-to-Right */
1268 UC_BIDI_LRE, /* Left-to-Right Embedding */
1269 UC_BIDI_LRO, /* Left-to-Right Override */
1270 UC_BIDI_R, /* Right-to-Left */
1271 UC_BIDI_AL, /* Right-to-Left Arabic */
1272 UC_BIDI_RLE, /* Right-to-Left Embedding */
1273 UC_BIDI_RLO, /* Right-to-Left Override */
1274 UC_BIDI_PDF, /* Pop Directional Format */
1275 UC_BIDI_EN, /* European Number */
1276 UC_BIDI_ES, /* European Number Separator */
1277 UC_BIDI_ET, /* European Number Terminator */
1278 UC_BIDI_AN, /* Arabic Number */
1279 UC_BIDI_CS, /* Common Number Separator */
1280 UC_BIDI_NSM, /* Non-Spacing Mark */
1281 UC_BIDI_BN, /* Boundary Neutral */
1282 UC_BIDI_B, /* Paragraph Separator */
1283 UC_BIDI_S, /* Segment Separator */
1284 UC_BIDI_WS, /* Whitespace */
1285 UC_BIDI_ON /* Other Neutral */
/* Looks up a bidi category by its abbreviated name.
   CATEGORY_NAME is a NUL-terminated string; get_bidi_category passes the
   bidi field of a unicode_attributes entry.  The name is decoded one
   character at a time through nested switches, and every leaf test
   requires the terminating '\0' right after the expected letters, so a
   longer name sharing a prefix does not match.
   The case labels and the return statements are on lines not visible
   here; presumably each leaf returns the matching UC_BIDI_* constant.  */
1289 bidi_category_byname (const char *category_name)
1291 switch (category_name[0])
1294 switch (category_name[1])
1297 if (category_name[2] == '\0')
1301 if (category_name[2] == '\0')
1307 switch (category_name[1])
1312 if (category_name[2] == '\0')
1318 switch (category_name[1])
1321 if (category_name[2] == '\0')
1327 switch (category_name[1])
1330 if (category_name[2] == '\0')
1334 if (category_name[2] == '\0')
1338 if (category_name[2] == '\0')
1344 switch (category_name[1])
/* Three-letter names: the third character is dispatched before the
   end-of-string check.  */
1349 switch (category_name[2])
1352 if (category_name[3] == '\0')
1356 if (category_name[3] == '\0')
1364 switch (category_name[1])
1367 switch (category_name[2])
1370 if (category_name[3] == '\0')
1378 switch (category_name[1])
1381 if (category_name[2] == '\0')
1387 switch (category_name[1])
1390 switch (category_name[2])
1393 if (category_name[3] == '\0')
1401 switch (category_name[1])
1406 switch (category_name[2])
1409 if (category_name[3] == '\0')
1413 if (category_name[3] == '\0')
/* A one-letter name: the string must end after the first character.  */
1421 if (category_name[1] == '\0')
1425 switch (category_name[1])
1428 if (category_name[2] == '\0')
1434 /* Invalid bidi category name. */
/* Returns the bidi category of the code point CH.
   An assigned character (non-NULL name in unicode_attributes) gets the
   category named by its bidi field.  For any other code point a default
   is chosen by range; the values returned for the three range groups
   below, and for code points outside all of them, are on lines not
   visible here.  */
1439 get_bidi_category (unsigned int ch)
1441 if (unicode_attributes[ch].name != NULL)
1442 return bidi_category_byname (unicode_attributes[ch].bidi);
1445 /* The bidi category of unassigned characters depends on the range.
1446 See UTR #9 and DerivedBidiClass.txt. */
/* First range group.  */
1447 if ((ch >= 0x0590 && ch <= 0x05FF)
1448 || (ch >= 0x07FB && ch <= 0x08FF)
1449 || (ch >= 0xFB37 && ch <= 0xFB45)
1450 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second range group.  */
1452 else if ((ch >= 0x0600 && ch <= 0x07BF)
1453 || (ch >= 0x2064 && ch <= 0x2069)
1454 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1455 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third range group.  The two mask tests match the last two code points
   of every plane (low 16 bits equal to FFFE or FFFF).  */
1457 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1458 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1459 || (ch & 0xFFFF) == 0xFFFE
1460 || (ch & 0xFFFF) == 0xFFFF
1461 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1468 /* Construction of sparse 3-level tables. */
/* Parameters for the bidi category table.  TABLE supplies the name used
   for struct bidi_category_table and the bidi_category_table_init/add/
   finalize functions called by output_bidi_category; ELEMENT is the
   per-code-point element type.
   DEFAULT is presumably the value of entries that were never added, and
   the template itself is presumably pulled in by an #include on a line
   not visible here -- confirm.
   NOTE(review): xmalloc/xrealloc are mapped to plain malloc/realloc, so
   allocation failure handling depends on the template code -- confirm.  */
1469 #define TABLE bidi_category_table
1470 #define ELEMENT uint8_t
1471 #define DEFAULT UC_BIDI_L
1472 #define xmalloc malloc
1473 #define xrealloc realloc
1476 /* Output the per-character bidi category table. */
/* Writes a generated C fragment to FILENAME: a banner comment, five
   bidi_category_header_N macros taken from the first five 32-bit words of
   the finalized table, and a static const struct u_bidi_category holding
   the level1, level2 and bit-packed level3 arrays.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.
   Error paths print a message to stderr; what follows each message is on
   lines not visible here.  */
1478 output_bidi_category (const char *filename, const char *version)
1482 struct bidi_category_table t;
1483 unsigned int level1_offset, level2_offset, level3_offset;
1484 uint16_t *level3_packed;
1486 stream = fopen (filename, "w");
1489 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1493 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1494 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1495 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the sparse table with one entry per code point below 0x110000.  */
1500 bidi_category_table_init (&t);
1502 for (ch = 0; ch < 0x110000; ch++)
1504 int value = get_bidi_category (ch);
1506 bidi_category_table_add (&t, ch, value);
1509 bidi_category_table_finalize (&t);
1511 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the sums below: a header of five 32-bit words, then
   level1 (level1_size words), then level2 (level2_size << q words), then
   level3.  The assignment targets are on lines not visible here;
   presumably level1_offset, level2_offset and level3_offset in turn.  */
1513 5 * sizeof (uint32_t);
1515 5 * sizeof (uint32_t)
1516 + t.level1_size * sizeof (uint32_t);
1518 5 * sizeof (uint32_t)
1519 + t.level1_size * sizeof (uint32_t)
1520 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Export the five header words as preprocessor constants.
   NOTE(review): the second %d receives a uint32_t; %u or PRIu32 would
   match the argument type -- confirm.  */
1522 for (i = 0; i < 5; i++)
1523 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1524 ((uint32_t *) t.result)[i]);
1525 fprintf (stream, "static const\n");
1526 fprintf (stream, "struct\n");
1527 fprintf (stream, " {\n");
1528 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1529 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units: (1 << p) * 5 / 16 units per level3
   block at 5 bits per entry, plus one extra unit, matching the calloc
   size used for level3_packed below.  */
1530 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1531 (1 << t.p) * 5 / 16);
1532 fprintf (stream, " }\n");
1533 fprintf (stream, "u_bidi_category =\n");
1534 fprintf (stream, "{\n");
/* level1 array, eight entries per output line.  Each stored word is a
   byte offset into t.result; it is printed either as -1 or as an index
   into level2 (offset relative to level2_offset, in 32-bit words).  The
   test selecting between the two fprintf calls is on a line not visible
   here.
   NOTE(review): %zd is given a quotient by sizeof, which presumably has
   type size_t; %zu would match -- offset's declaration is not visible,
   confirm.  */
1535 fprintf (stream, " {");
1536 if (t.level1_size > 8)
1537 fprintf (stream, "\n ");
1538 for (i = 0; i < t.level1_size; i++)
1541 if (i > 0 && (i % 8) == 0)
1542 fprintf (stream, "\n ");
1543 offset = ((uint32_t *) (t.result + level1_offset))[i];
1545 fprintf (stream, " %5d", -1);
1547 fprintf (stream, " %5zd",
1548 (offset - level2_offset) / sizeof (uint32_t));
1549 if (i+1 < t.level1_size)
1550 fprintf (stream, ",");
1552 if (t.level1_size > 8)
1553 fprintf (stream, "\n ");
1554 fprintf (stream, " },\n");
/* level2 array, same scheme; entries are printed as indices into the
   unpacked level3 array (offset relative to level3_offset, in bytes).  */
1555 fprintf (stream, " {");
1556 if (t.level2_size << t.q > 8)
1557 fprintf (stream, "\n ");
1558 for (i = 0; i < t.level2_size << t.q; i++)
1561 if (i > 0 && (i % 8) == 0)
1562 fprintf (stream, "\n ");
1563 offset = ((uint32_t *) (t.result + level2_offset))[i];
1565 fprintf (stream, " %5d", -1);
1567 fprintf (stream, " %5zd",
1568 (offset - level3_offset) / sizeof (uint8_t));
1569 if (i+1 < t.level2_size << t.q)
1570 fprintf (stream, ",");
1572 if (t.level2_size << t.q > 8)
1573 fprintf (stream, "\n ");
1574 fprintf (stream, " },\n");
1575 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1576 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): no NULL check on the calloc result is visible here --
   confirm whether one exists on the omitted lines.  */
1579 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits 5*i .. 5*i+4 of the packed bit stream.  Units j
   and j+1 are combined into one 32-bit window so that an entry straddling
   a 16-bit boundary is stored correctly; the zero-filled buffer makes the
   OR-accumulation safe.  */
1580 for (i = 0; i < t.level3_size << t.p; i++)
1582 unsigned int j = (i * 5) / 16;
1583 unsigned int k = (i * 5) % 16;
1584 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1585 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1586 level3_packed[j] = value & 0xffff;
1587 level3_packed[j+1] = value >> 16;
/* Emit the packed units as 4-digit hex constants, eight per line.  */
1589 fprintf (stream, " {");
1590 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1591 fprintf (stream, "\n ");
1592 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1594 if (i > 0 && (i % 8) == 0)
1595 fprintf (stream, "\n ");
1596 fprintf (stream, " 0x%04x", level3_packed[i]);
1597 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1598 fprintf (stream, ",");
1600 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1601 fprintf (stream, "\n ");
1602 fprintf (stream, " }\n");
1603 free (level3_packed);
1604 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
1606 if (ferror (stream) || fclose (stream))
1608 fprintf (stderr, "error writing to '%s'\n", filename);
1613 /* ========================================================================= */
1615 /* Decimal digit value. */
1616 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH.
   When the character is assigned (non-NULL name) and its decdigit field
   is non-empty, the field is converted with atoi.  The result for all
   other code points is on a line not visible here; the callers accept
   values from -1 to 9, so presumably it is -1.
   NOTE(review): atoi reports no conversion errors; a malformed field
   would be accepted silently.  */
1619 get_decdigit_value (unsigned int ch)
1621 if (unicode_attributes[ch].name != NULL
1622 && unicode_attributes[ch].decdigit[0] != '\0')
1623 return atoi (unicode_attributes[ch].decdigit);
1627 /* Construction of sparse 3-level tables. */
/* Parameters for the digit value tables.  TABLE supplies the name used
   for struct decdigit_table and the decdigit_table_init/add/finalize
   functions, which both output_decimal_digit and output_digit call.
   The DEFAULT definition and the #include of the template are presumably
   on lines not visible here.
   NOTE(review): xmalloc/xrealloc are mapped to plain malloc/realloc, so
   allocation failure handling depends on the template code -- confirm.  */
1628 #define TABLE decdigit_table
1629 #define ELEMENT uint8_t
1631 #define xmalloc malloc
1632 #define xrealloc realloc
1635 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a banner comment followed by a comma-separated list
   of "{ 0xXXXX, value }" initializers, one per listed code point.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
1637 output_decimal_digit_test (const char *filename, const char *version)
1643 stream = fopen (filename, "w");
1646 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1650 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1651 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1652 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1656 for (ch = 0; ch < 0x110000; ch++)
1658 int value = get_decdigit_value (ch);
/* Sanity check: only -1 and the digits 0..9 are expected.  The action
   taken on failure, and the condition that selects which code points are
   printed, are on lines not visible here.  */
1660 if (!(value >= -1 && value < 10))
/* Separator before every entry except, presumably, the first.  */
1666 fprintf (stream, ",\n");
1667 fprintf (stream, " { 0x%04X, %d }", ch, value);
1672 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error.  */
1674 if (ferror (stream) || fclose (stream))
1676 fprintf (stderr, "error writing to '%s'\n", filename);
1681 /* Output the per-character decimal digit value table. */
/* Writes a generated C fragment to FILENAME: a banner comment, five
   decdigit_header_N macros taken from the first five 32-bit words of the
   finalized table, and a static const struct u_decdigit holding the
   level1, level2 and nibble-packed level3 arrays.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
1683 output_decimal_digit (const char *filename, const char *version)
1687 struct decdigit_table t;
1688 unsigned int level1_offset, level2_offset, level3_offset;
1690 stream = fopen (filename, "w");
1693 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1697 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1698 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1699 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1704 decdigit_table_init (&t);
1706 for (ch = 0; ch < 0x110000; ch++)
/* Values are stored biased by one, so the accepted range -1..9 becomes
   0..10 and fits in four bits.  */
1708 int value = 1 + get_decdigit_value (ch);
1710 if (!(value >= 0 && value <= 10))
1713 decdigit_table_add (&t, ch, value);
1716 decdigit_table_finalize (&t);
1718 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the sums below: a header of five 32-bit words, then
   level1 (level1_size words), then level2 (level2_size << q words), then
   level3.  The assignment targets are on lines not visible here;
   presumably level1_offset, level2_offset and level3_offset in turn.  */
1720 5 * sizeof (uint32_t);
1722 5 * sizeof (uint32_t)
1723 + t.level1_size * sizeof (uint32_t);
1725 5 * sizeof (uint32_t)
1726 + t.level1_size * sizeof (uint32_t)
1727 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the second %d receives a uint32_t; %u or PRIu32 would
   match the argument type -- confirm.  */
1729 for (i = 0; i < 5; i++)
1730 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1731 ((uint32_t *) t.result)[i]);
1732 fprintf (stream, "static const\n");
1733 fprintf (stream, "struct\n");
1734 fprintf (stream, " {\n");
1735 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1736 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1737 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1739 fprintf (stream, " }\n");
1740 fprintf (stream, "u_decdigit =\n");
1741 fprintf (stream, "{\n");
/* level1 array, eight entries per output line.  Each stored word is a
   byte offset into t.result; it is printed either as -1 or as an index
   into level2.  The test selecting between the two fprintf calls is on a
   line not visible here.
   NOTE(review): %zd is given a quotient by sizeof, which presumably has
   type size_t; %zu would match -- confirm.  */
1742 fprintf (stream, " {");
1743 if (t.level1_size > 8)
1744 fprintf (stream, "\n ");
1745 for (i = 0; i < t.level1_size; i++)
1748 if (i > 0 && (i % 8) == 0)
1749 fprintf (stream, "\n ");
1750 offset = ((uint32_t *) (t.result + level1_offset))[i];
1752 fprintf (stream, " %5d", -1);
1754 fprintf (stream, " %5zd",
1755 (offset - level2_offset) / sizeof (uint32_t));
1756 if (i+1 < t.level1_size)
1757 fprintf (stream, ",");
1759 if (t.level1_size > 8)
1760 fprintf (stream, "\n ");
1761 fprintf (stream, " },\n");
/* level2 array, same scheme; entries are printed as indices into the
   unpacked level3 array.  */
1762 fprintf (stream, " {");
1763 if (t.level2_size << t.q > 8)
1764 fprintf (stream, "\n ");
1765 for (i = 0; i < t.level2_size << t.q; i++)
1768 if (i > 0 && (i % 8) == 0)
1769 fprintf (stream, "\n ");
1770 offset = ((uint32_t *) (t.result + level2_offset))[i];
1772 fprintf (stream, " %5d", -1);
1774 fprintf (stream, " %5zd",
1775 (offset - level3_offset) / sizeof (uint8_t));
1776 if (i+1 < t.level2_size << t.q)
1777 fprintf (stream, ",");
1779 if (t.level2_size << t.q > 8)
1780 fprintf (stream, "\n ");
1781 fprintf (stream, " },\n");
1782 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two entries share one output byte: entry 2*i goes in the low nibble and
   entry 2*i+1 in the high nibble, so the loop runs over half as many
   bytes (level3_size << (p - 1)).  */
1783 fprintf (stream, " {");
1784 if (t.level3_size << (t.p - 1) > 8)
1785 fprintf (stream, "\n ");
1786 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1788 if (i > 0 && (i % 8) == 0)
1789 fprintf (stream, "\n ");
1790 fprintf (stream, " 0x%02x",
1791 ((uint8_t *) (t.result + level3_offset))[2*i]
1792 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1793 if (i+1 < t.level3_size << (t.p - 1))
1794 fprintf (stream, ",");
1796 if (t.level3_size << (t.p - 1) > 8)
1797 fprintf (stream, "\n ");
1798 fprintf (stream, " }\n");
1799 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
1801 if (ferror (stream) || fclose (stream))
1803 fprintf (stderr, "error writing to '%s'\n", filename);
1808 /* ========================================================================= */
1811 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH.
   When the character is assigned (non-NULL name) and its digit field is
   non-empty, the field is converted with atoi.  The result for all other
   code points is on a line not visible here; the callers accept values
   from -1 to 9, so presumably it is -1.
   NOTE(review): atoi reports no conversion errors; a malformed field
   would be accepted silently.  */
1814 get_digit_value (unsigned int ch)
1816 if (unicode_attributes[ch].name != NULL
1817 && unicode_attributes[ch].digit[0] != '\0')
1818 return atoi (unicode_attributes[ch].digit);
1822 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a banner comment followed by a comma-separated list
   of "{ 0xXXXX, value }" initializers, one per listed code point.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
1824 output_digit_test (const char *filename, const char *version)
1830 stream = fopen (filename, "w");
1833 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1837 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1838 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1839 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1843 for (ch = 0; ch < 0x110000; ch++)
1845 int value = get_digit_value (ch);
/* Sanity check: only -1 and the digits 0..9 are expected.  The action
   taken on failure, and the condition that selects which code points are
   printed, are on lines not visible here.  */
1847 if (!(value >= -1 && value < 10))
/* Separator before every entry except, presumably, the first.  */
1853 fprintf (stream, ",\n");
1854 fprintf (stream, " { 0x%04X, %d }", ch, value);
1859 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error.  */
1861 if (ferror (stream) || fclose (stream))
1863 fprintf (stderr, "error writing to '%s'\n", filename);
1868 /* Output the per-character digit value table. */
/* Writes a generated C fragment to FILENAME: a banner comment, five
   digit_header_N macros taken from the first five 32-bit words of the
   finalized table, and a static const struct u_digit holding the level1,
   level2 and nibble-packed level3 arrays.  The table type is the same
   struct decdigit_table that output_decimal_digit uses.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
1870 output_digit (const char *filename, const char *version)
1874 struct decdigit_table t;
1875 unsigned int level1_offset, level2_offset, level3_offset;
1877 stream = fopen (filename, "w");
1880 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1884 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1885 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1886 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1891 decdigit_table_init (&t);
1893 for (ch = 0; ch < 0x110000; ch++)
/* Values are stored biased by one, so the accepted range -1..9 becomes
   0..10 and fits in four bits.  */
1895 int value = 1 + get_digit_value (ch);
1897 if (!(value >= 0 && value <= 10))
1900 decdigit_table_add (&t, ch, value);
1903 decdigit_table_finalize (&t);
1905 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the sums below: a header of five 32-bit words, then
   level1 (level1_size words), then level2 (level2_size << q words), then
   level3.  The assignment targets are on lines not visible here;
   presumably level1_offset, level2_offset and level3_offset in turn.  */
1907 5 * sizeof (uint32_t);
1909 5 * sizeof (uint32_t)
1910 + t.level1_size * sizeof (uint32_t);
1912 5 * sizeof (uint32_t)
1913 + t.level1_size * sizeof (uint32_t)
1914 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the second %d receives a uint32_t; %u or PRIu32 would
   match the argument type -- confirm.  */
1916 for (i = 0; i < 5; i++)
1917 fprintf (stream, "#define digit_header_%d %d\n", i,
1918 ((uint32_t *) t.result)[i]);
1919 fprintf (stream, "static const\n");
1920 fprintf (stream, "struct\n");
1921 fprintf (stream, " {\n");
1922 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1923 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1924 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1926 fprintf (stream, " }\n");
1927 fprintf (stream, "u_digit =\n");
1928 fprintf (stream, "{\n");
/* level1 array, eight entries per output line.  Each stored word is a
   byte offset into t.result; it is printed either as -1 or as an index
   into level2.  The test selecting between the two fprintf calls is on a
   line not visible here.
   NOTE(review): %zd is given a quotient by sizeof, which presumably has
   type size_t; %zu would match -- confirm.  */
1929 fprintf (stream, " {");
1930 if (t.level1_size > 8)
1931 fprintf (stream, "\n ");
1932 for (i = 0; i < t.level1_size; i++)
1935 if (i > 0 && (i % 8) == 0)
1936 fprintf (stream, "\n ");
1937 offset = ((uint32_t *) (t.result + level1_offset))[i];
1939 fprintf (stream, " %5d", -1);
1941 fprintf (stream, " %5zd",
1942 (offset - level2_offset) / sizeof (uint32_t));
1943 if (i+1 < t.level1_size)
1944 fprintf (stream, ",");
1946 if (t.level1_size > 8)
1947 fprintf (stream, "\n ");
1948 fprintf (stream, " },\n");
/* level2 array, same scheme; entries are printed as indices into the
   unpacked level3 array.  */
1949 fprintf (stream, " {");
1950 if (t.level2_size << t.q > 8)
1951 fprintf (stream, "\n ");
1952 for (i = 0; i < t.level2_size << t.q; i++)
1955 if (i > 0 && (i % 8) == 0)
1956 fprintf (stream, "\n ");
1957 offset = ((uint32_t *) (t.result + level2_offset))[i];
1959 fprintf (stream, " %5d", -1);
1961 fprintf (stream, " %5zd",
1962 (offset - level3_offset) / sizeof (uint8_t));
1963 if (i+1 < t.level2_size << t.q)
1964 fprintf (stream, ",");
1966 if (t.level2_size << t.q > 8)
1967 fprintf (stream, "\n ");
1968 fprintf (stream, " },\n");
1969 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Two entries share one output byte: entry 2*i goes in the low nibble and
   entry 2*i+1 in the high nibble, so the loop runs over half as many
   bytes (level3_size << (p - 1)).  */
1970 fprintf (stream, " {");
1971 if (t.level3_size << (t.p - 1) > 8)
1972 fprintf (stream, "\n ");
1973 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1975 if (i > 0 && (i % 8) == 0)
1976 fprintf (stream, "\n ");
1977 fprintf (stream, " 0x%02x",
1978 ((uint8_t *) (t.result + level3_offset))[2*i]
1979 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1980 if (i+1 < t.level3_size << (t.p - 1))
1981 fprintf (stream, ",");
1983 if (t.level3_size << (t.p - 1) > 8)
1984 fprintf (stream, "\n ");
1985 fprintf (stream, " }\n");
1986 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
1988 if (ferror (stream) || fclose (stream))
1990 fprintf (stderr, "error writing to '%s'\n", filename);
1995 /* ========================================================================= */
1997 /* Numeric value. */
1998 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator/denominator.  get_numeric_value
   stores 0/0 in one of its branches, and output_numeric_test skips
   exactly that pair, so 0/0 acts as the "no numeric value" marker.  */
2000 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of the code point CH as a fraction.
   When the character is assigned (non-NULL name) and its numeric field is
   non-empty, the field is parsed: the numerator is the leading integer
   and the denominator is the integer after '/', or 1 when there is no
   '/'.  The 0/0 assignments below presumably belong to the else branch
   for code points without a numeric value; the else line is not visible
   here.
   NOTE(review): atoi reports no conversion errors; a malformed field
   would be accepted silently.  */
2002 static uc_fraction_t
2003 get_numeric_value (unsigned int ch)
2005 uc_fraction_t value;
2007 if (unicode_attributes[ch].name != NULL
2008 && unicode_attributes[ch].numeric[0] != '\0')
2010 const char *str = unicode_attributes[ch].numeric;
2011 /* str is of the form "integer" or "integer/posinteger". */
2012 value.numerator = atoi (str);
2013 if (strchr (str, '/') != NULL)
2014 value.denominator = atoi (strchr (str, '/') + 1);
2016 value.denominator = 1;
2020 value.numerator = 0;
2021 value.denominator = 0;
2026 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a banner comment followed by a comma-separated list
   of "{ 0xXXXX, numerator, denominator }" initializers.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
2028 output_numeric_test (const char *filename, const char *version)
2034 stream = fopen (filename, "w");
2037 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2041 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2042 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2043 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2047 for (ch = 0; ch < 0x110000; ch++)
2049 uc_fraction_t value = get_numeric_value (ch);
/* Only code points whose value differs from the 0/0 marker are listed.  */
2051 if (value.numerator != 0 || value.denominator != 0)
/* Separator before every entry except, presumably, the first.  */
2054 fprintf (stream, ",\n");
2055 fprintf (stream, " { 0x%04X, %d, %d }",
2056 ch, value.numerator, value.denominator);
2061 fprintf (stream, "\n");
/* A stream error or a failing fclose is reported as a write error.  */
2063 if (ferror (stream) || fclose (stream))
2065 fprintf (stderr, "error writing to '%s'\n", filename);
2070 /* Construction of sparse 3-level tables. */
/* Parameters for the numeric value table.  TABLE supplies the name used
   for struct numeric_table and the numeric_table_init/add/finalize
   functions called by output_numeric; each element is an index into the
   fractions list built there.
   The DEFAULT definition and the #include of the template are presumably
   on lines not visible here.
   NOTE(review): xmalloc/xrealloc are mapped to plain malloc/realloc, so
   allocation failure handling depends on the template code -- confirm.  */
2071 #define TABLE numeric_table
2072 #define ELEMENT uint8_t
2074 #define xmalloc malloc
2075 #define xrealloc realloc
2078 /* Output the per-character numeric value table. */
/* Writes a generated C fragment to FILENAME in two parts: the array
   u_numeric_values listing every distinct fraction that occurs, and a
   sparse 3-level table u_numeric mapping each code point to an index into
   that array, with the level3 indices packed at 7 bits each.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
2080 output_numeric (const char *filename, const char *version)
/* At most 128 distinct fractions are supported, so an index fits in the
   7 bits used by the packing below.  */
2083 uc_fraction_t fractions[128];
2084 unsigned int nfractions;
2085 unsigned int ch, i, j;
2086 struct numeric_table t;
2087 unsigned int level1_offset, level2_offset, level3_offset;
2088 uint16_t *level3_packed;
2090 stream = fopen (filename, "w");
2093 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2097 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2098 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2099 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2102 /* Create table of occurring fractions. */
2104 for (ch = 0; ch < 0x110000; ch++)
2106 uc_fraction_t value = get_numeric_value (ch);
/* Linear search for an existing entry; i == nfractions afterwards means
   the value is new (the loop exit on a match is on a line not visible
   here).  */
2108 for (i = 0; i < nfractions; i++)
2109 if (value.numerator == fractions[i].numerator
2110 && value.denominator == fractions[i].denominator)
2112 if (i == nfractions)
/* Capacity check before inserting; the action taken when the list is
   full is on a line not visible here.  */
2114 if (nfractions == 128)
/* Find the insertion point that keeps the list ordered by denominator,
   then by numerator, and shift the tail up by one slot.  */
2116 for (i = 0; i < nfractions; i++)
2117 if (value.denominator < fractions[i].denominator
2118 || (value.denominator == fractions[i].denominator
2119 && value.numerator < fractions[i].numerator))
2121 for (j = nfractions; j > i; j--)
2122 fractions[j] = fractions[j - 1];
2123 fractions[i] = value;
2128 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2130 fprintf (stream, "{\n");
2131 for (i = 0; i < nfractions; i++)
2133 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2134 fractions[i].denominator);
2135 if (i+1 < nfractions)
2136 fprintf (stream, ",");
2137 fprintf (stream, "\n");
2139 fprintf (stream, "};\n");
/* Second pass: store for every code point the index of its fraction.  */
2143 numeric_table_init (&t);
2145 for (ch = 0; ch < 0x110000; ch++)
2147 uc_fraction_t value = get_numeric_value (ch);
2149 for (i = 0; i < nfractions; i++)
2150 if (value.numerator == fractions[i].numerator
2151 && value.denominator == fractions[i].denominator)
/* Every value was collected in the first pass, so a miss here would be an
   internal error; the action taken is on a line not visible here.  */
2153 if (i == nfractions)
2156 numeric_table_add (&t, ch, i);
2159 numeric_table_finalize (&t);
2161 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the sums below: a header of five 32-bit words, then
   level1 (level1_size words), then level2 (level2_size << q words), then
   level3.  The assignment targets are on lines not visible here;
   presumably level1_offset, level2_offset and level3_offset in turn.  */
2163 5 * sizeof (uint32_t);
2165 5 * sizeof (uint32_t)
2166 + t.level1_size * sizeof (uint32_t);
2168 5 * sizeof (uint32_t)
2169 + t.level1_size * sizeof (uint32_t)
2170 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the second %d receives a uint32_t; %u or PRIu32 would
   match the argument type -- confirm.  */
2172 for (i = 0; i < 5; i++)
2173 fprintf (stream, "#define numeric_header_%d %d\n", i,
2174 ((uint32_t *) t.result)[i]);
2175 fprintf (stream, "static const\n");
2176 fprintf (stream, "struct\n");
2177 fprintf (stream, " {\n");
2178 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2179 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 is declared in 16-bit units: (1 << p) * 7 / 16 units per level3
   block at 7 bits per entry, plus one extra unit, matching the calloc
   size used for level3_packed below.  */
2180 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2181 (1 << t.p) * 7 / 16);
2182 fprintf (stream, " }\n");
2183 fprintf (stream, "u_numeric =\n");
2184 fprintf (stream, "{\n");
/* level1 array, eight entries per output line.  Each stored word is a
   byte offset into t.result; it is printed either as -1 or as an index
   into level2.  The test selecting between the two fprintf calls is on a
   line not visible here.
   NOTE(review): %zd is given a quotient by sizeof, which presumably has
   type size_t; %zu would match -- confirm.  */
2185 fprintf (stream, " {");
2186 if (t.level1_size > 8)
2187 fprintf (stream, "\n ");
2188 for (i = 0; i < t.level1_size; i++)
2191 if (i > 0 && (i % 8) == 0)
2192 fprintf (stream, "\n ");
2193 offset = ((uint32_t *) (t.result + level1_offset))[i];
2195 fprintf (stream, " %5d", -1);
2197 fprintf (stream, " %5zd",
2198 (offset - level2_offset) / sizeof (uint32_t));
2199 if (i+1 < t.level1_size)
2200 fprintf (stream, ",");
2202 if (t.level1_size > 8)
2203 fprintf (stream, "\n ");
2204 fprintf (stream, " },\n");
/* level2 array, same scheme; entries are printed as indices into the
   unpacked level3 array.  */
2205 fprintf (stream, " {");
2206 if (t.level2_size << t.q > 8)
2207 fprintf (stream, "\n ");
2208 for (i = 0; i < t.level2_size << t.q; i++)
2211 if (i > 0 && (i % 8) == 0)
2212 fprintf (stream, "\n ");
2213 offset = ((uint32_t *) (t.result + level2_offset))[i];
2215 fprintf (stream, " %5d", -1);
2217 fprintf (stream, " %5zd",
2218 (offset - level3_offset) / sizeof (uint8_t));
2219 if (i+1 < t.level2_size << t.q)
2220 fprintf (stream, ",");
2222 if (t.level2_size << t.q > 8)
2223 fprintf (stream, "\n ");
2224 fprintf (stream, " },\n");
2225 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2226 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): no NULL check on the calloc result is visible here --
   confirm whether one exists on the omitted lines.  */
2229 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits 7*i .. 7*i+6 of the packed bit stream.  Units j
   and j+1 are combined into one 32-bit window so that an entry straddling
   a 16-bit boundary is stored correctly.  This local j shadows the
   function-level j declared above.  */
2230 for (i = 0; i < t.level3_size << t.p; i++)
2232 unsigned int j = (i * 7) / 16;
2233 unsigned int k = (i * 7) % 16;
2234 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2235 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2236 level3_packed[j] = value & 0xffff;
2237 level3_packed[j+1] = value >> 16;
/* Emit the packed units as 4-digit hex constants, eight per line.  */
2239 fprintf (stream, " {");
2240 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2241 fprintf (stream, "\n ");
2242 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2244 if (i > 0 && (i % 8) == 0)
2245 fprintf (stream, "\n ");
2246 fprintf (stream, " 0x%04x", level3_packed[i]);
2247 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2248 fprintf (stream, ",");
2250 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2251 fprintf (stream, "\n ");
2252 fprintf (stream, " }\n");
2253 free (level3_packed);
2254 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
2256 if (ferror (stream) || fclose (stream))
2258 fprintf (stderr, "error writing to '%s'\n", filename);
2263 /* ========================================================================= */
2266 /* See Unicode 3.0 book, section 4.7. */
2269 /* List of mirrored character pairs. This is a subset of the characters
2270 having the BidiMirrored property. */
/* Each row holds two code points.  get_mirror_value searches both
   columns, so a pair needs to be listed in one direction only.  The
   initializer is on lines not visible here.  */
2271 static unsigned int mirror_pairs[][2] =
/* Computes the mirror table entry for the code point CH.
   The mirrored flag is taken from unicode_attributes for assigned
   characters.  mirror_pairs is searched in both directions for a partner;
   0xfffd serves as the "no partner found" marker.  The one visible return
   yields the signed distance from CH to its partner.  The conditions
   around that return and the remaining return statements are on lines not
   visible here.  */
2328 get_mirror_value (unsigned int ch)
2331 unsigned int mirror_char;
2334 mirrored = (unicode_attributes[ch].name != NULL
2335 && unicode_attributes[ch].mirrored);
2336 mirror_char = 0xfffd;
2337 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2338 if (ch == mirror_pairs[i][0])
2340 mirror_char = mirror_pairs[i][1];
2343 else if (ch == mirror_pairs[i][1])
2345 mirror_char = mirror_pairs[i][0];
/* The entry is stored as a delta relative to CH, not as the partner's
   code point.  */
2349 return (int) mirror_char - (int) ch;
/* NOTE(review): presumably a consistency check for a character that has a
   partner in mirror_pairs but lacks the mirrored flag -- confirm against
   the omitted lines.  */
2352 if (mirror_char != 0xfffd)
2358 /* Construction of sparse 3-level tables. */
/* Parameters for the mirror table.  TABLE supplies the name used for
   struct mirror_table and the mirror_table_init/add/finalize functions
   called by output_mirror.  ELEMENT is a signed 32-bit type because the
   entries are signed deltas (see get_mirror_value).
   The DEFAULT definition and the #include of the template are presumably
   on lines not visible here.
   NOTE(review): xmalloc/xrealloc are mapped to plain malloc/realloc, so
   allocation failure handling depends on the template code -- confirm.  */
2359 #define TABLE mirror_table
2360 #define ELEMENT int32_t
2362 #define xmalloc malloc
2363 #define xrealloc realloc
2366 /* Output the per-character mirror table. */
/* Writes a generated C fragment to FILENAME: a banner comment, five
   mirror_header_N macros taken from the first five 32-bit words of the
   finalized table, and a static const struct u_mirror holding the level1,
   level2 and level3 arrays.  Unlike the digit and bidi tables, level3 is
   emitted unpacked, one int per entry.
   VERSION is presumably the argument of the %s in the banner; the
   argument line is not visible here.  */
2368 output_mirror (const char *filename, const char *version)
2372 struct mirror_table t;
2373 unsigned int level1_offset, level2_offset, level3_offset;
2375 stream = fopen (filename, "w");
2378 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2382 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2383 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2384 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the sparse table with one entry per code point below 0x110000.  */
2389 mirror_table_init (&t);
2391 for (ch = 0; ch < 0x110000; ch++)
2393 int value = get_mirror_value (ch);
2395 mirror_table_add (&t, ch, value);
2398 mirror_table_finalize (&t);
2400 /* Offsets in t.result, in memory of this process. */
/* Layout implied by the sums below: a header of five 32-bit words, then
   level1 (level1_size words), then level2 (level2_size << q words), then
   level3.  The assignment targets are on lines not visible here;
   presumably level1_offset, level2_offset and level3_offset in turn.  */
2402 5 * sizeof (uint32_t);
2404 5 * sizeof (uint32_t)
2405 + t.level1_size * sizeof (uint32_t);
2407 5 * sizeof (uint32_t)
2408 + t.level1_size * sizeof (uint32_t)
2409 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): the second %d receives a uint32_t; %u or PRIu32 would
   match the argument type -- confirm.  */
2411 for (i = 0; i < 5; i++)
2412 fprintf (stream, "#define mirror_header_%d %d\n", i,
2413 ((uint32_t *) t.result)[i]);
2414 fprintf (stream, "static const\n");
2415 fprintf (stream, "struct\n");
2416 fprintf (stream, " {\n");
2417 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2418 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2419 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2420 fprintf (stream, " }\n");
2421 fprintf (stream, "u_mirror =\n");
2422 fprintf (stream, "{\n");
/* level1 array, eight entries per output line.  Each stored word is a
   byte offset into t.result; it is printed either as -1 or as an index
   into level2.  The test selecting between the two fprintf calls is on a
   line not visible here.
   NOTE(review): %zd is given a quotient by sizeof, which presumably has
   type size_t; %zu would match -- confirm.  */
2423 fprintf (stream, " {");
2424 if (t.level1_size > 8)
2425 fprintf (stream, "\n ");
2426 for (i = 0; i < t.level1_size; i++)
2429 if (i > 0 && (i % 8) == 0)
2430 fprintf (stream, "\n ");
2431 offset = ((uint32_t *) (t.result + level1_offset))[i];
2433 fprintf (stream, " %5d", -1);
2435 fprintf (stream, " %5zd",
2436 (offset - level2_offset) / sizeof (uint32_t));
2437 if (i+1 < t.level1_size)
2438 fprintf (stream, ",");
2440 if (t.level1_size > 8)
2441 fprintf (stream, "\n ");
2442 fprintf (stream, " },\n");
/* level2 array, same scheme; the divisor is sizeof (int32_t) because the
   level3 elements are 32-bit values here.  */
2443 fprintf (stream, " {");
2444 if (t.level2_size << t.q > 8)
2445 fprintf (stream, "\n ");
2446 for (i = 0; i < t.level2_size << t.q; i++)
2449 if (i > 0 && (i % 8) == 0)
2450 fprintf (stream, "\n ");
2451 offset = ((uint32_t *) (t.result + level2_offset))[i];
2453 fprintf (stream, " %5d", -1);
2455 fprintf (stream, " %5zd",
2456 (offset - level3_offset) / sizeof (int32_t));
2457 if (i+1 < t.level2_size << t.q)
2458 fprintf (stream, ",");
2460 if (t.level2_size << t.q > 8)
2461 fprintf (stream, "\n ");
2462 fprintf (stream, " },\n");
/* level3 array: the signed deltas, eight per output line.  */
2463 fprintf (stream, " {");
2464 if (t.level3_size << t.p > 8)
2465 fprintf (stream, "\n ");
2466 for (i = 0; i < t.level3_size << t.p; i++)
2468 if (i > 0 && (i % 8) == 0)
2469 fprintf (stream, "\n ");
2470 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2471 if (i+1 < t.level3_size << t.p)
2472 fprintf (stream, ",");
2474 if (t.level3_size << t.p > 8)
2475 fprintf (stream, "\n ");
2476 fprintf (stream, " }\n");
2477 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
2479 if (ferror (stream) || fclose (stream))
2481 fprintf (stderr, "error writing to '%s'\n", filename);
2486 /* ========================================================================= */
2490 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  Each one is used as a bit position in the
   unsigned long long entries of unicode_properties (fill_properties sets
   bit 1ULL << value), so the total count has to stay at or below 64.
   Several enumerators, and the enum header, are on lines not visible
   here.  */
2499 PROP_QUOTATION_MARK,
2500 PROP_TERMINAL_PUNCTUATION,
2503 PROP_ASCII_HEX_DIGIT,
2504 PROP_OTHER_ALPHABETIC,
2508 PROP_OTHER_LOWERCASE,
2509 PROP_OTHER_UPPERCASE,
2510 PROP_NONCHARACTER_CODE_POINT,
2511 PROP_OTHER_GRAPHEME_EXTEND,
2512 PROP_IDS_BINARY_OPERATOR,
2513 PROP_IDS_TRINARY_OPERATOR,
2515 PROP_UNIFIED_IDEOGRAPH,
2516 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2519 PROP_LOGICAL_ORDER_EXCEPTION,
2520 PROP_OTHER_ID_START,
2521 PROP_OTHER_ID_CONTINUE,
2523 PROP_VARIATION_SELECTOR,
2524 PROP_PATTERN_WHITE_SPACE,
2525 PROP_PATTERN_SYNTAX,
2526 /* DerivedCoreProperties.txt */
2535 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2536 PROP_GRAPHEME_EXTEND,
/* One property bitmask per code point: bit PROP_x is set when the code
   point has property x.  Written by clear_properties and fill_properties,
   read by the is_property_* predicates.  */
2540 unsigned long long unicode_properties[0x110000];
/* Resets the property bitmask of every code point to zero.  */
2543 clear_properties (void)
2547 for (i = 0; i < 0x110000; i++)
2548 unicode_properties[i] = 0;
2551 /* Stores in unicode_properties[] the properties from the
2552 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is opened for reading and processed line by line.
   Lines that are empty or start with '#' are tested for separately (the
   action is on a line not visible here, presumably skipping them).  Other
   lines have the form "XXXX..YYYY ; Name" or "XXXX ; Name"; the property
   bit is set for every code point in the range.  Bits are only ORed in,
   so calls for several files accumulate.
   Error paths print a message to stderr; what follows each message is on
   lines not visible here.  */
2554 fill_properties (const char *proplist_filename)
2559 stream = fopen (proplist_filename, "r");
2562 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2569 unsigned int i1, i2;
2570 char padding[200+1];
2571 char propname[200+1];
2572 unsigned int propvalue;
/* Reads one line of at most 200 characters and consumes the newline.  */
2574 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2577 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.
   NOTE(review): the %[ ;] and %[^ ] conversions carry no field width;
   they are bounded only indirectly by the 200-character limit applied
   when buf was read -- buf's declaration is not visible here, confirm.
   In the single code point form i2 is presumably set to i1 on an omitted
   line -- confirm.  */
2580 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2582 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2584 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* PROP expands to one link of an if/else-if chain that maps the parsed
   property name to its enumerator; the trailing "else" lets the next
   PROP line, or the final error branch, follow directly.  */
2589 #define PROP(name,value) \
2590 if (strcmp (propname, name) == 0) propvalue = value; else
2592 PROP ("White_Space", PROP_WHITE_SPACE)
2593 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2594 PROP ("Join_Control", PROP_JOIN_CONTROL)
2595 PROP ("Dash", PROP_DASH)
2596 PROP ("Hyphen", PROP_HYPHEN)
2597 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2598 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2599 PROP ("Other_Math", PROP_OTHER_MATH)
2600 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2601 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2602 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2603 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2604 PROP ("Diacritic", PROP_DIACRITIC)
2605 PROP ("Extender", PROP_EXTENDER)
2606 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2607 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2608 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2609 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2610 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2611 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2612 PROP ("Radical", PROP_RADICAL)
2613 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2614 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2615 PROP ("Deprecated", PROP_DEPRECATED)
2616 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2617 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2618 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2619 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2620 PROP ("STerm", PROP_STERM)
2621 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2622 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2623 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2624 /* DerivedCoreProperties.txt */
2625 PROP ("Math", PROP_MATH)
2626 PROP ("Alphabetic", PROP_ALPHABETIC)
2627 PROP ("Lowercase", PROP_LOWERCASE)
2628 PROP ("Uppercase", PROP_UPPERCASE)
2629 PROP ("ID_Start", PROP_ID_START)
2630 PROP ("ID_Continue", PROP_ID_CONTINUE)
2631 PROP ("XID_Start", PROP_XID_START)
2632 PROP ("XID_Continue", PROP_XID_CONTINUE)
2633 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2634 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2635 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2636 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2639 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and lie within the Unicode code space.  */
2643 if (!(i1 <= i2 && i2 < 0x110000))
2646 for (i = i1; i <= i2; i++)
2647 unicode_properties[i] |= 1ULL << propvalue;
/* A stream error or a failing fclose is reported as a read error.  */
2650 if (ferror (stream) || fclose (stream))
2652 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2657 /* Stores in array the given property from the Unicode 3.0 PropList.txt file. */
/* Fills ARRAY, one entry per code point, from the section of
   PROPLIST_FILENAME whose heading line contains PROPERTY_NAME.
   All 0x110000 entries are visited first (the value assigned is on a line
   not visible here).  The file is then read line by line, at most 100
   characters each, until a line containing PROPERTY_NAME is found.  The
   lines after it are parsed as "XXXX..YYYY" ranges or single "XXXX" code
   points of up to four hex digits, and ARRAY is updated for each range
   (that assignment is also on a line not visible here).
   Error paths print a message to stderr; what follows each message is on
   lines not visible here.  */
2660 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2666 for (i = 0; i < 0x110000; i++)
2669 stream = fopen (proplist_filename, "r");
2672 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2676 /* Search for the "Property dump for: ..." line. */
2679 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2681 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2685 while (strstr (buf, property_name) == NULL);
2689 unsigned int i1, i2;
2691 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A range line has ".." at columns 4 and 5, right after four hex
   digits.  */
2695 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2697 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2699 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least four characters names one code point;
   i2 is presumably set to i1 on an omitted line -- confirm.  */
2704 else if (strlen (buf) >= 4)
2706 if (sscanf (buf, "%4X", &i1) < 1)
2708 fprintf (stderr, "parse error in property in '%s'\n",
2716 fprintf (stderr, "parse error in property in '%s'\n",
/* The range must be ordered and lie within the Unicode code space.  */
2720 if (!(i1 <= i2 && i2 < 0x110000))
2722 for (i = i1; i <= i2; i++)
/* A stream error or a failing fclose is reported as a read error.  */
2725 if (ferror (stream) || fclose (stream))
2727 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2732 /* Properties from Unicode 3.0 PropList.txt file. */
2734 /* The paired punctuation property from the PropList.txt file. */
2735 char unicode_pairedpunctuation[0x110000];
2737 /* The left of pair property from the PropList.txt file. */
2738 char unicode_leftofpair[0x110000];
2741 fill_properties30 (const char *proplist30_filename)
2743 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2744 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2747 /* ------------------------------------------------------------------------- */
2749 /* See PropList.txt, UCD.html. */
2751 is_property_white_space (unsigned int ch)
2753 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2756 /* See Unicode 3.0 book, section 4.10,
2757 PropList.txt, UCD.html,
2758 DerivedCoreProperties.txt, UCD.html. */
2760 is_property_alphabetic (unsigned int ch)
2764 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2765 /* For some reason, the following are listed as having property
2766 Alphabetic but not as having property Other_Alphabetic. */
2767 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2768 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2769 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2770 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2771 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2772 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2773 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2774 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2775 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2776 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2777 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2779 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
2781 if (result1 != result2)
2786 /* See PropList.txt, UCD.html. */
2788 is_property_other_alphabetic (unsigned int ch)
2790 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2793 /* See PropList.txt, UCD.html. */
2795 is_property_not_a_character (unsigned int ch)
2797 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2800 /* See PropList.txt, UCD.html,
2801 DerivedCoreProperties.txt, UCD.html. */
2803 is_property_default_ignorable_code_point (unsigned int ch)
2806 (is_category_Cf (ch)
2807 && !(ch >= 0xFFF9 && ch <= 0xFFFB)) /* Annotations */
2808 || ((is_category_Cc (ch) || is_category_Cs (ch))
2809 && !is_property_white_space (ch))
2810 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2811 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0)
2812 || is_property_not_a_character (ch);
2814 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2816 if (result1 != result2)
2821 /* See PropList.txt, UCD.html. */
2823 is_property_other_default_ignorable_code_point (unsigned int ch)
2825 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2828 /* See PropList.txt, UCD.html. */
2830 is_property_deprecated (unsigned int ch)
2832 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2835 /* See PropList.txt, UCD.html. */
2837 is_property_logical_order_exception (unsigned int ch)
2839 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2842 /* See PropList.txt, UCD.html. */
2844 is_property_variation_selector (unsigned int ch)
2846 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Private-use predicate.  See PropList-3.0.1.txt.
   Returns true when CH falls into one of the three private-use ranges,
   false otherwise.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* The ranges were determined through
     "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch < 0xE000)
    return false;
  if (ch <= 0xF8FF)
    return true;                /* U+E000..U+F8FF */
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;                /* U+F0000..U+FFFFD */
  return ch >= 0x100000 && ch <= 0x10FFFD;
}
2859 /* See PropList-3.0.1.txt. */
2861 is_property_unassigned_code_value (unsigned int ch)
2863 return (is_category_Cn (ch) && !is_property_not_a_character (ch));
2866 /* See PropList.txt, UCD.html,
2867 DerivedCoreProperties.txt, UCD.html. */
2869 is_property_uppercase (unsigned int ch)
2873 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2875 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2877 if (result1 != result2)
2882 /* See PropList.txt, UCD.html. */
2884 is_property_other_uppercase (unsigned int ch)
2886 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2889 /* See PropList.txt, UCD.html,
2890 DerivedCoreProperties.txt, UCD.html. */
2892 is_property_lowercase (unsigned int ch)
2896 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2898 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2900 if (result1 != result2)
2905 /* See PropList.txt, UCD.html. */
2907 is_property_other_lowercase (unsigned int ch)
2909 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2912 /* See PropList-3.0.1.txt. */
2914 is_property_titlecase (unsigned int ch)
2916 return is_category_Lt (ch);
2919 /* See PropList.txt, UCD.html. */
2921 is_property_soft_dotted (unsigned int ch)
2923 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2926 /* See DerivedCoreProperties.txt, UCD.html. */
2928 is_property_id_start (unsigned int ch)
2930 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2933 /* See PropList.txt, UCD.html. */
2935 is_property_other_id_start (unsigned int ch)
2937 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2940 /* See DerivedCoreProperties.txt, UCD.html. */
2942 is_property_id_continue (unsigned int ch)
2944 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2947 /* See PropList.txt, UCD.html. */
2949 is_property_other_id_continue (unsigned int ch)
2951 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2954 /* See DerivedCoreProperties.txt, UCD.html. */
2956 is_property_xid_start (unsigned int ch)
2958 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2961 /* See DerivedCoreProperties.txt, UCD.html. */
2963 is_property_xid_continue (unsigned int ch)
2965 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2968 /* See PropList.txt, UCD.html. */
2970 is_property_pattern_white_space (unsigned int ch)
2972 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2975 /* See PropList.txt, UCD.html. */
2977 is_property_pattern_syntax (unsigned int ch)
2979 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2982 /* See PropList.txt, UCD.html. */
2984 is_property_join_control (unsigned int ch)
2986 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2989 /* See DerivedCoreProperties.txt, UCD.html. */
2991 is_property_grapheme_base (unsigned int ch)
2993 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
2996 /* See DerivedCoreProperties.txt, UCD.html. */
2998 is_property_grapheme_extend (unsigned int ch)
3000 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3003 /* See PropList.txt, UCD.html. */
3005 is_property_other_grapheme_extend (unsigned int ch)
3007 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3010 /* See DerivedCoreProperties.txt, UCD.html. */
3012 is_property_grapheme_link (unsigned int ch)
3014 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3017 /* See PropList.txt, UCD.html. */
3019 is_property_bidi_control (unsigned int ch)
3021 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3024 /* See PropList-3.0.1.txt. */
3026 is_property_bidi_left_to_right (unsigned int ch)
3028 return (get_bidi_category (ch) == UC_BIDI_L);
3031 /* See PropList-3.0.1.txt. */
3033 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3035 return (get_bidi_category (ch) == UC_BIDI_R);
3038 /* See PropList-3.0.1.txt. */
3040 is_property_bidi_arabic_right_to_left (unsigned int ch)
3042 return (get_bidi_category (ch) == UC_BIDI_AL);
3045 /* See PropList-3.0.1.txt. */
3047 is_property_bidi_european_digit (unsigned int ch)
3049 return (get_bidi_category (ch) == UC_BIDI_EN);
3052 /* See PropList-3.0.1.txt. */
3054 is_property_bidi_eur_num_separator (unsigned int ch)
3056 return (get_bidi_category (ch) == UC_BIDI_ES);
3059 /* See PropList-3.0.1.txt. */
3061 is_property_bidi_eur_num_terminator (unsigned int ch)
3063 return (get_bidi_category (ch) == UC_BIDI_ET);
3066 /* See PropList-3.0.1.txt. */
3068 is_property_bidi_arabic_digit (unsigned int ch)
3070 return (get_bidi_category (ch) == UC_BIDI_AN);
3073 /* See PropList-3.0.1.txt. */
3075 is_property_bidi_common_separator (unsigned int ch)
3077 return (get_bidi_category (ch) == UC_BIDI_CS);
3080 /* See PropList-3.0.1.txt. */
3082 is_property_bidi_block_separator (unsigned int ch)
3084 return (get_bidi_category (ch) == UC_BIDI_B);
3087 /* See PropList-3.0.1.txt. */
3089 is_property_bidi_segment_separator (unsigned int ch)
3091 return (get_bidi_category (ch) == UC_BIDI_S);
3094 /* See PropList-3.0.1.txt. */
3096 is_property_bidi_whitespace (unsigned int ch)
3098 return (get_bidi_category (ch) == UC_BIDI_WS);
3101 /* See PropList-3.0.1.txt. */
3103 is_property_bidi_non_spacing_mark (unsigned int ch)
3105 return (get_bidi_category (ch) == UC_BIDI_NSM);
3108 /* See PropList-3.0.1.txt. */
3110 is_property_bidi_boundary_neutral (unsigned int ch)
3112 return (get_bidi_category (ch) == UC_BIDI_BN);
3115 /* See PropList-3.0.1.txt. */
3117 is_property_bidi_pdf (unsigned int ch)
3119 return (get_bidi_category (ch) == UC_BIDI_PDF);
3122 /* See PropList-3.0.1.txt. */
3124 is_property_bidi_embedding_or_override (unsigned int ch)
3126 int category = get_bidi_category (ch);
3127 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3128 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3131 /* See PropList-3.0.1.txt. */
3133 is_property_bidi_other_neutral (unsigned int ch)
3135 return (get_bidi_category (ch) == UC_BIDI_ON);
3138 /* See PropList.txt, UCD.html. */
3140 is_property_hex_digit (unsigned int ch)
3142 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3145 /* See PropList.txt, UCD.html. */
3147 is_property_ascii_hex_digit (unsigned int ch)
3149 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3152 /* See Unicode 3.0 book, section 4.10,
3153 PropList.txt, UCD.html. */
3155 is_property_ideographic (unsigned int ch)
3157 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3160 /* See PropList.txt, UCD.html. */
3162 is_property_unified_ideograph (unsigned int ch)
3164 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3167 /* See PropList.txt, UCD.html. */
3169 is_property_radical (unsigned int ch)
3171 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3174 /* See PropList.txt, UCD.html. */
3176 is_property_ids_binary_operator (unsigned int ch)
3178 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3181 /* See PropList.txt, UCD.html. */
3183 is_property_ids_trinary_operator (unsigned int ch)
3185 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3188 /* See PropList-3.0.1.txt. */
3190 is_property_zero_width (unsigned int ch)
3192 return is_category_Cf (ch)
3193 || (unicode_attributes[ch].name != NULL
3194 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3197 /* See PropList-3.0.1.txt. */
3199 is_property_space (unsigned int ch)
3201 return is_category_Zs (ch);
/* Non-break predicate.  See PropList-3.0.1.txt.
   Returns true exactly for the code points enumerated below.
   NOTE(review): the original comment said this is "exactly the set of
   characters having line breaking ..." and was cut off there; presumably it
   named a line breaking class -- confirm against LineBreak.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3228 /* See PropList-3.0.1.txt. */
3230 is_property_iso_control (unsigned int ch)
3233 (unicode_attributes[ch].name != NULL
3234 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3236 is_category_Cc (ch);
3238 if (result1 != result2)
3243 /* See PropList-3.0.1.txt. */
3245 is_property_format_control (unsigned int ch)
3247 return (is_category_Cf (ch)
3248 && get_bidi_category (ch) == UC_BIDI_BN
3249 && !is_property_join_control (ch)
3253 /* See PropList.txt, UCD.html. */
3255 is_property_dash (unsigned int ch)
3257 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3260 /* See PropList.txt, UCD.html. */
3262 is_property_hyphen (unsigned int ch)
3264 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3267 /* See PropList-3.0.1.txt. */
3269 is_property_punctuation (unsigned int ch)
3271 return is_category_P (ch);
3274 /* See PropList-3.0.1.txt. */
3276 is_property_line_separator (unsigned int ch)
3278 return is_category_Zl (ch);
3281 /* See PropList-3.0.1.txt. */
3283 is_property_paragraph_separator (unsigned int ch)
3285 return is_category_Zp (ch);
3288 /* See PropList.txt, UCD.html. */
3290 is_property_quotation_mark (unsigned int ch)
3292 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3295 /* See PropList.txt, UCD.html. */
3297 is_property_sentence_terminal (unsigned int ch)
3299 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3302 /* See PropList.txt, UCD.html. */
3304 is_property_terminal_punctuation (unsigned int ch)
3306 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3309 /* See PropList-3.0.1.txt. */
3311 is_property_currency_symbol (unsigned int ch)
3313 return is_category_Sc (ch);
3316 /* See Unicode 3.0 book, section 4.9,
3317 PropList.txt, UCD.html,
3318 DerivedCoreProperties.txt, UCD.html. */
3320 is_property_math (unsigned int ch)
3324 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3326 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3328 if (result1 != result2)
3333 /* See PropList.txt, UCD.html. */
3335 is_property_other_math (unsigned int ch)
3337 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3340 /* See PropList-3.0.1.txt. */
3342 is_property_paired_punctuation (unsigned int ch)
3344 return unicode_pairedpunctuation[ch];
3347 /* See PropList-3.0.1.txt. */
3349 is_property_left_of_pair (unsigned int ch)
3351 return unicode_leftofpair[ch];
3354 /* See PropList-3.0.1.txt. */
3356 is_property_combining (unsigned int ch)
3358 return (unicode_attributes[ch].name != NULL
3359 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3360 || is_category_Mc (ch)
3361 || is_category_Me (ch)
3362 || is_category_Mn (ch)));
3365 #if 0 /* same as is_property_bidi_non_spacing_mark */
3366 /* See PropList-3.0.1.txt. */
3368 is_property_non_spacing (unsigned int ch)
3370 return (unicode_attributes[ch].name != NULL
3371 && get_bidi_category (ch) == UC_BIDI_NSM);
3375 /* See PropList-3.0.1.txt. */
3377 is_property_composite (unsigned int ch)
3379 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3380 logical in some sense. */
3381 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3383 if (unicode_attributes[ch].name != NULL
3384 && unicode_attributes[ch].decomposition != NULL)
3386 /* Test whether the decomposition contains more than one character,
3387 and the first is not a space. */
3388 const char *decomp = unicode_attributes[ch].decomposition;
3389 if (decomp[0] == '<')
3391 decomp = strchr (decomp, '>') + 1;
3392 if (decomp[0] == ' ')
3395 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3400 /* See PropList-3.0.1.txt. */
3402 is_property_decimal_digit (unsigned int ch)
3404 return is_category_Nd (ch);
3407 /* See PropList-3.0.1.txt. */
3409 is_property_numeric (unsigned int ch)
3411 return ((get_numeric_value (ch)).denominator > 0)
3412 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3413 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3416 /* See PropList.txt, UCD.html. */
3418 is_property_diacritic (unsigned int ch)
3420 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3423 /* See PropList.txt, UCD.html. */
3425 is_property_extender (unsigned int ch)
3427 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3430 /* See PropList-3.0.1.txt. */
3432 is_property_ignorable_control (unsigned int ch)
3434 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3435 || is_category_Cf (ch))
3439 /* ------------------------------------------------------------------------- */
3441 /* Output all properties. */
3443 output_properties (const char *version)
3445 #define PROPERTY(P) \
3446 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3447 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3448 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3449 PROPERTY(white_space)
3450 PROPERTY(alphabetic)
3451 PROPERTY(other_alphabetic)
3452 PROPERTY(not_a_character)
3453 PROPERTY(default_ignorable_code_point)
3454 PROPERTY(other_default_ignorable_code_point)
3455 PROPERTY(deprecated)
3456 PROPERTY(logical_order_exception)
3457 PROPERTY(variation_selector)
3458 PROPERTY(private_use)
3459 PROPERTY(unassigned_code_value)
3461 PROPERTY(other_uppercase)
3463 PROPERTY(other_lowercase)
3465 PROPERTY(soft_dotted)
3467 PROPERTY(other_id_start)
3468 PROPERTY(id_continue)
3469 PROPERTY(other_id_continue)
3471 PROPERTY(xid_continue)
3472 PROPERTY(pattern_white_space)
3473 PROPERTY(pattern_syntax)
3474 PROPERTY(join_control)
3475 PROPERTY(grapheme_base)
3476 PROPERTY(grapheme_extend)
3477 PROPERTY(other_grapheme_extend)
3478 PROPERTY(grapheme_link)
3479 PROPERTY(bidi_control)
3480 PROPERTY(bidi_left_to_right)
3481 PROPERTY(bidi_hebrew_right_to_left)
3482 PROPERTY(bidi_arabic_right_to_left)
3483 PROPERTY(bidi_european_digit)
3484 PROPERTY(bidi_eur_num_separator)
3485 PROPERTY(bidi_eur_num_terminator)
3486 PROPERTY(bidi_arabic_digit)
3487 PROPERTY(bidi_common_separator)
3488 PROPERTY(bidi_block_separator)
3489 PROPERTY(bidi_segment_separator)
3490 PROPERTY(bidi_whitespace)
3491 PROPERTY(bidi_non_spacing_mark)
3492 PROPERTY(bidi_boundary_neutral)
3494 PROPERTY(bidi_embedding_or_override)
3495 PROPERTY(bidi_other_neutral)
3497 PROPERTY(ascii_hex_digit)
3498 PROPERTY(ideographic)
3499 PROPERTY(unified_ideograph)
3501 PROPERTY(ids_binary_operator)
3502 PROPERTY(ids_trinary_operator)
3503 PROPERTY(zero_width)
3506 PROPERTY(iso_control)
3507 PROPERTY(format_control)
3510 PROPERTY(punctuation)
3511 PROPERTY(line_separator)
3512 PROPERTY(paragraph_separator)
3513 PROPERTY(quotation_mark)
3514 PROPERTY(sentence_terminal)
3515 PROPERTY(terminal_punctuation)
3516 PROPERTY(currency_symbol)
3518 PROPERTY(other_math)
3519 PROPERTY(paired_punctuation)
3520 PROPERTY(left_of_pair)
3523 PROPERTY(decimal_digit)
3527 PROPERTY(ignorable_control)
3531 /* ========================================================================= */
3535 static const char *scripts[256];
3536 static unsigned int numscripts;
3538 static uint8_t unicode_scripts[0x110000];
3541 fill_scripts (const char *scripts_filename)
3546 stream = fopen (scripts_filename, "r");
3549 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
3555 for (i = 0; i < 0x110000; i++)
3556 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3561 unsigned int i1, i2;
3562 char padding[200+1];
3563 char scriptname[200+1];
3566 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3569 if (buf[0] == '\0' || buf[0] == '#')
3572 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3574 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3576 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
3586 for (script = numscripts - 1; script >= 0; script--)
3587 if (strcmp (scripts[script], scriptname) == 0)
3591 scripts[numscripts] = strdup (scriptname);
3592 script = numscripts;
3594 if (numscripts == 256)
3598 for (i = i1; i <= i2; i++)
3600 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3601 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3602 unicode_scripts[i] = script;
3606 if (ferror (stream) || fclose (stream))
3608 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3613 /* Construction of sparse 3-level tables. */
3614 #define TABLE script_table
3615 #define ELEMENT uint8_t
3616 #define DEFAULT (uint8_t)~(uint8_t)0
3617 #define xmalloc malloc
3618 #define xrealloc realloc
3622 output_scripts (const char *version)
3624 const char *filename = "unictype/scripts.h";
3626 unsigned int ch, s, i;
3627 struct script_table t;
3628 unsigned int level1_offset, level2_offset, level3_offset;
3632 const char *lowercase_name;
3635 scriptinfo_t scriptinfo[256];
3637 stream = fopen (filename, "w");
3640 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3644 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3645 fprintf (stream, "/* Unicode scripts. */\n");
3646 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3649 for (s = 0; s < numscripts; s++)
3651 char *lcp = strdup (scripts[s]);
3654 for (cp = lcp; *cp != '\0'; cp++)
3655 if (*cp >= 'A' && *cp <= 'Z')
3658 scriptinfo[s].lowercase_name = lcp;
3661 for (s = 0; s < numscripts; s++)
3663 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3664 scriptinfo[s].lowercase_name);
3665 fprintf (stream, "{\n");
3667 for (ch = 0; ch < 0x110000; ch++)
3668 if (unicode_scripts[ch] == s)
3674 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3679 fprintf (stream, ",\n");
3681 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3683 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3687 fprintf (stream, "\n");
3688 fprintf (stream, "};\n");
3691 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3692 fprintf (stream, "{\n");
3693 for (s = 0; s < numscripts; s++)
3695 fprintf (stream, " {\n");
3696 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3697 scriptinfo[s].lowercase_name);
3698 fprintf (stream, " script_%s_intervals,\n",
3699 scriptinfo[s].lowercase_name);
3700 fprintf (stream, " \"%s\"\n", scripts[s]);
3701 fprintf (stream, " }");
3702 if (s+1 < numscripts)
3703 fprintf (stream, ",");
3704 fprintf (stream, "\n");
3706 fprintf (stream, "};\n");
3710 script_table_init (&t);
3712 for (ch = 0; ch < 0x110000; ch++)
3714 unsigned int s = unicode_scripts[ch];
3715 if (s != (uint8_t)~(uint8_t)0)
3716 script_table_add (&t, ch, s);
3719 script_table_finalize (&t);
3721 /* Offsets in t.result, in memory of this process. */
3723 5 * sizeof (uint32_t);
3725 5 * sizeof (uint32_t)
3726 + t.level1_size * sizeof (uint32_t);
3728 5 * sizeof (uint32_t)
3729 + t.level1_size * sizeof (uint32_t)
3730 + (t.level2_size << t.q) * sizeof (uint32_t);
3732 for (i = 0; i < 5; i++)
3733 fprintf (stream, "#define script_header_%d %d\n", i,
3734 ((uint32_t *) t.result)[i]);
3735 fprintf (stream, "static const\n");
3736 fprintf (stream, "struct\n");
3737 fprintf (stream, " {\n");
3738 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3739 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3740 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3741 fprintf (stream, " }\n");
3742 fprintf (stream, "u_script =\n");
3743 fprintf (stream, "{\n");
3744 fprintf (stream, " {");
3745 if (t.level1_size > 8)
3746 fprintf (stream, "\n ");
3747 for (i = 0; i < t.level1_size; i++)
3750 if (i > 0 && (i % 8) == 0)
3751 fprintf (stream, "\n ");
3752 offset = ((uint32_t *) (t.result + level1_offset))[i];
3754 fprintf (stream, " %5d", -1);
3756 fprintf (stream, " %5zd",
3757 (offset - level2_offset) / sizeof (uint32_t));
3758 if (i+1 < t.level1_size)
3759 fprintf (stream, ",");
3761 if (t.level1_size > 8)
3762 fprintf (stream, "\n ");
3763 fprintf (stream, " },\n");
3764 fprintf (stream, " {");
3765 if (t.level2_size << t.q > 8)
3766 fprintf (stream, "\n ");
3767 for (i = 0; i < t.level2_size << t.q; i++)
3770 if (i > 0 && (i % 8) == 0)
3771 fprintf (stream, "\n ");
3772 offset = ((uint32_t *) (t.result + level2_offset))[i];
3774 fprintf (stream, " %5d", -1);
3776 fprintf (stream, " %5zd",
3777 (offset - level3_offset) / sizeof (uint8_t));
3778 if (i+1 < t.level2_size << t.q)
3779 fprintf (stream, ",");
3781 if (t.level2_size << t.q > 8)
3782 fprintf (stream, "\n ");
3783 fprintf (stream, " },\n");
3784 fprintf (stream, " {");
3785 if (t.level3_size << t.p > 8)
3786 fprintf (stream, "\n ");
3787 for (i = 0; i < t.level3_size << t.p; i++)
3789 if (i > 0 && (i % 8) == 0)
3790 fprintf (stream, "\n ");
3791 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3792 if (i+1 < t.level3_size << t.p)
3793 fprintf (stream, ",");
3795 if (t.level3_size << t.p > 8)
3796 fprintf (stream, "\n ");
3797 fprintf (stream, " }\n");
3798 fprintf (stream, "};\n");
3800 if (ferror (stream) || fclose (stream))
3802 fprintf (stderr, "error writing to '%s'\n", filename);
3808 output_scripts_byname (const char *version)
3810 const char *filename = "unictype/scripts_byname.gperf";
3814 stream = fopen (filename, "w");
3817 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3821 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3822 fprintf (stream, "/* Unicode scripts. */\n");
3823 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3825 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3826 fprintf (stream, "%%struct-type\n");
3827 fprintf (stream, "%%language=ANSI-C\n");
3828 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3829 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3830 fprintf (stream, "%%readonly-tables\n");
3831 fprintf (stream, "%%global-table\n");
3832 fprintf (stream, "%%define word-array-name script_names\n");
3833 fprintf (stream, "%%%%\n");
3834 for (s = 0; s < numscripts; s++)
3835 fprintf (stream, "%s, %u\n", scripts[s], s);
3837 if (ferror (stream) || fclose (stream))
3839 fprintf (stderr, "error writing to '%s'\n", filename);
3844 /* ========================================================================= */
3848 typedef struct { unsigned int start; unsigned int end; const char *name; }
3850 static block_t blocks[256];
3851 static unsigned int numblocks;
3854 fill_blocks (const char *blocks_filename)
3858 stream = fopen (blocks_filename, "r");
3861 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3868 unsigned int i1, i2;
3869 char padding[200+1];
3870 char blockname[200+1];
3872 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3875 if (buf[0] == '\0' || buf[0] == '#')
3878 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3880 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3883 blocks[numblocks].start = i1;
3884 blocks[numblocks].end = i2;
3885 blocks[numblocks].name = strdup (blockname);
3886 /* It must be sorted. */
3887 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3890 if (numblocks == 256)
3894 if (ferror (stream) || fclose (stream))
3896 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3901 /* Return the smallest block index among the blocks for characters >= ch. */
3903 block_first_index (unsigned int ch)
3905 /* Binary search. */
3906 unsigned int lo = 0;
3907 unsigned int hi = numblocks;
3909 /* Invariants: all blocks[i], i < lo, have blocks[i].end < ch,
3910 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3913 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3914 if (blocks[mid].end < ch)
3922 /* Return the largest block index among the blocks for characters <= ch, plus 1. */
3925 block_last_index (unsigned int ch)
3927 /* Binary search. */
3928 unsigned int lo = 0;
3929 unsigned int hi = numblocks;
3931 /* Invariants: all blocks[i], i < lo, have blocks[i].start <= ch,
3932 all blocks[i], i >= hi, have blocks[i].start > ch. */
3935 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3936 if (blocks[mid].start <= ch)
3945 output_blocks (const char *version)
3947 const char *filename = "unictype/blocks.h";
3948 const unsigned int shift = 8; /* bits to shift away for array access */
3949 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3954 stream = fopen (filename, "w");
3957 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3961 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3962 fprintf (stream, "/* Unicode blocks. */\n");
3963 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3966 fprintf (stream, "static const uc_block_t blocks[] =\n");
3967 fprintf (stream, "{\n");
3968 for (i = 0; i < numblocks; i++)
3970 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3971 blocks[i].end, blocks[i].name);
3972 if (i+1 < numblocks)
3973 fprintf (stream, ",");
3974 fprintf (stream, "\n");
3976 fprintf (stream, "};\n");
3977 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3978 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3979 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3980 threshold >> shift);
3981 fprintf (stream, "{\n");
3982 for (i1 = 0; i1 < (threshold >> shift); i1++)
3984 unsigned int first_index = block_first_index (i1 << shift);
3985 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3986 fprintf (stream, " %3d, %3d", first_index, last_index);
3987 if (i1+1 < (threshold >> shift))
3988 fprintf (stream, ",");
3989 fprintf (stream, "\n");
3991 fprintf (stream, "};\n");
3992 fprintf (stream, "#define blocks_upper_first_index %d\n",
3993 block_first_index (threshold));
3994 fprintf (stream, "#define blocks_upper_last_index %d\n",
3995 block_last_index (0x10FFFF));
3997 if (ferror (stream) || fclose (stream))
3999 fprintf (stderr, "error writing to '%s'\n", filename);
4004 /* ========================================================================= */
4006 /* C and Java syntax. */
4010 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4011 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4012 UC_IDENTIFIER_INVALID, /* not valid */
4013 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* ISO C 99 section 6.4.(3).
   Returns true if CH is one of the white-space characters of C source text:
   space, horizontal tab, new-line (LF or CR), vertical tab or form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
/* Classifies CH for use in C identifiers and returns a UC_IDENTIFIER_* code:
   ASCII digits give UC_IDENTIFIER_VALID; ASCII letters and '_' give
   UC_IDENTIFIER_START; the long chain of code point ranges below appears to
   be the condition guarding the second "return UC_IDENTIFIER_START";
   anything else gives UC_IDENTIFIER_INVALID.
   NOTE(review): the embedded line numbers skip in many places (for example
   4035 -> 4041, 4047 -> 4051, 4055 -> 4060), so parts of this definition --
   presumably the return type, the braces, the opening of the big condition
   and a number of single-character tests -- are not present in this text.
   Confirm against the pristine file before editing the range list.  */
4027 /* ISO C 99 section 6.4.2.1 and appendix D. */
4029 c_ident_category (unsigned int ch)
4031 /* Section 6.4.2.1. */
4032 if (ch >= '0' && ch <= '9')
4033 return UC_IDENTIFIER_VALID;
4034 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4035 return UC_IDENTIFIER_START;
4041 || (ch >= 0x00C0 && ch <= 0x00D6)
4042 || (ch >= 0x00D8 && ch <= 0x00F6)
4043 || (ch >= 0x00F8 && ch <= 0x01F5)
4044 || (ch >= 0x01FA && ch <= 0x0217)
4045 || (ch >= 0x0250 && ch <= 0x02A8)
4046 || (ch >= 0x1E00 && ch <= 0x1E9B)
4047 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4051 || (ch >= 0x0388 && ch <= 0x038A)
4053 || (ch >= 0x038E && ch <= 0x03A1)
4054 || (ch >= 0x03A3 && ch <= 0x03CE)
4055 || (ch >= 0x03D0 && ch <= 0x03D6)
4060 || (ch >= 0x03E2 && ch <= 0x03F3)
4061 || (ch >= 0x1F00 && ch <= 0x1F15)
4062 || (ch >= 0x1F18 && ch <= 0x1F1D)
4063 || (ch >= 0x1F20 && ch <= 0x1F45)
4064 || (ch >= 0x1F48 && ch <= 0x1F4D)
4065 || (ch >= 0x1F50 && ch <= 0x1F57)
4069 || (ch >= 0x1F5F && ch <= 0x1F7D)
4070 || (ch >= 0x1F80 && ch <= 0x1FB4)
4071 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4072 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4073 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4074 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4075 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4076 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4077 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4078 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4080 || (ch >= 0x0401 && ch <= 0x040C)
4081 || (ch >= 0x040E && ch <= 0x044F)
4082 || (ch >= 0x0451 && ch <= 0x045C)
4083 || (ch >= 0x045E && ch <= 0x0481)
4084 || (ch >= 0x0490 && ch <= 0x04C4)
4085 || (ch >= 0x04C7 && ch <= 0x04C8)
4086 || (ch >= 0x04CB && ch <= 0x04CC)
4087 || (ch >= 0x04D0 && ch <= 0x04EB)
4088 || (ch >= 0x04EE && ch <= 0x04F5)
4089 || (ch >= 0x04F8 && ch <= 0x04F9)
4091 || (ch >= 0x0531 && ch <= 0x0556)
4092 || (ch >= 0x0561 && ch <= 0x0587)
4094 || (ch >= 0x05B0 && ch <= 0x05B9)
4095 || (ch >= 0x05BB && ch <= 0x05BD)
4097 || (ch >= 0x05C1 && ch <= 0x05C2)
4098 || (ch >= 0x05D0 && ch <= 0x05EA)
4099 || (ch >= 0x05F0 && ch <= 0x05F2)
4101 || (ch >= 0x0621 && ch <= 0x063A)
4102 || (ch >= 0x0640 && ch <= 0x0652)
4103 || (ch >= 0x0670 && ch <= 0x06B7)
4104 || (ch >= 0x06BA && ch <= 0x06BE)
4105 || (ch >= 0x06C0 && ch <= 0x06CE)
4106 || (ch >= 0x06D0 && ch <= 0x06DC)
4107 || (ch >= 0x06E5 && ch <= 0x06E8)
4108 || (ch >= 0x06EA && ch <= 0x06ED)
4110 || (ch >= 0x0901 && ch <= 0x0903)
4111 || (ch >= 0x0905 && ch <= 0x0939)
4112 || (ch >= 0x093E && ch <= 0x094D)
4113 || (ch >= 0x0950 && ch <= 0x0952)
4114 || (ch >= 0x0958 && ch <= 0x0963)
4116 || (ch >= 0x0981 && ch <= 0x0983)
4117 || (ch >= 0x0985 && ch <= 0x098C)
4118 || (ch >= 0x098F && ch <= 0x0990)
4119 || (ch >= 0x0993 && ch <= 0x09A8)
4120 || (ch >= 0x09AA && ch <= 0x09B0)
4122 || (ch >= 0x09B6 && ch <= 0x09B9)
4123 || (ch >= 0x09BE && ch <= 0x09C4)
4124 || (ch >= 0x09C7 && ch <= 0x09C8)
4125 || (ch >= 0x09CB && ch <= 0x09CD)
4126 || (ch >= 0x09DC && ch <= 0x09DD)
4127 || (ch >= 0x09DF && ch <= 0x09E3)
4128 || (ch >= 0x09F0 && ch <= 0x09F1)
4131 || (ch >= 0x0A05 && ch <= 0x0A0A)
4132 || (ch >= 0x0A0F && ch <= 0x0A10)
4133 || (ch >= 0x0A13 && ch <= 0x0A28)
4134 || (ch >= 0x0A2A && ch <= 0x0A30)
4135 || (ch >= 0x0A32 && ch <= 0x0A33)
4136 || (ch >= 0x0A35 && ch <= 0x0A36)
4137 || (ch >= 0x0A38 && ch <= 0x0A39)
4138 || (ch >= 0x0A3E && ch <= 0x0A42)
4139 || (ch >= 0x0A47 && ch <= 0x0A48)
4140 || (ch >= 0x0A4B && ch <= 0x0A4D)
4141 || (ch >= 0x0A59 && ch <= 0x0A5C)
4145 || (ch >= 0x0A81 && ch <= 0x0A83)
4146 || (ch >= 0x0A85 && ch <= 0x0A8B)
4148 || (ch >= 0x0A8F && ch <= 0x0A91)
4149 || (ch >= 0x0A93 && ch <= 0x0AA8)
4150 || (ch >= 0x0AAA && ch <= 0x0AB0)
4151 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4152 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4153 || (ch >= 0x0ABD && ch <= 0x0AC5)
4154 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4155 || (ch >= 0x0ACB && ch <= 0x0ACD)
4159 || (ch >= 0x0B01 && ch <= 0x0B03)
4160 || (ch >= 0x0B05 && ch <= 0x0B0C)
4161 || (ch >= 0x0B0F && ch <= 0x0B10)
4162 || (ch >= 0x0B13 && ch <= 0x0B28)
4163 || (ch >= 0x0B2A && ch <= 0x0B30)
4164 || (ch >= 0x0B32 && ch <= 0x0B33)
4165 || (ch >= 0x0B36 && ch <= 0x0B39)
4166 || (ch >= 0x0B3E && ch <= 0x0B43)
4167 || (ch >= 0x0B47 && ch <= 0x0B48)
4168 || (ch >= 0x0B4B && ch <= 0x0B4D)
4169 || (ch >= 0x0B5C && ch <= 0x0B5D)
4170 || (ch >= 0x0B5F && ch <= 0x0B61)
4172 || (ch >= 0x0B82 && ch <= 0x0B83)
4173 || (ch >= 0x0B85 && ch <= 0x0B8A)
4174 || (ch >= 0x0B8E && ch <= 0x0B90)
4175 || (ch >= 0x0B92 && ch <= 0x0B95)
4176 || (ch >= 0x0B99 && ch <= 0x0B9A)
4178 || (ch >= 0x0B9E && ch <= 0x0B9F)
4179 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4180 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4181 || (ch >= 0x0BAE && ch <= 0x0BB5)
4182 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4183 || (ch >= 0x0BBE && ch <= 0x0BC2)
4184 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4185 || (ch >= 0x0BCA && ch <= 0x0BCD)
4187 || (ch >= 0x0C01 && ch <= 0x0C03)
4188 || (ch >= 0x0C05 && ch <= 0x0C0C)
4189 || (ch >= 0x0C0E && ch <= 0x0C10)
4190 || (ch >= 0x0C12 && ch <= 0x0C28)
4191 || (ch >= 0x0C2A && ch <= 0x0C33)
4192 || (ch >= 0x0C35 && ch <= 0x0C39)
4193 || (ch >= 0x0C3E && ch <= 0x0C44)
4194 || (ch >= 0x0C46 && ch <= 0x0C48)
4195 || (ch >= 0x0C4A && ch <= 0x0C4D)
4196 || (ch >= 0x0C60 && ch <= 0x0C61)
4198 || (ch >= 0x0C82 && ch <= 0x0C83)
4199 || (ch >= 0x0C85 && ch <= 0x0C8C)
4200 || (ch >= 0x0C8E && ch <= 0x0C90)
4201 || (ch >= 0x0C92 && ch <= 0x0CA8)
4202 || (ch >= 0x0CAA && ch <= 0x0CB3)
4203 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4204 || (ch >= 0x0CBE && ch <= 0x0CC4)
4205 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4206 || (ch >= 0x0CCA && ch <= 0x0CCD)
4208 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4210 || (ch >= 0x0D02 && ch <= 0x0D03)
4211 || (ch >= 0x0D05 && ch <= 0x0D0C)
4212 || (ch >= 0x0D0E && ch <= 0x0D10)
4213 || (ch >= 0x0D12 && ch <= 0x0D28)
4214 || (ch >= 0x0D2A && ch <= 0x0D39)
4215 || (ch >= 0x0D3E && ch <= 0x0D43)
4216 || (ch >= 0x0D46 && ch <= 0x0D48)
4217 || (ch >= 0x0D4A && ch <= 0x0D4D)
4218 || (ch >= 0x0D60 && ch <= 0x0D61)
4220 || (ch >= 0x0E01 && ch <= 0x0E3A)
4221 || (ch >= 0x0E40 && ch <= 0x0E5B)
4223 || (ch >= 0x0E81 && ch <= 0x0E82)
4225 || (ch >= 0x0E87 && ch <= 0x0E88)
4228 || (ch >= 0x0E94 && ch <= 0x0E97)
4229 || (ch >= 0x0E99 && ch <= 0x0E9F)
4230 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4233 || (ch >= 0x0EAA && ch <= 0x0EAB)
4234 || (ch >= 0x0EAD && ch <= 0x0EAE)
4235 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4236 || (ch >= 0x0EBB && ch <= 0x0EBD)
4237 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4239 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4240 || (ch >= 0x0EDC && ch <= 0x0EDD)
4243 || (ch >= 0x0F18 && ch <= 0x0F19)
4247 || (ch >= 0x0F3E && ch <= 0x0F47)
4248 || (ch >= 0x0F49 && ch <= 0x0F69)
4249 || (ch >= 0x0F71 && ch <= 0x0F84)
4250 || (ch >= 0x0F86 && ch <= 0x0F8B)
4251 || (ch >= 0x0F90 && ch <= 0x0F95)
4253 || (ch >= 0x0F99 && ch <= 0x0FAD)
4254 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4257 || (ch >= 0x10A0 && ch <= 0x10C5)
4258 || (ch >= 0x10D0 && ch <= 0x10F6)
4260 || (ch >= 0x3041 && ch <= 0x3093)
4261 || (ch >= 0x309B && ch <= 0x309C)
4263 || (ch >= 0x30A1 && ch <= 0x30F6)
4264 || (ch >= 0x30FB && ch <= 0x30FC)
4266 || (ch >= 0x3105 && ch <= 0x312C)
4267 /* CJK Unified Ideographs */
4268 || (ch >= 0x4E00 && ch <= 0x9FA5)
4270 || (ch >= 0xAC00 && ch <= 0xD7A3)
4272 || (ch >= 0x0660 && ch <= 0x0669)
4273 || (ch >= 0x06F0 && ch <= 0x06F9)
4274 || (ch >= 0x0966 && ch <= 0x096F)
4275 || (ch >= 0x09E6 && ch <= 0x09EF)
4276 || (ch >= 0x0A66 && ch <= 0x0A6F)
4277 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4278 || (ch >= 0x0B66 && ch <= 0x0B6F)
4279 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4280 || (ch >= 0x0C66 && ch <= 0x0C6F)
4281 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4282 || (ch >= 0x0D66 && ch <= 0x0D6F)
4283 || (ch >= 0x0E50 && ch <= 0x0E59)
4284 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4285 || (ch >= 0x0F20 && ch <= 0x0F33)
4286 /* Special characters */
4289 || (ch >= 0x02B0 && ch <= 0x02B8)
4291 || (ch >= 0x02BD && ch <= 0x02C1)
4292 || (ch >= 0x02D0 && ch <= 0x02D1)
4293 || (ch >= 0x02E0 && ch <= 0x02E4)
4299 || (ch >= 0x203F && ch <= 0x2040)
4302 || (ch >= 0x210A && ch <= 0x2113)
4304 || (ch >= 0x2118 && ch <= 0x211D)
4308 || (ch >= 0x212A && ch <= 0x2131)
4309 || (ch >= 0x2133 && ch <= 0x2138)
4310 || (ch >= 0x2160 && ch <= 0x2182)
4311 || (ch >= 0x3005 && ch <= 0x3007)
4312 || (ch >= 0x3021 && ch <= 0x3029)
4314 return UC_IDENTIFIER_START;
4315 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710

   Returns true if CH is space, horizontal tab, form feed, LF or CR.
   Unlike is_c_whitespace, vertical tab is not accepted.  */
static bool
is_java_whitespace (unsigned int ch)
{
  return (ch == ' ' || ch == '\t' || ch == '\f'
          || ch == '\n' || ch == '\r');
}
4327 /* The Java Language Specification, 3rd edition, §3.8.
4328 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4329 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4331 java_ident_category (unsigned int ch)
4333 /* FIXME: Check this against Sun's JDK implementation. */
4334 if (is_category_L (ch) /* = Character.isLetter(ch) */
4335 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4336 || is_category_Sc (ch) /* currency symbol */
4337 || is_category_Pc (ch) /* connector punctuation */
4339 return UC_IDENTIFIER_START;
4340 if (is_category_Nd (ch) /* digit */
4341 || is_category_Mc (ch) /* combining mark */
4342 || is_category_Mn (ch) /* non-spacing mark */
4344 return UC_IDENTIFIER_VALID;
4345 if ((ch >= 0x0000 && ch <= 0x0008)
4346 || (ch >= 0x000E && ch <= 0x001B)
4347 || (ch >= 0x007F && ch <= 0x009F)
4348 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4350 return UC_IDENTIFIER_IGNORABLE;
4351 return UC_IDENTIFIER_INVALID;
/* Parameters for the sparse 3-level table: the table type is named
   identsyntax_table, its elements are uint8_t, absent entries read as
   UC_IDENTIFIER_INVALID, and plain malloc/realloc serve as the allocators.
   NOTE(review): these macros are presumably consumed by an #include of the
   table template on a line that is not present in this text -- confirm.  */
4354 /* Construction of sparse 3-level tables. */
4355 #define TABLE identsyntax_table
4356 #define ELEMENT uint8_t
4357 #define DEFAULT UC_IDENTIFIER_INVALID
4358 #define xmalloc malloc
4359 #define xrealloc realloc
/* Writes to FILENAME a C fragment holding the identifier syntax category of
   every code point, as computed by PREDICATE.
     FILENAME   file to create (opened with mode "w").
     PREDICATE  returns a UC_IDENTIFIER_* code for a code point.
     NAME       name given to the generated struct object.
     VERSION    Unicode version string (used in the generated header comment,
                whose argument line is not present in this text).
   The table is built by calling PREDICATE for each ch in 0..0x10FFFF and
   storing every result other than UC_IDENTIFIER_INVALID.  The output consists
   of five identsyntax_header_N defines followed by a struct with the arrays
   level1, level2 and level3; level3 is packed with eight 2-bit entries per
   16-bit word.
   NOTE(review): the embedded line numbers skip (e.g. 4364 -> 4368,
   4371 -> 4374, 4396 -> 4398), so the local declarations, the fopen failure
   check and the left-hand sides of the three offset assignments are not
   present in this text -- confirm against the pristine file.  */
4362 /* Output an identifier syntax categorization in a three-level bitmap. */
4364 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4368 struct identsyntax_table t;
4369 unsigned int level1_offset, level2_offset, level3_offset;
4371 stream = fopen (filename, "w");
4374 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4378 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4379 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4380 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4385 identsyntax_table_init (&t);
/* Classify every code point; only non-default results are stored.  */
4387 for (ch = 0; ch < 0x110000; ch++)
4389 int syntaxcode = predicate (ch);
4390 if (syntaxcode != UC_IDENTIFIER_INVALID)
4391 identsyntax_table_add (&t, ch, syntaxcode);
4394 identsyntax_table_finalize (&t);
4396 /* Offsets in t.result, in memory of this process. */
4398 5 * sizeof (uint32_t);
4400 5 * sizeof (uint32_t)
4401 + t.level1_size * sizeof (uint32_t);
4403 5 * sizeof (uint32_t)
4404 + t.level1_size * sizeof (uint32_t)
4405 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five 32-bit words of t.result are emitted as header defines.  */
4407 for (i = 0; i < 5; i++)
4408 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4409 ((uint32_t *) t.result)[i]);
4410 fprintf (stream, "static const\n");
4411 fprintf (stream, "struct\n");
4412 fprintf (stream, " {\n");
4413 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4414 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4415 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4416 (1 << t.p) * 2 / 16);
4417 fprintf (stream, " }\n");
4418 fprintf (stream, "%s =\n", name);
4419 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; each entry is either -1 or the
   offset rebased to the start of level 2, counted in uint32_t units.  */
4420 fprintf (stream, " {");
4421 if (t.level1_size > 8)
4422 fprintf (stream, "\n ");
4423 for (i = 0; i < t.level1_size; i++)
4426 if (i > 0 && (i % 8) == 0)
4427 fprintf (stream, "\n ");
4428 offset = ((uint32_t *) (t.result + level1_offset))[i];
4430 fprintf (stream, " %5d", -1);
4432 fprintf (stream, " %5zd",
4433 (offset - level2_offset) / sizeof (uint32_t));
4434 if (i+1 < t.level1_size)
4435 fprintf (stream, ",");
4437 if (t.level1_size > 8)
4438 fprintf (stream, "\n ");
4439 fprintf (stream, " },\n");
/* Level 2: same layout; offsets are rebased to the start of level 3 and
   counted in uint8_t units.  */
4440 fprintf (stream, " {");
4441 if (t.level2_size << t.q > 8)
4442 fprintf (stream, "\n ");
4443 for (i = 0; i < t.level2_size << t.q; i++)
4446 if (i > 0 && (i % 8) == 0)
4447 fprintf (stream, "\n ");
4448 offset = ((uint32_t *) (t.result + level2_offset))[i];
4450 fprintf (stream, " %5d", -1);
4452 fprintf (stream, " %5zd",
4453 (offset - level3_offset) / sizeof (uint8_t));
4454 if (i+1 < t.level2_size << t.q)
4455 fprintf (stream, ",");
4457 if (t.level2_size << t.q > 8)
4458 fprintf (stream, "\n ");
4459 fprintf (stream, " },\n");
4460 /* Pack the level3 array. Each entry needs 2 bits only. */
4461 fprintf (stream, " {");
4462 if ((t.level3_size << t.p) * 2 / 16 > 8)
4463 fprintf (stream, "\n ");
4464 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4466 if (i > 0 && (i % 8) == 0)
4467 fprintf (stream, "\n ");
/* Eight consecutive level3 bytes go into one 16-bit word, lowest index in
   the lowest two bits.  */
4468 fprintf (stream, " 0x%04x",
4469 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4470 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4471 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4477 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4478 fprintf (stream, ",");
4480 if ((t.level3_size << t.p) * 2 / 16 > 8)
4481 fprintf (stream, "\n ");
4482 fprintf (stream, " }\n");
4483 fprintf (stream, "};\n");
4485 if (ferror (stream) || fclose (stream))
4487 fprintf (stderr, "error writing to '%s'\n", filename);
4493 output_ident_properties (const char *version)
4495 #define PROPERTY(P) \
4496 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4497 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4498 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4499 PROPERTY(c_whitespace)
4500 PROPERTY(java_whitespace)
4503 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4504 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4507 /* ========================================================================= */
4509 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4510 glibc/localedata/locales/i18n file, generated by
4511 glibc/localedata/gen-unicode-ctype.c. */
4513 /* Character mappings. */
4516 to_upper (unsigned int ch)
4518 if (unicode_attributes[ch].name != NULL
4519 && unicode_attributes[ch].upper != NONE)
4520 return unicode_attributes[ch].upper;
4526 to_lower (unsigned int ch)
4528 if (unicode_attributes[ch].name != NULL
4529 && unicode_attributes[ch].lower != NONE)
4530 return unicode_attributes[ch].lower;
4536 to_title (unsigned int ch)
4538 if (unicode_attributes[ch].name != NULL
4539 && unicode_attributes[ch].title != NONE)
4540 return unicode_attributes[ch].title;
4545 /* Character class properties. */
/* Returns true if CH is uppercase, defined here as: to_lower maps it to a
   different character.  */
static bool
is_upper (unsigned int ch)
{
  return (to_lower (ch) != ch);
}
/* Returns true if CH is lowercase, defined here as: to_upper maps it to a
   different character, or CH is U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  return (to_upper (ch) != ch)
         /* <U00DF> is lowercase, but without simple to_upper mapping.  */
         || (ch == 0x00DF);
}
4562 is_alpha (unsigned int ch)
4564 return (unicode_attributes[ch].name != NULL
4565 && ((unicode_attributes[ch].category[0] == 'L'
4566 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4567 <U0E2F>, <U0E46> should belong to is_punct. */
4568 && (ch != 0x0E2F) && (ch != 0x0E46))
4569 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4570 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4572 || (ch >= 0x0E34 && ch <= 0x0E3A)
4573 || (ch >= 0x0E47 && ch <= 0x0E4E)
4574 /* Avoid warning for <U0345>. */
4576 /* Avoid warnings for <U2160>..<U217F>. */
4577 || (unicode_attributes[ch].category[0] == 'N'
4578 && unicode_attributes[ch].category[1] == 'l')
4579 /* Avoid warnings for <U24B6>..<U24E9>. */
4580 || (unicode_attributes[ch].category[0] == 'S'
4581 && unicode_attributes[ch].category[1] == 'o'
4582 && strstr (unicode_attributes[ch].name, " LETTER ")
4584 /* Consider all the non-ASCII digits as alphabetic.
4585 ISO C 99 forbids us to have them in category "digit",
4586 but we want iswalnum to return true on them. */
4587 || (unicode_attributes[ch].category[0] == 'N'
4588 && unicode_attributes[ch].category[1] == 'd'
4589 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Returns true if CH is one of the ten ASCII decimal digits U+0030..U+0039.
   The disabled first branch would instead accept every character of general
   category Nd; it is kept for reference only.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* Returns true if CH belongs to the "outdigit" class, which output_tables
   writes to the locale file: exactly the ASCII digits U+0030..U+0039.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039);
}
/* Returns true if CH is alphabetic (is_alpha) or a digit (is_digit).  */
static bool
is_alnum (unsigned int ch)
{
  return is_alpha (ch) || is_digit (ch);
}
4627 is_blank (unsigned int ch)
4629 return (ch == 0x0009 /* '\t' */
4630 /* Category Zs without mention of "<noBreak>" */
4631 || (unicode_attributes[ch].name != NULL
4632 && unicode_attributes[ch].category[0] == 'Z'
4633 && unicode_attributes[ch].category[1] == 's'
4634 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
4638 is_space (unsigned int ch)
4640 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4641 should treat it like a punctuation character, not like a space. */
4642 return (ch == 0x0020 /* ' ' */
4643 || ch == 0x000C /* '\f' */
4644 || ch == 0x000A /* '\n' */
4645 || ch == 0x000D /* '\r' */
4646 || ch == 0x0009 /* '\t' */
4647 || ch == 0x000B /* '\v' */
4648 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4649 || (unicode_attributes[ch].name != NULL
4650 && unicode_attributes[ch].category[0] == 'Z'
4651 && (unicode_attributes[ch].category[1] == 'l'
4652 || unicode_attributes[ch].category[1] == 'p'
4653 || (unicode_attributes[ch].category[1] == 's'
4654 && !strstr (unicode_attributes[ch].decomposition,
4659 is_cntrl (unsigned int ch)
4661 return (unicode_attributes[ch].name != NULL
4662 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4663 /* Categories Zl and Zp */
4664 || (unicode_attributes[ch].category[0] == 'Z'
4665 && (unicode_attributes[ch].category[1] == 'l'
4666 || unicode_attributes[ch].category[1] == 'p'))));
/* Returns true if CH is an ASCII hexadecimal digit: 0-9, A-F or a-f.
   The disabled first branch would instead build on is_digit; it is kept for
   reference only.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
4692 is_graph (unsigned int ch)
4694 return (unicode_attributes[ch].name != NULL
4695 && strcmp (unicode_attributes[ch].name, "<control>")
4700 is_print (unsigned int ch)
4702 return (unicode_attributes[ch].name != NULL
4703 && strcmp (unicode_attributes[ch].name, "<control>")
4704 /* Categories Zl and Zp */
4705 && !(unicode_attributes[ch].name != NULL
4706 && unicode_attributes[ch].category[0] == 'Z'
4707 && (unicode_attributes[ch].category[1] == 'l'
4708 || unicode_attributes[ch].category[1] == 'p')));
/* Returns true if CH belongs to the "punct" class: graphic (is_graph) but
   neither alphabetic (is_alpha) nor a digit (is_digit).  The disabled first
   branch would instead accept exactly the general categories P*; it is kept
   for reference only.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
/* Writes the <ctype.h>-like property files for Unicode version VERSION.
   The PROPERTY(P) helper calls debug_output_predicate, output_predicate_test
   and output_predicate for the predicate is_P, using the unictype/ctype_P.*
   and ../tests/unictype/test-ctype_P.c file names.
   NOTE(review): the embedded line numbers jump from 4731 to 4750, so the
   return type, the braces, the list of PROPERTY(...) invocations and any
   #undef are not present in this text -- confirm against the pristine
   file.  */
4724 /* Output all properties. */
4726 output_old_ctype (const char *version)
4728 #define PROPERTY(P) \
4729 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4730 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4731 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
4750 is_combining (unsigned int ch)
4752 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4753 file. In 3.0.1 it was identical to the union of the general categories
4754 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4755 PropList.txt file, so we take the latter definition. */
4756 return (unicode_attributes[ch].name != NULL
4757 && unicode_attributes[ch].category[0] == 'M'
4758 && (unicode_attributes[ch].category[1] == 'n'
4759 || unicode_attributes[ch].category[1] == 'c'
4760 || unicode_attributes[ch].category[1] == 'e'));
4764 is_combining_level3 (unsigned int ch)
4766 return is_combining (ch)
4767 && !(unicode_attributes[ch].combining[0] != '\0'
4768 && unicode_attributes[ch].combining[0] != '0'
4769 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   The result is "<Uxxxx>" (4 hex digits) for I below 0x10000 and
   "<Uxxxxxxxx>" (8 hex digits) otherwise.  It lives in a static buffer that
   is overwritten by the next call, so callers must copy it before calling
   ucs_symbol again.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Longest result is "<U" + 8 hex digits + ">" = 11 characters.  */
  static char buf[11+1];

  sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  return buf;
}

/* Return the UCS symbol range string for an interval of Unicode characters,
   in the form "<Ulow>..<Uhigh>".  The result lives in a static buffer that
   is overwritten by the next call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each plus the 2-character "..".  */
  static char buf[24+1];

  /* ucs_symbol reuses one buffer, so each result is copied before the next
     call is made.  */
  strcpy (buf, ucs_symbol (low));
  strcat (buf, "..");
  strcat (buf, ucs_symbol (high));
  return buf;
}
/* Writes to STREAM one character class line: CLASSNAME followed by the code
   points for which FUNC returns true, each run of consecutive members given
   either as a single ucs_symbol or as a ucs_symbol_range.  A ";" is written
   between entries, and the line is continued with "/\n " when the next entry
   would pass max_column (75).
   NOTE(review): the embedded line numbers skip (e.g. 4812 -> 4818,
   4818 -> 4824, 4834 -> 4838), so the return type, the braces, several
   declarations and parts of the run-detection loop are not present in this
   text -- confirm against the pristine file.
   NOTE(review): "char table[0x110000]" looks like an automatic array of about
   1.1 MB -- confirm that the stack size is sufficient on the build hosts.  */
4794 /* Output a character class (= property) table. */
4797 output_charclass (FILE *stream, const char *classname,
4798 bool (*func) (unsigned int))
4800 char table[0x110000];
4802 bool need_semicolon;
4803 const int max_column = 75;
4806 for (i = 0; i < 0x110000; i++)
4807 table[i] = (int) func (i);
4809 fprintf (stream, "%s ", classname);
4810 need_semicolon = false;
4812 for (i = 0; i < 0x110000; )
4818 unsigned int low, high;
4824 while (i < 0x110000 && table[i]);
4828 strcpy (buf, ucs_symbol (low));
4830 strcpy (buf, ucs_symbol_range (low, high));
4834 fprintf (stream, ";");
4838 if (column + strlen (buf) > max_column)
4840 fprintf (stream, "/\n ");
4844 fprintf (stream, "%s", buf);
4845 column += strlen (buf);
4846 need_semicolon = true;
4849 fprintf (stream, "\n");
/* Writes to STREAM one character mapping line: MAPNAME followed by one entry
   for every code point i with FUNC (i) != i.  Each entry is assembled in buf
   from ucs_symbol (i) and ucs_symbol (func (i)).  A ";" is written between
   entries, and the line is continued with "/\n " when the next entry would
   pass max_column (75).
   NOTE(review): the embedded line numbers skip (e.g. 4870 -> 4876,
   4878 -> 4883), so the return type, the braces, several declarations and the
   literal pieces placed around the two symbols are not present in this text
   -- confirm against the pristine file.
   NOTE(review): "char table[0x110000]" looks like an automatic array of about
   1.1 MB -- confirm that the stack size is sufficient on the build hosts.  */
4852 /* Output a character mapping table. */
4855 output_charmap (FILE *stream, const char *mapname,
4856 unsigned int (*func) (unsigned int))
4858 char table[0x110000];
4860 bool need_semicolon;
4861 const int max_column = 75;
4864 for (i = 0; i < 0x110000; i++)
4865 table[i] = (func (i) != i);
4867 fprintf (stream, "%s ", mapname);
4868 need_semicolon = false;
4870 for (i = 0; i < 0x110000; i++)
4876 strcat (buf, ucs_symbol (i));
4878 strcat (buf, ucs_symbol (func (i)));
4883 fprintf (stream, ";");
4887 if (column + strlen (buf) > max_column)
4889 fprintf (stream, "/\n ");
4893 fprintf (stream, "%s", buf);
4894 column += strlen (buf);
4895 need_semicolon = true;
4897 fprintf (stream, "\n");
/* Output the width table.
   Currently a placeholder: nothing is written to STREAM.  output_tables
   calls it between the "totitle" map and the "END LC_CTYPE" line.  */
static void
output_widthmap (FILE *stream)
{
  (void) stream; /* intentionally unused */
}
/* Writes a locale definition source for Unicode version VERSION to FILENAME.
   The output has an LC_IDENTIFICATION section (title, source, revision, the
   current UTC date, ...) followed by an LC_CTYPE section with one line per
   character class and per character mapping.
   Before LC_CTYPE is written, every code point 0..0x10FFFF is run through a
   set of consistency checks between the is_* predicates and the to_* mappings;
   each violation is reported on stderr, and output continues regardless.
   NOTE(review): the embedded line numbers skip (e.g. 4910 -> 4915,
   4915 -> 4918, 4939 -> 4944), so the return type, the braces, the local
   declarations, the fopen failure check and some "fprintf (stderr," opener
   lines are not present in this text -- confirm against the pristine file.  */
4907 /* Output the tables to the given file. */
4910 output_tables (const char *filename, const char *version)
4915 stream = fopen (filename, "w");
4918 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4922 fprintf (stream, "escape_char /\n");
4923 fprintf (stream, "comment_char %%\n");
4924 fprintf (stream, "\n");
4925 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4927 fprintf (stream, "\n");
4929 fprintf (stream, "LC_IDENTIFICATION\n");
4930 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4931 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4932 fprintf (stream, "address \"\"\n");
4933 fprintf (stream, "contact \"\"\n");
4934 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4935 fprintf (stream, "tel \"\"\n");
4936 fprintf (stream, "fax \"\"\n");
4937 fprintf (stream, "language \"\"\n");
4938 fprintf (stream, "territory \"Earth\"\n");
4939 fprintf (stream, "revision \"%s\"\n", version);
4944 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4945 fprintf (stream, "date \"%s\"\n", date);
4947 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4948 fprintf (stream, "END LC_IDENTIFICATION\n");
4949 fprintf (stream, "\n");
4951 /* Verifications. */
4952 for (ch = 0; ch < 0x110000; ch++)
4954 /* toupper restriction: "Only characters specified for the keywords
4955 lower and upper shall be specified." */
4956 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4958 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4959 ucs_symbol (ch), ch, to_upper (ch));
4961 /* tolower restriction: "Only characters specified for the keywords
4962 lower and upper shall be specified." */
4963 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4965 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4966 ucs_symbol (ch), ch, to_lower (ch));
4968 /* alpha restriction: "Characters classified as either upper or lower
4969 shall automatically belong to this class." */
4970 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4971 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4973 /* alpha restriction: "No character specified for the keywords cntrl,
4974 digit, punct or space shall be specified." */
4975 if (is_alpha (ch) && is_cntrl (ch))
4976 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4977 if (is_alpha (ch) && is_digit (ch))
4978 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4979 if (is_alpha (ch) && is_punct (ch))
4980 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4981 if (is_alpha (ch) && is_space (ch))
4982 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4984 /* space restriction: "No character specified for the keywords upper,
4985 lower, alpha, digit, graph or xdigit shall be specified."
4986 upper, lower, alpha already checked above. */
4987 if (is_space (ch) && is_digit (ch))
4988 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4989 if (is_space (ch) && is_graph (ch))
4990 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4991 if (is_space (ch) && is_xdigit (ch))
4992 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4994 /* cntrl restriction: "No character specified for the keywords upper,
4995 lower, alpha, digit, punct, graph, print or xdigit shall be
4996 specified." upper, lower, alpha already checked above. */
4997 if (is_cntrl (ch) && is_digit (ch))
4998 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
4999 if (is_cntrl (ch) && is_punct (ch))
5000 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5001 if (is_cntrl (ch) && is_graph (ch))
5002 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5003 if (is_cntrl (ch) && is_print (ch))
5004 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5005 if (is_cntrl (ch) && is_xdigit (ch))
5006 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5008 /* punct restriction: "No character specified for the keywords upper,
5009 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5010 be specified." upper, lower, alpha, cntrl already checked above. */
5011 if (is_punct (ch) && is_digit (ch))
5012 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5013 if (is_punct (ch) && is_xdigit (ch))
5014 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5015 if (is_punct (ch) && (ch == 0x0020))
5016 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5018 /* graph restriction: "No character specified for the keyword cntrl
5019 shall be specified." Already checked above. */
5021 /* print restriction: "No character specified for the keyword cntrl
5022 shall be specified." Already checked above. */
5024 /* graph - print relation: differ only in the <space> character.
5025 How is this possible if there is more than one space character?!
5026 I think susv2/xbd/locale.html should speak of "space characters",
5027 not "space character". */
5028 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5030 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5031 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5033 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5036 fprintf (stream, "LC_CTYPE\n");
5037 output_charclass (stream, "upper", is_upper);
5038 output_charclass (stream, "lower", is_lower);
5039 output_charclass (stream, "alpha", is_alpha);
5040 output_charclass (stream, "digit", is_digit);
5041 output_charclass (stream, "outdigit", is_outdigit);
5042 output_charclass (stream, "blank", is_blank);
5043 output_charclass (stream, "space", is_space);
5044 output_charclass (stream, "cntrl", is_cntrl);
5045 output_charclass (stream, "punct", is_punct);
5046 output_charclass (stream, "xdigit", is_xdigit);
5047 output_charclass (stream, "graph", is_graph);
5048 output_charclass (stream, "print", is_print);
5049 output_charclass (stream, "class \"combining\";", is_combining);
5050 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5051 output_charmap (stream, "toupper", to_upper);
5052 output_charmap (stream, "tolower", to_lower);
5053 output_charmap (stream, "map \"totitle\";", to_title);
5054 output_widthmap (stream);
5055 fprintf (stream, "END LC_CTYPE\n");
5057 if (ferror (stream) || fclose (stream))
5059 fprintf (stderr, "error writing to '%s'\n", filename);
5066 /* ========================================================================= */
/* Indexed by code point (0..0x10FFFF) and filled in by fill_width.  The
   entries are either string literals or strings obtained from strdup.  */
5068 /* The width property from the EastAsianWidth.txt file.
5069 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
5070 const char * unicode_width[0x110000];
/* Reads WIDTH_FILENAME and records the width property of each listed code
   point in unicode_width[].  Every entry is first preset to "N" when the
   code point has a name in unicode_attributes[], and to NULL otherwise.
   Each data line is then split into three fields: field0, ended by ';', is
   parsed as a hexadecimal code point or as a "first..last" range, and a
   strdup'ed copy of field1 is stored as the width.
   NOTE(review): the embedded line numbers skip (e.g. 5072 -> 5075,
   5090 -> 5105, 5111 -> 5116, 5123 -> 5125), so the end of the original
   leading comment, the return type, the braces, the read loop header, the
   line-skipping condition and the range loop are not present in this text --
   confirm against the pristine file.
   NOTE(review): the result of strdup is stored unchecked in the lines that
   are visible here.  */
5072 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5075 fill_width (const char *width_filename)
5079 char field0[FIELDLEN];
5080 char field1[FIELDLEN];
5081 char field2[FIELDLEN];
5084 for (i = 0; i < 0x110000; i++)
5085 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5087 stream = fopen (width_filename, "r");
5090 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5105 do c = getc (stream); while (c != EOF && c != '\n');
5109 n = getfield (stream, field0, ';');
5110 n += getfield (stream, field1, ' ');
5111 n += getfield (stream, field2, '\n');
5116 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5119 i = strtoul (field0, NULL, 16);
5120 if (strstr (field0, "..") != NULL)
5122 /* Deal with a range. */
5123 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5125 unicode_width[i] = strdup (field1);
5129 /* Single character line. */
5130 unicode_width[i] = strdup (field1);
5133 if (ferror (stream) || fclose (stream))
5135 fprintf (stderr, "error reading from '%s'\n", width_filename);
/* Line breaking classification.
   Each constant is a bit position: get_lbp reports the classes of a
   character as a mask built from (1 << LBP_xx).  The values 0..23 and
   24..31 form two groups; see the comment on the first enumerator.  */
enum
{
  /* Values >= 24 are resolved at run time.  */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
5183 /* Returns the line breaking classification for ch, as a bit mask. */
/* Bit (1 << LBP_xx) of the result is set for every class LBP_xx whose test
   below matches ch.  The tests use the code point itself, the general
   category and name in unicode_attributes[ch], and the East Asian width in
   unicode_width[ch].
   The order of the tests matters, because later tests inspect bits set by
   earlier ones: LBP_PR is suppressed by LBP_PO; LBP_CM by BK/BA/GL/SA/WJ/ZW;
   LBP_ID/LBP_AI by NS/CM; LBP_AL/LBP_AI by most of the other classes.
   output_lbp () expects the result to contain exactly one bit.
   NOTE(review): many lines of this function (declarations, braces, "else",
   parts of some conditions, possibly preprocessor conditionals) are not
   visible in this excerpt; the comments describe only what is shown.  */
5185 get_lbp (unsigned int ch)
/* The classification tests apply to code points that have a name.  */
5189 if (unicode_attributes[ch].name != NULL)
5191 /* mandatory break */
5192 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5193 || ch == 0x000C /* form feed */
5194 || ch == 0x000B /* line tabulation */
5195 || ch == 0x2028 /* LINE SEPARATOR */
5196 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5197 attr |= 1 << LBP_BK;
/* word joiner */
5199 if (ch == 0x2060 /* WORD JOINER */
5200 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5201 attr |= 1 << LBP_WJ;
5203 /* zero width space */
5204 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5205 attr |= 1 << LBP_ZW;
5207 /* non-breaking (glue) */
5208 if (ch == 0x00A0 /* NO-BREAK SPACE */
5209 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5210 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5211 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5212 || ch == 0x2007 /* FIGURE SPACE */
5213 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5214 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5215 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5216 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5217 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5218 attr |= 1 << LBP_GL;
/* space */
5221 if (ch == 0x0020 /* SPACE */)
5222 attr |= 1 << LBP_SP;
5224 /* break opportunity before and after */
5225 if (ch == 0x2014 /* EM DASH */)
5226 attr |= 1 << LBP_B2;
5228 /* break opportunity after */
5229 if (ch == 0x1680 /* OGHAM SPACE MARK */
5230 || ch == 0x2000 /* EN QUAD */
5231 || ch == 0x2001 /* EM QUAD */
5232 || ch == 0x2002 /* EN SPACE */
5233 || ch == 0x2003 /* EM SPACE */
5234 || ch == 0x2004 /* THREE-PER-EM SPACE */
5235 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5236 || ch == 0x2006 /* SIX-PER-EM SPACE */
5237 || ch == 0x2008 /* PUNCTUATION SPACE */
5238 || ch == 0x2009 /* THIN SPACE */
5239 || ch == 0x200A /* HAIR SPACE */
5240 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5241 || ch == 0x0009 /* tab */
5242 || ch == 0x00AD /* SOFT HYPHEN */
5243 || ch == 0x058A /* ARMENIAN HYPHEN */
5244 || ch == 0x2010 /* HYPHEN */
5245 || ch == 0x2012 /* FIGURE DASH */
5246 || ch == 0x2013 /* EN DASH */
5247 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5248 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5249 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5250 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5251 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5252 || ch == 0x2027 /* HYPHENATION POINT */
5253 || ch == 0x007C /* VERTICAL LINE */
5254 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5255 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5256 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5257 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5258 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5259 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5260 || ch == 0x205A /* TWO DOT PUNCTUATION */
5261 || ch == 0x205B /* FOUR DOT MARK */
5262 || ch == 0x205D /* TRICOLON */
5263 || ch == 0x205E /* VERTICAL FOUR DOTS */
5264 || ch == 0x2E19 /* PALM BRANCH */
5265 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5266 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5267 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5268 || ch == 0x2E2D /* FIVE DOT MARK */
5269 || ch == 0x2E30 /* RING POINT */
5270 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5271 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5272 || ch == 0x10102 /* AEGEAN CHECK MARK */
5273 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5274 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5275 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5276 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5277 || ch == 0x0964 /* DEVANAGARI DANDA */
5278 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5279 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5280 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5281 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5282 || ch == 0x104B /* MYANMAR SIGN SECTION */
5283 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5284 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5285 || ch == 0x17D4 /* KHMER SIGN KHAN */
5286 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5287 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5288 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5289 || ch == 0xA8CE /* SAURASHTRA DANDA */
5290 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5291 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5292 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5293 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5294 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5295 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5296 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5297 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5298 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5299 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5300 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5301 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
/* NOTE(review): 0x1802, 0x1803, 0x1808, 0x1809, 0x2CF9 and 0x2CFE are listed
   here and again in the exclamation/interrogation (LBP_EX) test below.  If
   both lists are compiled as shown, these characters receive two bits.
   Presumably a conditional that selects one of the two lists is not visible
   in this excerpt — confirm.  */
5303 || ch == 0x1802 /* MONGOLIAN COMMA */
5304 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5306 || ch == 0x1804 /* MONGOLIAN COLON */
5307 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5309 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5310 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5312 || ch == 0x1B5A /* BALINESE PANTI */
5313 || ch == 0x1B5B /* BALINESE PAMADA */
5314 || ch == 0x1B5C /* BALINESE WINDU */
5315 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5316 || ch == 0x1B60 /* BALINESE PAMENENG */
5317 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5318 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5319 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5320 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5321 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5322 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5323 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5325 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5327 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5328 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5329 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5331 || ch == 0x2CFE /* COPTIC FULL STOP */
5333 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5334 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5335 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5336 || ch == 0xA60D /* VAI COMMA */
5337 || ch == 0xA60F /* VAI QUESTION MARK */
5338 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5339 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5340 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5341 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5342 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5343 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5344 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5345 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5346 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5348 || ch == 0x1A1E /* BUGINESE PALLAWA */
5350 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5351 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5352 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5353 attr |= 1 << LBP_BA;
5355 /* break opportunity before */
5356 if (ch == 0x00B4 /* ACUTE ACCENT */
5358 || ch == 0x1FFD /* GREEK OXIA */
5359 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5361 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5362 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5363 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5364 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5365 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5366 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5367 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5368 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5369 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5370 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5371 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5372 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5373 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5374 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5375 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5376 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5377 attr |= 1 << LBP_BB;
/* hyphen */
5380 if (ch == 0x002D /* HYPHEN-MINUS */)
5381 attr |= 1 << LBP_HY;
5383 /* contingent break opportunity */
5384 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5385 attr |= 1 << LBP_CB;
5387 /* closing punctuation */
/* General category "Pe", plus the listed commas and full stops.  */
5388 if ((unicode_attributes[ch].category[0] == 'P'
5389 && unicode_attributes[ch].category[1] == 'e')
5390 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5391 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5392 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5393 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5394 || ch == 0xFE50 /* SMALL COMMA */
5395 || ch == 0xFE52 /* SMALL FULL STOP */
5396 || ch == 0xFF0C /* FULLWIDTH COMMA */
5397 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5398 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5399 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5400 attr |= 1 << LBP_CL;
5402 /* exclamation/interrogation */
/* NOTE(review): besides the overlap with the LBP_BA list noted above,
   0x060C also appears in the infix separator (LBP_IS) test and 0x066A in
   the postfix (LBP_PO) test below.  Presumably conditionals that are not
   visible in this excerpt keep only one occurrence of each — confirm.  */
5403 if (ch == 0x0021 /* EXCLAMATION MARK */
5404 || ch == 0x003F /* QUESTION MARK */
5405 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5407 || ch == 0x060C /* ARABIC COMMA */
5409 || ch == 0x061B /* ARABIC SEMICOLON */
5410 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5411 || ch == 0x061F /* ARABIC QUESTION MARK */
5413 || ch == 0x066A /* ARABIC PERCENT SIGN */
5415 || ch == 0x06D4 /* ARABIC FULL STOP */
5416 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5417 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5418 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5419 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5420 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5421 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5422 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5424 || ch == 0x1802 /* MONGOLIAN COMMA */
5425 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5426 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5427 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5429 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5430 || ch == 0x1945 /* LIMBU QUESTION MARK */
5431 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5432 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5434 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5435 || ch == 0x2CFE /* COPTIC FULL STOP */
5437 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5438 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5439 || ch == 0xA60E /* VAI FULL STOP */
5440 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5441 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5442 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5443 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5444 || ch == 0xFE56 /* SMALL QUESTION MARK */
5445 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5446 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5447 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5448 attr |= 1 << LBP_EX;
/* inseparable */
5451 if (ch == 0x2024 /* ONE DOT LEADER */
5452 || ch == 0x2025 /* TWO DOT LEADER */
5453 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5454 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5455 attr |= 1 << LBP_IN;
/* non starter */
5458 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5459 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5460 || ch == 0x203D /* INTERROBANG */
5461 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5462 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5463 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5464 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5465 || ch == 0x301C /* WAVE DASH */
5466 || ch == 0x303C /* MASU MARK */
5467 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5468 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5469 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5470 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5471 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5472 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5473 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5474 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5475 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5476 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5477 || ch == 0xA015 /* YI SYLLABLE WU */
5478 || ch == 0xFE54 /* SMALL SEMICOLON */
5479 || ch == 0xFE55 /* SMALL COLON */
5480 || ch == 0xFF1A /* FULLWIDTH COLON */
5481 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5482 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5483 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5484 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5485 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5486 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5487 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5488 attr |= 1 << LBP_NS;
5490 /* opening punctuation */
/* General category "Ps", plus the listed inverted marks.  */
5491 if ((unicode_attributes[ch].category[0] == 'P'
5492 && unicode_attributes[ch].category[1] == 's')
5494 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5495 || ch == 0x00BF /* INVERTED QUESTION MARK */
5497 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5498 attr |= 1 << LBP_OP;
5500 /* ambiguous quotation */
/* General categories "Pf" and "Pi", plus the listed quotation marks.  */
5501 if ((unicode_attributes[ch].category[0] == 'P'
5502 && (unicode_attributes[ch].category[1] == 'f'
5503 || unicode_attributes[ch].category[1] == 'i'))
5504 || ch == 0x0022 /* QUOTATION MARK */
5505 || ch == 0x0027 /* APOSTROPHE */
5506 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5507 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5508 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5509 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5510 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5511 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5512 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5513 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5514 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5515 || ch == 0x2E0B /* RAISED SQUARE */)
5516 attr |= 1 << LBP_QU;
5518 /* infix separator (numeric) */
5519 if (ch == 0x002C /* COMMA */
5520 || ch == 0x002E /* FULL STOP */
5521 || ch == 0x003A /* COLON */
5522 || ch == 0x003B /* SEMICOLON */
5523 || ch == 0x037E /* GREEK QUESTION MARK */
5524 || ch == 0x0589 /* ARMENIAN FULL STOP */
5526 || ch == 0x060C /* ARABIC COMMA */
5528 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5529 || ch == 0x07F8 /* NKO COMMA */
5530 || ch == 0x2044 /* FRACTION SLASH */
5531 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5532 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5533 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5534 attr |= 1 << LBP_IS;
/* numeric */
/* General category "Nd" whose name does not contain "FULLWIDTH", plus the
   two Arabic separators.  */
5537 if ((unicode_attributes[ch].category[0] == 'N'
5538 && unicode_attributes[ch].category[1] == 'd'
5539 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5540 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5541 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5542 attr |= 1 << LBP_NU;
5544 /* postfix (numeric) */
5545 if (ch == 0x0025 /* PERCENT SIGN */
5546 || ch == 0x00A2 /* CENT SIGN */
5547 || ch == 0x00B0 /* DEGREE SIGN */
5548 || ch == 0x060B /* AFGHANI SIGN */
5550 || ch == 0x066A /* ARABIC PERCENT SIGN */
5552 || ch == 0x2030 /* PER MILLE SIGN */
5553 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5554 || ch == 0x2032 /* PRIME */
5555 || ch == 0x2033 /* DOUBLE PRIME */
5556 || ch == 0x2034 /* TRIPLE PRIME */
5557 || ch == 0x2035 /* REVERSED PRIME */
5558 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5559 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5560 || ch == 0x20A7 /* PESETA SIGN */
5561 || ch == 0x2103 /* DEGREE CELSIUS */
5562 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5563 || ch == 0xFDFC /* RIAL SIGN */
5564 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5565 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5566 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */)
5567 attr |= 1 << LBP_PO;
5569 /* prefix (numeric) */
/* General category "Sc", plus the listed signs.  */
5570 if ((unicode_attributes[ch].category[0] == 'S'
5571 && unicode_attributes[ch].category[1] == 'c')
5572 || ch == 0x002B /* PLUS SIGN */
5573 || ch == 0x005C /* REVERSE SOLIDUS */
5574 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5575 || ch == 0x2116 /* NUMERO SIGN */
5576 || ch == 0x2212 /* MINUS SIGN */
5577 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* Characters already classified as postfix do not also become prefix.  */
5578 if (!(attr & (1 << LBP_PO)))
5579 attr |= 1 << LBP_PR;
5581 /* symbols allowing breaks */
5582 if (ch == 0x002F /* SOLIDUS */)
5583 attr |= 1 << LBP_SY;
/* Hangul LV syllable: code points in 0xAC00..0xD7A3 whose offset from
   0xAC00 is a multiple of 28.  */
5585 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5586 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: the remaining code points in 0xAC00..0xD7A3.  */
5588 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5589 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5591 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5592 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5594 if (ch >= 0x1160 && ch <= 0x11A2)
5595 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5597 if (ch >= 0x11A8 && ch <= 0x11F9)
5598 attr |= 1 << LBP_JT;
5600 /* complex context (South East Asian) */
/* Requires both a matching category (Cf, Lm, Lo, Mc, Mn, or one of the two
   listed extra characters) and a code point inside one of the four ranges
   at the end of the condition.  */
5601 if (((unicode_attributes[ch].category[0] == 'C'
5602 && unicode_attributes[ch].category[1] == 'f')
5603 || (unicode_attributes[ch].category[0] == 'L'
5604 && (unicode_attributes[ch].category[1] == 'm'
5605 || unicode_attributes[ch].category[1] == 'o'))
5606 || (unicode_attributes[ch].category[0] == 'M'
5607 && (unicode_attributes[ch].category[1] == 'c'
5608 || unicode_attributes[ch].category[1] == 'n'))
5609 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5610 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5611 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5612 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5613 || (ch >= 0x1000 && ch <= 0x109F)
5614 || (ch >= 0x1780 && ch <= 0x17FF)
5615 || (ch >= 0x1950 && ch <= 0x19DF)))
5616 attr |= 1 << LBP_SA;
5618 /* attached characters and combining marks */
/* General categories Mc, Me, Mn, Cc and Cf.  */
5619 if ((unicode_attributes[ch].category[0] == 'M'
5620 && (unicode_attributes[ch].category[1] == 'c'
5621 || unicode_attributes[ch].category[1] == 'e'
5622 || unicode_attributes[ch].category[1] == 'n'))
5623 || (unicode_attributes[ch].category[0] == 'C'
5624 && (unicode_attributes[ch].category[1] == 'c'
5625 || unicode_attributes[ch].category[1] == 'f')))
/* Only when none of BK, BA, GL, SA, WJ, ZW has been assigned above.  */
5626 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5627 attr |= 1 << LBP_CM;
/* ideographic */
5630 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5631 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5632 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5633 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5634 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5635 || (ch >= 0x4E00 && ch <= 0x9FBB) /* CJK Ideograph */
5636 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5637 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5638 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5639 || ch == 0xFE62 /* SMALL PLUS SIGN */
5640 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5641 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5642 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5643 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5644 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5645 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5646 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5647 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5648 || (ch >= 0x3000 && ch <= 0x33FF
5649 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5650 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5651 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5652 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5653 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5654 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5655 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5656 || ch == 0xFE45 /* SESAME DOT */
5657 || ch == 0xFE46 /* WHITE SESAME DOT */
5658 || ch == 0xFE49 /* DASHED OVERLINE */
5659 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5660 || ch == 0xFE4B /* WAVY OVERLINE */
5661 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5662 || ch == 0xFE4D /* DASHED LOW LINE */
5663 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5664 || ch == 0xFE4F /* WAVY LOW LINE */
5665 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5666 || ch == 0xFE58 /* SMALL EM DASH */
5667 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5668 || ch == 0xFE60 /* SMALL AMPERSAND */
5669 || ch == 0xFE61 /* SMALL ASTERISK */
5670 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5671 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5672 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5673 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5674 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5675 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5676 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5677 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5678 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5679 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5680 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5681 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5682 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5683 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5684 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5685 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5686 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5687 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5688 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5689 || ch == 0xFF5E /* FULLWIDTH TILDE */
5690 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5691 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5692 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* Non-starters and combining marks keep their earlier classification.  */
5693 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5695 /* ambiguous (ideograph) ? */
/* NOTE(review): part of this condition, and whatever separates the LBP_AI
   assignment from the LBP_ID assignment (presumably an "else"), is not
   visible in this excerpt.  */
5696 if ((unicode_width[ch] != NULL
5697 && unicode_width[ch][0] == 'A'
5699 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5700 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5701 attr |= 1 << LBP_AI;
5703 attr |= 1 << LBP_ID;
5706 /* ordinary alphabetic and symbol characters */
/* General categories Lu, Ll, Lt, Lm, Lo, Sm, Sk, So, Nl, No, Pc, Pd, Po,
   plus the listed format characters.  */
5707 if ((unicode_attributes[ch].category[0] == 'L'
5708 && (unicode_attributes[ch].category[1] == 'u'
5709 || unicode_attributes[ch].category[1] == 'l'
5710 || unicode_attributes[ch].category[1] == 't'
5711 || unicode_attributes[ch].category[1] == 'm'
5712 || unicode_attributes[ch].category[1] == 'o'))
5713 || (unicode_attributes[ch].category[0] == 'S'
5714 && (unicode_attributes[ch].category[1] == 'm'
5715 || unicode_attributes[ch].category[1] == 'k'
5716 || unicode_attributes[ch].category[1] == 'o'))
5717 || (unicode_attributes[ch].category[0] == 'N'
5718 && (unicode_attributes[ch].category[1] == 'l'
5719 || unicode_attributes[ch].category[1] == 'o'))
5720 || (unicode_attributes[ch].category[0] == 'P'
5721 && (unicode_attributes[ch].category[1] == 'c'
5722 || unicode_attributes[ch].category[1] == 'd'
5723 || unicode_attributes[ch].category[1] == 'o'))
5724 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5725 || ch == 0x0601 /* ARABIC SIGN SANAH */
5726 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5727 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5728 || ch == 0x06DD /* ARABIC END OF AYAH */
5729 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5730 || ch == 0x2061 /* FUNCTION APPLICATION */
5731 || ch == 0x2062 /* INVISIBLE TIMES */
5732 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5733 || ch == 0x2064 /* INVISIBLE PLUS */)
/* Only characters that none of the more specific tests above has claimed.
   BK, CM, WJ, ZW, SP, AI and XX are deliberately absent from this mask.  */
5734 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5736 /* ambiguous (alphabetic) ? */
/* NOTE(review): part of this condition, and whatever separates the LBP_AI
   assignment from the LBP_AL assignment (presumably an "else"), is not
   visible in this excerpt.  */
5737 if ((unicode_width[ch] != NULL
5738 && unicode_width[ch][0] == 'A'
5740 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5741 && ch != 0x2022 /* BULLET */
5742 && ch != 0x203E /* OVERLINE */
5743 && ch != 0x2126 /* OHM SIGN */
5744 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5745 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5746 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5747 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5748 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5749 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5750 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5751 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5753 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5754 || ch == 0x00A7 /* SECTION SIGN */
5755 || ch == 0x00A8 /* DIAERESIS */
5756 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5757 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5758 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5759 || ch == 0x00B6 /* PILCROW SIGN */
5760 || ch == 0x00B7 /* MIDDLE DOT */
5761 || ch == 0x00B8 /* CEDILLA */
5762 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5763 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5764 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5765 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5766 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5767 || ch == 0x00BF /* INVERTED QUESTION MARK */
5768 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5769 || ch == 0x00F7 /* DIVISION SIGN */
5770 || ch == 0x02C7 /* CARON */
5771 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5772 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5773 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5774 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5775 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5776 || ch == 0x02D8 /* BREVE */
5777 || ch == 0x02D9 /* DOT ABOVE */
5778 || ch == 0x02DA /* RING ABOVE */
5779 || ch == 0x02DB /* OGONEK */
5780 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5782 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5783 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5784 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5785 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5786 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5787 || ch == 0x2616 /* WHITE SHOGI PIECE */
5788 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5789 attr |= 1 << LBP_AI;
5791 attr |= 1 << LBP_AL;
/* Clears a combining-mark bit that the LBP_CM test above may have set.
   NOTE(review): the scope this statement belongs to is not visible here.  */
5792 attr &= ~(1 << LBP_CM);
/* unknown */
/* NOTE(review): the condition guarding this fallback is not visible here.
   LBP_XX is 31, so where int is 32 bits wide "1 << LBP_XX" shifts into the
   sign bit of a signed int, which the C standard leaves undefined.  The
   callers store the result in an int and compare it with the same
   expression; confirm the intended type of attr.  */
5798 attr |= 1 << LBP_XX;
5803 /* Output the line breaking properties in a human readable format. */
/* Writes to STREAM one line per code point: the code point as "0x" followed
   by at least four uppercase hexadecimal digits, then " LBP_xx" for every
   class bit set in get_lbp ()'s result for that code point.
   NOTE(review): some lines of this function (declarations, braces) are not
   visible in this excerpt.  */
5805 debug_output_lbp (FILE *stream)
5809 for (i = 0; i < 0x110000; i++)
5811 int attr = get_lbp (i);
/* Code points whose only class is "unknown" are not listed.  */
5812 if (attr != 1 << LBP_XX)
5814 fprintf (stream, "0x%04X", i);
/* PRINT_BIT tests one bit of the mask; #bit turns the constant's name into
   the string that is printed.  */
5815 #define PRINT_BIT(attr,bit) \
5816 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5817 PRINT_BIT(attr,LBP_BK);
5818 PRINT_BIT(attr,LBP_CM);
5819 PRINT_BIT(attr,LBP_WJ);
5820 PRINT_BIT(attr,LBP_ZW);
5821 PRINT_BIT(attr,LBP_GL);
5822 PRINT_BIT(attr,LBP_SP);
5823 PRINT_BIT(attr,LBP_B2);
5824 PRINT_BIT(attr,LBP_BA);
5825 PRINT_BIT(attr,LBP_BB);
5826 PRINT_BIT(attr,LBP_HY);
5827 PRINT_BIT(attr,LBP_CB);
5828 PRINT_BIT(attr,LBP_CL);
5829 PRINT_BIT(attr,LBP_EX);
5830 PRINT_BIT(attr,LBP_IN);
5831 PRINT_BIT(attr,LBP_NS);
5832 PRINT_BIT(attr,LBP_OP);
5833 PRINT_BIT(attr,LBP_QU);
5834 PRINT_BIT(attr,LBP_IS);
5835 PRINT_BIT(attr,LBP_NU);
5836 PRINT_BIT(attr,LBP_PO);
5837 PRINT_BIT(attr,LBP_PR);
5838 PRINT_BIT(attr,LBP_SY);
5839 PRINT_BIT(attr,LBP_AI);
5840 PRINT_BIT(attr,LBP_AL);
5841 PRINT_BIT(attr,LBP_H2);
5842 PRINT_BIT(attr,LBP_H3);
5843 PRINT_BIT(attr,LBP_ID);
5844 PRINT_BIT(attr,LBP_JL);
5845 PRINT_BIT(attr,LBP_JV);
5846 PRINT_BIT(attr,LBP_JT);
5847 PRINT_BIT(attr,LBP_SA);
5848 PRINT_BIT(attr,LBP_XX);
5850 fprintf (stream, "\n");
/* Opens the file named FILENAME in "w" mode, writes the listing produced by
   debug_output_lbp () to it and closes it.  Failures to open or to write
   are reported on stderr.
   NOTE(review): the statements that follow each error message are not
   visible in this excerpt.  */
5856 debug_output_lbrk_tables (const char *filename)
5860 stream = fopen (filename, "w");
5863 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5867 debug_output_lbp (stream);
/* A failing fclose () counts as a write error as well.  */
5869 if (ferror (stream) || fclose (stream))
5871 fprintf (stderr, "error writing to '%s'\n", filename);
5876 /* The line breaking property from the LineBreak.txt file. */
/* Indexed by code point.  Each entry holds a single LBP_* value (not a bit
   mask); fill_org_lbp () initializes every entry to LBP_XX before reading
   the file.  */
5877 int unicode_org_lbp[0x110000];
5879 /* Stores in unicode_org_lbp[] the line breaking property from the
5880 LineBreak.txt file. */
/* LINEBREAK_FILENAME names the file to read.  Every entry starts out as
   LBP_XX.  Each data line supplies, in its first field, either one
   hexadecimal code point or a range "XXXX..YYYY", and in its second field
   the two-letter class name, which is translated to an LBP_* value.
   Failures are reported on stderr.
   NOTE(review): many lines of this function (declarations, braces, loop
   headers, the list of TRY invocations, the statements after each error
   message) are not visible in this excerpt.  */
5882 fill_org_lbp (const char *linebreak_filename)
5886 char field0[FIELDLEN];
5887 char field1[FIELDLEN];
5888 char field2[FIELDLEN];
5891 for (i = 0; i < 0x110000; i++)
5892 unicode_org_lbp[i] = LBP_XX;
5894 stream = fopen (linebreak_filename, "r");
5897 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume the rest of the current line, up to and including the newline
   (or up to EOF).  NOTE(review): presumably this handles comment lines; the
   condition that guards it is not visible here.  */
5913 do c = getc (stream); while (c != EOF && c != '\n');
/* Read three fields, delimited by ';', ' ' and '\n' respectively.  */
5917 n = getfield (stream, field0, ';');
5918 n += getfield (stream, field1, ' ');
5919 n += getfield (stream, field2, '\n');
5924 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY (LBP_xx) compares field1 with the name of the constant minus its
   first four characters ("LBP_"): #bit is the stringized name, and adding 4
   to it skips that prefix.  */
5928 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Classes without a constant of their own: LF, CR and NL are treated as
   mandatory breaks, SG (surrogates) as unknown.  */
5963 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5964 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5965 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5966 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5969 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5970 field1, linebreak_filename, lineno);
/* field0 starts with the (first) code point, in hexadecimal.  */
5973 i = strtoul (field0, NULL, 16);
5974 if (strstr (field0, "..") != NULL)
5976 /* Deal with a range. */
/* The end of the range is the hexadecimal number that follows "..".
   NOTE(review): in the visible code neither i nor j is validated against
   the array size 0x110000 before being used as an index.  */
5977 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5979 unicode_org_lbp[i] = value;
5983 /* Single character line. */
5984 unicode_org_lbp[i] = value;
/* Because of the short-circuit ||, fclose () is only called when no read
   error is pending on the stream.  */
5987 if (ferror (stream) || fclose (stream))
5989 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5994 /* Output the line breaking properties in a human readable format. */
/* Writes to STREAM the contents of unicode_org_lbp[]: the code point as "0x"
   followed by at least four uppercase hexadecimal digits, then " LBP_xx"
   for the class stored for it.
   NOTE(review): some lines of this function (declarations, braces, possibly
   a condition that filters entries) are not visible in this excerpt.  */
5996 debug_output_org_lbp (FILE *stream)
6000 for (i = 0; i < 0x110000; i++)
6002 int attr = unicode_org_lbp[i];
6005 fprintf (stream, "0x%04X", i);
/* Here attr is a single LBP_* value, so PRINT_BIT compares for equality
   instead of testing a bit; #bit is the printed name.
   NOTE(review): debug_output_lbp () defines PRINT_BIT with a different
   body; presumably an #undef between the two is not visible here.  */
6006 #define PRINT_BIT(attr,bit) \
6007 if (attr == bit) fprintf (stream, " " #bit);
6008 PRINT_BIT(attr,LBP_BK);
6009 PRINT_BIT(attr,LBP_CM);
6010 PRINT_BIT(attr,LBP_WJ);
6011 PRINT_BIT(attr,LBP_ZW);
6012 PRINT_BIT(attr,LBP_GL);
6013 PRINT_BIT(attr,LBP_SP);
6014 PRINT_BIT(attr,LBP_B2);
6015 PRINT_BIT(attr,LBP_BA);
6016 PRINT_BIT(attr,LBP_BB);
6017 PRINT_BIT(attr,LBP_HY);
6018 PRINT_BIT(attr,LBP_CB);
6019 PRINT_BIT(attr,LBP_CL);
6020 PRINT_BIT(attr,LBP_EX);
6021 PRINT_BIT(attr,LBP_IN);
6022 PRINT_BIT(attr,LBP_NS);
6023 PRINT_BIT(attr,LBP_OP);
6024 PRINT_BIT(attr,LBP_QU);
6025 PRINT_BIT(attr,LBP_IS);
6026 PRINT_BIT(attr,LBP_NU);
6027 PRINT_BIT(attr,LBP_PO);
6028 PRINT_BIT(attr,LBP_PR);
6029 PRINT_BIT(attr,LBP_SY);
6030 PRINT_BIT(attr,LBP_AI);
6031 PRINT_BIT(attr,LBP_AL);
6032 PRINT_BIT(attr,LBP_H2);
6033 PRINT_BIT(attr,LBP_H3);
6034 PRINT_BIT(attr,LBP_ID);
6035 PRINT_BIT(attr,LBP_JL);
6036 PRINT_BIT(attr,LBP_JV);
6037 PRINT_BIT(attr,LBP_JT);
6038 PRINT_BIT(attr,LBP_SA);
6039 PRINT_BIT(attr,LBP_XX);
6041 fprintf (stream, "\n");
/* Create (or truncate) the file FILENAME and write into it the dump
   produced by debug_output_org_lbp.  Failures are reported on stderr
   with the file name included in the message.  */
6047 debug_output_org_lbrk_tables (const char *filename)
6051 stream = fopen (filename, "w");
6054 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6058 debug_output_org_lbp (stream);
/* Check both the stream's error indicator and the result of fclose:
   buffered data is flushed on close, so a write error may only show up
   there.  */
6060 if (ferror (stream) || fclose (stream))
6062 fprintf (stderr, "error writing to '%s'\n", filename);
6067 /* Construction of sparse 3-level tables. */
/* Configuration macros for the table code: the table is called
   lbp_table, its elements are unsigned char, and LBP_XX is the default
   element value.  The lbp_table_init, lbp_table_add and
   lbp_table_finalize functions used by output_lbp are presumably
   derived from TABLE by the template that consumes these macros --
   TODO confirm against that template.  */
6068 #define TABLE lbp_table
6069 #define ELEMENT unsigned char
6070 #define DEFAULT LBP_XX
/* The allocator names are mapped straight onto malloc and realloc, so
   no out-of-memory check is added by these two macros themselves.  */
6071 #define xmalloc malloc
6072 #define xrealloc realloc
/* Build the 3-level line breaking property table and emit it as C
   source.  STREAM1 receives the lbrkprop_header_N macros, the
   lbrkprop_t typedef and the extern declaration of unilbrkprop;
   STREAM2 receives the initializer of unilbrkprop.  */
6076 output_lbp (FILE *stream1, FILE *stream2)
6080 unsigned int level1_offset, level2_offset, level3_offset;
6084 lbp_table_init (&t);
/* Feed every code point from 0 to 0x10FFFF into the table.  */
6086 for (i = 0; i < 0x110000; i++)
6088 int attr = get_lbp (i);
6090 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so the condition is true
   when attr is zero or has more than one bit set.  */
6091 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Code points whose only bit is the LBP_XX one are not added.  */
6094 if (attr != 1 << LBP_XX)
6096 unsigned int log2_attr;
/* Empty-bodied loop: count the right shifts needed to reduce attr to 1,
   i.e. the index of its single set bit.  */
6097 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
/* The table stores the bit index, not the bit mask.  */
6099 lbp_table_add (&t, i, log2_attr);
6103 lbp_table_finalize (&t);
/* Byte offsets into t.result.  The expressions describe a layout of
   five uint32_t words, then t.level1_size uint32_t words, then
   (t.level2_size << t.q) uint32_t words; they are presumably the values
   of level1_offset, level2_offset and level3_offset in that order --
   TODO confirm.  */
6106 5 * sizeof (uint32_t);
6108 5 * sizeof (uint32_t)
6109 + t.level1_size * sizeof (uint32_t);
6111 5 * sizeof (uint32_t)
6112 + t.level1_size * sizeof (uint32_t)
6113 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Export the first five uint32_t words of t.result as macros
   lbrkprop_header_0 .. lbrkprop_header_4.
   NOTE(review): both arguments are unsigned (i is unsigned int, the
   value is uint32_t) but are printed with %d -- confirm the values
   always fit in int or switch to an unsigned conversion.  */
6115 for (i = 0; i < 5; i++)
6116 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6117 ((uint32_t *) t.result)[i]);
6118 fprintf (stream1, "\n");
/* Declaration of the table type: array bounds are written as
   "size << shift" expressions and left for the C compiler to fold.  */
6119 fprintf (stream1, "typedef struct\n");
6120 fprintf (stream1, " {\n");
6121 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6122 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6123 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6124 fprintf (stream1, " }\n");
6125 fprintf (stream1, "lbrkprop_t;\n");
6126 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
/* Definition of the table.  Each level is printed eight entries per
   line; a level with more than eight entries starts and ends on its own
   line.  */
6128 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6129 fprintf (stream2, "{\n");
6130 fprintf (stream2, " {");
6131 if (t.level1_size > 8)
6132 fprintf (stream2, "\n ");
/* Level 1: one entry per uint32_t word stored at level1_offset.  */
6133 for (i = 0; i < t.level1_size; i++)
6136 if (i > 0 && (i % 8) == 0)
6137 fprintf (stream2, "\n ");
6138 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* A stored offset of 0 is printed as -1; any other offset is converted
   to an index into level2, counted in uint32_t units from the start of
   level 2.  The last entry gets no trailing comma.
   NOTE(review): because the other branch involves sizeof, the
   conditional expression has an unsigned type and the -1 is converted
   to it; printing "-1" relies on %zd reinterpreting that value as
   signed -- confirm this is intended.  */
6139 fprintf (stream2, " %5zd%s",
6140 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
6141 (i+1 < t.level1_size ? "," : ""));
6143 if (t.level1_size > 8)
6144 fprintf (stream2, "\n ");
6145 fprintf (stream2, " },\n");
6146 fprintf (stream2, " {");
6147 if (t.level2_size << t.q > 8)
6148 fprintf (stream2, "\n ");
/* Level 2: t.level2_size << t.q entries, read as uint32_t words at
   level2_offset.  */
6149 for (i = 0; i < t.level2_size << t.q; i++)
6152 if (i > 0 && (i % 8) == 0)
6153 fprintf (stream2, "\n ");
6154 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Same convention as for level 1, except that a non-zero offset becomes
   an index into level3, counted in uint8_t units from the start of
   level 3.  */
6155 fprintf (stream2, " %5zd%s",
6156 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
6157 (i+1 < t.level2_size << t.q ? "," : ""));
6159 if (t.level2_size << t.q > 8)
6160 fprintf (stream2, "\n ");
6161 fprintf (stream2, " },\n");
6162 fprintf (stream2, " {");
6163 if (t.level3_size << t.p > 8)
6164 fprintf (stream2, "\n ");
/* Level 3: t.level3_size << t.p entries, one byte each, read at
   level3_offset.  */
6165 for (i = 0; i < t.level3_size << t.p; i++)
6167 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6168 const char *value_string;
/* CASE(x) expands to a case label that sets value_string to the source
   spelling of x, so level 3 is printed with symbolic names rather than
   numbers.  */
6171 #define CASE(x) case x: value_string = #x; break;
6208 if (i > 0 && (i % 8) == 0)
6209 fprintf (stream2, "\n ");
6210 fprintf (stream2, " %s%s", value_string,
6211 (i+1 < t.level3_size << t.p ? "," : ""));
6213 if (t.level3_size << t.p > 8)
6214 fprintf (stream2, "\n ");
6215 fprintf (stream2, " }\n");
6216 fprintf (stream2, "};\n");
/* Create the two generated files FILENAME1 and FILENAME2, write the
   same banner and license header to each, and then let output_lbp fill
   them: FILENAME1 is passed as its first stream and FILENAME2 as its
   second.  The banner contains a "Unicode %s" placeholder, presumably
   filled from VERSION -- TODO confirm.  */
6220 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6222 const char *filenames[2];
6226 filenames[0] = filename1;
6227 filenames[1] = filename2;
/* Open both output files before anything is written.  */
6229 for (i = 0; i < 2; i++)
6231 streams[i] = fopen (filenames[i], "w");
6232 if (streams[i] == NULL)
6234 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Identical preamble for both files.  */
6239 for (i = 0; i < 2; i++)
6241 FILE *stream = streams[i];
/* NOTE(review): the banner names the generator "gen-lbrk", while the
   usage comment at the top of this file calls the program
   gen-uni-tables -- confirm which name the generated files should
   carry.  */
6243 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6244 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6245 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6247 fprintf (stream, "\n");
6249 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6250 still carries the GPL header), and it's gnulib-tool which replaces the
6251 GPL header with an LGPL header. */
6252 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6253 fprintf (stream, "\n");
6254 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6255 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6256 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6257 fprintf (stream, " (at your option) any later version.\n");
6258 fprintf (stream, "\n");
6259 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6260 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6261 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6262 fprintf (stream, " GNU General Public License for more details.\n");
6263 fprintf (stream, "\n");
6264 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6265 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6266 fprintf (stream, "\n");
/* The table contents proper.  */
6269 output_lbp (streams[0], streams[1]);
/* Detect write errors on either file, including those that only surface
   when the buffered data is flushed by fclose.  */
6271 for (i = 0; i < 2; i++)
6273 if (ferror (streams[i]) || fclose (streams[i]))
6275 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6281 /* ========================================================================= */
/* Program entry point.  The usage message lists nine operands: eight
   Unicode data files followed by a version string.  The data files are
   read first, then all output files are written.  */
6284 main (int argc, char * argv[])
6286 const char *unicodedata_filename;
6287 const char *proplist_filename;
6288 const char *derivedproplist_filename;
6289 const char *scripts_filename;
6290 const char *blocks_filename;
6291 const char *proplist30_filename;
6292 const char *eastasianwidth_filename;
6293 const char *linebreak_filename;
6294 const char *version;
6298 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt version\n",
/* Operands are positional, in the order given by the usage message.  */
6303 unicodedata_filename = argv[1];
6304 proplist_filename = argv[2];
6305 derivedproplist_filename = argv[3];
6306 scripts_filename = argv[4];
6307 blocks_filename = argv[5];
6308 proplist30_filename = argv[6];
6309 eastasianwidth_filename = argv[7];
6310 linebreak_filename = argv[8];
/* Input phase.  clear_properties runs before the two fill_properties
   calls, which read PropList.txt and DerivedCoreProperties.txt in
   turn.  */
6313 fill_attributes (unicodedata_filename);
6314 clear_properties ();
6315 fill_properties (proplist_filename);
6316 fill_properties (derivedproplist_filename);
6317 fill_properties30 (proplist30_filename);
6318 fill_scripts (scripts_filename);
6319 fill_blocks (blocks_filename);
6320 fill_width (eastasianwidth_filename);
6321 fill_org_lbp (linebreak_filename);
/* Output phase.  All file names are relative paths ("unictype/...",
   "../tests/unictype/...", "unilbrk/..."), so the program presumably
   has to be started from a directory in which they resolve -- TODO
   confirm.  */
6323 output_categories (version);
6324 output_category ("unictype/categ_of.h", version);
6325 output_combclass ("unictype/combining.h", version);
6326 output_bidi_category ("unictype/bidi_of.h", version);
6327 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
6328 output_decimal_digit ("unictype/decdigit.h", version);
6329 output_digit_test ("../tests/unictype/test-digit.h", version);
6330 output_digit ("unictype/digit.h", version);
6331 output_numeric_test ("../tests/unictype/test-numeric.h", version);
6332 output_numeric ("unictype/numeric.h", version);
6333 output_mirror ("unictype/mirror.h", version);
6334 output_properties (version);
6335 output_scripts (version);
6336 output_scripts_byname (version);
6337 output_blocks (version);
6338 output_ident_properties (version);
6339 output_old_ctype (version);
/* Line breaking output: two human readable dumps, then the pair of
   generated headers.  */
6341 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
6342 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
6343 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
6349 * For Emacs M-x compile
6351 * compile-command: "
6352 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
6354 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/UnicodeData.txt \
6355 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/PropList.txt \
6356 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/DerivedCoreProperties.txt \
6357 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Scripts.txt \
6358 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/Blocks.txt \
6359 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
6360 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/EastAsianWidth.txt \
6361 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/LineBreak.txt \