1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
 */
43 /* ========================================================================= */
45 /* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   A slot of unicode_attributes[] whose NAME is NULL holds no record.
   Optional string fields are "" when the field is empty in the file, and
   the three case mappings are NONE when absent.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   the upper/lower/title case mappings.  */
69 #define NONE (~(unsigned int)0)
71 /* The entire contents of the UnicodeData.txt file. */
72 struct unicode_attribute unicode_attributes [0x110000];
74 /* Stores in unicode_attributes[i] the values from the given fields. */
76 fill_attribute (unsigned int i,
77 const char *field1, const char *field2,
78 const char *field3, const char *field4,
79 const char *field5, const char *field6,
80 const char *field7, const char *field8,
81 const char *field9, const char *field10,
82 const char *field11, const char *field12,
83 const char *field13, const char *field14)
85 struct unicode_attribute * uni;
89 fprintf (stderr, "index too large\n");
92 if (strcmp (field2, "Cs") == 0)
93 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
95 uni = &unicode_attributes[i];
96 /* Copy the strings. */
97 uni->name = strdup (field1);
98 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
99 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
100 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
101 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
102 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
103 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
104 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
105 uni->mirrored = (field9[0] == 'Y');
106 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
107 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
108 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
109 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
110 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the defining line is not visible in this excerpt; 120 is the
   value used by upstream gnulib -- confirm.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file or
   read error before DELIM was seen).  BUFFER is NUL-terminated in both
   cases, so a partially read field can be inspected safely.
   Exits when the field does not fit in FIELDLEN.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  *buffer = '\0';
  return (c == EOF ? 0 : 1);
}
148 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
151 fill_attributes (const char *unicodedata_filename)
155 char field0[FIELDLEN];
156 char field1[FIELDLEN];
157 char field2[FIELDLEN];
158 char field3[FIELDLEN];
159 char field4[FIELDLEN];
160 char field5[FIELDLEN];
161 char field6[FIELDLEN];
162 char field7[FIELDLEN];
163 char field8[FIELDLEN];
164 char field9[FIELDLEN];
165 char field10[FIELDLEN];
166 char field11[FIELDLEN];
167 char field12[FIELDLEN];
168 char field13[FIELDLEN];
169 char field14[FIELDLEN];
172 for (i = 0; i < 0x110000; i++)
173 unicode_attributes[i].name = NULL;
175 stream = fopen (unicodedata_filename, "r");
178 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
187 n = getfield (stream, field0, ';');
188 n += getfield (stream, field1, ';');
189 n += getfield (stream, field2, ';');
190 n += getfield (stream, field3, ';');
191 n += getfield (stream, field4, ';');
192 n += getfield (stream, field5, ';');
193 n += getfield (stream, field6, ';');
194 n += getfield (stream, field7, ';');
195 n += getfield (stream, field8, ';');
196 n += getfield (stream, field9, ';');
197 n += getfield (stream, field10, ';');
198 n += getfield (stream, field11, ';');
199 n += getfield (stream, field12, ';');
200 n += getfield (stream, field13, ';');
201 n += getfield (stream, field14, '\n');
206 fprintf (stderr, "short line in '%s':%d\n",
207 unicodedata_filename, lineno);
210 i = strtoul (field0, NULL, 16);
212 && strlen (field1) >= 9
213 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
215 /* Deal with a range. */
217 n = getfield (stream, field0, ';');
218 n += getfield (stream, field1, ';');
219 n += getfield (stream, field2, ';');
220 n += getfield (stream, field3, ';');
221 n += getfield (stream, field4, ';');
222 n += getfield (stream, field5, ';');
223 n += getfield (stream, field6, ';');
224 n += getfield (stream, field7, ';');
225 n += getfield (stream, field8, ';');
226 n += getfield (stream, field9, ';');
227 n += getfield (stream, field10, ';');
228 n += getfield (stream, field11, ';');
229 n += getfield (stream, field12, ';');
230 n += getfield (stream, field13, ';');
231 n += getfield (stream, field14, '\n');
234 fprintf (stderr, "missing end range in '%s':%d\n",
235 unicodedata_filename, lineno);
238 if (!(field1[0] == '<'
239 && strlen (field1) >= 8
240 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
242 fprintf (stderr, "missing end range in '%s':%d\n",
243 unicodedata_filename, lineno);
246 field1[strlen (field1) - 7] = '\0';
247 j = strtoul (field0, NULL, 16);
249 fill_attribute (i, field1+1, field2, field3, field4, field5,
250 field6, field7, field8, field9, field10,
251 field11, field12, field13, field14);
255 /* Single character line */
256 fill_attribute (i, field1, field2, field3, field4, field5,
257 field6, field7, field8, field9, field10,
258 field11, field12, field13, field14);
261 if (ferror (stream) || fclose (stream))
263 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
268 /* ========================================================================= */
270 /* General category. */
/* See Unicode 3.0 book, section 4.5.  */
275 is_category_L (unsigned int ch)
277 return (unicode_attributes[ch].name != NULL
278 && unicode_attributes[ch].category[0] == 'L');
282 is_category_Lu (unsigned int ch)
284 return (unicode_attributes[ch].name != NULL
285 && unicode_attributes[ch].category[0] == 'L'
286 && unicode_attributes[ch].category[1] == 'u');
290 is_category_Ll (unsigned int ch)
292 return (unicode_attributes[ch].name != NULL
293 && unicode_attributes[ch].category[0] == 'L'
294 && unicode_attributes[ch].category[1] == 'l');
298 is_category_Lt (unsigned int ch)
300 return (unicode_attributes[ch].name != NULL
301 && unicode_attributes[ch].category[0] == 'L'
302 && unicode_attributes[ch].category[1] == 't');
306 is_category_Lm (unsigned int ch)
308 return (unicode_attributes[ch].name != NULL
309 && unicode_attributes[ch].category[0] == 'L'
310 && unicode_attributes[ch].category[1] == 'm');
314 is_category_Lo (unsigned int ch)
316 return (unicode_attributes[ch].name != NULL
317 && unicode_attributes[ch].category[0] == 'L'
318 && unicode_attributes[ch].category[1] == 'o');
322 is_category_M (unsigned int ch)
324 return (unicode_attributes[ch].name != NULL
325 && unicode_attributes[ch].category[0] == 'M');
329 is_category_Mn (unsigned int ch)
331 return (unicode_attributes[ch].name != NULL
332 && unicode_attributes[ch].category[0] == 'M'
333 && unicode_attributes[ch].category[1] == 'n');
337 is_category_Mc (unsigned int ch)
339 return (unicode_attributes[ch].name != NULL
340 && unicode_attributes[ch].category[0] == 'M'
341 && unicode_attributes[ch].category[1] == 'c');
345 is_category_Me (unsigned int ch)
347 return (unicode_attributes[ch].name != NULL
348 && unicode_attributes[ch].category[0] == 'M'
349 && unicode_attributes[ch].category[1] == 'e');
353 is_category_N (unsigned int ch)
355 return (unicode_attributes[ch].name != NULL
356 && unicode_attributes[ch].category[0] == 'N');
360 is_category_Nd (unsigned int ch)
362 return (unicode_attributes[ch].name != NULL
363 && unicode_attributes[ch].category[0] == 'N'
364 && unicode_attributes[ch].category[1] == 'd');
368 is_category_Nl (unsigned int ch)
370 return (unicode_attributes[ch].name != NULL
371 && unicode_attributes[ch].category[0] == 'N'
372 && unicode_attributes[ch].category[1] == 'l');
376 is_category_No (unsigned int ch)
378 return (unicode_attributes[ch].name != NULL
379 && unicode_attributes[ch].category[0] == 'N'
380 && unicode_attributes[ch].category[1] == 'o');
384 is_category_P (unsigned int ch)
386 return (unicode_attributes[ch].name != NULL
387 && unicode_attributes[ch].category[0] == 'P');
391 is_category_Pc (unsigned int ch)
393 return (unicode_attributes[ch].name != NULL
394 && unicode_attributes[ch].category[0] == 'P'
395 && unicode_attributes[ch].category[1] == 'c');
399 is_category_Pd (unsigned int ch)
401 return (unicode_attributes[ch].name != NULL
402 && unicode_attributes[ch].category[0] == 'P'
403 && unicode_attributes[ch].category[1] == 'd');
407 is_category_Ps (unsigned int ch)
409 return (unicode_attributes[ch].name != NULL
410 && unicode_attributes[ch].category[0] == 'P'
411 && unicode_attributes[ch].category[1] == 's');
415 is_category_Pe (unsigned int ch)
417 return (unicode_attributes[ch].name != NULL
418 && unicode_attributes[ch].category[0] == 'P'
419 && unicode_attributes[ch].category[1] == 'e');
423 is_category_Pi (unsigned int ch)
425 return (unicode_attributes[ch].name != NULL
426 && unicode_attributes[ch].category[0] == 'P'
427 && unicode_attributes[ch].category[1] == 'i');
431 is_category_Pf (unsigned int ch)
433 return (unicode_attributes[ch].name != NULL
434 && unicode_attributes[ch].category[0] == 'P'
435 && unicode_attributes[ch].category[1] == 'f');
439 is_category_Po (unsigned int ch)
441 return (unicode_attributes[ch].name != NULL
442 && unicode_attributes[ch].category[0] == 'P'
443 && unicode_attributes[ch].category[1] == 'o');
447 is_category_S (unsigned int ch)
449 return (unicode_attributes[ch].name != NULL
450 && unicode_attributes[ch].category[0] == 'S');
454 is_category_Sm (unsigned int ch)
456 return (unicode_attributes[ch].name != NULL
457 && unicode_attributes[ch].category[0] == 'S'
458 && unicode_attributes[ch].category[1] == 'm');
462 is_category_Sc (unsigned int ch)
464 return (unicode_attributes[ch].name != NULL
465 && unicode_attributes[ch].category[0] == 'S'
466 && unicode_attributes[ch].category[1] == 'c');
470 is_category_Sk (unsigned int ch)
472 return (unicode_attributes[ch].name != NULL
473 && unicode_attributes[ch].category[0] == 'S'
474 && unicode_attributes[ch].category[1] == 'k');
478 is_category_So (unsigned int ch)
480 return (unicode_attributes[ch].name != NULL
481 && unicode_attributes[ch].category[0] == 'S'
482 && unicode_attributes[ch].category[1] == 'o');
486 is_category_Z (unsigned int ch)
488 return (unicode_attributes[ch].name != NULL
489 && unicode_attributes[ch].category[0] == 'Z');
493 is_category_Zs (unsigned int ch)
495 return (unicode_attributes[ch].name != NULL
496 && unicode_attributes[ch].category[0] == 'Z'
497 && unicode_attributes[ch].category[1] == 's');
501 is_category_Zl (unsigned int ch)
503 return (unicode_attributes[ch].name != NULL
504 && unicode_attributes[ch].category[0] == 'Z'
505 && unicode_attributes[ch].category[1] == 'l');
509 is_category_Zp (unsigned int ch)
511 return (unicode_attributes[ch].name != NULL
512 && unicode_attributes[ch].category[0] == 'Z'
513 && unicode_attributes[ch].category[1] == 'p');
517 is_category_C (unsigned int ch)
519 return (unicode_attributes[ch].name == NULL
520 || unicode_attributes[ch].category[0] == 'C');
524 is_category_Cc (unsigned int ch)
526 return (unicode_attributes[ch].name != NULL
527 && unicode_attributes[ch].category[0] == 'C'
528 && unicode_attributes[ch].category[1] == 'c');
532 is_category_Cf (unsigned int ch)
534 return (unicode_attributes[ch].name != NULL
535 && unicode_attributes[ch].category[0] == 'C'
536 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the surrogate range 0xD800..0xDFFF.  Unlike the
   other category tests this one is purely numeric: it does not consult
   unicode_attributes[].  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
546 is_category_Co (unsigned int ch)
548 return (unicode_attributes[ch].name != NULL
549 && unicode_attributes[ch].category[0] == 'C'
550 && unicode_attributes[ch].category[1] == 'o');
554 is_category_Cn (unsigned int ch)
556 return (unicode_attributes[ch].name == NULL
557 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   (below 0x110000) for which PREDICATE returns true: "0xAAAA..0xBBBB" for a
   run of several code points, "0xAAAA" for a single one.
   Exits with a diagnostic if the file cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

#if 0 /* This yields huge text output.  */
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        fprintf (stream, "0x%04X\n", ch);
      }
#else
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate keeps holding.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }
#endif

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a generated C source: a fixed preamble that includes
   "test-predicate-part1.h", then one "{ first, last }" pair per maximal run
   of code points for which PREDICATE returns true, then a definition of
   PREDICATE(c) as EXPRESSION followed by "test-predicate-part2.h".
   Exits with a diagnostic if the file cannot be opened or written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  /* Constant text at the top of every generated test file.  It contains no
     conversion specifications, so it is written verbatim in one call.  */
  static const char preamble[] =
    "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"
    "/* Test the Unicode character type functions.\n"
    " Copyright (C) 2007 Free Software Foundation, Inc.\n"
    "\n"
    " This program is free software: you can redistribute it and/or modify\n"
    " it under the terms of the GNU General Public License as published by\n"
    " the Free Software Foundation; either version 3 of the License, or\n"
    " (at your option) any later version.\n"
    "\n"
    " This program is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    " GNU General Public License for more details.\n"
    "\n"
    " You should have received a copy of the GNU General Public License\n"
    " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"
    "\n"
    "#include \"test-predicate-part1.h\"\n"
    "\n";
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs (preamble, stream);

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate keeps holding.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        /* Separate entries with ",\n"; no separator after the last one.  */
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
668 /* Construction of sparse 3-level tables. */
669 #define TABLE predicate_table
670 #define xmalloc malloc
671 #define xrealloc realloc
672 #include "3levelbit.h"
674 /* Output a boolean property in a three-level bitmap.
   Writes to FILENAME a C fragment that defines "header_N" macros and a
   static const struct object called NAME with header, level1, level2 and
   level3 arrays, taken from the predicate_table T built below.
   COMMENT is interpolated into the second generated comment line.
   NOTE(review): several short lines of this function (local declarations,
   the t.p / t.q setup, the conditions that select between the -1 and the
   offset output, braces) are not visible in this excerpt.  */
676 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
680 struct predicate_table t;
681 unsigned int level1_offset, level2_offset, level3_offset;
683 stream = fopen (filename, "w");
686 fprintf (stderr, "cannot open '%s' for writing\n", filename);
690 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
691 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
/* NOTE(review): the generated header names "gen-ctype.c" as its generator;
   confirm that this is still the intended name.  The argument of this call
   (presumably VERSION) is on a line not visible here.  */
692 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
697 predicate_table_init (&t);
/* Walk every code point; predicate_table_add() records CH in T.  */
699 for (ch = 0; ch < 0x110000; ch++)
701 predicate_table_add (&t, ch);
703 predicate_table_finalize (&t);
705 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: past the 5
   uint32_t header words, then past level1_size uint32_t entries, then past
   (level2_size << q) uint32_t entries.  Presumably they are assigned to
   level1_offset, level2_offset and level3_offset respectively (the
   assignment targets are on lines not visible here).  */
707 5 * sizeof (uint32_t);
709 5 * sizeof (uint32_t)
710 + t.level1_size * sizeof (uint32_t);
712 5 * sizeof (uint32_t)
713 + t.level1_size * sizeof (uint32_t)
714 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit header words as macros.
   NOTE(review): a uint32_t is printed with %d here.  */
716 for (i = 0; i < 5; i++)
718 fprintf (stream, "#define header_%d %d\n", i,
719 ((uint32_t *) t.result)[i]);
721 fprintf (stream, "static const\n");
722 fprintf (stream, "struct\n");
723 fprintf (stream, " {\n");
724 fprintf (stream, " int header[1];\n");
725 fprintf (stream, " int level1[%zu];\n", t.level1_size);
726 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
727 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
728 fprintf (stream, " }\n");
729 fprintf (stream, "%s =\n", name);
730 fprintf (stream, "{\n");
/* The single header[] element is word 1 of t.result.  */
731 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
/* Level 1: each entry is printed either as -1 or as an index expression
   derived from the stored byte offset.  */
732 fprintf (stream, " {");
733 if (t.level1_size > 1)
734 fprintf (stream, "\n ");
735 for (i = 0; i < t.level1_size; i++)
/* (i % 1) == 0 is always true, so every entry after the first starts on a
   new line.  */
738 if (i > 0 && (i % 1) == 0)
739 fprintf (stream, "\n ");
740 offset = ((uint32_t *) (t.result + level1_offset))[i];
742 fprintf (stream, " %5d", -1);
744 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
745 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
746 if (i+1 < t.level1_size)
747 fprintf (stream, ",");
749 if (t.level1_size > 1)
750 fprintf (stream, "\n ");
751 fprintf (stream, " },\n");
/* Level 2: same scheme, with offsets taken relative to level3_offset.  */
752 fprintf (stream, " {");
753 if (t.level2_size << t.q > 1)
754 fprintf (stream, "\n ");
755 for (i = 0; i < t.level2_size << t.q; i++)
758 if (i > 0 && (i % 1) == 0)
759 fprintf (stream, "\n ");
760 offset = ((uint32_t *) (t.result + level2_offset))[i];
762 fprintf (stream, " %5d", -1);
764 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
765 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
766 if (i+1 < t.level2_size << t.q)
767 fprintf (stream, ",");
769 if (t.level2_size << t.q > 1)
770 fprintf (stream, "\n ");
771 fprintf (stream, " },\n");
/* Level 3: the raw 32-bit words, four per output line.  */
772 fprintf (stream, " {");
773 if (t.level3_size << t.p > 4)
774 fprintf (stream, "\n ");
775 for (i = 0; i < t.level3_size << t.p; i++)
777 if (i > 0 && (i % 4) == 0)
778 fprintf (stream, "\n ");
779 fprintf (stream, " 0x%08X",
780 ((uint32_t *) (t.result + level3_offset))[i]);
781 if (i+1 < t.level3_size << t.p)
782 fprintf (stream, ",");
784 if (t.level3_size << t.p > 4)
785 fprintf (stream, "\n ");
786 fprintf (stream, " }\n");
787 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose() count as a write
   error.  */
789 if (ferror (stream) || fclose (stream))
791 fprintf (stderr, "error writing to '%s'\n", filename);
796 /* Output all categories.
   VERSION is passed through to output_predicate() for each category.  */
798 output_categories (const char *version)
/* CATEGORY(C) produces three files for the category C: a human readable
   listing "unictype/categ_C.txt", a unit test
   "../tests/unictype/test-categ_C.c" that checks
   uc_is_general_category (c, UC_CATEGORY_C), and the lookup table
   "unictype/categ_C.h" defining u_categ_C.  All three use is_category_C as
   the predicate.
   NOTE(review): the list of CATEGORY invocations is not visible in this
   excerpt.  */
800 #define CATEGORY(C) \
801 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
802 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
803 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.  Every two-letter category owns one
   bit; every one-letter category is the union of the bits of its
   subcategories.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};

/* Returns the UC_CATEGORY_MASK_* value for CATEGORY_NAME, which must be a
   one- or two-letter general category name such as "L" or "Lu".
   NOTE(review): the statement that handles an invalid name is not visible in
   this excerpt; abort () matches upstream gnulib -- confirm.  */
static int
general_category_byname (const char *category_name)
{
  /* Only names of length 1 or 2 can match; the length test also guarantees
     that category_name[1] may be read.  */
  if (category_name[0] != '\0'
      && (category_name[1] == '\0' || category_name[2] == '\0'))
    switch (category_name[0])
      {
      case 'L':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_L;
          case 'u': return UC_CATEGORY_MASK_Lu;
          case 'l': return UC_CATEGORY_MASK_Ll;
          case 't': return UC_CATEGORY_MASK_Lt;
          case 'm': return UC_CATEGORY_MASK_Lm;
          case 'o': return UC_CATEGORY_MASK_Lo;
          }
        break;
      case 'M':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_M;
          case 'n': return UC_CATEGORY_MASK_Mn;
          case 'c': return UC_CATEGORY_MASK_Mc;
          case 'e': return UC_CATEGORY_MASK_Me;
          }
        break;
      case 'N':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_N;
          case 'd': return UC_CATEGORY_MASK_Nd;
          case 'l': return UC_CATEGORY_MASK_Nl;
          case 'o': return UC_CATEGORY_MASK_No;
          }
        break;
      case 'P':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_P;
          case 'c': return UC_CATEGORY_MASK_Pc;
          case 'd': return UC_CATEGORY_MASK_Pd;
          case 's': return UC_CATEGORY_MASK_Ps;
          case 'e': return UC_CATEGORY_MASK_Pe;
          case 'i': return UC_CATEGORY_MASK_Pi;
          case 'f': return UC_CATEGORY_MASK_Pf;
          case 'o': return UC_CATEGORY_MASK_Po;
          }
        break;
      case 'S':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_S;
          case 'm': return UC_CATEGORY_MASK_Sm;
          case 'c': return UC_CATEGORY_MASK_Sc;
          case 'k': return UC_CATEGORY_MASK_Sk;
          case 'o': return UC_CATEGORY_MASK_So;
          }
        break;
      case 'Z':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_Z;
          case 's': return UC_CATEGORY_MASK_Zs;
          case 'l': return UC_CATEGORY_MASK_Zl;
          case 'p': return UC_CATEGORY_MASK_Zp;
          }
        break;
      case 'C':
        switch (category_name[1])
          {
          case '\0': return UC_CATEGORY_MASK_C;
          case 'c': return UC_CATEGORY_MASK_Cc;
          case 'f': return UC_CATEGORY_MASK_Cf;
          case 's': return UC_CATEGORY_MASK_Cs;
          case 'o': return UC_CATEGORY_MASK_Co;
          case 'n': return UC_CATEGORY_MASK_Cn;
          }
        break;
      }
  /* Invalid category name.  */
  abort ();
}
969 /* Construction of sparse 3-level tables. */
970 #define TABLE category_table
971 #define ELEMENT uint8_t
972 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
973 #define xmalloc malloc
974 #define xrealloc realloc
977 /* Output the per-character category table.
   Writes to FILENAME a C fragment that defines "category_header_N" macros
   and a static const struct object u_category with level1, level2 and a
   bit-packed level3 array.  Each character's category is stored as the bit
   index (log2) of its UC_CATEGORY_MASK_* value.
   NOTE(review): several short lines of this function (local declarations,
   the t.p / t.q setup, some conditions and branch bodies, braces) are not
   visible in this excerpt.  */
979 output_category (const char *filename, const char *version)
983 struct category_table t;
984 unsigned int level1_offset, level2_offset, level3_offset;
985 uint16_t *level3_packed;
987 stream = fopen (filename, "w");
990 fprintf (stderr, "cannot open '%s' for writing\n", filename);
994 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
995 fprintf (stream, "/* Categories of Unicode characters. */\n");
/* NOTE(review): the generated header names "gen-ctype.c" as its generator;
   confirm that this is still the intended name.  */
996 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1001 category_table_init (&t);
1003 for (ch = 0; ch < 0x110000; ch++)
1006 unsigned int log2_value;
/* Surrogates have no record in unicode_attributes[], so they are recognized
   by code point range; all other characters go through their category
   string.  */
1008 if (is_category_Cs (ch))
1009 value = UC_CATEGORY_MASK_Cs;
1010 else if (unicode_attributes[ch].name != NULL)
1011 value = general_category_byname (unicode_attributes[ch].category);
1015 /* Now value should contain exactly one bit. */
1016 if (value == 0 || ((value & (value - 1)) != 0))
/* Count how many times VALUE can be halved: the index of its single bit.  */
1019 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1021 category_table_add (&t, ch, log2_value);
1024 category_table_finalize (&t);
1026 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: past the 5
   uint32_t header words, then past level1_size uint32_t entries, then past
   (level2_size << q) uint32_t entries.  Presumably they are assigned to
   level1_offset, level2_offset and level3_offset respectively (the
   assignment targets are on lines not visible here).  */
1028 5 * sizeof (uint32_t);
1030 5 * sizeof (uint32_t)
1031 + t.level1_size * sizeof (uint32_t);
1033 5 * sizeof (uint32_t)
1034 + t.level1_size * sizeof (uint32_t)
1035 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): a uint32_t is printed with %d here.  */
1037 for (i = 0; i < 5; i++)
1038 fprintf (stream, "#define category_header_%d %d\n", i,
1039 ((uint32_t *) t.result)[i]);
1040 fprintf (stream, "static const\n");
1041 fprintf (stream, "struct\n");
1042 fprintf (stream, " {\n");
1043 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1044 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* level3 holds 5 bits per character packed into 16-bit units, plus one
   spare unit (see the packing loop below).  */
1045 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1046 (1 << t.p) * 5 / 16);
1047 fprintf (stream, " }\n");
1048 fprintf (stream, "u_category =\n");
1049 fprintf (stream, "{\n");
/* Level 1, eight entries per output line: each entry is printed either as
   -1 or as an index relative to level2_offset.  */
1050 fprintf (stream, " {");
1051 if (t.level1_size > 8)
1052 fprintf (stream, "\n ");
1053 for (i = 0; i < t.level1_size; i++)
1056 if (i > 0 && (i % 8) == 0)
1057 fprintf (stream, "\n ");
1058 offset = ((uint32_t *) (t.result + level1_offset))[i];
1060 fprintf (stream, " %5d", -1);
1062 fprintf (stream, " %5zu",
1063 (offset - level2_offset) / sizeof (uint32_t));
1064 if (i+1 < t.level1_size)
1065 fprintf (stream, ",");
1067 if (t.level1_size > 8)
1068 fprintf (stream, "\n ");
1069 fprintf (stream, " },\n");
/* Level 2: same scheme; offsets are relative to level3_offset and counted
   in uint8_t elements.  */
1070 fprintf (stream, " {");
1071 if (t.level2_size << t.q > 8)
1072 fprintf (stream, "\n ");
1073 for (i = 0; i < t.level2_size << t.q; i++)
1076 if (i > 0 && (i % 8) == 0)
1077 fprintf (stream, "\n ");
1078 offset = ((uint32_t *) (t.result + level2_offset))[i];
1080 fprintf (stream, " %5d", -1);
1082 fprintf (stream, " %5zu",
1083 (offset - level3_offset) / sizeof (uint8_t));
1084 if (i+1 < t.level2_size << t.q)
1085 fprintf (stream, ",");
1087 if (t.level2_size << t.q > 8)
1088 fprintf (stream, "\n ");
1089 fprintf (stream, " },\n");
1090 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1091 not 32-bit units, in order to make the lookup function easier. */
/* NOTE(review): the calloc() result is not checked in the visible lines
   before level3_packed[] is indexed below.  The extra "+ 1" unit exists
   because the loop always touches units j and j+1.  */
1094 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1095 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits k..k+4 of the 32-bit window formed by units j and
   j+1; OR it in and write both units back.  */
1097 unsigned int j = (i * 5) / 16;
1098 unsigned int k = (i * 5) % 16;
1099 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1100 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1101 level3_packed[j] = value & 0xffff;
1102 level3_packed[j+1] = value >> 16;
/* Emit the packed units, eight per output line.  */
1104 fprintf (stream, " {");
1105 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1106 fprintf (stream, "\n ");
1107 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1109 if (i > 0 && (i % 8) == 0)
1110 fprintf (stream, "\n ");
1111 fprintf (stream, " 0x%04x", level3_packed[i]);
1112 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1113 fprintf (stream, ",");
1115 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1116 fprintf (stream, "\n ");
1117 fprintf (stream, " }\n");
1118 free (level3_packed);
1119 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose() count as a write
   error.  */
1121 if (ferror (stream) || fclose (stream))
1123 fprintf (stderr, "error writing to '%s'\n", filename);
1128 /* ========================================================================= */
1130 /* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2.  */
1134 /* Construction of sparse 3-level tables. */
1135 #define TABLE combclass_table
1136 #define ELEMENT uint8_t
1138 #define xmalloc malloc
1139 #define xrealloc realloc
1142 /* Output the per-character combining class table.
   Writes to FILENAME a C fragment that defines "combclass_header_N" macros
   and a static const struct object u_combclass with level1, level2 and
   level3 arrays; level3 holds one unsigned char per character.
   NOTE(review): several short lines of this function (local declarations,
   the t.p / t.q setup, some conditions and branch bodies, braces) are not
   visible in this excerpt.  */
1144 output_combclass (const char *filename, const char *version)
1148 struct combclass_table t;
1149 unsigned int level1_offset, level2_offset, level3_offset;
1151 stream = fopen (filename, "w");
1154 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1158 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1159 fprintf (stream, "/* Combining class of Unicode characters. */\n");
/* NOTE(review): the generated header names "gen-ctype.c" as its generator;
   confirm that this is still the intended name.  */
1160 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1165 combclass_table_init (&t);
/* Only characters that have a record contribute an entry.  */
1167 for (ch = 0; ch < 0x110000; ch++)
1168 if (unicode_attributes[ch].name != NULL)
/* NOTE(review): atoi() cannot report a malformed field; only the range
   0..255 is checked.  What happens when the check fails is on a line not
   visible here.  */
1170 int value = atoi (unicode_attributes[ch].combining);
1171 if (!(value >= 0 && value <= 255))
1173 combclass_table_add (&t, ch, value);
1176 combclass_table_finalize (&t);
1178 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: past the 5
   uint32_t header words, then past level1_size uint32_t entries, then past
   (level2_size << q) uint32_t entries.  Presumably they are assigned to
   level1_offset, level2_offset and level3_offset respectively (the
   assignment targets are on lines not visible here).  */
1180 5 * sizeof (uint32_t);
1182 5 * sizeof (uint32_t)
1183 + t.level1_size * sizeof (uint32_t);
1185 5 * sizeof (uint32_t)
1186 + t.level1_size * sizeof (uint32_t)
1187 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): a uint32_t is printed with %d here.  */
1189 for (i = 0; i < 5; i++)
1190 fprintf (stream, "#define combclass_header_%d %d\n", i,
1191 ((uint32_t *) t.result)[i]);
1192 fprintf (stream, "static const\n");
1193 fprintf (stream, "struct\n");
1194 fprintf (stream, " {\n");
1195 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1196 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1197 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1198 fprintf (stream, " }\n");
1199 fprintf (stream, "u_combclass =\n");
1200 fprintf (stream, "{\n");
/* Level 1, eight entries per output line: each entry is printed either as
   -1 or as an index relative to level2_offset.  */
1201 fprintf (stream, " {");
1202 if (t.level1_size > 8)
1203 fprintf (stream, "\n ");
1204 for (i = 0; i < t.level1_size; i++)
1207 if (i > 0 && (i % 8) == 0)
1208 fprintf (stream, "\n ");
1209 offset = ((uint32_t *) (t.result + level1_offset))[i];
1211 fprintf (stream, " %5d", -1);
1213 fprintf (stream, " %5zu",
1214 (offset - level2_offset) / sizeof (uint32_t));
1215 if (i+1 < t.level1_size)
1216 fprintf (stream, ",");
1218 if (t.level1_size > 8)
1219 fprintf (stream, "\n ");
1220 fprintf (stream, " },\n");
/* Level 2: same scheme; offsets are relative to level3_offset and counted
   in uint8_t elements.  */
1221 fprintf (stream, " {");
1222 if (t.level2_size << t.q > 8)
1223 fprintf (stream, "\n ");
1224 for (i = 0; i < t.level2_size << t.q; i++)
1227 if (i > 0 && (i % 8) == 0)
1228 fprintf (stream, "\n ");
1229 offset = ((uint32_t *) (t.result + level2_offset))[i];
1231 fprintf (stream, " %5d", -1);
1233 fprintf (stream, " %5zu",
1234 (offset - level3_offset) / sizeof (uint8_t));
1235 if (i+1 < t.level2_size << t.q)
1236 fprintf (stream, ",");
1238 if (t.level2_size << t.q > 8)
1239 fprintf (stream, "\n ");
1240 fprintf (stream, " },\n");
/* Level 3: the combining class bytes themselves, eight per output line.  */
1241 fprintf (stream, " {");
1242 if (t.level3_size << t.p > 8)
1243 fprintf (stream, "\n ");
1244 for (i = 0; i < t.level3_size << t.p; i++)
1246 if (i > 0 && (i % 8) == 0)
1247 fprintf (stream, "\n ");
1248 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1249 if (i+1 < t.level3_size << t.p)
1250 fprintf (stream, ",");
1252 if (t.level3_size << t.p > 8)
1253 fprintf (stream, "\n ");
1254 fprintf (stream, " }\n");
1255 fprintf (stream, "};\n");
/* Both a pending stream error and a failing fclose() count as a write
   error.  */
1257 if (ferror (stream) || fclose (stream))
1259 fprintf (stderr, "error writing to '%s'\n", filename);
1264 /* ========================================================================= */
1266 /* Bidirectional category. */
1267 /* See Unicode 3.0 book, section 4.3. */
1272 UC_BIDI_L, /* Left-to-Right */
1273 UC_BIDI_LRE, /* Left-to-Right Embedding */
1274 UC_BIDI_LRO, /* Left-to-Right Override */
1275 UC_BIDI_R, /* Right-to-Left */
1276 UC_BIDI_AL, /* Right-to-Left Arabic */
1277 UC_BIDI_RLE, /* Right-to-Left Embedding */
1278 UC_BIDI_RLO, /* Right-to-Left Override */
1279 UC_BIDI_PDF, /* Pop Directional Format */
1280 UC_BIDI_EN, /* European Number */
1281 UC_BIDI_ES, /* European Number Separator */
1282 UC_BIDI_ET, /* European Number Terminator */
1283 UC_BIDI_AN, /* Arabic Number */
1284 UC_BIDI_CS, /* Common Number Separator */
1285 UC_BIDI_NSM, /* Non-Spacing Mark */
1286 UC_BIDI_BN, /* Boundary Neutral */
1287 UC_BIDI_B, /* Paragraph Separator */
1288 UC_BIDI_S, /* Segment Separator */
1289 UC_BIDI_WS, /* Whitespace */
1290 UC_BIDI_ON /* Other Neutral */
/* The list above holds 19 bidi category codes, which is consistent with the
   5 bits per entry used when output_bidi_category packs its level3 array.
   UC_BIDI_L also serves as DEFAULT for the bidi category table.
   NOTE(review): the opening of the enclosing declaration is not visible in
   this excerpt; numbering presumably starts at 0 with UC_BIDI_L - confirm.  */
/* Looks up a bidi category from its abbreviated name CATEGORY_NAME.
   The name is decoded one character at a time through nested switch
   statements on category_name[0], [1] and [2].  Each visible candidate is
   accepted only if the string terminates (NUL) directly after the matched
   characters, so a longer string sharing a prefix does not match.
   NOTE(review): the case labels, the values returned for each match, and
   the action taken for an invalid name are not visible in this excerpt.  */
1294 bidi_category_byname (const char *category_name)
1296 switch (category_name[0])
1299 switch (category_name[1])
1302 if (category_name[2] == '\0')
1306 if (category_name[2] == '\0')
1312 switch (category_name[1])
1317 if (category_name[2] == '\0')
1323 switch (category_name[1])
1326 if (category_name[2] == '\0')
1332 switch (category_name[1])
1335 if (category_name[2] == '\0')
1339 if (category_name[2] == '\0')
1343 if (category_name[2] == '\0')
1349 switch (category_name[1])
1354 switch (category_name[2])
1357 if (category_name[3] == '\0')
1361 if (category_name[3] == '\0')
1369 switch (category_name[1])
1372 switch (category_name[2])
1375 if (category_name[3] == '\0')
1383 switch (category_name[1])
1386 if (category_name[2] == '\0')
1392 switch (category_name[1])
1395 switch (category_name[2])
1398 if (category_name[3] == '\0')
1406 switch (category_name[1])
1411 switch (category_name[2])
1414 if (category_name[3] == '\0')
1418 if (category_name[3] == '\0')
/* Single-character names: only the terminator after the first character
   needs checking.  */
1426 if (category_name[1] == '\0')
1430 switch (category_name[1])
1433 if (category_name[2] == '\0')
/* Reached when none of the switch branches above accepted the name.  */
1439 /* Invalid bidi category name. */
/* Returns the bidi category of the code point CH.
   CH indexes unicode_attributes directly, so it must be a valid index of
   that array (callers in this file iterate ch below 0x110000).
   NOTE(review): the values returned for the range groups below are not
   visible in this excerpt.  */
1444 get_bidi_category (unsigned int ch)
/* Assigned characters (non-NULL name) take the category spelled out in
   their bidi field.  */
1446 if (unicode_attributes[ch].name != NULL)
1447 return bidi_category_byname (unicode_attributes[ch].bidi);
1450 /* The bidi category of unassigned characters depends on the range.
1451 See UTR #9 and DerivedBidiClass.txt. */
/* First group of code point ranges.  */
1452 if ((ch >= 0x0590 && ch <= 0x05FF)
1453 || (ch >= 0x07FB && ch <= 0x08FF)
1454 || (ch >= 0xFB37 && ch <= 0xFB45)
1455 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of code point ranges.  */
1457 else if ((ch >= 0x0600 && ch <= 0x07BF)
1458 || (ch >= 0x2064 && ch <= 0x2069)
1459 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1460 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group.  The two (ch & 0xFFFF) tests match the last two code points
   of every block of 0x10000 code points.  */
1462 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1463 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1464 || (ch & 0xFFFF) == 0xFFFE
1465 || (ch & 0xFFFF) == 0xFFFF
1466 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1473 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation used by output_bidi_category:
   TABLE names it bidi_category_table (matching struct bidi_category_table
   and the bidi_category_table_init, _add and _finalize calls), ELEMENT makes
   each stored value a uint8_t, and DEFAULT is UC_BIDI_L.
   xmalloc and xrealloc are mapped onto plain malloc and realloc, so these
   names do not add any out-of-memory checking.
   NOTE(review): the include that consumes these macros is not visible in
   this excerpt.  */
1474 #define TABLE bidi_category_table
1475 #define ELEMENT uint8_t
1476 #define DEFAULT UC_BIDI_L
1477 #define xmalloc malloc
1478 #define xrealloc realloc
1481 /* Output the per-character bidi category table. */
/* Writes to FILENAME a C fragment that defines five bidi_category_header_N
   macros and a static const struct u_bidi_category holding a 3-level lookup
   table of the bidi category of every code point below 0x110000.
   VERSION is presumably the Unicode version printed in the generated header
   comment (the argument line of that fprintf is not visible here).
   NOTE(review): local declarations, failure branches and braces of this
   function are missing from this view; the comments below describe only
   the visible statements.  */
1483 output_bidi_category (const char *filename, const char *version)
1487 struct bidi_category_table t;
1488 unsigned int level1_offset, level2_offset, level3_offset;
1489 uint16_t *level3_packed;
1491 stream = fopen (filename, "w");
1494 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1498 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1499 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1500 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table: one entry per code point.  */
1505 bidi_category_table_init (&t);
1507 for (ch = 0; ch < 0x110000; ch++)
1509 int value = get_bidi_category (ch);
1511 bidi_category_table_add (&t, ch, value);
1514 bidi_category_table_finalize (&t);
1516 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets into t.result: past the five
   header words, then past level1, then past level2.  They are presumably
   assigned to level1_offset, level2_offset and level3_offset (the
   left-hand sides are not visible here).  */
1518 5 * sizeof (uint32_t);
1520 5 * sizeof (uint32_t)
1521 + t.level1_size * sizeof (uint32_t);
1523 5 * sizeof (uint32_t)
1524 + t.level1_size * sizeof (uint32_t)
1525 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as macros.  */
1527 for (i = 0; i < 5; i++)
1528 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1529 ((uint32_t *) t.result)[i]);
/* Emit the struct type.  level3 is declared in 16-bit units because the
   entries are packed at 5 bits each further below.  */
1530 fprintf (stream, "static const\n");
1531 fprintf (stream, "struct\n");
1532 fprintf (stream, " {\n");
1533 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1534 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1535 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1536 (1 << t.p) * 5 / 16);
1537 fprintf (stream, " }\n");
1538 fprintf (stream, "u_bidi_category =\n");
1539 fprintf (stream, "{\n");
/* level1 initializer, eight values per output line.  Each word read from
   t.result is printed either as -1 or as an index into level2 (its byte
   offset relative to level2_offset divided by sizeof (uint32_t)); the test
   that selects between the two is not visible here.  */
1540 fprintf (stream, " {");
1541 if (t.level1_size > 8)
1542 fprintf (stream, "\n ");
1543 for (i = 0; i < t.level1_size; i++)
1546 if (i > 0 && (i % 8) == 0)
1547 fprintf (stream, "\n ");
1548 offset = ((uint32_t *) (t.result + level1_offset))[i];
1550 fprintf (stream, " %5d", -1);
1552 fprintf (stream, " %5zu",
1553 (offset - level2_offset) / sizeof (uint32_t));
1554 if (i+1 < t.level1_size)
1555 fprintf (stream, ",");
1557 if (t.level1_size > 8)
1558 fprintf (stream, "\n ");
1559 fprintf (stream, " },\n");
/* level2 initializer, same layout; non-negative values are element indexes
   into the unpacked level3 array (byte offset relative to level3_offset
   divided by sizeof (uint8_t)).  */
1560 fprintf (stream, " {");
1561 if (t.level2_size << t.q > 8)
1562 fprintf (stream, "\n ");
1563 for (i = 0; i < t.level2_size << t.q; i++)
1566 if (i > 0 && (i % 8) == 0)
1567 fprintf (stream, "\n ");
1568 offset = ((uint32_t *) (t.result + level2_offset))[i];
1570 fprintf (stream, " %5d", -1);
1572 fprintf (stream, " %5zu",
1573 (offset - level3_offset) / sizeof (uint8_t));
1574 if (i+1 < t.level2_size << t.q)
1575 fprintf (stream, ",");
1577 if (t.level2_size << t.q > 8)
1578 fprintf (stream, "\n ");
1579 fprintf (stream, " },\n");
1580 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1581 not 32-bit units, in order to make the lookup function easier. */
/* Zero-filled buffer of 16-bit units, one more than the bit count requires
   so that the j+1 access below stays inside the buffer.
   NOTE(review): no NULL check of the calloc result is visible here.  */
1584 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*5 .. i*5+4 of the packed bit stream: j is the
   16-bit unit holding its first bit and k the bit position inside that
   unit.  Units j and j+1 are combined into one 32-bit window, the entry is
   OR-ed in at bit k, and the window is split back into the two units.  */
1585 for (i = 0; i < t.level3_size << t.p; i++)
1587 unsigned int j = (i * 5) / 16;
1588 unsigned int k = (i * 5) % 16;
1589 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1590 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1591 level3_packed[j] = value & 0xffff;
1592 level3_packed[j+1] = value >> 16;
/* Emit the packed units as hexadecimal, eight per output line.  */
1594 fprintf (stream, " {");
1595 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1596 fprintf (stream, "\n ");
1597 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1599 if (i > 0 && (i % 8) == 0)
1600 fprintf (stream, "\n ");
1601 fprintf (stream, " 0x%04x", level3_packed[i]);
1602 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1603 fprintf (stream, ",");
1605 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1606 fprintf (stream, "\n ");
1607 fprintf (stream, " }\n");
1608 free (level3_packed);
1609 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1611 if (ferror (stream) || fclose (stream))
1613 fprintf (stderr, "error writing to '%s'\n", filename);
1618 /* ========================================================================= */
1620 /* Decimal digit value. */
1621 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH: when CH has a name
   and a non-empty decdigit field in unicode_attributes, that field is
   converted with atoi.
   NOTE(review): the result for all other code points is not visible in this
   excerpt; the callers accept the range -1 .. 9, which suggests -1 -
   confirm.  */
1624 get_decdigit_value (unsigned int ch)
1626 if (unicode_attributes[ch].name != NULL
1627 && unicode_attributes[ch].decdigit[0] != '\0')
1628 return atoi (unicode_attributes[ch].decdigit);
1632 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation named decdigit_table, which is
   used by both output_decimal_digit and output_digit; each stored element
   is a uint8_t.  xmalloc and xrealloc are mapped onto plain malloc and
   realloc, so these names do not add any out-of-memory checking.
   NOTE(review): no DEFAULT definition and no include consuming these
   macros are visible in this excerpt.  */
1633 #define TABLE decdigit_table
1634 #define ELEMENT uint8_t
1636 #define xmalloc malloc
1637 #define xrealloc realloc
1640 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a comma-separated list of { code point, value } pairs,
   one per line, taken from get_decdigit_value over all code points below
   0x110000.  Code points are printed as at least four upper-case hex digits.
   NOTE(review): the conditions deciding which code points are listed and
   when the separating comma is printed are not visible in this excerpt.  */
1642 output_decimal_digit_test (const char *filename, const char *version)
1648 stream = fopen (filename, "w");
1651 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1655 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1656 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1657 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1661 for (ch = 0; ch < 0x110000; ch++)
1663 int value = get_decdigit_value (ch);
/* Sanity check: only -1 .. 9 are acceptable values (the action taken on
   violation is not visible here).  */
1665 if (!(value >= -1 && value < 10))
1671 fprintf (stream, ",\n");
1672 fprintf (stream, " { 0x%04X, %d }", ch, value);
1677 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1679 if (ferror (stream) || fclose (stream))
1681 fprintf (stderr, "error writing to '%s'\n", filename);
1686 /* Output the per-character decimal digit value table. */
/* Writes to FILENAME a C fragment that defines five decdigit_header_N
   macros and a static const struct u_decdigit holding a 3-level lookup
   table of the decimal digit values of all code points below 0x110000.
   Values are stored biased by one (1 + get_decdigit_value), and two level3
   entries are packed into each output byte.
   NOTE(review): local declarations, failure branches and braces of this
   function are missing from this view; the comments below describe only
   the visible statements.  */
1688 output_decimal_digit (const char *filename, const char *version)
1692 struct decdigit_table t;
1693 unsigned int level1_offset, level2_offset, level3_offset;
1695 stream = fopen (filename, "w");
1698 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1702 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1703 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1704 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table.  The bias of one keeps every stored value in 0 .. 10,
   which fits the 4 bits per entry used by the packing below.  */
1709 decdigit_table_init (&t);
1711 for (ch = 0; ch < 0x110000; ch++)
1713 int value = 1 + get_decdigit_value (ch);
1715 if (!(value >= 0 && value <= 10))
1718 decdigit_table_add (&t, ch, value);
1721 decdigit_table_finalize (&t);
1723 /* Offsets in t.result, in memory of this process. */
/* Byte offsets into t.result: past the five header words, then past level1,
   then past level2.  Presumably assigned to level1_offset, level2_offset
   and level3_offset (the left-hand sides are not visible here).  */
1725 5 * sizeof (uint32_t);
1727 5 * sizeof (uint32_t)
1728 + t.level1_size * sizeof (uint32_t);
1730 5 * sizeof (uint32_t)
1731 + t.level1_size * sizeof (uint32_t)
1732 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as macros.  */
1734 for (i = 0; i < 5; i++)
1735 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1736 ((uint32_t *) t.result)[i]);
/* Emit the struct type.  The shift count argument of the level3 line is
   not visible in this excerpt.  */
1737 fprintf (stream, "static const\n");
1738 fprintf (stream, "struct\n");
1739 fprintf (stream, " {\n");
1740 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1741 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1742 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1744 fprintf (stream, " }\n");
1745 fprintf (stream, "u_decdigit =\n");
1746 fprintf (stream, "{\n");
/* level1 initializer, eight values per output line.  Each word is printed
   either as -1 or as an index into level2; the test selecting between the
   two is not visible here.  */
1747 fprintf (stream, " {");
1748 if (t.level1_size > 8)
1749 fprintf (stream, "\n ");
1750 for (i = 0; i < t.level1_size; i++)
1753 if (i > 0 && (i % 8) == 0)
1754 fprintf (stream, "\n ");
1755 offset = ((uint32_t *) (t.result + level1_offset))[i];
1757 fprintf (stream, " %5d", -1);
1759 fprintf (stream, " %5zu",
1760 (offset - level2_offset) / sizeof (uint32_t));
1761 if (i+1 < t.level1_size)
1762 fprintf (stream, ",");
1764 if (t.level1_size > 8)
1765 fprintf (stream, "\n ");
1766 fprintf (stream, " },\n");
/* level2 initializer, same layout; non-negative values are element indexes
   into the unpacked level3 array.  */
1767 fprintf (stream, " {");
1768 if (t.level2_size << t.q > 8)
1769 fprintf (stream, "\n ");
1770 for (i = 0; i < t.level2_size << t.q; i++)
1773 if (i > 0 && (i % 8) == 0)
1774 fprintf (stream, "\n ");
1775 offset = ((uint32_t *) (t.result + level2_offset))[i];
1777 fprintf (stream, " %5d", -1);
1779 fprintf (stream, " %5zu",
1780 (offset - level3_offset) / sizeof (uint8_t));
1781 if (i+1 < t.level2_size << t.q)
1782 fprintf (stream, ",");
1784 if (t.level2_size << t.q > 8)
1785 fprintf (stream, "\n ");
1786 fprintf (stream, " },\n");
1787 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines two consecutive entries: entry 2*i in the low four
   bits and entry 2*i+1 in the high four bits.  Hence the loop runs over
   half as many items as there are level3 entries.  */
1788 fprintf (stream, " {");
1789 if (t.level3_size << (t.p - 1) > 8)
1790 fprintf (stream, "\n ");
1791 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1793 if (i > 0 && (i % 8) == 0)
1794 fprintf (stream, "\n ");
1795 fprintf (stream, " 0x%02x",
1796 ((uint8_t *) (t.result + level3_offset))[2*i]
1797 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1798 if (i+1 < t.level3_size << (t.p - 1))
1799 fprintf (stream, ",");
1801 if (t.level3_size << (t.p - 1) > 8)
1802 fprintf (stream, "\n ");
1803 fprintf (stream, " }\n");
1804 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1806 if (ferror (stream) || fclose (stream))
1808 fprintf (stderr, "error writing to '%s'\n", filename);
1813 /* ========================================================================= */
1816 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH: when CH has a name and a
   non-empty digit field in unicode_attributes, that field is converted
   with atoi.
   NOTE(review): the result for all other code points is not visible in this
   excerpt; the callers accept the range -1 .. 9, which suggests -1 -
   confirm.  */
1819 get_digit_value (unsigned int ch)
1821 if (unicode_attributes[ch].name != NULL
1822 && unicode_attributes[ch].digit[0] != '\0')
1823 return atoi (unicode_attributes[ch].digit);
1827 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a comma-separated list of { code point, value } pairs,
   one per line, taken from get_digit_value over all code points below
   0x110000.  Code points are printed as at least four upper-case hex digits.
   NOTE(review): the conditions deciding which code points are listed and
   when the separating comma is printed are not visible in this excerpt.  */
1829 output_digit_test (const char *filename, const char *version)
1835 stream = fopen (filename, "w");
1838 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1842 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1843 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1844 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1848 for (ch = 0; ch < 0x110000; ch++)
1850 int value = get_digit_value (ch);
/* Sanity check: only -1 .. 9 are acceptable values (the action taken on
   violation is not visible here).  */
1852 if (!(value >= -1 && value < 10))
1858 fprintf (stream, ",\n");
1859 fprintf (stream, " { 0x%04X, %d }", ch, value);
1864 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1866 if (ferror (stream) || fclose (stream))
1868 fprintf (stderr, "error writing to '%s'\n", filename);
1873 /* Output the per-character digit value table. */
/* Writes to FILENAME a C fragment that defines five digit_header_N macros
   and a static const struct u_digit holding a 3-level lookup table of the
   digit values of all code points below 0x110000.  The table type and its
   helper functions are those of decdigit_table, reused here.  Values are
   stored biased by one (1 + get_digit_value), and two level3 entries are
   packed into each output byte.
   NOTE(review): local declarations, failure branches and braces of this
   function are missing from this view; the comments below describe only
   the visible statements.  */
1875 output_digit (const char *filename, const char *version)
1879 struct decdigit_table t;
1880 unsigned int level1_offset, level2_offset, level3_offset;
1882 stream = fopen (filename, "w");
1885 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1889 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1890 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1891 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table.  The bias of one keeps every stored value in 0 .. 10,
   which fits the 4 bits per entry used by the packing below.  */
1896 decdigit_table_init (&t);
1898 for (ch = 0; ch < 0x110000; ch++)
1900 int value = 1 + get_digit_value (ch);
1902 if (!(value >= 0 && value <= 10))
1905 decdigit_table_add (&t, ch, value);
1908 decdigit_table_finalize (&t);
1910 /* Offsets in t.result, in memory of this process. */
/* Byte offsets into t.result: past the five header words, then past level1,
   then past level2.  Presumably assigned to level1_offset, level2_offset
   and level3_offset (the left-hand sides are not visible here).  */
1912 5 * sizeof (uint32_t);
1914 5 * sizeof (uint32_t)
1915 + t.level1_size * sizeof (uint32_t);
1917 5 * sizeof (uint32_t)
1918 + t.level1_size * sizeof (uint32_t)
1919 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as macros.  */
1921 for (i = 0; i < 5; i++)
1922 fprintf (stream, "#define digit_header_%d %d\n", i,
1923 ((uint32_t *) t.result)[i]);
/* Emit the struct type.  The shift count argument of the level3 line is
   not visible in this excerpt.  */
1924 fprintf (stream, "static const\n");
1925 fprintf (stream, "struct\n");
1926 fprintf (stream, " {\n");
1927 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1928 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1929 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1931 fprintf (stream, " }\n");
1932 fprintf (stream, "u_digit =\n");
1933 fprintf (stream, "{\n");
/* level1 initializer, eight values per output line.  Each word is printed
   either as -1 or as an index into level2; the test selecting between the
   two is not visible here.  */
1934 fprintf (stream, " {");
1935 if (t.level1_size > 8)
1936 fprintf (stream, "\n ");
1937 for (i = 0; i < t.level1_size; i++)
1940 if (i > 0 && (i % 8) == 0)
1941 fprintf (stream, "\n ");
1942 offset = ((uint32_t *) (t.result + level1_offset))[i];
1944 fprintf (stream, " %5d", -1);
1946 fprintf (stream, " %5zu",
1947 (offset - level2_offset) / sizeof (uint32_t));
1948 if (i+1 < t.level1_size)
1949 fprintf (stream, ",");
1951 if (t.level1_size > 8)
1952 fprintf (stream, "\n ");
1953 fprintf (stream, " },\n");
/* level2 initializer, same layout; non-negative values are element indexes
   into the unpacked level3 array.  */
1954 fprintf (stream, " {");
1955 if (t.level2_size << t.q > 8)
1956 fprintf (stream, "\n ");
1957 for (i = 0; i < t.level2_size << t.q; i++)
1960 if (i > 0 && (i % 8) == 0)
1961 fprintf (stream, "\n ");
1962 offset = ((uint32_t *) (t.result + level2_offset))[i];
1964 fprintf (stream, " %5d", -1);
1966 fprintf (stream, " %5zu",
1967 (offset - level3_offset) / sizeof (uint8_t));
1968 if (i+1 < t.level2_size << t.q)
1969 fprintf (stream, ",");
1971 if (t.level2_size << t.q > 8)
1972 fprintf (stream, "\n ");
1973 fprintf (stream, " },\n");
1974 /* Pack the level3 array. Each entry needs 4 bits only. */
/* Output byte i combines two consecutive entries: entry 2*i in the low four
   bits and entry 2*i+1 in the high four bits.  Hence the loop runs over
   half as many items as there are level3 entries.  */
1975 fprintf (stream, " {");
1976 if (t.level3_size << (t.p - 1) > 8)
1977 fprintf (stream, "\n ");
1978 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1980 if (i > 0 && (i % 8) == 0)
1981 fprintf (stream, "\n ");
1982 fprintf (stream, " 0x%02x",
1983 ((uint8_t *) (t.result + level3_offset))[2*i]
1984 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1985 if (i+1 < t.level3_size << (t.p - 1))
1986 fprintf (stream, ",");
1988 if (t.level3_size << (t.p - 1) > 8)
1989 fprintf (stream, "\n ");
1990 fprintf (stream, " }\n");
1991 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
1993 if (ferror (stream) || fclose (stream))
1995 fprintf (stderr, "error writing to '%s'\n", filename);
2000 /* ========================================================================= */
2002 /* Numeric value. */
2003 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as numerator / denominator.  get_numeric_value
   uses the pair 0 / 0 for code points that have no numeric value.  */
2005 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Computes the numeric value of the code point CH as a fraction.
   When CH has a name and a non-empty numeric field in unicode_attributes,
   the numerator is the leading integer of that field and the denominator is
   the integer following a slash, or 1 when the field has no slash.
   Both numbers are converted with atoi, so malformed text is not detected.
   The pair 0 / 0 is produced by the remaining assignments, presumably for
   code points without a numeric value (the branch structure and the return
   statement are not visible in this excerpt).  */
2007 static uc_fraction_t
2008 get_numeric_value (unsigned int ch)
2010 uc_fraction_t value;
2012 if (unicode_attributes[ch].name != NULL
2013 && unicode_attributes[ch].numeric[0] != '\0')
2015 const char *str = unicode_attributes[ch].numeric;
2016 /* str is of the form "integer" or "integer/posinteger". */
2017 value.numerator = atoi (str);
2018 if (strchr (str, '/') != NULL)
2019 value.denominator = atoi (strchr (str, '/') + 1);
2021 value.denominator = 1;
2025 value.numerator = 0;
2026 value.denominator = 0;
2031 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a comma-separated list of
   { code point, numerator, denominator } triples, one per line, for the
   code points below 0x110000 whose numeric value is not the 0 / 0 pair.
   NOTE(review): the condition controlling the separating comma is not
   visible in this excerpt.  */
2033 output_numeric_test (const char *filename, const char *version)
2039 stream = fopen (filename, "w");
2042 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2046 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2047 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2048 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2052 for (ch = 0; ch < 0x110000; ch++)
2054 uc_fraction_t value = get_numeric_value (ch);
/* Skip code points without a numeric value (both fields zero).  */
2056 if (value.numerator != 0 || value.denominator != 0)
2059 fprintf (stream, ",\n");
2060 fprintf (stream, " { 0x%04X, %d, %d }",
2061 ch, value.numerator, value.denominator);
2066 fprintf (stream, "\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2068 if (ferror (stream) || fclose (stream))
2070 fprintf (stderr, "error writing to '%s'\n", filename);
2075 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation named numeric_table, used by
   output_numeric; each stored element is a uint8_t (an index into the list
   of distinct fractions built there).  xmalloc and xrealloc are mapped onto
   plain malloc and realloc, so these names do not add any out-of-memory
   checking.
   NOTE(review): no DEFAULT definition and no include consuming these
   macros are visible in this excerpt.  */
2076 #define TABLE numeric_table
2077 #define ELEMENT uint8_t
2079 #define xmalloc malloc
2080 #define xrealloc realloc
2083 /* Output the per-character numeric value table. */
/* Writes to FILENAME a C fragment consisting of
     - the array u_numeric_values of all distinct fractions that occur,
     - five numeric_header_N macros, and
     - a static const struct u_numeric holding a 3-level lookup table that
       maps every code point below 0x110000 to an index into that array.
   At most 128 distinct fractions are supported, so an index fits the
   7 bits per entry used when the level3 array is packed.
   NOTE(review): local declarations, failure branches, break statements and
   braces of this function are missing from this view; the comments below
   describe only the visible statements.  */
2085 output_numeric (const char *filename, const char *version)
2088 uc_fraction_t fractions[128];
2089 unsigned int nfractions;
2090 unsigned int ch, i, j;
2091 struct numeric_table t;
2092 unsigned int level1_offset, level2_offset, level3_offset;
2093 uint16_t *level3_packed;
2095 stream = fopen (filename, "w");
2098 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2102 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2103 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2104 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2107 /* Create table of occurring fractions. */
/* For each code point: look the value up in fractions[]; if it is new,
   find the insertion point that keeps the array ordered by denominator and
   then by numerator, shift the tail up by one and insert it.  The capacity
   test against 128 guards the fixed-size array (its action is not visible
   here).  */
2109 for (ch = 0; ch < 0x110000; ch++)
2111 uc_fraction_t value = get_numeric_value (ch);
2113 for (i = 0; i < nfractions; i++)
2114 if (value.numerator == fractions[i].numerator
2115 && value.denominator == fractions[i].denominator)
2117 if (i == nfractions)
2119 if (nfractions == 128)
2121 for (i = 0; i < nfractions; i++)
2122 if (value.denominator < fractions[i].denominator
2123 || (value.denominator == fractions[i].denominator
2124 && value.numerator < fractions[i].numerator))
2126 for (j = nfractions; j > i; j--)
2127 fractions[j] = fractions[j - 1];
2128 fractions[i] = value;
/* Emit the list of distinct fractions, one initializer per line.  */
2133 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2135 fprintf (stream, "{\n");
2136 for (i = 0; i < nfractions; i++)
2138 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2139 fractions[i].denominator);
2140 if (i+1 < nfractions)
2141 fprintf (stream, ",");
2142 fprintf (stream, "\n");
2144 fprintf (stream, "};\n");
/* Build the table: each code point is mapped to the index of its value in
   fractions[].  Every value was entered above, so the search is expected to
   succeed; the handling of i == nfractions is not visible here.  */
2148 numeric_table_init (&t);
2150 for (ch = 0; ch < 0x110000; ch++)
2152 uc_fraction_t value = get_numeric_value (ch);
2154 for (i = 0; i < nfractions; i++)
2155 if (value.numerator == fractions[i].numerator
2156 && value.denominator == fractions[i].denominator)
2158 if (i == nfractions)
2161 numeric_table_add (&t, ch, i);
2164 numeric_table_finalize (&t);
2166 /* Offsets in t.result, in memory of this process. */
/* Byte offsets into t.result: past the five header words, then past level1,
   then past level2.  Presumably assigned to level1_offset, level2_offset
   and level3_offset (the left-hand sides are not visible here).  */
2168 5 * sizeof (uint32_t);
2170 5 * sizeof (uint32_t)
2171 + t.level1_size * sizeof (uint32_t);
2173 5 * sizeof (uint32_t)
2174 + t.level1_size * sizeof (uint32_t)
2175 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as macros.  */
2177 for (i = 0; i < 5; i++)
2178 fprintf (stream, "#define numeric_header_%d %d\n", i,
2179 ((uint32_t *) t.result)[i]);
/* Emit the struct type.  level3 is declared in 16-bit units because the
   entries are packed at 7 bits each further below.  */
2180 fprintf (stream, "static const\n");
2181 fprintf (stream, "struct\n");
2182 fprintf (stream, " {\n");
2183 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2184 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2185 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2186 (1 << t.p) * 7 / 16);
2187 fprintf (stream, " }\n");
2188 fprintf (stream, "u_numeric =\n");
2189 fprintf (stream, "{\n");
/* level1 initializer, eight values per output line.  Each word is printed
   either as -1 or as an index into level2; the test selecting between the
   two is not visible here.  */
2190 fprintf (stream, " {");
2191 if (t.level1_size > 8)
2192 fprintf (stream, "\n ");
2193 for (i = 0; i < t.level1_size; i++)
2196 if (i > 0 && (i % 8) == 0)
2197 fprintf (stream, "\n ");
2198 offset = ((uint32_t *) (t.result + level1_offset))[i];
2200 fprintf (stream, " %5d", -1);
2202 fprintf (stream, " %5zu",
2203 (offset - level2_offset) / sizeof (uint32_t));
2204 if (i+1 < t.level1_size)
2205 fprintf (stream, ",");
2207 if (t.level1_size > 8)
2208 fprintf (stream, "\n ");
2209 fprintf (stream, " },\n");
/* level2 initializer, same layout; non-negative values are element indexes
   into the unpacked level3 array.  */
2210 fprintf (stream, " {");
2211 if (t.level2_size << t.q > 8)
2212 fprintf (stream, "\n ");
2213 for (i = 0; i < t.level2_size << t.q; i++)
2216 if (i > 0 && (i % 8) == 0)
2217 fprintf (stream, "\n ");
2218 offset = ((uint32_t *) (t.result + level2_offset))[i];
2220 fprintf (stream, " %5d", -1);
2222 fprintf (stream, " %5zu",
2223 (offset - level3_offset) / sizeof (uint8_t));
2224 if (i+1 < t.level2_size << t.q)
2225 fprintf (stream, ",");
2227 if (t.level2_size << t.q > 8)
2228 fprintf (stream, "\n ");
2229 fprintf (stream, " },\n");
2230 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2231 not 32-bit units, in order to make the lookup function easier. */
/* Zero-filled buffer of 16-bit units, one more than the bit count requires
   so that the j+1 access below stays inside the buffer.
   NOTE(review): no NULL check of the calloc result is visible here.  */
2234 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* Entry i occupies bits i*7 .. i*7+6 of the packed bit stream: j is the
   16-bit unit holding its first bit and k the bit position inside that
   unit.  Units j and j+1 are combined into one 32-bit window, the entry is
   OR-ed in at bit k, and the window is split back into the two units.
   These j and k are local to the loop body and shadow the outer j.  */
2235 for (i = 0; i < t.level3_size << t.p; i++)
2237 unsigned int j = (i * 7) / 16;
2238 unsigned int k = (i * 7) % 16;
2239 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2240 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2241 level3_packed[j] = value & 0xffff;
2242 level3_packed[j+1] = value >> 16;
/* Emit the packed units as hexadecimal, eight per output line.  */
2244 fprintf (stream, " {");
2245 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2246 fprintf (stream, "\n ");
2247 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2249 if (i > 0 && (i % 8) == 0)
2250 fprintf (stream, "\n ");
2251 fprintf (stream, " 0x%04x", level3_packed[i]);
2252 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2253 fprintf (stream, ",");
2255 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2256 fprintf (stream, "\n ");
2257 fprintf (stream, " }\n");
2258 free (level3_packed);
2259 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2261 if (ferror (stream) || fclose (stream))
2263 fprintf (stderr, "error writing to '%s'\n", filename);
2268 /* ========================================================================= */
2271 /* See Unicode 3.0 book, section 4.7. */
2274 /* List of mirrored character pairs. This is a subset of the characters
2275 having the BidiMirrored property. */
/* Each row holds two code points.  get_mirror_value searches both columns,
   so every pair needs to be listed in one direction only.
   NOTE(review): the initializer of this array is not visible in this
   excerpt.  */
2276 static unsigned int mirror_pairs[][2] =
/* Computes the mirroring information of the code point CH.
   The partner of CH is looked up in mirror_pairs, in either column; 0xfffd
   serves as the marker for no partner found.  The visible return statement
   yields the signed distance from CH to its partner (partner minus CH).
   NOTE(review): the conditions guarding that return, the use of the
   mirrored flag, the action taken when an unmirrored character has a
   partner, and the remaining return statements are not visible in this
   excerpt.  */
2333 get_mirror_value (unsigned int ch)
2336 unsigned int mirror_char;
/* Only characters that have a name can carry the mirrored attribute.  */
2339 mirrored = (unicode_attributes[ch].name != NULL
2340 && unicode_attributes[ch].mirrored);
2341 mirror_char = 0xfffd;
2342 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2343 if (ch == mirror_pairs[i][0])
2345 mirror_char = mirror_pairs[i][1];
2348 else if (ch == mirror_pairs[i][1])
2350 mirror_char = mirror_pairs[i][0];
2354 return (int) mirror_char - (int) ch;
/* True when a partner was found in mirror_pairs.  */
2357 if (mirror_char != 0xfffd)
2363 /* Construction of sparse 3-level tables. */
/* Parameters for the table instantiation named mirror_table, used by
   output_mirror; each stored element is an int32_t because the values are
   signed code point differences.  xmalloc and xrealloc are mapped onto
   plain malloc and realloc, so these names do not add any out-of-memory
   checking.
   NOTE(review): no DEFAULT definition and no include consuming these
   macros are visible in this excerpt.  */
2364 #define TABLE mirror_table
2365 #define ELEMENT int32_t
2367 #define xmalloc malloc
2368 #define xrealloc realloc
2371 /* Output the per-character mirror table. */
/* Writes to FILENAME a C fragment that defines five mirror_header_N macros
   and a static const struct u_mirror holding a 3-level lookup table of
   get_mirror_value for every code point below 0x110000.  Unlike the other
   tables in this file, level3 is emitted unpacked, as plain int values.
   NOTE(review): local declarations, failure branches and braces of this
   function are missing from this view; the comments below describe only
   the visible statements.  */
2373 output_mirror (const char *filename, const char *version)
2377 struct mirror_table t;
2378 unsigned int level1_offset, level2_offset, level3_offset;
2380 stream = fopen (filename, "w");
2383 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2387 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2388 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2389 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Build the table: one entry per code point.  */
2394 mirror_table_init (&t);
2396 for (ch = 0; ch < 0x110000; ch++)
2398 int value = get_mirror_value (ch);
2400 mirror_table_add (&t, ch, value);
2403 mirror_table_finalize (&t);
2405 /* Offsets in t.result, in memory of this process. */
/* Byte offsets into t.result: past the five header words, then past level1,
   then past level2.  Presumably assigned to level1_offset, level2_offset
   and level3_offset (the left-hand sides are not visible here).  */
2407 5 * sizeof (uint32_t);
2409 5 * sizeof (uint32_t)
2410 + t.level1_size * sizeof (uint32_t);
2412 5 * sizeof (uint32_t)
2413 + t.level1_size * sizeof (uint32_t)
2414 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the first five uint32_t words of t.result as macros.  */
2416 for (i = 0; i < 5; i++)
2417 fprintf (stream, "#define mirror_header_%d %d\n", i,
2418 ((uint32_t *) t.result)[i]);
/* Emit the struct type.  */
2419 fprintf (stream, "static const\n");
2420 fprintf (stream, "struct\n");
2421 fprintf (stream, " {\n");
2422 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2423 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2424 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2425 fprintf (stream, " }\n");
2426 fprintf (stream, "u_mirror =\n");
2427 fprintf (stream, "{\n");
/* level1 initializer, eight values per output line.  Each word is printed
   either as -1 or as an index into level2; the test selecting between the
   two is not visible here.  */
2428 fprintf (stream, " {");
2429 if (t.level1_size > 8)
2430 fprintf (stream, "\n ");
2431 for (i = 0; i < t.level1_size; i++)
2434 if (i > 0 && (i % 8) == 0)
2435 fprintf (stream, "\n ");
2436 offset = ((uint32_t *) (t.result + level1_offset))[i];
2438 fprintf (stream, " %5d", -1);
2440 fprintf (stream, " %5zu",
2441 (offset - level2_offset) / sizeof (uint32_t));
2442 if (i+1 < t.level1_size)
2443 fprintf (stream, ",");
2445 if (t.level1_size > 8)
2446 fprintf (stream, "\n ");
2447 fprintf (stream, " },\n");
/* level2 initializer, same layout; here the divisor is sizeof (int32_t)
   because level3 elements are 32 bits wide.  */
2448 fprintf (stream, " {");
2449 if (t.level2_size << t.q > 8)
2450 fprintf (stream, "\n ");
2451 for (i = 0; i < t.level2_size << t.q; i++)
2454 if (i > 0 && (i % 8) == 0)
2455 fprintf (stream, "\n ");
2456 offset = ((uint32_t *) (t.result + level2_offset))[i];
2458 fprintf (stream, " %5d", -1);
2460 fprintf (stream, " %5zu",
2461 (offset - level3_offset) / sizeof (int32_t));
2462 if (i+1 < t.level2_size << t.q)
2463 fprintf (stream, ",");
2465 if (t.level2_size << t.q > 8)
2466 fprintf (stream, "\n ");
2467 fprintf (stream, " },\n");
/* level3 initializer: the signed values themselves, eight per line.  */
2468 fprintf (stream, " {");
2469 if (t.level3_size << t.p > 8)
2470 fprintf (stream, "\n ");
2471 for (i = 0; i < t.level3_size << t.p; i++)
2473 if (i > 0 && (i % 8) == 0)
2474 fprintf (stream, "\n ");
2475 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2476 if (i+1 < t.level3_size << t.p)
2477 fprintf (stream, ",");
2479 if (t.level3_size << t.p > 8)
2480 fprintf (stream, "\n ");
2481 fprintf (stream, " }\n");
2482 fprintf (stream, "};\n");
/* A pending stream error or a failing fclose is reported as a write error.  */
2484 if (ferror (stream) || fclose (stream))
2486 fprintf (stderr, "error writing to '%s'\n", filename);
2491 /* ========================================================================= */
2495 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Property identifiers.  Each one is used as a bit position in the entries
   of unicode_properties (see the 1ULL shift in fill_properties), so the
   whole list has to stay within the width of unsigned long long.
   NOTE(review): the opening of the enclosing declaration and a number of
   its members are not visible in this excerpt.  */
2504 PROP_QUOTATION_MARK,
2505 PROP_TERMINAL_PUNCTUATION,
2508 PROP_ASCII_HEX_DIGIT,
2509 PROP_OTHER_ALPHABETIC,
2513 PROP_OTHER_LOWERCASE,
2514 PROP_OTHER_UPPERCASE,
2515 PROP_NONCHARACTER_CODE_POINT,
2516 PROP_OTHER_GRAPHEME_EXTEND,
2517 PROP_IDS_BINARY_OPERATOR,
2518 PROP_IDS_TRINARY_OPERATOR,
2520 PROP_UNIFIED_IDEOGRAPH,
2521 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2524 PROP_LOGICAL_ORDER_EXCEPTION,
2525 PROP_OTHER_ID_START,
2526 PROP_OTHER_ID_CONTINUE,
2528 PROP_VARIATION_SELECTOR,
2529 PROP_PATTERN_WHITE_SPACE,
2530 PROP_PATTERN_SYNTAX,
2531 /* DerivedCoreProperties.txt */
2540 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2541 PROP_GRAPHEME_EXTEND,
/* Property bit set of every code point, indexed by code point.  Bit number
   PROP_x is set when the code point has property x; the bits are cleared by
   clear_properties and set by fill_properties.  */
2545 unsigned long long unicode_properties[0x110000];
/* Resets every entry of unicode_properties to zero, that is, no property
   set for any code point.  */
2548 clear_properties (void)
2552 for (i = 0; i < 0x110000; i++)
2553 unicode_properties[i] = 0;
2556 /* Stores in unicode_properties[] the properties from the
2557 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is read line by line.  Empty lines and lines starting
   with a hash sign are tested for (presumably skipped).  Every other line
   must have the form
       XXXX..YYYY ; Property_Name     or     XXXX ; Property_Name
   with hexadecimal code points.  The property name is mapped to its PROP_x
   identifier and the corresponding bit is OR-ed into unicode_properties for
   every code point of the range, so bits set by earlier calls are kept.
   NOTE(review): local declarations, the loop header, the failure actions
   and braces are missing from this view; the comments below describe only
   the visible statements.  */
2559 fill_properties (const char *proplist_filename)
2564 stream = fopen (proplist_filename, "r");
2567 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2574 unsigned int i1, i2;
2575 char padding[200+1];
2576 char propname[200+1];
2577 unsigned int propvalue;
/* Read one line of at most 200 characters and consume the newline; a
   result below 1 means no further line could be read.  */
2579 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2582 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  padding
   receives the run of blanks and semicolons between the code points and the
   name; the name ends at the first blank.  */
2585 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2587 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2589 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* PROP expands to one link of an if / else chain comparing propname with a
   known name; the statement after the last PROP line is the final else
   branch of that chain.  */
2594 #define PROP(name,value) \
2595 if (strcmp (propname, name) == 0) propvalue = value; else
2597 PROP ("White_Space", PROP_WHITE_SPACE)
2598 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2599 PROP ("Join_Control", PROP_JOIN_CONTROL)
2600 PROP ("Dash", PROP_DASH)
2601 PROP ("Hyphen", PROP_HYPHEN)
2602 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2603 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2604 PROP ("Other_Math", PROP_OTHER_MATH)
2605 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2606 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2607 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2608 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2609 PROP ("Diacritic", PROP_DIACRITIC)
2610 PROP ("Extender", PROP_EXTENDER)
2611 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2612 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2613 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2614 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2615 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2616 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2617 PROP ("Radical", PROP_RADICAL)
2618 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2619 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2620 PROP ("Deprecated", PROP_DEPRECATED)
2621 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2622 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2623 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2624 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2625 PROP ("STerm", PROP_STERM)
2626 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2627 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2628 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2629 /* DerivedCoreProperties.txt */
2630 PROP ("Math", PROP_MATH)
2631 PROP ("Alphabetic", PROP_ALPHABETIC)
2632 PROP ("Lowercase", PROP_LOWERCASE)
2633 PROP ("Uppercase", PROP_UPPERCASE)
2634 PROP ("ID_Start", PROP_ID_START)
2635 PROP ("ID_Continue", PROP_ID_CONTINUE)
2636 PROP ("XID_Start", PROP_XID_START)
2637 PROP ("XID_Continue", PROP_XID_CONTINUE)
2638 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2639 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2640 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2641 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2644 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be well ordered and inside the code space before it is
   used to index unicode_properties.  */
2648 if (!(i1 <= i2 && i2 < 0x110000))
2651 for (i = i1; i <= i2; i++)
2652 unicode_properties[i] |= 1ULL << propvalue;
/* A pending stream error or a failing fclose is reported as a read error.  */
2655 if (ferror (stream) || fclose (stream))
2657 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* fill_property30: reads one property section from a Unicode 3.0 style
   PropList file.  Visible steps: a loop over all 0x110000 code points
   (presumably initializing ARRAY -- the loop body is not visible in this
   copy), fopen of PROPLIST_FILENAME for reading, a scan for the first line
   containing PROPERTY_NAME, then parsing of the following lines as
   "XXXX..XXXX" ranges or single "XXXX" code points.  Every failure path
   shown prints a diagnostic to stderr.
   NOTE(review): the original comment on the next line is not terminated in
   this copy; its continuation line appears to be missing.  */
2662 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2665 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2671 for (i = 0; i < 0x110000; i++)
2674 stream = fopen (proplist_filename, "r");
2677 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2681 /* Search for the "Property dump for: ..." line. */
/* Read one line of at most 100 characters; a failed read means the section
   was never found.  */
2684 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2686 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
/* Keep reading until the line contains PROPERTY_NAME as a substring.  */
2690 while (strstr (buf, property_name) == NULL);
2694 unsigned int i1, i2;
2696 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* A line of at least 10 characters with ".." at offsets 4 and 5 is parsed
   as a range of two hexadecimal values of up to 4 digits each.  */
2700 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2702 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2704 fprintf (stderr, "parse error in property in '%s'\n",
/* Otherwise a line of at least 4 characters is parsed as a single
   hexadecimal value.  */
2709 else if (strlen (buf) >= 4)
2711 if (sscanf (buf, "%4X", &i1) < 1)
2713 fprintf (stderr, "parse error in property in '%s'\n",
2721 fprintf (stderr, "parse error in property in '%s'\n",
/* Sanity check: the range must be ordered and lie below 0x110000.  */
2725 if (!(i1 <= i2 && i2 < 0x110000))
2727 for (i = i1; i <= i2; i++)
/* Both a read error and a failing fclose are reported.  */
2730 if (ferror (stream) || fclose (stream))
2732 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2737 /* Properties from Unicode 3.0 PropList.txt file. */
/* Each array below holds one char per code point (0x110000 entries).  */
2739 /* The paired punctuation property from the PropList.txt file. */
2740 char unicode_pairedpunctuation[0x110000];
2742 /* The left of pair property from the PropList.txt file. */
2743 char unicode_leftofpair[0x110000];
/* Fills both arrays from PROPLIST30_FILENAME by calling fill_property30
   once per array, passing "(Paired Punctuation)" and "(Left of Pair)" as the
   property name to search for.  The file is therefore processed twice.  */
2746 fill_properties30 (const char *proplist30_filename)
2748 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2749 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2752 /* ------------------------------------------------------------------------- */
2754 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_WHITE_SPACE bit is set in unicode_properties[ch].
   CH is used as an array index directly; no range check is visible here.  */
2756 is_property_white_space (unsigned int ch)
2758 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
/* Alphabetic property, determined in two ways that are then compared
   (result1 against result2).  One value is derived: it includes the
   PROP_OTHER_ALPHABETIC bit and the hard-coded code point ranges below; the
   leading operands of that expression are not visible in this copy.  The
   other value is the PROP_ALPHABETIC bit read from unicode_properties.
   What happens when the two differ is not visible in this copy.  */
2761 /* See Unicode 3.0 book, section 4.10,
2762 PropList.txt, UCD.html,
2763 DerivedCoreProperties.txt, UCD.html. */
2765 is_property_alphabetic (unsigned int ch)
2769 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2770 /* For some reason, the following are listed as having property
2771 Alphabetic but not as having property Other_Alphabetic. */
2772 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2773 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2774 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2775 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2776 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2777 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2778 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2779 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2780 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2781 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2782 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2783 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
/* The Alphabetic bit as recorded in the property table (presumably the
   value of result2 -- its declaration is not visible here).  */
2785 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* Consistency check between the derived and the recorded value.  */
2787 if (result1 != result2)
2792 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_ALPHABETIC bit is set for CH.  */
2794 is_property_other_alphabetic (unsigned int ch)
2796 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2799 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_NONCHARACTER_CODE_POINT bit is set for CH.  */
2801 is_property_not_a_character (unsigned int ch)
2803 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
/* Default_Ignorable_Code_Point, determined in two ways that are compared
   (result1 against result2).  The derived value is: category Cf minus the
   listed exceptions, or the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit, or
   the PROP_VARIATION_SELECTOR bit.  The other value is the
   PROP_DEFAULT_IGNORABLE_CODE_POINT bit from unicode_properties.  The action
   taken on a mismatch is not visible in this copy.  */
2806 /* See PropList.txt, UCD.html,
2807 DerivedCoreProperties.txt, UCD.html. */
2809 is_property_default_ignorable_code_point (unsigned int ch)
2812 (is_category_Cf (ch)
2813 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
/* Further Cf code points excluded explicitly: 0x0600..0x0603, 0x06DD and
   0x070F.  */
2814 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2815 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2816 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2818 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
/* Consistency check between the derived and the recorded value.  */
2820 if (result1 != result2)
2825 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit is set.  */
2827 is_property_other_default_ignorable_code_point (unsigned int ch)
2829 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2832 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_DEPRECATED bit is set for CH.  */
2834 is_property_deprecated (unsigned int ch)
2836 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2839 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_LOGICAL_ORDER_EXCEPTION bit is set for CH.  */
2841 is_property_logical_order_exception (unsigned int ch)
2843 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2846 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_VARIATION_SELECTOR bit is set for CH.  */
2848 is_property_variation_selector (unsigned int ch)
2850 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2853 /* See PropList-3.0.1.txt. */
/* Returns whether CH lies in one of three hard-coded ranges:
   0xE000..0xF8FF, 0xF0000..0xFFFFD or 0x100000..0x10FFFD.  No table lookup
   is involved.  */
2855 is_property_private_use (unsigned int ch)
2857 /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */
2858 return (ch >= 0xE000 && ch <= 0xF8FF)
2859 || (ch >= 0xF0000 && ch <= 0xFFFFD)
2860 || (ch >= 0x100000 && ch <= 0x10FFFD);
2863 /* See PropList-3.0.1.txt. */
/* Returns whether CH satisfies is_category_Cn but is not flagged by
   is_property_not_a_character.  */
2865 is_property_unassigned_code_value (unsigned int ch)
2867 return (is_category_Cn (ch) && !is_property_not_a_character (ch));
/* Uppercase property, determined in two ways that are compared (result1
   against result2).  The derived value includes the PROP_OTHER_UPPERCASE
   bit; its leading operand is not visible in this copy.  The other value is
   the PROP_UPPERCASE bit from unicode_properties.  */
2870 /* See PropList.txt, UCD.html,
2871 DerivedCoreProperties.txt, UCD.html. */
2873 is_property_uppercase (unsigned int ch)
2877 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2879 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
/* Consistency check; the action on mismatch is not visible here.  */
2881 if (result1 != result2)
2886 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_UPPERCASE bit is set for CH.  */
2888 is_property_other_uppercase (unsigned int ch)
2890 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
/* Lowercase property, determined in two ways that are compared (result1
   against result2).  The derived value includes the PROP_OTHER_LOWERCASE
   bit; its leading operand is not visible in this copy.  The other value is
   the PROP_LOWERCASE bit from unicode_properties.  */
2893 /* See PropList.txt, UCD.html,
2894 DerivedCoreProperties.txt, UCD.html. */
2896 is_property_lowercase (unsigned int ch)
2900 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2902 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
/* Consistency check; the action on mismatch is not visible here.  */
2904 if (result1 != result2)
2909 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_LOWERCASE bit is set for CH.  */
2911 is_property_other_lowercase (unsigned int ch)
2913 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2916 /* See PropList-3.0.1.txt. */
/* Titlecase is defined purely by general category: delegates to
   is_category_Lt.  */
2918 is_property_titlecase (unsigned int ch)
2920 return is_category_Lt (ch);
2923 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_SOFT_DOTTED bit is set for CH.  */
2925 is_property_soft_dotted (unsigned int ch)
2927 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2930 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_ID_START bit is set for CH.  Unlike the
   alphabetic/uppercase/lowercase predicates, no derived value is computed
   for cross-checking.  */
2932 is_property_id_start (unsigned int ch)
2934 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2937 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_ID_START bit is set for CH.  */
2939 is_property_other_id_start (unsigned int ch)
2941 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2944 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_ID_CONTINUE bit is set for CH.  */
2946 is_property_id_continue (unsigned int ch)
2948 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2951 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_ID_CONTINUE bit is set for CH.  */
2953 is_property_other_id_continue (unsigned int ch)
2955 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2958 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_XID_START bit is set for CH.  */
2960 is_property_xid_start (unsigned int ch)
2962 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2965 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_XID_CONTINUE bit is set for CH.  */
2967 is_property_xid_continue (unsigned int ch)
2969 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2972 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_PATTERN_WHITE_SPACE bit is set for CH.  */
2974 is_property_pattern_white_space (unsigned int ch)
2976 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2979 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_PATTERN_SYNTAX bit is set for CH.  */
2981 is_property_pattern_syntax (unsigned int ch)
2983 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2986 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_JOIN_CONTROL bit is set for CH.  */
2988 is_property_join_control (unsigned int ch)
2990 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2993 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_GRAPHEME_BASE bit is set for CH.  */
2995 is_property_grapheme_base (unsigned int ch)
2997 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3000 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_GRAPHEME_EXTEND bit is set for CH.  */
3002 is_property_grapheme_extend (unsigned int ch)
3004 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3007 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_GRAPHEME_EXTEND bit is set for CH.  */
3009 is_property_other_grapheme_extend (unsigned int ch)
3011 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3014 /* See DerivedCoreProperties.txt, UCD.html. */
/* Returns whether the PROP_GRAPHEME_LINK bit is set for CH.  */
3016 is_property_grapheme_link (unsigned int ch)
3018 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3021 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_BIDI_CONTROL bit is set for CH.  */
3023 is_property_bidi_control (unsigned int ch)
3025 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
/* The predicates in this group each compare get_bidi_category (ch) with a
   single UC_BIDI_* constant; they do not consult unicode_properties.  */
3028 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_L.  */
3030 is_property_bidi_left_to_right (unsigned int ch)
3032 return (get_bidi_category (ch) == UC_BIDI_L);
3035 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_R.  */
3037 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3039 return (get_bidi_category (ch) == UC_BIDI_R);
3042 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_AL.  */
3044 is_property_bidi_arabic_right_to_left (unsigned int ch)
3046 return (get_bidi_category (ch) == UC_BIDI_AL);
3049 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_EN.  */
3051 is_property_bidi_european_digit (unsigned int ch)
3053 return (get_bidi_category (ch) == UC_BIDI_EN);
3056 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_ES.  */
3058 is_property_bidi_eur_num_separator (unsigned int ch)
3060 return (get_bidi_category (ch) == UC_BIDI_ES);
3063 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_ET.  */
3065 is_property_bidi_eur_num_terminator (unsigned int ch)
3067 return (get_bidi_category (ch) == UC_BIDI_ET);
3070 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_AN.  */
3072 is_property_bidi_arabic_digit (unsigned int ch)
3074 return (get_bidi_category (ch) == UC_BIDI_AN);
3077 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_CS.  */
3079 is_property_bidi_common_separator (unsigned int ch)
3081 return (get_bidi_category (ch) == UC_BIDI_CS);
3084 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_B.  */
3086 is_property_bidi_block_separator (unsigned int ch)
3088 return (get_bidi_category (ch) == UC_BIDI_B);
3091 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_S.  */
3093 is_property_bidi_segment_separator (unsigned int ch)
3095 return (get_bidi_category (ch) == UC_BIDI_S);
3098 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_WS.  */
3100 is_property_bidi_whitespace (unsigned int ch)
3102 return (get_bidi_category (ch) == UC_BIDI_WS);
3105 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_NSM.  */
3107 is_property_bidi_non_spacing_mark (unsigned int ch)
3109 return (get_bidi_category (ch) == UC_BIDI_NSM);
3112 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_BN.  */
3114 is_property_bidi_boundary_neutral (unsigned int ch)
3116 return (get_bidi_category (ch) == UC_BIDI_BN);
3119 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_PDF.  */
3121 is_property_bidi_pdf (unsigned int ch)
3123 return (get_bidi_category (ch) == UC_BIDI_PDF);
3126 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is any of the four embedding/override
   categories LRE, LRO, RLE, RLO.  The category is fetched once and compared
   four times.  */
3128 is_property_bidi_embedding_or_override (unsigned int ch)
3130 int category = get_bidi_category (ch);
3131 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3132 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3135 /* See PropList-3.0.1.txt. */
/* True when the bidi category of CH is UC_BIDI_ON.  */
3137 is_property_bidi_other_neutral (unsigned int ch)
3139 return (get_bidi_category (ch) == UC_BIDI_ON);
3142 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_HEX_DIGIT bit is set for CH.  */
3144 is_property_hex_digit (unsigned int ch)
3146 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3149 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_ASCII_HEX_DIGIT bit is set for CH.  */
3151 is_property_ascii_hex_digit (unsigned int ch)
3153 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3156 /* See Unicode 3.0 book, section 4.10,
3157 PropList.txt, UCD.html. */
/* Returns whether the PROP_IDEOGRAPHIC bit is set for CH.  */
3159 is_property_ideographic (unsigned int ch)
3161 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3164 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_UNIFIED_IDEOGRAPH bit is set for CH.  */
3166 is_property_unified_ideograph (unsigned int ch)
3168 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3171 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_RADICAL bit is set for CH.  */
3173 is_property_radical (unsigned int ch)
3175 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3178 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_IDS_BINARY_OPERATOR bit is set for CH.  */
3180 is_property_ids_binary_operator (unsigned int ch)
3182 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3185 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_IDS_TRINARY_OPERATOR bit is set for CH.  */
3187 is_property_ids_trinary_operator (unsigned int ch)
3189 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3192 /* See PropList-3.0.1.txt. */
/* True when CH satisfies is_category_Cf, or when CH has a character name
   that contains the substring "ZERO WIDTH".  The name is checked for NULL
   before strstr is applied.  */
3194 is_property_zero_width (unsigned int ch)
3196 return is_category_Cf (ch)
3197 || (unicode_attributes[ch].name != NULL
3198 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3201 /* See PropList-3.0.1.txt. */
/* Space is defined purely by general category: delegates to
   is_category_Zs.  */
3203 is_property_space (unsigned int ch)
3205 return is_category_Zs (ch);
/* Non-break property: a hard-coded list of sixteen individual code points,
   each annotated with its character name; no table is consulted.
   NOTE(review): the comment that opens inside the body ("This is exactly
   the set ...") is not terminated in this copy; its continuation line
   appears to be missing.  */
3208 /* See PropList-3.0.1.txt. */
3210 is_property_non_break (unsigned int ch)
3212 /* This is exactly the set of characters having line breaking
3214 return (ch == 0x00A0 /* NO-BREAK SPACE */
3215 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
3216 || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
3217 || ch == 0x035D /* COMBINING DOUBLE BREVE */
3218 || ch == 0x035E /* COMBINING DOUBLE MACRON */
3219 || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
3220 || ch == 0x0360 /* COMBINING DOUBLE TILDE */
3221 || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
3222 || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
3223 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
3224 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
3225 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
3226 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
3227 || ch == 0x2007 /* FIGURE SPACE */
3228 || ch == 0x2011 /* NON-BREAKING HYPHEN */
3229 || ch == 0x202F /* NARROW NO-BREAK SPACE */);
/* ISO control property, determined in two ways that are compared (result1
   against result2): by the character name being exactly "<control>", and by
   is_category_Cc.  The action taken on a mismatch is not visible in this
   copy.  */
3232 /* See PropList-3.0.1.txt. */
3234 is_property_iso_control (unsigned int ch)
/* Name-based test; the NULL check guards strcmp.  */
3237 (unicode_attributes[ch].name != NULL
3238 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
/* Category-based test.  */
3240 is_category_Cc (ch);
3242 if (result1 != result2)
3247 /* See PropList-3.0.1.txt. */
/* Requires category Cf, bidi category UC_BIDI_BN, and absence of the join
   control property.
   NOTE(review): the return expression is not closed in this copy, so
   further conditions may follow on a line that is missing here.  */
3249 is_property_format_control (unsigned int ch)
3251 return (is_category_Cf (ch)
3252 && get_bidi_category (ch) == UC_BIDI_BN
3253 && !is_property_join_control (ch)
3257 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_DASH bit is set for CH.  */
3259 is_property_dash (unsigned int ch)
3261 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3264 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_HYPHEN bit is set for CH.  */
3266 is_property_hyphen (unsigned int ch)
3268 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3271 /* See PropList-3.0.1.txt. */
/* Defined by general category: delegates to is_category_P.  */
3273 is_property_punctuation (unsigned int ch)
3275 return is_category_P (ch);
3278 /* See PropList-3.0.1.txt. */
/* Defined by general category: delegates to is_category_Zl.  */
3280 is_property_line_separator (unsigned int ch)
3282 return is_category_Zl (ch);
3285 /* See PropList-3.0.1.txt. */
/* Defined by general category: delegates to is_category_Zp.  */
3287 is_property_paragraph_separator (unsigned int ch)
3289 return is_category_Zp (ch);
3292 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_QUOTATION_MARK bit is set for CH.  */
3294 is_property_quotation_mark (unsigned int ch)
3296 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3299 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_STERM bit is set for CH; note that the function
   is named "sentence_terminal" while the bit is named STERM.  */
3301 is_property_sentence_terminal (unsigned int ch)
3303 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3306 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_TERMINAL_PUNCTUATION bit is set for CH.  */
3308 is_property_terminal_punctuation (unsigned int ch)
3310 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3313 /* See PropList-3.0.1.txt. */
/* Defined by general category: delegates to is_category_Sc.  */
3315 is_property_currency_symbol (unsigned int ch)
3317 return is_category_Sc (ch);
/* Math property, determined in two ways that are compared (result1 against
   result2).  The derived value includes the PROP_OTHER_MATH bit; its
   leading operand is not visible in this copy.  The other value is the
   PROP_MATH bit from unicode_properties.  */
3320 /* See Unicode 3.0 book, section 4.9,
3321 PropList.txt, UCD.html,
3322 DerivedCoreProperties.txt, UCD.html. */
3324 is_property_math (unsigned int ch)
3328 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3330 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
/* Consistency check; the action on mismatch is not visible here.  */
3332 if (result1 != result2)
3337 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_OTHER_MATH bit is set for CH.  */
3339 is_property_other_math (unsigned int ch)
3341 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3344 /* See PropList-3.0.1.txt. */
/* Returns the entry for CH in the unicode_pairedpunctuation array, which is
   filled from the Unicode 3.0 PropList file rather than from
   unicode_properties.  */
3346 is_property_paired_punctuation (unsigned int ch)
3348 return unicode_pairedpunctuation[ch];
3351 /* See PropList-3.0.1.txt. */
/* Returns the entry for CH in the unicode_leftofpair array, which is filled
   from the Unicode 3.0 PropList file rather than from unicode_properties.  */
3353 is_property_left_of_pair (unsigned int ch)
3355 return unicode_leftofpair[ch];
3358 /* See PropList-3.0.1.txt. */
/* True for a code point that has a name and either a combining class string
   other than "0" or one of the mark categories Mc, Me, Mn.  The name check
   comes first, so the combining field is only read for code points that
   have a name.  */
3360 is_property_combining (unsigned int ch)
3362 return (unicode_attributes[ch].name != NULL
3363 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3364 || is_category_Mc (ch)
3365 || is_category_Me (ch)
3366 || is_category_Mn (ch)));
3369 #if 0 /* same as is_property_bidi_non_spacing_mark */
/* Disabled by the preprocessor.  Differs textually from
   is_property_bidi_non_spacing_mark only by the additional name != NULL
   test.  */
3370 /* See PropList-3.0.1.txt. */
3372 is_property_non_spacing (unsigned int ch)
3374 return (unicode_attributes[ch].name != NULL
3375 && get_bidi_category (ch) == UC_BIDI_NSM);
/* Composite property.  Two cases are visible: a code point in the
   precomposed Hangul syllable range, and a named code point whose
   decomposition mapping, after skipping an optional "<tag>" prefix and one
   following space, contains more than one element and does not start with
   "0020 ".
   The Hangul range ends at U+D7A3 (0xAC00 + 11172 syllables - 1); the bound
   was previously 0xD7A4, which also matched the unassigned code point
   U+D7A4.
   NOTE(review): strchr (decomp, '>') is used without a NULL check, so a
   decomposition that starts with '<' but has no '>' is not handled.  */
3379 /* See PropList-3.0.1.txt. */
3381 is_property_composite (unsigned int ch)
3383 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3384 logical in some sense. */
3385 if (ch >= 0xAC00 && ch <= 0xD7A3) /* Hangul Syllables */
3387 if (unicode_attributes[ch].name != NULL
3388 && unicode_attributes[ch].decomposition != NULL)
3390 /* Test whether the decomposition contains more than one character,
3391 and the first is not a space. */
3392 const char *decomp = unicode_attributes[ch].decomposition;
/* Skip a leading compatibility tag such as "<compat>".  */
3393 if (decomp[0] == '<')
3395 decomp = strchr (decomp, '>') + 1;
3396 if (decomp[0] == ' ')
3399 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3404 /* See PropList-3.0.1.txt. */
/* Defined by general category: delegates to is_category_Nd.  */
3406 is_property_decimal_digit (unsigned int ch)
3408 return is_category_Nd (ch);
3411 /* See PropList-3.0.1.txt. */
/* True when the value returned by get_numeric_value (ch) has a positive
   denominator, or when CH is one of the two code points listed
   explicitly.  */
3413 is_property_numeric (unsigned int ch)
3415 return ((get_numeric_value (ch)).denominator > 0)
3416 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3417 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3420 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_DIACRITIC bit is set for CH.  */
3422 is_property_diacritic (unsigned int ch)
3424 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3427 /* See PropList.txt, UCD.html. */
/* Returns whether the PROP_EXTENDER bit is set for CH.  */
3429 is_property_extender (unsigned int ch)
3431 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3434 /* See PropList-3.0.1.txt. */
/* Accepts a Cc character whose bidi category is UC_BIDI_BN, or any Cf
   character.
   NOTE(review): the return statement has no terminating semicolon in this
   copy, so an additional condition may follow on a line that is missing
   here.  */
3436 is_property_ignorable_control (unsigned int ch)
3438 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3439 || is_category_Cf (ch))
3443 /* ------------------------------------------------------------------------- */
3445 /* Output all properties. */
/* For each property P listed below, the PROPERTY macro issues three calls
   with the predicate is_property_P: debug_output_predicate with the file
   name unictype/pr_P.txt, output_predicate_test with
   ../tests/unictype/test-pr_P.c and the expression text
   "uc_is_property_P (c)", and output_predicate with unictype/pr_P.h, the
   identifier u_property_P, the label "Properties" and VERSION.  */
3447 output_properties (const char *version)
3449 #define PROPERTY(P) \
3450 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3451 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3452 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
/* NOTE(review): several predicates defined in this file (for example
   uppercase, lowercase, titlecase, math, radical) have no PROPERTY line in
   this copy; confirm against the complete file whether those lines were
   lost.  */
3453 PROPERTY(white_space)
3454 PROPERTY(alphabetic)
3455 PROPERTY(other_alphabetic)
3456 PROPERTY(not_a_character)
3457 PROPERTY(default_ignorable_code_point)
3458 PROPERTY(other_default_ignorable_code_point)
3459 PROPERTY(deprecated)
3460 PROPERTY(logical_order_exception)
3461 PROPERTY(variation_selector)
3462 PROPERTY(private_use)
3463 PROPERTY(unassigned_code_value)
3465 PROPERTY(other_uppercase)
3467 PROPERTY(other_lowercase)
3469 PROPERTY(soft_dotted)
3471 PROPERTY(other_id_start)
3472 PROPERTY(id_continue)
3473 PROPERTY(other_id_continue)
3475 PROPERTY(xid_continue)
3476 PROPERTY(pattern_white_space)
3477 PROPERTY(pattern_syntax)
3478 PROPERTY(join_control)
3479 PROPERTY(grapheme_base)
3480 PROPERTY(grapheme_extend)
3481 PROPERTY(other_grapheme_extend)
3482 PROPERTY(grapheme_link)
3483 PROPERTY(bidi_control)
3484 PROPERTY(bidi_left_to_right)
3485 PROPERTY(bidi_hebrew_right_to_left)
3486 PROPERTY(bidi_arabic_right_to_left)
3487 PROPERTY(bidi_european_digit)
3488 PROPERTY(bidi_eur_num_separator)
3489 PROPERTY(bidi_eur_num_terminator)
3490 PROPERTY(bidi_arabic_digit)
3491 PROPERTY(bidi_common_separator)
3492 PROPERTY(bidi_block_separator)
3493 PROPERTY(bidi_segment_separator)
3494 PROPERTY(bidi_whitespace)
3495 PROPERTY(bidi_non_spacing_mark)
3496 PROPERTY(bidi_boundary_neutral)
3498 PROPERTY(bidi_embedding_or_override)
3499 PROPERTY(bidi_other_neutral)
3501 PROPERTY(ascii_hex_digit)
3502 PROPERTY(ideographic)
3503 PROPERTY(unified_ideograph)
3505 PROPERTY(ids_binary_operator)
3506 PROPERTY(ids_trinary_operator)
3507 PROPERTY(zero_width)
3510 PROPERTY(iso_control)
3511 PROPERTY(format_control)
3514 PROPERTY(punctuation)
3515 PROPERTY(line_separator)
3516 PROPERTY(paragraph_separator)
3517 PROPERTY(quotation_mark)
3518 PROPERTY(sentence_terminal)
3519 PROPERTY(terminal_punctuation)
3520 PROPERTY(currency_symbol)
3522 PROPERTY(other_math)
3523 PROPERTY(paired_punctuation)
3524 PROPERTY(left_of_pair)
3527 PROPERTY(decimal_digit)
3531 PROPERTY(ignorable_control)
3535 /* ========================================================================= */
/* Script names seen so far, in order of first appearance; numscripts is the
   number of entries in use.  */
3539 static const char *scripts[256];
3540 static unsigned int numscripts;
/* Script index per code point.  The all-ones byte value
   (uint8_t)~(uint8_t)0 marks a code point without a script.  */
3542 static uint8_t unicode_scripts[0x110000];
/* Reads SCRIPTS_FILENAME and fills scripts, numscripts and unicode_scripts.
   Lines that are empty or start with '#' are recognized separately; other
   lines are parsed as "XXXX..XXXX ; Name" or "XXXX ; Name".  */
3545 fill_scripts (const char *scripts_filename)
3550 stream = fopen (scripts_filename, "r");
3553 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Start with every code point marked as having no script.  */
3559 for (i = 0; i < 0x110000; i++)
3560 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3565 unsigned int i1, i2;
3566 char padding[200+1];
3567 char scriptname[200+1];
/* Read one line of at most 200 characters.  */
3570 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3573 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  The script
   name extends up to the next space.  */
3576 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3578 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3580 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look for an already registered script with this name, searching from the
   most recently added entry downwards.  */
3590 for (script = numscripts - 1; script >= 0; script--)
3591 if (strcmp (scripts[script], scriptname) == 0)
/* Register a new script name; the name is copied with strdup and the
   result is not checked for NULL here.  */
3595 scripts[numscripts] = strdup (scriptname);
3596 script = numscripts;
/* Capacity check against the size of the scripts array.  */
3598 if (numscripts == 256)
/* Assign the script to every code point of the range, reporting code points
   that already had one.  */
3602 for (i = i1; i <= i2; i++)
3604 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3605 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3606 unicode_scripts[i] = script;
3610 if (ferror (stream) || fclose (stream))
3612 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3617 /* Construction of sparse 3-level tables. */
/* Parameter macros for a table template; the directive that consumes them
   is not visible in this copy.  output_scripts uses struct script_table
   together with script_table_init, script_table_add and
   script_table_finalize.  ELEMENT is uint8_t and DEFAULT is the same
   all-ones byte that unicode_scripts uses for "no script".  xmalloc and
   xrealloc are mapped to plain malloc and realloc.  */
3618 #define TABLE script_table
3619 #define ELEMENT uint8_t
3620 #define DEFAULT (uint8_t)~(uint8_t)0
3621 #define xmalloc malloc
3622 #define xrealloc realloc
/* Writes the file unictype/scripts.h.  The output consists of one
   uc_interval_t array per script, the scripts[] array that refers to them,
   five script_header_N #defines, and the u_script structure holding the
   three levels of a lookup table built from unicode_scripts.  VERSION is
   inserted into the generated header comment.  */
3626 output_scripts (const char *version)
3628 const char *filename = "unictype/scripts.h";
3630 unsigned int ch, s, i;
3631 struct script_table t;
3632 unsigned int level1_offset, level2_offset, level3_offset;
3636 const char *lowercase_name;
3639 scriptinfo_t scriptinfo[256];
3641 stream = fopen (filename, "w");
3644 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3648 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3649 fprintf (stream, "/* Unicode scripts. */\n");
3650 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Make a private copy of each script name and walk it looking for ASCII
   uppercase letters; the copy is stored as the script's lowercase_name.  */
3653 for (s = 0; s < numscripts; s++)
3655 char *lcp = strdup (scripts[s]);
3658 for (cp = lcp; *cp != '\0'; cp++)
3659 if (*cp >= 'A' && *cp <= 'Z')
3662 scriptinfo[s].lowercase_name = lcp;
/* Emit one interval array per script by scanning unicode_scripts for
   maximal runs of code points that belong to script S.  */
3665 for (s = 0; s < numscripts; s++)
3667 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3668 scriptinfo[s].lowercase_name);
3669 fprintf (stream, "{\n");
3671 for (ch = 0; ch < 0x110000; ch++)
3672 if (unicode_scripts[ch] == s)
/* Extend the run as long as the next code point has the same script.  */
3678 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3683 fprintf (stream, ",\n");
/* Two output forms: a single entry flagged 1, 1, or a pair of entries
   flagged 1, 0 and 0, 1.  */
3685 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3687 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3691 fprintf (stream, "\n");
3692 fprintf (stream, "};\n");
/* Emit the scripts[] array: interval count, interval array and original
   name for each script.  */
3695 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3696 fprintf (stream, "{\n");
3697 for (s = 0; s < numscripts; s++)
3699 fprintf (stream, " {\n");
3700 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3701 scriptinfo[s].lowercase_name);
3702 fprintf (stream, " script_%s_intervals,\n",
3703 scriptinfo[s].lowercase_name);
3704 fprintf (stream, " \"%s\"\n", scripts[s]);
3705 fprintf (stream, " }");
3706 if (s+1 < numscripts)
3707 fprintf (stream, ",");
3708 fprintf (stream, "\n");
3710 fprintf (stream, "};\n");
/* Build the lookup table from every code point that has a script.  */
3714 script_table_init (&t);
3716 for (ch = 0; ch < 0x110000; ch++)
3718 unsigned int s = unicode_scripts[ch];
3719 if (s != (uint8_t)~(uint8_t)0)
3720 script_table_add (&t, ch, s);
3723 script_table_finalize (&t);
3725 /* Offsets in t.result, in memory of this process. */
/* The three expressions below are byte offsets: after a header of five
   uint32_t words, after the level 1 entries, and after the level 2
   entries.  */
3727 5 * sizeof (uint32_t);
3729 5 * sizeof (uint32_t)
3730 + t.level1_size * sizeof (uint32_t);
3732 5 * sizeof (uint32_t)
3733 + t.level1_size * sizeof (uint32_t)
3734 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Emit the five header words as #defines.  */
3736 for (i = 0; i < 5; i++)
3737 fprintf (stream, "#define script_header_%d %d\n", i,
3738 ((uint32_t *) t.result)[i]);
/* Emit the declaration of the u_script structure.  */
3739 fprintf (stream, "static const\n");
3740 fprintf (stream, "struct\n");
3741 fprintf (stream, " {\n");
3742 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3743 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3744 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3745 fprintf (stream, " }\n");
3746 fprintf (stream, "u_script =\n");
3747 fprintf (stream, "{\n");
/* Level 1: eight entries per output line; an entry is written either as -1
   or as an index relative to the start of level 2.  */
3748 fprintf (stream, " {");
3749 if (t.level1_size > 8)
3750 fprintf (stream, "\n ");
3751 for (i = 0; i < t.level1_size; i++)
3754 if (i > 0 && (i % 8) == 0)
3755 fprintf (stream, "\n ");
3756 offset = ((uint32_t *) (t.result + level1_offset))[i];
3758 fprintf (stream, " %5d", -1);
3760 fprintf (stream, " %5zu",
3761 (offset - level2_offset) / sizeof (uint32_t));
3762 if (i+1 < t.level1_size)
3763 fprintf (stream, ",");
3765 if (t.level1_size > 8)
3766 fprintf (stream, "\n ");
3767 fprintf (stream, " },\n");
/* Level 2: same layout; entries are written either as -1 or as an index
   relative to the start of level 3.  */
3768 fprintf (stream, " {");
3769 if (t.level2_size << t.q > 8)
3770 fprintf (stream, "\n ");
3771 for (i = 0; i < t.level2_size << t.q; i++)
3774 if (i > 0 && (i % 8) == 0)
3775 fprintf (stream, "\n ");
3776 offset = ((uint32_t *) (t.result + level2_offset))[i];
3778 fprintf (stream, " %5d", -1);
3780 fprintf (stream, " %5zu",
3781 (offset - level3_offset) / sizeof (uint8_t));
3782 if (i+1 < t.level2_size << t.q)
3783 fprintf (stream, ",");
3785 if (t.level2_size << t.q > 8)
3786 fprintf (stream, "\n ");
3787 fprintf (stream, " },\n");
/* Level 3: the stored uint8_t values themselves, eight per output line.  */
3788 fprintf (stream, " {");
3789 if (t.level3_size << t.p > 8)
3790 fprintf (stream, "\n ");
3791 for (i = 0; i < t.level3_size << t.p; i++)
3793 if (i > 0 && (i % 8) == 0)
3794 fprintf (stream, "\n ");
3795 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3796 if (i+1 < t.level3_size << t.p)
3797 fprintf (stream, ",");
3799 if (t.level3_size << t.p > 8)
3800 fprintf (stream, "\n ");
3801 fprintf (stream, " }\n");
3802 fprintf (stream, "};\n");
/* Both a write error and a failing fclose are reported.  */
3804 if (ferror (stream) || fclose (stream))
3806 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes the file unictype/scripts_byname.gperf: a header comment, the
   declaration of struct named_script, a fixed set of gperf directives
   (hash function scripts_hash, lookup function uc_script_lookup, word array
   script_names), the "%%" separator, and one "name, index" line per entry
   of the scripts array.  */
3812 output_scripts_byname (const char *version)
3814 const char *filename = "unictype/scripts_byname.gperf";
3818 stream = fopen (filename, "w");
3821 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3825 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3826 fprintf (stream, "/* Unicode scripts. */\n");
3827 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3829 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
/* In the format strings below "%%" produces a single '%' in the output.  */
3830 fprintf (stream, "%%struct-type\n");
3831 fprintf (stream, "%%language=ANSI-C\n");
3832 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3833 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3834 fprintf (stream, "%%readonly-tables\n");
3835 fprintf (stream, "%%global-table\n");
3836 fprintf (stream, "%%define word-array-name script_names\n");
3837 fprintf (stream, "%%%%\n");
/* One keyword line per script: its name and its index in scripts[].  */
3838 for (s = 0; s < numscripts; s++)
3839 fprintf (stream, "%s, %u\n", scripts[s], s);
3841 if (ferror (stream) || fclose (stream))
3843 fprintf (stderr, "error writing to '%s'\n", filename);
3848 /* ========================================================================= */
/* One block: first code point, last code point, and name.  The typedef name
   is on a line not visible in this copy; the array below uses block_t.  */
3852 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* All blocks read so far; numblocks is the number of entries in use.  */
3854 static block_t blocks[256];
3855 static unsigned int numblocks;
/* Reads BLOCKS_FILENAME and fills blocks and numblocks.  Lines that are
   empty or start with '#' are recognized separately; other lines are parsed
   as "XXXX..XXXX; Name".  */
3858 fill_blocks (const char *blocks_filename)
3862 stream = fopen (blocks_filename, "r");
3865 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3872 unsigned int i1, i2;
3873 char padding[200+1];
3874 char blockname[200+1];
/* Read one line of at most 200 characters.  */
3876 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
3879 if (buf[0] == '\0' || buf[0] == '#')
/* The block name extends to the end of the line or to a carriage return,
   so it may contain spaces.  */
3882 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3884 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
/* Store the block; the name is copied with strdup and the result is not
   checked for NULL here.  */
3887 blocks[numblocks].start = i1;
3888 blocks[numblocks].end = i2;
3889 blocks[numblocks].name = strdup (blockname);
3890 /* It must be sorted. */
/* Each block must start after the end of the previous one.  */
3891 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
/* Capacity check against the size of the blocks array.  */
3894 if (numblocks == 256)
3898 if (ferror (stream) || fclose (stream))
3900 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3905 /* Return the smallest block index among the blocks for characters >= ch. */
/* The search runs over the half-open index range [lo, hi) of the blocks
   array and compares blocks[mid].end with CH.  The loop header, the updates
   of lo and hi, and the return statement are not visible in this copy.  */
3907 block_first_index (unsigned int ch)
3909 /* Binary search. */
3910 unsigned int lo = 0;
3911 unsigned int hi = numblocks;
/* NOTE(review): the next two lines are the tail of a comment stating the
   loop invariants; its opening line is missing in this copy.  */
3913 All blocks[i], i < lo, have blocks[i].end < ch,
3914 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3917 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3918 if (blocks[mid].end < ch)
/* Counterpart of block_first_index: the search runs over the half-open
   index range [lo, hi) of the blocks array and compares blocks[mid].start
   with CH.  The loop header, the updates of lo and hi, and the return
   statement are not visible in this copy.
   NOTE(review): the original comment on the next line ends with a comma and
   is not terminated in this copy; its continuation line appears to be
   missing.  */
3926 /* Return the largest block index among the blocks for characters <= ch,
3929 block_last_index (unsigned int ch)
3931 /* Binary search. */
3932 unsigned int lo = 0;
3933 unsigned int hi = numblocks;
/* NOTE(review): the next two lines are the tail of a comment stating the
   loop invariants; its opening line is missing in this copy.  */
3935 All blocks[i], i < lo, have blocks[i].start <= ch,
3936 all blocks[i], i >= hi, have blocks[i].start > ch. */
3939 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3940 if (blocks[mid].start <= ch)
/* Writes the file unictype/blocks.h: the blocks[] array, the
   blocks_level1_shift and blocks_level1_threshold #defines, a blocks_level1
   array with a (first index, last index) pair for every 256-code-point
   window below the threshold, and the blocks_upper_first_index and
   blocks_upper_last_index #defines for the code points from the threshold
   up to 0x10FFFF.  VERSION is inserted into the generated header
   comment.  */
3949 output_blocks (const char *version)
3951 const char *filename = "unictype/blocks.h";
3952 const unsigned int shift = 8; /* bits to shift away for array access */
3953 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3958 stream = fopen (filename, "w");
3961 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3965 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3966 fprintf (stream, "/* Unicode blocks. */\n");
3967 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Emit every block as { start, end, "name" }.  */
3970 fprintf (stream, "static const uc_block_t blocks[] =\n");
3971 fprintf (stream, "{\n");
3972 for (i = 0; i < numblocks; i++)
3974 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3975 blocks[i].end, blocks[i].name);
3976 if (i+1 < numblocks)
3977 fprintf (stream, ",");
3978 fprintf (stream, "\n");
3980 fprintf (stream, "};\n");
/* NOTE(review): shift, threshold and the two index variables below are
   unsigned int but are printed with %d conversions.  */
3981 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3982 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
3983 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3984 threshold >> shift);
3985 fprintf (stream, "{\n");
/* One pair per window [i1 << shift, ((i1 + 1) << shift) - 1].  */
3986 for (i1 = 0; i1 < (threshold >> shift); i1++)
3988 unsigned int first_index = block_first_index (i1 << shift);
3989 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3990 fprintf (stream, " %3d, %3d", first_index, last_index);
3991 if (i1+1 < (threshold >> shift))
3992 fprintf (stream, ",");
3993 fprintf (stream, "\n");
3995 fprintf (stream, "};\n");
/* Index range of the blocks that lie at or above the threshold.  */
3996 fprintf (stream, "#define blocks_upper_first_index %d\n",
3997 block_first_index (threshold));
3998 fprintf (stream, "#define blocks_upper_last_index %d\n",
3999 block_last_index (0x10FFFF));
4001 if (ferror (stream) || fclose (stream))
4003 fprintf (stderr, "error writing to '%s'\n", filename);
4008 /* ========================================================================= */
4010 /* C and Java syntax. */
/* Identifier syntax categories.  c_ident_category and java_ident_category
   return these values, and UC_IDENTIFIER_INVALID is the DEFAULT element of
   the identsyntax_table.
   NOTE(review): the line that opens this enumeration is not visible in this
   view.  */
4014 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4015 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4016 UC_IDENTIFIER_INVALID, /* not valid */
4017 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
4020 /* ISO C 99 section 6.4.(3).
   Tests whether CH is one of the six white-space characters compared
   below: space, horizontal tab, new-line ('\n' or '\r'), vertical tab,
   form-feed.  */
4022 is_c_whitespace (unsigned int ch)
4024 return (ch == ' ' /* space */
4025 || ch == '\t' /* horizontal tab */
4026 || ch == '\n' || ch == '\r' /* new-line */
4027 || ch == '\v' /* vertical tab */
4028 || ch == '\f'); /* form-feed */
4031 /* ISO C 99 section 6.4.2.1 and appendix D.
   Classifies CH for use inside a C identifier:
   - ASCII digits '0'..'9' yield UC_IDENTIFIER_VALID;
   - ASCII letters and '_' yield UC_IDENTIFIER_START;
   - the chain of range alternatives below is followed by
     return UC_IDENTIFIER_START;
   - the final fall-through yields UC_IDENTIFIER_INVALID.
   NOTE(review): the opening of the long condition, its closing parenthesis
   and a number of single-code-point alternatives are on lines not visible
   in this view.  The script headings added below name the Unicode blocks
   the visible ranges fall into.  */
4033 c_ident_category (unsigned int ch)
4035 /* Section 6.4.2.1. */
4036 if (ch >= '0' && ch <= '9')
4037 return UC_IDENTIFIER_VALID;
4038 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4039 return UC_IDENTIFIER_START;
/* Latin (including Latin Extended Additional) */
4045 || (ch >= 0x00C0 && ch <= 0x00D6)
4046 || (ch >= 0x00D8 && ch <= 0x00F6)
4047 || (ch >= 0x00F8 && ch <= 0x01F5)
4048 || (ch >= 0x01FA && ch <= 0x0217)
4049 || (ch >= 0x0250 && ch <= 0x02A8)
4050 || (ch >= 0x1E00 && ch <= 0x1E9B)
4051 || (ch >= 0x1EA0 && ch <= 0x1EF9)
/* Greek (including Greek Extended) */
4055 || (ch >= 0x0388 && ch <= 0x038A)
4057 || (ch >= 0x038E && ch <= 0x03A1)
4058 || (ch >= 0x03A3 && ch <= 0x03CE)
4059 || (ch >= 0x03D0 && ch <= 0x03D6)
4064 || (ch >= 0x03E2 && ch <= 0x03F3)
4065 || (ch >= 0x1F00 && ch <= 0x1F15)
4066 || (ch >= 0x1F18 && ch <= 0x1F1D)
4067 || (ch >= 0x1F20 && ch <= 0x1F45)
4068 || (ch >= 0x1F48 && ch <= 0x1F4D)
4069 || (ch >= 0x1F50 && ch <= 0x1F57)
4073 || (ch >= 0x1F5F && ch <= 0x1F7D)
4074 || (ch >= 0x1F80 && ch <= 0x1FB4)
4075 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4076 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4077 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4078 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4079 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4080 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4081 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4082 || (ch >= 0x1FF6 && ch <= 0x1FFC)
/* Cyrillic */
4084 || (ch >= 0x0401 && ch <= 0x040C)
4085 || (ch >= 0x040E && ch <= 0x044F)
4086 || (ch >= 0x0451 && ch <= 0x045C)
4087 || (ch >= 0x045E && ch <= 0x0481)
4088 || (ch >= 0x0490 && ch <= 0x04C4)
4089 || (ch >= 0x04C7 && ch <= 0x04C8)
4090 || (ch >= 0x04CB && ch <= 0x04CC)
4091 || (ch >= 0x04D0 && ch <= 0x04EB)
4092 || (ch >= 0x04EE && ch <= 0x04F5)
4093 || (ch >= 0x04F8 && ch <= 0x04F9)
/* Armenian */
4095 || (ch >= 0x0531 && ch <= 0x0556)
4096 || (ch >= 0x0561 && ch <= 0x0587)
/* Hebrew */
4098 || (ch >= 0x05B0 && ch <= 0x05B9)
4099 || (ch >= 0x05BB && ch <= 0x05BD)
4101 || (ch >= 0x05C1 && ch <= 0x05C2)
4102 || (ch >= 0x05D0 && ch <= 0x05EA)
4103 || (ch >= 0x05F0 && ch <= 0x05F2)
/* Arabic */
4105 || (ch >= 0x0621 && ch <= 0x063A)
4106 || (ch >= 0x0640 && ch <= 0x0652)
4107 || (ch >= 0x0670 && ch <= 0x06B7)
4108 || (ch >= 0x06BA && ch <= 0x06BE)
4109 || (ch >= 0x06C0 && ch <= 0x06CE)
4110 || (ch >= 0x06D0 && ch <= 0x06DC)
4111 || (ch >= 0x06E5 && ch <= 0x06E8)
4112 || (ch >= 0x06EA && ch <= 0x06ED)
/* Devanagari */
4114 || (ch >= 0x0901 && ch <= 0x0903)
4115 || (ch >= 0x0905 && ch <= 0x0939)
4116 || (ch >= 0x093E && ch <= 0x094D)
4117 || (ch >= 0x0950 && ch <= 0x0952)
4118 || (ch >= 0x0958 && ch <= 0x0963)
/* Bengali */
4120 || (ch >= 0x0981 && ch <= 0x0983)
4121 || (ch >= 0x0985 && ch <= 0x098C)
4122 || (ch >= 0x098F && ch <= 0x0990)
4123 || (ch >= 0x0993 && ch <= 0x09A8)
4124 || (ch >= 0x09AA && ch <= 0x09B0)
4126 || (ch >= 0x09B6 && ch <= 0x09B9)
4127 || (ch >= 0x09BE && ch <= 0x09C4)
4128 || (ch >= 0x09C7 && ch <= 0x09C8)
4129 || (ch >= 0x09CB && ch <= 0x09CD)
4130 || (ch >= 0x09DC && ch <= 0x09DD)
4131 || (ch >= 0x09DF && ch <= 0x09E3)
4132 || (ch >= 0x09F0 && ch <= 0x09F1)
/* Gurmukhi */
4135 || (ch >= 0x0A05 && ch <= 0x0A0A)
4136 || (ch >= 0x0A0F && ch <= 0x0A10)
4137 || (ch >= 0x0A13 && ch <= 0x0A28)
4138 || (ch >= 0x0A2A && ch <= 0x0A30)
4139 || (ch >= 0x0A32 && ch <= 0x0A33)
4140 || (ch >= 0x0A35 && ch <= 0x0A36)
4141 || (ch >= 0x0A38 && ch <= 0x0A39)
4142 || (ch >= 0x0A3E && ch <= 0x0A42)
4143 || (ch >= 0x0A47 && ch <= 0x0A48)
4144 || (ch >= 0x0A4B && ch <= 0x0A4D)
4145 || (ch >= 0x0A59 && ch <= 0x0A5C)
/* Gujarati */
4149 || (ch >= 0x0A81 && ch <= 0x0A83)
4150 || (ch >= 0x0A85 && ch <= 0x0A8B)
4152 || (ch >= 0x0A8F && ch <= 0x0A91)
4153 || (ch >= 0x0A93 && ch <= 0x0AA8)
4154 || (ch >= 0x0AAA && ch <= 0x0AB0)
4155 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4156 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4157 || (ch >= 0x0ABD && ch <= 0x0AC5)
4158 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4159 || (ch >= 0x0ACB && ch <= 0x0ACD)
/* Oriya */
4163 || (ch >= 0x0B01 && ch <= 0x0B03)
4164 || (ch >= 0x0B05 && ch <= 0x0B0C)
4165 || (ch >= 0x0B0F && ch <= 0x0B10)
4166 || (ch >= 0x0B13 && ch <= 0x0B28)
4167 || (ch >= 0x0B2A && ch <= 0x0B30)
4168 || (ch >= 0x0B32 && ch <= 0x0B33)
4169 || (ch >= 0x0B36 && ch <= 0x0B39)
4170 || (ch >= 0x0B3E && ch <= 0x0B43)
4171 || (ch >= 0x0B47 && ch <= 0x0B48)
4172 || (ch >= 0x0B4B && ch <= 0x0B4D)
4173 || (ch >= 0x0B5C && ch <= 0x0B5D)
4174 || (ch >= 0x0B5F && ch <= 0x0B61)
/* Tamil */
4176 || (ch >= 0x0B82 && ch <= 0x0B83)
4177 || (ch >= 0x0B85 && ch <= 0x0B8A)
4178 || (ch >= 0x0B8E && ch <= 0x0B90)
4179 || (ch >= 0x0B92 && ch <= 0x0B95)
4180 || (ch >= 0x0B99 && ch <= 0x0B9A)
4182 || (ch >= 0x0B9E && ch <= 0x0B9F)
4183 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4184 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4185 || (ch >= 0x0BAE && ch <= 0x0BB5)
4186 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4187 || (ch >= 0x0BBE && ch <= 0x0BC2)
4188 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4189 || (ch >= 0x0BCA && ch <= 0x0BCD)
/* Telugu */
4191 || (ch >= 0x0C01 && ch <= 0x0C03)
4192 || (ch >= 0x0C05 && ch <= 0x0C0C)
4193 || (ch >= 0x0C0E && ch <= 0x0C10)
4194 || (ch >= 0x0C12 && ch <= 0x0C28)
4195 || (ch >= 0x0C2A && ch <= 0x0C33)
4196 || (ch >= 0x0C35 && ch <= 0x0C39)
4197 || (ch >= 0x0C3E && ch <= 0x0C44)
4198 || (ch >= 0x0C46 && ch <= 0x0C48)
4199 || (ch >= 0x0C4A && ch <= 0x0C4D)
4200 || (ch >= 0x0C60 && ch <= 0x0C61)
/* Kannada */
4202 || (ch >= 0x0C82 && ch <= 0x0C83)
4203 || (ch >= 0x0C85 && ch <= 0x0C8C)
4204 || (ch >= 0x0C8E && ch <= 0x0C90)
4205 || (ch >= 0x0C92 && ch <= 0x0CA8)
4206 || (ch >= 0x0CAA && ch <= 0x0CB3)
4207 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4208 || (ch >= 0x0CBE && ch <= 0x0CC4)
4209 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4210 || (ch >= 0x0CCA && ch <= 0x0CCD)
4212 || (ch >= 0x0CE0 && ch <= 0x0CE1)
/* Malayalam */
4214 || (ch >= 0x0D02 && ch <= 0x0D03)
4215 || (ch >= 0x0D05 && ch <= 0x0D0C)
4216 || (ch >= 0x0D0E && ch <= 0x0D10)
4217 || (ch >= 0x0D12 && ch <= 0x0D28)
4218 || (ch >= 0x0D2A && ch <= 0x0D39)
4219 || (ch >= 0x0D3E && ch <= 0x0D43)
4220 || (ch >= 0x0D46 && ch <= 0x0D48)
4221 || (ch >= 0x0D4A && ch <= 0x0D4D)
4222 || (ch >= 0x0D60 && ch <= 0x0D61)
/* Thai */
4224 || (ch >= 0x0E01 && ch <= 0x0E3A)
4225 || (ch >= 0x0E40 && ch <= 0x0E5B)
/* Lao */
4227 || (ch >= 0x0E81 && ch <= 0x0E82)
4229 || (ch >= 0x0E87 && ch <= 0x0E88)
4232 || (ch >= 0x0E94 && ch <= 0x0E97)
4233 || (ch >= 0x0E99 && ch <= 0x0E9F)
4234 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4237 || (ch >= 0x0EAA && ch <= 0x0EAB)
4238 || (ch >= 0x0EAD && ch <= 0x0EAE)
4239 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4240 || (ch >= 0x0EBB && ch <= 0x0EBD)
4241 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4243 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4244 || (ch >= 0x0EDC && ch <= 0x0EDD)
/* Tibetan */
4247 || (ch >= 0x0F18 && ch <= 0x0F19)
4251 || (ch >= 0x0F3E && ch <= 0x0F47)
4252 || (ch >= 0x0F49 && ch <= 0x0F69)
4253 || (ch >= 0x0F71 && ch <= 0x0F84)
4254 || (ch >= 0x0F86 && ch <= 0x0F8B)
4255 || (ch >= 0x0F90 && ch <= 0x0F95)
4257 || (ch >= 0x0F99 && ch <= 0x0FAD)
4258 || (ch >= 0x0FB1 && ch <= 0x0FB7)
/* Georgian */
4261 || (ch >= 0x10A0 && ch <= 0x10C5)
4262 || (ch >= 0x10D0 && ch <= 0x10F6)
/* Hiragana */
4264 || (ch >= 0x3041 && ch <= 0x3093)
4265 || (ch >= 0x309B && ch <= 0x309C)
/* Katakana */
4267 || (ch >= 0x30A1 && ch <= 0x30F6)
4268 || (ch >= 0x30FB && ch <= 0x30FC)
/* Bopomofo */
4270 || (ch >= 0x3105 && ch <= 0x312C)
4271 /* CJK Unified Ideographs */
4272 || (ch >= 0x4E00 && ch <= 0x9FA5)
/* Hangul syllables */
4274 || (ch >= 0xAC00 && ch <= 0xD7A3)
/* Decimal digits of the scripts above */
4276 || (ch >= 0x0660 && ch <= 0x0669)
4277 || (ch >= 0x06F0 && ch <= 0x06F9)
4278 || (ch >= 0x0966 && ch <= 0x096F)
4279 || (ch >= 0x09E6 && ch <= 0x09EF)
4280 || (ch >= 0x0A66 && ch <= 0x0A6F)
4281 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4282 || (ch >= 0x0B66 && ch <= 0x0B6F)
4283 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4284 || (ch >= 0x0C66 && ch <= 0x0C6F)
4285 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4286 || (ch >= 0x0D66 && ch <= 0x0D6F)
4287 || (ch >= 0x0E50 && ch <= 0x0E59)
4288 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4289 || (ch >= 0x0F20 && ch <= 0x0F33)
4290 /* Special characters */
4293 || (ch >= 0x02B0 && ch <= 0x02B8)
4295 || (ch >= 0x02BD && ch <= 0x02C1)
4296 || (ch >= 0x02D0 && ch <= 0x02D1)
4297 || (ch >= 0x02E0 && ch <= 0x02E4)
4303 || (ch >= 0x203F && ch <= 0x2040)
4306 || (ch >= 0x210A && ch <= 0x2113)
4308 || (ch >= 0x2118 && ch <= 0x211D)
4312 || (ch >= 0x212A && ch <= 0x2131)
4313 || (ch >= 0x2133 && ch <= 0x2138)
4314 || (ch >= 0x2160 && ch <= 0x2182)
4315 || (ch >= 0x3005 && ch <= 0x3007)
4316 || (ch >= 0x3021 && ch <= 0x3029)
4318 return UC_IDENTIFIER_START;
/* Anything not matched above is not allowed in a C identifier.  */
4319 return UC_IDENTIFIER_INVALID;
4322 /* The Java Language Specification, 3rd edition, §3.6.
4323 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710
   Tests whether CH is one of the five characters compared below: space,
   horizontal tab, form feed, line feed, carriage return.  Unlike
   is_c_whitespace, vertical tab is not in the list.  */
4325 is_java_whitespace (unsigned int ch)
4327 return (ch == ' ' || ch == '\t' || ch == '\f'
4328 || ch == '\n' || ch == '\r');
4331 /* The Java Language Specification, 3rd edition, §3.8.
4332 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4333 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart
   Classifies CH for use inside a Java identifier.  The tests run in this
   order and the first match wins:
   1. general category L, Nl, Sc or Pc  -> UC_IDENTIFIER_START
   2. general category Nd, Mc or Mn     -> UC_IDENTIFIER_VALID
   3. U+0000..U+0008, U+000E..U+001B, U+007F..U+009F, or category Cf
                                        -> UC_IDENTIFIER_IGNORABLE
   4. everything else                   -> UC_IDENTIFIER_INVALID
   NOTE(review): the closing parentheses of the three conditions are on
   lines not visible in this view.  */
4335 java_ident_category (unsigned int ch)
4337 /* FIXME: Check this against Sun's JDK implementation. */
4338 if (is_category_L (ch) /* = Character.isLetter(ch) */
4339 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4340 || is_category_Sc (ch) /* currency symbol */
4341 || is_category_Pc (ch) /* connector punctuation */
4343 return UC_IDENTIFIER_START;
4344 if (is_category_Nd (ch) /* digit */
4345 || is_category_Mc (ch) /* combining mark */
4346 || is_category_Mn (ch) /* non-spacing mark */
4348 return UC_IDENTIFIER_VALID;
/* CH is unsigned, so the comparison ch >= 0x0000 below is always true.  */
4349 if ((ch >= 0x0000 && ch <= 0x0008)
4350 || (ch >= 0x000E && ch <= 0x001B)
4351 || (ch >= 0x007F && ch <= 0x009F)
4352 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4354 return UC_IDENTIFIER_IGNORABLE;
4355 return UC_IDENTIFIER_INVALID;
4358 /* Construction of sparse 3-level tables.
   Parameters for the table template: the table type is named
   identsyntax_table (output_ident_category uses struct identsyntax_table
   and identsyntax_table_init / _add / _finalize), elements are uint8_t, and
   absent entries read as UC_IDENTIFIER_INVALID.  xmalloc and xrealloc are
   mapped to plain malloc and realloc.
   NOTE(review): the line that instantiates the template with these macros
   is not visible in this view.  */
4359 #define TABLE identsyntax_table
4360 #define ELEMENT uint8_t
4361 #define DEFAULT UC_IDENTIFIER_INVALID
4362 #define xmalloc malloc
4363 #define xrealloc realloc
4366 /* Output an identifier syntax categorization in a three-level bitmap.
   FILENAME is the header to write.  PREDICATE maps a code point to one of
   the UC_IDENTIFIER_* codes.  NAME is the identifier given to the emitted
   table object.
   Every code point below 0x110000 whose code differs from
   UC_IDENTIFIER_INVALID is added to an identsyntax_table.  The finalized
   table is written as five identsyntax_header_N defines followed by a
   static const struct with the arrays level1, level2 and level3, where
   level3 is packed at 2 bits per entry.
   NOTE(review): several lines are not visible in this view: the local
   declarations, the left-hand sides of the three offset assignments, the
   tests that select the -1 entries, the argument of the banner's "%s"
   (presumably VERSION) and the bodies of the error branches.  */
4368 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4372 struct identsyntax_table t;
4373 unsigned int level1_offset, level2_offset, level3_offset;
4375 stream = fopen (filename, "w");
4378 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4382 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4383 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4384 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4389 identsyntax_table_init (&t);
/* Only non-default codes are stored; the table's DEFAULT covers the rest.  */
4391 for (ch = 0; ch < 0x110000; ch++)
4393 int syntaxcode = predicate (ch);
4394 if (syntaxcode != UC_IDENTIFIER_INVALID)
4395 identsyntax_table_add (&t, ch, syntaxcode);
4398 identsyntax_table_finalize (&t);
4400 /* Offsets in t.result, in memory of this process.
   The three expressions below are byte offsets: a header of five uint32_t
   words, then level1 (t.level1_size words), then level2
   (t.level2_size << t.q words).  */
4402 5 * sizeof (uint32_t);
4404 5 * sizeof (uint32_t)
4405 + t.level1_size * sizeof (uint32_t);
4407 5 * sizeof (uint32_t)
4408 + t.level1_size * sizeof (uint32_t)
4409 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The five header words of t.result become preprocessor defines.  */
4411 for (i = 0; i < 5; i++)
4412 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4413 ((uint32_t *) t.result)[i]);
4414 fprintf (stream, "static const\n");
4415 fprintf (stream, "struct\n");
4416 fprintf (stream, " {\n");
4417 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4418 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4419 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4420 (1 << t.p) * 2 / 16);
4421 fprintf (stream, " }\n");
4422 fprintf (stream, "%s =\n", name);
4423 fprintf (stream, "{\n");
/* level1: eight entries per output line.  An entry is printed either as -1
   or as the index of its level2 group, i.e. the stored byte offset made
   relative to level2_offset and divided by the word size.  */
4424 fprintf (stream, " {");
4425 if (t.level1_size > 8)
4426 fprintf (stream, "\n ");
4427 for (i = 0; i < t.level1_size; i++)
4430 if (i > 0 && (i % 8) == 0)
4431 fprintf (stream, "\n ");
4432 offset = ((uint32_t *) (t.result + level1_offset))[i];
4434 fprintf (stream, " %5d", -1);
4436 fprintf (stream, " %5zu",
4437 (offset - level2_offset) / sizeof (uint32_t));
4438 if (i+1 < t.level1_size)
4439 fprintf (stream, ",");
4441 if (t.level1_size > 8)
4442 fprintf (stream, "\n ");
4443 fprintf (stream, " },\n");
/* level2: same layout; offsets are made relative to level3_offset and are
   counted in uint8_t elements.  */
4444 fprintf (stream, " {");
4445 if (t.level2_size << t.q > 8)
4446 fprintf (stream, "\n ");
4447 for (i = 0; i < t.level2_size << t.q; i++)
4450 if (i > 0 && (i % 8) == 0)
4451 fprintf (stream, "\n ");
4452 offset = ((uint32_t *) (t.result + level2_offset))[i];
4454 fprintf (stream, " %5d", -1);
4456 fprintf (stream, " %5zu",
4457 (offset - level3_offset) / sizeof (uint8_t));
4458 if (i+1 < t.level2_size << t.q)
4459 fprintf (stream, ",");
4461 if (t.level2_size << t.q > 8)
4462 fprintf (stream, "\n ");
4463 fprintf (stream, " },\n");
4464 /* Pack the level3 array. Each entry needs 2 bits only.
   Eight consecutive one-byte entries are merged into one 16-bit word, the
   first entry in the lowest two bits.  */
4465 fprintf (stream, " {");
4466 if ((t.level3_size << t.p) * 2 / 16 > 8)
4467 fprintf (stream, "\n ");
4468 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4470 if (i > 0 && (i % 8) == 0)
4471 fprintf (stream, "\n ");
4472 fprintf (stream, " 0x%04x",
4473 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4478 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4479 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4480 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4481 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4482 fprintf (stream, ",");
4484 if ((t.level3_size << t.p) * 2 / 16 > 8)
4485 fprintf (stream, "\n ");
4486 fprintf (stream, " }\n");
4487 fprintf (stream, "};\n");
/* A stream error or a failing fclose is reported as a write error.  */
4489 if (ferror (stream) || fclose (stream))
4491 fprintf (stderr, "error writing to '%s'\n", filename);
/* Emits the language-syntax outputs.  For each of c_whitespace and
   java_whitespace the local PROPERTY macro calls debug_output_predicate
   (unictype/sy_<P>.txt), output_predicate_test
   (../tests/unictype/test-sy_<P>.c) and output_predicate
   (unictype/sy_<P>.h) with the predicate is_<P>.  The two identifier
   categorizations are then written through output_ident_category as
   unictype/sy_c_ident.h and unictype/sy_java_ident.h.
   VERSION is passed through to output_predicate and
   output_ident_category.  */
4497 output_ident_properties (const char *version)
4499 #define PROPERTY(P) \
4500 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4501 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4502 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4503 PROPERTY(c_whitespace)
4504 PROPERTY(java_whitespace)
4507 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4508 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4511 /* ========================================================================= */
4513 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4514 glibc/localedata/locales/i18n file, generated by
4515 glibc/localedata/gen-unicode-ctype.c. */
4517 /* Character mappings. */
/* Simple uppercase mapping of CH: when CH has an entry in
   unicode_attributes (name != NULL) and its upper field is not NONE, that
   field is returned.
   NOTE(review): the fallback return for all other characters is on a line
   not visible here; callers compare the result with CH, so it is presumably
   CH itself.  */
4520 to_upper (unsigned int ch)
4522 if (unicode_attributes[ch].name != NULL
4523 && unicode_attributes[ch].upper != NONE)
4524 return unicode_attributes[ch].upper;
/* Simple lowercase mapping of CH: when CH has an entry in
   unicode_attributes (name != NULL) and its lower field is not NONE, that
   field is returned.
   NOTE(review): the fallback return for all other characters is on a line
   not visible here; callers compare the result with CH, so it is presumably
   CH itself.  */
4530 to_lower (unsigned int ch)
4532 if (unicode_attributes[ch].name != NULL
4533 && unicode_attributes[ch].lower != NONE)
4534 return unicode_attributes[ch].lower;
/* Simple titlecase mapping of CH: when CH has an entry in
   unicode_attributes (name != NULL) and its title field is not NONE, that
   field is returned.
   NOTE(review): the fallback return for all other characters is on a line
   not visible here; presumably it is CH itself, as for the other
   mappings.  */
4540 to_title (unsigned int ch)
4542 if (unicode_attributes[ch].name != NULL
4543 && unicode_attributes[ch].title != NONE)
4544 return unicode_attributes[ch].title;
4549 /* Character class properties. */
/* CH counts as upper case exactly when its to_lower mapping differs from
   CH itself.  */
4552 is_upper (unsigned int ch)
4554 return (to_lower (ch) != ch);
/* CH counts as lower case when its to_upper mapping differs from CH
   itself.
   NOTE(review): the alternative announced by the comment below (the
   explicit case for U+00DF) is on a line not visible in this view.  */
4558 is_lower (unsigned int ch)
4560 return (to_upper (ch) != ch)
4561 /* <U00DF> is lowercase, but without simple to_upper mapping. */
/* Alphabetic class.  CH must have an entry in unicode_attributes; it then
   qualifies through any of the alternatives below: general category L*
   (except U+0E2F and U+0E46), the listed Thai ranges, category Nl,
   category So with " LETTER " in the character name, or category Nd
   outside the ASCII digits.
   NOTE(review): some single-code-point alternatives (for instance those
   announced for <U0E31> and <U0345>) are on lines not visible in this
   view.  */
4566 is_alpha (unsigned int ch)
4568 return (unicode_attributes[ch].name != NULL
4569 && ((unicode_attributes[ch].category[0] == 'L'
4570 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4571 <U0E2F>, <U0E46> should belong to is_punct. */
4572 && (ch != 0x0E2F) && (ch != 0x0E46))
4573 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4574 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4576 || (ch >= 0x0E34 && ch <= 0x0E3A)
4577 || (ch >= 0x0E47 && ch <= 0x0E4E)
4578 /* Avoid warning for <U0345>. */
4580 /* Avoid warnings for <U2160>..<U217F>. */
4581 || (unicode_attributes[ch].category[0] == 'N'
4582 && unicode_attributes[ch].category[1] == 'l')
4583 /* Avoid warnings for <U24B6>..<U24E9>. */
4584 || (unicode_attributes[ch].category[0] == 'S'
4585 && unicode_attributes[ch].category[1] == 'o'
4586 && strstr (unicode_attributes[ch].name, " LETTER ")
4588 /* Consider all the non-ASCII digits as alphabetic.
4589 ISO C 99 forbids us to have them in category "digit",
4590 but we want iswalnum to return true on them. */
4591 || (unicode_attributes[ch].category[0] == 'N'
4592 && unicode_attributes[ch].category[1] == 'd'
4593 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Digit class.  Two return statements are visible: one accepts every
   character of general category Nd, the other only U+0030..U+0039.
   NOTE(review): the lines that choose between them are not visible in this
   view (presumably conditional compilation).  is_alpha treats non-ASCII Nd
   characters as alphabetic and output_tables reports any character that is
   both alpha and digit, which suggests the ASCII-only variant is the one in
   effect; confirm against the complete file.  */
4597 is_digit (unsigned int ch)
4600 return (unicode_attributes[ch].name != NULL
4601 && unicode_attributes[ch].category[0] == 'N'
4602 && unicode_attributes[ch].category[1] == 'd');
4603 /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
4604 a zero. Must add <0> in front of them by hand. */
4606 /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
4609 The iswdigit function tests for any wide character that corresponds
4610 to a decimal-digit character (as defined in 5.2.1).
4612 the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
4614 return (ch >= 0x0030 && ch <= 0x0039);
/* Output digits: exactly the ASCII digits U+0030..U+0039.  */
4619 is_outdigit (unsigned int ch)
4621 return (ch >= 0x0030 && ch <= 0x0039);
/* Alphanumeric class: the union of is_alpha and is_digit.  */
4625 is_alnum (unsigned int ch)
4627 return is_alpha (ch) || is_digit (ch);
/* Blank class: horizontal tab, plus every character of general category Zs
   whose decomposition field does not contain "<noBreak>".  */
4631 is_blank (unsigned int ch)
4633 return (ch == 0x0009 /* '\t' */
4634 /* Category Zs without mention of "<noBreak>" */
4635 || (unicode_attributes[ch].name != NULL
4636 && unicode_attributes[ch].category[0] == 'Z'
4637 && unicode_attributes[ch].category[1] == 's'
4638 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
/* Space class: the six ASCII white-space characters listed below, plus the
   characters of general categories Zl and Zp, plus those of category Zs
   subject to a strstr test on the decomposition field.
   NOTE(review): the end of that strstr call is on a line not visible here;
   the comment in the code says it looks for "<noBreak>".  */
4642 is_space (unsigned int ch)
4644 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4645 should treat it like a punctuation character, not like a space. */
4646 return (ch == 0x0020 /* ' ' */
4647 || ch == 0x000C /* '\f' */
4648 || ch == 0x000A /* '\n' */
4649 || ch == 0x000D /* '\r' */
4650 || ch == 0x0009 /* '\t' */
4651 || ch == 0x000B /* '\v' */
4652 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4653 || (unicode_attributes[ch].name != NULL
4654 && unicode_attributes[ch].category[0] == 'Z'
4655 && (unicode_attributes[ch].category[1] == 'l'
4656 || unicode_attributes[ch].category[1] == 'p'
4657 || (unicode_attributes[ch].category[1] == 's'
4658 && !strstr (unicode_attributes[ch].decomposition,
/* Control class: characters whose name is exactly "<control>", plus the
   characters of general categories Zl and Zp.  */
4663 is_cntrl (unsigned int ch)
4665 return (unicode_attributes[ch].name != NULL
4666 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4667 /* Categories Zl and Zp */
4668 || (unicode_attributes[ch].category[0] == 'Z'
4669 && (unicode_attributes[ch].category[1] == 'l'
4670 || unicode_attributes[ch].category[1] == 'p'))));
/* Hexadecimal digit class.  Two return statements are visible: one is built
   on is_digit plus A..F and a..f, the other on the ASCII digits plus A..F
   and a..f.
   NOTE(review): the lines that choose between them are not visible in this
   view (presumably conditional compilation); confirm against the complete
   file.  */
4674 is_xdigit (unsigned int ch)
4677 return is_digit (ch)
4678 || (ch >= 0x0041 && ch <= 0x0046)
4679 || (ch >= 0x0061 && ch <= 0x0066);
4681 /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
4684 The iswxdigit function tests for any wide character that corresponds
4685 to a hexadecimal-digit character (as defined in 6.4.4.1).
4687 hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
4689 return (ch >= 0x0030 && ch <= 0x0039)
4690 || (ch >= 0x0041 && ch <= 0x0046)
4691 || (ch >= 0x0061 && ch <= 0x0066);
/* Graphic class: CH must have an entry in unicode_attributes and its name
   must differ from "<control>" (strcmp returns non-zero).
   NOTE(review): the rest of the condition is on a line not visible here;
   output_tables reports any character that is both space and graph, so a
   space exclusion presumably follows.  */
4696 is_graph (unsigned int ch)
4698 return (unicode_attributes[ch].name != NULL
4699 && strcmp (unicode_attributes[ch].name, "<control>")
/* Printable class: CH must have an entry in unicode_attributes, its name
   must differ from "<control>", and it must not be in general category Zl
   or Zp.  The inner name != NULL test repeats the outer one.  */
4704 is_print (unsigned int ch)
4706 return (unicode_attributes[ch].name != NULL
4707 && strcmp (unicode_attributes[ch].name, "<control>")
4708 /* Categories Zl and Zp */
4709 && !(unicode_attributes[ch].name != NULL
4710 && unicode_attributes[ch].category[0] == 'Z'
4711 && (unicode_attributes[ch].category[1] == 'l'
4712 || unicode_attributes[ch].category[1] == 'p')));
/* Punctuation class.  Two return statements are visible: one accepts the
   general categories P*, the other every graphic character that is neither
   alpha nor digit.
   NOTE(review): the lines that choose between them are not visible in this
   view (presumably conditional compilation); confirm against the complete
   file.  */
4716 is_punct (unsigned int ch)
4719 return (unicode_attributes[ch].name != NULL
4720 && unicode_attributes[ch].category[0] == 'P');
4722 /* The traditional POSIX definition of punctuation is every graphic,
4723 non-alphanumeric character. */
4724 return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
4728 /* Output all properties.
   The local PROPERTY macro calls, for a class name P,
   debug_output_predicate (unictype/ctype_<P>.txt), output_predicate_test
   (../tests/unictype/test-ctype_<P>.c) and output_predicate
   (unictype/ctype_<P>.h) with the predicate is_<P>; VERSION is passed to
   output_predicate.
   NOTE(review): the PROPERTY(...) invocations themselves are on lines not
   visible in this view.  */
4730 output_old_ctype (const char *version)
4732 #define PROPERTY(P) \
4733 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4734 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4735 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
/* Combining class: characters with an entry in unicode_attributes whose
   general category is Mn, Mc or Me.  */
4754 is_combining (unsigned int ch)
4756 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4757 file. In 3.0.1 it was identical to the union of the general categories
4758 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4759 PropList.txt file, so we take the latter definition. */
4760 return (unicode_attributes[ch].name != NULL
4761 && unicode_attributes[ch].category[0] == 'M'
4762 && (unicode_attributes[ch].category[1] == 'n'
4763 || unicode_attributes[ch].category[1] == 'c'
4764 || unicode_attributes[ch].category[1] == 'e'));
/* Subset of is_combining: excludes the characters whose canonical
   combining class (the decimal string in the combining field) is non-empty,
   does not start with '0', and parses to a value of 200 or more.  */
4768 is_combining_level3 (unsigned int ch)
4770 return is_combining (ch)
4771 && !(unicode_attributes[ch].combining[0] != '\0'
4772 && unicode_attributes[ch].combining[0] != '0'
4773 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   I is formatted as "<UXXXX>" (four upper-case hex digits) when it is below
   0x10000 and as "<UXXXXXXXX>" (eight hex digits) otherwise.
   The result points to a static buffer that the next call overwrites, so a
   caller that needs two symbols at once must copy the first one.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* "<U%08X>" needs 11 characters plus the terminating NUL.  */
  static char buf[11+1];

  /* snprintf bounds the write by the buffer size instead of relying on the
     format to fit.  */
  if (i < 0x10000)
    snprintf (buf, sizeof (buf), "<U%04X>", i);
  else
    snprintf (buf, sizeof (buf), "<U%08X>", i);
  return buf;
}
4786 /* Return the UCS symbol range string for an interval of Unicode characters.
   The string is assembled in a static buffer: first the ucs_symbol of LOW
   is copied in, then the ucs_symbol of HIGH is appended.
   NOTE(review): the statement that adds the separator between the two
   symbols and the return statement are on lines not visible in this view.
   The buffer holds 24 characters plus NUL, which fits two 11-character
   symbols with a 2-character separator.  */
4788 ucs_symbol_range (unsigned int low, unsigned int high)
4790 static char buf[24+1];
4792 strcpy (buf, ucs_symbol (low));
4794 strcat (buf, ucs_symbol (high));
4798 /* Output a character class (= property) table.
   Writes CLASSNAME followed by the members of the class to STREAM.  FUNC is
   evaluated once for every code point below 0x110000 and the results are
   kept in TABLE.  Each maximal run of consecutive members is printed as a
   single ucs_symbol or as a ucs_symbol_range; items are separated by ';',
   and a line is continued with "/" and a newline when the next item would
   pass max_column.
   NOTE(review): the declarations of i, column and buf, the branch that
   skips non-members, the computation of low/high and the column resets are
   on lines not visible in this view.
   NOTE(review): TABLE is a 0x110000-byte automatic array (about 1.1 MB of
   stack).  */
4801 output_charclass (FILE *stream, const char *classname,
4802 bool (*func) (unsigned int))
4804 char table[0x110000];
4806 bool need_semicolon;
4807 const int max_column = 75;
4810 for (i = 0; i < 0x110000; i++)
4811 table[i] = (int) func (i);
4813 fprintf (stream, "%s ", classname);
4814 need_semicolon = false;
/* The loop header has no increment; i is advanced inside the body.  */
4816 for (i = 0; i < 0x110000; )
4822 unsigned int low, high;
4828 while (i < 0x110000 && table[i]);
4832 strcpy (buf, ucs_symbol (low));
4834 strcpy (buf, ucs_symbol_range (low, high));
4838 fprintf (stream, ";");
4842 if (column + strlen (buf) > max_column)
4844 fprintf (stream, "/\n ");
4848 fprintf (stream, "%s", buf);
4849 column += strlen (buf);
4850 need_semicolon = true;
4853 fprintf (stream, "\n");
4856 /* Output a character mapping table.
   Writes MAPNAME followed by the mapped characters to STREAM.  TABLE
   records, for every code point below 0x110000, whether FUNC maps it to
   something other than itself.  For the items that are emitted, the
   ucs_symbol of the code point and the ucs_symbol of its image are appended
   to BUF; items are separated by ';', and a line is continued with "/" and
   a newline when the next item would pass max_column.
   NOTE(review): the declarations of i, column and buf, the test on
   table[i], the punctuation placed around the pair and the column resets
   are on lines not visible in this view.
   NOTE(review): TABLE is a 0x110000-byte automatic array (about 1.1 MB of
   stack).  */
4859 output_charmap (FILE *stream, const char *mapname,
4860 unsigned int (*func) (unsigned int))
4862 char table[0x110000];
4864 bool need_semicolon;
4865 const int max_column = 75;
4868 for (i = 0; i < 0x110000; i++)
4869 table[i] = (func (i) != i);
4871 fprintf (stream, "%s ", mapname);
4872 need_semicolon = false;
4874 for (i = 0; i < 0x110000; i++)
4880 strcat (buf, ucs_symbol (i));
4882 strcat (buf, ucs_symbol (func (i)));
4887 fprintf (stream, ";");
4891 if (column + strlen (buf) > max_column)
4893 fprintf (stream, "/\n ");
4897 fprintf (stream, "%s", buf);
4898 column += strlen (buf);
4899 need_semicolon = true;
4901 fprintf (stream, "\n");
4904 /* Output the width table.
   Called by output_tables with the output stream, just before
   "END LC_CTYPE" is written.
   NOTE(review): no statement of the body is visible in this view.  */
4907 output_widthmap (FILE *stream)
4911 /* Output the tables to the given file.
   FILENAME receives a locale definition source: an LC_IDENTIFICATION
   section (title and revision carry VERSION, the date is the current UTC
   date) followed by an LC_CTYPE section with the character classes, the
   case mappings and the width map.
   Before LC_CTYPE is written, every code point below 0x110000 is checked
   against the class restrictions quoted in the comments below; each
   violation is reported on stderr.  The checks shown here only print
   diagnostics.
   NOTE(review): the local declarations, some fprintf (stderr, ...) openers
   and the bodies of the error branches are on lines not visible in this
   view.  */
4914 output_tables (const char *filename, const char *version)
4919 stream = fopen (filename, "w");
4922 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4926 fprintf (stream, "escape_char /\n");
4927 fprintf (stream, "comment_char %%\n");
4928 fprintf (stream, "\n");
4929 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4931 fprintf (stream, "\n");
4933 fprintf (stream, "LC_IDENTIFICATION\n");
4934 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4935 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4936 fprintf (stream, "address \"\"\n");
4937 fprintf (stream, "contact \"\"\n");
4938 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4939 fprintf (stream, "tel \"\"\n");
4940 fprintf (stream, "fax \"\"\n");
4941 fprintf (stream, "language \"\"\n");
4942 fprintf (stream, "territory \"Earth\"\n");
4943 fprintf (stream, "revision \"%s\"\n", version);
/* The date is formatted as YYYY-MM-DD from gmtime (&now).  */
4948 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4949 fprintf (stream, "date \"%s\"\n", date);
4951 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4952 fprintf (stream, "END LC_IDENTIFICATION\n");
4953 fprintf (stream, "\n");
4955 /* Verifications. */
4956 for (ch = 0; ch < 0x110000; ch++)
4958 /* toupper restriction: "Only characters specified for the keywords
4959 lower and upper shall be specified." */
4960 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4962 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4963 ucs_symbol (ch), ch, to_upper (ch));
4965 /* tolower restriction: "Only characters specified for the keywords
4966 lower and upper shall be specified." */
4967 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4969 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4970 ucs_symbol (ch), ch, to_lower (ch));
4972 /* alpha restriction: "Characters classified as either upper or lower
4973 shall automatically belong to this class." */
4974 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4975 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4977 /* alpha restriction: "No character specified for the keywords cntrl,
4978 digit, punct or space shall be specified." */
4979 if (is_alpha (ch) && is_cntrl (ch))
4980 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4981 if (is_alpha (ch) && is_digit (ch))
4982 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4983 if (is_alpha (ch) && is_punct (ch))
4984 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4985 if (is_alpha (ch) && is_space (ch))
4986 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4988 /* space restriction: "No character specified for the keywords upper,
4989 lower, alpha, digit, graph or xdigit shall be specified."
4990 upper, lower, alpha already checked above. */
4991 if (is_space (ch) && is_digit (ch))
4992 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4993 if (is_space (ch) && is_graph (ch))
4994 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4995 if (is_space (ch) && is_xdigit (ch))
4996 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4998 /* cntrl restriction: "No character specified for the keywords upper,
4999 lower, alpha, digit, punct, graph, print or xdigit shall be
5000 specified." upper, lower, alpha already checked above. */
5001 if (is_cntrl (ch) && is_digit (ch))
5002 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5003 if (is_cntrl (ch) && is_punct (ch))
5004 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5005 if (is_cntrl (ch) && is_graph (ch))
5006 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5007 if (is_cntrl (ch) && is_print (ch))
5008 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5009 if (is_cntrl (ch) && is_xdigit (ch))
5010 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5012 /* punct restriction: "No character specified for the keywords upper,
5013 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5014 be specified." upper, lower, alpha, cntrl already checked above. */
5015 if (is_punct (ch) && is_digit (ch))
5016 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5017 if (is_punct (ch) && is_xdigit (ch))
5018 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5019 if (is_punct (ch) && (ch == 0x0020))
5020 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5022 /* graph restriction: "No character specified for the keyword cntrl
5023 shall be specified." Already checked above. */
5025 /* print restriction: "No character specified for the keyword cntrl
5026 shall be specified." Already checked above. */
5028 /* graph - print relation: differ only in the <space> character.
5029 How is this possible if there are more than one space character?!
5030 I think susv2/xbd/locale.html should speak of "space characters",
5031 not "space character". */
5032 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5034 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5035 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5037 "%s is graph|<space> but not print\n", ucs_symbol (ch));
/* The LC_CTYPE section: classes first, then the case mappings, then the
   width map.  */
5040 fprintf (stream, "LC_CTYPE\n");
5041 output_charclass (stream, "upper", is_upper);
5042 output_charclass (stream, "lower", is_lower);
5043 output_charclass (stream, "alpha", is_alpha);
5044 output_charclass (stream, "digit", is_digit);
5045 output_charclass (stream, "outdigit", is_outdigit);
5046 output_charclass (stream, "blank", is_blank);
5047 output_charclass (stream, "space", is_space);
5048 output_charclass (stream, "cntrl", is_cntrl);
5049 output_charclass (stream, "punct", is_punct);
5050 output_charclass (stream, "xdigit", is_xdigit);
5051 output_charclass (stream, "graph", is_graph);
5052 output_charclass (stream, "print", is_print);
5053 output_charclass (stream, "class \"combining\";", is_combining);
5054 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5055 output_charmap (stream, "toupper", to_upper);
5056 output_charmap (stream, "tolower", to_lower);
5057 output_charmap (stream, "map \"totitle\";", to_title);
5058 output_widthmap (stream);
5059 fprintf (stream, "END LC_CTYPE\n");
/* A stream error or a failing fclose is reported as a write error.  */
5061 if (ferror (stream) || fclose (stream))
5063 fprintf (stderr, "error writing to '%s'\n", filename);
5070 /* ========================================================================= */
5072 /* The width property from the EastAsianWidth.txt file.
5073 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   Indexed by code point.  fill_width first sets every code point that has
   an entry in unicode_attributes to "N" and then stores the values read
   from the file.  */
5074 const char * unicode_width[0x110000];
/* fill_width (WIDTH_FILENAME): fills unicode_width[].
   Every code point whose unicode_attributes name is non-NULL starts as "N";
   all others start as NULL.  WIDTH_FILENAME is then opened for reading.
   Each data line is split with getfield into field0 (up to ';'), field1
   (up to ' ') and field2 (up to the newline).  field0 is parsed as a
   hexadecimal code point, or as a range when it contains "..".  A strdup'ed
   copy of field1 is stored in unicode_width.
   NOTE(review): the local declarations, the line-skipping condition, the
   loop over a range and the bodies of the error branches are on lines not
   visible in this view.  The results of strdup are stored without a visible
   NULL check.  */
5076 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5079 fill_width (const char *width_filename)
5083 char field0[FIELDLEN];
5084 char field1[FIELDLEN];
5085 char field2[FIELDLEN];
5088 for (i = 0; i < 0x110000; i++)
5089 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5091 stream = fopen (width_filename, "r");
5094 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5109 do c = getc (stream); while (c != EOF && c != '\n');
5113 n = getfield (stream, field0, ';');
5114 n += getfield (stream, field1, ' ');
5115 n += getfield (stream, field2, '\n');
5120 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5123 i = strtoul (field0, NULL, 16);
5124 if (strstr (field0, "..") != NULL)
5126 /* Deal with a range. */
5127 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5129 unicode_width[i] = strdup (field1);
5133 /* Single character line. */
5134 unicode_width[i] = strdup (field1);
5137 if (ferror (stream) || fclose (stream))
5139 fprintf (stderr, "error reading from '%s'\n", width_filename);
5144 /* Line breaking classification.
   Each class has a distinct number in the range 0..31; get_lbp uses these
   numbers as bit positions (1 << LBP_xx) in its result.
   NOTE(review): the line that opens this enumeration is not visible in this
   view.  */
5148 /* Values >= 24 are resolved at run time. */
5149 LBP_BK = 24, /* mandatory break */
5150 /*LBP_CR, carriage return - not used here because it's a DOSism */
5151 /*LBP_LF, line feed - not used here because it's a DOSism */
5152 LBP_CM = 25, /* attached characters and combining marks */
5153 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
5154 /*LBP_SG, surrogates - not used here because they are not characters */
5155 LBP_WJ = 0, /* word joiner */
5156 LBP_ZW = 26, /* zero width space */
5157 LBP_GL = 1, /* non-breaking (glue) */
5158 LBP_SP = 27, /* space */
5159 LBP_B2 = 2, /* break opportunity before and after */
5160 LBP_BA = 3, /* break opportunity after */
5161 LBP_BB = 4, /* break opportunity before */
5162 LBP_HY = 5, /* hyphen */
5163 LBP_CB = 28, /* contingent break opportunity */
5164 LBP_CL = 6, /* closing punctuation */
5165 LBP_EX = 7, /* exclamation/interrogation */
5166 LBP_IN = 8, /* inseparable */
5167 LBP_NS = 9, /* non starter */
5168 LBP_OP = 10, /* opening punctuation */
5169 LBP_QU = 11, /* ambiguous quotation */
5170 LBP_IS = 12, /* infix separator (numeric) */
5171 LBP_NU = 13, /* numeric */
5172 LBP_PO = 14, /* postfix (numeric) */
5173 LBP_PR = 15, /* prefix (numeric) */
5174 LBP_SY = 16, /* symbols allowing breaks */
5175 LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
5176 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
5177 LBP_H2 = 18, /* Hangul LV syllable */
5178 LBP_H3 = 19, /* Hangul LVT syllable */
5179 LBP_ID = 20, /* ideographic */
5180 LBP_JL = 21, /* Hangul L Jamo */
5181 LBP_JV = 22, /* Hangul V Jamo */
5182 LBP_JT = 23, /* Hangul T Jamo */
5183 LBP_SA = 30, /* complex context (South East Asian) */
5184 LBP_XX = 31 /* unknown */
5187 /* Returns the line breaking classification for ch, as a bit mask. */
/* Bit (1 << LBP_xx) is set in the result for every class LBP_xx whose test
   below matches CH.  The tests read unicode_attributes[ch] (name and general
   category) and unicode_width[ch].  The order of the tests matters: several
   later classes (PR, CM, ID/AI, AL/AI) are only assigned when certain bits
   set by earlier tests are absent.  */
5189 get_lbp (unsigned int ch)
/* Only characters that have a name in unicode_attributes[] are tested
   (presumably this condition guards all the tests that follow -- TODO
   confirm against the full source).  */
5193 if (unicode_attributes[ch].name != NULL)
5195 /* mandatory break */
5196 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5197 || ch == 0x000C /* form feed */
5198 || ch == 0x000B /* line tabulation */
5199 || ch == 0x2028 /* LINE SEPARATOR */
5200 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5201 attr |= 1 << LBP_BK;
/* word joiner */
5203 if (ch == 0x2060 /* WORD JOINER */
5204 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5205 attr |= 1 << LBP_WJ;
5207 /* zero width space */
5208 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5209 attr |= 1 << LBP_ZW;
5211 /* non-breaking (glue) */
5212 if (ch == 0x00A0 /* NO-BREAK SPACE */
5213 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5214 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5215 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5216 || ch == 0x2007 /* FIGURE SPACE */
5217 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5218 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5219 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5220 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5221 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5222 attr |= 1 << LBP_GL;
/* space */
5225 if (ch == 0x0020 /* SPACE */)
5226 attr |= 1 << LBP_SP;
5228 /* break opportunity before and after */
5229 if (ch == 0x2014 /* EM DASH */)
5230 attr |= 1 << LBP_B2;
5232 /* break opportunity after */
5233 if (ch == 0x1680 /* OGHAM SPACE MARK */
5234 || ch == 0x2000 /* EN QUAD */
5235 || ch == 0x2001 /* EM QUAD */
5236 || ch == 0x2002 /* EN SPACE */
5237 || ch == 0x2003 /* EM SPACE */
5238 || ch == 0x2004 /* THREE-PER-EM SPACE */
5239 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5240 || ch == 0x2006 /* SIX-PER-EM SPACE */
5241 || ch == 0x2008 /* PUNCTUATION SPACE */
5242 || ch == 0x2009 /* THIN SPACE */
5243 || ch == 0x200A /* HAIR SPACE */
5244 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5245 || ch == 0x0009 /* tab */
5246 || ch == 0x00AD /* SOFT HYPHEN */
5247 || ch == 0x058A /* ARMENIAN HYPHEN */
5248 || ch == 0x2010 /* HYPHEN */
5249 || ch == 0x2012 /* FIGURE DASH */
5250 || ch == 0x2013 /* EN DASH */
5251 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5252 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5253 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5254 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5255 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5256 || ch == 0x2027 /* HYPHENATION POINT */
5257 || ch == 0x007C /* VERTICAL LINE */
5258 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5259 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5260 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5261 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5262 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5263 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5264 || ch == 0x205A /* TWO DOT PUNCTUATION */
5265 || ch == 0x205B /* FOUR DOT MARK */
5266 || ch == 0x205D /* TRICOLON */
5267 || ch == 0x205E /* VERTICAL FOUR DOTS */
5268 || ch == 0x2E19 /* PALM BRANCH */
5269 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5270 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5271 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5272 || ch == 0x2E2D /* FIVE DOT MARK */
5273 || ch == 0x2E30 /* RING POINT */
5274 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5275 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5276 || ch == 0x10102 /* AEGEAN CHECK MARK */
5277 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5278 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5279 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5280 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5281 || ch == 0x0964 /* DEVANAGARI DANDA */
5282 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5283 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5284 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5285 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5286 || ch == 0x104B /* MYANMAR SIGN SECTION */
5287 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5288 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5289 || ch == 0x17D4 /* KHMER SIGN KHAN */
5290 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5291 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5292 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5293 || ch == 0xA8CE /* SAURASHTRA DANDA */
5294 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5295 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5296 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5297 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5298 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5299 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5300 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5301 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5302 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5303 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5304 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5305 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5306 || ch == 0x1804 /* MONGOLIAN COLON */
5307 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5308 || ch == 0x1B5A /* BALINESE PANTI */
5309 || ch == 0x1B5B /* BALINESE PAMADA */
5310 || ch == 0x1B5C /* BALINESE WINDU */
5311 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5312 || ch == 0x1B60 /* BALINESE PAMENENG */
5313 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5314 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5315 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5316 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5317 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5318 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5319 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5320 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5321 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5322 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5323 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5324 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5325 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5326 || ch == 0xA60D /* VAI COMMA */
5327 || ch == 0xA60F /* VAI QUESTION MARK */
5328 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5329 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5330 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5331 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5332 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5333 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5334 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5335 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5336 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5337 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5338 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5339 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5340 attr |= 1 << LBP_BA;
5342 /* break opportunity before */
5343 if (ch == 0x00B4 /* ACUTE ACCENT */
5344 || ch == 0x1FFD /* GREEK OXIA */
5345 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5346 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5347 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5348 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5349 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5350 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5351 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5352 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5353 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5354 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5355 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5356 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5357 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5358 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5359 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5360 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5361 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5362 attr |= 1 << LBP_BB;
/* hyphen */
5365 if (ch == 0x002D /* HYPHEN-MINUS */)
5366 attr |= 1 << LBP_HY;
5368 /* contingent break opportunity */
5369 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5370 attr |= 1 << LBP_CB;
5372 /* closing punctuation */
5373 if ((unicode_attributes[ch].category[0] == 'P'
5374 && unicode_attributes[ch].category[1] == 'e')
5375 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5376 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5377 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5378 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5379 || ch == 0xFE50 /* SMALL COMMA */
5380 || ch == 0xFE52 /* SMALL FULL STOP */
5381 || ch == 0xFF0C /* FULLWIDTH COMMA */
5382 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5383 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5384 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5385 attr |= 1 << LBP_CL;
5387 /* exclamation/interrogation */
5388 if (ch == 0x0021 /* EXCLAMATION MARK */
5389 || ch == 0x003F /* QUESTION MARK */
5390 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5391 || ch == 0x061B /* ARABIC SEMICOLON */
5392 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5393 || ch == 0x061F /* ARABIC QUESTION MARK */
5394 || ch == 0x06D4 /* ARABIC FULL STOP */
5395 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5396 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5397 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5398 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5399 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5400 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5401 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5402 || ch == 0x1802 /* MONGOLIAN COMMA */
5403 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5404 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5405 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5406 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5407 || ch == 0x1945 /* LIMBU QUESTION MARK */
5408 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5409 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5410 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5411 || ch == 0x2CFE /* COPTIC FULL STOP */
5412 || ch == 0x2E2E /* REVERSED QUESTION MARK */
5414 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5416 || ch == 0xA60E /* VAI FULL STOP */
5417 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5418 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5419 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5420 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5421 || ch == 0xFE56 /* SMALL QUESTION MARK */
5422 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5423 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5424 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5425 attr |= 1 << LBP_EX;
/* inseparable */
5428 if (ch == 0x2024 /* ONE DOT LEADER */
5429 || ch == 0x2025 /* TWO DOT LEADER */
5430 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5431 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5432 attr |= 1 << LBP_IN;
/* non starter */
5435 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5436 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5437 || ch == 0x203D /* INTERROBANG */
5438 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5439 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5440 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5441 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5442 || ch == 0x301C /* WAVE DASH */
5443 || ch == 0x303C /* MASU MARK */
5444 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5445 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5446 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5447 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5448 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5449 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5450 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5451 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5452 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5453 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5454 || ch == 0xA015 /* YI SYLLABLE WU */
5455 || ch == 0xFE54 /* SMALL SEMICOLON */
5456 || ch == 0xFE55 /* SMALL COLON */
5457 || ch == 0xFF1A /* FULLWIDTH COLON */
5458 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5459 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5460 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5461 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5462 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5463 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5464 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5465 attr |= 1 << LBP_NS;
5467 /* opening punctuation */
5468 if ((unicode_attributes[ch].category[0] == 'P'
5469 && unicode_attributes[ch].category[1] == 's')
5470 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5471 || ch == 0x00BF /* INVERTED QUESTION MARK */
5472 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5473 attr |= 1 << LBP_OP;
5475 /* ambiguous quotation */
5476 if ((unicode_attributes[ch].category[0] == 'P'
5477 && (unicode_attributes[ch].category[1] == 'f'
5478 || unicode_attributes[ch].category[1] == 'i'))
5479 || ch == 0x0022 /* QUOTATION MARK */
5480 || ch == 0x0027 /* APOSTROPHE */
5481 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5482 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5483 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5484 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5485 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5486 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5487 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5488 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5489 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5490 || ch == 0x2E0B /* RAISED SQUARE */)
5491 attr |= 1 << LBP_QU;
5493 /* infix separator (numeric) */
5494 if (ch == 0x002C /* COMMA */
5495 || ch == 0x002E /* FULL STOP */
5496 || ch == 0x003A /* COLON */
5497 || ch == 0x003B /* SEMICOLON */
5498 || ch == 0x037E /* GREEK QUESTION MARK */
5499 || ch == 0x0589 /* ARMENIAN FULL STOP */
5500 || ch == 0x060C /* ARABIC COMMA */
5501 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5502 || ch == 0x07F8 /* NKO COMMA */
5503 || ch == 0x2044 /* FRACTION SLASH */
5504 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5505 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5506 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5507 attr |= 1 << LBP_IS;
/* numeric: decimal digits (category Nd) whose name does not contain
   "FULLWIDTH", plus two Arabic separators */
5510 if ((unicode_attributes[ch].category[0] == 'N'
5511 && unicode_attributes[ch].category[1] == 'd'
5512 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5513 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5514 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5515 attr |= 1 << LBP_NU;
5517 /* postfix (numeric) */
5518 if (ch == 0x0025 /* PERCENT SIGN */
5519 || ch == 0x00A2 /* CENT SIGN */
5520 || ch == 0x00B0 /* DEGREE SIGN */
5521 || ch == 0x060B /* AFGHANI SIGN */
5522 || ch == 0x066A /* ARABIC PERCENT SIGN */
5523 || ch == 0x2030 /* PER MILLE SIGN */
5524 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5525 || ch == 0x2032 /* PRIME */
5526 || ch == 0x2033 /* DOUBLE PRIME */
5527 || ch == 0x2034 /* TRIPLE PRIME */
5528 || ch == 0x2035 /* REVERSED PRIME */
5529 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5530 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5531 || ch == 0x20A7 /* PESETA SIGN */
5532 || ch == 0x2103 /* DEGREE CELSIUS */
5533 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5534 || ch == 0xFDFC /* RIAL SIGN */
5535 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5536 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5537 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5538 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5539 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5540 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5541 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5542 attr |= 1 << LBP_PO;
5544 /* prefix (numeric) */
5545 if ((unicode_attributes[ch].category[0] == 'S'
5546 && unicode_attributes[ch].category[1] == 'c')
5547 || ch == 0x002B /* PLUS SIGN */
5548 || ch == 0x005C /* REVERSE SOLIDUS */
5549 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5550 || ch == 0x2116 /* NUMERO SIGN */
5551 || ch == 0x2212 /* MINUS SIGN */
5552 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* A character already classified as postfix (PO) above is not also made a
   prefix; this matters for category Sc characters listed in the PO test.  */
5553 if (!(attr & (1 << LBP_PO)))
5554 attr |= 1 << LBP_PR;
5556 /* symbols allowing breaks */
5557 if (ch == 0x002F /* SOLIDUS */)
5558 attr |= 1 << LBP_SY;
/* Hangul LV syllable: every 28th code point of the range 0xAC00..0xD7A3 */
5560 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5561 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: the remaining code points of that range */
5563 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5564 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5566 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5567 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5569 if (ch >= 0x1160 && ch <= 0x11A2)
5570 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5572 if (ch >= 0x11A8 && ch <= 0x11F9)
5573 attr |= 1 << LBP_JT;
5575 /* complex context (South East Asian) */
/* Two conditions must both hold: the character has category Cf, Lm, Lo, Mc
   or Mn (or is one of four listed extras), and it lies in one of the four
   code point ranges tested at the end.  */
5576 if (((unicode_attributes[ch].category[0] == 'C'
5577 && unicode_attributes[ch].category[1] == 'f')
5578 || (unicode_attributes[ch].category[0] == 'L'
5579 && (unicode_attributes[ch].category[1] == 'm'
5580 || unicode_attributes[ch].category[1] == 'o'))
5581 || (unicode_attributes[ch].category[0] == 'M'
5582 && (unicode_attributes[ch].category[1] == 'c'
5583 || unicode_attributes[ch].category[1] == 'n'))
5584 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5585 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5586 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5587 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5588 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5589 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5590 || (ch >= 0x1000 && ch <= 0x109F)
5591 || (ch >= 0x1780 && ch <= 0x17FF)
5592 || (ch >= 0x1950 && ch <= 0x19DF)))
5593 attr |= 1 << LBP_SA;
5595 /* attached characters and combining marks */
5596 if ((unicode_attributes[ch].category[0] == 'M'
5597 && (unicode_attributes[ch].category[1] == 'c'
5598 || unicode_attributes[ch].category[1] == 'e'
5599 || unicode_attributes[ch].category[1] == 'n'))
5600 || (unicode_attributes[ch].category[0] == 'C'
5601 && (unicode_attributes[ch].category[1] == 'c'
5602 || unicode_attributes[ch].category[1] == 'f')))
/* CM is only assigned when none of BK, BA, GL, SA, WJ, ZW was set above.  */
5603 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5604 attr |= 1 << LBP_CM;
/* ideographic */
5607 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5608 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5609 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5610 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5611 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5612 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5613 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5614 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5615 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5616 || ch == 0xFE62 /* SMALL PLUS SIGN */
5617 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5618 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5619 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5620 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5621 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5622 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5623 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5624 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5625 || (ch >= 0x3000 && ch <= 0x33FF
5626 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5627 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5628 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5629 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5630 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5631 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5632 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5633 || ch == 0xFE45 /* SESAME DOT */
5634 || ch == 0xFE46 /* WHITE SESAME DOT */
5635 || ch == 0xFE49 /* DASHED OVERLINE */
5636 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5637 || ch == 0xFE4B /* WAVY OVERLINE */
5638 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5639 || ch == 0xFE4D /* DASHED LOW LINE */
5640 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5641 || ch == 0xFE4F /* WAVY LOW LINE */
5642 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5643 || ch == 0xFE58 /* SMALL EM DASH */
5644 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5645 || ch == 0xFE60 /* SMALL AMPERSAND */
5646 || ch == 0xFE61 /* SMALL ASTERISK */
5647 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5648 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5649 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5650 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5651 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5652 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5653 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5654 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5655 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5656 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5657 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5658 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5659 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5660 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5661 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5662 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5663 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5664 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5665 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5666 || ch == 0xFF5E /* FULLWIDTH TILDE */
5667 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5668 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5669 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* Characters already classified as NS or CM are left alone.  */
5670 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5672 /* ambiguous (ideograph) ? */
/* AI is chosen for characters whose East Asian width starts with 'A' and
   for the circled digits listed here; LBP_ID below is presumably the
   alternative branch -- TODO confirm against the full source.  */
5673 if ((unicode_width[ch] != NULL
5674 && unicode_width[ch][0] == 'A'
5676 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5677 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5678 attr |= 1 << LBP_AI;
5680 attr |= 1 << LBP_ID;
5683 /* ordinary alphabetic and symbol characters */
5684 if ((unicode_attributes[ch].category[0] == 'L'
5685 && (unicode_attributes[ch].category[1] == 'u'
5686 || unicode_attributes[ch].category[1] == 'l'
5687 || unicode_attributes[ch].category[1] == 't'
5688 || unicode_attributes[ch].category[1] == 'm'
5689 || unicode_attributes[ch].category[1] == 'o'))
5690 || (unicode_attributes[ch].category[0] == 'S'
5691 && (unicode_attributes[ch].category[1] == 'm'
5692 || unicode_attributes[ch].category[1] == 'k'
5693 || unicode_attributes[ch].category[1] == 'o'))
5694 || (unicode_attributes[ch].category[0] == 'N'
5695 && (unicode_attributes[ch].category[1] == 'l'
5696 || unicode_attributes[ch].category[1] == 'o'))
5697 || (unicode_attributes[ch].category[0] == 'P'
5698 && (unicode_attributes[ch].category[1] == 'c'
5699 || unicode_attributes[ch].category[1] == 'd'
5700 || unicode_attributes[ch].category[1] == 'o'))
5701 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5702 || ch == 0x0601 /* ARABIC SIGN SANAH */
5703 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5704 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5705 || ch == 0x06DD /* ARABIC END OF AYAH */
5706 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5707 || ch == 0x2061 /* FUNCTION APPLICATION */
5708 || ch == 0x2062 /* INVISIBLE TIMES */
5709 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5710 || ch == 0x2064 /* INVISIBLE PLUS */)
/* This is a fallback class: it applies only when none of the classes named
   in the following mask has been assigned.  BK, CM, WJ, ZW, SP and AI are
   not part of the mask.  */
5711 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5713 /* ambiguous (alphabetic) ? */
/* AI is chosen for characters whose East Asian width starts with 'A' (minus
   the listed exceptions) and for the explicitly listed characters; LBP_AL
   below is presumably the alternative branch -- TODO confirm against the
   full source.  */
5714 if ((unicode_width[ch] != NULL
5715 && unicode_width[ch][0] == 'A'
5717 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5718 && ch != 0x2022 /* BULLET */
5719 && ch != 0x203E /* OVERLINE */
5720 && ch != 0x2126 /* OHM SIGN */
5721 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5722 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5723 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5724 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5725 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5726 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5727 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5728 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5730 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5731 || ch == 0x00A7 /* SECTION SIGN */
5732 || ch == 0x00A8 /* DIAERESIS */
5733 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5734 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5735 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5736 || ch == 0x00B6 /* PILCROW SIGN */
5737 || ch == 0x00B7 /* MIDDLE DOT */
5738 || ch == 0x00B8 /* CEDILLA */
5739 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5740 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5741 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5742 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5743 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5744 || ch == 0x00BF /* INVERTED QUESTION MARK */
5745 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5746 || ch == 0x00F7 /* DIVISION SIGN */
5747 || ch == 0x02C7 /* CARON */
5748 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5749 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5750 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5751 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5752 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5753 || ch == 0x02D8 /* BREVE */
5754 || ch == 0x02D9 /* DOT ABOVE */
5755 || ch == 0x02DA /* RING ABOVE */
5756 || ch == 0x02DB /* OGONEK */
5757 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5759 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5760 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5761 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5762 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5763 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5764 || ch == 0x2616 /* WHITE SHOGI PIECE */
5765 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5766 attr |= 1 << LBP_AI;
5768 attr |= 1 << LBP_AL;
/* Drop a CM bit that an earlier test may have set, so that it does not
   remain alongside the AI/AL classification.  */
5769 attr &= ~(1 << LBP_CM);
/* unknown.  The condition guarding this statement is not among the lines
   shown (presumably "no bit set so far" -- TODO confirm).
   NOTE(review): LBP_XX is 31, so this shifts a signed 1 into the sign bit
   of a 32-bit int.  */
5775 attr |= 1 << LBP_XX;
5780 /* Output the line breaking properties in a human readable format. */
/* Writes to STREAM one line per code point whose get_lbp result is anything
   other than exactly the LBP_XX bit: the code point as "0x%04X", followed by
   " LBP_xx" for every bit that is set.  */
5782 debug_output_lbp (FILE *stream)
5786 for (i = 0; i < 0x110000; i++)
5788 int attr = get_lbp (i);
/* Characters classified only as "unknown" are not listed.  */
5789 if (attr != 1 << LBP_XX)
5791 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints the constant's own name (via #bit) when the
   corresponding bit of attr is set.  */
5792 #define PRINT_BIT(attr,bit) \
5793 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5794 PRINT_BIT(attr,LBP_BK);
5795 PRINT_BIT(attr,LBP_CM);
5796 PRINT_BIT(attr,LBP_WJ);
5797 PRINT_BIT(attr,LBP_ZW);
5798 PRINT_BIT(attr,LBP_GL);
5799 PRINT_BIT(attr,LBP_SP);
5800 PRINT_BIT(attr,LBP_B2);
5801 PRINT_BIT(attr,LBP_BA);
5802 PRINT_BIT(attr,LBP_BB);
5803 PRINT_BIT(attr,LBP_HY);
5804 PRINT_BIT(attr,LBP_CB);
5805 PRINT_BIT(attr,LBP_CL);
5806 PRINT_BIT(attr,LBP_EX);
5807 PRINT_BIT(attr,LBP_IN);
5808 PRINT_BIT(attr,LBP_NS);
5809 PRINT_BIT(attr,LBP_OP);
5810 PRINT_BIT(attr,LBP_QU);
5811 PRINT_BIT(attr,LBP_IS);
5812 PRINT_BIT(attr,LBP_NU);
5813 PRINT_BIT(attr,LBP_PO);
5814 PRINT_BIT(attr,LBP_PR);
5815 PRINT_BIT(attr,LBP_SY);
5816 PRINT_BIT(attr,LBP_AI);
5817 PRINT_BIT(attr,LBP_AL);
5818 PRINT_BIT(attr,LBP_H2);
5819 PRINT_BIT(attr,LBP_H3);
5820 PRINT_BIT(attr,LBP_ID);
5821 PRINT_BIT(attr,LBP_JL);
5822 PRINT_BIT(attr,LBP_JV);
5823 PRINT_BIT(attr,LBP_JT);
5824 PRINT_BIT(attr,LBP_SA);
5825 PRINT_BIT(attr,LBP_XX);
5827 fprintf (stream, "\n");
/* Writes the output of debug_output_lbp to the file named FILENAME, which is
   opened for writing.  Failure to open the file, a write error on the
   stream, or a failing fclose is reported on stderr.  */
5833 debug_output_lbrk_tables (const char *filename)
5837 stream = fopen (filename, "w");
5840 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5844 debug_output_lbp (stream);
5846 if (ferror (stream) || fclose (stream))
5848 fprintf (stderr, "error writing to '%s'\n", filename);
5853 /* The line breaking property from the LineBreak.txt file. */
/* One entry per code point, 0 .. 0x10FFFF.  Each entry is a single LBP_xx
   constant (not a bit mask as returned by get_lbp); fill_org_lbp initializes
   every entry to LBP_XX before reading the file.  */
5854 int unicode_org_lbp[0x110000];
5856 /* Stores in unicode_org_lbp[] the line breaking property from the
5857 LineBreak.txt file. */
/* LINEBREAK_FILENAME names the file to read.  Problems (fopen failure,
   short line, unknown property value, read error) are reported on stderr.  */
5859 fill_org_lbp (const char *linebreak_filename)
5863 char field0[FIELDLEN];
5864 char field1[FIELDLEN];
5865 char field2[FIELDLEN];
/* Default: every code point is "unknown" until the file says otherwise.  */
5868 for (i = 0; i < 0x110000; i++)
5869 unicode_org_lbp[i] = LBP_XX;
5871 stream = fopen (linebreak_filename, "r");
5874 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Discard the rest of the current input line, up to '\n' or EOF
   (presumably used for comment lines -- TODO confirm against the full
   source).  */
5890 do c = getc (stream); while (c != EOF && c != '\n');
/* A data line is read as three fields terminated by ';', ' ' and '\n';
   n accumulates the three getfield results.  */
5894 n = getfield (stream, field0, ';');
5895 n += getfield (stream, field1, ' ');
5896 n += getfield (stream, field2, '\n');
5901 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY (LBP_xx) compares field1 with the constant's name minus its
   4-character "LBP_" prefix (#bit + 4 skips those characters) and selects
   that constant on a match.  */
5905 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property values without a constant of their own: LF, CR and NL are
   recorded as LBP_BK, SG as LBP_XX.  */
5940 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5941 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5942 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5943 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5946 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5947 field1, linebreak_filename, lineno);
/* field0 is a hexadecimal code point or a range "FIRST..LAST".  */
5950 i = strtoul (field0, NULL, 16);
5951 if (strstr (field0, "..") != NULL)
5953 /* Deal with a range. */
5954 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5956 unicode_org_lbp[i] = value;
5960 /* Single character line. */
5961 unicode_org_lbp[i] = value;
/* A read error on the stream or a failing fclose is reported.  */
5964 if (ferror (stream) || fclose (stream))
5966 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5971 /* Output the line breaking properties in a human readable format. */
/* Unlike debug_output_lbp, this prints the values stored in
   unicode_org_lbp[] (taken from LineBreak.txt by fill_org_lbp).  Each value
   is a single LBP_xx constant, so it is compared for equality instead of
   being tested bit by bit.  The output goes to STREAM.  */
5973 debug_output_org_lbp (FILE *stream)
5977 for (i = 0; i < 0x110000; i++)
5979 int attr = unicode_org_lbp[i];
5982 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints the constant's own name (via #bit) when attr equals it.  */
5983 #define PRINT_BIT(attr,bit) \
5984 if (attr == bit) fprintf (stream, " " #bit);
5985 PRINT_BIT(attr,LBP_BK);
5986 PRINT_BIT(attr,LBP_CM);
5987 PRINT_BIT(attr,LBP_WJ);
5988 PRINT_BIT(attr,LBP_ZW);
5989 PRINT_BIT(attr,LBP_GL);
5990 PRINT_BIT(attr,LBP_SP);
5991 PRINT_BIT(attr,LBP_B2);
5992 PRINT_BIT(attr,LBP_BA);
5993 PRINT_BIT(attr,LBP_BB);
5994 PRINT_BIT(attr,LBP_HY);
5995 PRINT_BIT(attr,LBP_CB);
5996 PRINT_BIT(attr,LBP_CL);
5997 PRINT_BIT(attr,LBP_EX);
5998 PRINT_BIT(attr,LBP_IN);
5999 PRINT_BIT(attr,LBP_NS);
6000 PRINT_BIT(attr,LBP_OP);
6001 PRINT_BIT(attr,LBP_QU);
6002 PRINT_BIT(attr,LBP_IS);
6003 PRINT_BIT(attr,LBP_NU);
6004 PRINT_BIT(attr,LBP_PO);
6005 PRINT_BIT(attr,LBP_PR);
6006 PRINT_BIT(attr,LBP_SY);
6007 PRINT_BIT(attr,LBP_AI);
6008 PRINT_BIT(attr,LBP_AL);
6009 PRINT_BIT(attr,LBP_H2);
6010 PRINT_BIT(attr,LBP_H3);
6011 PRINT_BIT(attr,LBP_ID);
6012 PRINT_BIT(attr,LBP_JL);
6013 PRINT_BIT(attr,LBP_JV);
6014 PRINT_BIT(attr,LBP_JT);
6015 PRINT_BIT(attr,LBP_SA);
6016 PRINT_BIT(attr,LBP_XX);
6018 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_org_lbp to the file FILENAME.
   A message is printed on stderr when the file cannot be opened for writing,
   and another one when ferror or fclose reports a failure afterwards.  */
6024 debug_output_org_lbrk_tables (const char *filename)
6028 stream = fopen (filename, "w");
6031 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6035 debug_output_org_lbp (stream);
/* fclose is part of the test so that errors detected only when the buffered
   data is flushed are reported too.  */
6037 if (ferror (stream) || fclose (stream))
6039 fprintf (stderr, "error writing to '%s'\n", filename);
6044 /* Construction of sparse 3-level tables. */
/* These macros choose the table name (lbp_table), its element type
   (unsigned char) and the default element value (LBP_XX), and map
   xmalloc/xrealloc onto plain malloc/realloc.  The code that consumes them is
   not visible in this listing; the lbp_table_init/_add/_finalize functions
   called by output_lbp are presumably generated from it.  */
6045 #define TABLE lbp_table
6046 #define ELEMENT unsigned char
6047 #define DEFAULT LBP_XX
6048 #define xmalloc malloc
6049 #define xrealloc realloc
/* Emits the line break property lookup table as C source.  The table is
   filled from get_lbp() for every code point 0..0x10FFFF.  The
   lbrkprop_header_* defines, the lbrkprop_t typedef and the extern
   declaration of unilbrkprop are written to STREAM1; the initializer of
   unilbrkprop is written to STREAM2.  */
6053 output_lbp (FILE *stream1, FILE *stream2)
6057 unsigned int level1_offset, level2_offset, level3_offset;
6061 lbp_table_init (&t);
6063 for (i = 0; i < 0x110000; i++)
6065 int attr = get_lbp (i);
6067 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so it is nonzero exactly when
   more than one bit is set.  */
6068 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Only masks other than the LBP_XX one are added to the table.  */
6071 if (attr != 1 << LBP_XX)
6073 unsigned int log2_attr;
/* Convert the one-bit mask back into its bit index.  */
6074 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6076 lbp_table_add (&t, i, log2_attr);
6080 lbp_table_finalize (&t);
/* NOTE(review): the next three expressions are byte offsets into t.result
   (a header of 5 uint32_t, then level1, then level2).  Their assignment
   targets are not visible here; presumably they are level1_offset,
   level2_offset and level3_offset.  */
6083 5 * sizeof (uint32_t);
6085 5 * sizeof (uint32_t)
6086 + t.level1_size * sizeof (uint32_t);
6088 5 * sizeof (uint32_t)
6089 + t.level1_size * sizeof (uint32_t)
6090 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor defines.  */
6092 for (i = 0; i < 5; i++)
6093 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6094 ((uint32_t *) t.result)[i]);
6095 fprintf (stream1, "\n");
6096 fprintf (stream1, "typedef struct\n");
6097 fprintf (stream1, " {\n");
6098 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6099 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6100 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6101 fprintf (stream1, " }\n");
6102 fprintf (stream1, "lbrkprop_t;\n");
6103 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6105 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6106 fprintf (stream2, "{\n");
6107 fprintf (stream2, " {");
/* Level 1: eight entries per output line.  */
6108 if (t.level1_size > 8)
6109 fprintf (stream2, "\n ");
6110 for (i = 0; i < t.level1_size; i++)
6113 if (i > 0 && (i % 8) == 0)
6114 fprintf (stream2, "\n ");
6115 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* One of the two fprintf calls below is chosen per entry; the selecting
   condition is not visible here.  The second one turns the byte offset into
   an index relative to the start of level 2.  */
6117 fprintf (stream2, " %5d", -1);
6119 fprintf (stream2, " %5zu",
6120 (offset - level2_offset) / sizeof (uint32_t));
6121 if (i+1 < t.level1_size)
6122 fprintf (stream2, ",");
6124 if (t.level1_size > 8)
6125 fprintf (stream2, "\n ");
6126 fprintf (stream2, " },\n");
6127 fprintf (stream2, " {");
/* Level 2: same layout, with offsets made relative to the start of level 3.  */
6128 if (t.level2_size << t.q > 8)
6129 fprintf (stream2, "\n ");
6130 for (i = 0; i < t.level2_size << t.q; i++)
6133 if (i > 0 && (i % 8) == 0)
6134 fprintf (stream2, "\n ");
6135 offset = ((uint32_t *) (t.result + level2_offset))[i];
6137 fprintf (stream2, " %5d", -1);
6139 fprintf (stream2, " %5zu",
6140 (offset - level3_offset) / sizeof (unsigned char));
6141 if (i+1 < t.level2_size << t.q)
6142 fprintf (stream2, ",");
6144 if (t.level2_size << t.q > 8)
6145 fprintf (stream2, "\n ");
6146 fprintf (stream2, " },\n");
6147 fprintf (stream2, " {");
/* Level 3: the stored element values, printed through value_string.  */
6148 if (t.level3_size << t.p > 8)
6149 fprintf (stream2, "\n ");
6150 for (i = 0; i < t.level3_size << t.p; i++)
6152 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6153 const char *value_string;
/* CASE maps a stored value to the spelling of its constant, so that the
   generated source can use symbolic names.  */
6156 #define CASE(x) case x: value_string = #x; break;
6193 if (i > 0 && (i % 8) == 0)
6194 fprintf (stream2, "\n ");
6195 fprintf (stream2, " %s%s", value_string,
6196 (i+1 < t.level3_size << t.p ? "," : ""));
6198 if (t.level3_size << t.p > 8)
6199 fprintf (stream2, "\n ");
6200 fprintf (stream2, " }\n");
6201 fprintf (stream2, "};\n");
/* Creates FILENAME1 and FILENAME2, writes the "generated automatically"
   banner and a GPL notice to both, then calls output_lbp with the first
   stream as its STREAM1 and the second as its STREAM2.  The argument for the
   %s in the banner is on a line not visible here; presumably it is VERSION.
   NOTE(review): the banner names the generator "gen-lbrk", whereas
   output_wbrk_tables and output_composition_tables in this file say
   "gen-uni-tables"; confirm which name is intended.  */
6205 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6207 const char *filenames[2];
6211 filenames[0] = filename1;
6212 filenames[1] = filename2;
/* Open both output files before anything is written.  */
6214 for (i = 0; i < 2; i++)
6216 streams[i] = fopen (filenames[i], "w");
6217 if (streams[i] == NULL)
6219 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Both files receive the same banner and licence text.  */
6224 for (i = 0; i < 2; i++)
6226 FILE *stream = streams[i];
6228 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6229 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6230 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6232 fprintf (stream, "\n");
6234 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6235 still carries the GPL header), and it's gnulib-tool which replaces the
6236 GPL header with an LGPL header. */
6237 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6238 fprintf (stream, "\n");
6239 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6240 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6241 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6242 fprintf (stream, " (at your option) any later version.\n");
6243 fprintf (stream, "\n");
6244 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6245 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6246 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6247 fprintf (stream, " GNU General Public License for more details.\n");
6248 fprintf (stream, "\n");
6249 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6250 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6251 fprintf (stream, "\n");
6254 output_lbp (streams[0], streams[1]);
/* fclose is part of the test so that flush-time write errors are seen.  */
6256 for (i = 0; i < 2; i++)
6258 if (ferror (streams[i]) || fclose (streams[i]))
6260 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6266 /* ========================================================================= */
6268 /* Word break property. */
6270 /* Possible values of the Word_Break property. */
/* Only one enumerator is visible in this listing.  The WBP_* values are used
   as bit positions (1 << WBP_xxx) by get_wbp and as plain values in
   unicode_org_wbp[].  */
6285 WBP_EXTENDNUMLET = 7
6288 /* Returns the word breaking property for ch, as a bit mask. */
/* Each rule below that matches CH sets bit (1 << WBP_xxx) in attr.  Several
   condition lines are not visible in this listing, so the comments describe
   only the visible parts of each rule.  */
6290 get_wbp (unsigned int ch)
/* Presumably the rules are applied only to characters that have a name in
   unicode_attributes[]; the extent of this test is not visible here.  */
6294 if (unicode_attributes[ch].name != NULL)
6297 attr |= 1 << WBP_CR;
6300 attr |= 1 << WBP_LF;
/* Newline: includes U+000B, U+000C, U+2028 and U+2029.  */
6302 if (ch == 0x000B || ch == 0x000C
6304 || ch == 0x2028 || ch == 0x2029)
6305 attr |= 1 << WBP_NEWLINE;
/* Extend: the Grapheme_Extend property bit, or general category Mc.  */
6307 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6308 || (unicode_attributes[ch].category != NULL
6309 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6310 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200C and U+200D.  */
6312 if (unicode_attributes[ch].category != NULL
6313 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6314 && ch != 0x200C && ch != 0x200D)
6315 attr |= 1 << WBP_FORMAT;
/* Katakana: script "Katakana" plus the individually listed code points.  */
6317 if ((unicode_scripts[ch] < numscripts
6318 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6319 || (ch >= 0x3031 && ch <= 0x3035)
6320 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6322 attr |= 1 << WBP_KATAKANA;
/* ALetter: starts from the Alphabetic property bit, then excludes
   ideographs, Katakana (as computed above), characters whose get_lbp mask
   has the LBP_SA bit, script "Hiragana", and Extend characters.  */
6324 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6326 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6327 && (attr & (1 << WBP_KATAKANA)) == 0
6328 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6329 && !(unicode_scripts[ch] < numscripts
6330 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6331 && (attr & (1 << WBP_EXTEND)) == 0)
6332 attr |= 1 << WBP_ALETTER;
/* MidNumLet and MidLetter are given as explicit code point lists.  */
6334 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6335 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6336 attr |= 1 << WBP_MIDNUMLET;
6338 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6339 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6340 attr |= 1 << WBP_MIDLETTER;
/* MidNum: the LBP_IS bit of get_lbp or one of the listed code points, but
   never U+003A, U+FE13 or U+002E.  */
6342 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6343 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6345 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6346 attr |= 1 << WBP_MIDNUM;
/* Numeric: based on the LBP_NU bit of get_lbp.  */
6348 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6350 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
6352 if (unicode_attributes[ch].category != NULL
6353 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6354 attr |= 1 << WBP_EXTENDNUMLET;
/* The condition under which WBP_OTHER is set is not visible here.  */
6359 attr |= 1 << WBP_OTHER;
6364 /* Output the word break property in a human readable format. */
/* For each code point 0..0x10FFFF whose get_wbp() mask differs from
   (1 << WBP_OTHER), prints to STREAM the code point as "0x%04X" followed by
   the name of every bit that is set in the mask, then a newline.  */
6366 debug_output_wbp (FILE *stream)
6370 for (i = 0; i < 0x110000; i++)
6372 int attr = get_wbp (i);
6373 if (attr != 1 << WBP_OTHER)
6375 fprintf (stream, "0x%04X", i);
/* The tests are independent, so a mask with several bits set prints several
   names on the same line.  */
6376 if (attr & (1 << WBP_CR))
6377 fprintf (stream, " CR");
6378 if (attr & (1 << WBP_LF))
6379 fprintf (stream, " LF");
6380 if (attr & (1 << WBP_NEWLINE))
6381 fprintf (stream, " Newline");
6382 if (attr & (1 << WBP_EXTEND))
6383 fprintf (stream, " Extend");
6384 if (attr & (1 << WBP_FORMAT))
6385 fprintf (stream, " Format");
6386 if (attr & (1 << WBP_KATAKANA))
6387 fprintf (stream, " Katakana");
6388 if (attr & (1 << WBP_ALETTER))
6389 fprintf (stream, " ALetter");
6390 if (attr & (1 << WBP_MIDNUMLET))
6391 fprintf (stream, " MidNumLet");
6392 if (attr & (1 << WBP_MIDLETTER))
6393 fprintf (stream, " MidLetter");
6394 if (attr & (1 << WBP_MIDNUM))
6395 fprintf (stream, " MidNum");
6396 if (attr & (1 << WBP_NUMERIC))
6397 fprintf (stream, " Numeric");
6398 if (attr & (1 << WBP_EXTENDNUMLET))
6399 fprintf (stream, " ExtendNumLet");
6400 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_wbp to the file FILENAME.
   A message is printed on stderr when the file cannot be opened for writing,
   and another one when ferror or fclose reports a failure afterwards.  */
6406 debug_output_wbrk_tables (const char *filename)
6410 stream = fopen (filename, "w");
6413 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6417 debug_output_wbp (stream);
/* fclose is part of the test so that flush-time write errors are seen.  */
6419 if (ferror (stream) || fclose (stream))
6421 fprintf (stderr, "error writing to '%s'\n", filename);
6426 /* The word break property from the WordBreakProperty.txt file. */
/* Indexed by code point (0..0x10FFFF).  Each element holds a WBP_* value
   (not a bit mask); fill_org_wbp initializes every element to WBP_OTHER and
   then stores the values read from the file.  */
6427 int unicode_org_wbp[0x110000];
6429 /* Stores in unicode_org_wbp[] the word break property from the
6430 WordBreakProperty.txt file. */
/* Every element is first reset to WBP_OTHER.  The file is then read one line
   at a time (at most 200 characters are taken from a line).  Each data line
   is parsed as "XXXX..YYYY" or "XXXX" followed by a run of spaces/semicolons
   and a property name that ends at the next space.  Problems are reported on
   stderr; what happens after a report is not visible in this listing.  */
6432 fill_org_wbp (const char *wordbreakproperty_filename)
6437 for (i = 0; i < 0x110000; i++)
6438 unicode_org_wbp[i] = WBP_OTHER;
6440 stream = fopen (wordbreakproperty_filename, "r");
6443 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6450 unsigned int i1, i2;
6451 char padding[200+1];
6452 char propname[200+1];
/* The trailing "\n" in the format also consumes the line terminator.  */
6455 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines are recognized here; presumably they are
   skipped (the statement that follows is not visible).  */
6458 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.  */
6461 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6463 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6465 fprintf (stderr, "parse error in '%s'\n",
6466 wordbreakproperty_filename);
/* PROP builds an if / else-if chain that maps the property name to its
   WBP_* value; the fprintf after the chain handles unknown names.  */
6471 #define PROP(name,value) \
6472 if (strcmp (propname, name) == 0) propvalue = value; else
6475 PROP ("Newline", WBP_NEWLINE)
6476 PROP ("Extend", WBP_EXTEND)
6477 PROP ("Format", WBP_FORMAT)
6478 PROP ("Katakana", WBP_KATAKANA)
6479 PROP ("ALetter", WBP_ALETTER)
6480 PROP ("MidNumLet", WBP_MIDNUMLET)
6481 PROP ("MidLetter", WBP_MIDLETTER)
6482 PROP ("MidNum", WBP_MIDNUM)
6483 PROP ("Numeric", WBP_NUMERIC)
6484 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6487 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6488 wordbreakproperty_filename);
/* The range must be ordered and inside the Unicode code space before it is
   used to index unicode_org_wbp[].  */
6491 if (!(i1 <= i2 && i2 < 0x110000))
6494 for (i = i1; i <= i2; i++)
6495 unicode_org_wbp[i] = propvalue;
6498 if (ferror (stream) || fclose (stream))
6500 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6505 /* Output the word break property in a human readable format. */
/* For each code point 0..0x10FFFF whose entry in unicode_org_wbp[] is not
   WBP_OTHER, prints to STREAM the code point as "0x%04X", the name of the
   stored value (or " ??" when no name in the chain matches) and a newline.  */
6507 debug_output_org_wbp (FILE *stream)
6511 for (i = 0; i < 0x110000; i++)
6513 int propvalue = unicode_org_wbp[i];
6514 if (propvalue != WBP_OTHER)
6516 fprintf (stream, "0x%04X", i);
/* PROP builds an if / else-if chain; its final else is the " ??" output.  */
6517 #define PROP(name,value) \
6518 if (propvalue == value) fprintf (stream, " " name); else
6521 PROP ("Newline", WBP_NEWLINE)
6522 PROP ("Extend", WBP_EXTEND)
6523 PROP ("Format", WBP_FORMAT)
6524 PROP ("Katakana", WBP_KATAKANA)
6525 PROP ("ALetter", WBP_ALETTER)
6526 PROP ("MidNumLet", WBP_MIDNUMLET)
6527 PROP ("MidLetter", WBP_MIDLETTER)
6528 PROP ("MidNum", WBP_MIDNUM)
6529 PROP ("Numeric", WBP_NUMERIC)
6530 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6532 fprintf (stream, " ??");
6533 fprintf (stream, "\n");
/* Writes the listing produced by debug_output_org_wbp to the file FILENAME.
   A message is printed on stderr when the file cannot be opened for writing,
   and another one when ferror or fclose reports a failure afterwards.  */
6539 debug_output_org_wbrk_tables (const char *filename)
6543 stream = fopen (filename, "w");
6546 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6550 debug_output_org_wbp (stream);
/* fclose is part of the test so that flush-time write errors are seen.  */
6552 if (ferror (stream) || fclose (stream))
6554 fprintf (stderr, "error writing to '%s'\n", filename);
6559 /* Construction of sparse 3-level tables. */
/* These macros choose the table name (wbp_table), its element type
   (unsigned char) and the default element value (WBP_OTHER), and map
   xmalloc/xrealloc onto plain malloc/realloc.  The code that consumes them is
   not visible in this listing; the wbp_table_init/_add/_finalize functions
   called by output_wbp are presumably generated from it.  */
6560 #define TABLE wbp_table
6561 #define ELEMENT unsigned char
6562 #define DEFAULT WBP_OTHER
6563 #define xmalloc malloc
6564 #define xrealloc realloc
/* Emits the word break property lookup table as C source on STREAM: the
   wbrkprop_header_* defines, the wbrkprop_t typedef and the initializer of
   the static table uniwbrkprop.  The table is filled from get_wbp() for
   every code point 0..0x10FFFF.  */
6568 output_wbp (FILE *stream)
6572 unsigned int level1_offset, level2_offset, level3_offset;
6576 wbp_table_init (&t);
6578 for (i = 0; i < 0x110000; i++)
6580 int attr = get_wbp (i);
6582 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so it is nonzero exactly when
   more than one bit is set.  */
6583 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Only masks other than the WBP_OTHER one are added to the table.  */
6586 if (attr != 1 << WBP_OTHER)
6588 unsigned int log2_attr;
/* Convert the one-bit mask back into its bit index.  */
6589 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6591 wbp_table_add (&t, i, log2_attr);
6595 wbp_table_finalize (&t);
/* NOTE(review): the next three expressions are byte offsets into t.result
   (a header of 5 uint32_t, then level1, then level2).  Their assignment
   targets are not visible here; presumably they are level1_offset,
   level2_offset and level3_offset.  */
6598 5 * sizeof (uint32_t);
6600 5 * sizeof (uint32_t)
6601 + t.level1_size * sizeof (uint32_t);
6603 5 * sizeof (uint32_t)
6604 + t.level1_size * sizeof (uint32_t)
6605 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor defines.  */
6607 for (i = 0; i < 5; i++)
6608 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6609 ((uint32_t *) t.result)[i]);
6610 fprintf (stream, "\n");
6611 fprintf (stream, "typedef struct\n");
6612 fprintf (stream, " {\n");
6613 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6614 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6615 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6616 fprintf (stream, " }\n");
6617 fprintf (stream, "wbrkprop_t;\n");
6618 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6619 fprintf (stream, "{\n");
6620 fprintf (stream, " {");
/* Level 1: eight entries per output line.  */
6621 if (t.level1_size > 8)
6622 fprintf (stream, "\n ");
6623 for (i = 0; i < t.level1_size; i++)
6626 if (i > 0 && (i % 8) == 0)
6627 fprintf (stream, "\n ");
6628 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* One of the two fprintf calls below is chosen per entry; the selecting
   condition is not visible here.  The second one turns the byte offset into
   an index relative to the start of level 2.  */
6630 fprintf (stream, " %5d", -1);
6632 fprintf (stream, " %5zu",
6633 (offset - level2_offset) / sizeof (uint32_t));
6634 if (i+1 < t.level1_size)
6635 fprintf (stream, ",");
6637 if (t.level1_size > 8)
6638 fprintf (stream, "\n ");
6639 fprintf (stream, " },\n");
6640 fprintf (stream, " {");
/* Level 2: same layout, with offsets made relative to the start of level 3.  */
6641 if (t.level2_size << t.q > 8)
6642 fprintf (stream, "\n ");
6643 for (i = 0; i < t.level2_size << t.q; i++)
6646 if (i > 0 && (i % 8) == 0)
6647 fprintf (stream, "\n ");
6648 offset = ((uint32_t *) (t.result + level2_offset))[i];
6650 fprintf (stream, " %5d", -1);
6652 fprintf (stream, " %5zu",
6653 (offset - level3_offset) / sizeof (unsigned char));
6654 if (i+1 < t.level2_size << t.q)
6655 fprintf (stream, ",");
6657 if (t.level2_size << t.q > 8)
6658 fprintf (stream, "\n ");
6659 fprintf (stream, " },\n");
6660 fprintf (stream, " {");
/* Level 3: symbolic names, four entries per output line.  */
6661 if (t.level3_size << t.p > 4)
6662 fprintf (stream, "\n ");
6663 for (i = 0; i < t.level3_size << t.p; i++)
6665 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6666 const char *value_string;
/* CASE maps a stored value to the spelling of its constant, so that the
   generated source can use symbolic names.  */
6669 #define CASE(x) case x: value_string = #x; break;
6678 CASE(WBP_MIDNUMLET);
6679 CASE(WBP_MIDLETTER);
6682 CASE(WBP_EXTENDNUMLET);
6687 if (i > 0 && (i % 4) == 0)
6688 fprintf (stream, "\n ");
6689 fprintf (stream, " %s%s", value_string,
6690 (i+1 < t.level3_size << t.p ? "," : ""));
6692 if (t.level3_size << t.p > 4)
6693 fprintf (stream, "\n ");
6694 fprintf (stream, " }\n");
6695 fprintf (stream, "};\n");
/* Creates the file FILENAME, writes the "generated automatically" banner
   and a GPL notice to it, then lets output_wbp write the word break table.
   The argument for the %s in the banner is on a line not visible here;
   presumably it is VERSION.  A message is printed on stderr when the file
   cannot be opened for writing, and another one when ferror or fclose reports
   a failure afterwards.  */
6699 output_wbrk_tables (const char *filename, const char *version)
6703 stream = fopen (filename, "w");
6706 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6710 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
/* This file holds the word break table written by output_wbp, so the
   banner must not repeat the "Line breaking" wording of output_lbrk_tables.  */
6711 fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
6712 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6714 fprintf (stream, "\n");
6716 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6717 still carries the GPL header), and it's gnulib-tool which replaces the
6718 GPL header with an LGPL header. */
6719 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
6720 fprintf (stream, "\n");
6721 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6722 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6723 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6724 fprintf (stream, " (at your option) any later version.\n");
6725 fprintf (stream, "\n");
6726 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6727 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6728 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6729 fprintf (stream, " GNU General Public License for more details.\n");
6730 fprintf (stream, "\n");
6731 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6732 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6733 fprintf (stream, "\n");
6735 output_wbp (stream);
/* fclose is part of the test so that flush-time write errors are seen.  */
6737 if (ferror (stream) || fclose (stream))
6739 fprintf (stderr, "error writing to '%s'\n", filename);
6744 /* ========================================================================= */
/* MAX_DECOMP_LENGTH is the size of the decomposed[] arrays that
   get_decomposition fills and that its callers declare.  The UC_DECOMP_*
   enumerators that follow name the decomposition kinds: UC_DECOMP_CANONICAL
   is what get_decomposition starts from, and each of the others corresponds
   to one "<tag>" prefix that get_decomposition recognizes.  */
6746 /* Maximum number of characters into which a single Unicode character can be
6748 #define MAX_DECOMP_LENGTH 18
6752 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
6753 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
6754 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
6755 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
6756 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
6757 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
6758 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
6759 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
6760 UC_DECOMP_SUPER, /* <super> A superscript form. */
6761 UC_DECOMP_SUB, /* <sub> A subscript form. */
6762 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
6763 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
6764 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
6765 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
6766 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
6767 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
6768 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
6771 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
6772 decompositions). Return the type, or -1 for none. */
/* The decomposition field of unicode_attributes[ch] is parsed as an optional
   "<tag>" that selects the type (UC_DECOMP_CANONICAL when there is no tag),
   followed by space-separated hexadecimal code points that are stored into
   DECOMPOSED, at most MAX_DECOMP_LENGTH of them.  LENGTHP is an output
   parameter; the statement that stores through it is not visible in this
   listing.  */
6774 get_decomposition (unsigned int ch,
6775 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
6777 const char *decomposition = unicode_attributes[ch].decomposition;
6779 if (decomposition != NULL && decomposition[0] != '\0')
6781 int type = UC_DECOMP_CANONICAL;
6782 unsigned int length;
6785 if (decomposition[0] == '<')
6790 rangle = strchr (decomposition + 1, '>');
/* typelen covers the tag including both angle brackets.  */
6793 typelen = rangle + 1 - decomposition;
6794 #define TYPE(t1,t2) \
6795 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
6798 TYPE ("<font>", UC_DECOMP_FONT)
6799 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
6800 TYPE ("<initial>", UC_DECOMP_INITIAL)
6801 TYPE ("<medial>", UC_DECOMP_MEDIAL)
6802 TYPE ("<final>", UC_DECOMP_FINAL)
6803 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
6804 TYPE ("<circle>", UC_DECOMP_CIRCLE)
6805 TYPE ("<super>", UC_DECOMP_SUPER)
6806 TYPE ("<sub>", UC_DECOMP_SUB)
6807 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
6808 TYPE ("<wide>", UC_DECOMP_WIDE)
6809 TYPE ("<narrow>", UC_DECOMP_NARROW)
6810 TYPE ("<small>", UC_DECOMP_SMALL)
6811 TYPE ("<square>", UC_DECOMP_SQUARE)
6812 TYPE ("<fraction>", UC_DECOMP_FRACTION)
6813 TYPE ("<compat>", UC_DECOMP_COMPAT)
/* The tag is not NUL-terminated, so its length is passed as a precision
   (%.*s).  A field width (%*s) would only pad and would print the code
   points that follow the tag as well.  */
6815 fprintf (stderr, "unknown decomposition type %.*s\n", (int)typelen, decomposition);
/* Continue parsing after the tag and the space that may follow it.  */
6819 decomposition = rangle + 1;
6820 if (decomposition[0] == ' ')
6823 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
6825 decomposed[length] = strtoul (decomposition, &endptr, 16);
/* strtoul consumed nothing: there are no more code points.  */
6826 if (endptr == decomposition)
6828 decomposition = endptr;
6829 if (decomposition[0] == ' ')
/* Text left over here means the loop stopped at the array limit.  */
6832 if (*decomposition != '\0')
6833 /* MAX_DECOMP_LENGTH is too small. */
6843 /* Construction of sparse 3-level tables. */
/* These macros choose the table name (decomp_table), its element type
   (uint16_t) and the default element value ((uint16_t)(-1)), and map
   xmalloc/xrealloc onto plain malloc/realloc.  The code that consumes them is
   not visible in this listing; the decomp_table_init/_add/_finalize functions
   called by output_decomposition are presumably generated from it.  */
6844 #define TABLE decomp_table
6845 #define ELEMENT uint16_t
6846 #define DEFAULT (uint16_t)(-1)
6847 #define xmalloc malloc
6848 #define xrealloc realloc
/* Writes the decomposition tables as C source.  Two objects are produced:
   the byte array gl_uninorm_decomp_chars_table, which holds three bytes per
   decomposed code point, and the 3-level table gl_uninorm_decomp_index_table,
   which maps a character to a 16-bit entry referring to that array.
   Declarations, defines and the typedef go to STREAM1; the data initializers
   go to STREAM2.  */
6852 output_decomposition (FILE *stream1, FILE *stream2)
6854 struct decomp_table t;
6855 unsigned int level1_offset, level2_offset, level3_offset;
6856 unsigned int offset;
6862 decomp_table_init (&t);
6864 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
6865 fprintf (stream1, "\n");
6866 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
6869 for (ch = 0; ch < 0x110000; ch++)
6871 unsigned int length;
6872 unsigned int decomposed[MAX_DECOMP_LENGTH];
6873 int type = get_decomposition (ch, &length, decomposed);
/* The index entry packs a 15-bit offset with a flag in bit 15 that is 0 for
   a canonical decomposition and 1 for any other type.  */
6877 if (!(offset < (1 << 15)))
6879 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
6881 /* Produce length 3-bytes entries. */
6883 /* We would need a special representation of zero-length entries. */
6885 for (i = 0; i < length; i++)
6888 fprintf (stream2, ",");
6889 if ((offset % 4) == 0)
6890 fprintf (stream2, "\n ");
/* Each code point must fit in 18 bits.  The first output byte also carries
   bit 23 as a "more entries follow" flag and, for the first entry only, the
   decomposition type shifted to bit 18.  */
6891 if (!(decomposed[i] < (1 << 18)))
6893 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
6894 (((i+1 < length ? (1 << 23) : 0)
6895 | (i == 0 ? (type << 18) : 0)
6896 | decomposed[i]) >> 16) & 0xff,
6897 (decomposed[i] >> 8) & 0xff,
6898 decomposed[i] & 0xff);
6904 fprintf (stream2, "\n};\n");
6905 fprintf (stream2, "\n");
6907 decomp_table_finalize (&t);
/* NOTE(review): the next three expressions are byte offsets into t.result
   (a header of 5 uint32_t, then level1, then level2).  Their assignment
   targets are not visible here; presumably they are level1_offset,
   level2_offset and level3_offset.  */
6910 5 * sizeof (uint32_t);
6912 5 * sizeof (uint32_t)
6913 + t.level1_size * sizeof (uint32_t);
6915 5 * sizeof (uint32_t)
6916 + t.level1_size * sizeof (uint32_t)
6917 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become preprocessor defines.  */
6919 for (i = 0; i < 5; i++)
6920 fprintf (stream1, "#define decomp_header_%d %d\n", i,
6921 ((uint32_t *) t.result)[i]);
6922 fprintf (stream1, "\n");
6923 fprintf (stream1, "typedef struct\n");
6924 fprintf (stream1, " {\n");
6925 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6926 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6927 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
6928 fprintf (stream1, " }\n");
6929 fprintf (stream1, "decomp_index_table_t;\n");
6930 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
6931 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
6932 fprintf (stream2, "{\n");
6933 fprintf (stream2, " {");
/* Level 1: eight entries per output line.  */
6934 if (t.level1_size > 8)
6935 fprintf (stream2, "\n ");
6936 for (i = 0; i < t.level1_size; i++)
6939 if (i > 0 && (i % 8) == 0)
6940 fprintf (stream2, "\n ");
6941 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* One of the two fprintf calls below is chosen per entry; the selecting
   condition is not visible here.  The second one turns the byte offset into
   an index relative to the start of level 2.  */
6943 fprintf (stream2, " %5d", -1);
6945 fprintf (stream2, " %5zu",
6946 (offset - level2_offset) / sizeof (uint32_t));
6947 if (i+1 < t.level1_size)
6948 fprintf (stream2, ",");
6950 if (t.level1_size > 8)
6951 fprintf (stream2, "\n ");
6952 fprintf (stream2, " },\n");
6953 fprintf (stream2, " {");
/* Level 2: same layout, with offsets made relative to the start of level 3
   and counted in uint16_t elements.  */
6954 if (t.level2_size << t.q > 8)
6955 fprintf (stream2, "\n ");
6956 for (i = 0; i < t.level2_size << t.q; i++)
6959 if (i > 0 && (i % 8) == 0)
6960 fprintf (stream2, "\n ");
6961 offset = ((uint32_t *) (t.result + level2_offset))[i];
6963 fprintf (stream2, " %5d", -1);
6965 fprintf (stream2, " %5zu",
6966 (offset - level3_offset) / sizeof (uint16_t));
6967 if (i+1 < t.level2_size << t.q)
6968 fprintf (stream2, ",");
6970 if (t.level2_size << t.q > 8)
6971 fprintf (stream2, "\n ");
6972 fprintf (stream2, " },\n");
6973 fprintf (stream2, " {");
/* Level 3: the packed 16-bit entries; the default (uint16_t)(-1) is written
   as -1.  */
6974 if (t.level3_size << t.p > 8)
6975 fprintf (stream2, "\n ");
6976 for (i = 0; i < t.level3_size << t.p; i++)
6978 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
6979 if (i > 0 && (i % 8) == 0)
6980 fprintf (stream2, "\n ");
6981 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
6982 if (i+1 < t.level3_size << t.p)
6983 fprintf (stream2, ",");
6985 if (t.level3_size << t.p > 8)
6986 fprintf (stream2, "\n ");
6987 fprintf (stream2, " }\n");
6988 fprintf (stream2, "};\n");
/* Creates FILENAME1 and FILENAME2, writes the "generated automatically"
   banner to both, then calls output_decomposition with the first stream as
   its STREAM1 and the second as its STREAM2.  The argument for the %s in the
   banner is on a line not visible here; presumably it is VERSION.  */
6992 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
6994 const char *filenames[2];
6998 filenames[0] = filename1;
6999 filenames[1] = filename2;
/* Open both output files before anything is written.  */
7001 for (i = 0; i < 2; i++)
7003 streams[i] = fopen (filenames[i], "w");
7004 if (streams[i] == NULL)
7006 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
7011 for (i = 0; i < 2; i++)
7013 FILE *stream = streams[i];
7015 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7016 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
7017 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7019 fprintf (stream, "\n");
7022 output_decomposition (streams[0], streams[1]);
/* fclose is part of the test so that flush-time write errors are seen.  */
7024 for (i = 0; i < 2; i++)
7026 if (ferror (streams[i]) || fclose (streams[i]))
7028 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7034 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
/* Indexed by code point (0..0x10FFFF).  fill_composition_exclusions clears
   every element to 0 and sets the element of each listed code point to 1.  */
7035 char unicode_composition_exclusions[0x110000];
/* Fills unicode_composition_exclusions[] from the file
   COMPOSITIONEXCLUSIONS_FILENAME.  The array is cleared first; then the file
   is read one line at a time (at most 200 characters are taken from a line)
   and the hexadecimal number at the start of each data line is marked as
   excluded.  Problems are reported on stderr; what happens after a report is
   not visible in this listing.  */
7038 fill_composition_exclusions (const char *compositionexclusions_filename)
7043 stream = fopen (compositionexclusions_filename, "r");
7046 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
7050 for (i = 0; i < 0x110000; i++)
7051 unicode_composition_exclusions[i] = 0;
/* The trailing "\n" in the format also consumes the line terminator.  */
7058 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and '#' comment lines are recognized here; presumably they are
   skipped (the statement that follows is not visible).  */
7061 if (buf[0] == '\0' || buf[0] == '#')
7064 if (sscanf (buf, "%X", &i) != 1)
7066 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
/* The value must lie inside the Unicode code space before it is used as an
   array index.  */
7069 if (!(i < 0x110000))
7072 unicode_composition_exclusions[i] = 1;
7075 if (ferror (stream) || fclose (stream))
7077 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
/* Writes a human readable list of composition rules to the file FILENAME.
   All code points 0..0x10FFFF are examined; for a character whose canonical
   decomposition passes the filters below, one tab-separated line is printed
   with three "0x%04X" values and the combining class string of the second
   component.  The three numeric arguments are on lines not visible in this
   listing.  */
7083 debug_output_composition_tables (const char *filename)
7088 stream = fopen (filename, "w");
7091 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7095 for (ch = 0; ch < 0x110000; ch++)
7097 unsigned int length;
7098 unsigned int decomposed[MAX_DECOMP_LENGTH];
7099 int type = get_decomposition (ch, &length, decomposed);
7101 if (type == UC_DECOMP_CANONICAL
7102 /* Consider only binary decompositions.
7103 Exclude singleton decompositions. */
7106 unsigned int code1 = decomposed[0];
7107 unsigned int code2 = decomposed[1];
7108 unsigned int combined = ch;
7110 /* Exclude decompositions where the first part is not a starter,
7111 i.e. is not of canonical combining class 0. */
7112 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7113 /* Exclude characters listed in CompositionExclusions.txt. */
7114 && !unicode_composition_exclusions[combined])
7116 /* The combined character must now also be a starter.
7118 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7121 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7125 unicode_attributes[code2].combining);
7130 if (ferror (stream) || fclose (stream))
7132 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes the canonical composition rules to the file FILENAME as input for
   gperf.  After the banner and the GPL notice come the gperf declarations, a
   "%%" separator line, and one keyword line per composition rule.  A keyword
   is a 4-character string literal that holds code1 and code2 as two
   big-endian bytes each; it is followed by a "0x%04x" value whose argument is
   on a line not visible here (presumably the combined character).  All three
   code points are checked to be below 0x10000 before a rule is written.  */
7138 output_composition_tables (const char *filename, const char *version)
7143 stream = fopen (filename, "w");
7146 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7150 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7151 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7152 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7154 fprintf (stream, "\n");
7156 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7157 still carries the GPL header), and it's gnulib-tool which replaces the
7158 GPL header with an LGPL header. */
7159 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7160 fprintf (stream, "\n");
7161 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7162 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7163 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7164 fprintf (stream, " (at your option) any later version.\n");
7165 fprintf (stream, "\n");
7166 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7167 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7168 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7169 fprintf (stream, " GNU General Public License for more details.\n");
7170 fprintf (stream, "\n");
7171 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7172 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7173 fprintf (stream, "\n");
7175 /* The composition table is a set of mappings (code1, code2) -> combined,
7177 367 values for code1 (from 0x003C to 0x30FD),
7178 54 values for code2 (from 0x0300 to 0x309A).
7179 For a fixed code1, there are from 1 to 19 possible values for code2.
7180 For a fixed code2, there are from 1 to 117 possible values for code1.
7181 This is a very sparse matrix.
7183 We want an O(1) hash lookup.
7185 We could implement the hash lookup by mapping (code1, code2) to a linear
7186 combination mul1*code1 + mul2*code2, which is then used as an index into
7187 a 3-level table. But this leads to a table of size 37 KB.
7189 We use gperf to implement the hash lookup, giving it the 928 sets of
7190 4 bytes (code1, code2) as input. gperf generates a hash table of size
7191 1527, which is quite good (60% filled). It requires an auxiliary table
7192 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
/* From here on the output is gperf input rather than C: "%%" in a format
   string prints a single '%', so these lines come out as gperf declarations
   such as %struct-type, and the last one as the "%%" separator.  */
7194 fprintf (stream, "struct composition_rule { char codes[4]; };\n");
7195 fprintf (stream, "%%struct-type\n");
7196 fprintf (stream, "%%language=ANSI-C\n");
7197 fprintf (stream, "%%define slot-name codes\n");
7198 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7199 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7200 fprintf (stream, "%%compare-lengths\n");
7201 fprintf (stream, "%%compare-strncmp\n");
7202 fprintf (stream, "%%readonly-tables\n");
7203 fprintf (stream, "%%omit-struct-type\n");
7204 fprintf (stream, "%%%%\n");
7206 for (ch = 0; ch < 0x110000; ch++)
7208 unsigned int length;
7209 unsigned int decomposed[MAX_DECOMP_LENGTH];
7210 int type = get_decomposition (ch, &length, decomposed);
7212 if (type == UC_DECOMP_CANONICAL
7213 /* Consider only binary decompositions.
7214 Exclude singleton decompositions. */
7217 unsigned int code1 = decomposed[0];
7218 unsigned int code2 = decomposed[1];
7219 unsigned int combined = ch;
7221 /* Exclude decompositions where the first part is not a starter,
7222 i.e. is not of canonical combining class 0. */
7223 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7224 /* Exclude characters listed in CompositionExclusions.txt. */
7225 && !unicode_composition_exclusions[combined])
7227 /* The combined character must now also be a starter.
7229 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7232 if (!(code1 < 0x10000))
7234 if (!(code2 < 0x10000))
7236 if (!(combined < 0x10000))
7239 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7240 (code1 >> 8) & 0xff, code1 & 0xff,
7241 (code2 >> 8) & 0xff, code2 & 0xff,
7247 if (ferror (stream) || fclose (stream))
7249 fprintf (stderr, "error writing to '%s'\n", filename);
7254 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the C source file to create.
   FUNCTION_NAME is the name of the function under test; it is substituted
   into the MAP(c) macro of the generated file.
   FUNC is this program's own implementation of the mapping: every code point
   that FUNC does not map to itself is listed as an { input, output } pair.
   VERSION is the Unicode version string mentioned in the generated header.
   Exits with status 1 if FILENAME cannot be opened or written.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character mapping functions.\n");
  fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  /* Name the program that really produces this file, as output_casing_rules
     does, so that readers of the generated file look in the right place.  */
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-mapping-part1.h\"\n");
  fprintf (stream, "\n");

  /* Emit one table entry per code point that is actually changed by FUNC.
     The separator is written before each entry except the first, so that
     the list carries no trailing comma.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int value = func (ch);

      if (value != ch)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fprintf (stream, "#include \"test-mapping-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7325 /* Construction of sparse 3-level tables. */
7326 #define TABLE mapping_table
7327 #define ELEMENT int32_t
7329 #define xmalloc malloc
7330 #define xrealloc realloc
7333 /* Output a simple character mapping table to the given file. */
7336 output_simple_mapping (const char *filename,
7337 unsigned int (*func) (unsigned int),
7338 const char *version)
7342 struct mapping_table t;
7343 unsigned int level1_offset, level2_offset, level3_offset;
7345 stream = fopen (filename, "w");
7348 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7352 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7353 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
7354 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
7359 mapping_table_init (&t);
7361 for (ch = 0; ch < 0x110000; ch++)
7363 int value = (int) func (ch) - (int) ch;
7365 mapping_table_add (&t, ch, value);
7368 mapping_table_finalize (&t);
7370 /* Offsets in t.result, in memory of this process. */
7372 5 * sizeof (uint32_t);
7374 5 * sizeof (uint32_t)
7375 + t.level1_size * sizeof (uint32_t);
7377 5 * sizeof (uint32_t)
7378 + t.level1_size * sizeof (uint32_t)
7379 + (t.level2_size << t.q) * sizeof (uint32_t);
7381 for (i = 0; i < 5; i++)
7382 fprintf (stream, "#define mapping_header_%d %d\n", i,
7383 ((uint32_t *) t.result)[i]);
7384 fprintf (stream, "static const\n");
7385 fprintf (stream, "struct\n");
7386 fprintf (stream, " {\n");
7387 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7388 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7389 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
7390 fprintf (stream, " }\n");
7391 fprintf (stream, "u_mapping =\n");
7392 fprintf (stream, "{\n");
7393 fprintf (stream, " {");
7394 if (t.level1_size > 8)
7395 fprintf (stream, "\n ");
7396 for (i = 0; i < t.level1_size; i++)
7399 if (i > 0 && (i % 8) == 0)
7400 fprintf (stream, "\n ");
7401 offset = ((uint32_t *) (t.result + level1_offset))[i];
7403 fprintf (stream, " %5d", -1);
7405 fprintf (stream, " %5zu",
7406 (offset - level2_offset) / sizeof (uint32_t));
7407 if (i+1 < t.level1_size)
7408 fprintf (stream, ",");
7410 if (t.level1_size > 8)
7411 fprintf (stream, "\n ");
7412 fprintf (stream, " },\n");
7413 fprintf (stream, " {");
7414 if (t.level2_size << t.q > 8)
7415 fprintf (stream, "\n ");
7416 for (i = 0; i < t.level2_size << t.q; i++)
7419 if (i > 0 && (i % 8) == 0)
7420 fprintf (stream, "\n ");
7421 offset = ((uint32_t *) (t.result + level2_offset))[i];
7423 fprintf (stream, " %5d", -1);
7425 fprintf (stream, " %5zu",
7426 (offset - level3_offset) / sizeof (int32_t));
7427 if (i+1 < t.level2_size << t.q)
7428 fprintf (stream, ",");
7430 if (t.level2_size << t.q > 8)
7431 fprintf (stream, "\n ");
7432 fprintf (stream, " },\n");
7433 fprintf (stream, " {");
7434 if (t.level3_size << t.p > 8)
7435 fprintf (stream, "\n ");
7436 for (i = 0; i < t.level3_size << t.p; i++)
7438 if (i > 0 && (i % 8) == 0)
7439 fprintf (stream, "\n ");
7440 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
7441 if (i+1 < t.level3_size << t.p)
7442 fprintf (stream, ",");
7444 if (t.level3_size << t.p > 8)
7445 fprintf (stream, "\n ");
7446 fprintf (stream, " }\n");
7447 fprintf (stream, "};\n");
7449 if (ferror (stream) || fclose (stream))
7451 fprintf (stderr, "error writing to '%s'\n", filename);
7456 /* ========================================================================= */
/* A special casing context.
   A context is negated through x -> -x; a negative value therefore stands
   for the "Not_" form of the corresponding positive context.  */
enum
{
  SCC_ALWAYS            = 0,  /* no context condition */
  SCC_FINAL_SIGMA       = 1,  /* "Final_Sigma" */
  SCC_AFTER_SOFT_DOTTED = 2,  /* "After_Soft_Dotted" */
  SCC_MORE_ABOVE        = 3,  /* "More_Above" */
  SCC_BEFORE_DOT        = 4,  /* "Before_Dot" */
  SCC_AFTER_I           = 5   /* "After_I" */
};
/* A special casing rule.
   Each mapping holds up to 3 code points; unused trailing entries are 0.  */
struct special_casing_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* Its lowercase, titlecase, uppercase and casefolded replacements.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* Two-letter language code the rule is restricted to, or NULL when the
     rule applies to all languages.  */
  const char *language;
  /* One of the SCC_* context values, or its negation.  */
  int context;
};
/* The special casing rules: a growable array of NUM_CASING_RULES pointers,
   with room for ALLOCATED_CASING_RULES of them.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules.  The array stores the pointer itself,
   not a copy of the rule.  The capacity is doubled (starting at 16) whenever
   the array is full.  Exits with status 1 if memory cannot be obtained.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Keep the old array reachable until the reallocation has succeeded,
         and never store through a null pointer.  */
      new_rules = realloc (casing_rules, new_allocated * sizeof *new_rules);
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
7502 /* Stores in casing_rules the special casing rules found in
7503 specialcasing_filename. */
7505 fill_casing_rules (const char *specialcasing_filename)
7509 stream = fopen (specialcasing_filename, "r");
7512 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
7516 casing_rules = NULL;
7517 num_casing_rules = 0;
7518 allocated_casing_rules = 0;
7528 unsigned int lower_mapping[3];
7529 unsigned int title_mapping[3];
7530 unsigned int upper_mapping[3];
7534 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7537 if (buf[0] == '\0' || buf[0] == '#')
7542 code = strtoul (scanptr, &endptr, 16);
7543 if (endptr == scanptr)
7545 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7549 if (*scanptr != ';')
7551 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7556 /* Scan lower mapping. */
7557 for (i = 0; i < 3; i++)
7558 lower_mapping[i] = 0;
7559 for (i = 0; i < 3; i++)
7561 while (*scanptr == ' ')
7563 if (*scanptr == ';')
7565 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
7566 if (endptr == scanptr)
7568 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7573 if (*scanptr != ';')
7575 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7580 /* Scan title mapping. */
7581 for (i = 0; i < 3; i++)
7582 title_mapping[i] = 0;
7583 for (i = 0; i < 3; i++)
7585 while (*scanptr == ' ')
7587 if (*scanptr == ';')
7589 title_mapping[i] = strtoul (scanptr, &endptr, 16);
7590 if (endptr == scanptr)
7592 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7597 if (*scanptr != ';')
7599 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7604 /* Scan upper mapping. */
7605 for (i = 0; i < 3; i++)
7606 upper_mapping[i] = 0;
7607 for (i = 0; i < 3; i++)
7609 while (*scanptr == ' ')
7611 if (*scanptr == ';')
7613 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
7614 if (endptr == scanptr)
7616 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7621 if (*scanptr != ';')
7623 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7628 /* Scan language and context. */
7630 context = SCC_ALWAYS;
7631 while (*scanptr == ' ')
7633 if (*scanptr != '\0' && *scanptr != '#')
7635 const char *word_begin = scanptr;
7636 const char *word_end;
7638 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7642 while (*scanptr == ' ')
7645 if (word_end - word_begin == 2)
7647 language = (char *) malloc ((word_end - word_begin) + 1);
7648 memcpy (language, word_begin, 2);
7649 language[word_end - word_begin] = '\0';
7650 word_begin = word_end = NULL;
7652 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7654 word_begin = scanptr;
7655 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
7661 if (word_end > word_begin)
7663 bool negate = false;
7665 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
7670 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
7671 context = SCC_FINAL_SIGMA;
7672 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
7673 context = SCC_AFTER_SOFT_DOTTED;
7674 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
7675 context = SCC_MORE_ABOVE;
7676 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
7677 context = SCC_BEFORE_DOT;
7678 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
7679 context = SCC_AFTER_I;
7682 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
7686 context = - context;
7689 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
7691 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
7696 /* Store the rule. */
7698 struct special_casing_rule *new_rule =
7699 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
7700 new_rule->code = code;
7701 new_rule->language = language;
7702 new_rule->context = context;
7703 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
7704 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
7705 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
7707 add_casing_rule (new_rule);
7711 if (ferror (stream) || fclose (stream))
7713 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
/* A casefolding rule.  */
struct casefold_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* Its casefolded replacement: up to 3 code points, unused entries 0.  */
  unsigned int mapping[3];
  /* Two-letter language code the rule is restricted to, or NULL when the
     rule applies to all languages.  */
  const char *language;
};

/* The casefolding rules: a growable array of NUM_CASEFOLDING_RULES pointers,
   with room for ALLOCATED_CASEFOLDING_RULES of them.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;
7731 /* Stores in casefolding_rules the case folding rules found in
7732 casefolding_filename. */
7734 fill_casefolding_rules (const char *casefolding_filename)
7738 stream = fopen (casefolding_filename, "r");
7741 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
7745 casefolding_rules = NULL;
7746 num_casefolding_rules = 0;
7747 allocated_casefolding_rules = 0;
7758 unsigned int mapping[3];
7760 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
7763 if (buf[0] == '\0' || buf[0] == '#')
7768 code = strtoul (scanptr, &endptr, 16);
7769 if (endptr == scanptr)
7771 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
7775 if (*scanptr != ';')
7777 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
7783 while (*scanptr == ' ')
7788 case 'C': case 'F': case 'S': case 'T':
7792 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
7796 if (*scanptr != ';')
7798 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
7803 /* Scan casefold mapping. */
7804 for (i = 0; i < 3; i++)
7806 for (i = 0; i < 3; i++)
7808 while (*scanptr == ' ')
7810 if (*scanptr == ';')
7812 mapping[i] = strtoul (scanptr, &endptr, 16);
7813 if (endptr == scanptr)
7815 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
7820 if (*scanptr != ';')
7822 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
7827 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
7830 const char * const *languages;
7831 unsigned int languages_count;
7833 /* Type 'T' indicates that the rule is applicable to Turkish
7837 static const char * const turkish_languages[] = { "tr", "az" };
7838 languages = turkish_languages;
7839 languages_count = 2;
7843 static const char * const all_languages[] = { NULL };
7844 languages = all_languages;
7845 languages_count = 1;
7848 for (i = 0; i < languages_count; i++)
7850 /* Store a new rule. */
7851 struct casefold_rule *new_rule =
7852 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
7853 new_rule->code = code;
7854 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
7855 new_rule->language = languages[i];
7857 if (num_casefolding_rules == allocated_casefolding_rules)
7859 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
7860 if (allocated_casefolding_rules < 16)
7861 allocated_casefolding_rules = 16;
7863 (struct casefold_rule **)
7864 realloc (casefolding_rules,
7865 allocated_casefolding_rules * sizeof (struct casefold_rule *));
7867 casefolding_rules[num_casefolding_rules++] = new_rule;
7872 if (ferror (stream) || fclose (stream))
7874 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
/* Casefold mapping, when it maps to a single character.
   Indexed by code point.  */
unsigned int unicode_casefold[0x110000];

/* Returns the single-character casefold mapping of CH, as recorded in
   unicode_casefold.  A value that is not a valid index into that table is
   returned unchanged, instead of reading beyond the end of the table; this
   matters because rule codes are read from input files without a range
   check.  */
static unsigned int
to_casefold (unsigned int ch)
{
  if (!(ch < 0x110000))
    return ch;
  return unicode_casefold[ch];
}
7888 /* Redistribute the casefolding_rules:
7889 - Rules that map to a single character, language independently, are stored
7890 in unicode_casefold.
7891 - Other rules are merged into casing_rules. */
7893 redistribute_casefolding_rules (void)
7895 unsigned int ch, i, j;
7897 /* Fill unicode_casefold[]. */
7898 for (ch = 0; ch < 0x110000; ch++)
7899 unicode_casefold[ch] = ch;
7900 for (i = 0; i < num_casefolding_rules; i++)
7902 struct casefold_rule *cfrule = casefolding_rules[i];
7904 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
7907 if (!(ch < 0x110000))
7909 unicode_casefold[ch] = cfrule->mapping[0];
7913 /* Extend the special casing rules by filling in their casefold_mapping[]
7915 for (j = 0; j < num_casing_rules; j++)
7917 struct special_casing_rule *rule = casing_rules[j];
7920 rule->casefold_mapping[0] = to_casefold (rule->code);
7921 for (k = 1; k < 3; k++)
7922 rule->casefold_mapping[k] = 0;
7925 /* Now merge the other casefolding rules into casing_rules. */
7926 for (i = 0; i < num_casefolding_rules; i++)
7928 struct casefold_rule *cfrule = casefolding_rules[i];
7930 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
7932 /* Find a rule that applies to the same code, same language, and it
7933 has context SCC_ALWAYS. At the same time, update all rules that
7934 have the same code and same or more specific language. */
7935 struct special_casing_rule *found_rule = NULL;
7937 for (j = 0; j < num_casing_rules; j++)
7939 struct special_casing_rule *rule = casing_rules[j];
7941 if (rule->code == cfrule->code
7942 && (cfrule->language == NULL
7943 || (rule->language != NULL
7944 && strcmp (rule->language, cfrule->language) == 0)))
7946 memcpy (rule->casefold_mapping, cfrule->mapping,
7947 sizeof (rule->casefold_mapping));
7949 if ((cfrule->language == NULL
7950 ? rule->language == NULL
7951 : rule->language != NULL
7952 && strcmp (rule->language, cfrule->language) == 0)
7953 && rule->context == SCC_ALWAYS)
7961 if (found_rule == NULL)
7963 /* Create a new rule. */
7964 struct special_casing_rule *new_rule =
7965 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
7967 /* Try to find a rule that applies to the same code, no language
7968 restriction, and with context SCC_ALWAYS. */
7969 for (j = 0; j < num_casing_rules; j++)
7971 struct special_casing_rule *rule = casing_rules[j];
7973 if (rule->code == cfrule->code
7974 && rule->context == SCC_ALWAYS
7975 && rule->language == NULL)
7983 new_rule->code = cfrule->code;
7984 new_rule->language = cfrule->language;
7985 new_rule->context = SCC_ALWAYS;
7986 if (found_rule != NULL)
7988 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
7989 sizeof (new_rule->lower_mapping));
7990 memcpy (new_rule->title_mapping, found_rule->title_mapping,
7991 sizeof (new_rule->title_mapping));
7992 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
7993 sizeof (new_rule->upper_mapping));
7999 new_rule->lower_mapping[0] = to_lower (cfrule->code);
8000 for (k = 1; k < 3; k++)
8001 new_rule->lower_mapping[k] = 0;
8002 new_rule->title_mapping[0] = to_title (cfrule->code);
8003 for (k = 1; k < 3; k++)
8004 new_rule->title_mapping[k] = 0;
8005 new_rule->upper_mapping[0] = to_upper (cfrule->code);
8006 for (k = 1; k < 3; k++)
8007 new_rule->upper_mapping[k] = 0;
8009 memcpy (new_rule->casefold_mapping, cfrule->mapping,
8010 sizeof (new_rule->casefold_mapping));
8012 add_casing_rule (new_rule);
8019 compare_casing_rules (const void *a, const void *b)
8021 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
8022 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
8023 unsigned int a_code = a_rule->code;
8024 unsigned int b_code = b_rule->code;
8026 if (a_code < b_code)
8028 if (a_code > b_code)
8031 /* Sort the more specific rules before the more general ones. */
8032 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
8033 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
8037 sort_casing_rules (void)
8039 /* Sort the rules 1. by code, 2. by specificity. */
8040 if (num_casing_rules > 1)
8041 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
8042 compare_casing_rules);
8045 /* Output the special casing rules. */
8047 output_casing_rules (const char *filename, const char *version)
8053 stream = fopen (filename, "w");
8056 fprintf (stderr, "cannot open '%s' for writing\n", filename);
8060 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
8061 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
8062 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
8064 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
8065 fprintf (stream, "%%struct-type\n");
8066 fprintf (stream, "%%language=ANSI-C\n");
8067 fprintf (stream, "%%define slot-name code\n");
8068 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
8069 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
8070 fprintf (stream, "%%compare-lengths\n");
8071 fprintf (stream, "%%compare-strncmp\n");
8072 fprintf (stream, "%%readonly-tables\n");
8073 fprintf (stream, "%%omit-struct-type\n");
8074 fprintf (stream, "%%%%\n");
8077 for (i = 0; i < num_casing_rules; i++)
8079 struct special_casing_rule *rule = casing_rules[i];
8082 if (i > 0 && rule->code == casing_rules[i - 1]->code)
8087 if (!(rule->code < 0x10000))
8089 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
8093 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
8094 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
8096 fprintf (stream, "%d, ",
8097 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
8099 context = rule->context;
8102 fprintf (stream, "-");
8103 context = - context;
8106 fprintf (stream, " ");
8110 fprintf (stream, "SCC_ALWAYS ");
8112 case SCC_FINAL_SIGMA:
8113 fprintf (stream, "SCC_FINAL_SIGMA ");
8115 case SCC_AFTER_SOFT_DOTTED:
8116 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
8118 case SCC_MORE_ABOVE:
8119 fprintf (stream, "SCC_MORE_ABOVE ");
8121 case SCC_BEFORE_DOT:
8122 fprintf (stream, "SCC_BEFORE_DOT ");
8125 fprintf (stream, "SCC_AFTER_I ");
8130 fprintf (stream, ", ");
8132 if (rule->language != NULL)
8134 if (strlen (rule->language) != 2)
8136 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
8139 fprintf (stream, "{ '\\0', '\\0' }, ");
8141 fprintf (stream, "{ ");
8142 for (j = 0; j < 3; j++)
8145 fprintf (stream, ", ");
8146 if (!(rule->upper_mapping[j] < 0x10000))
8148 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
8151 if (rule->upper_mapping[j] != 0)
8152 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
8154 fprintf (stream, " 0");
8156 fprintf (stream, " }, { ");
8157 for (j = 0; j < 3; j++)
8160 fprintf (stream, ", ");
8161 if (!(rule->lower_mapping[j] < 0x10000))
8163 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
8166 if (rule->lower_mapping[j] != 0)
8167 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
8169 fprintf (stream, " 0");
8171 fprintf (stream, " }, { ");
8172 for (j = 0; j < 3; j++)
8175 fprintf (stream, ", ");
8176 if (!(rule->title_mapping[j] < 0x10000))
8178 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
8181 if (rule->title_mapping[j] != 0)
8182 fprintf (stream, "0x%04X", rule->title_mapping[j]);
8184 fprintf (stream, " 0");
8186 fprintf (stream, " }, { ");
8187 for (j = 0; j < 3; j++)
8190 fprintf (stream, ", ");
8191 if (!(rule->casefold_mapping[j] < 0x10000))
8193 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
8196 if (rule->casefold_mapping[j] != 0)
8197 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
8199 fprintf (stream, " 0");
8201 fprintf (stream, " }\n");
8204 if (ferror (stream) || fclose (stream))
8206 fprintf (stderr, "error writing to '%s'\n", filename);
8211 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  /* The three tests are made in the order of the definition, stopping at
     the first one that holds.  */
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
8225 /* Quoting the Unicode standard:
8226 Definition: A character is defined to be "case-ignorable" if it has the
8227 value MidLetter {or the value MidNumLet} for the Word_Break property or
8228 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
8229 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
8230 The text marked in braces was added in Unicode 5.1.0, see
8231 <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
8232 Definition of case-ignorable". */
8233 /* Since this predicate is only used for the "Before C" and "After C"
8234 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
8235 This simplifies the evaluation of the regular expressions
8236 \p{cased} (\p{case-ignorable})* C
8238 C (\p{case-ignorable})* \p{cased}
8241 is_case_ignorable (unsigned int ch)
8243 return (unicode_org_wbp[ch] == WBP_MIDLETTER
8244 || unicode_org_wbp[ch] == WBP_MIDNUMLET
8245 || is_category_Mn (ch)
8246 || is_category_Me (ch)
8247 || is_category_Cf (ch)
8248 || is_category_Lm (ch)
8249 || is_category_Sk (ch))
8253 /* ------------------------------------------------------------------------- */
8255 /* Output all case related properties. */
8257 output_casing_properties (const char *version)
8259 #define PROPERTY(FN,P) \
8260 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
8261 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
8262 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
8263 PROPERTY(cased, cased)
8264 PROPERTY(ignorable, case_ignorable)
8268 /* ========================================================================= */
8271 main (int argc, char * argv[])
8273 const char *unicodedata_filename;
8274 const char *proplist_filename;
8275 const char *derivedproplist_filename;
8276 const char *scripts_filename;
8277 const char *blocks_filename;
8278 const char *proplist30_filename;
8279 const char *eastasianwidth_filename;
8280 const char *linebreak_filename;
8281 const char *wordbreakproperty_filename;
8282 const char *compositionexclusions_filename;
8283 const char *specialcasing_filename;
8284 const char *casefolding_filename;
8285 const char *version;
8289 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
8294 unicodedata_filename = argv[1];
8295 proplist_filename = argv[2];
8296 derivedproplist_filename = argv[3];
8297 scripts_filename = argv[4];
8298 blocks_filename = argv[5];
8299 proplist30_filename = argv[6];
8300 eastasianwidth_filename = argv[7];
8301 linebreak_filename = argv[8];
8302 wordbreakproperty_filename = argv[9];
8303 compositionexclusions_filename = argv[10];
8304 specialcasing_filename = argv[11];
8305 casefolding_filename = argv[12];
8308 fill_attributes (unicodedata_filename);
8309 clear_properties ();
8310 fill_properties (proplist_filename);
8311 fill_properties (derivedproplist_filename);
8312 fill_properties30 (proplist30_filename);
8313 fill_scripts (scripts_filename);
8314 fill_blocks (blocks_filename);
8315 fill_width (eastasianwidth_filename);
8316 fill_org_lbp (linebreak_filename);
8317 fill_org_wbp (wordbreakproperty_filename);
8318 fill_composition_exclusions (compositionexclusions_filename);
8319 fill_casing_rules (specialcasing_filename);
8320 fill_casefolding_rules (casefolding_filename);
8321 redistribute_casefolding_rules ();
8322 sort_casing_rules ();
8324 output_categories (version);
8325 output_category ("unictype/categ_of.h", version);
8326 output_combclass ("unictype/combining.h", version);
8327 output_bidi_category ("unictype/bidi_of.h", version);
8328 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
8329 output_decimal_digit ("unictype/decdigit.h", version);
8330 output_digit_test ("../tests/unictype/test-digit.h", version);
8331 output_digit ("unictype/digit.h", version);
8332 output_numeric_test ("../tests/unictype/test-numeric.h", version);
8333 output_numeric ("unictype/numeric.h", version);
8334 output_mirror ("unictype/mirror.h", version);
8335 output_properties (version);
8336 output_scripts (version);
8337 output_scripts_byname (version);
8338 output_blocks (version);
8339 output_ident_properties (version);
8340 output_old_ctype (version);
8342 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
8343 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
8344 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
8346 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
8347 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
8348 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
8350 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
8351 debug_output_composition_tables ("uninorm/composition.txt");
8352 output_composition_tables ("uninorm/composition-table.gperf", version);
8354 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
8355 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
8356 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
8357 output_simple_mapping ("unicase/toupper.h", to_upper, version);
8358 output_simple_mapping ("unicase/tolower.h", to_lower, version);
8359 output_simple_mapping ("unicase/totitle.h", to_title, version);
8360 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
8361 output_casing_rules ("unicase/special-casing-table.gperf", version);
8362 output_casing_properties (version);
8368 * For Emacs M-x compile
8370 * compile-command: "
8371 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
8373 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
8374 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
8375 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
8376 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
8377 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
8378 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
8379 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
8380 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
8381 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
8382 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
8383 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
8384 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \