1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Usage example:
21 $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/PropList.txt \
23 /usr/local/share/Unidata/DerivedCoreProperties.txt \
24 /usr/local/share/Unidata/Scripts.txt \
25 /usr/local/share/Unidata/Blocks.txt \
26 /usr/local/share/Unidata/PropList-3.0.1.txt \
27 /usr/local/share/Unidata/EastAsianWidth.txt \
28 /usr/local/share/Unidata/LineBreak.txt \
29 /usr/local/share/Unidata/WordBreakProperty.txt \
30 /usr/local/share/Unidata/CompositionExclusions.txt \
   [the remaining arguments of this invocation are missing from this copy] */
41 /* ========================================================================= */
43 /* Reading UnicodeData.txt. */
46 /* This structure represents one line in the UnicodeData.txt file.
   fill_attribute stores copies of the textual fields (or "" when a field
   is empty) and parses the three case mappings as hexadecimal numbers. */
47 struct unicode_attribute
49 const char *name; /* Character name; NULL marks a code point without an entry */
50 const char *category; /* General category */
51 const char *combining; /* Canonical combining class */
52 const char *bidi; /* Bidirectional category */
53 const char *decomposition; /* Character decomposition mapping */
54 const char *decdigit; /* Decimal digit value */
55 const char *digit; /* Digit value */
56 const char *numeric; /* Numeric value */
57 bool mirrored; /* mirrored: true when that field starts with 'Y' */
58 const char *oldname; /* Old Unicode 1.0 name */
59 const char *comment; /* Comment */
60 unsigned int upper; /* Uppercase mapping, NONE when the field is empty */
61 unsigned int lower; /* Lowercase mapping, NONE when the field is empty */
62 unsigned int title; /* Titlecase mapping, NONE when the field is empty */
65 /* Missing fields are represented with "" for strings, and NONE for unsigned int fields. */
/* Sentinel for an absent numeric field: an unsigned int with all bits set. */
67 #define NONE (~(unsigned int)0)
69 /* The entire contents of the UnicodeData.txt file.
   One slot per code point, 0x110000 slots in total; the slot index is the
   code point itself. */
70 struct unicode_attribute unicode_attributes [0x110000];
72 /* Stores in unicode_attributes[i] the values from the given fields.
   field1 .. field14 are assigned, in this order, to: name, category,
   combining, bidi, decomposition, decdigit, digit, numeric, mirrored,
   oldname, comment, upper, lower, title. */
74 fill_attribute (unsigned int i,
75 const char *field1, const char *field2,
76 const char *field3, const char *field4,
77 const char *field5, const char *field6,
78 const char *field7, const char *field8,
79 const char *field9, const char *field10,
80 const char *field11, const char *field12,
81 const char *field13, const char *field14)
83 struct unicode_attribute * uni;
/* NOTE(review): the test guarding this diagnostic is not visible here;
   presumably it rejects i >= 0x110000, the size of the table. Confirm. */
87 fprintf (stderr, "index too large\n");
90 if (strcmp (field2, "Cs") == 0)
91 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
93 uni = &unicode_attributes[i];
94 /* Copy the strings.
   The name is always duplicated. Every other empty field shares the literal
   "" instead of a heap copy. The strdup results are not checked for NULL. */
95 uni->name = strdup (field1);
96 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
97 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
98 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
99 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
100 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
101 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
102 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
/* Only the first character of the mirrored field is inspected. */
103 uni->mirrored = (field9[0] == 'Y');
104 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
105 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
/* Case mappings are parsed as hexadecimal; an empty field becomes NONE. */
106 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
107 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
108 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
111 /* Maximum length of a field in the UnicodeData.txt file. */
114 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
115 Reads up to (but excluding) DELIM.
116 Returns 1 when a field was successfully read, otherwise 0. */
118 getfield (FILE *stream, char *buffer, int delim)
/* Consume one character per iteration until end of file or DELIM. */
123 for (; (c = getc (stream)), (c != EOF && c != delim); )
125 /* The original unicode.org UnicodeData.txt file happens to have
126 CR/LF line terminators. Silently convert to LF. */
130 /* Put c into the buffer. */
/* count is incremented before the comparison, so the diagnostic fires as
   soon as the field would hold FIELDLEN - 1 characters. */
131 if (++count >= FIELDLEN - 1)
133 fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
146 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. */
/* Reads UNICODEDATA_FILENAME and records its entries through fill_attribute.
   Each line consists of 15 fields: 14 terminated by a semicolon and the
   last one terminated by the newline. Two consecutive lines whose names end
   in ", First>" and ", Last>" describe a range of code points. */
149 fill_attributes (const char *unicodedata_filename)
/* One buffer per field of a line; field0 receives the code point. */
153 char field0[FIELDLEN];
154 char field1[FIELDLEN];
155 char field2[FIELDLEN];
156 char field3[FIELDLEN];
157 char field4[FIELDLEN];
158 char field5[FIELDLEN];
159 char field6[FIELDLEN];
160 char field7[FIELDLEN];
161 char field8[FIELDLEN];
162 char field9[FIELDLEN];
163 char field10[FIELDLEN];
164 char field11[FIELDLEN];
165 char field12[FIELDLEN];
166 char field13[FIELDLEN];
167 char field14[FIELDLEN];
/* A NULL name marks a code point as having no entry; start with none. */
170 for (i = 0; i < 0x110000; i++)
171 unicode_attributes[i].name = NULL;
173 stream = fopen (unicodedata_filename, "r");
176 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
/* n accumulates the getfield results, i.e. the number of fields read. */
185 n = getfield (stream, field0, ';');
186 n += getfield (stream, field1, ';');
187 n += getfield (stream, field2, ';');
188 n += getfield (stream, field3, ';');
189 n += getfield (stream, field4, ';');
190 n += getfield (stream, field5, ';');
191 n += getfield (stream, field6, ';');
192 n += getfield (stream, field7, ';');
193 n += getfield (stream, field8, ';');
194 n += getfield (stream, field9, ';');
195 n += getfield (stream, field10, ';');
196 n += getfield (stream, field11, ';');
197 n += getfield (stream, field12, ';');
198 n += getfield (stream, field13, ';');
199 n += getfield (stream, field14, '\n');
204 fprintf (stderr, "short line in '%s':%d\n",
205 unicodedata_filename, lineno);
/* The code point in field0 is written in hexadecimal. */
208 i = strtoul (field0, NULL, 16);
/* A name of at least 9 characters whose last 8 are ", First>" opens a
   range. NOTE(review): the first operand of this condition is not visible
   here. */
210 && strlen (field1) >= 9
211 && strcmp (field1 + strlen(field1) - 8, ", First>") == 0)
213 /* Deal with a range. */
/* Read the following line, which has to close the range. */
215 n = getfield (stream, field0, ';');
216 n += getfield (stream, field1, ';');
217 n += getfield (stream, field2, ';');
218 n += getfield (stream, field3, ';');
219 n += getfield (stream, field4, ';');
220 n += getfield (stream, field5, ';');
221 n += getfield (stream, field6, ';');
222 n += getfield (stream, field7, ';');
223 n += getfield (stream, field8, ';');
224 n += getfield (stream, field9, ';');
225 n += getfield (stream, field10, ';');
226 n += getfield (stream, field11, ';');
227 n += getfield (stream, field12, ';');
228 n += getfield (stream, field13, ';');
229 n += getfield (stream, field14, '\n');
232 fprintf (stderr, "missing end range in '%s':%d\n",
233 unicodedata_filename, lineno);
/* The closing line must carry a name of the form "<..., Last>". */
236 if (!(field1[0] == '<'
237 && strlen (field1) >= 8
238 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
240 fprintf (stderr, "missing end range in '%s':%d\n",
241 unicodedata_filename, lineno);
/* Cut the 7 characters of ", Last>" off the end of the name. */
244 field1[strlen (field1) - 7] = '\0';
/* j is the code point of the closing line, again in hexadecimal. */
245 j = strtoul (field0, NULL, 16);
/* field1+1 skips the leading '<' of the range name.
   NOTE(review): presumably executed for every code point from i to j; the
   enclosing loop is not visible here. */
247 fill_attribute (i, field1+1, field2, field3, field4, field5,
248 field6, field7, field8, field9, field10,
249 field11, field12, field13, field14);
253 /* Single character line */
254 fill_attribute (i, field1, field2, field3, field4, field5,
255 field6, field7, field8, field9, field10,
256 field11, field12, field13, field14);
/* fclose is only called when ferror reports no error on the stream. */
259 if (ferror (stream) || fclose (stream))
261 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
266 /* ========================================================================= */
268 /* General category. */
269 /* See Unicode 3.0 book, section 4.5. */
/* Tests whether CH has an entry in unicode_attributes[] (name != NULL)
   whose category string begins with 'L'. */
273 is_category_L (unsigned int ch)
275 return (unicode_attributes[ch].name != NULL
276 && unicode_attributes[ch].category[0] == 'L');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Lu"; the two characters are compared one by one. */
280 is_category_Lu (unsigned int ch)
282 return (unicode_attributes[ch].name != NULL
283 && unicode_attributes[ch].category[0] == 'L'
284 && unicode_attributes[ch].category[1] == 'u');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Ll". */
288 is_category_Ll (unsigned int ch)
290 return (unicode_attributes[ch].name != NULL
291 && unicode_attributes[ch].category[0] == 'L'
292 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Lt". */
296 is_category_Lt (unsigned int ch)
298 return (unicode_attributes[ch].name != NULL
299 && unicode_attributes[ch].category[0] == 'L'
300 && unicode_attributes[ch].category[1] == 't');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Lm". */
304 is_category_Lm (unsigned int ch)
306 return (unicode_attributes[ch].name != NULL
307 && unicode_attributes[ch].category[0] == 'L'
308 && unicode_attributes[ch].category[1] == 'm');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Lo". */
312 is_category_Lo (unsigned int ch)
314 return (unicode_attributes[ch].name != NULL
315 && unicode_attributes[ch].category[0] == 'L'
316 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes[] (name != NULL)
   whose category string begins with 'M'. */
320 is_category_M (unsigned int ch)
322 return (unicode_attributes[ch].name != NULL
323 && unicode_attributes[ch].category[0] == 'M');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Mn". */
327 is_category_Mn (unsigned int ch)
329 return (unicode_attributes[ch].name != NULL
330 && unicode_attributes[ch].category[0] == 'M'
331 && unicode_attributes[ch].category[1] == 'n');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Mc". */
335 is_category_Mc (unsigned int ch)
337 return (unicode_attributes[ch].name != NULL
338 && unicode_attributes[ch].category[0] == 'M'
339 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Me". */
343 is_category_Me (unsigned int ch)
345 return (unicode_attributes[ch].name != NULL
346 && unicode_attributes[ch].category[0] == 'M'
347 && unicode_attributes[ch].category[1] == 'e');
/* Tests whether CH has an entry in unicode_attributes[] (name != NULL)
   whose category string begins with 'N'. */
351 is_category_N (unsigned int ch)
353 return (unicode_attributes[ch].name != NULL
354 && unicode_attributes[ch].category[0] == 'N');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Nd". */
358 is_category_Nd (unsigned int ch)
360 return (unicode_attributes[ch].name != NULL
361 && unicode_attributes[ch].category[0] == 'N'
362 && unicode_attributes[ch].category[1] == 'd');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Nl". */
366 is_category_Nl (unsigned int ch)
368 return (unicode_attributes[ch].name != NULL
369 && unicode_attributes[ch].category[0] == 'N'
370 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "No". */
374 is_category_No (unsigned int ch)
376 return (unicode_attributes[ch].name != NULL
377 && unicode_attributes[ch].category[0] == 'N'
378 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes[] (name != NULL)
   whose category string begins with 'P'. */
382 is_category_P (unsigned int ch)
384 return (unicode_attributes[ch].name != NULL
385 && unicode_attributes[ch].category[0] == 'P');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Pc". */
389 is_category_Pc (unsigned int ch)
391 return (unicode_attributes[ch].name != NULL
392 && unicode_attributes[ch].category[0] == 'P'
393 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Pd". */
397 is_category_Pd (unsigned int ch)
399 return (unicode_attributes[ch].name != NULL
400 && unicode_attributes[ch].category[0] == 'P'
401 && unicode_attributes[ch].category[1] == 'd');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Ps". */
405 is_category_Ps (unsigned int ch)
407 return (unicode_attributes[ch].name != NULL
408 && unicode_attributes[ch].category[0] == 'P'
409 && unicode_attributes[ch].category[1] == 's');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Pe". */
413 is_category_Pe (unsigned int ch)
415 return (unicode_attributes[ch].name != NULL
416 && unicode_attributes[ch].category[0] == 'P'
417 && unicode_attributes[ch].category[1] == 'e');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Pi". */
421 is_category_Pi (unsigned int ch)
423 return (unicode_attributes[ch].name != NULL
424 && unicode_attributes[ch].category[0] == 'P'
425 && unicode_attributes[ch].category[1] == 'i');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Pf". */
429 is_category_Pf (unsigned int ch)
431 return (unicode_attributes[ch].name != NULL
432 && unicode_attributes[ch].category[0] == 'P'
433 && unicode_attributes[ch].category[1] == 'f');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Po". */
437 is_category_Po (unsigned int ch)
439 return (unicode_attributes[ch].name != NULL
440 && unicode_attributes[ch].category[0] == 'P'
441 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes[] (name != NULL)
   whose category string begins with 'S'. */
445 is_category_S (unsigned int ch)
447 return (unicode_attributes[ch].name != NULL
448 && unicode_attributes[ch].category[0] == 'S');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Sm". */
452 is_category_Sm (unsigned int ch)
454 return (unicode_attributes[ch].name != NULL
455 && unicode_attributes[ch].category[0] == 'S'
456 && unicode_attributes[ch].category[1] == 'm');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Sc". */
460 is_category_Sc (unsigned int ch)
462 return (unicode_attributes[ch].name != NULL
463 && unicode_attributes[ch].category[0] == 'S'
464 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Sk". */
468 is_category_Sk (unsigned int ch)
470 return (unicode_attributes[ch].name != NULL
471 && unicode_attributes[ch].category[0] == 'S'
472 && unicode_attributes[ch].category[1] == 'k');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "So". */
476 is_category_So (unsigned int ch)
478 return (unicode_attributes[ch].name != NULL
479 && unicode_attributes[ch].category[0] == 'S'
480 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has an entry in unicode_attributes[] (name != NULL)
   whose category string begins with 'Z'. */
484 is_category_Z (unsigned int ch)
486 return (unicode_attributes[ch].name != NULL
487 && unicode_attributes[ch].category[0] == 'Z');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Zs". */
491 is_category_Zs (unsigned int ch)
493 return (unicode_attributes[ch].name != NULL
494 && unicode_attributes[ch].category[0] == 'Z'
495 && unicode_attributes[ch].category[1] == 's');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Zl". */
499 is_category_Zl (unsigned int ch)
501 return (unicode_attributes[ch].name != NULL
502 && unicode_attributes[ch].category[0] == 'Z'
503 && unicode_attributes[ch].category[1] == 'l');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Zp". */
507 is_category_Zp (unsigned int ch)
509 return (unicode_attributes[ch].name != NULL
510 && unicode_attributes[ch].category[0] == 'Z'
511 && unicode_attributes[ch].category[1] == 'p');
/* Tests whether CH either has no entry in unicode_attributes[]
   (name == NULL) or has a category string that begins with 'C'.
   Unlike the other one-letter tests, a missing entry counts as a match. */
515 is_category_C (unsigned int ch)
517 return (unicode_attributes[ch].name == NULL
518 || unicode_attributes[ch].category[0] == 'C');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Cc". */
522 is_category_Cc (unsigned int ch)
524 return (unicode_attributes[ch].name != NULL
525 && unicode_attributes[ch].category[0] == 'C'
526 && unicode_attributes[ch].category[1] == 'c');
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Cf". */
530 is_category_Cf (unsigned int ch)
532 return (unicode_attributes[ch].name != NULL
533 && unicode_attributes[ch].category[0] == 'C'
534 && unicode_attributes[ch].category[1] == 'f');
/* Tests whether CH lies in the range 0xD800 .. 0xDFFF.
   The table is not consulted: fill_attribute does not store lines whose
   category is "Cs", so the range itself identifies these code points. */
538 is_category_Cs (unsigned int ch)
540 return (ch >= 0xd800 && ch < 0xe000);
/* Tests whether CH has an entry in unicode_attributes[] whose category
   string begins with "Co". */
544 is_category_Co (unsigned int ch)
546 return (unicode_attributes[ch].name != NULL
547 && unicode_attributes[ch].category[0] == 'C'
548 && unicode_attributes[ch].category[1] == 'o');
/* Tests whether CH has no entry in unicode_attributes[] (name == NULL)
   and lies outside the range 0xD800 .. 0xDFFF, which is_category_Cs
   claims. */
552 is_category_Cn (unsigned int ch)
554 return (unicode_attributes[ch].name == NULL
555 && !(ch >= 0xd800 && ch < 0xe000));
558 /* Output a boolean property in a human readable format.
   FILENAME is opened for writing; PREDICATE is called with code points in
   the range 0 .. 0x10FFFF. */
560 debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
565 stream = fopen (filename, "w");
568 fprintf (stderr, "cannot open '%s' for writing\n", filename);
572 #if 0 /* This yields huge text output. */
573 for (ch = 0; ch < 0x110000; ch++)
576 fprintf (stream, "0x%04X\n", ch);
/* Compact form: consecutive matching code points are merged into runs. */
579 for (ch = 0; ch < 0x110000; ch++)
582 unsigned int first = ch;
/* Look ahead while the next code point also satisfies PREDICATE. */
585 while (ch + 1 < 0x110000 && predicate (ch + 1))
/* Two output shapes exist: "first..last" and a single value.
   NOTE(review): the test choosing between them is not visible here;
   presumably a run of length one uses the single form. */
589 fprintf (stream, "0x%04X..0x%04X\n", first, last);
591 fprintf (stream, "0x%04X\n", ch);
/* fclose is only called when ferror reports no error on the stream. */
595 if (ferror (stream) || fclose (stream))
597 fprintf (stderr, "error writing to '%s'\n", filename);
602 /* Output the unit test for a boolean property.
   The generated C file consists of a license header, an include of
   "test-predicate-part1.h", a list of { first, last } code point pairs, a
   PREDICATE(c) macro defined as EXPRESSION, and an include of
   "test-predicate-part2.h". */
604 output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
610 stream = fopen (filename, "w");
613 fprintf (stderr, "cannot open '%s' for writing\n", filename);
617 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
618 fprintf (stream, "/* Test the Unicode character type functions.\n");
619 fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n");
620 fprintf (stream, "\n");
621 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
622 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
623 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
624 fprintf (stream, " (at your option) any later version.\n");
625 fprintf (stream, "\n");
626 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
627 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
628 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
629 fprintf (stream, " GNU General Public License for more details.\n");
630 fprintf (stream, "\n");
631 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
632 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
633 fprintf (stream, "\n");
634 fprintf (stream, "#include \"test-predicate-part1.h\"\n");
635 fprintf (stream, "\n");
/* Walk all code points and merge consecutive matches into runs. */
638 for (ch = 0; ch < 0x110000; ch++)
641 unsigned int first = ch;
644 while (ch + 1 < 0x110000 && predicate (ch + 1))
/* The ",\n" separator goes between pairs.
   NOTE(review): the condition guarding it is not visible here. */
648 fprintf (stream, ",\n");
649 fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
653 fprintf (stream, "\n");
655 fprintf (stream, "\n");
/* EXPRESSION is pasted verbatim as the body of the PREDICATE macro. */
656 fprintf (stream, "#define PREDICATE(c) %s\n", expression);
657 fprintf (stream, "#include \"test-predicate-part2.h\"\n");
/* fclose is only called when ferror reports no error on the stream. */
659 if (ferror (stream) || fclose (stream))
661 fprintf (stderr, "error writing to '%s'\n", filename);
666 /* Construction of sparse 3-level tables.
   The macros are set up before "3levelbit.h" is included. The code that
   follows uses struct predicate_table together with predicate_table_init,
   predicate_table_add and predicate_table_finalize.
   NOTE(review): presumably the header derives those names from TABLE and
   allocates through xmalloc/xrealloc, mapped here to plain malloc/realloc
   (no failure check is added by this mapping). Confirm in 3levelbit.h. */
667 #define TABLE predicate_table
668 #define xmalloc malloc
669 #define xrealloc realloc
670 #include "3levelbit.h"
672 /* Output a boolean property in a three-level bitmap.
   Writes to FILENAME a C fragment that defines a static const struct
   called NAME with the members header, level1, level2 and level3.
   COMMENT is printed in the generated leading comment.
   NOTE(review): VERSION is presumably the Unicode version printed in the
   "Generated automatically" line; the argument line is not visible here. */
674 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
678 struct predicate_table t;
679 unsigned int level1_offset, level2_offset, level3_offset;
681 stream = fopen (filename, "w");
684 fprintf (stderr, "cannot open '%s' for writing\n", filename);
688 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
689 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
690 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
695 predicate_table_init (&t);
/* NOTE(review): presumably only code points satisfying PREDICATE are
   added; the test in front of the add call is not visible here. */
697 for (ch = 0; ch < 0x110000; ch++)
699 predicate_table_add (&t, ch);
701 predicate_table_finalize (&t);
703 /* Offsets in t.result, in memory of this process.
   The three expressions below are byte offsets: past 5 uint32_t header
   words, then past level1_size words, then past (level2_size << q) words.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in this order; the assignment heads are not visible. */
705 5 * sizeof (uint32_t);
707 5 * sizeof (uint32_t)
708 + t.level1_size * sizeof (uint32_t);
710 5 * sizeof (uint32_t)
711 + t.level1_size * sizeof (uint32_t)
712 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the 5 header words as macros header_0 .. header_4. */
714 for (i = 0; i < 5; i++)
716 fprintf (stream, "#define header_%d %d\n", i,
717 ((uint32_t *) t.result)[i]);
719 fprintf (stream, "static const\n");
720 fprintf (stream, "struct\n");
721 fprintf (stream, " {\n");
722 fprintf (stream, " int header[1];\n");
723 fprintf (stream, " int level1[%zu];\n", t.level1_size);
724 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
725 fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p);
726 fprintf (stream, " }\n");
727 fprintf (stream, "%s =\n", name);
728 fprintf (stream, "{\n");
/* The single emitted header element is word 1 of t.result. */
729 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
730 fprintf (stream, " {");
731 if (t.level1_size > 1)
732 fprintf (stream, "\n ");
733 for (i = 0; i < t.level1_size; i++)
/* i % 1 is always 0, so every entry after the first starts a new line. */
736 if (i > 0 && (i % 1) == 0)
737 fprintf (stream, "\n ");
738 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 is printed, or an expression built from 1 + level1_size
   (scaled from int to short units) plus the word index of the entry
   relative to level2_offset. NOTE(review): the test selecting -1 is not
   visible here; presumably it marks an absent block. */
740 fprintf (stream, " %5d", -1);
742 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
743 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
744 if (i+1 < t.level1_size)
745 fprintf (stream, ",");
747 if (t.level1_size > 1)
748 fprintf (stream, "\n ");
749 fprintf (stream, " },\n");
750 fprintf (stream, " {");
751 if (t.level2_size << t.q > 1)
752 fprintf (stream, "\n ");
753 for (i = 0; i < t.level2_size << t.q; i++)
756 if (i > 0 && (i % 1) == 0)
757 fprintf (stream, "\n ");
758 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Same scheme one level down: -1, or 1 + level1_size plus the level2
   length (scaled from short to int units) plus the word index of the
   entry relative to level3_offset. */
760 fprintf (stream, " %5d", -1);
762 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
763 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
764 if (i+1 < t.level2_size << t.q)
765 fprintf (stream, ",");
767 if (t.level2_size << t.q > 1)
768 fprintf (stream, "\n ");
769 fprintf (stream, " },\n");
770 fprintf (stream, " {");
771 if (t.level3_size << t.p > 4)
772 fprintf (stream, "\n ");
/* level3 holds the bitmap words, printed four per line in hexadecimal. */
773 for (i = 0; i < t.level3_size << t.p; i++)
775 if (i > 0 && (i % 4) == 0)
776 fprintf (stream, "\n ");
777 fprintf (stream, " 0x%08X",
778 ((uint32_t *) (t.result + level3_offset))[i]);
779 if (i+1 < t.level3_size << t.p)
780 fprintf (stream, ",");
782 if (t.level3_size << t.p > 4)
783 fprintf (stream, "\n ");
784 fprintf (stream, " }\n");
785 fprintf (stream, "};\n");
/* fclose is only called when ferror reports no error on the stream. */
787 if (ferror (stream) || fclose (stream))
789 fprintf (stderr, "error writing to '%s'\n", filename);
794 /* Output all categories.
   VERSION is handed on to output_predicate by the CATEGORY macro. */
796 output_categories (const char *version)
/* CATEGORY(C) produces three files for the category named C, all driven by
   the predicate is_category_C: a readable listing (categ_C.txt), a unit
   test (test-categ_C.c) and the three-level bitmap (categ_C.h). */
798 #define CATEGORY(C) \
799 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
800 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
801 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks for the general categories.
   Every two-letter category owns exactly one bit, from bit 0 (Lu) up to
   bit 29 (Cn). Every one-letter mask is the bitwise OR of the two-letter
   masks that share its first letter, e.g. L = Lu | Ll | Lt | Lm | Lo. */
844 UC_CATEGORY_MASK_L = 0x0000001f,
845 UC_CATEGORY_MASK_Lu = 0x00000001,
846 UC_CATEGORY_MASK_Ll = 0x00000002,
847 UC_CATEGORY_MASK_Lt = 0x00000004,
848 UC_CATEGORY_MASK_Lm = 0x00000008,
849 UC_CATEGORY_MASK_Lo = 0x00000010,
850 UC_CATEGORY_MASK_M = 0x000000e0,
851 UC_CATEGORY_MASK_Mn = 0x00000020,
852 UC_CATEGORY_MASK_Mc = 0x00000040,
853 UC_CATEGORY_MASK_Me = 0x00000080,
854 UC_CATEGORY_MASK_N = 0x00000700,
855 UC_CATEGORY_MASK_Nd = 0x00000100,
856 UC_CATEGORY_MASK_Nl = 0x00000200,
857 UC_CATEGORY_MASK_No = 0x00000400,
858 UC_CATEGORY_MASK_P = 0x0003f800,
859 UC_CATEGORY_MASK_Pc = 0x00000800,
860 UC_CATEGORY_MASK_Pd = 0x00001000,
861 UC_CATEGORY_MASK_Ps = 0x00002000,
862 UC_CATEGORY_MASK_Pe = 0x00004000,
863 UC_CATEGORY_MASK_Pi = 0x00008000,
864 UC_CATEGORY_MASK_Pf = 0x00010000,
865 UC_CATEGORY_MASK_Po = 0x00020000,
866 UC_CATEGORY_MASK_S = 0x003c0000,
867 UC_CATEGORY_MASK_Sm = 0x00040000,
868 UC_CATEGORY_MASK_Sc = 0x00080000,
869 UC_CATEGORY_MASK_Sk = 0x00100000,
870 UC_CATEGORY_MASK_So = 0x00200000,
871 UC_CATEGORY_MASK_Z = 0x01c00000,
872 UC_CATEGORY_MASK_Zs = 0x00400000,
873 UC_CATEGORY_MASK_Zl = 0x00800000,
874 UC_CATEGORY_MASK_Zp = 0x01000000,
875 UC_CATEGORY_MASK_C = 0x3e000000,
876 UC_CATEGORY_MASK_Cc = 0x02000000,
877 UC_CATEGORY_MASK_Cf = 0x04000000,
878 UC_CATEGORY_MASK_Cs = 0x08000000,
879 UC_CATEGORY_MASK_Co = 0x10000000,
880 UC_CATEGORY_MASK_Cn = 0x20000000
/* Maps CATEGORY_NAME to its UC_CATEGORY_MASK_* value.
   A one-character name yields the mask of the whole class; a two-character
   name yields the single bit of that subcategory. */
884 general_category_byname (const char *category_name)
/* Only names that are one or two characters long are looked up. */
886 if (category_name[0] != '\0'
887 && (category_name[1] == '\0' || category_name[2] == '\0'))
/* NOTE(review): the case labels of this outer switch are not visible
   here; judging by the masks returned, the groups below are selected by
   the letters L, M, N, P, S, Z and C, in this order. */
888 switch (category_name[0])
891 switch (category_name[1])
893 case '\0': return UC_CATEGORY_MASK_L;
894 case 'u': return UC_CATEGORY_MASK_Lu;
895 case 'l': return UC_CATEGORY_MASK_Ll;
896 case 't': return UC_CATEGORY_MASK_Lt;
897 case 'm': return UC_CATEGORY_MASK_Lm;
898 case 'o': return UC_CATEGORY_MASK_Lo;
902 switch (category_name[1])
904 case '\0': return UC_CATEGORY_MASK_M;
905 case 'n': return UC_CATEGORY_MASK_Mn;
906 case 'c': return UC_CATEGORY_MASK_Mc;
907 case 'e': return UC_CATEGORY_MASK_Me;
911 switch (category_name[1])
913 case '\0': return UC_CATEGORY_MASK_N;
914 case 'd': return UC_CATEGORY_MASK_Nd;
915 case 'l': return UC_CATEGORY_MASK_Nl;
916 case 'o': return UC_CATEGORY_MASK_No;
920 switch (category_name[1])
922 case '\0': return UC_CATEGORY_MASK_P;
923 case 'c': return UC_CATEGORY_MASK_Pc;
924 case 'd': return UC_CATEGORY_MASK_Pd;
925 case 's': return UC_CATEGORY_MASK_Ps;
926 case 'e': return UC_CATEGORY_MASK_Pe;
927 case 'i': return UC_CATEGORY_MASK_Pi;
928 case 'f': return UC_CATEGORY_MASK_Pf;
929 case 'o': return UC_CATEGORY_MASK_Po;
933 switch (category_name[1])
935 case '\0': return UC_CATEGORY_MASK_S;
936 case 'm': return UC_CATEGORY_MASK_Sm;
937 case 'c': return UC_CATEGORY_MASK_Sc;
938 case 'k': return UC_CATEGORY_MASK_Sk;
939 case 'o': return UC_CATEGORY_MASK_So;
943 switch (category_name[1])
945 case '\0': return UC_CATEGORY_MASK_Z;
946 case 's': return UC_CATEGORY_MASK_Zs;
947 case 'l': return UC_CATEGORY_MASK_Zl;
948 case 'p': return UC_CATEGORY_MASK_Zp;
952 switch (category_name[1])
954 case '\0': return UC_CATEGORY_MASK_C;
955 case 'c': return UC_CATEGORY_MASK_Cc;
956 case 'f': return UC_CATEGORY_MASK_Cf;
957 case 's': return UC_CATEGORY_MASK_Cs;
958 case 'o': return UC_CATEGORY_MASK_Co;
959 case 'n': return UC_CATEGORY_MASK_Cn;
963 /* Invalid category name.
   NOTE(review): the statement handling this case is not visible here. */
967 /* Construction of sparse 3-level tables.
   Parameters for the per-character category table: elements are uint8_t,
   and the default element is 29, the bit index of UC_CATEGORY_MASK_Cn.
   NOTE(review): the include that consumes these macros is not visible
   here; the code that follows uses struct category_table and
   category_table_init, category_table_add and category_table_finalize. */
968 #define TABLE category_table
969 #define ELEMENT uint8_t
970 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
971 #define xmalloc malloc
972 #define xrealloc realloc
975 /* Output the per-character category table.
   Writes to FILENAME a C fragment defining the static const struct
   u_category. Each code point is represented by the bit index of its
   category mask, and the level3 entries are packed at 5 bits each.
   NOTE(review): VERSION is presumably printed in the "Generated
   automatically" line; the argument line is not visible here. */
977 output_category (const char *filename, const char *version)
981 struct category_table t;
982 unsigned int level1_offset, level2_offset, level3_offset;
983 uint16_t *level3_packed;
985 stream = fopen (filename, "w");
988 fprintf (stderr, "cannot open '%s' for writing\n", filename);
992 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
993 fprintf (stream, "/* Categories of Unicode characters. */\n");
994 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
999 category_table_init (&t);
1001 for (ch = 0; ch < 0x110000; ch++)
1004 unsigned int log2_value;
/* The 0xD800 .. 0xDFFF range is recognized by is_category_Cs; other code
   points with an entry are classified by their category string.
   NOTE(review): the branch for code points without an entry is not
   visible here. */
1006 if (is_category_Cs (ch))
1007 value = UC_CATEGORY_MASK_Cs;
1008 else if (unicode_attributes[ch].name != NULL)
1009 value = general_category_byname (unicode_attributes[ch].category);
1013 /* Now value should contain exactly one bit. */
/* value & (value - 1) clears the lowest set bit, so a nonzero result
   means more than one bit is set. */
1014 if (value == 0 || ((value & (value - 1)) != 0))
/* Count the right shifts needed to reach 1: the index of the set bit. */
1017 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1019 category_table_add (&t, ch, log2_value);
1022 category_table_finalize (&t);
1024 /* Offsets in t.result, in memory of this process.
   The three expressions below are byte offsets: past 5 uint32_t header
   words, then past level1_size words, then past (level2_size << q) words.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in this order; the assignment heads are not visible. */
1026 5 * sizeof (uint32_t);
1028 5 * sizeof (uint32_t)
1029 + t.level1_size * sizeof (uint32_t);
1031 5 * sizeof (uint32_t)
1032 + t.level1_size * sizeof (uint32_t)
1033 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the 5 header words as category_header_0 .. category_header_4. */
1035 for (i = 0; i < 5; i++)
1036 fprintf (stream, "#define category_header_%d %d\n", i,
1037 ((uint32_t *) t.result)[i]);
1038 fprintf (stream, "static const\n");
1039 fprintf (stream, "struct\n");
1040 fprintf (stream, " {\n");
1041 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1042 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
/* A level3 block of (1 << p) entries at 5 bits each takes
   (1 << p) * 5 / 16 units of 16 bits; one extra unit is declared. */
1043 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1044 (1 << t.p) * 5 / 16);
1045 fprintf (stream, " }\n");
1046 fprintf (stream, "u_category =\n");
1047 fprintf (stream, "{\n");
1048 fprintf (stream, " {");
1049 if (t.level1_size > 8)
1050 fprintf (stream, "\n ");
/* level1: eight entries per line. */
1051 for (i = 0; i < t.level1_size; i++)
1054 if (i > 0 && (i % 8) == 0)
1055 fprintf (stream, "\n ");
1056 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the word index of the entry relative to level2_offset.
   NOTE(review): the test selecting -1 is not visible here. */
1058 fprintf (stream, " %5d", -1);
1060 fprintf (stream, " %5zu",
1061 (offset - level2_offset) / sizeof (uint32_t));
1062 if (i+1 < t.level1_size)
1063 fprintf (stream, ",");
1065 if (t.level1_size > 8)
1066 fprintf (stream, "\n ");
1067 fprintf (stream, " },\n");
1068 fprintf (stream, " {");
1069 if (t.level2_size << t.q > 8)
1070 fprintf (stream, "\n ");
/* level2: eight entries per line. */
1071 for (i = 0; i < t.level2_size << t.q; i++)
1074 if (i > 0 && (i % 8) == 0)
1075 fprintf (stream, "\n ");
1076 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Either -1 or the element index relative to level3_offset; level3
   elements are single bytes, hence the division by sizeof (uint8_t). */
1078 fprintf (stream, " %5d", -1);
1080 fprintf (stream, " %5zu",
1081 (offset - level3_offset) / sizeof (uint8_t));
1082 if (i+1 < t.level2_size << t.q)
1083 fprintf (stream, ",");
1085 if (t.level2_size << t.q > 8)
1086 fprintf (stream, "\n ");
1087 fprintf (stream, " },\n");
1088 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1089 not 32-bit units, in order to make the lookup function easier. */
/* calloc zero-fills the array, so bits can simply be ORed in. The extra
   unit keeps level3_packed[j+1] inside the allocation for the last entry.
   NOTE(review): no NULL check of the calloc result is visible here. */
1092 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1093 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i starts at bit k of unit j and may spill into unit j+1, so the
   two units are combined into one 32-bit value, updated and split again. */
1095 unsigned int j = (i * 5) / 16;
1096 unsigned int k = (i * 5) % 16;
1097 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1098 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1099 level3_packed[j] = value & 0xffff;
1100 level3_packed[j+1] = value >> 16;
1102 fprintf (stream, " {");
1103 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1104 fprintf (stream, "\n ");
/* Packed level3: eight 16-bit units per line, in hexadecimal. */
1105 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1107 if (i > 0 && (i % 8) == 0)
1108 fprintf (stream, "\n ");
1109 fprintf (stream, " 0x%04x", level3_packed[i]);
1110 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1111 fprintf (stream, ",");
1113 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1114 fprintf (stream, "\n ");
1115 fprintf (stream, " }\n");
1116 free (level3_packed);
1117 fprintf (stream, "};\n");
/* fclose is only called when ferror reports no error on the stream. */
1119 if (ferror (stream) || fclose (stream))
1121 fprintf (stderr, "error writing to '%s'\n", filename);
1126 /* ========================================================================= */
1128 /* Canonical combining class. */
1129 /* See Unicode 3.0 book, section 4.2. */
1132 /* Construction of sparse 3-level tables.
   Parameters for the per-character combining class table: elements are
   uint8_t.
   NOTE(review): no DEFAULT definition and no include are visible between
   these macros; the code that follows uses struct combclass_table and
   combclass_table_init, combclass_table_add and combclass_table_finalize. */
1133 #define TABLE combclass_table
1134 #define ELEMENT uint8_t
1136 #define xmalloc malloc
1137 #define xrealloc realloc
1140 /* Output the per-character combining class table.
   Writes to FILENAME a C fragment defining the static const struct
   u_combclass, whose level3 member holds one unsigned char per entry.
   NOTE(review): VERSION is presumably printed in the "Generated
   automatically" line; the argument line is not visible here. */
1142 output_combclass (const char *filename, const char *version)
1146 struct combclass_table t;
1147 unsigned int level1_offset, level2_offset, level3_offset;
1149 stream = fopen (filename, "w");
1152 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1156 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1157 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1158 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1163 combclass_table_init (&t);
/* Only code points that have an entry (name != NULL) are added. */
1165 for (ch = 0; ch < 0x110000; ch++)
1166 if (unicode_attributes[ch].name != NULL)
/* atoi converts the decimal combining field; it reports no conversion
   errors, so only the range 0 .. 255 is verified afterwards. */
1168 int value = atoi (unicode_attributes[ch].combining);
1169 if (!(value >= 0 && value <= 255))
1171 combclass_table_add (&t, ch, value);
1174 combclass_table_finalize (&t);
1176 /* Offsets in t.result, in memory of this process.
   The three expressions below are byte offsets: past 5 uint32_t header
   words, then past level1_size words, then past (level2_size << q) words.
   Presumably they are assigned to level1_offset, level2_offset and
   level3_offset in this order; the assignment heads are not visible. */
1178 5 * sizeof (uint32_t);
1180 5 * sizeof (uint32_t)
1181 + t.level1_size * sizeof (uint32_t);
1183 5 * sizeof (uint32_t)
1184 + t.level1_size * sizeof (uint32_t)
1185 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the 5 header words as combclass_header_0 .. combclass_header_4. */
1187 for (i = 0; i < 5; i++)
1188 fprintf (stream, "#define combclass_header_%d %d\n", i,
1189 ((uint32_t *) t.result)[i]);
1190 fprintf (stream, "static const\n");
1191 fprintf (stream, "struct\n");
1192 fprintf (stream, " {\n");
1193 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1194 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1195 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1196 fprintf (stream, " }\n");
1197 fprintf (stream, "u_combclass =\n");
1198 fprintf (stream, "{\n");
1199 fprintf (stream, " {");
1200 if (t.level1_size > 8)
1201 fprintf (stream, "\n ");
/* level1: eight entries per line. */
1202 for (i = 0; i < t.level1_size; i++)
1205 if (i > 0 && (i % 8) == 0)
1206 fprintf (stream, "\n ");
1207 offset = ((uint32_t *) (t.result + level1_offset))[i];
/* Either -1 or the word index of the entry relative to level2_offset.
   NOTE(review): the test selecting -1 is not visible here. */
1209 fprintf (stream, " %5d", -1);
1211 fprintf (stream, " %5zu",
1212 (offset - level2_offset) / sizeof (uint32_t));
1213 if (i+1 < t.level1_size)
1214 fprintf (stream, ",");
1216 if (t.level1_size > 8)
1217 fprintf (stream, "\n ");
1218 fprintf (stream, " },\n");
1219 fprintf (stream, " {");
1220 if (t.level2_size << t.q > 8)
1221 fprintf (stream, "\n ");
/* level2: eight entries per line. */
1222 for (i = 0; i < t.level2_size << t.q; i++)
1225 if (i > 0 && (i % 8) == 0)
1226 fprintf (stream, "\n ");
1227 offset = ((uint32_t *) (t.result + level2_offset))[i];
/* Either -1 or the element index relative to level3_offset; level3
   elements are single bytes, hence the division by sizeof (uint8_t). */
1229 fprintf (stream, " %5d", -1);
1231 fprintf (stream, " %5zu",
1232 (offset - level3_offset) / sizeof (uint8_t));
1233 if (i+1 < t.level2_size << t.q)
1234 fprintf (stream, ",");
1236 if (t.level2_size << t.q > 8)
1237 fprintf (stream, "\n ");
1238 fprintf (stream, " },\n");
1239 fprintf (stream, " {");
1240 if (t.level3_size << t.p > 8)
1241 fprintf (stream, "\n ");
/* level3: the combining class values, eight per line, in decimal. */
1242 for (i = 0; i < t.level3_size << t.p; i++)
1244 if (i > 0 && (i % 8) == 0)
1245 fprintf (stream, "\n ");
1246 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1247 if (i+1 < t.level3_size << t.p)
1248 fprintf (stream, ",");
1250 if (t.level3_size << t.p > 8)
1251 fprintf (stream, "\n ");
1252 fprintf (stream, " }\n");
1253 fprintf (stream, "};\n");
/* fclose is only called when ferror reports no error on the stream. */
1255 if (ferror (stream) || fclose (stream))
1257 fprintf (stderr, "error writing to '%s'\n", filename);
1262 /* ========================================================================= */
1264 /* Bidirectional category. */
/* The 19 enumerators listed here name the bidirectional categories.  Their
   small number is what lets output_bidi_category pack each table entry
   into 5 bits.
   NOTE(review): the line that opens this enumeration is not part of this
   listing, so the enumeration's tag (if any) is unknown here. */
1265 /* See Unicode 3.0 book, section 4.3,
1270 UC_BIDI_L, /* Left-to-Right */
1271 UC_BIDI_LRE, /* Left-to-Right Embedding */
1272 UC_BIDI_LRO, /* Left-to-Right Override */
1273 UC_BIDI_R, /* Right-to-Left */
1274 UC_BIDI_AL, /* Right-to-Left Arabic */
1275 UC_BIDI_RLE, /* Right-to-Left Embedding */
1276 UC_BIDI_RLO, /* Right-to-Left Override */
1277 UC_BIDI_PDF, /* Pop Directional Format */
1278 UC_BIDI_EN, /* European Number */
1279 UC_BIDI_ES, /* European Number Separator */
1280 UC_BIDI_ET, /* European Number Terminator */
1281 UC_BIDI_AN, /* Arabic Number */
1282 UC_BIDI_CS, /* Common Number Separator */
1283 UC_BIDI_NSM, /* Non-Spacing Mark */
1284 UC_BIDI_BN, /* Boundary Neutral */
1285 UC_BIDI_B, /* Paragraph Separator */
1286 UC_BIDI_S, /* Segment Separator */
1287 UC_BIDI_WS, /* Whitespace */
1288 UC_BIDI_ON /* Other Neutral */
/* Parses CATEGORY_NAME, a bidi category abbreviation, by switching on its
   successive characters and then checking that the name ends (NUL) right
   after the recognized prefix, so that only exact names match.
   get_bidi_category passes the bidi field of a unicode_attributes entry.
   NOTE(review): the case labels and the return statements are not visible
   in this listing; presumably each successful branch returns one of the
   UC_BIDI_* enumerators. */
1292 bidi_category_byname (const char *category_name)
1294 switch (category_name[0])
1297 switch (category_name[1])
1300 if (category_name[2] == '\0')
1304 if (category_name[2] == '\0')
1310 switch (category_name[1])
1315 if (category_name[2] == '\0')
1321 switch (category_name[1])
1324 if (category_name[2] == '\0')
1330 switch (category_name[1])
1333 if (category_name[2] == '\0')
1337 if (category_name[2] == '\0')
1341 if (category_name[2] == '\0')
1347 switch (category_name[1])
1352 switch (category_name[2])
1355 if (category_name[3] == '\0')
1359 if (category_name[3] == '\0')
1367 switch (category_name[1])
1370 switch (category_name[2])
1373 if (category_name[3] == '\0')
1381 switch (category_name[1])
1384 if (category_name[2] == '\0')
1390 switch (category_name[1])
1393 switch (category_name[2])
1396 if (category_name[3] == '\0')
1404 switch (category_name[1])
1409 switch (category_name[2])
1412 if (category_name[3] == '\0')
1416 if (category_name[3] == '\0')
1424 if (category_name[1] == '\0')
1428 switch (category_name[1])
1431 if (category_name[2] == '\0')
/* Reached when no branch above accepted the name.
   NOTE(review): the action taken here is not visible in this listing. */
1437 /* Invalid bidi category name. */
/* Returns the bidi category of the code point CH.
   When CH has an entry in unicode_attributes (its name is not NULL), the
   category is obtained by parsing the entry's bidi field with
   bidi_category_byname.  Otherwise CH is tested against three groups of
   code point ranges, one after the other.
   NOTE(review): the value returned for each range group, and the final
   fallback value, are not visible in this listing. */
1442 get_bidi_category (unsigned int ch)
1444 if (unicode_attributes[ch].name != NULL
1445 return bidi_category_byname (unicode_attributes[ch].bidi);
1448 /* The bidi category of unassigned characters depends on the range.
1449 See UTR #9 and DerivedBidiClass.txt. */
/* First group of ranges. */
1450 if ((ch >= 0x0590 && ch <= 0x05FF)
1451 || (ch >= 0x07FB && ch <= 0x08FF)
1452 || (ch >= 0xFB37 && ch <= 0xFB45)
1453 || (ch >= 0x10800 && ch <= 0x10FFF))
/* Second group of ranges. */
1455 else if ((ch >= 0x0600 && ch <= 0x07BF)
1456 || (ch >= 0x2064 && ch <= 0x2069)
1457 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1458 || (ch >= 0xFDFE && ch <= 0xFEFE))
/* Third group: besides the listed ranges, the two masked comparisons
   match the last two code points (xxFFFE and xxFFFF) of every plane. */
1460 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1461 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1462 || (ch & 0xFFFF) == 0xFFFE
1463 || (ch & 0xFFFF) == 0xFFFF
1464 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1471 /* Construction of sparse 3-level tables. */
/* Parameters for the bidi category table: the generated type is named
   bidi_category_table (output_bidi_category uses struct bidi_category_table
   and the bidi_category_table_init/_add/_finalize functions), each element
   is one byte, and UC_BIDI_L is the default element.  The xmalloc and
   xrealloc names are mapped onto plain malloc and realloc.
   NOTE(review): the template that consumes these macros is not visible in
   this listing. */
1472 #define TABLE bidi_category_table
1473 #define ELEMENT uint8_t
1474 #define DEFAULT UC_BIDI_L
1475 #define xmalloc malloc
1476 #define xrealloc realloc
1479 /* Output the per-character bidi category table. */
/* Writes to FILENAME a C source fragment consisting of five
   bidi_category_header_N macros and the static table u_bidi_category with
   the members level1, level2 and level3.
   The table is filled with get_bidi_category (ch) for every code point
   below 0x110000.  level1 and level2 entries are printed either as -1 or
   as an index relative to the start of the next level; level3 is printed
   bit-packed, 5 bits per entry, in 16-bit units.
   Failures to open or write the file are reported on stderr.
   NOTE(review): VERSION is presumably what fills the %s in the generated
   header comment; that argument line, the conditions that select the -1
   entries, and the actions taken after the error messages are not visible
   in this listing. */
1481 output_bidi_category (const char *filename, const char *version)
1485 struct bidi_category_table t;
1486 unsigned int level1_offset, level2_offset, level3_offset;
1487 uint16_t *level3_packed;
1489 stream = fopen (filename, "w");
1492 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1496 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1497 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1498 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1503 bidi_category_table_init (&t);
1505 for (ch = 0; ch < 0x110000; ch++)
1507 int value = get_bidi_category (ch);
1509 bidi_category_table_add (&t, ch, value);
1512 bidi_category_table_finalize (&t);
1514 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by the level1
   array, the level2 array and then the level3 array; the three sums below
   are the byte offsets of those arrays. */
1516 5 * sizeof (uint32_t);
1518 5 * sizeof (uint32_t)
1519 + t.level1_size * sizeof (uint32_t);
1521 5 * sizeof (uint32_t)
1522 + t.level1_size * sizeof (uint32_t)
1523 + (t.level2_size << t.q) * sizeof (uint32_t);
/* NOTE(review): i is unsigned and the header words are uint32_t, yet both
   are printed with %d. */
1525 for (i = 0; i < 5; i++)
1526 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1527 ((uint32_t *) t.result)[i]);
1528 fprintf (stream, "static const\n");
1529 fprintf (stream, "struct\n");
1530 fprintf (stream, " {\n");
1531 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1532 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1533 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1534 (1 << t.p) * 5 / 16);
1535 fprintf (stream, " }\n");
1536 fprintf (stream, "u_bidi_category =\n");
1537 fprintf (stream, "{\n");
1538 fprintf (stream, " {");
/* Level 1: eight entries per output line. */
1539 if (t.level1_size > 8)
1540 fprintf (stream, "\n ");
1541 for (i = 0; i < t.level1_size; i++)
1544 if (i > 0 && (i % 8) == 0)
1545 fprintf (stream, "\n ");
1546 offset = ((uint32_t *) (t.result + level1_offset))[i];
1548 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the level2 array. */
1550 fprintf (stream, " %5zu",
1551 (offset - level2_offset) / sizeof (uint32_t));
1552 if (i+1 < t.level1_size)
1553 fprintf (stream, ",");
1555 if (t.level1_size > 8)
1556 fprintf (stream, "\n ");
1557 fprintf (stream, " },\n");
1558 fprintf (stream, " {");
/* Level 2: same layout as level 1. */
1559 if (t.level2_size << t.q > 8)
1560 fprintf (stream, "\n ");
1561 for (i = 0; i < t.level2_size << t.q; i++)
1564 if (i > 0 && (i % 8) == 0)
1565 fprintf (stream, "\n ");
1566 offset = ((uint32_t *) (t.result + level2_offset))[i];
1568 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the unpacked level3 array. */
1570 fprintf (stream, " %5zu",
1571 (offset - level3_offset) / sizeof (uint8_t));
1572 if (i+1 < t.level2_size << t.q)
1573 fprintf (stream, ",");
1575 if (t.level2_size << t.q > 8)
1576 fprintf (stream, "\n ");
1577 fprintf (stream, " },\n");
1578 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1579 not 32-bit units, in order to make the lookup function easier. */
1582 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
/* The buffer is zero-filled and has one 16-bit unit more than the packed
   bits need, so that the access to index j+1 below has a unit to land in
   for the last entries.
   NOTE(review): the calloc result is presumably assigned to level3_packed;
   no NULL check is visible before the loop writes through it. */
1583 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 5*i .. 5*i+4 of the packed bit stream: j is the
   16-bit unit in which it starts, k its bit position inside that unit.
   Units j and j+1 are combined into one 32-bit value so that an entry
   straddling the unit boundary needs no special case. */
1585 unsigned int j = (i * 5) / 16;
1586 unsigned int k = (i * 5) % 16;
1587 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1588 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1589 level3_packed[j] = value & 0xffff;
1590 level3_packed[j+1] = value >> 16;
1592 fprintf (stream, " {");
/* Level 3: the packed units, eight per output line. */
1593 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1594 fprintf (stream, "\n ");
1595 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1597 if (i > 0 && (i % 8) == 0)
1598 fprintf (stream, "\n ");
1599 fprintf (stream, " 0x%04x", level3_packed[i]);
1600 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1601 fprintf (stream, ",");
1603 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1604 fprintf (stream, "\n ");
1605 fprintf (stream, " }\n");
1606 free (level3_packed);
1607 fprintf (stream, "};\n");
/* fclose is part of the check because it flushes buffered output. */
1609 if (ferror (stream) || fclose (stream))
1611 fprintf (stderr, "error writing to '%s'\n", filename);
1616 /* ========================================================================= */
1618 /* Decimal digit value. */
1619 /* See Unicode 3.0 book, section 4.6. */
/* Returns the decimal digit value of the code point CH: when CH has an
   entry in unicode_attributes (its name is not NULL) and the entry's
   decdigit field is not empty, the field is converted with atoi.
   NOTE(review): the value returned in all other cases is not visible in
   this listing; the callers accept results from -1 to 9. */
1622 get_decdigit_value (unsigned int ch)
1624 if (unicode_attributes[ch].name != NULL
1625 && unicode_attributes[ch].decdigit[0] != '\0')
1626 return atoi (unicode_attributes[ch].decdigit);
1630 /* Construction of sparse 3-level tables. */
/* Parameters for the digit value tables: the generated type is named
   decdigit_table and each element is one byte.  Both output_decimal_digit
   and output_digit use struct decdigit_table.  The xmalloc and xrealloc
   names are mapped onto plain malloc and realloc.
   NOTE(review): the DEFAULT element and the template that consumes these
   macros are not visible in this listing. */
1631 #define TABLE decdigit_table
1632 #define ELEMENT uint8_t
1634 #define xmalloc malloc
1635 #define xrealloc realloc
1638 /* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME a list of entries of the form { 0xXXXX, value },
   separated by ",\n", where value is get_decdigit_value of the code point.
   Values outside -1 .. 9 are caught by a range check.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the condition that selects which code points get an entry,
   the action taken when the range check fails, and the use of VERSION
   (presumably the %s of the header comment) are not visible in this
   listing. */
1640 output_decimal_digit_test (const char *filename, const char *version)
1646 stream = fopen (filename, "w");
1649 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1653 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1654 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1655 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1659 for (ch = 0; ch < 0x110000; ch++)
1661 int value = get_decdigit_value (ch);
1663 if (!(value >= -1 && value < 10))
1669 fprintf (stream, ",\n");
1670 fprintf (stream, " { 0x%04X, %d }", ch, value);
1675 fprintf (stream, "\n");
/* fclose is part of the check because it flushes buffered output. */
1677 if (ferror (stream) || fclose (stream))
1679 fprintf (stderr, "error writing to '%s'\n", filename);
1684 /* Output the per-character decimal digit value table. */
/* Writes to FILENAME a C source fragment consisting of five
   decdigit_header_N macros and the static table u_decdigit with the
   members level1, level2 and level3.
   The value stored for each code point is 1 + get_decdigit_value (ch), so
   stored values lie in 0 .. 10 and fit in 4 bits.  level1 and level2
   entries are printed either as -1 or as an index relative to the start of
   the next level; level3 is printed with two entries packed per byte.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the conditions that select the -1 entries, the action
   taken when the range check fails, and the use of VERSION (presumably the
   %s of the header comment) are not visible in this listing. */
1686 output_decimal_digit (const char *filename, const char *version)
1690 struct decdigit_table t;
1691 unsigned int level1_offset, level2_offset, level3_offset;
1693 stream = fopen (filename, "w");
1696 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1700 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1701 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1702 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1707 decdigit_table_init (&t);
1709 for (ch = 0; ch < 0x110000; ch++)
/* Shift by one so that the stored value is never negative. */
1711 int value = 1 + get_decdigit_value (ch);
1713 if (!(value >= 0 && value <= 10))
1716 decdigit_table_add (&t, ch, value);
1719 decdigit_table_finalize (&t);
1721 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by the level1
   array, the level2 array and then the level3 array; the three sums below
   are the byte offsets of those arrays. */
1723 5 * sizeof (uint32_t);
1725 5 * sizeof (uint32_t)
1726 + t.level1_size * sizeof (uint32_t);
1728 5 * sizeof (uint32_t)
1729 + t.level1_size * sizeof (uint32_t)
1730 + (t.level2_size << t.q) * sizeof (uint32_t);
1732 for (i = 0; i < 5; i++)
1733 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1734 ((uint32_t *) t.result)[i]);
1735 fprintf (stream, "static const\n");
1736 fprintf (stream, "struct\n");
1737 fprintf (stream, " {\n");
1738 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1739 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1740 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1742 fprintf (stream, " }\n");
1743 fprintf (stream, "u_decdigit =\n");
1744 fprintf (stream, "{\n");
1745 fprintf (stream, " {");
/* Level 1: eight entries per output line. */
1746 if (t.level1_size > 8)
1747 fprintf (stream, "\n ");
1748 for (i = 0; i < t.level1_size; i++)
1751 if (i > 0 && (i % 8) == 0)
1752 fprintf (stream, "\n ");
1753 offset = ((uint32_t *) (t.result + level1_offset))[i];
1755 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the level2 array. */
1757 fprintf (stream, " %5zu",
1758 (offset - level2_offset) / sizeof (uint32_t));
1759 if (i+1 < t.level1_size)
1760 fprintf (stream, ",");
1762 if (t.level1_size > 8)
1763 fprintf (stream, "\n ");
1764 fprintf (stream, " },\n");
1765 fprintf (stream, " {");
/* Level 2: same layout as level 1. */
1766 if (t.level2_size << t.q > 8)
1767 fprintf (stream, "\n ");
1768 for (i = 0; i < t.level2_size << t.q; i++)
1771 if (i > 0 && (i % 8) == 0)
1772 fprintf (stream, "\n ");
1773 offset = ((uint32_t *) (t.result + level2_offset))[i];
1775 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the unpacked level3 array. */
1777 fprintf (stream, " %5zu",
1778 (offset - level3_offset) / sizeof (uint8_t));
1779 if (i+1 < t.level2_size << t.q)
1780 fprintf (stream, ",");
1782 if (t.level2_size << t.q > 8)
1783 fprintf (stream, "\n ");
1784 fprintf (stream, " },\n");
1785 /* Pack the level3 array. Each entry needs 4 bits only. */
1786 fprintf (stream, " {");
/* Half as many bytes as entries: output byte i holds entry 2*i in its low
   nibble and entry 2*i+1 in its high nibble. */
1787 if (t.level3_size << (t.p - 1) > 8)
1788 fprintf (stream, "\n ");
1789 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1791 if (i > 0 && (i % 8) == 0)
1792 fprintf (stream, "\n ");
1793 fprintf (stream, " 0x%02x",
1794 ((uint8_t *) (t.result + level3_offset))[2*i]
1795 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1796 if (i+1 < t.level3_size << (t.p - 1))
1797 fprintf (stream, ",");
1799 if (t.level3_size << (t.p - 1) > 8)
1800 fprintf (stream, "\n ");
1801 fprintf (stream, " }\n");
1802 fprintf (stream, "};\n");
/* fclose is part of the check because it flushes buffered output. */
1804 if (ferror (stream) || fclose (stream))
1806 fprintf (stderr, "error writing to '%s'\n", filename);
1811 /* ========================================================================= */
1814 /* See Unicode 3.0 book, section 4.6. */
/* Returns the digit value of the code point CH: when CH has an entry in
   unicode_attributes (its name is not NULL) and the entry's digit field is
   not empty, the field is converted with atoi.
   NOTE(review): the value returned in all other cases is not visible in
   this listing; the callers accept results from -1 to 9. */
1817 get_digit_value (unsigned int ch)
1819 if (unicode_attributes[ch].name != NULL
1820 && unicode_attributes[ch].digit[0] != '\0')
1821 return atoi (unicode_attributes[ch].digit);
1825 /* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME a list of entries of the form { 0xXXXX, value },
   separated by ",\n", where value is get_digit_value of the code point.
   Values outside -1 .. 9 are caught by a range check.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the condition that selects which code points get an entry,
   the action taken when the range check fails, and the use of VERSION
   (presumably the %s of the header comment) are not visible in this
   listing. */
1827 output_digit_test (const char *filename, const char *version)
1833 stream = fopen (filename, "w");
1836 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1840 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1841 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1842 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1846 for (ch = 0; ch < 0x110000; ch++)
1848 int value = get_digit_value (ch);
1850 if (!(value >= -1 && value < 10))
1856 fprintf (stream, ",\n");
1857 fprintf (stream, " { 0x%04X, %d }", ch, value);
1862 fprintf (stream, "\n");
/* fclose is part of the check because it flushes buffered output. */
1864 if (ferror (stream) || fclose (stream))
1866 fprintf (stderr, "error writing to '%s'\n", filename);
1871 /* Output the per-character digit value table. */
/* Writes to FILENAME a C source fragment consisting of five
   digit_header_N macros and the static table u_digit with the members
   level1, level2 and level3.
   The table type is the same struct decdigit_table that
   output_decimal_digit uses.  The value stored for each code point is
   1 + get_digit_value (ch), so stored values lie in 0 .. 10 and fit in
   4 bits.  level1 and level2 entries are printed either as -1 or as an
   index relative to the start of the next level; level3 is printed with
   two entries packed per byte.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the conditions that select the -1 entries, the action
   taken when the range check fails, and the use of VERSION (presumably the
   %s of the header comment) are not visible in this listing. */
1873 output_digit (const char *filename, const char *version)
1877 struct decdigit_table t;
1878 unsigned int level1_offset, level2_offset, level3_offset;
1880 stream = fopen (filename, "w");
1883 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1887 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1888 fprintf (stream, "/* Digit values of Unicode characters. */\n");
1889 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
1894 decdigit_table_init (&t);
1896 for (ch = 0; ch < 0x110000; ch++)
/* Shift by one so that the stored value is never negative. */
1898 int value = 1 + get_digit_value (ch);
1900 if (!(value >= 0 && value <= 10))
1903 decdigit_table_add (&t, ch, value);
1906 decdigit_table_finalize (&t);
1908 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by the level1
   array, the level2 array and then the level3 array; the three sums below
   are the byte offsets of those arrays. */
1910 5 * sizeof (uint32_t);
1912 5 * sizeof (uint32_t)
1913 + t.level1_size * sizeof (uint32_t);
1915 5 * sizeof (uint32_t)
1916 + t.level1_size * sizeof (uint32_t)
1917 + (t.level2_size << t.q) * sizeof (uint32_t);
1919 for (i = 0; i < 5; i++)
1920 fprintf (stream, "#define digit_header_%d %d\n", i,
1921 ((uint32_t *) t.result)[i]);
1922 fprintf (stream, "static const\n");
1923 fprintf (stream, "struct\n");
1924 fprintf (stream, " {\n");
1925 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1926 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1927 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1929 fprintf (stream, " }\n");
1930 fprintf (stream, "u_digit =\n");
1931 fprintf (stream, "{\n");
1932 fprintf (stream, " {");
/* Level 1: eight entries per output line. */
1933 if (t.level1_size > 8)
1934 fprintf (stream, "\n ");
1935 for (i = 0; i < t.level1_size; i++)
1938 if (i > 0 && (i % 8) == 0)
1939 fprintf (stream, "\n ");
1940 offset = ((uint32_t *) (t.result + level1_offset))[i];
1942 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the level2 array. */
1944 fprintf (stream, " %5zu",
1945 (offset - level2_offset) / sizeof (uint32_t));
1946 if (i+1 < t.level1_size)
1947 fprintf (stream, ",");
1949 if (t.level1_size > 8)
1950 fprintf (stream, "\n ");
1951 fprintf (stream, " },\n");
1952 fprintf (stream, " {");
/* Level 2: same layout as level 1. */
1953 if (t.level2_size << t.q > 8)
1954 fprintf (stream, "\n ");
1955 for (i = 0; i < t.level2_size << t.q; i++)
1958 if (i > 0 && (i % 8) == 0)
1959 fprintf (stream, "\n ");
1960 offset = ((uint32_t *) (t.result + level2_offset))[i];
1962 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the unpacked level3 array. */
1964 fprintf (stream, " %5zu",
1965 (offset - level3_offset) / sizeof (uint8_t));
1966 if (i+1 < t.level2_size << t.q)
1967 fprintf (stream, ",");
1969 if (t.level2_size << t.q > 8)
1970 fprintf (stream, "\n ");
1971 fprintf (stream, " },\n");
1972 /* Pack the level3 array. Each entry needs 4 bits only. */
1973 fprintf (stream, " {");
/* Half as many bytes as entries: output byte i holds entry 2*i in its low
   nibble and entry 2*i+1 in its high nibble. */
1974 if (t.level3_size << (t.p - 1) > 8)
1975 fprintf (stream, "\n ");
1976 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1978 if (i > 0 && (i % 8) == 0)
1979 fprintf (stream, "\n ");
1980 fprintf (stream, " 0x%02x",
1981 ((uint8_t *) (t.result + level3_offset))[2*i]
1982 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1983 if (i+1 < t.level3_size << (t.p - 1))
1984 fprintf (stream, ",");
1986 if (t.level3_size << (t.p - 1) > 8)
1987 fprintf (stream, "\n ");
1988 fprintf (stream, " }\n");
1989 fprintf (stream, "};\n");
/* fclose is part of the check because it flushes buffered output. */
1991 if (ferror (stream) || fclose (stream))
1993 fprintf (stderr, "error writing to '%s'\n", filename);
1998 /* ========================================================================= */
2000 /* Numeric value. */
2001 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value expressed as the fraction numerator / denominator.
   get_numeric_value sets the denominator to 1 for plain integers and also
   produces the pair 0 / 0; output_numeric_test writes an entry only when
   the pair differs from 0 / 0. */
2003 typedef struct { int numerator; int denominator; } uc_fraction_t;
/* Returns the numeric value of the code point CH as a fraction.
   When CH has an entry in unicode_attributes (its name is not NULL) and
   the entry's numeric field is not empty, the field is parsed: the
   numerator is the leading integer, and the denominator is the integer
   after the '/' if there is one, otherwise 1.
   NOTE(review): the branch structure around the 0 / 0 assignment is not
   visible in this listing; presumably it is the result for characters
   without a numeric field. */
2005 static uc_fraction_t
2006 get_numeric_value (unsigned int ch)
2008 uc_fraction_t value;
2010 if (unicode_attributes[ch].name != NULL
2011 && unicode_attributes[ch].numeric[0] != '\0')
2013 const char *str = unicode_attributes[ch].numeric;
2014 /* str is of the form "integer" or "integer/posinteger". */
/* atoi stops at the first non-digit, so it reads just the part before
   a '/'. */
2015 value.numerator = atoi (str);
2016 if (strchr (str, '/') != NULL)
2017 value.denominator = atoi (strchr (str, '/') + 1);
2019 value.denominator = 1;
2023 value.numerator = 0;
2024 value.denominator = 0;
2029 /* Output the unit test for the per-character numeric value table. */
/* Writes to FILENAME a list of entries of the form
   { 0xXXXX, numerator, denominator }, separated by ",\n", for the code
   points whose get_numeric_value result differs from 0 / 0.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the actions taken after the error messages and the use of
   VERSION (presumably the %s of the header comment) are not visible in
   this listing. */
2031 output_numeric_test (const char *filename, const char *version)
2037 stream = fopen (filename, "w");
2040 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2044 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2045 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2046 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2050 for (ch = 0; ch < 0x110000; ch++)
2052 uc_fraction_t value = get_numeric_value (ch);
2054 if (value.numerator != 0 || value.denominator != 0)
2057 fprintf (stream, ",\n");
2058 fprintf (stream, " { 0x%04X, %d, %d }",
2059 ch, value.numerator, value.denominator);
2064 fprintf (stream, "\n");
/* fclose is part of the check because it flushes buffered output. */
2066 if (ferror (stream) || fclose (stream))
2068 fprintf (stderr, "error writing to '%s'\n", filename);
2073 /* Construction of sparse 3-level tables. */
/* Parameters for the numeric value table: the generated type is named
   numeric_table and each element is one byte; output_numeric stores in it
   an index into its table of fractions.  The xmalloc and xrealloc names
   are mapped onto plain malloc and realloc.
   NOTE(review): the DEFAULT element and the template that consumes these
   macros are not visible in this listing. */
2074 #define TABLE numeric_table
2075 #define ELEMENT uint8_t
2077 #define xmalloc malloc
2078 #define xrealloc realloc
2081 /* Output the per-character numeric value table. */
/* Writes to FILENAME a C source fragment in two parts:
   1. the array u_numeric_values of the distinct fractions that
      get_numeric_value produces over all code points below 0x110000;
   2. five numeric_header_N macros and the static table u_numeric with the
      members level1, level2 and level3, which maps each code point to the
      index of its fraction in u_numeric_values.
   At most 128 distinct fractions are supported, which is what allows the
   level3 entries to be packed into 7 bits each.  level1 and level2 entries
   are printed either as -1 or as an index relative to the start of the
   next level.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the initialization and increment of nfractions, the loop
   exits, the actions taken on the error paths, the conditions that select
   the -1 entries, and the use of VERSION (presumably the %s of the header
   comment) are not visible in this listing. */
2083 output_numeric (const char *filename, const char *version)
2086 uc_fraction_t fractions[128];
2087 unsigned int nfractions;
2088 unsigned int ch, i, j;
2089 struct numeric_table t;
2090 unsigned int level1_offset, level2_offset, level3_offset;
2091 uint16_t *level3_packed;
2093 stream = fopen (filename, "w");
2096 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2100 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2101 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2102 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2105 /* Create table of occurring fractions. */
2107 for (ch = 0; ch < 0x110000; ch++)
2109 uc_fraction_t value = get_numeric_value (ch);
/* Look for this fraction among those already collected. */
2111 for (i = 0; i < nfractions; i++)
2112 if (value.numerator == fractions[i].numerator
2113 && value.denominator == fractions[i].denominator)
/* Not collected yet. */
2115 if (i == nfractions)
/* The array is full. */
2117 if (nfractions == 128)
/* Find the insertion position: entries are ordered by denominator and,
   for equal denominators, by numerator.  Presumably the search stops at
   the first entry that compares greater than the new value. */
2119 for (i = 0; i < nfractions; i++)
2120 if (value.denominator < fractions[i].denominator
2121 || (value.denominator == fractions[i].denominator
2122 && value.numerator < fractions[i].numerator))
/* Shift the tail up by one slot and insert the new fraction at i. */
2124 for (j = nfractions; j > i; j--)
2125 fractions[j] = fractions[j - 1];
2126 fractions[i] = value;
2131 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2133 fprintf (stream, "{\n");
2134 for (i = 0; i < nfractions; i++)
2136 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2137 fractions[i].denominator);
2138 if (i+1 < nfractions)
2139 fprintf (stream, ",");
2140 fprintf (stream, "\n");
2142 fprintf (stream, "};\n");
2146 numeric_table_init (&t);
2148 for (ch = 0; ch < 0x110000; ch++)
2150 uc_fraction_t value = get_numeric_value (ch);
/* Find the index of this character's fraction; every fraction was
   collected by the first pass, so i == nfractions is not expected. */
2152 for (i = 0; i < nfractions; i++)
2153 if (value.numerator == fractions[i].numerator
2154 && value.denominator == fractions[i].denominator)
2156 if (i == nfractions)
2159 numeric_table_add (&t, ch, i);
2162 numeric_table_finalize (&t);
2164 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by the level1
   array, the level2 array and then the level3 array; the three sums below
   are the byte offsets of those arrays. */
2166 5 * sizeof (uint32_t);
2168 5 * sizeof (uint32_t)
2169 + t.level1_size * sizeof (uint32_t);
2171 5 * sizeof (uint32_t)
2172 + t.level1_size * sizeof (uint32_t)
2173 + (t.level2_size << t.q) * sizeof (uint32_t);
2175 for (i = 0; i < 5; i++)
2176 fprintf (stream, "#define numeric_header_%d %d\n", i,
2177 ((uint32_t *) t.result)[i]);
2178 fprintf (stream, "static const\n");
2179 fprintf (stream, "struct\n");
2180 fprintf (stream, " {\n");
2181 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2182 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2183 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2184 (1 << t.p) * 7 / 16);
2185 fprintf (stream, " }\n");
2186 fprintf (stream, "u_numeric =\n");
2187 fprintf (stream, "{\n");
2188 fprintf (stream, " {");
/* Level 1: eight entries per output line. */
2189 if (t.level1_size > 8)
2190 fprintf (stream, "\n ");
2191 for (i = 0; i < t.level1_size; i++)
2194 if (i > 0 && (i % 8) == 0)
2195 fprintf (stream, "\n ");
2196 offset = ((uint32_t *) (t.result + level1_offset))[i];
2198 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the level2 array. */
2200 fprintf (stream, " %5zu",
2201 (offset - level2_offset) / sizeof (uint32_t));
2202 if (i+1 < t.level1_size)
2203 fprintf (stream, ",");
2205 if (t.level1_size > 8)
2206 fprintf (stream, "\n ");
2207 fprintf (stream, " },\n");
2208 fprintf (stream, " {");
/* Level 2: same layout as level 1. */
2209 if (t.level2_size << t.q > 8)
2210 fprintf (stream, "\n ");
2211 for (i = 0; i < t.level2_size << t.q; i++)
2214 if (i > 0 && (i % 8) == 0)
2215 fprintf (stream, "\n ");
2216 offset = ((uint32_t *) (t.result + level2_offset))[i];
2218 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the unpacked level3 array. */
2220 fprintf (stream, " %5zu",
2221 (offset - level3_offset) / sizeof (uint8_t));
2222 if (i+1 < t.level2_size << t.q)
2223 fprintf (stream, ",");
2225 if (t.level2_size << t.q > 8)
2226 fprintf (stream, "\n ");
2227 fprintf (stream, " },\n");
2228 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
2229 not 32-bit units, in order to make the lookup function easier. */
2232 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
/* The buffer is zero-filled and has one 16-bit unit more than the packed
   bits need, so that the access to index j+1 below has a unit to land in
   for the last entries.
   NOTE(review): the calloc result is presumably assigned to level3_packed;
   no NULL check is visible before the loop writes through it. */
2233 for (i = 0; i < t.level3_size << t.p; i++)
/* Entry i occupies bits 7*i .. 7*i+6 of the packed bit stream: j is the
   16-bit unit in which it starts, k its bit position inside that unit.
   Units j and j+1 are combined into one 32-bit value so that an entry
   straddling the unit boundary needs no special case. */
2235 unsigned int j = (i * 7) / 16;
2236 unsigned int k = (i * 7) % 16;
2237 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2238 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2239 level3_packed[j] = value & 0xffff;
2240 level3_packed[j+1] = value >> 16;
2242 fprintf (stream, " {");
/* Level 3: the packed units, eight per output line. */
2243 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2244 fprintf (stream, "\n ");
2245 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
2247 if (i > 0 && (i % 8) == 0)
2248 fprintf (stream, "\n ");
2249 fprintf (stream, " 0x%04x", level3_packed[i]);
2250 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
2251 fprintf (stream, ",");
2253 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
2254 fprintf (stream, "\n ");
2255 fprintf (stream, " }\n");
2256 free (level3_packed);
2257 fprintf (stream, "};\n");
/* fclose is part of the check because it flushes buffered output. */
2259 if (ferror (stream) || fclose (stream))
2261 fprintf (stderr, "error writing to '%s'\n", filename);
2266 /* ========================================================================= */
2269 /* See Unicode 3.0 book, section 4.7,
2272 /* List of mirrored character pairs. This is a subset of the characters
2273 having the BidiMirrored property. */
/* Each row holds the two code points of one pair.  get_mirror_value
   searches both columns, so a pair needs to be listed in one direction
   only.
   NOTE(review): the initializer rows are not visible in this listing. */
2274 static unsigned int mirror_pairs[][2] =
/* Computes the mirror information of the code point CH.
   The mirrored flag is taken from the unicode_attributes entry of CH (it
   is false when CH has no entry).  The partner of CH is looked up in
   mirror_pairs, in either column; 0xfffd serves as the marker for "no
   partner found".  One return path yields the signed distance from CH to
   its partner, mirror_char - ch.
   NOTE(review): the conditions guarding the return paths, the loop exits
   and the remaining return values are not visible in this listing. */
2331 get_mirror_value (unsigned int ch)
2334 unsigned int mirror_char;
2337 mirrored = (unicode_attributes[ch].name != NULL
2338 && unicode_attributes[ch].mirrored);
2339 mirror_char = 0xfffd;
2340 for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++)
2341 if (ch == mirror_pairs[i][0])
2343 mirror_char = mirror_pairs[i][1];
2346 else if (ch == mirror_pairs[i][1])
2348 mirror_char = mirror_pairs[i][0];
2352 return (int) mirror_char - (int) ch;
2355 if (mirror_char != 0xfffd)
2361 /* Construction of sparse 3-level tables. */
/* Parameters for the mirror table: the generated type is named
   mirror_table and each element is a signed 32-bit integer, which can hold
   the signed values that get_mirror_value returns.  The xmalloc and
   xrealloc names are mapped onto plain malloc and realloc.
   NOTE(review): the DEFAULT element and the template that consumes these
   macros are not visible in this listing. */
2362 #define TABLE mirror_table
2363 #define ELEMENT int32_t
2365 #define xmalloc malloc
2366 #define xrealloc realloc
2369 /* Output the per-character mirror table. */
/* Writes to FILENAME a C source fragment consisting of five
   mirror_header_N macros and the static table u_mirror with the members
   level1, level2 and level3.
   The table is filled with get_mirror_value (ch) for every code point
   below 0x110000.  level1 and level2 entries are printed either as -1 or
   as an index relative to the start of the next level; level3 entries are
   printed unpacked, as signed integers.
   Failures to open or write the file are reported on stderr.
   NOTE(review): the conditions that select the -1 entries, the actions
   taken after the error messages, and the use of VERSION (presumably the
   %s of the header comment) are not visible in this listing. */
2371 output_mirror (const char *filename, const char *version)
2375 struct mirror_table t;
2376 unsigned int level1_offset, level2_offset, level3_offset;
2378 stream = fopen (filename, "w");
2381 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2385 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2386 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2387 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
2392 mirror_table_init (&t);
2394 for (ch = 0; ch < 0x110000; ch++)
2396 int value = get_mirror_value (ch);
2398 mirror_table_add (&t, ch, value);
2401 mirror_table_finalize (&t);
2403 /* Offsets in t.result, in memory of this process. */
/* t.result starts with 5 uint32_t header words, followed by the level1
   array, the level2 array and then the level3 array; the three sums below
   are the byte offsets of those arrays. */
2405 5 * sizeof (uint32_t);
2407 5 * sizeof (uint32_t)
2408 + t.level1_size * sizeof (uint32_t);
2410 5 * sizeof (uint32_t)
2411 + t.level1_size * sizeof (uint32_t)
2412 + (t.level2_size << t.q) * sizeof (uint32_t);
2414 for (i = 0; i < 5; i++)
2415 fprintf (stream, "#define mirror_header_%d %d\n", i,
2416 ((uint32_t *) t.result)[i]);
2417 fprintf (stream, "static const\n");
2418 fprintf (stream, "struct\n");
2419 fprintf (stream, " {\n");
2420 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2421 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2422 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2423 fprintf (stream, " }\n");
2424 fprintf (stream, "u_mirror =\n");
2425 fprintf (stream, "{\n");
2426 fprintf (stream, " {");
/* Level 1: eight entries per output line. */
2427 if (t.level1_size > 8)
2428 fprintf (stream, "\n ");
2429 for (i = 0; i < t.level1_size; i++)
2432 if (i > 0 && (i % 8) == 0)
2433 fprintf (stream, "\n ");
2434 offset = ((uint32_t *) (t.result + level1_offset))[i];
2436 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the level2 array. */
2438 fprintf (stream, " %5zu",
2439 (offset - level2_offset) / sizeof (uint32_t));
2440 if (i+1 < t.level1_size)
2441 fprintf (stream, ",");
2443 if (t.level1_size > 8)
2444 fprintf (stream, "\n ");
2445 fprintf (stream, " },\n");
2446 fprintf (stream, " {");
/* Level 2: same layout as level 1. */
2447 if (t.level2_size << t.q > 8)
2448 fprintf (stream, "\n ");
2449 for (i = 0; i < t.level2_size << t.q; i++)
2452 if (i > 0 && (i % 8) == 0)
2453 fprintf (stream, "\n ");
2454 offset = ((uint32_t *) (t.result + level2_offset))[i];
2456 fprintf (stream, " %5d", -1);
/* Convert the byte offset into an index into the level3 array; the
   divisor is the element size, 4 bytes here. */
2458 fprintf (stream, " %5zu",
2459 (offset - level3_offset) / sizeof (int32_t));
2460 if (i+1 < t.level2_size << t.q)
2461 fprintf (stream, ",");
2463 if (t.level2_size << t.q > 8)
2464 fprintf (stream, "\n ");
2465 fprintf (stream, " },\n");
2466 fprintf (stream, " {");
/* Level 3: one signed value per character slot, eight per output line. */
2467 if (t.level3_size << t.p > 8)
2468 fprintf (stream, "\n ");
2469 for (i = 0; i < t.level3_size << t.p; i++)
2471 if (i > 0 && (i % 8) == 0)
2472 fprintf (stream, "\n ");
2473 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2474 if (i+1 < t.level3_size << t.p)
2475 fprintf (stream, ",");
2477 if (t.level3_size << t.p > 8)
2478 fprintf (stream, "\n ");
2479 fprintf (stream, " }\n");
2480 fprintf (stream, "};\n");
/* fclose is part of the check because it flushes buffered output. */
2482 if (ferror (stream) || fclose (stream))
2484 fprintf (stderr, "error writing to '%s'\n", filename);
2489 /* ========================================================================= */
2493 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Enumerators naming the binary character properties.  Each one is used as
   a bit position: fill_properties sets bit (1ULL << PROP_x) in
   unicode_properties[ch] for every code point ch that has property x.
   NOTE(review): the line that opens this enumeration and a number of its
   enumerators are not visible in this listing. */
2502 PROP_QUOTATION_MARK,
2503 PROP_TERMINAL_PUNCTUATION,
2506 PROP_ASCII_HEX_DIGIT,
2507 PROP_OTHER_ALPHABETIC,
2511 PROP_OTHER_LOWERCASE,
2512 PROP_OTHER_UPPERCASE,
2513 PROP_NONCHARACTER_CODE_POINT,
2514 PROP_OTHER_GRAPHEME_EXTEND,
2515 PROP_IDS_BINARY_OPERATOR,
2516 PROP_IDS_TRINARY_OPERATOR,
2518 PROP_UNIFIED_IDEOGRAPH,
2519 PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
2522 PROP_LOGICAL_ORDER_EXCEPTION,
2523 PROP_OTHER_ID_START,
2524 PROP_OTHER_ID_CONTINUE,
2526 PROP_VARIATION_SELECTOR,
2527 PROP_PATTERN_WHITE_SPACE,
2528 PROP_PATTERN_SYNTAX,
2529 /* DerivedCoreProperties.txt */
2538 PROP_DEFAULT_IGNORABLE_CODE_POINT,
2539 PROP_GRAPHEME_EXTEND,
/* One bit mask per code point (0 .. 0x10FFFF).  Bit number PROP_x is set
   when the code point has property x; the masks are cleared by
   clear_properties and filled by fill_properties. */
2543 unsigned long long unicode_properties[0x110000];
/* Resets the property mask of every code point (0 .. 0x10FFFF) to zero. */
2546 clear_properties (void)
2550 for (i = 0; i < 0x110000; i++)
2551 unicode_properties[i] = 0;
2554 /* Stores in unicode_properties[] the properties from the
2555 PropList.txt or DerivedCoreProperties.txt file. */
/* PROPLIST_FILENAME is read line by line.  A data line starts with either
   a code point range "XXXX..YYYY" or a single code point "XXXX" in
   hexadecimal, followed by spaces/semicolons and the property name.  The
   name is mapped to its PROP_* enumerator and the corresponding bit is set
   in unicode_properties[] for every code point of the range.
   Failure to open the file, unparsable lines, unknown property names and
   read errors are reported on stderr.
   NOTE(review): the declaration of buf, the handling of empty and '#'
   lines (presumably they are skipped), the setting of i2 for the single
   code point form, the end-of-file exit and the actions taken after the
   error messages are not visible in this listing. */
2557 fill_properties (const char *proplist_filename)
2562 stream = fopen (proplist_filename, "r");
2565 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2572 unsigned int i1, i2;
2573 char padding[200+1];
2574 char propname[200+1];
2575 unsigned int propvalue;
/* Read the next line, limited to 200 characters, into buf. */
2577 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2580 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form first, then the single code point form.
   NOTE(review): the two scansets carry no field width; they stay within
   padding[] and propname[] only because buf itself was limited to 200
   characters by the fscanf above. */
2583 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
2585 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
2587 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
/* Each PROP line below expands to one link of an if / else-if chain that
   compares propname against a property name and, on a match, stores the
   enumerator in propvalue. */
2592 #define PROP(name,value) \
2593 if (strcmp (propname, name) == 0) propvalue = value; else
2595 PROP ("White_Space", PROP_WHITE_SPACE)
2596 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2597 PROP ("Join_Control", PROP_JOIN_CONTROL)
2598 PROP ("Dash", PROP_DASH)
2599 PROP ("Hyphen", PROP_HYPHEN)
2600 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2601 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2602 PROP ("Other_Math", PROP_OTHER_MATH)
2603 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2604 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2605 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2606 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2607 PROP ("Diacritic", PROP_DIACRITIC)
2608 PROP ("Extender", PROP_EXTENDER)
2609 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2610 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2611 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2612 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2613 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2614 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2615 PROP ("Radical", PROP_RADICAL)
2616 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2617 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2618 PROP ("Deprecated", PROP_DEPRECATED)
2619 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2620 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2621 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2622 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2623 PROP ("STerm", PROP_STERM)
2624 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2625 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2626 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2627 /* DerivedCoreProperties.txt */
2628 PROP ("Math", PROP_MATH)
2629 PROP ("Alphabetic", PROP_ALPHABETIC)
2630 PROP ("Lowercase", PROP_LOWERCASE)
2631 PROP ("Uppercase", PROP_UPPERCASE)
2632 PROP ("ID_Start", PROP_ID_START)
2633 PROP ("ID_Continue", PROP_ID_CONTINUE)
2634 PROP ("XID_Start", PROP_XID_START)
2635 PROP ("XID_Continue", PROP_XID_CONTINUE)
2636 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2637 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2638 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2639 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2642 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
/* The range must be ordered and must stay within 0 .. 0x10FFFF, the size
   of unicode_properties[]. */
2646 if (!(i1 <= i2 && i2 < 0x110000))
2649 for (i = i1; i <= i2; i++)
2650 unicode_properties[i] |= 1ULL << propvalue;
2653 if (ferror (stream) || fclose (stream))
2655 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
/* fill_property30: opens PROPLIST_FILENAME, skips forward to the first line
   that contains PROPERTY_NAME, and then parses the following lines as code
   point entries for ARRAY.
   NOTE(review): the return type, the local declarations and several short
   statements of this function are on lines not shown in this listing. */
2660 /* Stores in array the given property from the Unicode 3.0 PropList.txt
2663 fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
2669 for (i = 0; i < 0x110000; i++)
2672 stream = fopen (proplist_filename, "r");
2675 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2679 /* Search for the "Property dump for: ..." line. */
/* Lines are read at most 100 characters at a time; when no line can be
   read before property_name is found, "no property found" is reported. */
2682 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
2684 fprintf (stderr, "no property found in '%s'\n", proplist_filename);
2688 while (strstr (buf, property_name) == NULL);
2692 unsigned int i1, i2;
2694 if (fscanf (stream, "%100[^\n]\n", buf) < 1)
/* Range form: four hex digits, "..", four hex digits. */
2698 if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
2700 if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
2702 fprintf (stderr, "parse error in property in '%s'\n",
/* Single code point form: four hex digits. */
2707 else if (strlen (buf) >= 4)
2709 if (sscanf (buf, "%4X", &i1) < 1)
2711 fprintf (stderr, "parse error in property in '%s'\n",
2719 fprintf (stderr, "parse error in property in '%s'\n",
/* Sanity check of the parsed range against the code space limit 0x110000;
   the action taken when it fails is on a line not shown here. */
2723 if (!(i1 <= i2 && i2 < 0x110000))
2725 for (i = i1; i <= i2; i++)
2728 if (ferror (stream) || fclose (stream))
2730 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
2735 /* Properties from Unicode 3.0 PropList.txt file. */
/* Both arrays have one char per code point (0x110000 entries) and are
   passed to fill_property30 by fill_properties30. */
2737 /* The paired punctuation property from the PropList.txt file. */
2738 char unicode_pairedpunctuation[0x110000];
2740 /* The left of pair property from the PropList.txt file. */
2741 char unicode_leftofpair[0x110000];
/* Loads the "(Paired Punctuation)" and "(Left of Pair)" sections of
   PROPLIST30_FILENAME into unicode_pairedpunctuation and
   unicode_leftofpair, one fill_property30 call per property. */
2744 fill_properties30 (const char *proplist30_filename)
2746 fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
2747 fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
2750 /* ------------------------------------------------------------------------- */
2752 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_WHITE_SPACE bit is set in unicode_properties[ch]. */
2754 is_property_white_space (unsigned int ch)
2756 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
2759 /* See Unicode 3.0 book, section 4.10,
2760 PropList.txt, UCD.html,
2761 DerivedCoreProperties.txt, UCD.html. */
/* Evaluates the property in two ways: once from the PROP_OTHER_ALPHABETIC
   bit combined with the explicit code point ranges below, and once from the
   PROP_ALPHABETIC bit.  The two results are then compared.
   NOTE(review): the first operand of the || chain and the action taken when
   the results differ are on lines not shown in this listing. */
2763 is_property_alphabetic (unsigned int ch)
2767 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
2768 /* For some reason, the following are listed as having property
2769 Alphabetic but not as having property Other_Alphabetic. */
2770 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
2771 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
2772 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
2773 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
2774 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
2775 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
2776 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
2777 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
2778 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
2779 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
2780 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
2781 || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */
2783 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
/* Consistency check between the two derivations. */
2785 if (result1 != result2)
2790 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_ALPHABETIC bit is set in
   unicode_properties[ch]. */
2792 is_property_other_alphabetic (unsigned int ch)
2794 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
2797 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_NONCHARACTER_CODE_POINT bit is set in
   unicode_properties[ch]. */
2799 is_property_not_a_character (unsigned int ch)
2801 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
2804 /* See PropList.txt, UCD.html,
2805 DerivedCoreProperties.txt, UCD.html. */
/* Evaluates the property in two ways: from is_category_Cf (minus the
   excluded code points below) together with the
   PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT and PROP_VARIATION_SELECTOR bits,
   and from the PROP_DEFAULT_IGNORABLE_CODE_POINT bit.  The two results are
   then compared; the action taken on a mismatch is not shown here. */
2807 is_property_default_ignorable_code_point (unsigned int ch)
2810 (is_category_Cf (ch)
2811 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
2812 && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F))
2813 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
2814 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2816 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2818 if (result1 != result2)
2823 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT bit is set in
   unicode_properties[ch]. */
2825 is_property_other_default_ignorable_code_point (unsigned int ch)
2827 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
2830 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_DEPRECATED bit is set in unicode_properties[ch]. */
2832 is_property_deprecated (unsigned int ch)
2834 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
2837 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_LOGICAL_ORDER_EXCEPTION bit is set in
   unicode_properties[ch]. */
2839 is_property_logical_order_exception (unsigned int ch)
2841 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
2844 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_VARIATION_SELECTOR bit is set in
   unicode_properties[ch]. */
2846 is_property_variation_selector (unsigned int ch)
2848 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
2851 /* See PropList-3.0.1.txt. */
/* True for code points in one of the three hard-coded ranges
   U+E000..U+F8FF, U+F0000..U+FFFFD and U+100000..U+10FFFD. */
2853 is_property_private_use (unsigned int ch)
2855 /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */
2856 return (ch >= 0xE000 && ch <= 0xF8FF)
2857 || (ch >= 0xF0000 && ch <= 0xFFFFD)
2858 || (ch >= 0x100000 && ch <= 0x10FFFD);
2861 /* See PropList-3.0.1.txt. */
/* True when is_category_Cn (ch) holds and is_property_not_a_character (ch)
   does not. */
2863 is_property_unassigned_code_value (unsigned int ch)
2865 return (is_category_Cn (ch) && !is_property_not_a_character (ch));
2868 /* See PropList.txt, UCD.html,
2869 DerivedCoreProperties.txt, UCD.html. */
/* Combines the PROP_OTHER_UPPERCASE bit with a first operand that is not
   shown in this listing, and compares the outcome with the PROP_UPPERCASE
   bit.  The action taken on a mismatch is not shown either. */
2871 is_property_uppercase (unsigned int ch)
2875 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2877 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
2879 if (result1 != result2)
2884 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_UPPERCASE bit is set in
   unicode_properties[ch]. */
2886 is_property_other_uppercase (unsigned int ch)
2888 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
2891 /* See PropList.txt, UCD.html,
2892 DerivedCoreProperties.txt, UCD.html. */
/* Combines the PROP_OTHER_LOWERCASE bit with a first operand that is not
   shown in this listing, and compares the outcome with the PROP_LOWERCASE
   bit.  The action taken on a mismatch is not shown either. */
2894 is_property_lowercase (unsigned int ch)
2898 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2900 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
2902 if (result1 != result2)
2907 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_LOWERCASE bit is set in
   unicode_properties[ch]. */
2909 is_property_other_lowercase (unsigned int ch)
2911 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
2914 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_Lt (ch). */
2916 is_property_titlecase (unsigned int ch)
2918 return is_category_Lt (ch);
2921 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_SOFT_DOTTED bit is set in unicode_properties[ch]. */
2923 is_property_soft_dotted (unsigned int ch)
2925 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
2928 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_ID_START bit is set in unicode_properties[ch]. */
2930 is_property_id_start (unsigned int ch)
2932 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
2935 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_ID_START bit is set in
   unicode_properties[ch]. */
2937 is_property_other_id_start (unsigned int ch)
2939 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
2942 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_ID_CONTINUE bit is set in unicode_properties[ch]. */
2944 is_property_id_continue (unsigned int ch)
2946 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
2949 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_ID_CONTINUE bit is set in
   unicode_properties[ch]. */
2951 is_property_other_id_continue (unsigned int ch)
2953 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
2956 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_XID_START bit is set in unicode_properties[ch]. */
2958 is_property_xid_start (unsigned int ch)
2960 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
2963 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_XID_CONTINUE bit is set in unicode_properties[ch]. */
2965 is_property_xid_continue (unsigned int ch)
2967 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
2970 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_PATTERN_WHITE_SPACE bit is set in
   unicode_properties[ch]. */
2972 is_property_pattern_white_space (unsigned int ch)
2974 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
2977 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_PATTERN_SYNTAX bit is set in
   unicode_properties[ch]. */
2979 is_property_pattern_syntax (unsigned int ch)
2981 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
2984 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_JOIN_CONTROL bit is set in unicode_properties[ch]. */
2986 is_property_join_control (unsigned int ch)
2988 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
2991 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_GRAPHEME_BASE bit is set in
   unicode_properties[ch]. */
2993 is_property_grapheme_base (unsigned int ch)
2995 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
2998 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_GRAPHEME_EXTEND bit is set in
   unicode_properties[ch]. */
3000 is_property_grapheme_extend (unsigned int ch)
3002 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3005 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_GRAPHEME_EXTEND bit is set in
   unicode_properties[ch]. */
3007 is_property_other_grapheme_extend (unsigned int ch)
3009 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3012 /* See DerivedCoreProperties.txt, UCD.html. */
/* Tests whether the PROP_GRAPHEME_LINK bit is set in
   unicode_properties[ch]. */
3014 is_property_grapheme_link (unsigned int ch)
3016 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3019 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_BIDI_CONTROL bit is set in unicode_properties[ch]. */
3021 is_property_bidi_control (unsigned int ch)
3023 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3026 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_L. */
3028 is_property_bidi_left_to_right (unsigned int ch)
3030 return (get_bidi_category (ch) == UC_BIDI_L);
3033 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_R. */
3035 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3037 return (get_bidi_category (ch) == UC_BIDI_R);
3040 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_AL. */
3042 is_property_bidi_arabic_right_to_left (unsigned int ch)
3044 return (get_bidi_category (ch) == UC_BIDI_AL);
3047 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_EN. */
3049 is_property_bidi_european_digit (unsigned int ch)
3051 return (get_bidi_category (ch) == UC_BIDI_EN);
3054 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_ES. */
3056 is_property_bidi_eur_num_separator (unsigned int ch)
3058 return (get_bidi_category (ch) == UC_BIDI_ES);
3061 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_ET. */
3063 is_property_bidi_eur_num_terminator (unsigned int ch)
3065 return (get_bidi_category (ch) == UC_BIDI_ET);
3068 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_AN. */
3070 is_property_bidi_arabic_digit (unsigned int ch)
3072 return (get_bidi_category (ch) == UC_BIDI_AN);
3075 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_CS. */
3077 is_property_bidi_common_separator (unsigned int ch)
3079 return (get_bidi_category (ch) == UC_BIDI_CS);
3082 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_B. */
3084 is_property_bidi_block_separator (unsigned int ch)
3086 return (get_bidi_category (ch) == UC_BIDI_B);
3089 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_S. */
3091 is_property_bidi_segment_separator (unsigned int ch)
3093 return (get_bidi_category (ch) == UC_BIDI_S);
3096 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_WS. */
3098 is_property_bidi_whitespace (unsigned int ch)
3100 return (get_bidi_category (ch) == UC_BIDI_WS);
3103 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_NSM. */
3105 is_property_bidi_non_spacing_mark (unsigned int ch)
3107 return (get_bidi_category (ch) == UC_BIDI_NSM);
3110 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_BN. */
3112 is_property_bidi_boundary_neutral (unsigned int ch)
3114 return (get_bidi_category (ch) == UC_BIDI_BN);
3117 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_PDF. */
3119 is_property_bidi_pdf (unsigned int ch)
3121 return (get_bidi_category (ch) == UC_BIDI_PDF);
3124 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) is one of UC_BIDI_LRE, UC_BIDI_LRO,
   UC_BIDI_RLE or UC_BIDI_RLO. */
3126 is_property_bidi_embedding_or_override (unsigned int ch)
3128 int category = get_bidi_category (ch);
3129 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3130 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3133 /* See PropList-3.0.1.txt. */
/* True when get_bidi_category (ch) equals UC_BIDI_ON. */
3135 is_property_bidi_other_neutral (unsigned int ch)
3137 return (get_bidi_category (ch) == UC_BIDI_ON);
3140 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_HEX_DIGIT bit is set in unicode_properties[ch]. */
3142 is_property_hex_digit (unsigned int ch)
3144 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3147 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_ASCII_HEX_DIGIT bit is set in
   unicode_properties[ch]. */
3149 is_property_ascii_hex_digit (unsigned int ch)
3151 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3154 /* See Unicode 3.0 book, section 4.10,
3155 PropList.txt, UCD.html. */
/* Tests whether the PROP_IDEOGRAPHIC bit is set in unicode_properties[ch]. */
3157 is_property_ideographic (unsigned int ch)
3159 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3162 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_UNIFIED_IDEOGRAPH bit is set in
   unicode_properties[ch]. */
3164 is_property_unified_ideograph (unsigned int ch)
3166 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3169 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_RADICAL bit is set in unicode_properties[ch]. */
3171 is_property_radical (unsigned int ch)
3173 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3176 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_IDS_BINARY_OPERATOR bit is set in
   unicode_properties[ch]. */
3178 is_property_ids_binary_operator (unsigned int ch)
3180 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3183 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_IDS_TRINARY_OPERATOR bit is set in
   unicode_properties[ch]. */
3185 is_property_ids_trinary_operator (unsigned int ch)
3187 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3190 /* See PropList-3.0.1.txt. */
/* True when is_category_Cf (ch) holds, or when the character has a name
   and that name contains the substring "ZERO WIDTH". */
3192 is_property_zero_width (unsigned int ch)
3194 return is_category_Cf (ch)
3195 || (unicode_attributes[ch].name != NULL
3196 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
3199 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_Zs (ch). */
3201 is_property_space (unsigned int ch)
3203 return is_category_Zs (ch);
3206 /* See PropList-3.0.1.txt. */
/* Compares ch against the fixed list of code points spelled out below;
   no table lookup is involved. */
3208 is_property_non_break (unsigned int ch)
3210 /* This is exactly the set of characters having line breaking
3212 return (ch == 0x00A0 /* NO-BREAK SPACE */
3213 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
3214 || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */
3215 || ch == 0x035D /* COMBINING DOUBLE BREVE */
3216 || ch == 0x035E /* COMBINING DOUBLE MACRON */
3217 || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */
3218 || ch == 0x0360 /* COMBINING DOUBLE TILDE */
3219 || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
3220 || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
3221 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
3222 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
3223 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
3224 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
3225 || ch == 0x2007 /* FIGURE SPACE */
3226 || ch == 0x2011 /* NON-BREAKING HYPHEN */
3227 || ch == 0x202F /* NARROW NO-BREAK SPACE */);
3230 /* See PropList-3.0.1.txt. */
/* Evaluates two tests - whether the character's name is exactly
   "<control>", and is_category_Cc (ch) - and compares their results.
   NOTE(review): the action taken on a mismatch and the returned value are
   on lines not shown in this listing. */
3232 is_property_iso_control (unsigned int ch)
3235 (unicode_attributes[ch].name != NULL
3236 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3238 is_category_Cc (ch);
3240 if (result1 != result2)
3245 /* See PropList-3.0.1.txt. */
/* Requires is_category_Cf (ch), bidi category UC_BIDI_BN and that
   is_property_join_control (ch) does not hold.
   NOTE(review): the expression continues on a line not shown in this
   listing, so further conditions may apply. */
3247 is_property_format_control (unsigned int ch)
3249 return (is_category_Cf (ch)
3250 && get_bidi_category (ch) == UC_BIDI_BN
3251 && !is_property_join_control (ch)
3255 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_DASH bit is set in unicode_properties[ch]. */
3257 is_property_dash (unsigned int ch)
3259 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3262 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_HYPHEN bit is set in unicode_properties[ch]. */
3264 is_property_hyphen (unsigned int ch)
3266 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
3269 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_P (ch). */
3271 is_property_punctuation (unsigned int ch)
3273 return is_category_P (ch);
3276 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_Zl (ch). */
3278 is_property_line_separator (unsigned int ch)
3280 return is_category_Zl (ch);
3283 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_Zp (ch). */
3285 is_property_paragraph_separator (unsigned int ch)
3287 return is_category_Zp (ch);
3290 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_QUOTATION_MARK bit is set in
   unicode_properties[ch]. */
3292 is_property_quotation_mark (unsigned int ch)
3294 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3297 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_STERM bit is set in unicode_properties[ch]. */
3299 is_property_sentence_terminal (unsigned int ch)
3301 return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0);
3304 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_TERMINAL_PUNCTUATION bit is set in
   unicode_properties[ch]. */
3306 is_property_terminal_punctuation (unsigned int ch)
3308 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
3311 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_Sc (ch). */
3313 is_property_currency_symbol (unsigned int ch)
3315 return is_category_Sc (ch);
3318 /* See Unicode 3.0 book, section 4.9,
3319 PropList.txt, UCD.html,
3320 DerivedCoreProperties.txt, UCD.html. */
/* Combines the PROP_OTHER_MATH bit with a first operand that is not shown
   in this listing, and compares the outcome with the PROP_MATH bit.  The
   action taken on a mismatch is not shown either. */
3322 is_property_math (unsigned int ch)
3326 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3328 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3330 if (result1 != result2)
3335 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_OTHER_MATH bit is set in unicode_properties[ch]. */
3337 is_property_other_math (unsigned int ch)
3339 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3342 /* See PropList-3.0.1.txt. */
/* Returns the unicode_pairedpunctuation[ch] entry. */
3344 is_property_paired_punctuation (unsigned int ch)
3346 return unicode_pairedpunctuation[ch];
3349 /* See PropList-3.0.1.txt. */
/* Returns the unicode_leftofpair[ch] entry. */
3351 is_property_left_of_pair (unsigned int ch)
3353 return unicode_leftofpair[ch];
3356 /* See PropList-3.0.1.txt. */
/* True when the character has a name and either its combining field
   differs from the string "0" or one of is_category_Mc, is_category_Me,
   is_category_Mn holds. */
3358 is_property_combining (unsigned int ch)
3360 return (unicode_attributes[ch].name != NULL
3361 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3362 || is_category_Mc (ch)
3363 || is_category_Me (ch)
3364 || is_category_Mn (ch)));
3367 #if 0 /* same as is_property_bidi_non_spacing_mark */
3368 /* See PropList-3.0.1.txt. */
/* Compiled out by the #if 0 above.  Would test for a named character whose
   bidi category is UC_BIDI_NSM. */
3370 is_property_non_spacing (unsigned int ch)
3372 return (unicode_attributes[ch].name != NULL
3373 && get_bidi_category (ch) == UC_BIDI_NSM);
3377 /* See PropList-3.0.1.txt. */
/* Tests the code point range 0xAC00..0xD7A4 first and otherwise inspects
   the decomposition string of characters that have both a name and a
   decomposition.  The statements executed for the range test and the final
   fallback are on lines not shown in this listing.
   NOTE(review): the last assigned Hangul syllable is U+D7A3, so the upper
   bound 0xD7A4 looks off by one - confirm against the Unicode code charts.
   NOTE(review): strchr (decomp, '>') + 1 is used without a NULL check, so
   it relies on every '<' tag being closed by '>'. */
3379 is_property_composite (unsigned int ch)
3381 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3382 logical in some sense. */
3383 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3385 if (unicode_attributes[ch].name != NULL
3386 && unicode_attributes[ch].decomposition != NULL)
3388 /* Test whether the decomposition contains more than one character,
3389 and the first is not a space. */
3390 const char *decomp = unicode_attributes[ch].decomposition;
/* Step over a leading "<...>" tag, if present. */
3391 if (decomp[0] == '<')
3393 decomp = strchr (decomp, '>') + 1;
3394 if (decomp[0] == ' ')
/* More than one element (a space separator exists) and the first element
   is not "0020". */
3397 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3402 /* See PropList-3.0.1.txt. */
/* Delegates to is_category_Nd (ch). */
3404 is_property_decimal_digit (unsigned int ch)
3406 return is_category_Nd (ch);
3409 /* See PropList-3.0.1.txt. */
/* True when get_numeric_value (ch) has a denominator greater than zero, or
   when ch is one of the two code points listed explicitly below. */
3411 is_property_numeric (unsigned int ch)
3413 return ((get_numeric_value (ch)).denominator > 0)
3414 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3415 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3418 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_DIACRITIC bit is set in unicode_properties[ch]. */
3420 is_property_diacritic (unsigned int ch)
3422 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3425 /* See PropList.txt, UCD.html. */
/* Tests whether the PROP_EXTENDER bit is set in unicode_properties[ch]. */
3427 is_property_extender (unsigned int ch)
3429 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3432 /* See PropList-3.0.1.txt. */
/* Accepts characters for which is_category_Cc (ch) holds together with
   bidi category UC_BIDI_BN, as well as characters for which
   is_category_Cf (ch) holds.
   NOTE(review): the expression continues on a line not shown in this
   listing, so further conditions may apply. */
3434 is_property_ignorable_control (unsigned int ch)
3436 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3437 || is_category_Cf (ch))
3441 /* ------------------------------------------------------------------------- */
3443 /* Output all properties. */
/* For every property P listed below, the PROPERTY macro calls
   debug_output_predicate, output_predicate_test and output_predicate with
   file names derived from P and with the predicate is_property_P; VERSION
   is forwarded to output_predicate.
   NOTE(review): some PROPERTY(...) invocations are on lines not shown in
   this listing, so the list below is incomplete. */
3445 output_properties (const char *version)
3447 #define PROPERTY(P) \
3448 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3449 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3450 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3451 PROPERTY(white_space)
3452 PROPERTY(alphabetic)
3453 PROPERTY(other_alphabetic)
3454 PROPERTY(not_a_character)
3455 PROPERTY(default_ignorable_code_point)
3456 PROPERTY(other_default_ignorable_code_point)
3457 PROPERTY(deprecated)
3458 PROPERTY(logical_order_exception)
3459 PROPERTY(variation_selector)
3460 PROPERTY(private_use)
3461 PROPERTY(unassigned_code_value)
3463 PROPERTY(other_uppercase)
3465 PROPERTY(other_lowercase)
3467 PROPERTY(soft_dotted)
3469 PROPERTY(other_id_start)
3470 PROPERTY(id_continue)
3471 PROPERTY(other_id_continue)
3473 PROPERTY(xid_continue)
3474 PROPERTY(pattern_white_space)
3475 PROPERTY(pattern_syntax)
3476 PROPERTY(join_control)
3477 PROPERTY(grapheme_base)
3478 PROPERTY(grapheme_extend)
3479 PROPERTY(other_grapheme_extend)
3480 PROPERTY(grapheme_link)
3481 PROPERTY(bidi_control)
3482 PROPERTY(bidi_left_to_right)
3483 PROPERTY(bidi_hebrew_right_to_left)
3484 PROPERTY(bidi_arabic_right_to_left)
3485 PROPERTY(bidi_european_digit)
3486 PROPERTY(bidi_eur_num_separator)
3487 PROPERTY(bidi_eur_num_terminator)
3488 PROPERTY(bidi_arabic_digit)
3489 PROPERTY(bidi_common_separator)
3490 PROPERTY(bidi_block_separator)
3491 PROPERTY(bidi_segment_separator)
3492 PROPERTY(bidi_whitespace)
3493 PROPERTY(bidi_non_spacing_mark)
3494 PROPERTY(bidi_boundary_neutral)
3496 PROPERTY(bidi_embedding_or_override)
3497 PROPERTY(bidi_other_neutral)
3499 PROPERTY(ascii_hex_digit)
3500 PROPERTY(ideographic)
3501 PROPERTY(unified_ideograph)
3503 PROPERTY(ids_binary_operator)
3504 PROPERTY(ids_trinary_operator)
3505 PROPERTY(zero_width)
3508 PROPERTY(iso_control)
3509 PROPERTY(format_control)
3512 PROPERTY(punctuation)
3513 PROPERTY(line_separator)
3514 PROPERTY(paragraph_separator)
3515 PROPERTY(quotation_mark)
3516 PROPERTY(sentence_terminal)
3517 PROPERTY(terminal_punctuation)
3518 PROPERTY(currency_symbol)
3520 PROPERTY(other_math)
3521 PROPERTY(paired_punctuation)
3522 PROPERTY(left_of_pair)
3525 PROPERTY(decimal_digit)
3529 PROPERTY(ignorable_control)
3533 /* ========================================================================= */
/* Script names in order of first registration by fill_scripts; room for
   256 entries. */
3537 static const char *scripts[256];
/* Number of entries of scripts[] in use. */
3538 static unsigned int numscripts;
/* Script index of every code point; fill_scripts initializes each entry to
   (uint8_t)~(uint8_t)0, which stands for "no script assigned". */
3540 static uint8_t unicode_scripts[0x110000];
/* Reads SCRIPTS_FILENAME line by line and stores, for each code point
   mentioned, the index of its script in unicode_scripts[].  Script names
   not seen before are appended to scripts[].
   NOTE(review): the return type, local declarations and several short
   statements are on lines not shown in this listing. */
3543 fill_scripts (const char *scripts_filename)
3548 stream = fopen (scripts_filename, "r");
3551 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
/* Start with every code point marked as having no script. */
3557 for (i = 0; i < 0x110000; i++)
3558 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
3563 unsigned int i1, i2;
3564 char padding[200+1];
3565 char scriptname[200+1];
/* One input line, at most 200 characters, per iteration. */
3568 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Detect empty lines and lines starting with '#'. */
3571 if (buf[0] == '\0' || buf[0] == '#')
/* Try the range form "X..Y ; Name" first, then the single code point form
   "X ; Name". */
3574 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
3576 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
3578 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
/* Look the name up among the scripts registered so far, newest first. */
3588 for (script = numscripts - 1; script >= 0; script--)
3589 if (strcmp (scripts[script], scriptname) == 0)
/* Register a new script name. */
3593 scripts[numscripts] = strdup (scriptname);
3594 script = numscripts;
3596 if (numscripts == 256)
3600 for (i = i1; i <= i2; i++)
/* A code point that already has a script is reported on stderr; the new
   assignment below still overwrites the earlier one. */
3602 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
3603 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
3604 unicode_scripts[i] = script;
3608 if (ferror (stream) || fclose (stream))
3610 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
3615 /* Construction of sparse 3-level tables. */
/* Parameters for a table type named script_table with uint8_t elements and
   (uint8_t)~(uint8_t)0 as the default value; xmalloc and xrealloc are mapped
   to plain malloc and realloc.  Presumably consumed by a template include
   on a line not shown in this listing - TODO confirm. */
3616 #define TABLE script_table
3617 #define ELEMENT uint8_t
3618 #define DEFAULT (uint8_t)~(uint8_t)0
3619 #define xmalloc malloc
3620 #define xrealloc realloc
/* Writes "unictype/scripts.h".  The generated file contains one
   uc_interval_t array per script, the scripts[] table that refers to these
   arrays, the script_header_N macros and a three-level lookup structure
   named u_script built from unicode_scripts[].  VERSION is used in the
   generated header comment.
   NOTE(review): many short lines of this function (declarations, braces,
   conditions) are not shown in this listing.
   NOTE(review): several fprintf calls below use %d for arguments declared
   unsigned int (numscripts, i) or read as uint32_t; %u would match. */
3624 output_scripts (const char *version)
3626 const char *filename = "unictype/scripts.h";
3628 unsigned int ch, s, i;
3629 struct script_table t;
3630 unsigned int level1_offset, level2_offset, level3_offset;
3634 const char *lowercase_name;
3637 scriptinfo_t scriptinfo[256];
3639 stream = fopen (filename, "w");
3642 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3646 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3647 fprintf (stream, "/* Unicode scripts. */\n");
3648 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* Prepare a copy of each script name for use in generated identifiers;
   presumably lowercased, the converting statement is not shown here. */
3651 for (s = 0; s < numscripts; s++)
3653 char *lcp = strdup (scripts[s]);
3656 for (cp = lcp; *cp != '\0'; cp++)
3657 if (*cp >= 'A' && *cp <= 'Z')
3660 scriptinfo[s].lowercase_name = lcp;
/* Emit the interval array of each script. */
3663 for (s = 0; s < numscripts; s++)
3665 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
3666 scriptinfo[s].lowercase_name);
3667 fprintf (stream, "{\n");
3669 for (ch = 0; ch < 0x110000; ch++)
3670 if (unicode_scripts[ch] == s)
/* Extend the run while the next code point belongs to the same script. */
3676 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
3681 fprintf (stream, ",\n");
3683 fprintf (stream, " { 0x%04X, 1, 1 }", start);
3685 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
3689 fprintf (stream, "\n");
3690 fprintf (stream, "};\n");
/* Emit the scripts[] table: interval count, interval array and name. */
3693 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
3694 fprintf (stream, "{\n");
3695 for (s = 0; s < numscripts; s++)
3697 fprintf (stream, " {\n");
3698 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
3699 scriptinfo[s].lowercase_name);
3700 fprintf (stream, " script_%s_intervals,\n",
3701 scriptinfo[s].lowercase_name);
3702 fprintf (stream, " \"%s\"\n", scripts[s]);
3703 fprintf (stream, " }");
3704 if (s+1 < numscripts)
3705 fprintf (stream, ",");
3706 fprintf (stream, "\n");
3708 fprintf (stream, "};\n");
/* Build the three-level table from all code points that have a script. */
3712 script_table_init (&t);
3714 for (ch = 0; ch < 0x110000; ch++)
3716 unsigned int s = unicode_scripts[ch];
3717 if (s != (uint8_t)~(uint8_t)0)
3718 script_table_add (&t, ch, s);
3721 script_table_finalize (&t);
3723 /* Offsets in t.result, in memory of this process. */
3725 5 * sizeof (uint32_t);
3727 5 * sizeof (uint32_t)
3728 + t.level1_size * sizeof (uint32_t);
3730 5 * sizeof (uint32_t)
3731 + t.level1_size * sizeof (uint32_t)
3732 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become script_header_N macros. */
3734 for (i = 0; i < 5; i++)
3735 fprintf (stream, "#define script_header_%d %d\n", i,
3736 ((uint32_t *) t.result)[i]);
3737 fprintf (stream, "static const\n");
3738 fprintf (stream, "struct\n");
3739 fprintf (stream, " {\n");
3740 fprintf (stream, " int level1[%zu];\n", t.level1_size);
3741 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
3742 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
3743 fprintf (stream, " }\n");
3744 fprintf (stream, "u_script =\n");
3745 fprintf (stream, "{\n");
/* Level 1: each entry is printed either as -1 or as an index relative to
   level2_offset; the condition choosing between them is not shown here. */
3746 fprintf (stream, " {");
3747 if (t.level1_size > 8)
3748 fprintf (stream, "\n ");
3749 for (i = 0; i < t.level1_size; i++)
3752 if (i > 0 && (i % 8) == 0)
3753 fprintf (stream, "\n ");
3754 offset = ((uint32_t *) (t.result + level1_offset))[i];
3756 fprintf (stream, " %5d", -1);
3758 fprintf (stream, " %5zu",
3759 (offset - level2_offset) / sizeof (uint32_t));
3760 if (i+1 < t.level1_size)
3761 fprintf (stream, ",");
3763 if (t.level1_size > 8)
3764 fprintf (stream, "\n ");
3765 fprintf (stream, " },\n");
/* Level 2: same scheme, with indices relative to level3_offset. */
3766 fprintf (stream, " {");
3767 if (t.level2_size << t.q > 8)
3768 fprintf (stream, "\n ");
3769 for (i = 0; i < t.level2_size << t.q; i++)
3772 if (i > 0 && (i % 8) == 0)
3773 fprintf (stream, "\n ");
3774 offset = ((uint32_t *) (t.result + level2_offset))[i];
3776 fprintf (stream, " %5d", -1);
3778 fprintf (stream, " %5zu",
3779 (offset - level3_offset) / sizeof (uint8_t));
3780 if (i+1 < t.level2_size << t.q)
3781 fprintf (stream, ",");
3783 if (t.level2_size << t.q > 8)
3784 fprintf (stream, "\n ");
3785 fprintf (stream, " },\n");
/* Level 3: the uint8_t values themselves, eight per output line. */
3786 fprintf (stream, " {");
3787 if (t.level3_size << t.p > 8)
3788 fprintf (stream, "\n ");
3789 for (i = 0; i < t.level3_size << t.p; i++)
3791 if (i > 0 && (i % 8) == 0)
3792 fprintf (stream, "\n ");
3793 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
3794 if (i+1 < t.level3_size << t.p)
3795 fprintf (stream, ",");
3797 if (t.level3_size << t.p > 8)
3798 fprintf (stream, "\n ");
3799 fprintf (stream, " }\n");
3800 fprintf (stream, "};\n");
3802 if (ferror (stream) || fclose (stream))
3804 fprintf (stderr, "error writing to '%s'\n", filename);
/* Writes "unictype/scripts_byname.gperf": a fixed preamble of declarations
   followed by one "name, index" line per entry of scripts[], where index
   is the position of the script in scripts[].  VERSION is used in the
   generated header comment. */
3810 output_scripts_byname (const char *version)
3812 const char *filename = "unictype/scripts_byname.gperf";
3816 stream = fopen (filename, "w");
3819 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3823 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3824 fprintf (stream, "/* Unicode scripts. */\n");
3825 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
3827 fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n");
3828 fprintf (stream, "%%struct-type\n");
3829 fprintf (stream, "%%language=ANSI-C\n");
3830 fprintf (stream, "%%define hash-function-name scripts_hash\n");
3831 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
3832 fprintf (stream, "%%readonly-tables\n");
3833 fprintf (stream, "%%global-table\n");
3834 fprintf (stream, "%%define word-array-name script_names\n");
3835 fprintf (stream, "%%%%\n");
/* One keyword line per script. */
3836 for (s = 0; s < numscripts; s++)
3837 fprintf (stream, "%s, %u\n", scripts[s], s);
3839 if (ferror (stream) || fclose (stream))
3841 fprintf (stderr, "error writing to '%s'\n", filename);
3846 /* ========================================================================= */
/* Description of one block: start and end code point and the block name.
   The line carrying the typedef name is not shown in this listing; the
   array declaration below uses it as block_t. */
3850 typedef struct { unsigned int start; unsigned int end; const char *name; }
/* Blocks in file order, room for 256 entries. */
3852 static block_t blocks[256];
/* Number of entries of blocks[] in use. */
3853 static unsigned int numblocks;
/* Reads BLOCKS_FILENAME and records one blocks[] entry (start, end, name)
   for every line of the form "X..Y; Name".
   NOTE(review): the return type, local declarations and several short
   statements are on lines not shown in this listing. */
3856 fill_blocks (const char *blocks_filename)
3860 stream = fopen (blocks_filename, "r");
3863 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
3870 unsigned int i1, i2;
3871 char padding[200+1];
3872 char blockname[200+1];
/* One input line, at most 200 characters, per iteration. */
3874 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Detect empty lines and lines starting with '#'. */
3877 if (buf[0] == '\0' || buf[0] == '#')
/* The block name extends up to the next carriage return or the end of buf. */
3880 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
3882 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
3885 blocks[numblocks].start = i1;
3886 blocks[numblocks].end = i2;
3887 blocks[numblocks].name = strdup (blockname);
3888 /* It must be sorted. */
3889 if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start))
3892 if (numblocks == 256)
3896 if (ferror (stream) || fclose (stream))
3898 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
3903 /* Return the smallest block index among the blocks for characters >= ch. */
/* Bisects blocks[0 .. numblocks-1] on the end field.
   NOTE(review): the loop header, the updates of lo and hi and the return
   statement are on lines not shown in this listing; the two lines starting
   with "All blocks[i]" are the tail of a comment whose opening line is
   missing here. */
3905 block_first_index (unsigned int ch)
3907 /* Binary search. */
3908 unsigned int lo = 0;
3909 unsigned int hi = numblocks;
3911 All blocks[i], i < lo, have blocks[i].end < ch,
3912 all blocks[i], i >= hi, have blocks[i].end >= ch. */
3915 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3916 if (blocks[mid].end < ch)
/* block_last_index: bisects blocks[0 .. numblocks-1] on the start field.
   NOTE(review): the end of the leading comment, the loop header, the
   updates of lo and hi and the return statement are on lines not shown in
   this listing. */
3924 /* Return the largest block index among the blocks for characters <= ch,
3927 block_last_index (unsigned int ch)
3929 /* Binary search. */
3930 unsigned int lo = 0;
3931 unsigned int hi = numblocks;
3933 All blocks[i], i < lo, have blocks[i].start <= ch,
3934 all blocks[i], i >= hi, have blocks[i].start > ch. */
3937 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
3938 if (blocks[mid].start <= ch)
/* Writes "unictype/blocks.h".  The generated file contains the blocks[]
   table, the blocks_level1 array holding - for each group of 1 << shift
   code points below threshold - the results of block_first_index and
   block_last_index, and two macros with the same indices for the code
   points from threshold up to 0x10FFFF.  VERSION is used in the generated
   header comment.
   NOTE(review): shift, threshold and the index values are unsigned int but
   are printed with %d; %u would match. */
3947 output_blocks (const char *version)
3949 const char *filename = "unictype/blocks.h";
3950 const unsigned int shift = 8; /* bits to shift away for array access */
3951 const unsigned int threshold = 0x30000; /* cut-off table here to save space */
3956 stream = fopen (filename, "w");
3959 fprintf (stderr, "cannot open '%s' for writing\n", filename);
3963 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
3964 fprintf (stream, "/* Unicode blocks. */\n");
3965 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
/* The blocks[] table, one initializer per block. */
3968 fprintf (stream, "static const uc_block_t blocks[] =\n");
3969 fprintf (stream, "{\n");
3970 for (i = 0; i < numblocks; i++)
3972 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
3973 blocks[i].end, blocks[i].name);
3974 if (i+1 < numblocks)
3975 fprintf (stream, ",");
3976 fprintf (stream, "\n");
3978 fprintf (stream, "};\n");
3979 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
3980 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
/* Two entries per group: first and last block index for that group. */
3981 fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n",
3982 threshold >> shift);
3983 fprintf (stream, "{\n");
3984 for (i1 = 0; i1 < (threshold >> shift); i1++)
3986 unsigned int first_index = block_first_index (i1 << shift);
3987 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
3988 fprintf (stream, " %3d, %3d", first_index, last_index);
3989 if (i1+1 < (threshold >> shift))
3990 fprintf (stream, ",");
3991 fprintf (stream, "\n");
3993 fprintf (stream, "};\n");
/* Index bounds for the code points not covered by blocks_level1. */
3994 fprintf (stream, "#define blocks_upper_first_index %d\n",
3995 block_first_index (threshold));
3996 fprintf (stream, "#define blocks_upper_last_index %d\n",
3997 block_last_index (0x10FFFF));
3999 if (ferror (stream) || fclose (stream))
4001 fprintf (stderr, "error writing to '%s'\n", filename);
4006 /* ========================================================================= */
4008 /* C and Java syntax. */
/* Identifier classification values, returned for instance by
   c_ident_category.  Presumably the enumerators of an enum whose opening
   and closing lines are not shown in this listing - TODO confirm. */
4012 UC_IDENTIFIER_START, /* valid as first or subsequent character */
4013 UC_IDENTIFIER_VALID, /* valid as subsequent character only */
4014 UC_IDENTIFIER_INVALID, /* not valid */
4015 UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
/* Predicate: tests whether CH is one of the six white-space characters
   compared against below (space, horizontal tab, new-line, carriage return,
   vertical tab, form-feed).  It is referenced as is_c_whitespace through
   PROPERTY(c_whitespace) in output_ident_properties.  */
4018 /* ISO C 99 section 6.4.(3). */
4020 is_c_whitespace (unsigned int ch)
4022 return (ch == ' ' /* space */
4023 || ch == '\t' /* horizontal tab */
4024 || ch == '\n' || ch == '\r' /* new-line */
4025 || ch == '\v' /* vertical tab */
4026 || ch == '\f'); /* form-feed */
/* Classifies CH according to the identifier rules of ISO C 99.
   Result, as far as the visible statements show:
     - UC_IDENTIFIER_VALID for the ASCII digits '0'..'9';
     - UC_IDENTIFIER_START for ASCII letters and '_', and for every code
       point matched by the long chain of range tests below;
     - UC_IDENTIFIER_INVALID for everything else.
   NOTE(review): the line opening the big condition and some of its terms
   are not visible in this view; presumably the chain mirrors the ranges
   listed in appendix D of the standard -- confirm against it.
   NOTE(review): the ranges from 0x0660..0x0669 through 0x0F20..0x0F33 look
   like digit ranges, yet as shown they share the UC_IDENTIFIER_START return
   with the letters -- confirm that this is intended.  */
4029 /* ISO C 99 section 6.4.2.1 and appendix D. */
4031 c_ident_category (unsigned int ch)
4033 /* Section 6.4.2.1. */
4034 if (ch >= '0' && ch <= '9')
4035 return UC_IDENTIFIER_VALID;
4036 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
4037 return UC_IDENTIFIER_START;
4043 || (ch >= 0x00C0 && ch <= 0x00D6)
4044 || (ch >= 0x00D8 && ch <= 0x00F6)
4045 || (ch >= 0x00F8 && ch <= 0x01F5)
4046 || (ch >= 0x01FA && ch <= 0x0217)
4047 || (ch >= 0x0250 && ch <= 0x02A8)
4048 || (ch >= 0x1E00 && ch <= 0x1E9B)
4049 || (ch >= 0x1EA0 && ch <= 0x1EF9)
4053 || (ch >= 0x0388 && ch <= 0x038A)
4055 || (ch >= 0x038E && ch <= 0x03A1)
4056 || (ch >= 0x03A3 && ch <= 0x03CE)
4057 || (ch >= 0x03D0 && ch <= 0x03D6)
4062 || (ch >= 0x03E2 && ch <= 0x03F3)
4063 || (ch >= 0x1F00 && ch <= 0x1F15)
4064 || (ch >= 0x1F18 && ch <= 0x1F1D)
4065 || (ch >= 0x1F20 && ch <= 0x1F45)
4066 || (ch >= 0x1F48 && ch <= 0x1F4D)
4067 || (ch >= 0x1F50 && ch <= 0x1F57)
4071 || (ch >= 0x1F5F && ch <= 0x1F7D)
4072 || (ch >= 0x1F80 && ch <= 0x1FB4)
4073 || (ch >= 0x1FB6 && ch <= 0x1FBC)
4074 || (ch >= 0x1FC2 && ch <= 0x1FC4)
4075 || (ch >= 0x1FC6 && ch <= 0x1FCC)
4076 || (ch >= 0x1FD0 && ch <= 0x1FD3)
4077 || (ch >= 0x1FD6 && ch <= 0x1FDB)
4078 || (ch >= 0x1FE0 && ch <= 0x1FEC)
4079 || (ch >= 0x1FF2 && ch <= 0x1FF4)
4080 || (ch >= 0x1FF6 && ch <= 0x1FFC)
4082 || (ch >= 0x0401 && ch <= 0x040C)
4083 || (ch >= 0x040E && ch <= 0x044F)
4084 || (ch >= 0x0451 && ch <= 0x045C)
4085 || (ch >= 0x045E && ch <= 0x0481)
4086 || (ch >= 0x0490 && ch <= 0x04C4)
4087 || (ch >= 0x04C7 && ch <= 0x04C8)
4088 || (ch >= 0x04CB && ch <= 0x04CC)
4089 || (ch >= 0x04D0 && ch <= 0x04EB)
4090 || (ch >= 0x04EE && ch <= 0x04F5)
4091 || (ch >= 0x04F8 && ch <= 0x04F9)
4093 || (ch >= 0x0531 && ch <= 0x0556)
4094 || (ch >= 0x0561 && ch <= 0x0587)
4096 || (ch >= 0x05B0 && ch <= 0x05B9)
4097 || (ch >= 0x05BB && ch <= 0x05BD)
4099 || (ch >= 0x05C1 && ch <= 0x05C2)
4100 || (ch >= 0x05D0 && ch <= 0x05EA)
4101 || (ch >= 0x05F0 && ch <= 0x05F2)
4103 || (ch >= 0x0621 && ch <= 0x063A)
4104 || (ch >= 0x0640 && ch <= 0x0652)
4105 || (ch >= 0x0670 && ch <= 0x06B7)
4106 || (ch >= 0x06BA && ch <= 0x06BE)
4107 || (ch >= 0x06C0 && ch <= 0x06CE)
4108 || (ch >= 0x06D0 && ch <= 0x06DC)
4109 || (ch >= 0x06E5 && ch <= 0x06E8)
4110 || (ch >= 0x06EA && ch <= 0x06ED)
4112 || (ch >= 0x0901 && ch <= 0x0903)
4113 || (ch >= 0x0905 && ch <= 0x0939)
4114 || (ch >= 0x093E && ch <= 0x094D)
4115 || (ch >= 0x0950 && ch <= 0x0952)
4116 || (ch >= 0x0958 && ch <= 0x0963)
4118 || (ch >= 0x0981 && ch <= 0x0983)
4119 || (ch >= 0x0985 && ch <= 0x098C)
4120 || (ch >= 0x098F && ch <= 0x0990)
4121 || (ch >= 0x0993 && ch <= 0x09A8)
4122 || (ch >= 0x09AA && ch <= 0x09B0)
4124 || (ch >= 0x09B6 && ch <= 0x09B9)
4125 || (ch >= 0x09BE && ch <= 0x09C4)
4126 || (ch >= 0x09C7 && ch <= 0x09C8)
4127 || (ch >= 0x09CB && ch <= 0x09CD)
4128 || (ch >= 0x09DC && ch <= 0x09DD)
4129 || (ch >= 0x09DF && ch <= 0x09E3)
4130 || (ch >= 0x09F0 && ch <= 0x09F1)
4133 || (ch >= 0x0A05 && ch <= 0x0A0A)
4134 || (ch >= 0x0A0F && ch <= 0x0A10)
4135 || (ch >= 0x0A13 && ch <= 0x0A28)
4136 || (ch >= 0x0A2A && ch <= 0x0A30)
4137 || (ch >= 0x0A32 && ch <= 0x0A33)
4138 || (ch >= 0x0A35 && ch <= 0x0A36)
4139 || (ch >= 0x0A38 && ch <= 0x0A39)
4140 || (ch >= 0x0A3E && ch <= 0x0A42)
4141 || (ch >= 0x0A47 && ch <= 0x0A48)
4142 || (ch >= 0x0A4B && ch <= 0x0A4D)
4143 || (ch >= 0x0A59 && ch <= 0x0A5C)
4147 || (ch >= 0x0A81 && ch <= 0x0A83)
4148 || (ch >= 0x0A85 && ch <= 0x0A8B)
4150 || (ch >= 0x0A8F && ch <= 0x0A91)
4151 || (ch >= 0x0A93 && ch <= 0x0AA8)
4152 || (ch >= 0x0AAA && ch <= 0x0AB0)
4153 || (ch >= 0x0AB2 && ch <= 0x0AB3)
4154 || (ch >= 0x0AB5 && ch <= 0x0AB9)
4155 || (ch >= 0x0ABD && ch <= 0x0AC5)
4156 || (ch >= 0x0AC7 && ch <= 0x0AC9)
4157 || (ch >= 0x0ACB && ch <= 0x0ACD)
4161 || (ch >= 0x0B01 && ch <= 0x0B03)
4162 || (ch >= 0x0B05 && ch <= 0x0B0C)
4163 || (ch >= 0x0B0F && ch <= 0x0B10)
4164 || (ch >= 0x0B13 && ch <= 0x0B28)
4165 || (ch >= 0x0B2A && ch <= 0x0B30)
4166 || (ch >= 0x0B32 && ch <= 0x0B33)
4167 || (ch >= 0x0B36 && ch <= 0x0B39)
4168 || (ch >= 0x0B3E && ch <= 0x0B43)
4169 || (ch >= 0x0B47 && ch <= 0x0B48)
4170 || (ch >= 0x0B4B && ch <= 0x0B4D)
4171 || (ch >= 0x0B5C && ch <= 0x0B5D)
4172 || (ch >= 0x0B5F && ch <= 0x0B61)
4174 || (ch >= 0x0B82 && ch <= 0x0B83)
4175 || (ch >= 0x0B85 && ch <= 0x0B8A)
4176 || (ch >= 0x0B8E && ch <= 0x0B90)
4177 || (ch >= 0x0B92 && ch <= 0x0B95)
4178 || (ch >= 0x0B99 && ch <= 0x0B9A)
4180 || (ch >= 0x0B9E && ch <= 0x0B9F)
4181 || (ch >= 0x0BA3 && ch <= 0x0BA4)
4182 || (ch >= 0x0BA8 && ch <= 0x0BAA)
4183 || (ch >= 0x0BAE && ch <= 0x0BB5)
4184 || (ch >= 0x0BB7 && ch <= 0x0BB9)
4185 || (ch >= 0x0BBE && ch <= 0x0BC2)
4186 || (ch >= 0x0BC6 && ch <= 0x0BC8)
4187 || (ch >= 0x0BCA && ch <= 0x0BCD)
4189 || (ch >= 0x0C01 && ch <= 0x0C03)
4190 || (ch >= 0x0C05 && ch <= 0x0C0C)
4191 || (ch >= 0x0C0E && ch <= 0x0C10)
4192 || (ch >= 0x0C12 && ch <= 0x0C28)
4193 || (ch >= 0x0C2A && ch <= 0x0C33)
4194 || (ch >= 0x0C35 && ch <= 0x0C39)
4195 || (ch >= 0x0C3E && ch <= 0x0C44)
4196 || (ch >= 0x0C46 && ch <= 0x0C48)
4197 || (ch >= 0x0C4A && ch <= 0x0C4D)
4198 || (ch >= 0x0C60 && ch <= 0x0C61)
4200 || (ch >= 0x0C82 && ch <= 0x0C83)
4201 || (ch >= 0x0C85 && ch <= 0x0C8C)
4202 || (ch >= 0x0C8E && ch <= 0x0C90)
4203 || (ch >= 0x0C92 && ch <= 0x0CA8)
4204 || (ch >= 0x0CAA && ch <= 0x0CB3)
4205 || (ch >= 0x0CB5 && ch <= 0x0CB9)
4206 || (ch >= 0x0CBE && ch <= 0x0CC4)
4207 || (ch >= 0x0CC6 && ch <= 0x0CC8)
4208 || (ch >= 0x0CCA && ch <= 0x0CCD)
4210 || (ch >= 0x0CE0 && ch <= 0x0CE1)
4212 || (ch >= 0x0D02 && ch <= 0x0D03)
4213 || (ch >= 0x0D05 && ch <= 0x0D0C)
4214 || (ch >= 0x0D0E && ch <= 0x0D10)
4215 || (ch >= 0x0D12 && ch <= 0x0D28)
4216 || (ch >= 0x0D2A && ch <= 0x0D39)
4217 || (ch >= 0x0D3E && ch <= 0x0D43)
4218 || (ch >= 0x0D46 && ch <= 0x0D48)
4219 || (ch >= 0x0D4A && ch <= 0x0D4D)
4220 || (ch >= 0x0D60 && ch <= 0x0D61)
4222 || (ch >= 0x0E01 && ch <= 0x0E3A)
4223 || (ch >= 0x0E40 && ch <= 0x0E5B)
4225 || (ch >= 0x0E81 && ch <= 0x0E82)
4227 || (ch >= 0x0E87 && ch <= 0x0E88)
4230 || (ch >= 0x0E94 && ch <= 0x0E97)
4231 || (ch >= 0x0E99 && ch <= 0x0E9F)
4232 || (ch >= 0x0EA1 && ch <= 0x0EA3)
4235 || (ch >= 0x0EAA && ch <= 0x0EAB)
4236 || (ch >= 0x0EAD && ch <= 0x0EAE)
4237 || (ch >= 0x0EB0 && ch <= 0x0EB9)
4238 || (ch >= 0x0EBB && ch <= 0x0EBD)
4239 || (ch >= 0x0EC0 && ch <= 0x0EC4)
4241 || (ch >= 0x0EC8 && ch <= 0x0ECD)
4242 || (ch >= 0x0EDC && ch <= 0x0EDD)
4245 || (ch >= 0x0F18 && ch <= 0x0F19)
4249 || (ch >= 0x0F3E && ch <= 0x0F47)
4250 || (ch >= 0x0F49 && ch <= 0x0F69)
4251 || (ch >= 0x0F71 && ch <= 0x0F84)
4252 || (ch >= 0x0F86 && ch <= 0x0F8B)
4253 || (ch >= 0x0F90 && ch <= 0x0F95)
4255 || (ch >= 0x0F99 && ch <= 0x0FAD)
4256 || (ch >= 0x0FB1 && ch <= 0x0FB7)
4259 || (ch >= 0x10A0 && ch <= 0x10C5)
4260 || (ch >= 0x10D0 && ch <= 0x10F6)
4262 || (ch >= 0x3041 && ch <= 0x3093)
4263 || (ch >= 0x309B && ch <= 0x309C)
4265 || (ch >= 0x30A1 && ch <= 0x30F6)
4266 || (ch >= 0x30FB && ch <= 0x30FC)
4268 || (ch >= 0x3105 && ch <= 0x312C)
4269 /* CJK Unified Ideographs */
4270 || (ch >= 0x4E00 && ch <= 0x9FA5)
4272 || (ch >= 0xAC00 && ch <= 0xD7A3)
4274 || (ch >= 0x0660 && ch <= 0x0669)
4275 || (ch >= 0x06F0 && ch <= 0x06F9)
4276 || (ch >= 0x0966 && ch <= 0x096F)
4277 || (ch >= 0x09E6 && ch <= 0x09EF)
4278 || (ch >= 0x0A66 && ch <= 0x0A6F)
4279 || (ch >= 0x0AE6 && ch <= 0x0AEF)
4280 || (ch >= 0x0B66 && ch <= 0x0B6F)
4281 || (ch >= 0x0BE7 && ch <= 0x0BEF)
4282 || (ch >= 0x0C66 && ch <= 0x0C6F)
4283 || (ch >= 0x0CE6 && ch <= 0x0CEF)
4284 || (ch >= 0x0D66 && ch <= 0x0D6F)
4285 || (ch >= 0x0E50 && ch <= 0x0E59)
4286 || (ch >= 0x0ED0 && ch <= 0x0ED9)
4287 || (ch >= 0x0F20 && ch <= 0x0F33)
4288 /* Special characters */
4291 || (ch >= 0x02B0 && ch <= 0x02B8)
4293 || (ch >= 0x02BD && ch <= 0x02C1)
4294 || (ch >= 0x02D0 && ch <= 0x02D1)
4295 || (ch >= 0x02E0 && ch <= 0x02E4)
4301 || (ch >= 0x203F && ch <= 0x2040)
4304 || (ch >= 0x210A && ch <= 0x2113)
4306 || (ch >= 0x2118 && ch <= 0x211D)
4310 || (ch >= 0x212A && ch <= 0x2131)
4311 || (ch >= 0x2133 && ch <= 0x2138)
4312 || (ch >= 0x2160 && ch <= 0x2182)
4313 || (ch >= 0x3005 && ch <= 0x3007)
4314 || (ch >= 0x3021 && ch <= 0x3029)
4316 return UC_IDENTIFIER_START;
4317 return UC_IDENTIFIER_INVALID;
/* Predicate: tests whether CH is one of the five characters compared
   against below (space, tab, form-feed, new-line, carriage return).
   Unlike is_c_whitespace, vertical tab is not in the list.  It is referenced
   as is_java_whitespace through PROPERTY(java_whitespace) in
   output_ident_properties.  */
4320 /* The Java Language Specification, 3rd edition, §3.6.
4321 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */
4323 is_java_whitespace (unsigned int ch)
4325 return (ch == ' ' || ch == '\t' || ch == '\f'
4326 || ch == '\n' || ch == '\r');
/* Classifies CH according to the Java identifier rules, using the general
   category predicates is_category_* defined elsewhere in this file.
   As far as the visible tests show, the result is:
     - UC_IDENTIFIER_START for categories L, Nl, Sc, Pc;
     - UC_IDENTIFIER_VALID for categories Nd, Mc, Mn;
     - UC_IDENTIFIER_IGNORABLE for U+0000..U+0008, U+000E..U+001B,
       U+007F..U+009F and category Cf;
     - UC_IDENTIFIER_INVALID otherwise.
   The tests are tried in that order, so the first match wins.
   NOTE(review): each condition may have further terms on lines not visible
   in this view.  CH is unsigned, so the `ch >= 0x0000' comparison is always
   true.  */
4329 /* The Java Language Specification, 3rd edition, §3.8.
4330 http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625
4331 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
4333 java_ident_category (unsigned int ch)
4335 /* FIXME: Check this against Sun's JDK implementation. */
4336 if (is_category_L (ch) /* = Character.isLetter(ch) */
4337 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
4338 || is_category_Sc (ch) /* currency symbol */
4339 || is_category_Pc (ch) /* connector punctuation */
4341 return UC_IDENTIFIER_START;
4342 if (is_category_Nd (ch) /* digit */
4343 || is_category_Mc (ch) /* combining mark */
4344 || is_category_Mn (ch) /* non-spacing mark */
4346 return UC_IDENTIFIER_VALID;
4347 if ((ch >= 0x0000 && ch <= 0x0008)
4348 || (ch >= 0x000E && ch <= 0x001B)
4349 || (ch >= 0x007F && ch <= 0x009F)
4350 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
4352 return UC_IDENTIFIER_IGNORABLE;
4353 return UC_IDENTIFIER_INVALID;
/* Parameter macros for the table code used by output_ident_category:
   the table type is named identsyntax_table, its elements are uint8_t and
   absent entries default to UC_IDENTIFIER_INVALID.
   NOTE(review): presumably these are consumed by an #include of a generic
   table template on a line not visible here; that would be where
   identsyntax_table_init/_add/_finalize come from -- confirm.
   xmalloc and xrealloc are mapped to plain malloc and realloc here.  */
4356 /* Construction of sparse 3-level tables. */
4357 #define TABLE identsyntax_table
4358 #define ELEMENT uint8_t
4359 #define DEFAULT UC_IDENTIFIER_INVALID
4360 #define xmalloc malloc
4361 #define xrealloc realloc
/* Writes a C header with the identifier syntax category of every
   Unicode character, as a three-level table.

   FILENAME   path of the file to create (opened with mode "w").
   PREDICATE  classification function; called once for every code point
              0 .. 0x10FFFF; only results other than UC_IDENTIFIER_INVALID
              are stored in the table.
   NAME       identifier given to the generated struct variable.
   VERSION    NOTE(review): presumably the Unicode version printed in the
              "Generated automatically" line; the argument line of that
              fprintf is not visible in this view.

   The generated file contains five identsyntax_header_N macros, followed
   by a struct with the arrays level1 (int), level2 (short) and level3
   (unsigned short).  Level 1 and level 2 entries are printed either as -1
   or as an index relative to the start of the next level; the test that
   selects between the two is not visible in this view.  Errors on open or
   on write are reported on stderr; what follows those messages is not
   visible here.  */
4364 /* Output an identifier syntax categorization in a three-level bitmap. */
4366 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
4370 struct identsyntax_table t;
4371 unsigned int level1_offset, level2_offset, level3_offset;
4373 stream = fopen (filename, "w");
4376 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4380 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4381 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
4382 fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n",
4387 identsyntax_table_init (&t);
4389 for (ch = 0; ch < 0x110000; ch++)
4391 int syntaxcode = predicate (ch);
4392 if (syntaxcode != UC_IDENTIFIER_INVALID)
4393 identsyntax_table_add (&t, ch, syntaxcode);
4396 identsyntax_table_finalize (&t);
4398 /* Offsets in t.result, in memory of this process. */
4400 5 * sizeof (uint32_t);
4402 5 * sizeof (uint32_t)
4403 + t.level1_size * sizeof (uint32_t);
4405 5 * sizeof (uint32_t)
4406 + t.level1_size * sizeof (uint32_t)
4407 + (t.level2_size << t.q) * sizeof (uint32_t);
/* The first five uint32_t words of t.result are emitted as macros.  */
4409 for (i = 0; i < 5; i++)
4410 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
4411 ((uint32_t *) t.result)[i]);
4412 fprintf (stream, "static const\n");
4413 fprintf (stream, "struct\n");
4414 fprintf (stream, " {\n");
4415 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4416 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4417 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4418 (1 << t.p) * 2 / 16);
4419 fprintf (stream, " }\n");
4420 fprintf (stream, "%s =\n", name);
4421 fprintf (stream, "{\n");
4422 fprintf (stream, " {");
4423 if (t.level1_size > 8)
4424 fprintf (stream, "\n ");
/* Level 1: eight entries per output line; offsets are rebased to the start
   of level 2 and counted in uint32_t units.  */
4425 for (i = 0; i < t.level1_size; i++)
4428 if (i > 0 && (i % 8) == 0)
4429 fprintf (stream, "\n ");
4430 offset = ((uint32_t *) (t.result + level1_offset))[i];
4432 fprintf (stream, " %5d", -1);
4434 fprintf (stream, " %5zu",
4435 (offset - level2_offset) / sizeof (uint32_t));
4436 if (i+1 < t.level1_size)
4437 fprintf (stream, ",");
4439 if (t.level1_size > 8)
4440 fprintf (stream, "\n ");
4441 fprintf (stream, " },\n");
4442 fprintf (stream, " {");
4443 if (t.level2_size << t.q > 8)
4444 fprintf (stream, "\n ");
/* Level 2: same layout; offsets are rebased to the start of level 3 and
   counted in uint8_t units.  */
4445 for (i = 0; i < t.level2_size << t.q; i++)
4448 if (i > 0 && (i % 8) == 0)
4449 fprintf (stream, "\n ");
4450 offset = ((uint32_t *) (t.result + level2_offset))[i];
4452 fprintf (stream, " %5d", -1);
4454 fprintf (stream, " %5zu",
4455 (offset - level3_offset) / sizeof (uint8_t));
4456 if (i+1 < t.level2_size << t.q)
4457 fprintf (stream, ",");
4459 if (t.level2_size << t.q > 8)
4460 fprintf (stream, "\n ");
4461 fprintf (stream, " },\n");
4462 /* Pack the level3 array. Each entry needs 2 bits only. */
4463 fprintf (stream, " {");
4464 if ((t.level3_size << t.p) * 2 / 16 > 8)
4465 fprintf (stream, "\n ");
4466 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4468 if (i > 0 && (i % 8) == 0)
4469 fprintf (stream, "\n ");
/* Eight consecutive level 3 bytes are OR-ed into one 16-bit word, the
   lowest index going into the lowest two bits.  */
4470 fprintf (stream, " 0x%04x",
4471 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4472 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4473 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4474 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4475 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4476 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4477 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4478 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4479 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4480 fprintf (stream, ",");
4482 if ((t.level3_size << t.p) * 2 / 16 > 8)
4483 fprintf (stream, "\n ");
4484 fprintf (stream, " }\n");
4485 fprintf (stream, "};\n");
4487 if (ferror (stream) || fclose (stream))
4489 fprintf (stderr, "error writing to '%s'\n", filename);
/* Emits all language syntax outputs for the given Unicode VERSION.
   For each of c_whitespace and java_whitespace the PROPERTY macro produces
   three files: a debug listing (unictype/sy_<P>.txt), a test source
   (../tests/unictype/test-sy_<P>.c) and the table header
   (unictype/sy_<P>.h).  Afterwards the identifier category tables for C and
   for Java are written through output_ident_category.  */
4495 output_ident_properties (const char *version)
4497 #define PROPERTY(P) \
4498 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
4499 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4500 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
4501 PROPERTY(c_whitespace)
4502 PROPERTY(java_whitespace)
4505 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
4506 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
4509 /* ========================================================================= */
4511 /* Like ISO C <ctype.h> and <wctype.h>. Compatible with glibc's
4512 glibc/localedata/locales/i18n file, generated by
4513 glibc/localedata/gen-unicode-ctype.c. */
4515 /* Character mappings. */
/* Upper-case mapping of CH.  When CH is an assigned character (its name is
   non-NULL) and has an upper-case mapping (upper != NONE), that mapping is
   returned.
   NOTE(review): the fall-through return is not visible in this view;
   callers compare the result with CH (see is_lower), so presumably CH
   itself is returned in that case -- confirm.  */
4518 to_upper (unsigned int ch)
4520 if (unicode_attributes[ch].name != NULL
4521 && unicode_attributes[ch].upper != NONE)
4522 return unicode_attributes[ch].upper;
/* Lower-case mapping of CH.  When CH is an assigned character (its name is
   non-NULL) and has a lower-case mapping (lower != NONE), that mapping is
   returned.
   NOTE(review): the fall-through return is not visible in this view;
   callers compare the result with CH (see is_upper), so presumably CH
   itself is returned in that case -- confirm.  */
4528 to_lower (unsigned int ch)
4530 if (unicode_attributes[ch].name != NULL
4531 && unicode_attributes[ch].lower != NONE)
4532 return unicode_attributes[ch].lower;
/* Title-case mapping of CH.  When CH is an assigned character (its name is
   non-NULL) and has a title-case mapping (title != NONE), that mapping is
   returned.
   NOTE(review): the fall-through return is not visible in this view;
   output_charmap compares the result with CH, so presumably CH itself is
   returned in that case -- confirm.  */
4538 to_title (unsigned int ch)
4540 if (unicode_attributes[ch].name != NULL
4541 && unicode_attributes[ch].title != NONE)
4542 return unicode_attributes[ch].title;
4547 /* Character class properties. */
/* CH is classified as upper case when to_lower maps it to a different
   character.  */
4550 is_upper (unsigned int ch)
4552 return (to_lower (ch) != ch);
/* CH is classified as lower case when to_upper maps it to a different
   character.
   NOTE(review): the return expression continues after the comment below;
   the clause that the comment announces (for U+00DF) is on a line not
   visible in this view.  */
4556 is_lower (unsigned int ch)
4558 return (to_upper (ch) != ch)
4559 /* <U00DF> is lowercase, but without simple to_upper mapping. */
/* Predicate for the "alpha" class.  An assigned character (name non-NULL)
   qualifies when any of the visible alternatives holds:
     - its general category starts with 'L', except U+0E2F and U+0E46;
     - it lies in U+0E34..U+0E3A or U+0E47..U+0E4E;
     - its general category is "Nl";
     - its general category is "So" and its name contains " LETTER ";
     - its general category is "Nd" and it is not an ASCII digit.
   NOTE(review): the existing comments also mention <U0E31>, <U0345> and
   the range <U24B6>..<U24E9>; the corresponding tests are on lines not
   visible in this view.  */
4564 is_alpha (unsigned int ch)
4566 return (unicode_attributes[ch].name != NULL
4567 && ((unicode_attributes[ch].category[0] == 'L'
4568 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4569 <U0E2F>, <U0E46> should belong to is_punct. */
4570 && (ch != 0x0E2F) && (ch != 0x0E46))
4571 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
4572 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
4574 || (ch >= 0x0E34 && ch <= 0x0E3A)
4575 || (ch >= 0x0E47 && ch <= 0x0E4E)
4576 /* Avoid warning for <U0345>. */
4578 /* Avoid warnings for <U2160>..<U217F>. */
4579 || (unicode_attributes[ch].category[0] == 'N'
4580 && unicode_attributes[ch].category[1] == 'l')
4581 /* Avoid warnings for <U24B6>..<U24E9>. */
4582 || (unicode_attributes[ch].category[0] == 'S'
4583 && unicode_attributes[ch].category[1] == 'o'
4584 && strstr (unicode_attributes[ch].name, " LETTER ")
4586 /* Consider all the non-ASCII digits as alphabetic.
4587 ISO C 99 forbids us to have them in category "digit",
4588 but we want iswalnum to return true on them. */
4589 || (unicode_attributes[ch].category[0] == 'N'
4590 && unicode_attributes[ch].category[1] == 'd'
4591 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Predicate for the "digit" class.  Two alternative definitions are
   visible: one accepting every assigned character of general category
   "Nd", and one accepting only the ASCII digits U+0030..U+0039.
   NOTE(review): presumably a preprocessor conditional on lines not visible
   here selects one of them; is_alpha's comment ("ISO C 99 forbids us to
   have them in category digit") suggests the ASCII-only variant is the
   active one -- confirm.  */
4595 is_digit (unsigned int ch)
4598 return (unicode_attributes[ch].name != NULL
4599 && unicode_attributes[ch].category[0] == 'N'
4600 && unicode_attributes[ch].category[1] == 'd');
4601 /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
4602 a zero. Must add <0> in front of them by hand. */
4604 /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
4607 The iswdigit function tests for any wide character that corresponds
4608 to a decimal-digit character (as defined in 5.2.1).
4610 the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
4612 return (ch >= 0x0030 && ch <= 0x0039);
/* Predicate for the "outdigit" class: true exactly for the ASCII digits
   U+0030..U+0039.  */
4617 is_outdigit (unsigned int ch)
4619 return (ch >= 0x0030 && ch <= 0x0039);
/* Predicate for the "alnum" class: the union of is_alpha and is_digit.  */
4623 is_alnum (unsigned int ch)
4625 return is_alpha (ch) || is_digit (ch);
/* Predicate for the "blank" class: the horizontal tab U+0009, plus every
   assigned character of general category "Zs" whose decomposition field
   does not contain the tag "<noBreak>".
   NOTE(review): strstr is applied to the decomposition field directly, so
   this assumes the field is never NULL for an assigned character --
   confirm against the code that fills unicode_attributes.  */
4629 is_blank (unsigned int ch)
4631 return (ch == 0x0009 /* '\t' */
4632 /* Category Zs without mention of "<noBreak>" */
4633 || (unicode_attributes[ch].name != NULL
4634 && unicode_attributes[ch].category[0] == 'Z'
4635 && unicode_attributes[ch].category[1] == 's'
4636 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
/* Predicate for the "space" class: the six ASCII white-space characters
   listed below, plus assigned characters of general category "Zl" or "Zp",
   plus those of category "Zs" that pass a strstr test on their
   decomposition field.
   NOTE(review): the end of the expression (the second strstr argument) is
   on a line not visible in this view; per the comment below it is
   presumably "<noBreak>", as in is_blank -- confirm.  */
4640 is_space (unsigned int ch)
4642 /* Don't make U+00A0 a space. Non-breaking space means that all programs
4643 should treat it like a punctuation character, not like a space. */
4644 return (ch == 0x0020 /* ' ' */
4645 || ch == 0x000C /* '\f' */
4646 || ch == 0x000A /* '\n' */
4647 || ch == 0x000D /* '\r' */
4648 || ch == 0x0009 /* '\t' */
4649 || ch == 0x000B /* '\v' */
4650 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
4651 || (unicode_attributes[ch].name != NULL
4652 && unicode_attributes[ch].category[0] == 'Z'
4653 && (unicode_attributes[ch].category[1] == 'l'
4654 || unicode_attributes[ch].category[1] == 'p'
4655 || (unicode_attributes[ch].category[1] == 's'
4656 && !strstr (unicode_attributes[ch].decomposition,
/* Predicate for the "cntrl" class: assigned characters whose name is
   exactly "<control>", plus those of general category "Zl" or "Zp".  */
4661 is_cntrl (unsigned int ch)
4663 return (unicode_attributes[ch].name != NULL
4664 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
4665 /* Categories Zl and Zp */
4666 || (unicode_attributes[ch].category[0] == 'Z'
4667 && (unicode_attributes[ch].category[1] == 'l'
4668 || unicode_attributes[ch].category[1] == 'p'))));
/* Predicate for the "xdigit" class.  Two alternative definitions are
   visible: is_digit plus the letters A..F and a..f, and the purely ASCII
   set 0..9, A..F, a..f.
   NOTE(review): presumably a preprocessor conditional on lines not visible
   here selects one of them -- confirm which is active.  */
4672 is_xdigit (unsigned int ch)
4675 return is_digit (ch)
4676 || (ch >= 0x0041 && ch <= 0x0046)
4677 || (ch >= 0x0061 && ch <= 0x0066);
4679 /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
4682 The iswxdigit function tests for any wide character that corresponds
4683 to a hexadecimal-digit character (as defined in 6.4.4.1).
4685 hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
4687 return (ch >= 0x0030 && ch <= 0x0039)
4688 || (ch >= 0x0041 && ch <= 0x0046)
4689 || (ch >= 0x0061 && ch <= 0x0066);
/* Predicate for the "graph" class.  The visible conditions require an
   assigned character whose name differs from "<control>".
   NOTE(review): the expression continues on a line not visible in this
   view, so further conditions may apply.  */
4694 is_graph (unsigned int ch)
4696 return (unicode_attributes[ch].name != NULL
4697 && strcmp (unicode_attributes[ch].name, "<control>")
/* Predicate for the "print" class: assigned characters whose name differs
   from "<control>" and whose general category is neither "Zl" nor "Zp".
   The name != NULL test inside the negated group repeats the outer test and
   is therefore always true when it is reached.  */
4702 is_print (unsigned int ch)
4704 return (unicode_attributes[ch].name != NULL
4705 && strcmp (unicode_attributes[ch].name, "<control>")
4706 /* Categories Zl and Zp */
4707 && !(unicode_attributes[ch].name != NULL
4708 && unicode_attributes[ch].category[0] == 'Z'
4709 && (unicode_attributes[ch].category[1] == 'l'
4710 || unicode_attributes[ch].category[1] == 'p')));
/* Predicate for the "punct" class.  Two alternative definitions are
   visible: assigned characters whose general category starts with 'P', and
   the traditional one (graphic, but neither alpha nor digit).
   NOTE(review): presumably a preprocessor conditional on lines not visible
   here selects one of them -- confirm which is active.  */
4714 is_punct (unsigned int ch)
4717 return (unicode_attributes[ch].name != NULL
4718 && unicode_attributes[ch].category[0] == 'P');
4720 /* The traditional POSIX definition of punctuation is every graphic,
4721 non-alphanumeric character. */
4722 return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
/* Emits the <ctype.h>-like property outputs for the given Unicode VERSION.
   The PROPERTY macro defined here produces three files per property P: a
   debug listing (unictype/ctype_<P>.txt), a test source
   (../tests/unictype/test-ctype_<P>.c) and the table header
   (unictype/ctype_<P>.h), each driven by the predicate is_<P>.
   NOTE(review): the PROPERTY(...) invocations themselves are on lines not
   visible in this view.  */
4726 /* Output all properties. */
4728 output_old_ctype (const char *version)
4730 #define PROPERTY(P) \
4731 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
4732 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
4733 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
/* Predicate for the "combining" class: assigned characters whose general
   category is "Mn", "Mc" or "Me".  */
4752 is_combining (unsigned int ch)
4754 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
4755 file. In 3.0.1 it was identical to the union of the general categories
4756 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
4757 PropList.txt file, so we take the latter definition. */
4758 return (unicode_attributes[ch].name != NULL
4759 && unicode_attributes[ch].category[0] == 'M'
4760 && (unicode_attributes[ch].category[1] == 'n'
4761 || unicode_attributes[ch].category[1] == 'c'
4762 || unicode_attributes[ch].category[1] == 'e'));
/* Predicate for the "combining_level3" class: combining characters (see
   is_combining) except those whose combining class field is non-empty, does
   not start with '0', and parses (base 10) to a value of 200 or more.  */
4766 is_combining_level3 (unsigned int ch)
4768 return is_combining (ch)
4769 && !(unicode_attributes[ch].combining[0] != '\0'
4770 && unicode_attributes[ch].combining[0] != '0'
4771 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Formats code point I as "<Uxxxx>" (four upper-case hex digits) when I is
   below 0x10000, and as "<Uxxxxxxxx>" (eight digits) otherwise.  The text is
   written into a static buffer of 12 bytes, which is exactly enough for the
   longer form plus the terminating NUL.
   NOTE(review): the return statement is not visible in this view;
   presumably the static buffer is returned, in which case each call
   overwrites the result of the previous one -- confirm.  */
4774 /* Return the UCS symbol string for a Unicode character. */
4776 ucs_symbol (unsigned int i)
4778 static char buf[11+1];
4780 sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
/* Builds the textual form of the interval LOW..HIGH in a static buffer of
   25 bytes: the symbol of LOW is copied first and the symbol of HIGH is
   appended last.
   NOTE(review): the statement between the strcpy and the strcat, and the
   return statement, are not visible in this view; presumably a separator is
   appended and the static buffer is returned -- confirm.  */
4784 /* Return the UCS symbol range string for a Unicode characters interval. */
4786 ucs_symbol_range (unsigned int low, unsigned int high)
4788 static char buf[24+1];
4790 strcpy (buf, ucs_symbol (low));
4792 strcat (buf, ucs_symbol (high));
/* Writes one character class line to STREAM.

   STREAM     output stream.
   CLASSNAME  text printed first, followed by a space.
   FUNC       membership predicate; evaluated once for every code point
              0 .. 0x10FFFF and cached in TABLE.

   Members are emitted as single symbols or as ranges (see ucs_symbol and
   ucs_symbol_range), separated by ";".  When an item would push the line
   past MAX_COLUMN, the line is continued with "/" and a newline.
   NOTE(review): TABLE is an array of 0x110000 bytes declared inside the
   function; if it has automatic storage that is over 1 MB of stack --
   confirm this is acceptable on the build hosts.  Several lines of the
   scanning loop are not visible in this view.  */
4796 /* Output a character class (= property) table. */
4799 output_charclass (FILE *stream, const char *classname,
4800 bool (*func) (unsigned int))
4802 char table[0x110000];
4804 bool need_semicolon;
4805 const int max_column = 75;
4808 for (i = 0; i < 0x110000; i++)
4809 table[i] = (int) func (i);
4811 fprintf (stream, "%s ", classname);
4812 need_semicolon = false;
4814 for (i = 0; i < 0x110000; )
4820 unsigned int low, high;
4826 while (i < 0x110000 && table[i]);
4830 strcpy (buf, ucs_symbol (low));
4832 strcpy (buf, ucs_symbol_range (low, high));
4836 fprintf (stream, ";");
4840 if (column + strlen (buf) > max_column)
4842 fprintf (stream, "/\n ");
4846 fprintf (stream, "%s", buf);
4847 column += strlen (buf);
4848 need_semicolon = true;
4851 fprintf (stream, "\n");
/* Writes one character mapping line to STREAM.

   STREAM   output stream.
   MAPNAME  text printed first, followed by a space.
   FUNC     mapping function; a code point is listed only when FUNC maps it
            to something other than itself.

   Each listed entry contains the symbol of the code point and the symbol of
   its image (see ucs_symbol); entries are separated by ";".  When an entry
   would push the line past MAX_COLUMN, the line is continued with "/" and a
   newline.
   NOTE(review): TABLE is an array of 0x110000 bytes declared inside the
   function; if it has automatic storage that is over 1 MB of stack.
   The lines that initialise BUF and add punctuation around the two symbols
   are not visible in this view.  */
4854 /* Output a character mapping table. */
4857 output_charmap (FILE *stream, const char *mapname,
4858 unsigned int (*func) (unsigned int))
4860 char table[0x110000];
4862 bool need_semicolon;
4863 const int max_column = 75;
4866 for (i = 0; i < 0x110000; i++)
4867 table[i] = (func (i) != i);
4869 fprintf (stream, "%s ", mapname);
4870 need_semicolon = false;
4872 for (i = 0; i < 0x110000; i++)
4878 strcat (buf, ucs_symbol (i));
4880 strcat (buf, ucs_symbol (func (i)));
4885 fprintf (stream, ";");
4889 if (column + strlen (buf) > max_column)
4891 fprintf (stream, "/\n ");
4895 fprintf (stream, "%s", buf);
4896 column += strlen (buf);
4897 need_semicolon = true;
4899 fprintf (stream, "\n");
/* Called by output_tables after the character maps, before the
   "END LC_CTYPE" line.
   NOTE(review): no statements of this function are visible in this view;
   it may be an empty placeholder -- confirm.  */
4902 /* Output the width table. */
4905 output_widthmap (FILE *stream)
/* Writes a locale definition source (LC_IDENTIFICATION and LC_CTYPE
   sections) to FILENAME for the given Unicode VERSION.

   Steps, in order:
     1. open FILENAME for writing; a failure is reported on stderr;
     2. write the escape_char/comment_char preamble and the
        LC_IDENTIFICATION section, including the current UTC date;
     3. for every code point 0 .. 0x10FFFF, check that the is_* predicates
        and to_* mappings are consistent with the quoted restrictions, and
        report each violation on stderr (diagnostics only; the output is
        still written);
     4. write the LC_CTYPE section: one line per class via output_charclass,
        one per mapping via output_charmap, then output_widthmap;
     5. report write or close errors on stderr.
   NOTE(review): what happens after the two error messages (for example an
   exit call) is on lines not visible in this view.  */
4909 /* Output the tables to the given file. */
4912 output_tables (const char *filename, const char *version)
4917 stream = fopen (filename, "w");
4920 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4924 fprintf (stream, "escape_char /\n");
4925 fprintf (stream, "comment_char %%\n");
4926 fprintf (stream, "\n");
4927 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
4929 fprintf (stream, "\n");
4931 fprintf (stream, "LC_IDENTIFICATION\n");
4932 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
4933 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
4934 fprintf (stream, "address \"\"\n");
4935 fprintf (stream, "contact \"\"\n");
4936 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
4937 fprintf (stream, "tel \"\"\n");
4938 fprintf (stream, "fax \"\"\n");
4939 fprintf (stream, "language \"\"\n");
4940 fprintf (stream, "territory \"Earth\"\n");
4941 fprintf (stream, "revision \"%s\"\n", version);
4946 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
4947 fprintf (stream, "date \"%s\"\n", date);
4949 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
4950 fprintf (stream, "END LC_IDENTIFICATION\n");
4951 fprintf (stream, "\n");
4953 /* Verifications. */
4954 for (ch = 0; ch < 0x110000; ch++)
4956 /* toupper restriction: "Only characters specified for the keywords
4957 lower and upper shall be specified." */
4958 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4960 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
4961 ucs_symbol (ch), ch, to_upper (ch));
4963 /* tolower restriction: "Only characters specified for the keywords
4964 lower and upper shall be specified." */
4965 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
4967 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
4968 ucs_symbol (ch), ch, to_lower (ch));
4970 /* alpha restriction: "Characters classified as either upper or lower
4971 shall automatically belong to this class." */
4972 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
4973 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
4975 /* alpha restriction: "No character specified for the keywords cntrl,
4976 digit, punct or space shall be specified." */
4977 if (is_alpha (ch) && is_cntrl (ch))
4978 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
4979 if (is_alpha (ch) && is_digit (ch))
4980 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
4981 if (is_alpha (ch) && is_punct (ch))
4982 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
4983 if (is_alpha (ch) && is_space (ch))
4984 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
4986 /* space restriction: "No character specified for the keywords upper,
4987 lower, alpha, digit, graph or xdigit shall be specified."
4988 upper, lower, alpha already checked above. */
4989 if (is_space (ch) && is_digit (ch))
4990 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
4991 if (is_space (ch) && is_graph (ch))
4992 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
4993 if (is_space (ch) && is_xdigit (ch))
4994 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
4996 /* cntrl restriction: "No character specified for the keywords upper,
4997 lower, alpha, digit, punct, graph, print or xdigit shall be
4998 specified." upper, lower, alpha already checked above. */
4999 if (is_cntrl (ch) && is_digit (ch))
5000 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
5001 if (is_cntrl (ch) && is_punct (ch))
5002 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
5003 if (is_cntrl (ch) && is_graph (ch))
5004 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
5005 if (is_cntrl (ch) && is_print (ch))
5006 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
5007 if (is_cntrl (ch) && is_xdigit (ch))
5008 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
5010 /* punct restriction: "No character specified for the keywords upper,
5011 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
5012 be specified." upper, lower, alpha, cntrl already checked above. */
5013 if (is_punct (ch) && is_digit (ch))
5014 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
5015 if (is_punct (ch) && is_xdigit (ch))
5016 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
5017 if (is_punct (ch) && (ch == 0x0020))
5018 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
5020 /* graph restriction: "No character specified for the keyword cntrl
5021 shall be specified." Already checked above. */
5023 /* print restriction: "No character specified for the keyword cntrl
5024 shall be specified." Already checked above. */
5026 /* graph - print relation: differ only in the <space> character.
5027 How is this possible if there are more than one space character?!
5028 I think susv2/xbd/locale.html should speak of "space characters",
5029 not "space character". */
5030 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
5032 "%s is print but not graph|<space>\n", ucs_symbol (ch));
5033 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
5035 "%s is graph|<space> but not print\n", ucs_symbol (ch));
5038 fprintf (stream, "LC_CTYPE\n");
5039 output_charclass (stream, "upper", is_upper);
5040 output_charclass (stream, "lower", is_lower);
5041 output_charclass (stream, "alpha", is_alpha);
5042 output_charclass (stream, "digit", is_digit);
5043 output_charclass (stream, "outdigit", is_outdigit);
5044 output_charclass (stream, "blank", is_blank);
5045 output_charclass (stream, "space", is_space);
5046 output_charclass (stream, "cntrl", is_cntrl);
5047 output_charclass (stream, "punct", is_punct);
5048 output_charclass (stream, "xdigit", is_xdigit);
5049 output_charclass (stream, "graph", is_graph);
5050 output_charclass (stream, "print", is_print);
5051 output_charclass (stream, "class \"combining\";", is_combining);
5052 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
5053 output_charmap (stream, "toupper", to_upper);
5054 output_charmap (stream, "tolower", to_lower);
5055 output_charmap (stream, "map \"totitle\";", to_title);
5056 output_widthmap (stream);
5057 fprintf (stream, "END LC_CTYPE\n");
5059 if (ferror (stream) || fclose (stream))
5061 fprintf (stderr, "error writing to '%s'\n", filename);
5068 /* ========================================================================= */
/* Table indexed by code point (0 .. 0x10FFFF).  fill_width first sets each
   entry to "N" for assigned characters and to NULL otherwise, then
   overwrites entries with strdup'ed copies of the values read from the
   width file.  */
5070 /* The width property from the EastAsianWidth.txt file.
5071 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
5072 const char * unicode_width[0x110000];
/* Fills unicode_width[] from the file WIDTH_FILENAME.

   Every entry is first initialised to "N" when the character is assigned
   (unicode_attributes[i].name != NULL) and to NULL otherwise.  The file is
   then read line by line as three fields: FIELD0 up to ';', FIELD1 up to a
   space, FIELD2 up to the end of line.  FIELD0 is parsed as a hexadecimal
   code point, or as a range when it contains ".."; FIELD1 is the width
   value, stored as a strdup'ed copy.  Open errors, short lines and read
   errors are reported on stderr.
   NOTE(review): the loop header around the range assignment, the handling
   of comment lines, and what follows the error messages are on lines not
   visible in this view.  The strtoul results are used as array indices as
   shown, without a visible bounds check.  */
5074 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
5077 fill_width (const char *width_filename)
5081 char field0[FIELDLEN];
5082 char field1[FIELDLEN];
5083 char field2[FIELDLEN];
5086 for (i = 0; i < 0x110000; i++)
5087 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
5089 stream = fopen (width_filename, "r");
5092 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
5107 do c = getc (stream); while (c != EOF && c != '\n');
5111 n = getfield (stream, field0, ';');
5112 n += getfield (stream, field1, ' ');
5113 n += getfield (stream, field2, '\n');
5118 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
5121 i = strtoul (field0, NULL, 16);
5122 if (strstr (field0, "..") != NULL)
5124 /* Deal with a range. */
5125 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
5127 unicode_width[i] = strdup (field1);
5131 /* Single character line. */
5132 unicode_width[i] = strdup (field1);
5135 if (ferror (stream) || fclose (stream))
5137 fprintf (stderr, "error reading from '%s'\n", width_filename);
/* Line breaking classes.  The values are explicit and are not in source
   order: classes 0 .. 23 come first numerically and, per the comment below,
   classes 24 .. 31 are the ones resolved at run time.  get_lbp uses these
   values as bit positions (attr |= 1 << LBP_xx), so all of them must stay
   below the bit width of the mask.
   NOTE(review): LBP_XX is 31; shifting the int constant 1 left by 31
   overflows a 32-bit signed int -- confirm the operand type used in
   get_lbp.  */
5142 /* Line breaking classification. */
5146 /* Values >= 24 are resolved at run time. */
5147 LBP_BK = 24, /* mandatory break */
5148 /*LBP_CR, carriage return - not used here because it's a DOSism */
5149 /*LBP_LF, line feed - not used here because it's a DOSism */
5150 LBP_CM = 25, /* attached characters and combining marks */
5151 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
5152 /*LBP_SG, surrogates - not used here because they are not characters */
5153 LBP_WJ = 0, /* word joiner */
5154 LBP_ZW = 26, /* zero width space */
5155 LBP_GL = 1, /* non-breaking (glue) */
5156 LBP_SP = 27, /* space */
5157 LBP_B2 = 2, /* break opportunity before and after */
5158 LBP_BA = 3, /* break opportunity after */
5159 LBP_BB = 4, /* break opportunity before */
5160 LBP_HY = 5, /* hyphen */
5161 LBP_CB = 28, /* contingent break opportunity */
5162 LBP_CL = 6, /* closing punctuation */
5163 LBP_EX = 7, /* exclamation/interrogation */
5164 LBP_IN = 8, /* inseparable */
5165 LBP_NS = 9, /* non starter */
5166 LBP_OP = 10, /* opening punctuation */
5167 LBP_QU = 11, /* ambiguous quotation */
5168 LBP_IS = 12, /* infix separator (numeric) */
5169 LBP_NU = 13, /* numeric */
5170 LBP_PO = 14, /* postfix (numeric) */
5171 LBP_PR = 15, /* prefix (numeric) */
5172 LBP_SY = 16, /* symbols allowing breaks */
5173 LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
5174 LBP_AL = 17, /* ordinary alphabetic and symbol characters */
5175 LBP_H2 = 18, /* Hangul LV syllable */
5176 LBP_H3 = 19, /* Hangul LVT syllable */
5177 LBP_ID = 20, /* ideographic */
5178 LBP_JL = 21, /* Hangul L Jamo */
5179 LBP_JV = 22, /* Hangul V Jamo */
5180 LBP_JT = 23, /* Hangul T Jamo */
5181 LBP_SA = 30, /* complex context (South East Asian) */
5182 LBP_XX = 31 /* unknown */
5185 /* Returns the line breaking classification for ch, as a bit mask. */
/* Parameter: ch is a code point used as an index into unicode_attributes[]
   and unicode_width[].
   Result: attr, in which bit (1 << LBP_xx) is set for every class whose rule
   below matches.  The rules consult explicit code point lists, the general
   category and name from unicode_attributes[ch], and the first character of
   unicode_width[ch].  Several rules are conditional on bits set by earlier
   rules, so their order matters.  output_lbp expects the final mask to
   contain exactly one bit.
   NOTE(review): the embedded line numbers skip throughout this function, so
   the return type, the declaration of attr, braces, else keywords and some
   short comments are not visible in this listing; notes below mark the
   places where that affects how the code reads. */
5187 get_lbp (unsigned int ch)
/* Only code points that have a name are classified by the rules; the later
   rules dereference .category and .name, so this test presumably encloses
   all of them (braces not visible). */
5191 if (unicode_attributes[ch].name != NULL)
5193 /* mandatory break */
5194 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
5195 || ch == 0x000C /* form feed */
5196 || ch == 0x000B /* line tabulation */
5197 || ch == 0x2028 /* LINE SEPARATOR */
5198 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
5199 attr |= 1 << LBP_BK;
/* word joiner */
5201 if (ch == 0x2060 /* WORD JOINER */
5202 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
5203 attr |= 1 << LBP_WJ;
5205 /* zero width space */
5206 if (ch == 0x200B /* ZERO WIDTH SPACE */)
5207 attr |= 1 << LBP_ZW;
5209 /* non-breaking (glue) */
5210 if (ch == 0x00A0 /* NO-BREAK SPACE */
5211 || ch == 0x202F /* NARROW NO-BREAK SPACE */
5212 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
5213 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
5214 || ch == 0x2007 /* FIGURE SPACE */
5215 || ch == 0x2011 /* NON-BREAKING HYPHEN */
5216 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
5217 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
5218 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
5219 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
5220 attr |= 1 << LBP_GL;
/* space */
5223 if (ch == 0x0020 /* SPACE */)
5224 attr |= 1 << LBP_SP;
5226 /* break opportunity before and after */
5227 if (ch == 0x2014 /* EM DASH */)
5228 attr |= 1 << LBP_B2;
5230 /* break opportunity after */
5231 if (ch == 0x1680 /* OGHAM SPACE MARK */
5232 || ch == 0x2000 /* EN QUAD */
5233 || ch == 0x2001 /* EM QUAD */
5234 || ch == 0x2002 /* EN SPACE */
5235 || ch == 0x2003 /* EM SPACE */
5236 || ch == 0x2004 /* THREE-PER-EM SPACE */
5237 || ch == 0x2005 /* FOUR-PER-EM SPACE */
5238 || ch == 0x2006 /* SIX-PER-EM SPACE */
5239 || ch == 0x2008 /* PUNCTUATION SPACE */
5240 || ch == 0x2009 /* THIN SPACE */
5241 || ch == 0x200A /* HAIR SPACE */
5242 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
5243 || ch == 0x0009 /* tab */
5244 || ch == 0x00AD /* SOFT HYPHEN */
5245 || ch == 0x058A /* ARMENIAN HYPHEN */
5246 || ch == 0x2010 /* HYPHEN */
5247 || ch == 0x2012 /* FIGURE DASH */
5248 || ch == 0x2013 /* EN DASH */
5249 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
5250 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
5251 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
5252 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
5253 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
5254 || ch == 0x2027 /* HYPHENATION POINT */
5255 || ch == 0x007C /* VERTICAL LINE */
5256 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
5257 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
5258 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
5259 || ch == 0x2056 /* THREE DOT PUNCTUATION */
5260 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
5261 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
5262 || ch == 0x205A /* TWO DOT PUNCTUATION */
5263 || ch == 0x205B /* FOUR DOT MARK */
5264 || ch == 0x205D /* TRICOLON */
5265 || ch == 0x205E /* VERTICAL FOUR DOTS */
5266 || ch == 0x2E19 /* PALM BRANCH */
5267 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
5268 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
5269 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
5270 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
5271 || ch == 0x2E30 /* RING POINT */
5272 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
5273 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
5274 || ch == 0x10102 /* AEGEAN CHECK MARK */
5275 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
5276 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
5277 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
5278 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
5279 || ch == 0x0964 /* DEVANAGARI DANDA */
5280 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
5281 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
5282 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
5283 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
5284 || ch == 0x104B /* MYANMAR SIGN SECTION */
5285 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
5286 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
5287 || ch == 0x17D4 /* KHMER SIGN KHAN */
5288 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
5289 || ch == 0x1B5E /* BALINESE CARIK SIKI */
5290 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
5291 || ch == 0xA8CE /* SAURASHTRA DANDA */
5292 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
5293 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
5294 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
5295 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
5296 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
5297 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
5298 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
5299 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
5300 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
5301 || ch == 0x0FBE /* TIBETAN KU RU KHA */
5302 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
5303 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
5304 || ch == 0x1804 /* MONGOLIAN COLON */
5305 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
5306 || ch == 0x1B5A /* BALINESE PANTI */
5307 || ch == 0x1B5B /* BALINESE PAMADA */
5308 || ch == 0x1B5C /* BALINESE WINDU */
5309 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
5310 || ch == 0x1B60 /* BALINESE PAMENENG */
5311 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
5312 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
5313 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
5314 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
5315 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
5316 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
5317 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
5318 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
5319 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
5320 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
5321 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
5322 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
5323 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
5324 || ch == 0xA60D /* VAI COMMA */
5325 || ch == 0xA60F /* VAI QUESTION MARK */
5326 || ch == 0xA92E /* KAYAH LI SIGN CWI */
5327 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
5328 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
5329 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
5330 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
5331 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
5332 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
5333 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
5334 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5335 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
5336 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
5337 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
5338 attr |= 1 << LBP_BA;
5340 /* break opportunity before */
5341 if (ch == 0x00B4 /* ACUTE ACCENT */
5342 || ch == 0x1FFD /* GREEK OXIA */
5343 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
5344 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
5345 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
5346 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
5347 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
5348 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
5349 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
5350 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
5351 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
5352 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
5353 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
5354 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
5355 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
5356 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
5357 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
5358 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
5359 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
5360 attr |= 1 << LBP_BB;
/* hyphen */
5363 if (ch == 0x002D /* HYPHEN-MINUS */)
5364 attr |= 1 << LBP_HY;
5366 /* contingent break opportunity */
5367 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
5368 attr |= 1 << LBP_CB;
5370 /* closing punctuation */
5371 if ((unicode_attributes[ch].category[0] == 'P'
5372 && unicode_attributes[ch].category[1] == 'e')
5373 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
5374 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
5375 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
5376 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
5377 || ch == 0xFE50 /* SMALL COMMA */
5378 || ch == 0xFE52 /* SMALL FULL STOP */
5379 || ch == 0xFF0C /* FULLWIDTH COMMA */
5380 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
5381 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
5382 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
5383 attr |= 1 << LBP_CL;
5385 /* exclamation/interrogation */
5386 if (ch == 0x0021 /* EXCLAMATION MARK */
5387 || ch == 0x003F /* QUESTION MARK */
5388 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
5389 || ch == 0x061B /* ARABIC SEMICOLON */
5390 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
5391 || ch == 0x061F /* ARABIC QUESTION MARK */
5392 || ch == 0x06D4 /* ARABIC FULL STOP */
5393 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
5394 || ch == 0x0F0D /* TIBETAN MARK SHAD */
5395 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
5396 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
5397 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
5398 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
5399 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
5400 || ch == 0x1802 /* MONGOLIAN COMMA */
5401 || ch == 0x1803 /* MONGOLIAN FULL STOP */
5402 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
5403 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
5404 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
5405 || ch == 0x1945 /* LIMBU QUESTION MARK */
5406 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
5407 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
5408 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
5409 || ch == 0x2CFE /* COPTIC FULL STOP */
5410 || ch == 0x2E2E /* REVERSED QUESTION MARK */
/* NOTE(review): the embedded numbers skip 5410 -> 5412 -> 5414, so one line
   on each side of the next entry is not visible in this listing (possibly a
   conditional-compilation guard) -- confirm. */
5412 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
5414 || ch == 0xA60E /* VAI FULL STOP */
5415 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
5416 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
5417 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
5418 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
5419 || ch == 0xFE56 /* SMALL QUESTION MARK */
5420 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
5421 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
5422 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
5423 attr |= 1 << LBP_EX;
/* inseparable */
5426 if (ch == 0x2024 /* ONE DOT LEADER */
5427 || ch == 0x2025 /* TWO DOT LEADER */
5428 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
5429 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
5430 attr |= 1 << LBP_IN;
/* non starter */
5433 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
5434 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
5435 || ch == 0x203D /* INTERROBANG */
5436 || ch == 0x2047 /* DOUBLE QUESTION MARK */
5437 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
5438 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
5439 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
5440 || ch == 0x301C /* WAVE DASH */
5441 || ch == 0x303C /* MASU MARK */
5442 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
5443 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
5444 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
5445 || ch == 0x309D /* HIRAGANA ITERATION MARK */
5446 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
5447 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
5448 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
5449 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5450 || ch == 0x30FD /* KATAKANA ITERATION MARK */
5451 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
5452 || ch == 0xA015 /* YI SYLLABLE WU */
5453 || ch == 0xFE54 /* SMALL SEMICOLON */
5454 || ch == 0xFE55 /* SMALL COLON */
5455 || ch == 0xFF1A /* FULLWIDTH COLON */
5456 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
5457 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
5458 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
5459 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
5460 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
5461 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
5462 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
5463 attr |= 1 << LBP_NS;
5465 /* opening punctuation */
5466 if ((unicode_attributes[ch].category[0] == 'P'
5467 && unicode_attributes[ch].category[1] == 's')
5468 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5469 || ch == 0x00BF /* INVERTED QUESTION MARK */
5470 || ch == 0x2E18 /* INVERTED INTERROBANG */)
5471 attr |= 1 << LBP_OP;
5473 /* ambiguous quotation */
5474 if ((unicode_attributes[ch].category[0] == 'P'
5475 && (unicode_attributes[ch].category[1] == 'f'
5476 || unicode_attributes[ch].category[1] == 'i'))
5477 || ch == 0x0022 /* QUOTATION MARK */
5478 || ch == 0x0027 /* APOSTROPHE */
5479 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
5480 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
5481 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
5482 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
5483 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
5484 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
5485 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
5486 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
5487 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
5488 || ch == 0x2E0B /* RAISED SQUARE */)
5489 attr |= 1 << LBP_QU;
5491 /* infix separator (numeric) */
5492 if (ch == 0x002C /* COMMA */
5493 || ch == 0x002E /* FULL STOP */
5494 || ch == 0x003A /* COLON */
5495 || ch == 0x003B /* SEMICOLON */
5496 || ch == 0x037E /* GREEK QUESTION MARK */
5497 || ch == 0x0589 /* ARMENIAN FULL STOP */
5498 || ch == 0x060C /* ARABIC COMMA */
5499 || ch == 0x060D /* ARABIC DATE SEPARATOR */
5500 || ch == 0x07F8 /* NKO COMMA */
5501 || ch == 0x2044 /* FRACTION SLASH */
5502 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
5503 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
5504 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
5505 attr |= 1 << LBP_IS;
/* numeric: decimal digits (category Nd) whose name does not contain
   "FULLWIDTH", plus two Arabic separators */
5508 if ((unicode_attributes[ch].category[0] == 'N'
5509 && unicode_attributes[ch].category[1] == 'd'
5510 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
5511 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
5512 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
5513 attr |= 1 << LBP_NU;
5515 /* postfix (numeric) */
5516 if (ch == 0x0025 /* PERCENT SIGN */
5517 || ch == 0x00A2 /* CENT SIGN */
5518 || ch == 0x00B0 /* DEGREE SIGN */
5519 || ch == 0x060B /* AFGHANI SIGN */
5520 || ch == 0x066A /* ARABIC PERCENT SIGN */
5521 || ch == 0x2030 /* PER MILLE SIGN */
5522 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
5523 || ch == 0x2032 /* PRIME */
5524 || ch == 0x2033 /* DOUBLE PRIME */
5525 || ch == 0x2034 /* TRIPLE PRIME */
5526 || ch == 0x2035 /* REVERSED PRIME */
5527 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
5528 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
5529 || ch == 0x20A7 /* PESETA SIGN */
5530 || ch == 0x2103 /* DEGREE CELSIUS */
5531 || ch == 0x2109 /* DEGREE FAHRENHEIT */
5532 || ch == 0xFDFC /* RIAL SIGN */
5533 || ch == 0xFE6A /* SMALL PERCENT SIGN */
5534 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
5535 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
5536 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5537 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
5538 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
5539 || ch == 0x0D79 /* MALAYALAM DATE MARK */)
5540 attr |= 1 << LBP_PO;
5542 /* prefix (numeric) */
5543 if ((unicode_attributes[ch].category[0] == 'S'
5544 && unicode_attributes[ch].category[1] == 'c')
5545 || ch == 0x002B /* PLUS SIGN */
5546 || ch == 0x005C /* REVERSE SOLIDUS */
5547 || ch == 0x00B1 /* PLUS-MINUS SIGN */
5548 || ch == 0x2116 /* NUMERO SIGN */
5549 || ch == 0x2212 /* MINUS SIGN */
5550 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
/* PO takes precedence: PR is set only when the postfix rule above did not
   already match. */
5551 if (!(attr & (1 << LBP_PO)))
5552 attr |= 1 << LBP_PR;
5554 /* symbols allowing breaks */
5555 if (ch == 0x002F /* SOLIDUS */)
5556 attr |= 1 << LBP_SY;
/* Hangul LV syllable: offset from 0xAC00 is a multiple of 28 */
5558 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
5559 attr |= 1 << LBP_H2;
/* Hangul LVT syllable: every other code point in 0xAC00 .. 0xD7A3 */
5561 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
5562 attr |= 1 << LBP_H3;
/* Hangul L Jamo */
5564 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
5565 attr |= 1 << LBP_JL;
/* Hangul V Jamo */
5567 if (ch >= 0x1160 && ch <= 0x11A2)
5568 attr |= 1 << LBP_JV;
/* Hangul T Jamo */
5570 if (ch >= 0x11A8 && ch <= 0x11F9)
5571 attr |= 1 << LBP_JT;
5573 /* complex context (South East Asian) */
/* Requires both a matching category (Cf, Lm, Lo, Mc, Mn) or one of the four
   listed extras, and membership in one of the four code point ranges at the
   end of the condition. */
5574 if (((unicode_attributes[ch].category[0] == 'C'
5575 && unicode_attributes[ch].category[1] == 'f')
5576 || (unicode_attributes[ch].category[0] == 'L'
5577 && (unicode_attributes[ch].category[1] == 'm'
5578 || unicode_attributes[ch].category[1] == 'o'))
5579 || (unicode_attributes[ch].category[0] == 'M'
5580 && (unicode_attributes[ch].category[1] == 'c'
5581 || unicode_attributes[ch].category[1] == 'n'))
5582 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5583 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
5584 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
5585 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
5586 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
5587 && ((ch >= 0x0E00 && ch <= 0x0EFF)
5588 || (ch >= 0x1000 && ch <= 0x109F)
5589 || (ch >= 0x1780 && ch <= 0x17FF)
5590 || (ch >= 0x1950 && ch <= 0x19DF)))
5591 attr |= 1 << LBP_SA;
5593 /* attached characters and combining marks */
5594 if ((unicode_attributes[ch].category[0] == 'M'
5595 && (unicode_attributes[ch].category[1] == 'c'
5596 || unicode_attributes[ch].category[1] == 'e'
5597 || unicode_attributes[ch].category[1] == 'n'))
5598 || (unicode_attributes[ch].category[0] == 'C'
5599 && (unicode_attributes[ch].category[1] == 'c'
5600 || unicode_attributes[ch].category[1] == 'f')))
/* CM is suppressed when BK, BA, GL, SA, WJ or ZW was already assigned. */
5601 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
5602 attr |= 1 << LBP_CM;
/* ideographic */
5605 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
5606 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
5607 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
5608 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
5609 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
5610 || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */
5611 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
5612 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
5613 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
5614 || ch == 0xFE62 /* SMALL PLUS SIGN */
5615 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
5616 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
5617 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
5618 || ch == 0xFE66 /* SMALL EQUALS SIGN */
5619 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
5620 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
5621 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
5622 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
5623 || (ch >= 0x3000 && ch <= 0x33FF
5624 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
5625 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5626 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
5627 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
5628 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
5629 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
5630 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
5631 || ch == 0xFE45 /* SESAME DOT */
5632 || ch == 0xFE46 /* WHITE SESAME DOT */
5633 || ch == 0xFE49 /* DASHED OVERLINE */
5634 || ch == 0xFE4A /* CENTRELINE OVERLINE */
5635 || ch == 0xFE4B /* WAVY OVERLINE */
5636 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
5637 || ch == 0xFE4D /* DASHED LOW LINE */
5638 || ch == 0xFE4E /* CENTRELINE LOW LINE */
5639 || ch == 0xFE4F /* WAVY LOW LINE */
5640 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
5641 || ch == 0xFE58 /* SMALL EM DASH */
5642 || ch == 0xFE5F /* SMALL NUMBER SIGN */
5643 || ch == 0xFE60 /* SMALL AMPERSAND */
5644 || ch == 0xFE61 /* SMALL ASTERISK */
5645 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
5646 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
5647 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
5648 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
5649 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
5650 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
5651 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
5652 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
5653 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
5654 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
5655 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
5656 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
5657 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
5658 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
5659 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
5660 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
5661 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
5662 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
5663 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
5664 || ch == 0xFF5E /* FULLWIDTH TILDE */
5665 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
5666 || ch == 0xFFE3 /* FULLWIDTH MACRON */
5667 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
/* NS and CM take precedence over the ideographic classes. */
5668 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
5670 /* ambiguous (ideograph) ? */
/* NOTE(review): the embedded numbers skip 5672 -> 5674 and 5676 -> 5678.  As
   listed, the AI and ID assignments look unconditional relative to each
   other; presumably an else separates them so that exactly one is set --
   confirm against the complete file. */
5671 if ((unicode_width[ch] != NULL
5672 && unicode_width[ch][0] == 'A'
5674 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5675 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
5676 attr |= 1 << LBP_AI;
5678 attr |= 1 << LBP_ID;
5681 /* ordinary alphabetic and symbol characters */
5682 if ((unicode_attributes[ch].category[0] == 'L'
5683 && (unicode_attributes[ch].category[1] == 'u'
5684 || unicode_attributes[ch].category[1] == 'l'
5685 || unicode_attributes[ch].category[1] == 't'
5686 || unicode_attributes[ch].category[1] == 'm'
5687 || unicode_attributes[ch].category[1] == 'o'))
5688 || (unicode_attributes[ch].category[0] == 'S'
5689 && (unicode_attributes[ch].category[1] == 'm'
5690 || unicode_attributes[ch].category[1] == 'k'
5691 || unicode_attributes[ch].category[1] == 'o'))
5692 || (unicode_attributes[ch].category[0] == 'N'
5693 && (unicode_attributes[ch].category[1] == 'l'
5694 || unicode_attributes[ch].category[1] == 'o'))
5695 || (unicode_attributes[ch].category[0] == 'P'
5696 && (unicode_attributes[ch].category[1] == 'c'
5697 || unicode_attributes[ch].category[1] == 'd'
5698 || unicode_attributes[ch].category[1] == 'o'))
5699 || ch == 0x0600 /* ARABIC NUMBER SIGN */
5700 || ch == 0x0601 /* ARABIC SIGN SANAH */
5701 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
5702 || ch == 0x0603 /* ARABIC SIGN SAFHA */
5703 || ch == 0x06DD /* ARABIC END OF AYAH */
5704 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
5705 || ch == 0x2061 /* FUNCTION APPLICATION */
5706 || ch == 0x2062 /* INVISIBLE TIMES */
5707 || ch == 0x2063 /* INVISIBLE SEPARATOR */
5708 || ch == 0x2064 /* INVISIBLE PLUS */)
/* Fallback class: applies only when none of the more specific classes listed
   in this mask has been assigned by the rules above. */
5709 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
5711 /* ambiguous (alphabetic) ? */
/* NOTE(review): the embedded numbers skip 5713 -> 5715, 5726 -> 5728,
   5755 -> 5757 and 5764 -> 5766.  As listed, the AI and AL assignments at the
   end look unconditional relative to each other; presumably an else
   separates them -- confirm against the complete file. */
5712 if ((unicode_width[ch] != NULL
5713 && unicode_width[ch][0] == 'A'
5715 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
5716 && ch != 0x2022 /* BULLET */
5717 && ch != 0x203E /* OVERLINE */
5718 && ch != 0x2126 /* OHM SIGN */
5719 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
5720 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
5721 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
5722 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
5723 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
5724 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
5725 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
5726 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
5728 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
5729 || ch == 0x00A7 /* SECTION SIGN */
5730 || ch == 0x00A8 /* DIAERESIS */
5731 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
5732 || ch == 0x00B2 /* SUPERSCRIPT TWO */
5733 || ch == 0x00B3 /* SUPERSCRIPT THREE */
5734 || ch == 0x00B6 /* PILCROW SIGN */
5735 || ch == 0x00B7 /* MIDDLE DOT */
5736 || ch == 0x00B8 /* CEDILLA */
5737 || ch == 0x00B9 /* SUPERSCRIPT ONE */
5738 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
5739 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
5740 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
5741 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
5742 || ch == 0x00BF /* INVERTED QUESTION MARK */
5743 || ch == 0x00D7 /* MULTIPLICATION SIGN */
5744 || ch == 0x00F7 /* DIVISION SIGN */
5745 || ch == 0x02C7 /* CARON */
5746 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
5747 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
5748 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
5749 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
5750 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
5751 || ch == 0x02D8 /* BREVE */
5752 || ch == 0x02D9 /* DOT ABOVE */
5753 || ch == 0x02DA /* RING ABOVE */
5754 || ch == 0x02DB /* OGONEK */
5755 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
5757 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
5758 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
5759 /* Extra characters for compatibility with Unicode LineBreak.txt. */
5760 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
5761 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
5762 || ch == 0x2616 /* WHITE SHOGI PIECE */
5763 || ch == 0x2617 /* BLACK SHOGI PIECE */)
5764 attr |= 1 << LBP_AI;
5766 attr |= 1 << LBP_AL;
/* A character classified as alphabetic/ambiguous here loses a CM bit set
   earlier. */
5767 attr &= ~(1 << LBP_CM);
/* NOTE(review): the embedded numbers skip 5767 -> 5773, so the condition
   guarding this assignment is not visible; LBP_XX means "unknown", so this
   is presumably the fallback when no other bit was set -- confirm.
   Also, LBP_XX is 31 and the literal 1 has type int, so with a 32-bit int
   this shift reaches the sign bit, which ISO C leaves undefined for signed
   operands; an unsigned mask type would avoid that. */
5773 attr |= 1 << LBP_XX;
5778 /* Output the line breaking properties in a human readable format. */
/* Writes to stream one text line for each code point i in 0 .. 0x10FFFF whose
   get_lbp mask differs from (1 << LBP_XX): "0x" followed by the code point
   in at least four uppercase hex digits, then " LBP_xx" for every bit set in
   the mask, then a newline.  Because every bit is tested, a code point that
   matched several classes shows all of them. */
5780 debug_output_lbp (FILE *stream)
5784 for (i = 0; i < 0x110000; i++)
5786 int attr = get_lbp (i);
/* Skip code points classified only as "unknown". */
5787 if (attr != 1 << LBP_XX)
5789 fprintf (stream, "0x%04X", i);
/* PRINT_BIT prints " <enumerator name>" when bit number <bit> is set in
   attr; #bit stringifies the enumerator name. */
5790 #define PRINT_BIT(attr,bit) \
5791 if (attr & (1 << bit)) fprintf (stream, " " #bit);
5792 PRINT_BIT(attr,LBP_BK);
5793 PRINT_BIT(attr,LBP_CM);
5794 PRINT_BIT(attr,LBP_WJ);
5795 PRINT_BIT(attr,LBP_ZW);
5796 PRINT_BIT(attr,LBP_GL);
5797 PRINT_BIT(attr,LBP_SP);
5798 PRINT_BIT(attr,LBP_B2);
5799 PRINT_BIT(attr,LBP_BA);
5800 PRINT_BIT(attr,LBP_BB);
5801 PRINT_BIT(attr,LBP_HY);
5802 PRINT_BIT(attr,LBP_CB);
5803 PRINT_BIT(attr,LBP_CL);
5804 PRINT_BIT(attr,LBP_EX);
5805 PRINT_BIT(attr,LBP_IN);
5806 PRINT_BIT(attr,LBP_NS);
5807 PRINT_BIT(attr,LBP_OP);
5808 PRINT_BIT(attr,LBP_QU);
5809 PRINT_BIT(attr,LBP_IS);
5810 PRINT_BIT(attr,LBP_NU);
5811 PRINT_BIT(attr,LBP_PO);
5812 PRINT_BIT(attr,LBP_PR);
5813 PRINT_BIT(attr,LBP_SY);
5814 PRINT_BIT(attr,LBP_AI);
5815 PRINT_BIT(attr,LBP_AL);
5816 PRINT_BIT(attr,LBP_H2);
5817 PRINT_BIT(attr,LBP_H3);
5818 PRINT_BIT(attr,LBP_ID);
5819 PRINT_BIT(attr,LBP_JL);
5820 PRINT_BIT(attr,LBP_JV);
5821 PRINT_BIT(attr,LBP_JT);
5822 PRINT_BIT(attr,LBP_SA);
5823 PRINT_BIT(attr,LBP_XX);
/* NOTE(review): the embedded numbers skip 5823 -> 5825; presumably the macro
   is undefined again on the line not shown, since a later function defines
   PRINT_BIT differently -- confirm. */
5825 fprintf (stream, "\n");
/* debug_output_lbrk_tables (filename): opens filename for writing, dumps the
   computed line breaking properties into it via debug_output_lbp, and
   reports a failure to open or to write/close on stderr.
   NOTE(review): what follows each error message is not visible in this
   listing (the embedded numbers skip after 5838 and 5846). */
5831 debug_output_lbrk_tables (const char *filename)
5835 stream = fopen (filename, "w");
5838 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5842 debug_output_lbp (stream);
/* ferror reports an earlier write error; fclose reports a failure while
   flushing and closing. */
5844 if (ferror (stream) || fclose (stream))
5846 fprintf (stderr, "error writing to '%s'\n", filename);
5851 /* The line breaking property from the LineBreak.txt file. */
/* One entry per code point 0 .. 0x10FFFF.  Unlike the result of get_lbp, each
   entry holds a single LBP_xx class number, not a bit mask.  Populated by
   fill_org_lbp, which defaults every entry to LBP_XX. */
5852 int unicode_org_lbp[0x110000];
5854 /* Stores in unicode_org_lbp[] the line breaking property from the
5855 LineBreak.txt file. */
/* Parameter: linebreak_filename is the path of the file to read.
   Every entry is first reset to LBP_XX; entries named in the file are then
   overwritten with the LBP_xx class number that corresponds to the class
   name in the second field.  Problems with opening, parsing or reading the
   file are reported on stderr.
   NOTE(review): the embedded line numbers skip in several places (for
   example 5872 -> 5888 and 5903 -> 5938), so the return type, some local
   declarations, braces, the uses of TRY and whatever follows each error
   message are not visible in this listing -- confirm against the complete
   file. */
5857 fill_org_lbp (const char *linebreak_filename)
/* Buffers for the three fields of one data line. */
5861 char field0[FIELDLEN];
5862 char field1[FIELDLEN];
5863 char field2[FIELDLEN];
5866 for (i = 0; i < 0x110000; i++)
5867 unicode_org_lbp[i] = LBP_XX;
5869 stream = fopen (linebreak_filename, "r");
5872 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume the remainder of the current line (up to newline or EOF).
   NOTE(review): presumably this skips a comment line; the enclosing
   condition is not visible in this listing -- confirm. */
5888 do c = getc (stream); while (c != EOF && c != '\n');
/* getfield is defined elsewhere; presumably each call reads up to the given
   delimiter.  field0 is parsed below as a hex code point or a range, field1
   is the class name, field2 is not used further in the visible code. */
5892 n = getfield (stream, field0, ';');
5893 n += getfield (stream, field1, ' ');
5894 n += getfield (stream, field2, '\n');
5899 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY(LBP_xx) compares field1 with the enumerator name minus its first four
   characters ("LBP_"), i.e. with the two-letter class name, and selects that
   enumerator on a match.  It expands to an else-if arm, so it must follow an
   if statement. */
5903 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Class names without an enumerator of their own: LF, CR and NL are folded
   into LBP_BK, SG into LBP_XX. */
5938 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
5939 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
5940 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
5941 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
5944 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
5945 field1, linebreak_filename, lineno);
/* strtoul stops at the first character that is not a hex digit, so for a
   range "first..last" this yields the first code point.
   NOTE(review): i and j come straight from the file and are not checked
   against the array size 0x110000 in the visible code. */
5948 i = strtoul (field0, NULL, 16);
5949 if (strstr (field0, "..") != NULL)
5951 /* Deal with a range. */
5952 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
/* NOTE(review): the loop header running i up to j is not visible in this
   listing (numbers skip 5952 -> 5954). */
5954 unicode_org_lbp[i] = value;
5958 /* Single character line. */
5959 unicode_org_lbp[i] = value;
/* ferror reports an earlier read error; fclose reports a failure to close. */
5962 if (ferror (stream) || fclose (stream))
5964 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
5969 /* Output the line breaking properties in a human readable format. */
/* Writes to stream the contents of unicode_org_lbp[]: "0x" followed by the
   code point in at least four uppercase hex digits, then " LBP_xx" for the
   class stored for that code point, then a newline.
   NOTE(review): the embedded numbers skip 5977 -> 5980, so it is not visible
   here whether some entries (for example LBP_XX) are skipped -- confirm. */
5971 debug_output_org_lbp (FILE *stream)
5975 for (i = 0; i < 0x110000; i++)
/* Despite its name, attr is a single class number here, not a bit mask. */
5977 int attr = unicode_org_lbp[i];
5980 fprintf (stream, "0x%04X", i);
/* PRINT_BIT therefore tests for equality with the enumerator instead of
   testing a bit; #bit stringifies the enumerator name. */
5981 #define PRINT_BIT(attr,bit) \
5982 if (attr == bit) fprintf (stream, " " #bit);
5983 PRINT_BIT(attr,LBP_BK);
5984 PRINT_BIT(attr,LBP_CM);
5985 PRINT_BIT(attr,LBP_WJ);
5986 PRINT_BIT(attr,LBP_ZW);
5987 PRINT_BIT(attr,LBP_GL);
5988 PRINT_BIT(attr,LBP_SP);
5989 PRINT_BIT(attr,LBP_B2);
5990 PRINT_BIT(attr,LBP_BA);
5991 PRINT_BIT(attr,LBP_BB);
5992 PRINT_BIT(attr,LBP_HY);
5993 PRINT_BIT(attr,LBP_CB);
5994 PRINT_BIT(attr,LBP_CL);
5995 PRINT_BIT(attr,LBP_EX);
5996 PRINT_BIT(attr,LBP_IN);
5997 PRINT_BIT(attr,LBP_NS);
5998 PRINT_BIT(attr,LBP_OP);
5999 PRINT_BIT(attr,LBP_QU);
6000 PRINT_BIT(attr,LBP_IS);
6001 PRINT_BIT(attr,LBP_NU);
6002 PRINT_BIT(attr,LBP_PO);
6003 PRINT_BIT(attr,LBP_PR);
6004 PRINT_BIT(attr,LBP_SY);
6005 PRINT_BIT(attr,LBP_AI);
6006 PRINT_BIT(attr,LBP_AL);
6007 PRINT_BIT(attr,LBP_H2);
6008 PRINT_BIT(attr,LBP_H3);
6009 PRINT_BIT(attr,LBP_ID);
6010 PRINT_BIT(attr,LBP_JL);
6011 PRINT_BIT(attr,LBP_JV);
6012 PRINT_BIT(attr,LBP_JT);
6013 PRINT_BIT(attr,LBP_SA);
6014 PRINT_BIT(attr,LBP_XX);
6016 fprintf (stream, "\n");
/* Write the dump produced by debug_output_org_lbp to the file FILENAME,
   which is opened for writing.  Diagnostics go to stderr, one for a failed
   open and one for a write or close error.  */
6022 debug_output_org_lbrk_tables (const char *filename)
6026 stream = fopen (filename, "w");
6029 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6033 debug_output_org_lbp (stream);
/* fclose is evaluated only when ferror reports no error; a nonzero fclose
   result is then reported as a write failure too.  */
6035 if (ferror (stream) || fclose (stream))
6037 fprintf (stderr, "error writing to '%s'\n", filename);
6042 /* Construction of sparse 3-level tables. */
/* Parameters for the line breaking property table: the table type is named
   lbp_table, its elements are unsigned char, and LBP_XX is the default
   element.  xmalloc and xrealloc are mapped directly onto malloc and
   realloc.
   NOTE(review): presumably these macros parameterize a generic 3-level
   table template that is included right after this point; the include is
   not visible in this excerpt -- confirm.  */
6043 #define TABLE lbp_table
6044 #define ELEMENT unsigned char
6045 #define DEFAULT LBP_XX
6046 #define xmalloc malloc
6047 #define xrealloc realloc
/* Build the 3-level table of line breaking properties and print it as C
   source.  STREAM1 receives the lbrkprop_header_N macros, the lbrkprop_t
   typedef and the extern declaration of unilbrkprop; STREAM2 receives the
   initializer of unilbrkprop.  */
6051 output_lbp (FILE *stream1, FILE *stream2)
6055 unsigned int level1_offset, level2_offset, level3_offset;
6059 lbp_table_init (&t);
/* Fill the table from get_lbp for every code point 0 .. 0x10FFFF.  */
6061 for (i = 0; i < 0x110000; i++)
6063 int attr = get_lbp (i);
6065 /* Now attr should contain exactly one bit. */
6066 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Values equal to the default (the LBP_XX bit) are not added.  For all
   others the stored element is the bit index of attr, which the
   empty-bodied shift loop below computes.  */
6069 if (attr != 1 << LBP_XX)
6071 unsigned int log2_attr;
6072 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6074 lbp_table_add (&t, i, log2_attr);
6078 lbp_table_finalize (&t);
/* Byte offsets inside t.result: five uint32_t header words, followed by
   t.level1_size words, followed by (t.level2_size << t.q) words.
   NOTE(review): presumably the three sums below are assigned to
   level1_offset, level2_offset and level3_offset; the left-hand sides are
   not visible in this excerpt.  */
6081 5 * sizeof (uint32_t);
6083 5 * sizeof (uint32_t)
6084 + t.level1_size * sizeof (uint32_t);
6086 5 * sizeof (uint32_t)
6087 + t.level1_size * sizeof (uint32_t)
6088 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words are emitted as preprocessor constants.
   NOTE(review): a uint32_t value is passed for a %d conversion here; a
   conversion from inttypes.h would match the argument type exactly.  */
6090 for (i = 0; i < 5; i++)
6091 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
6092 ((uint32_t *) t.result)[i]);
6093 fprintf (stream1, "\n");
/* Declaration of the table type; array bounds are printed symbolically as
   size << shift.  */
6094 fprintf (stream1, "typedef struct\n");
6095 fprintf (stream1, " {\n");
6096 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6097 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6098 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6099 fprintf (stream1, " }\n");
6100 fprintf (stream1, "lbrkprop_t;\n");
6101 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
6103 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
6104 fprintf (stream2, "{\n");
/* Level 1: each entry is printed either as -1 or as an index into level2
   (byte offset rebased to level2_offset, divided by the word size).  The
   condition selecting between the two is not visible in this excerpt.
   Eight entries per output row.  */
6105 fprintf (stream2, " {");
6106 if (t.level1_size > 8)
6107 fprintf (stream2, "\n ");
6108 for (i = 0; i < t.level1_size; i++)
6111 if (i > 0 && (i % 8) == 0)
6112 fprintf (stream2, "\n ");
6113 offset = ((uint32_t *) (t.result + level1_offset))[i];
6115 fprintf (stream2, " %5d", -1);
6117 fprintf (stream2, " %5zu",
6118 (offset - level2_offset) / sizeof (uint32_t));
6119 if (i+1 < t.level1_size)
6120 fprintf (stream2, ",");
6122 if (t.level1_size > 8)
6123 fprintf (stream2, "\n ");
6124 fprintf (stream2, " },\n");
/* Level 2: same scheme, with indices relative to level3_offset and counted
   in unsigned char elements.  */
6125 fprintf (stream2, " {");
6126 if (t.level2_size << t.q > 8)
6127 fprintf (stream2, "\n ");
6128 for (i = 0; i < t.level2_size << t.q; i++)
6131 if (i > 0 && (i % 8) == 0)
6132 fprintf (stream2, "\n ");
6133 offset = ((uint32_t *) (t.result + level2_offset))[i];
6135 fprintf (stream2, " %5d", -1);
6137 fprintf (stream2, " %5zu",
6138 (offset - level3_offset) / sizeof (unsigned char));
6139 if (i+1 < t.level2_size << t.q)
6140 fprintf (stream2, ",");
6142 if (t.level2_size << t.q > 8)
6143 fprintf (stream2, "\n ");
6144 fprintf (stream2, " },\n");
/* Level 3: the stored elements themselves, printed by name rather than by
   number.  CASE turns one constant into a switch label that selects the
   spelled-out identifier as the text to print.  */
6145 fprintf (stream2, " {");
6146 if (t.level3_size << t.p > 8)
6147 fprintf (stream2, "\n ");
6148 for (i = 0; i < t.level3_size << t.p; i++)
6150 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6151 const char *value_string;
6154 #define CASE(x) case x: value_string = #x; break;
6191 if (i > 0 && (i % 8) == 0)
6192 fprintf (stream2, "\n ");
6193 fprintf (stream2, " %s%s", value_string,
6194 (i+1 < t.level3_size << t.p ? "," : ""));
6196 if (t.level3_size << t.p > 8)
6197 fprintf (stream2, "\n ");
6198 fprintf (stream2, " }\n");
6199 fprintf (stream2, "};\n");
/* Create the two generated line breaking files FILENAME1 and FILENAME2.
   Both are opened for writing, both receive the same banner and license
   notice, and output_lbp then writes the table declarations to the first
   and the table data to the second.  Open and write failures are reported
   on stderr.
   NOTE(review): VERSION is presumably the argument of the %s in the
   banner; that argument line is not visible in this excerpt.  */
6203 output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
6205 const char *filenames[2];
6209 filenames[0] = filename1;
6210 filenames[1] = filename2;
6212 for (i = 0; i < 2; i++)
6214 streams[i] = fopen (filenames[i], "w");
6215 if (streams[i] == NULL)
6217 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* NOTE(review): the banner below names the generator gen-lbrk, while the
   banners written by output_wbrk_tables and output_composition_tables in
   this file say gen-uni-tables.  Looks like a leftover from an older
   program name -- confirm before changing, since the text ends up in
   generated files.  */
6222 for (i = 0; i < 2; i++)
6224 FILE *stream = streams[i];
6226 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6227 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6228 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
6230 fprintf (stream, "\n");
6232 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6233 still carries the GPL header), and it's gnulib-tool which replaces the
6234 GPL header with an LGPL header. */
6235 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
6236 fprintf (stream, "\n");
6237 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6238 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6239 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6240 fprintf (stream, " (at your option) any later version.\n");
6241 fprintf (stream, "\n");
6242 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6243 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6244 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6245 fprintf (stream, " GNU General Public License for more details.\n");
6246 fprintf (stream, "\n");
6247 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6248 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6249 fprintf (stream, "\n");
6252 output_lbp (streams[0], streams[1]);
/* Check and close both streams; a failure names the affected file.  */
6254 for (i = 0; i < 2; i++)
6256 if (ferror (streams[i]) || fclose (streams[i]))
6258 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
6264 /* ========================================================================= */
6266 /* Word break property. */
6268 /* Possible values of the Word_Break property. */
/* The WBP_* values play two roles in this file: get_wbp uses them as bit
   positions (1 << WBP_x), while fill_org_wbp stores them unshifted in
   unicode_org_wbp[].  Only one enumerator is visible in this excerpt.  */
6283 WBP_EXTENDNUMLET = 7
6286 /* Returns the word breaking property for ch, as a bit mask. */
/* Each rule below that matches CH ORs the bit (1 << WBP_x) into attr, so
   the result carries one bit per matching rule.  The rules consult
   unicode_attributes[], unicode_properties[], unicode_scripts[] and
   get_lbp.  Several conditions are only partly visible in this excerpt;
   the comments describe the visible parts only.  */
6288 get_wbp (unsigned int ch)
/* Classification is attempted for characters that have an entry (a
   non-NULL name) in unicode_attributes[].  */
6292 if (unicode_attributes[ch].name != NULL)
6295 attr |= 1 << WBP_CR;
6298 attr |= 1 << WBP_LF;
/* Newline: U+000B, U+000C, U+2028, U+2029 (plus whatever the line missing
   from this excerpt lists).  */
6300 if (ch == 0x000B || ch == 0x000C
6302 || ch == 0x2028 || ch == 0x2029)
6303 attr |= 1 << WBP_NEWLINE;
/* Extend: the Grapheme_Extend property bit, or general category Mc.  */
6305 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
6306 || (unicode_attributes[ch].category != NULL
6307 && strcmp (unicode_attributes[ch].category, "Mc") == 0))
6308 attr |= 1 << WBP_EXTEND;
/* Format: general category Cf, except U+200C and U+200D.  */
6310 if (unicode_attributes[ch].category != NULL
6311 && strcmp (unicode_attributes[ch].category, "Cf") == 0
6312 && ch != 0x200C && ch != 0x200D)
6313 attr |= 1 << WBP_FORMAT;
/* Katakana: script Katakana, plus the explicitly listed code points.  */
6315 if ((unicode_scripts[ch] < numscripts
6316 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
6317 || (ch >= 0x3031 && ch <= 0x3035)
6318 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
6320 attr |= 1 << WBP_KATAKANA;
/* ALetter: alphabetic characters, minus ideographs, minus anything already
   marked Katakana or Extend above, minus characters whose line breaking
   property is SA, minus script Hiragana.  This rule depends on the
   Katakana and Extend rules having run first.  */
6322 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
6324 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
6325 && (attr & (1 << WBP_KATAKANA)) == 0
6326 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
6327 && !(unicode_scripts[ch] < numscripts
6328 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
6329 && (attr & (1 << WBP_EXTEND)) == 0)
6330 attr |= 1 << WBP_ALETTER;
/* MidNumLet: a fixed list of apostrophe-like and full-stop-like code
   points.  */
6332 if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019
6333 || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E)
6334 attr |= 1 << WBP_MIDNUMLET;
/* MidLetter: a fixed list of code points.  */
6336 if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A
6337 || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A)
6338 attr |= 1 << WBP_MIDLETTER;
/* MidNum: line breaking property IS or one of the listed code points, but
   never U+003A, U+FE13 or U+002E.  */
6340 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
6341 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
6343 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
6344 attr |= 1 << WBP_MIDNUM;
/* Numeric: line breaking property NU (the rest of the condition is not
   visible in this excerpt).  */
6346 if (((get_lbp (ch) >> LBP_NU) & 1) != 0
6348 attr |= 1 << WBP_NUMERIC;
/* ExtendNumLet: general category Pc.  */
6350 if (unicode_attributes[ch].category != NULL
6351 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
6352 attr |= 1 << WBP_EXTENDNUMLET;
/* Fallback value Other; its guarding condition is not visible in this
   excerpt.  */
6357 attr |= 1 << WBP_OTHER;
6362 /* Output the word break property in a human readable format. */
/* Uses the values computed by get_wbp (not the ones read from the data
   file).  A code point whose mask is exactly the Other bit is skipped; for
   the rest, the code point is written in hexadecimal followed by the name
   of every bit set in the mask, so a code point matched by several rules
   shows several names.  */
6364 debug_output_wbp (FILE *stream)
6368 for (i = 0; i < 0x110000; i++)
6370 int attr = get_wbp (i);
6371 if (attr != 1 << WBP_OTHER)
6373 fprintf (stream, "0x%04X", i);
6374 if (attr & (1 << WBP_CR))
6375 fprintf (stream, " CR");
6376 if (attr & (1 << WBP_LF))
6377 fprintf (stream, " LF");
6378 if (attr & (1 << WBP_NEWLINE))
6379 fprintf (stream, " Newline");
6380 if (attr & (1 << WBP_EXTEND))
6381 fprintf (stream, " Extend");
6382 if (attr & (1 << WBP_FORMAT))
6383 fprintf (stream, " Format");
6384 if (attr & (1 << WBP_KATAKANA))
6385 fprintf (stream, " Katakana");
6386 if (attr & (1 << WBP_ALETTER))
6387 fprintf (stream, " ALetter");
6388 if (attr & (1 << WBP_MIDNUMLET))
6389 fprintf (stream, " MidNumLet");
6390 if (attr & (1 << WBP_MIDLETTER))
6391 fprintf (stream, " MidLetter");
6392 if (attr & (1 << WBP_MIDNUM))
6393 fprintf (stream, " MidNum");
6394 if (attr & (1 << WBP_NUMERIC))
6395 fprintf (stream, " Numeric");
6396 if (attr & (1 << WBP_EXTENDNUMLET))
6397 fprintf (stream, " ExtendNumLet");
6398 fprintf (stream, "\n");
/* Write the dump produced by debug_output_wbp to the file FILENAME, which
   is opened for writing.  Diagnostics go to stderr, one for a failed open
   and one for a write or close error.  */
6404 debug_output_wbrk_tables (const char *filename)
6408 stream = fopen (filename, "w");
6411 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6415 debug_output_wbp (stream);
/* fclose is evaluated only when ferror reports no error.  */
6417 if (ferror (stream) || fclose (stream))
6419 fprintf (stderr, "error writing to '%s'\n", filename);
6424 /* The word break property from the WordBreakProperty.txt file. */
/* One entry per code point 0 .. 0x10FFFF.  Entries hold plain WBP_* values
   (not bit masks): fill_org_wbp initializes them to WBP_OTHER and then
   overwrites the ranges listed in the data file.  */
6425 int unicode_org_wbp[0x110000];
6427 /* Stores in unicode_org_wbp[] the word break property from the
6428 WordBreakProperty.txt file. */
/* WORDBREAKPROPERTY_FILENAME is opened for reading and parsed line by
   line.  Problems (open failure, unparsable line, unknown property name,
   read error) are reported on stderr.  */
6430 fill_org_wbp (const char *wordbreakproperty_filename)
/* Everything starts out as Other; the file only lists the exceptions.  */
6435 for (i = 0; i < 0x110000; i++)
6436 unicode_org_wbp[i] = WBP_OTHER;
6438 stream = fopen (wordbreakproperty_filename, "r");
6441 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
6448 unsigned int i1, i2;
6449 char padding[200+1];
6450 char propname[200+1];
/* Read one line of at most 200 characters; the trailing newline directive
   also consumes the line terminator and any whitespace after it.  A
   result below 1 means nothing more could be read.  */
6453 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and lines starting with a hash sign carry no data.  */
6456 if (buf[0] == '\0' || buf[0] == '#')
/* Two accepted shapes: a range FIRST..LAST followed by blanks and
   semicolons and the property name, or a single code point followed by
   the same.  The range form is tried first.
   NOTE(review): the bracket conversions carry no field width.  That is
   safe only while buf cannot exceed the 200 characters that padding and
   propname can hold; the declaration of buf is not visible here --
   confirm.  */
6459 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
6461 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
6463 fprintf (stderr, "parse error in '%s'\n",
6464 wordbreakproperty_filename);
/* PROP expands to one link of an if / else-if chain that maps a property
   name from the file to its WBP_* value.  */
6469 #define PROP(name,value) \
6470 if (strcmp (propname, name) == 0) propvalue = value; else
6473 PROP ("Newline", WBP_NEWLINE)
6474 PROP ("Extend", WBP_EXTEND)
6475 PROP ("Format", WBP_FORMAT)
6476 PROP ("Katakana", WBP_KATAKANA)
6477 PROP ("ALetter", WBP_ALETTER)
6478 PROP ("MidNumLet", WBP_MIDNUMLET)
6479 PROP ("MidLetter", WBP_MIDLETTER)
6480 PROP ("MidNum", WBP_MIDNUM)
6481 PROP ("Numeric", WBP_NUMERIC)
6482 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6485 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
6486 wordbreakproperty_filename);
/* The range must be ordered and must end inside the Unicode code space
   before it is used to index the array.  */
6489 if (!(i1 <= i2 && i2 < 0x110000))
6492 for (i = i1; i <= i2; i++)
6493 unicode_org_wbp[i] = propvalue;
6496 if (ferror (stream) || fclose (stream))
6498 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
6503 /* Output the word break property in a human readable format. */
/* Uses the values read from the data file (unicode_org_wbp[]), not the
   ones computed by get_wbp.  Code points whose value is WBP_OTHER are
   skipped.  For the rest, the code point is written in hexadecimal,
   followed by the property name, or by two question marks when the value
   matches none of the names in the chain.  */
6505 debug_output_org_wbp (FILE *stream)
6509 for (i = 0; i < 0x110000; i++)
6511 int propvalue = unicode_org_wbp[i];
6512 if (propvalue != WBP_OTHER)
6514 fprintf (stream, "0x%04X", i);
/* PROP is defined here as one link of an if / else-if chain that prints
   the name belonging to a value.  */
6515 #define PROP(name,value) \
6516 if (propvalue == value) fprintf (stream, " " name); else
6519 PROP ("Newline", WBP_NEWLINE)
6520 PROP ("Extend", WBP_EXTEND)
6521 PROP ("Format", WBP_FORMAT)
6522 PROP ("Katakana", WBP_KATAKANA)
6523 PROP ("ALetter", WBP_ALETTER)
6524 PROP ("MidNumLet", WBP_MIDNUMLET)
6525 PROP ("MidLetter", WBP_MIDLETTER)
6526 PROP ("MidNum", WBP_MIDNUM)
6527 PROP ("Numeric", WBP_NUMERIC)
6528 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
6530 fprintf (stream, " ??");
6531 fprintf (stream, "\n");
/* Write the dump produced by debug_output_org_wbp to the file FILENAME,
   which is opened for writing.  Diagnostics go to stderr, one for a failed
   open and one for a write or close error.  */
6537 debug_output_org_wbrk_tables (const char *filename)
6541 stream = fopen (filename, "w");
6544 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6548 debug_output_org_wbp (stream);
/* fclose is evaluated only when ferror reports no error.  */
6550 if (ferror (stream) || fclose (stream))
6552 fprintf (stderr, "error writing to '%s'\n", filename);
6557 /* Construction of sparse 3-level tables. */
/* Parameters for the word break property table: the table type is named
   wbp_table, its elements are unsigned char, and WBP_OTHER is the default
   element.  xmalloc and xrealloc are mapped directly onto malloc and
   realloc.
   NOTE(review): presumably these macros parameterize a generic 3-level
   table template that is included right after this point; the include is
   not visible in this excerpt -- confirm.  */
6558 #define TABLE wbp_table
6559 #define ELEMENT unsigned char
6560 #define DEFAULT WBP_OTHER
6561 #define xmalloc malloc
6562 #define xrealloc realloc
/* Build the 3-level table of word break properties and print it to STREAM
   as C source: the wbrkprop_header_N macros, the wbrkprop_t typedef, and a
   static const object uniwbrkprop with its initializer.  */
6566 output_wbp (FILE *stream)
6570 unsigned int level1_offset, level2_offset, level3_offset;
6574 wbp_table_init (&t);
/* Fill the table from get_wbp for every code point 0 .. 0x10FFFF.  */
6576 for (i = 0; i < 0x110000; i++)
6578 int attr = get_wbp (i);
6580 /* Now attr should contain exactly one bit. */
6581 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* Values equal to the default (the WBP_OTHER bit) are not added.  For all
   others the stored element is the bit index of attr, which the
   empty-bodied shift loop below computes.  */
6584 if (attr != 1 << WBP_OTHER)
6586 unsigned int log2_attr;
6587 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
6589 wbp_table_add (&t, i, log2_attr);
6593 wbp_table_finalize (&t);
/* Byte offsets inside t.result: five uint32_t header words, followed by
   t.level1_size words, followed by (t.level2_size << t.q) words.
   NOTE(review): presumably the three sums below are assigned to
   level1_offset, level2_offset and level3_offset; the left-hand sides are
   not visible in this excerpt.  */
6596 5 * sizeof (uint32_t);
6598 5 * sizeof (uint32_t)
6599 + t.level1_size * sizeof (uint32_t);
6601 5 * sizeof (uint32_t)
6602 + t.level1_size * sizeof (uint32_t)
6603 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words are emitted as preprocessor constants.
   NOTE(review): a uint32_t value is passed for a %d conversion here; a
   conversion from inttypes.h would match the argument type exactly.  */
6605 for (i = 0; i < 5; i++)
6606 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
6607 ((uint32_t *) t.result)[i]);
6608 fprintf (stream, "\n");
6609 fprintf (stream, "typedef struct\n");
6610 fprintf (stream, " {\n");
6611 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6612 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
6613 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
6614 fprintf (stream, " }\n");
6615 fprintf (stream, "wbrkprop_t;\n");
6616 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
6617 fprintf (stream, "{\n");
/* Level 1: each entry is printed either as -1 or as an index into level2
   (byte offset rebased to level2_offset, divided by the word size).  The
   condition selecting between the two is not visible in this excerpt.
   Eight entries per output row.  */
6618 fprintf (stream, " {");
6619 if (t.level1_size > 8)
6620 fprintf (stream, "\n ");
6621 for (i = 0; i < t.level1_size; i++)
6624 if (i > 0 && (i % 8) == 0)
6625 fprintf (stream, "\n ");
6626 offset = ((uint32_t *) (t.result + level1_offset))[i];
6628 fprintf (stream, " %5d", -1);
6630 fprintf (stream, " %5zu",
6631 (offset - level2_offset) / sizeof (uint32_t));
6632 if (i+1 < t.level1_size)
6633 fprintf (stream, ",");
6635 if (t.level1_size > 8)
6636 fprintf (stream, "\n ");
6637 fprintf (stream, " },\n");
/* Level 2: same scheme, with indices relative to level3_offset and counted
   in unsigned char elements.  */
6638 fprintf (stream, " {");
6639 if (t.level2_size << t.q > 8)
6640 fprintf (stream, "\n ");
6641 for (i = 0; i < t.level2_size << t.q; i++)
6644 if (i > 0 && (i % 8) == 0)
6645 fprintf (stream, "\n ");
6646 offset = ((uint32_t *) (t.result + level2_offset))[i];
6648 fprintf (stream, " %5d", -1);
6650 fprintf (stream, " %5zu",
6651 (offset - level3_offset) / sizeof (unsigned char));
6652 if (i+1 < t.level2_size << t.q)
6653 fprintf (stream, ",");
6655 if (t.level2_size << t.q > 8)
6656 fprintf (stream, "\n ");
6657 fprintf (stream, " },\n");
/* Level 3: the stored elements, printed by constant name, four per output
   row (the other two levels use eight).  CASE turns one constant into a
   switch label that selects the spelled-out identifier as the text to
   print.  */
6658 fprintf (stream, " {");
6659 if (t.level3_size << t.p > 4)
6660 fprintf (stream, "\n ");
6661 for (i = 0; i < t.level3_size << t.p; i++)
6663 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
6664 const char *value_string;
6667 #define CASE(x) case x: value_string = #x; break;
6676 CASE(WBP_MIDNUMLET);
6677 CASE(WBP_MIDLETTER);
6680 CASE(WBP_EXTENDNUMLET);
6685 if (i > 0 && (i % 4) == 0)
6686 fprintf (stream, "\n ");
6687 fprintf (stream, " %s%s", value_string,
6688 (i+1 < t.level3_size << t.p ? "," : ""));
6690 if (t.level3_size << t.p > 4)
6691 fprintf (stream, "\n ");
6692 fprintf (stream, " }\n");
6693 fprintf (stream, "};\n");
/* Create the generated word break table file FILENAME: open it for
   writing, write a banner and license notice, then let output_wbp write
   the table.  Open and write failures are reported on stderr.
   NOTE(review): VERSION is presumably the argument of the %s in the
   banner; that argument line is not visible in this excerpt.
   NOTE(review): the second banner line written below says Line breaking
   properties although this file holds the word break table (it is filled
   by output_wbp).  Looks copied from output_lbrk_tables -- confirm before
   changing, since the text ends up in a generated file.  */
6697 output_wbrk_tables (const char *filename, const char *version)
6701 stream = fopen (filename, "w");
6704 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6708 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6709 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
6710 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
6712 fprintf (stream, "\n");
6714 /* Put a GPL header on it. The gnulib module is under LGPL (although it
6715 still carries the GPL header), and it's gnulib-tool which replaces the
6716 GPL header with an LGPL header. */
6717 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n");
6718 fprintf (stream, "\n");
6719 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
6720 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
6721 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
6722 fprintf (stream, " (at your option) any later version.\n");
6723 fprintf (stream, "\n");
6724 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
6725 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
6726 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
6727 fprintf (stream, " GNU General Public License for more details.\n");
6728 fprintf (stream, "\n");
6729 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
6730 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
6731 fprintf (stream, "\n");
6733 output_wbp (stream);
/* fclose is evaluated only when ferror reports no error.  */
6735 if (ferror (stream) || fclose (stream))
6737 fprintf (stderr, "error writing to '%s'\n", filename);
6742 /* ========================================================================= */
6744 /* Maximum number of characters into which a single Unicode character can be decomposed. */
6746 #define MAX_DECOMP_LENGTH 18
/* Decomposition types.  UC_DECOMP_CANONICAL is what get_decomposition
   reports for a mapping without a leading tag; every other entry
   corresponds to one tag in angle brackets that get_decomposition
   recognizes.  output_decomposition stores the type value in the emitted
   table, so the numeric values are part of the generated data.  */
6750 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
6751 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
6752 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
6753 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
6754 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
6755 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
6756 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
6757 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
6758 UC_DECOMP_SUPER, /* <super> A superscript form. */
6759 UC_DECOMP_SUB, /* <sub> A subscript form. */
6760 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
6761 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
6762 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
6763 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
6764 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
6765 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
6766 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
6769 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
6770 decompositions). Return the type, or -1 for none. */
/* CH selects the entry of unicode_attributes[] whose decomposition string
   is parsed.  DECOMPOSED receives the parsed code points, at most
   MAX_DECOMP_LENGTH of them.
   NOTE(review): the count is presumably stored through LENGTHP; that
   store is not visible in this excerpt.  */
6772 get_decomposition (unsigned int ch,
6773 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
6775 const char *decomposition = unicode_attributes[ch].decomposition;
/* A missing or empty mapping means there is nothing to parse.  */
6777 if (decomposition != NULL && decomposition[0] != '\0')
/* Canonical unless a tag says otherwise.  */
6779 int type = UC_DECOMP_CANONICAL;
6780 unsigned int length;
/* A leading tag in angle brackets names the compatibility type.  typelen
   covers the whole tag, both brackets included.  TYPE compares the tag
   against one known spelling, by length and then by content.  */
6783 if (decomposition[0] == '<')
6788 rangle = strchr (decomposition + 1, '>');
6791 typelen = rangle + 1 - decomposition;
6792 #define TYPE(t1,t2) \
6793 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
6796 TYPE ("<font>", UC_DECOMP_FONT)
6797 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
6798 TYPE ("<initial>", UC_DECOMP_INITIAL)
6799 TYPE ("<medial>", UC_DECOMP_MEDIAL)
6800 TYPE ("<final>", UC_DECOMP_FINAL)
6801 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
6802 TYPE ("<circle>", UC_DECOMP_CIRCLE)
6803 TYPE ("<super>", UC_DECOMP_SUPER)
6804 TYPE ("<sub>", UC_DECOMP_SUB)
6805 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
6806 TYPE ("<wide>", UC_DECOMP_WIDE)
6807 TYPE ("<narrow>", UC_DECOMP_NARROW)
6808 TYPE ("<small>", UC_DECOMP_SMALL)
6809 TYPE ("<square>", UC_DECOMP_SQUARE)
6810 TYPE ("<fraction>", UC_DECOMP_FRACTION)
6811 TYPE ("<compat>", UC_DECOMP_COMPAT)
/* NOTE(review): the star in this conversion supplies a minimum field
   width, not a maximum, so the entire remaining mapping string is printed
   rather than only the typelen characters of the tag.  A precision (dot
   followed by star) would limit the output to the tag -- confirm and
   fix.  */
6813 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
/* Continue after the tag, skipping one separating space.  */
6817 decomposition = rangle + 1;
6818 if (decomposition[0] == ' ')
/* Parse space-separated hexadecimal code points until no further number
   is found or the array is full.  */
6821 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
6823 decomposed[length] = strtoul (decomposition, &endptr, 16);
6824 if (endptr == decomposition)
6826 decomposition = endptr;
6827 if (decomposition[0] == ' ')
/* Input left over at this point means the array filled up first.  */
6830 if (*decomposition != '\0')
6831 /* MAX_DECOMP_LENGTH is too small. */
6841 /* Construction of sparse 3-level tables. */
/* Parameters for the decomposition index table: the table type is named
   decomp_table, its elements are uint16_t, and the all-ones value is the
   default element.  xmalloc and xrealloc are mapped directly onto malloc
   and realloc.
   NOTE(review): presumably these macros parameterize a generic 3-level
   table template that is included right after this point; the include is
   not visible in this excerpt -- confirm.  */
6842 #define TABLE decomp_table
6843 #define ELEMENT uint16_t
6844 #define DEFAULT (uint16_t)(-1)
6845 #define xmalloc malloc
6846 #define xrealloc realloc
/* Print the decomposition data as C source.  Two objects are produced:
   gl_uninorm_decomp_chars_table, a byte array holding the decomposed code
   points as 3-byte entries, and gl_uninorm_decomp_index_table, a 3-level
   table whose elements point into the byte array.  STREAM1 receives the
   declarations, STREAM2 the definitions.  */
6850 output_decomposition (FILE *stream1, FILE *stream2)
6852 struct decomp_table t;
6853 unsigned int level1_offset, level2_offset, level3_offset;
6854 unsigned int offset;
6860 decomp_table_init (&t);
6862 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
6863 fprintf (stream1, "\n");
6864 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
6867 for (ch = 0; ch < 0x110000; ch++)
6869 unsigned int length;
6870 unsigned int decomposed[MAX_DECOMP_LENGTH];
6871 int type = get_decomposition (ch, &length, decomposed);
/* The index element packs two things: bit 15 is 0 for a canonical and 1
   for any other decomposition, and the low 15 bits hold offset.  Hence
   offset has to stay below 1 << 15.  */
6875 if (!(offset < (1 << 15)))
6877 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
6879 /* Produce length 3-bytes entries. */
6881 /* We would need a special representation of zero-length entries. */
6883 for (i = 0; i < length; i++)
6886 fprintf (stream2, ",");
6887 if ((offset % 4) == 0)
6888 fprintf (stream2, "\n ");
/* Each code point must fit into 18 bits, because the bits above that are
   used for flags in the first byte.  */
6889 if (!(decomposed[i] < (1 << 18)))
/* One entry is a 24-bit word printed as three bytes, high byte first.
   Bit 23 is set when another code point of the same decomposition
   follows.  On the first code point only, TYPE is added starting at bit
   18.  The low 18 bits are the code point.  */
6891 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
6892 (((i+1 < length ? (1 << 23) : 0)
6893 | (i == 0 ? (type << 18) : 0)
6894 | decomposed[i]) >> 16) & 0xff,
6895 (decomposed[i] >> 8) & 0xff,
6896 decomposed[i] & 0xff);
6902 fprintf (stream2, "\n};\n");
6903 fprintf (stream2, "\n");
6905 decomp_table_finalize (&t);
/* Byte offsets inside t.result: five uint32_t header words, followed by
   t.level1_size words, followed by (t.level2_size << t.q) words.
   NOTE(review): presumably the three sums below are assigned to
   level1_offset, level2_offset and level3_offset; the left-hand sides are
   not visible in this excerpt.  */
6908 5 * sizeof (uint32_t);
6910 5 * sizeof (uint32_t)
6911 + t.level1_size * sizeof (uint32_t);
6913 5 * sizeof (uint32_t)
6914 + t.level1_size * sizeof (uint32_t)
6915 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Header words are emitted as preprocessor constants.
   NOTE(review): a uint32_t value is passed for a %d conversion here; a
   conversion from inttypes.h would match the argument type exactly.  */
6917 for (i = 0; i < 5; i++)
6918 fprintf (stream1, "#define decomp_header_%d %d\n", i,
6919 ((uint32_t *) t.result)[i]);
6920 fprintf (stream1, "\n");
6921 fprintf (stream1, "typedef struct\n");
6922 fprintf (stream1, " {\n");
6923 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
6924 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
6925 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
6926 fprintf (stream1, " }\n");
6927 fprintf (stream1, "decomp_index_table_t;\n");
6928 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
6929 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
6930 fprintf (stream2, "{\n");
/* Level 1: each entry is printed either as -1 or as an index into level2
   (byte offset rebased to level2_offset, divided by the word size).  The
   condition selecting between the two is not visible in this excerpt.
   Eight entries per output row.  */
6931 fprintf (stream2, " {");
6932 if (t.level1_size > 8)
6933 fprintf (stream2, "\n ");
6934 for (i = 0; i < t.level1_size; i++)
6937 if (i > 0 && (i % 8) == 0)
6938 fprintf (stream2, "\n ");
6939 offset = ((uint32_t *) (t.result + level1_offset))[i];
6941 fprintf (stream2, " %5d", -1);
6943 fprintf (stream2, " %5zu",
6944 (offset - level2_offset) / sizeof (uint32_t));
6945 if (i+1 < t.level1_size)
6946 fprintf (stream2, ",");
6948 if (t.level1_size > 8)
6949 fprintf (stream2, "\n ");
6950 fprintf (stream2, " },\n");
/* Level 2: same scheme, with indices relative to level3_offset and counted
   in uint16_t elements.  */
6951 fprintf (stream2, " {");
6952 if (t.level2_size << t.q > 8)
6953 fprintf (stream2, "\n ");
6954 for (i = 0; i < t.level2_size << t.q; i++)
6957 if (i > 0 && (i % 8) == 0)
6958 fprintf (stream2, "\n ");
6959 offset = ((uint32_t *) (t.result + level2_offset))[i];
6961 fprintf (stream2, " %5d", -1);
6963 fprintf (stream2, " %5zu",
6964 (offset - level3_offset) / sizeof (uint16_t));
6965 if (i+1 < t.level2_size << t.q)
6966 fprintf (stream2, ",");
6968 if (t.level2_size << t.q > 8)
6969 fprintf (stream2, "\n ");
6970 fprintf (stream2, " },\n");
/* Level 3: the packed index elements in decimal; the default element (all
   ones) is printed as -1.  */
6971 fprintf (stream2, " {");
6972 if (t.level3_size << t.p > 8)
6973 fprintf (stream2, "\n ");
6974 for (i = 0; i < t.level3_size << t.p; i++)
6976 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
6977 if (i > 0 && (i % 8) == 0)
6978 fprintf (stream2, "\n ");
6979 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
6980 if (i+1 < t.level3_size << t.p)
6981 fprintf (stream2, ",");
6983 if (t.level3_size << t.p > 8)
6984 fprintf (stream2, "\n ");
6985 fprintf (stream2, " }\n");
6986 fprintf (stream2, "};\n");
/* Create the two generated decomposition files FILENAME1 and FILENAME2.
   Both are opened for writing and receive the same three-line banner;
   output_decomposition then writes the declarations to the first and the
   data to the second.  Open and write failures are reported on stderr.
   NOTE(review): VERSION is presumably the argument of the %s in the
   banner; that argument line is not visible in this excerpt.  */
6990 output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
6992 const char *filenames[2];
6996 filenames[0] = filename1;
6997 filenames[1] = filename2;
6999 for (i = 0; i < 2; i++)
7001 streams[i] = fopen (filenames[i], "w");
7002 if (streams[i] == NULL)
7004 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
7009 for (i = 0; i < 2; i++)
7011 FILE *stream = streams[i];
7013 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7014 fprintf (stream, "/* Decomposition of Unicode characters. */\n");
7015 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
7017 fprintf (stream, "\n");
7020 output_decomposition (streams[0], streams[1]);
/* Check and close both streams; a failure names the affected file.  */
7022 for (i = 0; i < 2; i++)
7024 if (ferror (streams[i]) || fclose (streams[i]))
7026 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
7032 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
/* One flag per code point 0 .. 0x10FFFF: fill_composition_exclusions
   clears all of them and sets the entry to 1 for every code point listed
   in the file.  */
7033 char unicode_composition_exclusions[0x110000];
/* Fill unicode_composition_exclusions[] from the file
   COMPOSITIONEXCLUSIONS_FILENAME.  Each data line starts with one
   hexadecimal code point; that code point is marked as excluded.  Open,
   parse and read failures are reported on stderr.  */
7036 fill_composition_exclusions (const char *compositionexclusions_filename)
7041 stream = fopen (compositionexclusions_filename, "r");
7044 fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
/* Nothing is excluded until the file says so.  */
7048 for (i = 0; i < 0x110000; i++)
7049 unicode_composition_exclusions[i] = 0;
/* Read one line of at most 200 characters; the trailing newline directive
   also consumes the line terminator and any whitespace after it.  A
   result below 1 means nothing more could be read.  */
7056 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
/* Empty lines and lines starting with a hash sign carry no data.  */
7059 if (buf[0] == '\0' || buf[0] == '#')
/* Only the leading hexadecimal number is used; the rest of the line is
   ignored.  */
7062 if (sscanf (buf, "%X", &i) != 1)
7064 fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
/* The value must lie inside the Unicode code space before it is used as
   an array index.  */
7067 if (!(i < 0x110000))
7070 unicode_composition_exclusions[i] = 1;
7073 if (ferror (stream) || fclose (stream))
7075 fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
/* Write a human readable list of canonical compositions to the file
   FILENAME.  Each output line has four tab-separated fields; the visible
   part of the statement shows three hexadecimal numbers followed by the
   canonical combining class string of the second decomposition character.
   NOTE(review): the three numeric arguments are not visible in this
   excerpt; presumably they are code1, code2 and combined in some order.
   NOTE(review): the strcmp calls assume that the combining field of the
   looked-up characters is non-NULL, i.e. that every character appearing
   in a decomposition has an entry in unicode_attributes[] -- confirm.  */
7081 debug_output_composition_tables (const char *filename)
7086 stream = fopen (filename, "w");
7089 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7093 for (ch = 0; ch < 0x110000; ch++)
7095 unsigned int length;
7096 unsigned int decomposed[MAX_DECOMP_LENGTH];
7097 int type = get_decomposition (ch, &length, decomposed);
7099 if (type == UC_DECOMP_CANONICAL
7100 /* Consider only binary decompositions.
7101 Exclude singleton decompositions. */
/* The pair (code1, code2) is the decomposition; combined is the character
   it composes to, which is the loop variable itself.  */
7104 unsigned int code1 = decomposed[0];
7105 unsigned int code2 = decomposed[1];
7106 unsigned int combined = ch;
7108 /* Exclude decompositions where the first part is not a starter,
7109 i.e. is not of canonical combining class 0. */
7110 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7111 /* Exclude characters listed in CompositionExclusions.txt. */
7112 && !unicode_composition_exclusions[combined])
7114 /* The combined character must now also be a starter.
7116 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7119 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
7123 unicode_attributes[code2].combining);
7128 if (ferror (stream) || fclose (stream))
7130 fprintf (stderr, "error writing to '%s'\n", filename);
/* Create the file FILENAME holding the canonical composition rules as
   input for gperf: a banner and license notice, the gperf declarations
   section, and then one keyword line per composition rule.  Open and
   write failures are reported on stderr.
   NOTE(review): VERSION is presumably the argument of the %s in the
   banner; that argument line is not visible in this excerpt.  */
7136 output_composition_tables (const char *filename, const char *version)
7141 stream = fopen (filename, "w");
7144 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7148 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7149 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
7150 fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n",
7152 fprintf (stream, "\n");
7154 /* Put a GPL header on it. The gnulib module is under LGPL (although it
7155 still carries the GPL header), and it's gnulib-tool which replaces the
7156 GPL header with an LGPL header. */
7157 fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n");
7158 fprintf (stream, "\n");
7159 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
7160 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
7161 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
7162 fprintf (stream, " (at your option) any later version.\n");
7163 fprintf (stream, "\n");
7164 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
7165 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
7166 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
7167 fprintf (stream, " GNU General Public License for more details.\n");
7168 fprintf (stream, "\n");
7169 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
7170 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
7171 fprintf (stream, "\n");
7173 /* The composition table is a set of mappings (code1, code2) -> combined,
7175 367 values for code1 (from 0x003C to 0x30FD),
7176 54 values for code2 (from 0x0300 to 0x309A).
7177 For a fixed code1, there are from 1 to 19 possible values for code2.
7178 For a fixed code2, there are from 1 to 117 possible values for code1.
7179 This is a very sparse matrix.
7181 We want an O(1) hash lookup.
7183 We could implement the hash lookup by mapping (code1, code2) to a linear
7184 combination mul1*code1 + mul2*code2, which is then used as an index into
7185 a 3-level table. But this leads to a table of size 37 KB.
7187 We use gperf to implement the hash lookup, giving it the 928 sets of
7188 4 bytes (code1, code2) as input. gperf generates a hash table of size
7189 1527, which is quite good (60% filled). It requires an auxiliary table
7190 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
/* gperf declarations section.  The doubled percent signs in the format
   strings produce single percent signs in the output; the last line of
   this group writes the separator that ends the declarations.  */
7192 fprintf (stream, "struct composition_rule { char codes[4]; };\n");
7193 fprintf (stream, "%%struct-type\n");
7194 fprintf (stream, "%%language=ANSI-C\n");
7195 fprintf (stream, "%%define slot-name codes\n");
7196 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
7197 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
7198 fprintf (stream, "%%compare-lengths\n");
7199 fprintf (stream, "%%compare-strncmp\n");
7200 fprintf (stream, "%%readonly-tables\n");
7201 fprintf (stream, "%%omit-struct-type\n");
7202 fprintf (stream, "%%%%\n");
/* Keyword section.  The selection of rules uses the same tests as
   debug_output_composition_tables; in addition all three code points must
   lie below 0x10000, because each of code1 and code2 is written as exactly
   two escaped bytes (high byte first) of the 4-byte key.  */
7204 for (ch = 0; ch < 0x110000; ch++)
7206 unsigned int length;
7207 unsigned int decomposed[MAX_DECOMP_LENGTH];
7208 int type = get_decomposition (ch, &length, decomposed);
7210 if (type == UC_DECOMP_CANONICAL
7211 /* Consider only binary decompositions.
7212 Exclude singleton decompositions. */
7215 unsigned int code1 = decomposed[0];
7216 unsigned int code2 = decomposed[1];
7217 unsigned int combined = ch;
7219 /* Exclude decompositions where the first part is not a starter,
7220 i.e. is not of canonical combining class 0. */
7221 if (strcmp (unicode_attributes[code1].combining, "0") == 0
7222 /* Exclude characters listed in CompositionExclusions.txt. */
7223 && !unicode_composition_exclusions[combined])
7225 /* The combined character must now also be a starter.
7227 if (strcmp (unicode_attributes[combined].combining, "0") != 0)
7230 if (!(code1 < 0x10000))
7232 if (!(code2 < 0x10000))
7234 if (!(combined < 0x10000))
7237 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
7238 (code1 >> 8) & 0xff, code1 & 0xff,
7239 (code2 >> 8) & 0xff, code2 & 0xff,
7245 if (ferror (stream) || fclose (stream))
7247 fprintf (stderr, "error writing to '%s'\n", filename);
7252 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.

   FILENAME       path of the C source file to create; it is opened with
                  mode "w", so an existing file is overwritten.
   FUNCTION_NAME  name of the function under test; it is copied verbatim
                  into the generated "#define MAP(c) ..." line.
   FUNC           the reference mapping, evaluated once for every code point
                  in the range 0 .. 0x10FFFF.
   VERSION        Unicode version string quoted in the generated banner.

   The generated file consists of a banner, an #include of
   "test-mapping-part1.h", one "{ code, mapped }" line for every code point
   that FUNC does not map to itself (comma-separated), the MAP macro, and an
   #include of "test-mapping-part2.h".

   On failure to open or to write FILENAME, a message is printed to stderr
   and the program exits with status 1.  */
static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  FILE *stream;
  int need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character mapping functions.\n");
  fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
  fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
  fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
  fprintf (stream, " (at your option) any later version.\n");
  fprintf (stream, "\n");
  fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
  fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
  fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
  fprintf (stream, " GNU General Public License for more details.\n");
  fprintf (stream, "\n");
  fprintf (stream, " You should have received a copy of the GNU General Public License\n");
  fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
  fprintf (stream, "\n");
  /* Name the program that really produces this file (gen-uni-tables.c),
     so that readers of the generated test know what to rerun.  */
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-mapping-part1.h\"\n");
  fprintf (stream, "\n");

  /* Emit only the code points that are actually changed by FUNC.  The
     separator is written before each entry except the first, so that the
     last entry is not followed by a comma.  */
  need_comma = 0;
  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int value = func (ch);

      if (value != ch)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, 0x%04X }", ch, value);
          need_comma = 1;
        }
    }
  /* Terminate the last entry's line, if there was any entry at all.  */
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define MAP(c) %s (c)\n", function_name);
  fprintf (stream, "#include \"test-mapping-part2.h\"\n");

  /* fclose flushes buffered data, so its result must be checked too.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7323 /* Construction of sparse 3-level tables. */
7324 #define TABLE mapping_table
7325 #define ELEMENT int32_t
7327 #define xmalloc malloc
7328 #define xrealloc realloc
7331 /* Output a simple character mapping table to the given file. */
7334 output_simple_mapping (const char *filename,
7335 unsigned int (*func) (unsigned int),
7336 const char *version)
7340 struct mapping_table t;
7341 unsigned int level1_offset, level2_offset, level3_offset;
7343 stream = fopen (filename, "w");
7346 fprintf (stderr, "cannot open '%s' for writing\n", filename);
7350 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
7351 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
7352 fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n",
7357 mapping_table_init (&t);
7359 for (ch = 0; ch < 0x110000; ch++)
7361 int value = (int) func (ch) - (int) ch;
7363 mapping_table_add (&t, ch, value);
7366 mapping_table_finalize (&t);
7368 /* Offsets in t.result, in memory of this process. */
7370 5 * sizeof (uint32_t);
7372 5 * sizeof (uint32_t)
7373 + t.level1_size * sizeof (uint32_t);
7375 5 * sizeof (uint32_t)
7376 + t.level1_size * sizeof (uint32_t)
7377 + (t.level2_size << t.q) * sizeof (uint32_t);
7379 for (i = 0; i < 5; i++)
7380 fprintf (stream, "#define mapping_header_%d %d\n", i,
7381 ((uint32_t *) t.result)[i]);
7382 fprintf (stream, "static const\n");
7383 fprintf (stream, "struct\n");
7384 fprintf (stream, " {\n");
7385 fprintf (stream, " int level1[%zu];\n", t.level1_size);
7386 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
7387 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
7388 fprintf (stream, " }\n");
7389 fprintf (stream, "u_mapping =\n");
7390 fprintf (stream, "{\n");
7391 fprintf (stream, " {");
7392 if (t.level1_size > 8)
7393 fprintf (stream, "\n ");
7394 for (i = 0; i < t.level1_size; i++)
7397 if (i > 0 && (i % 8) == 0)
7398 fprintf (stream, "\n ");
7399 offset = ((uint32_t *) (t.result + level1_offset))[i];
7401 fprintf (stream, " %5d", -1);
7403 fprintf (stream, " %5zu",
7404 (offset - level2_offset) / sizeof (uint32_t));
7405 if (i+1 < t.level1_size)
7406 fprintf (stream, ",");
7408 if (t.level1_size > 8)
7409 fprintf (stream, "\n ");
7410 fprintf (stream, " },\n");
7411 fprintf (stream, " {");
7412 if (t.level2_size << t.q > 8)
7413 fprintf (stream, "\n ");
7414 for (i = 0; i < t.level2_size << t.q; i++)
7417 if (i > 0 && (i % 8) == 0)
7418 fprintf (stream, "\n ");
7419 offset = ((uint32_t *) (t.result + level2_offset))[i];
7421 fprintf (stream, " %5d", -1);
7423 fprintf (stream, " %5zu",
7424 (offset - level3_offset) / sizeof (int32_t));
7425 if (i+1 < t.level2_size << t.q)
7426 fprintf (stream, ",");
7428 if (t.level2_size << t.q > 8)
7429 fprintf (stream, "\n ");
7430 fprintf (stream, " },\n");
7431 fprintf (stream, " {");
7432 if (t.level3_size << t.p > 8)
7433 fprintf (stream, "\n ");
7434 for (i = 0; i < t.level3_size << t.p; i++)
7436 if (i > 0 && (i % 8) == 0)
7437 fprintf (stream, "\n ");
7438 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
7439 if (i+1 < t.level3_size << t.p)
7440 fprintf (stream, ",");
7442 if (t.level3_size << t.p > 8)
7443 fprintf (stream, "\n ");
7444 fprintf (stream, " }\n");
7445 fprintf (stream, "};\n");
7447 if (ferror (stream) || fclose (stream))
7449 fprintf (stderr, "error writing to '%s'\n", filename);
7454 /* ========================================================================= */
7457 main (int argc, char * argv[])
7459 const char *unicodedata_filename;
7460 const char *proplist_filename;
7461 const char *derivedproplist_filename;
7462 const char *scripts_filename;
7463 const char *blocks_filename;
7464 const char *proplist30_filename;
7465 const char *eastasianwidth_filename;
7466 const char *linebreak_filename;
7467 const char *wordbreakproperty_filename;
7468 const char *compositionexclusions_filename;
7469 const char *version;
7473 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt version\n",
7478 unicodedata_filename = argv[1];
7479 proplist_filename = argv[2];
7480 derivedproplist_filename = argv[3];
7481 scripts_filename = argv[4];
7482 blocks_filename = argv[5];
7483 proplist30_filename = argv[6];
7484 eastasianwidth_filename = argv[7];
7485 linebreak_filename = argv[8];
7486 wordbreakproperty_filename = argv[9];
7487 compositionexclusions_filename = argv[10];
7490 fill_attributes (unicodedata_filename);
7491 clear_properties ();
7492 fill_properties (proplist_filename);
7493 fill_properties (derivedproplist_filename);
7494 fill_properties30 (proplist30_filename);
7495 fill_scripts (scripts_filename);
7496 fill_blocks (blocks_filename);
7497 fill_width (eastasianwidth_filename);
7498 fill_org_lbp (linebreak_filename);
7499 fill_org_wbp (wordbreakproperty_filename);
7500 fill_composition_exclusions (compositionexclusions_filename);
7502 output_categories (version);
7503 output_category ("unictype/categ_of.h", version);
7504 output_combclass ("unictype/combining.h", version);
7505 output_bidi_category ("unictype/bidi_of.h", version);
7506 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
7507 output_decimal_digit ("unictype/decdigit.h", version);
7508 output_digit_test ("../tests/unictype/test-digit.h", version);
7509 output_digit ("unictype/digit.h", version);
7510 output_numeric_test ("../tests/unictype/test-numeric.h", version);
7511 output_numeric ("unictype/numeric.h", version);
7512 output_mirror ("unictype/mirror.h", version);
7513 output_properties (version);
7514 output_scripts (version);
7515 output_scripts_byname (version);
7516 output_blocks (version);
7517 output_ident_properties (version);
7518 output_old_ctype (version);
7520 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
7521 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
7522 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
7524 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
7525 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
7526 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
7528 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
7529 debug_output_composition_tables ("uninorm/composition.txt");
7530 output_composition_tables ("uninorm/composition-table.gperf", version);
7532 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
7533 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
7534 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
7535 output_simple_mapping ("unicase/toupper.h", to_upper, version);
7536 output_simple_mapping ("unicase/tolower.h", to_lower, version);
7537 output_simple_mapping ("unicase/totitle.h", to_title, version);
7543 * For Emacs M-x compile
7545 * compile-command: "
7546 gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
7548 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
7549 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
7550 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
7551 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
7552 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
7553 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
7554 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
7555 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
7556 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
7557 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \