/* Generate Unicode conforming Line Break Properties tables from a
   UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
/* Usage example:
     $ gen-lbrk /usr/local/share/Unidata/UnicodeData.txt \
                /usr/local/share/Unidata/EastAsianWidth.txt \
                /usr/local/share/Unidata/LineBreak.txt \
                ...
 */
/* This structure represents one line in the UnicodeData.txt file.
   The members appear in the same order as the fields of that file, after
   the leading code point field.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
52 /* Missing fields are represented with "" for strings, and NONE for
54 #define NONE (~(unsigned int)0)
56 /* The entire contents of the UnicodeData.txt file. */
57 struct unicode_attribute unicode_attributes [0x110000];
59 /* Stores in unicode_attributes[i] the values from the given fields. */
61 fill_attribute (unsigned int i,
62 const char *field1, const char *field2,
63 const char *field3, const char *field4,
64 const char *field5, const char *field6,
65 const char *field7, const char *field8,
66 const char *field9, const char *field10,
67 const char *field11, const char *field12,
68 const char *field13, const char *field14)
70 struct unicode_attribute * uni;
74 fprintf (stderr, "index too large\n");
77 uni = &unicode_attributes[i];
78 /* Copy the strings. */
79 uni->name = strdup (field1);
80 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
81 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
82 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
83 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
85 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
86 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
87 uni->mirrored = (field9[0] == 'Y');
88 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
89 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
90 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.
   NOTE(review): the line defining FIELDLEN is absent from this listing;
   120 is assumed here -- confirm against the original file.  */
#ifndef FIELDLEN
# define FIELDLEN 120
#endif

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped.  BUFFER is always NUL-terminated on return.
   Exits with an error message when the field does not fit into BUFFER.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   or read error before DELIM was seen).  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *buffer++ = c;
    }

  /* Terminate even on failure, so that BUFFER never holds an unterminated
     string.  At most FIELDLEN - 2 characters were stored above.  */
  *buffer = '\0';
  return (c == EOF ? 0 : 1);
}
130 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
133 fill_attributes (const char *unicodedata_filename)
137 char field0[FIELDLEN];
138 char field1[FIELDLEN];
139 char field2[FIELDLEN];
140 char field3[FIELDLEN];
141 char field4[FIELDLEN];
142 char field5[FIELDLEN];
143 char field6[FIELDLEN];
144 char field7[FIELDLEN];
145 char field8[FIELDLEN];
146 char field9[FIELDLEN];
147 char field10[FIELDLEN];
148 char field11[FIELDLEN];
149 char field12[FIELDLEN];
150 char field13[FIELDLEN];
151 char field14[FIELDLEN];
154 for (i = 0; i < 0x110000; i++)
155 unicode_attributes[i].name = NULL;
157 stream = fopen (unicodedata_filename, "r");
160 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
169 n = getfield (stream, field0, ';');
170 n += getfield (stream, field1, ';');
171 n += getfield (stream, field2, ';');
172 n += getfield (stream, field3, ';');
173 n += getfield (stream, field4, ';');
174 n += getfield (stream, field5, ';');
175 n += getfield (stream, field6, ';');
176 n += getfield (stream, field7, ';');
177 n += getfield (stream, field8, ';');
178 n += getfield (stream, field9, ';');
179 n += getfield (stream, field10, ';');
180 n += getfield (stream, field11, ';');
181 n += getfield (stream, field12, ';');
182 n += getfield (stream, field13, ';');
183 n += getfield (stream, field14, '\n');
188 fprintf (stderr, "short line in'%s':%d\n",
189 unicodedata_filename, lineno);
192 i = strtoul (field0, NULL, 16);
194 && strlen (field1) >= 9
195 && !strcmp (field1 + strlen(field1) - 8, ", First>"))
197 /* Deal with a range. */
199 n = getfield (stream, field0, ';');
200 n += getfield (stream, field1, ';');
201 n += getfield (stream, field2, ';');
202 n += getfield (stream, field3, ';');
203 n += getfield (stream, field4, ';');
204 n += getfield (stream, field5, ';');
205 n += getfield (stream, field6, ';');
206 n += getfield (stream, field7, ';');
207 n += getfield (stream, field8, ';');
208 n += getfield (stream, field9, ';');
209 n += getfield (stream, field10, ';');
210 n += getfield (stream, field11, ';');
211 n += getfield (stream, field12, ';');
212 n += getfield (stream, field13, ';');
213 n += getfield (stream, field14, '\n');
216 fprintf (stderr, "missing end range in '%s':%d\n",
217 unicodedata_filename, lineno);
220 if (!(field1[0] == '<'
221 && strlen (field1) >= 8
222 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
224 fprintf (stderr, "missing end range in '%s':%d\n",
225 unicodedata_filename, lineno);
228 field1[strlen (field1) - 7] = '\0';
229 j = strtoul (field0, NULL, 16);
231 fill_attribute (i, field1+1, field2, field3, field4, field5,
232 field6, field7, field8, field9, field10,
233 field11, field12, field13, field14);
237 /* Single character line */
238 fill_attribute (i, field1, field2, field3, field4, field5,
239 field6, field7, field8, field9, field10,
240 field11, field12, field13, field14);
243 if (ferror (stream) || fclose (stream))
245 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
/* The width property from the EastAsianWidth.txt file, indexed by code
   point.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
const char * unicode_width[0x110000];
254 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
257 fill_width (const char *width_filename)
261 char field0[FIELDLEN];
262 char field1[FIELDLEN];
263 char field2[FIELDLEN];
266 for (i = 0; i < 0x110000; i++)
267 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
269 stream = fopen (width_filename, "r");
272 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
287 do c = getc (stream); while (c != EOF && c != '\n');
291 n = getfield (stream, field0, ';');
292 n += getfield (stream, field1, ' ');
293 n += getfield (stream, field2, '\n');
298 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
301 i = strtoul (field0, NULL, 16);
302 if (strstr (field0, "..") != NULL)
304 /* Deal with a range. */
305 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
307 unicode_width[i] = strdup (field1);
311 /* Single character line. */
312 unicode_width[i] = strdup (field1);
315 if (ferror (stream) || fclose (stream))
317 fprintf (stderr, "error reading from '%s'\n", width_filename);
/* Line breaking classification.
   The values 0 .. 31 are all distinct.
   NOTE(review): where a value is used as a bit position (1 << LBP_xx),
   LBP_XX == 31 needs an unsigned operand (1u << LBP_XX), since shifting a
   signed int into its sign bit is undefined -- confirm at the use sites.  */
enum
{
  /* Values >= 24 are resolved at run time.  */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};
365 /* Returns the line breaking classification for ch, as a bit mask. */
367 get_lbp (unsigned int ch)
371 if (unicode_attributes[ch].name != NULL)
373 /* mandatory break */
374 if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
375 || ch == 0x000C /* form feed */
376 || ch == 0x000B /* line tabulation */
377 || ch == 0x2028 /* LINE SEPARATOR */
378 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
381 if (ch == 0x2060 /* WORD JOINER */
382 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
385 /* zero width space */
386 if (ch == 0x200B /* ZERO WIDTH SPACE */)
389 /* non-breaking (glue) */
390 if (ch == 0x00A0 /* NO-BREAK SPACE */
391 || ch == 0x202F /* NARROW NO-BREAK SPACE */
392 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
393 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
394 || ch == 0x2007 /* FIGURE SPACE */
395 || ch == 0x2011 /* NON-BREAKING HYPHEN */
396 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
397 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
398 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
399 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */)
403 if (ch == 0x0020 /* SPACE */)
406 /* break opportunity before and after */
407 if (ch == 0x2014 /* EM DASH */)
410 /* break opportunity after */
411 if (ch == 0x1680 /* OGHAM SPACE MARK */
412 || ch == 0x2000 /* EN QUAD */
413 || ch == 0x2001 /* EM QUAD */
414 || ch == 0x2002 /* EN SPACE */
415 || ch == 0x2003 /* EM SPACE */
416 || ch == 0x2004 /* THREE-PER-EM SPACE */
417 || ch == 0x2005 /* FOUR-PER-EM SPACE */
418 || ch == 0x2006 /* SIX-PER-EM SPACE */
419 || ch == 0x2008 /* PUNCTUATION SPACE */
420 || ch == 0x2009 /* THIN SPACE */
421 || ch == 0x200A /* HAIR SPACE */
422 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
423 || ch == 0x0009 /* tab */
424 || ch == 0x00AD /* SOFT HYPHEN */
425 || ch == 0x058A /* ARMENIAN HYPHEN */
426 || ch == 0x2010 /* HYPHEN */
427 || ch == 0x2012 /* FIGURE DASH */
428 || ch == 0x2013 /* EN DASH */
429 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
430 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
431 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
432 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
433 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
434 || ch == 0x2027 /* HYPHENATION POINT */
435 || ch == 0x007C /* VERTICAL LINE */
436 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
437 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
438 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
439 || ch == 0x2056 /* THREE DOT PUNCTUATION */
440 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
441 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
442 || ch == 0x205A /* TWO DOT PUNCTUATION */
443 || ch == 0x205B /* FOUR DOT MARK */
444 || ch == 0x205D /* TRICOLON */
445 || ch == 0x205E /* VERTICAL FOUR DOTS */
446 || ch == 0x2E19 /* PALM BRANCH */
447 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
448 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
449 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
450 || ch == 0x2E2D /* FIVE DOT PUNCTUATION */
451 || ch == 0x2E30 /* RING POINT */
452 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
453 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
454 || ch == 0x10102 /* AEGEAN CHECK MARK */
455 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
456 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
457 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
458 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
459 || ch == 0x0964 /* DEVANAGARI DANDA */
460 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
461 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
462 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
463 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
464 || ch == 0x104B /* MYANMAR SIGN SECTION */
465 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
466 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
467 || ch == 0x17D4 /* KHMER SIGN KHAN */
468 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
469 || ch == 0x1B5E /* BALINESE CARIK SIKI */
470 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
471 || ch == 0xA8CE /* SAURASHTRA DANDA */
472 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
473 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
474 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
475 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
476 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
477 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
478 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
479 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
480 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
481 || ch == 0x0FBE /* TIBETAN KU RU KHA */
482 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
483 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
485 || ch == 0x1802 /* MONGOLIAN COMMA */
486 || ch == 0x1803 /* MONGOLIAN FULL STOP */
488 || ch == 0x1804 /* MONGOLIAN COLON */
489 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
491 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
492 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
494 || ch == 0x1B5A /* BALINESE PANTI */
495 || ch == 0x1B5B /* BALINESE PAMADA */
496 || ch == 0x1B5C /* BALINESE WINDU */
497 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
498 || ch == 0x1B60 /* BALINESE PAMENENG */
499 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
500 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
501 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
502 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
503 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
504 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
505 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
507 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
509 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
510 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
511 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
513 || ch == 0x2CFE /* COPTIC FULL STOP */
515 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
516 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
517 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
518 || ch == 0xA60D /* VAI COMMA */
519 || ch == 0xA60F /* VAI QUESTION MARK */
520 || ch == 0xA92E /* KAYAH LI SIGN CWI */
521 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
522 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
523 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
524 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
525 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
526 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
527 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
528 /* Extra characters for compatibility with Unicode LineBreak.txt. */
530 || ch == 0x1A1E /* BUGINESE PALLAWA */
532 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
533 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
534 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
537 /* break opportunity before */
538 if (ch == 0x00B4 /* ACUTE ACCENT */
540 || ch == 0x1FFD /* GREEK OXIA */
541 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
543 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
544 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
545 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
546 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
547 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
548 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
549 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
550 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
551 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
552 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
553 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
554 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
555 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
556 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
557 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
558 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
562 if (ch == 0x002D /* HYPHEN-MINUS */)
565 /* contingent break opportunity */
566 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
569 /* closing punctuation */
570 if ((unicode_attributes[ch].category[0] == 'P'
571 && unicode_attributes[ch].category[1] == 'e')
572 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
573 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
574 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
575 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
576 || ch == 0xFE50 /* SMALL COMMA */
577 || ch == 0xFE52 /* SMALL FULL STOP */
578 || ch == 0xFF0C /* FULLWIDTH COMMA */
579 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
580 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
581 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */)
584 /* exclamation/interrogation */
585 if (ch == 0x0021 /* EXCLAMATION MARK */
586 || ch == 0x003F /* QUESTION MARK */
587 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
589 || ch == 0x060C /* ARABIC COMMA */
591 || ch == 0x061B /* ARABIC SEMICOLON */
592 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
593 || ch == 0x061F /* ARABIC QUESTION MARK */
595 || ch == 0x066A /* ARABIC PERCENT SIGN */
597 || ch == 0x06D4 /* ARABIC FULL STOP */
598 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
599 || ch == 0x0F0D /* TIBETAN MARK SHAD */
600 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
601 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
602 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
603 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
604 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
606 || ch == 0x1802 /* MONGOLIAN COMMA */
607 || ch == 0x1803 /* MONGOLIAN FULL STOP */
608 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
609 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
611 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
612 || ch == 0x1945 /* LIMBU QUESTION MARK */
613 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
614 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
616 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
617 || ch == 0x2CFE /* COPTIC FULL STOP */
619 || ch == 0x2E2E /* REVERSED QUESTION MARK */
620 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */
621 || ch == 0xA60E /* VAI FULL STOP */
622 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
623 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
624 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
625 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
626 || ch == 0xFE56 /* SMALL QUESTION MARK */
627 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
628 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
629 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
633 if (ch == 0x2024 /* ONE DOT LEADER */
634 || ch == 0x2025 /* TWO DOT LEADER */
635 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
636 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */)
640 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
641 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
642 || ch == 0x203D /* INTERROBANG */
643 || ch == 0x2047 /* DOUBLE QUESTION MARK */
644 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
645 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
646 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
647 || ch == 0x301C /* WAVE DASH */
648 || ch == 0x303C /* MASU MARK */
649 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
650 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
651 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
652 || ch == 0x309D /* HIRAGANA ITERATION MARK */
653 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
654 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
655 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
656 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
657 || ch == 0x30FD /* KATAKANA ITERATION MARK */
658 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
659 || ch == 0xA015 /* YI SYLLABLE WU */
660 || ch == 0xFE54 /* SMALL SEMICOLON */
661 || ch == 0xFE55 /* SMALL COLON */
662 || ch == 0xFF1A /* FULLWIDTH COLON */
663 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
664 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
665 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
666 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
667 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
668 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
669 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
672 /* opening punctuation */
673 if ((unicode_attributes[ch].category[0] == 'P'
674 && unicode_attributes[ch].category[1] == 's')
676 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
677 || ch == 0x00BF /* INVERTED QUESTION MARK */
679 || ch == 0x2E18 /* INVERTED INTERROBANG */)
682 /* ambiguous quotation */
683 if ((unicode_attributes[ch].category[0] == 'P'
684 && (unicode_attributes[ch].category[1] == 'f'
685 || unicode_attributes[ch].category[1] == 'i'))
686 || ch == 0x0022 /* QUOTATION MARK */
687 || ch == 0x0027 /* APOSTROPHE */
688 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
689 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
690 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
691 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
692 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
693 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
694 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
695 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
696 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
697 || ch == 0x2E0B /* RAISED SQUARE */)
700 /* infix separator (numeric) */
701 if (ch == 0x002C /* COMMA */
702 || ch == 0x002E /* FULL STOP */
703 || ch == 0x003A /* COLON */
704 || ch == 0x003B /* SEMICOLON */
705 || ch == 0x037E /* GREEK QUESTION MARK */
706 || ch == 0x0589 /* ARMENIAN FULL STOP */
708 || ch == 0x060C /* ARABIC COMMA */
710 || ch == 0x060D /* ARABIC DATE SEPARATOR */
711 || ch == 0x07F8 /* NKO COMMA */
712 || ch == 0x2044 /* FRACTION SLASH */
713 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
714 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
715 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */)
719 if ((unicode_attributes[ch].category[0] == 'N'
720 && unicode_attributes[ch].category[1] == 'd'
721 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
722 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
723 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */)
726 /* postfix (numeric) */
727 if (ch == 0x0025 /* PERCENT SIGN */
728 || ch == 0x00A2 /* CENT SIGN */
729 || ch == 0x00B0 /* DEGREE SIGN */
730 || ch == 0x060B /* AFGHANI SIGN */
732 || ch == 0x066A /* ARABIC PERCENT SIGN */
734 || ch == 0x2030 /* PER MILLE SIGN */
735 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
736 || ch == 0x2032 /* PRIME */
737 || ch == 0x2033 /* DOUBLE PRIME */
738 || ch == 0x2034 /* TRIPLE PRIME */
739 || ch == 0x2035 /* REVERSED PRIME */
740 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
741 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
742 || ch == 0x20A7 /* PESETA SIGN */
743 || ch == 0x2103 /* DEGREE CELSIUS */
744 || ch == 0x2109 /* DEGREE FAHRENHEIT */
745 || ch == 0xFDFC /* RIAL SIGN */
746 || ch == 0xFE6A /* SMALL PERCENT SIGN */
747 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
748 || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */)
751 /* prefix (numeric) */
752 if ((unicode_attributes[ch].category[0] == 'S'
753 && unicode_attributes[ch].category[1] == 'c')
754 || ch == 0x002B /* PLUS SIGN */
755 || ch == 0x005C /* REVERSE SOLIDUS */
756 || ch == 0x00B1 /* PLUS-MINUS SIGN */
757 || ch == 0x2116 /* NUMERO SIGN */
758 || ch == 0x2212 /* MINUS SIGN */
759 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
760 if (!(attr & (1 << LBP_PO)))
763 /* symbols allowing breaks */
764 if (ch == 0x002F /* SOLIDUS */)
767 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
770 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
773 if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F)
776 if (ch >= 0x1160 && ch <= 0x11A2)
779 if (ch >= 0x11A8 && ch <= 0x11F9)
782 /* complex context (South East Asian) */
783 if (((unicode_attributes[ch].category[0] == 'C'
784 && unicode_attributes[ch].category[1] == 'f')
785 || (unicode_attributes[ch].category[0] == 'L'
786 && (unicode_attributes[ch].category[1] == 'm'
787 || unicode_attributes[ch].category[1] == 'o'))
788 || (unicode_attributes[ch].category[0] == 'M'
789 && (unicode_attributes[ch].category[1] == 'c'
790 || unicode_attributes[ch].category[1] == 'n'))
791 /* Extra characters for compatibility with Unicode LineBreak.txt. */
792 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
793 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */)
794 && ((ch >= 0x0E00 && ch <= 0x0EFF)
795 || (ch >= 0x1000 && ch <= 0x109F)
796 || (ch >= 0x1780 && ch <= 0x17FF)
797 || (ch >= 0x1950 && ch <= 0x19DF)))
800 /* attached characters and combining marks */
801 if ((unicode_attributes[ch].category[0] == 'M'
802 && (unicode_attributes[ch].category[1] == 'c'
803 || unicode_attributes[ch].category[1] == 'e'
804 || unicode_attributes[ch].category[1] == 'n'))
805 || (unicode_attributes[ch].category[0] == 'C'
806 && (unicode_attributes[ch].category[1] == 'c'
807 || unicode_attributes[ch].category[1] == 'f')))
808 if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW))))
812 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
813 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
814 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
815 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
816 || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */
817 || (ch >= 0x4E00 && ch <= 0x9FBB) /* CJK Ideograph */
818 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
819 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
820 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
821 || ch == 0xFE62 /* SMALL PLUS SIGN */
822 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
823 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
824 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
825 || ch == 0xFE66 /* SMALL EQUALS SIGN */
826 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
827 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
828 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
829 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
830 || (ch >= 0x3000 && ch <= 0x33FF
831 && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
832 /* Extra characters for compatibility with Unicode LineBreak.txt. */
833 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
834 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
835 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
836 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
837 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
838 || ch == 0xFE45 /* SESAME DOT */
839 || ch == 0xFE46 /* WHITE SESAME DOT */
840 || ch == 0xFE49 /* DASHED OVERLINE */
841 || ch == 0xFE4A /* CENTRELINE OVERLINE */
842 || ch == 0xFE4B /* WAVY OVERLINE */
843 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
844 || ch == 0xFE4D /* DASHED LOW LINE */
845 || ch == 0xFE4E /* CENTRELINE LOW LINE */
846 || ch == 0xFE4F /* WAVY LOW LINE */
847 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
848 || ch == 0xFE58 /* SMALL EM DASH */
849 || ch == 0xFE5F /* SMALL NUMBER SIGN */
850 || ch == 0xFE60 /* SMALL AMPERSAND */
851 || ch == 0xFE61 /* SMALL ASTERISK */
852 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
853 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
854 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
855 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
856 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
857 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
858 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
859 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
860 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
861 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
862 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
863 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
864 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
865 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
866 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
867 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
868 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
869 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
870 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
871 || ch == 0xFF5E /* FULLWIDTH TILDE */
872 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
873 || ch == 0xFFE3 /* FULLWIDTH MACRON */
874 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */)
875 if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM))))
877 /* ambiguous (ideograph) ? */
878 if ((unicode_width[ch] != NULL
879 && unicode_width[ch][0] == 'A'
881 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
882 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */)
888 /* ordinary alphabetic and symbol characters */
889 if ((unicode_attributes[ch].category[0] == 'L'
890 && (unicode_attributes[ch].category[1] == 'u'
891 || unicode_attributes[ch].category[1] == 'l'
892 || unicode_attributes[ch].category[1] == 't'
893 || unicode_attributes[ch].category[1] == 'm'
894 || unicode_attributes[ch].category[1] == 'o'))
895 || (unicode_attributes[ch].category[0] == 'S'
896 && (unicode_attributes[ch].category[1] == 'm'
897 || unicode_attributes[ch].category[1] == 'k'
898 || unicode_attributes[ch].category[1] == 'o'))
899 || (unicode_attributes[ch].category[0] == 'N'
900 && (unicode_attributes[ch].category[1] == 'l'
901 || unicode_attributes[ch].category[1] == 'o'))
902 || (unicode_attributes[ch].category[0] == 'P'
903 && (unicode_attributes[ch].category[1] == 'c'
904 || unicode_attributes[ch].category[1] == 'd'
905 || unicode_attributes[ch].category[1] == 'o'))
906 || ch == 0x0600 /* ARABIC NUMBER SIGN */
907 || ch == 0x0601 /* ARABIC SIGN SANAH */
908 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
909 || ch == 0x0603 /* ARABIC SIGN SAFHA */
910 || ch == 0x06DD /* ARABIC END OF AYAH */
911 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
912 || ch == 0x2061 /* FUNCTION APPLICATION */
913 || ch == 0x2062 /* INVISIBLE TIMES */
914 || ch == 0x2063 /* INVISIBLE SEPARATOR */
915 || ch == 0x2064 /* INVISIBLE PLUS */)
916 if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID))))
918 /* ambiguous (alphabetic) ? */
919 if ((unicode_width[ch] != NULL
920 && unicode_width[ch][0] == 'A'
922 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
923 && ch != 0x2022 /* BULLET */
924 && ch != 0x203E /* OVERLINE */
925 && ch != 0x2126 /* OHM SIGN */
926 && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */
927 && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */
928 && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
929 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
930 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
931 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
932 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
933 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
935 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
936 || ch == 0x00A7 /* SECTION SIGN */
937 || ch == 0x00A8 /* DIAERESIS */
938 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
939 || ch == 0x00B2 /* SUPERSCRIPT TWO */
940 || ch == 0x00B3 /* SUPERSCRIPT THREE */
941 || ch == 0x00B6 /* PILCROW SIGN */
942 || ch == 0x00B7 /* MIDDLE DOT */
943 || ch == 0x00B8 /* CEDILLA */
944 || ch == 0x00B9 /* SUPERSCRIPT ONE */
945 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
946 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
947 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
948 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
949 || ch == 0x00BF /* INVERTED QUESTION MARK */
950 || ch == 0x00D7 /* MULTIPLICATION SIGN */
951 || ch == 0x00F7 /* DIVISION SIGN */
952 || ch == 0x02C7 /* CARON */
953 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
954 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
955 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
956 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
957 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
958 || ch == 0x02D8 /* BREVE */
959 || ch == 0x02D9 /* DOT ABOVE */
960 || ch == 0x02DA /* RING ABOVE */
961 || ch == 0x02DB /* OGONEK */
962 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
964 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
965 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
966 /* Extra characters for compatibility with Unicode LineBreak.txt. */
967 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
968 || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */
969 || ch == 0x2616 /* WHITE SHOGI PIECE */
970 || ch == 0x2617 /* BLACK SHOGI PIECE */)
974 attr &= ~(1 << LBP_CM);
985 /* Output the line breaking properties in a human readable format. */
/* Dumps the get_lbp () result of the code points to STREAM.  An entry
   consists of the code point formatted with "0x%04X", then " LBP_xx" for
   every bit that is set in its attribute mask, then a newline.  The
   values reported here are the ones computed by this program, as opposed
   to the ones read from LineBreak.txt.  */
987 debug_output_lbp (FILE *stream)
/* Visit every code point in the range 0 .. 0x10FFFF.  */
991 for (i = 0; i < 0x110000; i++)
/* attr is a bit mask: class LBP_xx is represented by bit (1 << LBP_xx).  */
993 int attr = get_lbp (i);
/* Code points whose mask is exactly the lone LBP_XX bit fail this test.  */
994 if (attr != 1 << LBP_XX)
996 fprintf (stream, "0x%04X", i);
/* PRINT_BIT appends " <enumerator name>" when that bit is set in attr;
   #bit stringizes the enumerator, so the output reads e.g. " LBP_BK".
   Several names can be printed for one code point if several bits are
   set.  */
997 #define PRINT_BIT(attr,bit) \
998 if (attr & (1 << bit)) fprintf (stream, " " #bit);
999 PRINT_BIT(attr,LBP_BK);
1000 PRINT_BIT(attr,LBP_CM);
1001 PRINT_BIT(attr,LBP_WJ);
1002 PRINT_BIT(attr,LBP_ZW);
1003 PRINT_BIT(attr,LBP_GL);
1004 PRINT_BIT(attr,LBP_SP);
1005 PRINT_BIT(attr,LBP_B2);
1006 PRINT_BIT(attr,LBP_BA);
1007 PRINT_BIT(attr,LBP_BB);
1008 PRINT_BIT(attr,LBP_HY);
1009 PRINT_BIT(attr,LBP_CB);
1010 PRINT_BIT(attr,LBP_CL);
1011 PRINT_BIT(attr,LBP_EX);
1012 PRINT_BIT(attr,LBP_IN);
1013 PRINT_BIT(attr,LBP_NS);
1014 PRINT_BIT(attr,LBP_OP);
1015 PRINT_BIT(attr,LBP_QU);
1016 PRINT_BIT(attr,LBP_IS);
1017 PRINT_BIT(attr,LBP_NU);
1018 PRINT_BIT(attr,LBP_PO);
1019 PRINT_BIT(attr,LBP_PR);
1020 PRINT_BIT(attr,LBP_SY);
1021 PRINT_BIT(attr,LBP_AI);
1022 PRINT_BIT(attr,LBP_AL);
1023 PRINT_BIT(attr,LBP_H2);
1024 PRINT_BIT(attr,LBP_H3);
1025 PRINT_BIT(attr,LBP_ID);
1026 PRINT_BIT(attr,LBP_JL);
1027 PRINT_BIT(attr,LBP_JV);
1028 PRINT_BIT(attr,LBP_JT);
1029 PRINT_BIT(attr,LBP_SA);
1030 PRINT_BIT(attr,LBP_XX);
/* Terminate the entry.  */
1032 fprintf (stream, "\n");
/* Writes the computed line breaking properties to the file FILENAME.
   The file is opened for writing (truncating any previous contents),
   filled by debug_output_lbp, and closed.  Failures are reported on
   stderr.  */
1038 debug_output_tables (const char *filename)
1042 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen.  */
1045 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1049 debug_output_lbp (stream);
/* A pending stream error and a failing fclose are both reported as a
   write error.  Because of the short-circuit ||, fclose is not called
   when ferror already reports a failure.  */
1051 if (ferror (stream) || fclose (stream))
1053 fprintf (stderr, "error writing to '%s'\n", filename);
1058 /* The line breaking property from the LineBreak.txt file. */
/* One entry per code point 0 .. 0x10FFFF.  Unlike the masks returned by
   get_lbp, an entry holds a single LBP_* value (it is compared with ==
   against the enumerators when dumped).  fill_org_lbp resets every entry
   to LBP_XX before it parses the file.  */
1059 int unicode_org_lbp[0x110000];
1061 /* Stores in unicode_org_lbp[] the line breaking property from the
1062 LineBreak.txt file. */
/* LINEBREAK_FILENAME is opened for reading.  Each data line is split
   into three fields: field0 up to ';', field1 up to ' ', field2 up to the
   end of the line.  field0 is a hexadecimal code point or a range
   "first..last"; field1 is the property abbreviation.  Problems (open
   failure, short line, unknown property value, read error) are reported
   on stderr.  */
1064 fill_org_lbp (const char *linebreak_filename)
/* field0: code point or code point range, in hexadecimal.  */
1068 char field0[FIELDLEN];
/* field1: property value abbreviation, e.g. "LF".  */
1069 char field1[FIELDLEN];
/* field2: remainder of the line; only read, not interpreted, in the
   statements shown here.  */
1070 char field2[FIELDLEN];
/* Default for code points that the file does not mention.  */
1073 for (i = 0; i < 0x110000; i++)
1074 unicode_org_lbp[i] = LBP_XX;
1076 stream = fopen (linebreak_filename, "r");
1079 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
/* Consume input up to and including the next newline (or up to EOF),
   i.e. discard the rest of the current line.  */
1095 do c = getc (stream); while (c != EOF && c != '\n');
/* n accumulates the getfield results of the three fields; presumably it
   is the number of fields successfully read -- TODO confirm against
   getfield.  */
1099 n = getfield (stream, field0, ';');
1100 n += getfield (stream, field1, ' ');
1101 n += getfield (stream, field2, '\n');
1106 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
/* TRY(LBP_xx) continues an if/else-if chain.  #bit is the stringized
   enumerator name and "+ 4" skips its first four characters (the "LBP_"
   prefix), so field1 is compared against the bare abbreviation "xx".  */
1110 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property values handled explicitly instead of through TRY: LF, CR and
   NL are folded into LBP_BK, SG is folded into LBP_XX.  */
1145 else if (strcmp (field1, "LF") == 0) value = LBP_BK;
1146 else if (strcmp (field1, "CR") == 0) value = LBP_BK;
1147 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
1148 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
1151 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
1152 field1, linebreak_filename, lineno);
/* First (or only) code point of the entry.
   NOTE(review): strtoul is called with a NULL end pointer and its result
   is not range-checked before being used as an index into
   unicode_org_lbp[] (size 0x110000); this relies on a well-formed
   LineBreak.txt -- confirm that is acceptable for this generator.  */
1155 i = strtoul (field0, NULL, 16);
1156 if (strstr (field0, "..") != NULL)
1158 /* Deal with a range. */
/* j is the number parsed right after the ".." separator; presumably the
   statement below is executed for every code point from i through j (the
   loop header is not visible here).  */
1159 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
1161 unicode_org_lbp[i] = value;
1165 /* Single character line. */
1166 unicode_org_lbp[i] = value;
/* A pending stream error and a failing fclose are both reported as a
   read error; fclose is skipped when ferror is already set.  */
1169 if (ferror (stream) || fclose (stream))
1171 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
1176 /* Output the line breaking properties in a human readable format. */
/* Counterpart of debug_output_lbp for the reference data: dumps the
   contents of unicode_org_lbp[] (the values taken from LineBreak.txt) to
   STREAM.  An entry consists of the code point formatted with "0x%04X",
   the name of its LBP_* value, and a newline.  */
1178 debug_output_org_lbp (FILE *stream)
/* Visit every code point in the range 0 .. 0x10FFFF.  */
1182 for (i = 0; i < 0x110000; i++)
/* Here attr is a single LBP_* value, not a bit mask.  */
1184 int attr = unicode_org_lbp[i];
1187 fprintf (stream, "0x%04X", i);
/* Because attr is a plain value, PRINT_BIT tests for equality; at most
   one of the invocations below prints a name.  #bit stringizes the
   enumerator, giving e.g. " LBP_BK".  */
1188 #define PRINT_BIT(attr,bit) \
1189 if (attr == bit) fprintf (stream, " " #bit);
1190 PRINT_BIT(attr,LBP_BK);
1191 PRINT_BIT(attr,LBP_CM);
1192 PRINT_BIT(attr,LBP_WJ);
1193 PRINT_BIT(attr,LBP_ZW);
1194 PRINT_BIT(attr,LBP_GL);
1195 PRINT_BIT(attr,LBP_SP);
1196 PRINT_BIT(attr,LBP_B2);
1197 PRINT_BIT(attr,LBP_BA);
1198 PRINT_BIT(attr,LBP_BB);
1199 PRINT_BIT(attr,LBP_HY);
1200 PRINT_BIT(attr,LBP_CB);
1201 PRINT_BIT(attr,LBP_CL);
1202 PRINT_BIT(attr,LBP_EX);
1203 PRINT_BIT(attr,LBP_IN);
1204 PRINT_BIT(attr,LBP_NS);
1205 PRINT_BIT(attr,LBP_OP);
1206 PRINT_BIT(attr,LBP_QU);
1207 PRINT_BIT(attr,LBP_IS);
1208 PRINT_BIT(attr,LBP_NU);
1209 PRINT_BIT(attr,LBP_PO);
1210 PRINT_BIT(attr,LBP_PR);
1211 PRINT_BIT(attr,LBP_SY);
1212 PRINT_BIT(attr,LBP_AI);
1213 PRINT_BIT(attr,LBP_AL);
1214 PRINT_BIT(attr,LBP_H2);
1215 PRINT_BIT(attr,LBP_H3);
1216 PRINT_BIT(attr,LBP_ID);
1217 PRINT_BIT(attr,LBP_JL);
1218 PRINT_BIT(attr,LBP_JV);
1219 PRINT_BIT(attr,LBP_JT);
1220 PRINT_BIT(attr,LBP_SA);
1221 PRINT_BIT(attr,LBP_XX);
/* Terminate the entry.  */
1223 fprintf (stream, "\n");
/* Writes the line breaking properties taken from LineBreak.txt (the
   contents of unicode_org_lbp[]) to the file FILENAME.  The file is
   opened for writing, filled by debug_output_org_lbp, and closed.
   Failures are reported on stderr.  */
1229 debug_output_org_tables (const char *filename)
1233 stream = fopen (filename, "w");
/* Diagnostic for a failed fopen.  */
1236 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1240 debug_output_org_lbp (stream);
/* A pending stream error and a failing fclose are both reported as a
   write error.  Because of the short-circuit ||, fclose is not called
   when ferror already reports a failure.  */
1242 if (ferror (stream) || fclose (stream))
1244 fprintf (stderr, "error writing to '%s'\n", filename);
1249 /* Construction of sparse 3-level tables. */
/* The macros below look like the parameters of a generic table template:
   presumably they are consumed by a header included right after them,
   which then provides struct lbp_table and the lbp_table_init,
   lbp_table_add and lbp_table_finalize functions used by output_lbp --
   the #include is not visible here, TODO confirm.  */
/* Name of the table type, and prefix of the generated function names.  */
1250 #define TABLE lbp_table
/* Type of one table element; every LBP_* value must fit in it.  */
1251 #define ELEMENT unsigned char
/* Value of the entries that are never explicitly added.  */
1252 #define DEFAULT LBP_XX
/* Map the checked allocators to the plain C library ones.  Note that
   malloc and realloc return NULL on failure instead of aborting.  */
1253 #define xmalloc malloc
1254 #define xrealloc realloc
/* Builds the sparse 3-level table of the computed line breaking
   properties and writes it out as C source code.
   STREAM1 receives the lbrkprop_header_0 .. lbrkprop_header_4 defines,
   the lbrkprop_t typedef and the extern declaration of unilbrkprop.
   STREAM2 receives the definition (initializer) of unilbrkprop.  */
1258 output_lbp (FILE *stream1, FILE *stream2)
/* Offsets of the three levels, relative to the start of t.result.  */
1262 unsigned int level1_offset, level2_offset, level3_offset;
1266 lbp_table_init (&t);
/* Fill the table with the property of every code point 0 .. 0x10FFFF.  */
1268 for (i = 0; i < 0x110000; i++)
1270 int attr = get_lbp (i);
1272 /* Now attr should contain exactly one bit. */
/* attr & (attr - 1) clears the lowest set bit, so it is nonzero exactly
   when more than one bit is set.  */
1273 if (attr == 0 || ((attr & (attr - 1)) != 0))
/* LBP_XX is the table default and therefore does not need to be added.  */
1276 if (attr != 1 << LBP_XX)
1278 unsigned int log2_attr;
/* Loop with an empty body: converts the single-bit mask into its bit
   index by counting the right shifts needed to reach 1.  */
1279 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
/* The table stores the bit index (the LBP_* value), not the mask.  */
1281 lbp_table_add (&t, i, log2_attr);
1285 lbp_table_finalize (&t);
/* Layout of t.result as used below: a header of 5 uint32_t words, then
   level 1 (t.level1_size words), then level 2 (t.level2_size << t.q
   words), then level 3 (bytes).  The three expressions that follow are
   the right-hand sides of the offset computations; their assignment
   targets are not visible here, presumably level1_offset, level2_offset
   and level3_offset in this order.  */
1288 5 * sizeof (uint32_t);
1290 5 * sizeof (uint32_t)
1291 + t.level1_size * sizeof (uint32_t);
1293 5 * sizeof (uint32_t)
1294 + t.level1_size * sizeof (uint32_t)
1295 + (t.level2_size << t.q) * sizeof (uint32_t);
/* Publish the 5 header words as preprocessor constants.
   NOTE(review): a uint32_t value is printed with %d here; %u (or PRIu32)
   would match the argument type -- confirm.  */
1297 for (i = 0; i < 5; i++)
1298 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
1299 ((uint32_t *) t.result)[i]);
1300 fprintf (stream1, "\n");
/* Declaration of the table type: level1 and level2 hold indices into the
   next level, level3 holds the property values.  */
1301 fprintf (stream1, "typedef struct\n");
1302 fprintf (stream1, " {\n");
1303 fprintf (stream1, " int level1[%d];\n", t.level1_size);
1304 fprintf (stream1, " int level2[%d << %d];\n", t.level2_size, t.q);
1305 fprintf (stream1, " unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1306 fprintf (stream1, " }\n");
1307 fprintf (stream1, "lbrkprop_t;\n");
1308 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
/* Definition of the table, level by level, 8 initializers per line.  */
1310 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
1311 fprintf (stream2, "{\n");
1312 fprintf (stream2, " {");
1313 if (t.level1_size > 8)
1314 fprintf (stream2, "\n ");
/* Level 1.  */
1315 for (i = 0; i < t.level1_size; i++)
1318 if (i > 0 && (i % 8) == 0)
1319 fprintf (stream2, "\n ");
/* A stored offset of 0 marks an absent block and is emitted as -1;
   otherwise the offset is rebased to the start of level 2 and converted
   into an index in units of uint32_t.
   NOTE(review): because of the division by sizeof, the conditional
   expression below has type size_t (the -1 gets converted too), yet it is
   printed with %5d.  That is a format/argument mismatch wherever size_t
   is wider than int; printing the two cases separately, or using %zu for
   the quotient, would avoid it -- confirm.  */
1320 offset = ((uint32_t *) (t.result + level1_offset))[i];
1321 fprintf (stream2, " %5d%s",
1322 offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
1323 (i+1 < t.level1_size ? "," : ""));
1325 if (t.level1_size > 8)
1326 fprintf (stream2, "\n ");
1327 fprintf (stream2, " },\n");
1328 fprintf (stream2, " {");
1329 if (t.level2_size << t.q > 8)
1330 fprintf (stream2, "\n ");
/* Level 2.  */
1331 for (i = 0; i < t.level2_size << t.q; i++)
1334 if (i > 0 && (i % 8) == 0)
1335 fprintf (stream2, "\n ");
/* Same encoding as for level 1, but rebased to the start of level 3 and
   counted in bytes.  The NOTE(review) about %5d versus a size_t argument
   applies here as well.  */
1336 offset = ((uint32_t *) (t.result + level2_offset))[i];
1337 fprintf (stream2, " %5d%s",
1338 offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
1339 (i+1 < t.level2_size << t.q ? "," : ""));
1341 if (t.level2_size << t.q > 8)
1342 fprintf (stream2, "\n ");
1343 fprintf (stream2, " },\n");
1344 fprintf (stream2, " {");
1345 if (t.level3_size << t.p > 8)
1346 fprintf (stream2, "\n ");
/* Level 3: the property values themselves, emitted symbolically.  */
1347 for (i = 0; i < t.level3_size << t.p; i++)
1349 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
1350 const char *value_string;
/* CASE(LBP_xx) maps a stored value to the spelling of its enumerator, so
   that the generated file contains names instead of numbers.  The list
   of CASE invocations is not visible here.  */
1353 #define CASE(x) case x: value_string = #x; break;
1390 if (i > 0 && (i % 8) == 0)
1391 fprintf (stream2, "\n ");
/* Every initializer except the last one is followed by a comma.  */
1392 fprintf (stream2, " %s%s", value_string,
1393 (i+1 < t.level3_size << t.p ? "," : ""));
1395 if (t.level3_size << t.p > 8)
1396 fprintf (stream2, "\n ");
1397 fprintf (stream2, " }\n");
1398 fprintf (stream2, "};\n");
/* Generates the two output files of the program.
   FILENAME1 and FILENAME2 are opened for writing; both receive the same
   "generated automatically" banner and license header.  Then output_lbp
   writes the declarations to the first file and the table data to the
   second one.  Open and write failures are reported on stderr.
   VERSION is the Unicode version string; presumably it fills the %s of
   the "Generated automatically by gen-lbrk for Unicode" line (the
   argument line is not visible here).  */
1402 output_tables (const char *filename1, const char *filename2, const char *version)
1404 const char *filenames[2];
1408 filenames[0] = filename1;
1409 filenames[1] = filename2;
/* Open both files before anything is written.  */
1411 for (i = 0; i < 2; i++)
1413 streams[i] = fopen (filenames[i], "w");
1414 if (streams[i] == NULL)
1416 fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
/* Write the identical preamble to both files.  */
1421 for (i = 0; i < 2; i++)
1423 FILE *stream = streams[i];
1425 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1426 fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
1427 fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n",
1429 fprintf (stream, "\n");
1431 /* Put a GPL header on it. The gnulib module is under LGPL (although it
1432 still carries the GPL header), and it's gnulib-tool which replaces the
1433 GPL header with an LGPL header. */
1434 fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n");
1435 fprintf (stream, "\n");
1436 fprintf (stream, " This program is free software: you can redistribute it and/or modify\n");
1437 fprintf (stream, " it under the terms of the GNU General Public License as published by\n");
1438 fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n");
1439 fprintf (stream, " (at your option) any later version.\n");
1440 fprintf (stream, "\n");
1441 fprintf (stream, " This program is distributed in the hope that it will be useful,\n");
1442 fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
1443 fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
1444 fprintf (stream, " GNU General Public License for more details.\n");
1445 fprintf (stream, "\n");
1446 fprintf (stream, " You should have received a copy of the GNU General Public License\n");
1447 fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n");
1448 fprintf (stream, "\n");
/* streams[0] gets the defines, the typedef and the extern declaration;
   streams[1] gets the initialized table.  */
1451 output_lbp (streams[0], streams[1]);
/* Check for write errors and close both files; fclose is skipped for a
   stream on which ferror is already set.  */
1453 for (i = 0; i < 2; i++)
1455 if (ferror (streams[i]) || fclose (streams[i]))
1457 fprintf (stderr, "error writing to '%s'\n", filenames[i]);
/* Program entry point.  Expected arguments, as listed in the usage
   message:
     argv[1]  UnicodeData.txt
     argv[2]  EastAsianWidth.txt
     argv[3]  LineBreak.txt
     argv[4]  Unicode version string, passed on to output_tables.
   Produces lbrkprop.txt and lbrkprop_org.txt (human readable dumps) and
   lbrkprop1.h and lbrkprop2.h (generated C tables), all named relative
   to the current directory.  */
1464 main (int argc, char * argv[])
1468 fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt LineBreak.txt version\n",
/* Load the three input files.  */
1473 fill_attributes (argv[1]);
1474 fill_width (argv[2]);
1475 fill_org_lbp (argv[3]);
/* Dump the computed properties and the LineBreak.txt properties into
   separate files.  */
1477 debug_output_tables ("lbrkprop.txt");
1478 debug_output_org_tables ("lbrkprop_org.txt");
/* Generate the C tables.  */
1480 output_tables ("lbrkprop1.h", "lbrkprop2.h", argv[4]);
1486 * For Emacs M-x compile
1488 * compile-command: "
1489 gcc -O -Wall -I../unictype gen-lbrk.c -o gen-lbrk && \
1491 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/UnicodeData.txt \
1492 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/EastAsianWidth.txt \
1493 /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.0.0/ucd/LineBreak.txt \