1 /* Normalization forms (composition and decomposition) of Unicode strings.
2 Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2009.
5 This program is free software: you can redistribute it and/or modify it
6 under the terms of the GNU Lesser General Public License as published
7 by the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
32 /* Conventions:
34 All functions prefixed with u8_ operate on UTF-8 encoded strings.
35 Their unit is a uint8_t (1 byte).
37 All functions prefixed with u16_ operate on UTF-16 encoded strings.
38 Their unit is a uint16_t (a 2-byte word).
40 All functions prefixed with u32_ operate on UCS-4 encoded strings.
41 Their unit is a uint32_t (a 4-byte word).
43 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
44 n units.
46 Functions returning a string result take a (resultbuf, lengthp) argument
47 pair. If resultbuf is not NULL and the result fits into *lengthp units,
48 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
49 allocated string is returned. In both cases, *lengthp is set to the
50 length (number of units) of the returned string. In case of error,
51 NULL is returned and errno is set. */
/* Decomposition type constants. Where a comment starts with a name in
angle brackets, that is the tag the constant stands for;
UC_DECOMP_CANONICAL is listed without one.
NOTE(review): presumably these are the values stored through DECOMP_TAG
by uc_decomposition -- confirm. */
56 UC_DECOMP_CANONICAL,/* Canonical decomposition. */
57 UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
58 UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
59 UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
60 UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
61 UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
62 UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
63 UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
64 UC_DECOMP_SUPER, /* <super> A superscript form. */
65 UC_DECOMP_SUB, /* <sub> A subscript form. */
66 UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
67 UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
68 UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
69 UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
70 UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
71 UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
72 UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
75 /* Maximum size of decomposition of a single Unicode character, counted
in ucs4_t elements. The DECOMPOSITION arrays handed to uc_decomposition
and uc_canonical_decomposition must provide at least this many elements. */
76 #define UC_DECOMPOSITION_MAX_LENGTH 32
78 /* Return the character decomposition mapping of a Unicode character.
UC is the character whose mapping is requested.
79 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
80 ucs4_t elements.
81 When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
82 filled and N is returned. Otherwise -1 is returned.
NOTE(review): *DECOMP_TAG presumably receives one of the UC_DECOMP_*
constants -- confirm against the implementation. */
84 uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
86 /* Return the canonical character decomposition mapping of a Unicode character.
UC is the character whose mapping is requested.
87 DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
88 ucs4_t elements.
89 When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
90 returned. Otherwise -1 is returned. */
92 uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
95 /* Attempt to combine the Unicode characters uc1, uc2.
96 uc1 is known to have canonical combining class 0.
97 Return the combination of uc1 and uc2, if it exists.
NOTE(review): the result for a pair that has no combination is not stated
here (presumably 0) -- confirm against the implementation.
99 Not all decompositions can be recombined using this function. See the
100 Unicode file CompositionExclusions.txt for details. */
102 uc_composition (ucs4_t uc1, ucs4_t uc2);
105 /* An object of type uninorm_t denotes a Unicode normalization form.
The structure is only forward-declared at this point, and uninorm_t is a
pointer to a const instance of it. */
106 struct unicode_normalization_form;
107 typedef const struct unicode_normalization_form *uninorm_t;
109 /* UNINORM_NFD: Normalization form D: canonical decomposition.
The macro expands to the address of the object uninorm_nfd, which is only
declared (extern) here; the resulting pointer has the type uninorm_t. */
110 extern const struct unicode_normalization_form uninorm_nfd;
111 #define UNINORM_NFD (&uninorm_nfd)
113 /* UNINORM_NFC: Normalization form C: canonical decomposition, then
114 canonical composition.
The macro expands to the address of the object uninorm_nfc, which is only
declared (extern) here; the resulting pointer has the type uninorm_t. */
115 extern const struct unicode_normalization_form uninorm_nfc;
116 #define UNINORM_NFC (&uninorm_nfc)
118 /* UNINORM_NFKD: Normalization form KD: compatibility decomposition.
The macro expands to the address of the object uninorm_nfkd, which is only
declared (extern) here; the resulting pointer has the type uninorm_t. */
119 extern const struct unicode_normalization_form uninorm_nfkd;
120 #define UNINORM_NFKD (&uninorm_nfkd)
122 /* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
123 canonical composition.
The macro expands to the address of the object uninorm_nfkc, which is only
declared (extern) here; the resulting pointer has the type uninorm_t. */
124 extern const struct unicode_normalization_form uninorm_nfkc;
125 #define UNINORM_NFKC (&uninorm_nfkc)
127 /* Test whether a normalization form does compatibility decomposition.
NF is converted to a pointer to const unsigned int, the unsigned int found
there is read, and its bit 0 is the result (0 or 1). NF appears once in
the expansion.
NOTE(review): this presumably relies on the structure starting with an
unsigned int flags word -- confirm against the structure definition. */
128 #define uninorm_is_compat_decomposing(nf) \
129 ((* (const unsigned int *) (nf) >> 0) & 1)
131 /* Test whether a normalization form includes canonical composition.
NF is converted to a pointer to const unsigned int, the unsigned int found
there is read, and its bit 1 is the result (0 or 1). NF appears once in
the expansion.
NOTE(review): this presumably relies on the structure starting with an
unsigned int flags word -- confirm against the structure definition. */
132 #define uninorm_is_composing(nf) \
133 ((* (const unsigned int *) (nf) >> 1) & 1)
136 /* Return the specified normalization form of a string.
NF selects the normalization form, (S, N) is the input string, and
(RESULTBUF, LENGTHP) is the result pair described in the conventions
comment earlier in this header. The three functions take the same
arguments and differ only in the unit type of the strings: uint8_t,
uint16_t and uint32_t. */
138 u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
139 uint8_t *resultbuf, size_t *lengthp);
141 u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
142 uint16_t *resultbuf, size_t *lengthp);
144 u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
145 uint32_t *resultbuf, size_t *lengthp);
153 #endif /* _UNINORM_H */