1 /* Elementary Unicode string functions.
2 Copyright (C) 2001-2002, 2005-2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU Lesser General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
35 /* All functions prefixed with u8_ operate on UTF-8 encoded strings.
36 Their unit is a uint8_t (1 byte).
38 All functions prefixed with u16_ operate on UTF-16 encoded strings.
39 Their unit is a uint16_t (a 2-byte word).
41 All functions prefixed with u32_ operate on UCS-4 encoded strings.
42 Their unit is a uint32_t (a 4-byte word).
44 All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly n units.
47 All arguments starting with "str" and the arguments of functions starting
48 with u8_str/u16_str/u32_str denote a NUL terminated string, i.e. a string
49 which terminates at the first NUL unit. This termination unit is
50 considered part of the string for all memory allocation purposes, but
51 is not considered part of the string for all other logical purposes.
53 Functions returning a string result take a (resultbuf, lengthp) argument
54 pair. If resultbuf is not NULL and the result fits into *lengthp units,
55 it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
56 allocated string is returned. In both cases, *lengthp is set to the
57 length (number of units) of the returned string. In case of error,
58 NULL is returned and errno is set. */
61 /* Elementary string checks. */
63 /* Check whether the UTF-8 string s[0..n-1] (N units of uint8_t) is well-formed.
64 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
65 extern const uint8_t *
66 u8_check (const uint8_t *s, size_t n);
68 /* Check whether the UTF-16 string s[0..n-1] (N units of uint16_t) is well-formed.
69 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
70 extern const uint16_t *
71 u16_check (const uint16_t *s, size_t n);
73 /* Check whether the UCS-4 string s[0..n-1] (N units of uint32_t) is well-formed.
74 Return NULL if valid, or a pointer to the first invalid unit otherwise. */
75 extern const uint32_t *
76 u32_check (const uint32_t *s, size_t n);
79 /* Elementary string conversions. */
81 /* Convert a UTF-8 string to a UTF-16 string. */
83 u8_to_u16 (const uint8_t *s, size_t n, uint16_t *resultbuf,
86 /* Convert a UTF-8 string to a UCS-4 string. */
88 u8_to_u32 (const uint8_t *s, size_t n, uint32_t *resultbuf,
91 /* Convert a UTF-16 string to a UTF-8 string. */
93 u16_to_u8 (const uint16_t *s, size_t n, uint8_t *resultbuf,
96 /* Convert a UTF-16 string to a UCS-4 string. */
98 u16_to_u32 (const uint16_t *s, size_t n, uint32_t *resultbuf,
101 /* Convert a UCS-4 string to a UTF-8 string. */
103 u32_to_u8 (const uint32_t *s, size_t n, uint8_t *resultbuf,
106 /* Convert a UCS-4 string to a UTF-16 string. */
108 u32_to_u16 (const uint32_t *s, size_t n, uint16_t *resultbuf,
112 /* Elementary string functions. */
114 /* Return the length (number of units) of the first character in S, which is
115 no longer than N. Return 0 if it is the NUL character. Return -1 upon failure. */
117 /* Similar to mblen(), except that s must not be NULL. */
119 u8_mblen (const uint8_t *s, size_t n);
121 u16_mblen (const uint16_t *s, size_t n);
123 u32_mblen (const uint32_t *s, size_t n);
125 /* Return the length (number of units) of the first character in S, putting
126 its 'ucs4_t' representation in *PUC. Upon failure, *PUC is set to 0xfffd,
127 and an appropriate number of units is returned.
128 The number of available units, N, must be > 0. */
129 /* Similar to mbtowc(), except that puc and s must not be NULL, n must be > 0,
130 and the NUL character is not treated specially. */
131 /* The variants with _safe suffix are safe, even if the library is compiled
132 without --enable-safety. */
134 #ifdef GNULIB_UNISTR_U8_MBTOUC_UNSAFE
137 u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n);
140 u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n);
142 u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n)
152 return u8_mbtouc_unsafe_aux (puc, s, n);
157 #ifdef GNULIB_UNISTR_U16_MBTOUC_UNSAFE
160 u16_mbtouc_unsafe (ucs4_t *puc, const uint16_t *s, size_t n);
163 u16_mbtouc_unsafe_aux (ucs4_t *puc, const uint16_t *s, size_t n);
165 u16_mbtouc_unsafe (ucs4_t *puc, const uint16_t *s, size_t n)
169 if (c < 0xd800 || c >= 0xe000)
175 return u16_mbtouc_unsafe_aux (puc, s, n);
180 #ifdef GNULIB_UNISTR_U32_MBTOUC_UNSAFE
183 u32_mbtouc_unsafe (ucs4_t *puc, const uint32_t *s, size_t n);
186 u32_mbtouc_unsafe (ucs4_t *puc, const uint32_t *s, size_t n _UNUSED_PARAMETER_)
190 # if CONFIG_UNICODE_SAFETY
191 if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
194 # if CONFIG_UNICODE_SAFETY
196 /* invalid multibyte character */
204 #ifdef GNULIB_UNISTR_U8_MBTOUC
207 u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n);
210 u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n);
212 u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n)
222 return u8_mbtouc_aux (puc, s, n);
227 #ifdef GNULIB_UNISTR_U16_MBTOUC
230 u16_mbtouc (ucs4_t *puc, const uint16_t *s, size_t n);
233 u16_mbtouc_aux (ucs4_t *puc, const uint16_t *s, size_t n);
235 u16_mbtouc (ucs4_t *puc, const uint16_t *s, size_t n)
239 if (c < 0xd800 || c >= 0xe000)
245 return u16_mbtouc_aux (puc, s, n);
250 #ifdef GNULIB_UNISTR_U32_MBTOUC
253 u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n);
256 u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n _UNUSED_PARAMETER_)
260 if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
263 /* invalid multibyte character */
270 /* Return the length (number of units) of the first character in S, putting
271 its 'ucs4_t' representation in *PUC. Upon failure, *PUC is set to 0xfffd,
272 and -1 is returned for an invalid sequence of units, -2 is returned for an
273 incomplete sequence of units.
274 The number of available units, N, must be > 0. */
275 /* Similar to u*_mbtouc(), except that the return value gives more details
276 about the failure, similar to mbrtowc(). */
278 #ifdef GNULIB_UNISTR_U8_MBTOUCR
280 u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n);
283 #ifdef GNULIB_UNISTR_U16_MBTOUCR
285 u16_mbtoucr (ucs4_t *puc, const uint16_t *s, size_t n);
288 #ifdef GNULIB_UNISTR_U32_MBTOUCR
290 u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n);
293 /* Put the multibyte character represented by UC in S, returning its
294 length. Return -1 upon failure, -2 if the number of available units, N,
295 is too small. The latter case cannot occur if N >= 6/2/1, respectively. */
296 /* Similar to wctomb(), except that s must not be NULL, and the argument n
297 must be specified. */
299 #ifdef GNULIB_UNISTR_U8_UCTOMB
300 /* Auxiliary function, also used by u8_chr, u8_strchr, u8_strrchr. */
302 u8_uctomb_aux (uint8_t *s, ucs4_t uc, int n);
305 u8_uctomb (uint8_t *s, ucs4_t uc, int n);
308 u8_uctomb (uint8_t *s, ucs4_t uc, int n)
310 if (uc < 0x80 && n > 0)
316 return u8_uctomb_aux (s, uc, n);
321 #ifdef GNULIB_UNISTR_U16_UCTOMB
322 /* Auxiliary function, also used by u16_chr, u16_strchr, u16_strrchr. */
324 u16_uctomb_aux (uint16_t *s, ucs4_t uc, int n);
327 u16_uctomb (uint16_t *s, ucs4_t uc, int n);
330 u16_uctomb (uint16_t *s, ucs4_t uc, int n)
332 if (uc < 0xd800 && n > 0)
338 return u16_uctomb_aux (s, uc, n);
343 #ifdef GNULIB_UNISTR_U32_UCTOMB
346 u32_uctomb (uint32_t *s, ucs4_t uc, int n);
349 u32_uctomb (uint32_t *s, ucs4_t uc, int n)
351 if (uc < 0xd800 || (uc >= 0xe000 && uc < 0x110000))
367 /* Copy N units from SRC to DEST. */
368 /* Similar to memcpy(). */
370 u8_cpy (uint8_t *dest, const uint8_t *src, size_t n);
372 u16_cpy (uint16_t *dest, const uint16_t *src, size_t n);
374 u32_cpy (uint32_t *dest, const uint32_t *src, size_t n);
376 /* Copy N units from SRC to DEST, guaranteeing correct behavior for
377 overlapping memory areas. */
378 /* Similar to memmove(). */
380 u8_move (uint8_t *dest, const uint8_t *src, size_t n);
382 u16_move (uint16_t *dest, const uint16_t *src, size_t n);
384 u32_move (uint32_t *dest, const uint32_t *src, size_t n);
386 /* Set the first N characters of S to UC. UC should be a character that
387 occupies only 1 unit. */
388 /* Similar to memset(). */
390 u8_set (uint8_t *s, ucs4_t uc, size_t n);
392 u16_set (uint16_t *s, ucs4_t uc, size_t n);
394 u32_set (uint32_t *s, ucs4_t uc, size_t n);
396 /* Compare S1 and S2, each of length N. */
397 /* Similar to memcmp(). */
399 u8_cmp (const uint8_t *s1, const uint8_t *s2, size_t n);
401 u16_cmp (const uint16_t *s1, const uint16_t *s2, size_t n);
403 u32_cmp (const uint32_t *s1, const uint32_t *s2, size_t n);
405 /* Compare S1 and S2. */
406 /* Similar to the gnulib function memcmp2(). */
408 u8_cmp2 (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2);
410 u16_cmp2 (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2);
412 u32_cmp2 (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2);
414 /* Search the string at S for UC. */
415 /* Similar to memchr(). */
417 u8_chr (const uint8_t *s, size_t n, ucs4_t uc);
419 u16_chr (const uint16_t *s, size_t n, ucs4_t uc);
421 u32_chr (const uint32_t *s, size_t n, ucs4_t uc);
423 /* Count the number of Unicode characters in the N units from S. */
424 /* Similar to mbsnlen(). */
426 u8_mbsnlen (const uint8_t *s, size_t n);
428 u16_mbsnlen (const uint16_t *s, size_t n);
430 u32_mbsnlen (const uint32_t *s, size_t n);
432 /* Elementary string functions with memory allocation. */
434 /* Make a freshly allocated copy of S, of length N. */
436 u8_cpy_alloc (const uint8_t *s, size_t n);
438 u16_cpy_alloc (const uint16_t *s, size_t n);
440 u32_cpy_alloc (const uint32_t *s, size_t n);
442 /* Elementary string functions on NUL terminated strings. */
444 /* Return the length (number of units) of the first character in S.
445 Return 0 if it is the NUL character. Return -1 upon failure. */
447 u8_strmblen (const uint8_t *s);
449 u16_strmblen (const uint16_t *s);
451 u32_strmblen (const uint32_t *s);
453 /* Return the length (number of units) of the first character in S, putting
454 its 'ucs4_t' representation in *PUC. Return 0 if it is the NUL
455 character. Return -1 upon failure. */
457 u8_strmbtouc (ucs4_t *puc, const uint8_t *s);
459 u16_strmbtouc (ucs4_t *puc, const uint16_t *s);
461 u32_strmbtouc (ucs4_t *puc, const uint32_t *s);
463 /* Forward iteration step over the NUL terminated string S. Returns a pointer
464 past the next character, or NULL if the end of the string has been reached.
465 Puts the character's 'ucs4_t' representation in *PUC. */
466 extern const uint8_t *
467 u8_next (ucs4_t *puc, const uint8_t *s);
468 extern const uint16_t *
469 u16_next (ucs4_t *puc, const uint16_t *s);
470 extern const uint32_t *
471 u32_next (ucs4_t *puc, const uint32_t *s);
473 /* Backward iteration step. Moves the pointer back to the previous character
474 and returns it, or returns NULL if the beginning of the string has been reached.
475 Puts the character's 'ucs4_t' representation in *PUC.
    NOTE(review): START is presumably the beginning of the string that S points
    into -- confirm against the implementation. */
476 extern const uint8_t *
477 u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start);
478 extern const uint16_t *
479 u16_prev (ucs4_t *puc, const uint16_t *s, const uint16_t *start);
480 extern const uint32_t *
481 u32_prev (ucs4_t *puc, const uint32_t *s, const uint32_t *start);
483 /* Return the number of units in S. */
484 /* Similar to strlen(), wcslen(). */
486 u8_strlen (const uint8_t *s);
488 u16_strlen (const uint16_t *s);
490 u32_strlen (const uint32_t *s);
492 /* Return the number of units in S, but at most MAXLEN. */
493 /* Similar to strnlen(), wcsnlen(). */
495 u8_strnlen (const uint8_t *s, size_t maxlen);
497 u16_strnlen (const uint16_t *s, size_t maxlen);
499 u32_strnlen (const uint32_t *s, size_t maxlen);
501 /* Copy SRC to DEST. */
502 /* Similar to strcpy(), wcscpy(). */
504 u8_strcpy (uint8_t *dest, const uint8_t *src);
506 u16_strcpy (uint16_t *dest, const uint16_t *src);
508 u32_strcpy (uint32_t *dest, const uint32_t *src);
510 /* Copy SRC to DEST, returning the address of the terminating NUL in DEST. */
511 /* Similar to stpcpy(). */
513 u8_stpcpy (uint8_t *dest, const uint8_t *src);
515 u16_stpcpy (uint16_t *dest, const uint16_t *src);
517 u32_stpcpy (uint32_t *dest, const uint32_t *src);
519 /* Copy no more than N units of SRC to DEST. */
520 /* Similar to strncpy(), wcsncpy(). */
522 u8_strncpy (uint8_t *dest, const uint8_t *src, size_t n);
524 u16_strncpy (uint16_t *dest, const uint16_t *src, size_t n);
526 u32_strncpy (uint32_t *dest, const uint32_t *src, size_t n);
528 /* Copy no more than N units of SRC to DEST, returning the address of
529 the last unit written into DEST. */
530 /* Similar to stpncpy(). */
532 u8_stpncpy (uint8_t *dest, const uint8_t *src, size_t n);
534 u16_stpncpy (uint16_t *dest, const uint16_t *src, size_t n);
536 u32_stpncpy (uint32_t *dest, const uint32_t *src, size_t n);
538 /* Append SRC onto DEST. */
539 /* Similar to strcat(), wcscat(). */
541 u8_strcat (uint8_t *dest, const uint8_t *src);
543 u16_strcat (uint16_t *dest, const uint16_t *src);
545 u32_strcat (uint32_t *dest, const uint32_t *src);
547 /* Append no more than N units of SRC onto DEST. */
548 /* Similar to strncat(), wcsncat(). */
550 u8_strncat (uint8_t *dest, const uint8_t *src, size_t n);
552 u16_strncat (uint16_t *dest, const uint16_t *src, size_t n);
554 u32_strncat (uint32_t *dest, const uint32_t *src, size_t n);
556 /* Compare S1 and S2. */
557 /* Similar to strcmp(), wcscmp(). */
559 u8_strcmp (const uint8_t *s1, const uint8_t *s2);
561 u16_strcmp (const uint16_t *s1, const uint16_t *s2);
563 u32_strcmp (const uint32_t *s1, const uint32_t *s2);
565 /* Compare S1 and S2 using the collation rules of the current locale.
566 Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
567 Upon failure, set errno and return any value. */
568 /* Similar to strcoll(), wcscoll(). */
570 u8_strcoll (const uint8_t *s1, const uint8_t *s2);
572 u16_strcoll (const uint16_t *s1, const uint16_t *s2);
574 u32_strcoll (const uint32_t *s1, const uint32_t *s2);
576 /* Compare no more than N units of S1 and S2. */
577 /* Similar to strncmp(), wcsncmp(). */
579 u8_strncmp (const uint8_t *s1, const uint8_t *s2, size_t n);
581 u16_strncmp (const uint16_t *s1, const uint16_t *s2, size_t n);
583 u32_strncmp (const uint32_t *s1, const uint32_t *s2, size_t n);
585 /* Duplicate S, returning an identical malloc'd string. */
586 /* Similar to strdup(), wcsdup(). */
588 u8_strdup (const uint8_t *s);
590 u16_strdup (const uint16_t *s);
592 u32_strdup (const uint32_t *s);
594 /* Find the first occurrence of UC in STR. */
595 /* Similar to strchr(), wcschr(). */
597 u8_strchr (const uint8_t *str, ucs4_t uc);
599 u16_strchr (const uint16_t *str, ucs4_t uc);
601 u32_strchr (const uint32_t *str, ucs4_t uc);
603 /* Find the last occurrence of UC in STR. */
604 /* Similar to strrchr(), wcsrchr(). */
606 u8_strrchr (const uint8_t *str, ucs4_t uc);
608 u16_strrchr (const uint16_t *str, ucs4_t uc);
610 u32_strrchr (const uint32_t *str, ucs4_t uc);
612 /* Return the length of the initial segment of STR which consists entirely
613 of Unicode characters not in REJECT. */
614 /* Similar to strcspn(), wcscspn(). */
616 u8_strcspn (const uint8_t *str, const uint8_t *reject);
618 u16_strcspn (const uint16_t *str, const uint16_t *reject);
620 u32_strcspn (const uint32_t *str, const uint32_t *reject);
622 /* Return the length of the initial segment of STR which consists entirely
623 of Unicode characters in ACCEPT. */
624 /* Similar to strspn(), wcsspn(). */
626 u8_strspn (const uint8_t *str, const uint8_t *accept);
628 u16_strspn (const uint16_t *str, const uint16_t *accept);
630 u32_strspn (const uint32_t *str, const uint32_t *accept);
632 /* Find the first occurrence in STR of any character in ACCEPT. */
633 /* Similar to strpbrk(), wcspbrk(). */
635 u8_strpbrk (const uint8_t *str, const uint8_t *accept);
637 u16_strpbrk (const uint16_t *str, const uint16_t *accept);
639 u32_strpbrk (const uint32_t *str, const uint32_t *accept);
641 /* Find the first occurrence of NEEDLE in HAYSTACK. */
642 /* Similar to strstr(), wcsstr(). */
644 u8_strstr (const uint8_t *haystack, const uint8_t *needle);
646 u16_strstr (const uint16_t *haystack, const uint16_t *needle);
648 u32_strstr (const uint32_t *haystack, const uint32_t *needle);
650 /* Test whether STR starts with PREFIX. */
652 u8_startswith (const uint8_t *str, const uint8_t *prefix);
654 u16_startswith (const uint16_t *str, const uint16_t *prefix);
656 u32_startswith (const uint32_t *str, const uint32_t *prefix);
658 /* Test whether STR ends with SUFFIX. */
660 u8_endswith (const uint8_t *str, const uint8_t *suffix);
662 u16_endswith (const uint16_t *str, const uint16_t *suffix);
664 u32_endswith (const uint32_t *str, const uint32_t *suffix);
666 /* Divide STR into tokens separated by characters in DELIM.
667 This interface is actually more similar to wcstok than to strtok. */
668 /* Similar to strtok_r(), wcstok(). */
670 u8_strtok (uint8_t *str, const uint8_t *delim, uint8_t **ptr);
672 u16_strtok (uint16_t *str, const uint16_t *delim, uint16_t **ptr);
674 u32_strtok (uint32_t *str, const uint32_t *delim, uint32_t **ptr);
681 #endif /* _UNISTR_H */