i18n: New functions for truncating strings in an arbitrary encoding.

[pspp-builds.git] / tests / libpspp / i18n.at
diff --git a/tests/libpspp/i18n.at b/tests/libpspp/i18n.at

index ed29a7d69ca65cde5e28c23af21a1d80c3a2f737..5444a78ff6cfa9e059b2e1252284765598ecd7f7 100644 (file)
--- a/tests/libpspp/i18n.at
+++ b/tests/libpspp/i18n.at
@@ -1,50 +1,111 @@
-AT_BANNER([i18n routines])
+AT_BANNER([i18n recoding])
  
-# CHECK_I18N([TITLE], [FROM-CODING], [TO-CODING], [FROM-TEXT], [TO-TEXT])
+m4_divert_push([PREPARE_TESTS])
+dnl supports_encodings ENCODING...
+dnl
+dnl Shell function that returns 0 if this host is expected to support every
+dnl ENCODING named on its command line.  Otherwise it returns 77, the exit
+dnl status that makes Autotest skip the test that called it.
+supports_encodings () {
+  case "$host" in
+    *-*-linux* | *-*-*-gnu*)
+      dnl GNU/Linux always has the encodings we want.  We can't ask
+      dnl config.charset about them because it has a special case here
+      dnl too and won't tell us.
+      return 0
+      ;;
+    *)
+      for encoding in "$@"; do
+       dnl Search for the encoding named by the loop variable.  The output
+       dnl of grep is discarded because the callers expect empty stdout.
+       dnl "grep -q" is avoided because it is not portable.
+       $SHELL $top_srcdir/gl/config.charset "$host" |
+         grep "$encoding" >/dev/null || return 77
+      done
+      ;;
+  esac
+}
+m4_divert_pop([PREPARE_TESTS])
+
+# CHECK_I18N_RECODE([TITLE], [FROM-CODING], [TO-CODING],
+#                   [FROM-TEXT], [TO-TEXT])
  #
  # Converts FROM-TEXT from FROM-CODING to TO-CODING and checks that the result
-# is TO-TEXT.  The "printf" program is applied to both FROM-TEXT and TO-TEXT
-# to allow for backslash-escapes.  (Be aware that hex escapes are not portable;
-# use octal escapes instead.)
-m4_define([CHECK_I18N],
+# is TO-TEXT.  The "printf" program is applied to both FROM-TEXT and TO-TEXT to
+# allow for backslash-escapes.  (Hex escapes are not portable; use octal
+# escapes instead.)
+#
+# TITLE is appended to "convert " to form the name of the test.  The test is
+# skipped if supports_encodings reports that the host lacks FROM-CODING or
+# TO-CODING.
+m4_define([CHECK_I18N_RECODE],
    [AT_SETUP([convert $1])
     AT_KEYWORDS([i18n])
  
     dnl Skip the test if this host doesn't know the source and target encodings.
-   AT_CHECK(
-     [case "$host" in
-        *-*-linux* | *-*-*-gnu*)
-          dnl GNU/Linux always has the encodings we want.  We can't ask
-         dnl config.charset about them because it has a special case here
-         dnl too and won't tell us.
-         ;;
-       *)
-          $SHELL $top_srcdir/gl/config.charset "$host" | grep '$2' || exit 77
-          $SHELL $top_srcdir/gl/config.charset "$host" | grep '$3' || exit 77
-          ;;
-      esac
-     ], [0], [ignore])
-   AT_CHECK_UNQUOTED([i18n-test '$2' '$3' `printf '$4'`], [0], [`printf '$5'`
+   AT_CHECK([supports_encodings '$2' '$3'])
+   dnl Run the recode subcommand of i18n-test on the printf-expanded input.
+   dnl AT_CHECK_UNQUOTED applies shell expansion to the expected output so
+   dnl that the backquoted printf there is evaluated as well.
+   AT_CHECK_UNQUOTED([i18n-test recode '$2' '$3' `printf '$4'`], [0], [`printf '$5'`
  ])
     AT_CLEANUP])
       
-CHECK_I18N([reflexively], [ASCII], [ASCII], [abc], [abc])
-CHECK_I18N([without any change], [ASCII], [UTF-8], [abc], [abc])
+# Plain ASCII text must pass through unchanged.
+CHECK_I18N_RECODE([reflexively], [ASCII], [ASCII], [abc], [abc])
+CHECK_I18N_RECODE([without any change], [ASCII], [UTF-8], [abc], [abc])
  
-CHECK_I18N([from ISO-8859-1 to UTF-8], [ISO-8859-1], [UTF-8],
-           [\242], [\302\242])
-CHECK_I18N([from UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
-           [\302\242], [\242])
+# 0xa2 == 0242 is one byte in ISO-8859-1 and the two bytes 0302 0242 in UTF-8.
+CHECK_I18N_RECODE([from ISO-8859-1 to UTF-8], [ISO-8859-1], [UTF-8],
+                  [\242], [\302\242])
+CHECK_I18N_RECODE([from UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
+                  [\302\242], [\242])
  
  # 0xc0 == 0300 is invalid in UTF-8
-CHECK_I18N([invalid UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
-           [xy\300z], [xy?z])
+# The invalid byte is expected to come out as "?".
+CHECK_I18N_RECODE([invalid UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
+                  [xy\300z], [xy?z])
  # 0xc2 == 0302 is the first byte of a 2-byte UTF-8 sequence
-CHECK_I18N([truncated UTF-8 to ISO-8559-1], [UTF-8], [ISO-8859-1],
-           [xy\302], [xy?])
+# The incomplete sequence is expected to come out as "?".
+CHECK_I18N_RECODE([truncated UTF-8 to ISO-8859-1], [UTF-8], [ISO-8859-1],
+                  [xy\302], [xy?])
  
  dnl The input to this test is 7 bytes long and the expected output is 9 bytes.
  dnl So it should exercise the E2BIG case 
-CHECK_I18N([from ISO-8859-1 to UTF-8 with overflow], [ISO-8859-1], [UTF-8],
-           [Tsch\374\337!], [Tsch\303\274\303\237!])
+CHECK_I18N_RECODE([from ISO-8859-1 to UTF-8 with overflow], 
+                  [ISO-8859-1], [UTF-8],
+                  [Tsch\374\337!], [Tsch\303\274\303\237!])
+\f
+AT_BANNER([i18n concatenation])
+
+# CHECK_I18N_CONCAT([HEAD], [TAIL], [ENCODING], [MAX-LEN], [ANSWER])
+#
+# Concatenates HEAD and TAIL, omitting as many characters from HEAD as needed
+# to make the result come out to no more than MAX-LEN bytes if it was expressed
+# in ENCODING, and checks that the answer matches ANSWER.  HEAD, TAIL, and
+# ANSWER are all in UTF-8.  The "printf" program is applied to HEAD, TAIL, and
+# ANSWER to allow for backslash-escapes.  (Hex escapes are not portable; use
+# octal escapes instead.)
+#
+# Only HEAD is ever shortened: the expected answers in the tests below keep
+# TAIL whole even when TAIL alone is longer than MAX-LEN bytes.  They also
+# drop a base character together with its combining mark rather than
+# splitting the two.
+#
+# The test name is generated from HEAD, TAIL, MAX-LEN, and ENCODING; TAIL is
+# left out of the name when it is empty.  The test is skipped if
+# supports_encodings reports that the host lacks ENCODING.
+m4_define([CHECK_I18N_CONCAT],
+  [AT_SETUP([m4_if([$2], [], [truncate "$1" to $4 bytes in $3],
+                             [truncate "$1" + "$2" to $4 bytes in $3])])
+   AT_KEYWORDS([i18n])
+
+   dnl Skip the test if this host doesn't know the encoding.
+   AT_CHECK([supports_encodings '$3'])
+   dnl The double quotes keep HEAD and TAIL as single arguments to i18n-test
+   dnl even when the printf output is empty.
+   AT_CHECK_UNQUOTED(
+     [i18n-test concat "`printf '$1'`" "`printf '$2'`" '$3' '$4'], [0],
+     [`printf '$5'`
+])
+   AT_CLEANUP])
+
+# HEAD, TAIL, or both empty.
+CHECK_I18N_CONCAT([abc], [], [UTF-8], [6], [abc])
+CHECK_I18N_CONCAT([], [xyz], [UTF-8], [6], [xyz])
+CHECK_I18N_CONCAT([], [], [UTF-8], [6], [])
+CHECK_I18N_CONCAT([abcdefghij], [], [UTF-8], [6], [abcdef])
+CHECK_I18N_CONCAT([], [tuvwxyz], [UTF-8], [6], [tuvwxyz])
+
+# HEAD and TAIL both nonempty, ASCII only.
+CHECK_I18N_CONCAT([abc], [xyz], [UTF-8], [6], [abcxyz])
+CHECK_I18N_CONCAT([abcd], [xyz], [UTF-8], [6], [abcxyz])
+CHECK_I18N_CONCAT([abc], [uvwxyz], [UTF-8], [6], [uvwxyz])
+
+# x in a box ( x⃞ ) is U+0078, U+20DE, 4 bytes in UTF-8, and one grapheme
+# cluster.
+# HEAD is expected to be dropped entirely until MAX-LEN has room for all 4 of
+# its bytes plus the 1 byte of TAIL.
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [0], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [1], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [2], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [3], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [4], [y])
+CHECK_I18N_CONCAT([x\342\203\236], [y], [UTF-8], [5], [x\342\203\236y])
  
+# éèä is only 3 bytes in ISO-8859-1.
+# Each letter of HEAD that is kept costs 1 byte of MAX-LEN beyond the 3 bytes
+# taken by TAIL.
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [0], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [1], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [2], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [3], [xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [4], 
+                  [\303\251xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [5],
+                  [\303\251\303\250xyz])
+CHECK_I18N_CONCAT([\303\251\303\250\303\244], [xyz], [ISO-8859-1], [6],
+                  [\303\251\303\250\303\244xyz])