# It is not intended for manual editing.
version = 3
+[[package]]
+name = "addr2line"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
+dependencies = [
+ "gimli",
+]
+
[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "android-tzdata"
version = "0.1.1"
"libc",
]
+[[package]]
+name = "anstream"
+version = "0.6.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "anyhow"
-version = "1.0.69"
+version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
+checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
+
+[[package]]
+name = "async-trait"
+version = "0.1.81"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
[[package]]
name = "atty"
"winapi",
]
+[[package]]
+name = "auto_impl"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "autocfg"
-version = "1.1.0"
+version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
+
+[[package]]
+name = "backtrace"
+version = "0.3.73"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a"
+dependencies = [
+ "addr2line",
+ "cc",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+]
[[package]]
name = "bitflags"
[[package]]
name = "bitflags"
-version = "2.5.0"
+version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "bumpalo"
-version = "3.13.0"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
+[[package]]
+name = "bytes"
+version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50"
[[package]]
name = "cc"
-version = "1.0.79"
+version = "1.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48"
+dependencies = [
+ "shlex",
+]
[[package]]
name = "cfg-if"
[[package]]
name = "chrono"
-version = "0.4.26"
+version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-traits",
- "time",
"wasm-bindgen",
- "winapi",
+ "windows-targets 0.52.6",
]
[[package]]
name = "clap"
-version = "4.1.7"
+version = "4.5.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340"
+checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019"
dependencies = [
- "bitflags 1.3.2",
+ "clap_builder",
"clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6"
+dependencies = [
+ "anstream",
+ "anstyle",
"clap_lex",
- "is-terminal",
- "once_cell",
"strsim",
- "termcolor 1.2.0",
"terminal_size",
]
[[package]]
name = "clap_derive"
-version = "4.1.7"
+version = "4.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34d122164198950ba84a918270a3bb3f7ededd25e15f7451673d986f55bd2667"
+checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
dependencies = [
"heck",
- "proc-macro-error",
"proc-macro2",
"quote",
- "syn 1.0.109",
+ "syn",
]
[[package]]
name = "clap_lex"
-version = "0.3.2"
+version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
-dependencies = [
- "os_str_bytes",
-]
+checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
[[package]]
name = "core-foundation-sys"
-version = "0.8.4"
+version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "crc32fast"
-version = "1.3.2"
+version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if",
]
+[[package]]
+name = "dashmap"
+version = "5.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
+dependencies = [
+ "cfg-if",
+ "hashbrown",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
[[package]]
name = "diff"
version = "0.1.13"
[[package]]
name = "encoding_rs"
-version = "0.8.32"
+version = "0.8.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
dependencies = [
"cfg-if",
]
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
]
[[package]]
-name = "equivalent"
-version = "1.0.1"
+name = "env_filter"
+version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+ "regex",
+]
[[package]]
-name = "errno"
-version = "0.2.8"
+name = "env_logger"
+version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
dependencies = [
- "errno-dragonfly",
- "libc",
- "winapi",
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "humantime",
+ "log",
]
[[package]]
-name = "errno"
-version = "0.3.1"
+name = "equivalent"
+version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
-dependencies = [
- "errno-dragonfly",
- "libc",
- "windows-sys 0.48.0",
-]
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
+name = "errno"
+version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
- "cc",
"libc",
+ "windows-sys 0.52.0",
]
[[package]]
[[package]]
name = "flate2"
-version = "1.0.26"
+version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920"
dependencies = [
"crc32fast",
"miniz_oxide",
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+[[package]]
+name = "form_urlencoded"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "futures"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
+
+[[package]]
+name = "futures-io"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
+
+[[package]]
+name = "futures-task"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
+
+[[package]]
+name = "futures-util"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+]
+
+[[package]]
+name = "gimli"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
+
[[package]]
name = "hashbrown"
-version = "0.14.3"
+version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
[[package]]
name = "heck"
-version = "0.4.1"
+version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
[[package]]
name = "hermit-abi"
-version = "0.3.1"
+version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "hexplay"
checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
dependencies = [
"atty",
- "termcolor 0.3.6",
+ "termcolor",
]
+[[package]]
+name = "httparse"
+version = "1.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
+
+[[package]]
+name = "humantime"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
+
[[package]]
name = "iana-time-zone"
-version = "0.1.57"
+version = "0.1.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
- "windows",
+ "windows-core",
]
[[package]]
"cc",
]
+[[package]]
+name = "idna"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
+dependencies = [
+ "unicode-bidi",
+ "unicode-normalization",
+]
+
[[package]]
name = "indexmap"
-version = "2.1.0"
+version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]]
-name = "io-lifetimes"
-version = "1.0.5"
+name = "is_terminal_polyfill"
+version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
-dependencies = [
- "libc",
- "windows-sys 0.45.0",
-]
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
-name = "is-terminal"
-version = "0.4.4"
+name = "itoa"
+version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
-dependencies = [
- "hermit-abi 0.3.1",
- "io-lifetimes",
- "rustix 0.36.8",
- "windows-sys 0.45.0",
-]
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "js-sys"
-version = "0.3.64"
+version = "0.3.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
-version = "1.4.0"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
-version = "0.2.147"
+version = "0.2.158"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
[[package]]
name = "linux-raw-sys"
-version = "0.1.4"
+version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
-name = "linux-raw-sys"
-version = "0.3.8"
+name = "lock_api"
+version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
+dependencies = [
+ "autocfg",
+ "scopeguard",
+]
[[package]]
name = "log"
-version = "0.4.19"
+version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
+[[package]]
+name = "lsp-types"
+version = "0.94.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c66bfd44a06ae10647fe3f8214762e9369fd4248df1350924b4ef9e770a85ea1"
+dependencies = [
+ "bitflags 1.3.2",
+ "serde",
+ "serde_json",
+ "serde_repr",
+ "url",
+]
[[package]]
name = "memchr"
[[package]]
name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
dependencies = [
"adler",
]
+[[package]]
+name = "mio"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
+dependencies = [
+ "hermit-abi 0.3.9",
+ "libc",
+ "wasi",
+ "windows-sys 0.52.0",
+]
+
[[package]]
name = "num"
-version = "0.4.0"
+version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
dependencies = [
"num-bigint",
"num-complex",
[[package]]
name = "num-bigint"
-version = "0.4.3"
+version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
dependencies = [
- "autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-complex"
-version = "0.4.3"
+version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-derive"
-version = "0.4.0"
+version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e"
+checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
]
[[package]]
name = "num-integer"
-version = "0.1.45"
+version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
- "autocfg",
"num-traits",
]
[[package]]
name = "num-iter"
-version = "0.1.43"
+version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
dependencies = [
"autocfg",
"num-integer",
[[package]]
name = "num-rational"
-version = "0.4.1"
+version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
dependencies = [
- "autocfg",
"num-bigint",
"num-integer",
"num-traits",
[[package]]
name = "num-traits"
-version = "0.2.16"
+version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
+[[package]]
+name = "object"
+version = "0.36.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9"
+dependencies = [
+ "memchr",
+]
+
[[package]]
name = "once_cell"
-version = "1.17.1"
+version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "ordered-float"
-version = "3.7.0"
+version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
[[package]]
-name = "os_str_bytes"
-version = "6.4.1"
+name = "parking_lot"
+version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
[[package]]
-name = "proc-macro-error"
-version = "1.0.4"
+name = "parking_lot_core"
+version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
dependencies = [
- "proc-macro-error-attr",
- "proc-macro2",
- "quote",
- "syn 1.0.109",
- "version_check",
+ "cfg-if",
+ "libc",
+ "redox_syscall",
+ "smallvec",
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
+[[package]]
+name = "pin-project"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3"
+dependencies = [
+ "pin-project-internal",
]
[[package]]
-name = "proc-macro-error-attr"
-version = "1.0.4"
+name = "pin-project-internal"
+version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
dependencies = [
"proc-macro2",
"quote",
- "version_check",
+ "syn",
]
+[[package]]
+name = "pin-project-lite"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
+
+[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
[[package]]
name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
dependencies = [
"unicode-ident",
]
version = "1.0.0"
dependencies = [
"anyhow",
- "bitflags 2.5.0",
+ "bitflags 2.6.0",
"chardetng",
"chrono",
"clap",
"windows-sys 0.48.0",
]
+[[package]]
+name = "pspp-lsp"
+version = "0.1.0"
+dependencies = [
+ "env_logger",
+ "log",
+ "pspp",
+ "tokio",
+ "tower-lsp",
+]
+
[[package]]
name = "quote"
-version = "1.0.32"
+version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
"proc-macro2",
]
[[package]]
-name = "rustix"
-version = "0.36.8"
+name = "redox_syscall"
+version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
+checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4"
dependencies = [
- "bitflags 1.3.2",
- "errno 0.2.8",
- "io-lifetimes",
- "libc",
- "linux-raw-sys 0.1.4",
- "windows-sys 0.45.0",
+ "bitflags 2.6.0",
+]
+
+[[package]]
+name = "regex"
+version = "1.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
]
+[[package]]
+name = "regex-syntax"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
[[package]]
name = "rustix"
-version = "0.37.3"
+version = "0.38.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
dependencies = [
- "bitflags 1.3.2",
- "errno 0.3.1",
- "io-lifetimes",
+ "bitflags 2.6.0",
+ "errno",
"libc",
- "linux-raw-sys 0.3.8",
- "windows-sys 0.45.0",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
]
[[package]]
-name = "strsim"
-version = "0.10.0"
+name = "ryu"
+version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
-name = "syn"
-version = "1.0.109"
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "serde"
+version = "1.0.208"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.208"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf"
dependencies = [
"proc-macro2",
"quote",
- "unicode-ident",
+ "syn",
]
[[package]]
-name = "syn"
-version = "2.0.27"
+name = "serde_json"
+version = "1.0.125"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "serde_repr"
+version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
+checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9"
dependencies = [
"proc-macro2",
"quote",
- "unicode-ident",
+ "syn",
]
[[package]]
-name = "termcolor"
-version = "0.3.6"
+name = "shlex"
+version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1"
dependencies = [
- "wincolor",
+ "libc",
+]
+
+[[package]]
+name = "slab"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
+
+[[package]]
+name = "socket2"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "syn"
+version = "2.0.75"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
]
[[package]]
name = "termcolor"
-version = "1.2.0"
+version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
dependencies = [
- "winapi-util",
+ "wincolor",
]
[[package]]
name = "terminal_size"
-version = "0.2.6"
+version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
+checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
dependencies = [
- "rustix 0.37.3",
+ "rustix",
"windows-sys 0.48.0",
]
[[package]]
name = "thiserror"
-version = "1.0.39"
+version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
+checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.39"
+version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
+checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
dependencies = [
"proc-macro2",
"quote",
- "syn 1.0.109",
+ "syn",
]
[[package]]
-name = "time"
-version = "0.1.45"
+name = "tinyvec"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.39.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5"
dependencies = [
+ "backtrace",
+ "bytes",
"libc",
- "wasi",
- "winapi",
+ "mio",
+ "parking_lot",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tower"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project",
+ "pin-project-lite",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-lsp"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4ba052b54a6627628d9b3c34c176e7eda8359b7da9acd497b9f20998d118508"
+dependencies = [
+ "async-trait",
+ "auto_impl",
+ "bytes",
+ "dashmap",
+ "futures",
+ "httparse",
+ "lsp-types",
+ "memchr",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-util",
+ "tower",
+ "tower-lsp-macros",
+ "tracing",
+]
+
+[[package]]
+name = "tower-lsp-macros"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84fd902d4e0b9a4b27f2f440108dc034e1758628a9b702f8ec61ad66355422fa"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+dependencies = [
+ "once_cell",
]
[[package]]
name = "unicase"
-version = "2.6.0"
+version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
dependencies = [
"version_check",
]
+[[package]]
+name = "unicode-bidi"
+version = "0.3.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
+
[[package]]
name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "unicode-normalization"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
+dependencies = [
+ "tinyvec",
+]
[[package]]
name = "unicode-width"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
+[[package]]
+name = "url"
+version = "2.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
[[package]]
name = "utf8-decode"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
[[package]]
name = "version_check"
-version = "0.9.4"
+version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
+version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
+checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5"
dependencies = [
"cfg-if",
+ "once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
[[package]]
name = "wasm-bindgen-macro-support"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836"
dependencies = [
"proc-macro2",
"quote",
- "syn 2.0.27",
+ "syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
-version = "0.2.87"
+version = "0.2.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484"
[[package]]
name = "winapi"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-[[package]]
-name = "winapi-util"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
-dependencies = [
- "winapi",
-]
-
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
]
[[package]]
-name = "windows"
-version = "0.48.0"
+name = "windows-core"
+version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
- "windows-targets 0.48.1",
+ "windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
-version = "0.45.0"
+version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
- "windows-targets 0.42.1",
+ "windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
-version = "0.48.0"
+version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
- "windows-targets 0.48.1",
+ "windows-targets 0.52.6",
]
[[package]]
name = "windows-targets"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
- "windows_aarch64_gnullvm 0.42.1",
- "windows_aarch64_msvc 0.42.1",
- "windows_i686_gnu 0.42.1",
- "windows_i686_msvc 0.42.1",
- "windows_x86_64_gnu 0.42.1",
- "windows_x86_64_gnullvm 0.42.1",
- "windows_x86_64_msvc 0.42.1",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
-version = "0.48.1"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
- "windows_aarch64_gnullvm 0.48.0",
- "windows_aarch64_msvc 0.48.0",
- "windows_i686_gnu 0.48.0",
- "windows_i686_msvc 0.48.0",
- "windows_x86_64_gnu 0.48.0",
- "windows_x86_64_gnullvm 0.48.0",
- "windows_x86_64_msvc 0.48.0",
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
]
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.48.0"
+version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
-[package]
-name = "pspp"
-version = "1.0.0"
-edition = "2021"
-authors = [ "Ben Pfaff", "John Darrington" ]
-
-[dependencies]
-anyhow = "1.0.69"
-clap = { version = "4.1.7", features = ["derive", "wrap_help"] }
-encoding_rs = "0.8.32"
-flate2 = "1.0.26"
-float_next_after = "1.0.0"
-hexplay = "0.2.1"
-lazy_static = "1.4.0"
-num = "0.4.0"
-num-derive = "0.4.0"
-num-traits = "0.2.16"
-ordered-float = "3.7.0"
-thiserror = "1.0"
-chrono = "0.4.26"
-finl_unicode = "1.2.0"
-unicase = "2.6.0"
-libc = "0.2.147"
-indexmap = "2.1.0"
-utf8-decode = "1.0.1"
-bitflags = "2.5.0"
-unicode-width = "0.1.13"
-chardetng = "0.1.17"
-enum-map = "2.7.3"
-flagset = "0.4.6"
-
-[target.'cfg(windows)'.dependencies]
-windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
-
-[build-dependencies]
-anyhow = "1.0.69"
-
-[[bin]]
-name = "pspp-dump-sav"
-path = "src/main.rs"
-
-[lib]
-path = "src/lib.rs"
-
-[[test]]
-name = "sack"
-path = "tests/sack.rs"
-harness = false
-
-[dev-dependencies]
-diff = "0.1.13"
+[workspace]
+members = [
+ "pspp",
+ "pspp-lsp",
+]
+resolver = "2"
+++ /dev/null
-use anyhow::{anyhow, Result as AnyResult};
-use std::{
- collections::{BTreeMap, HashSet, VecDeque},
- env::var_os,
- fs::{read_to_string, File},
- io::{Error as IoError, Write},
- path::{Path, PathBuf},
-};
-
-#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)]
-enum Source {
- Codepage,
- Ibm,
- Windows,
-}
-
-// Code page number.
-type CodepageNumber = usize;
-
-fn process_converter<'a>(
- fields: &Vec<&'a str>,
- codepages: &mut BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&'a str>>>,
-) {
- if fields.is_empty() || fields[0] == "{" {
- return;
- }
-
- let mut cps: BTreeMap<Source, CodepageNumber> = BTreeMap::new();
- let mut iana = VecDeque::new();
- let mut other = VecDeque::new();
-
- let mut iter = fields.iter().peekable();
- while let Some(&name) = iter.next() {
- if iter.next_if(|&&s| s == "{").is_some() {
- let mut standards = HashSet::new();
- loop {
- let &standard = iter.next().expect("missing `}` in list of standards");
- if standard == "}" {
- break;
- }
- standards.insert(standard);
- }
-
- if standards.contains("IANA*") {
- iana.push_front(name);
- } else if standards.contains("IANA") {
- iana.push_back(name);
- } else if standards.iter().any(|&s| s.ends_with('*')) {
- other.push_front(name);
- } else {
- other.push_back(name);
- }
- } else {
- // Untagged names are completely nonstandard.
- continue;
- }
-
- if let Some(number) = name.strip_prefix("cp") {
- if let Ok(number) = number.parse::<CodepageNumber>() {
- cps.insert(Source::Codepage, number);
- }
- }
-
- if let Some(number) = name.strip_prefix("windows-") {
- if let Ok(number) = number.parse::<CodepageNumber>() {
- cps.insert(Source::Windows, number);
- }
- }
-
- if let Some(number) = name.strip_prefix("ibm-") {
- if let Ok(number) = number.parse::<CodepageNumber>() {
- cps.insert(Source::Ibm, number);
- }
- }
- }
-
- // If there are no tagged names then this is completely nonstandard.
- if iana.is_empty() && other.is_empty() {
- return;
- }
-
- let all: Vec<&str> = iana.into_iter().chain(other).collect();
- for (source, number) in cps {
- codepages
- .entry(number)
- .or_default()
- .insert(source, all.clone());
- }
-}
-
-fn write_output(
- codepages: &BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>>,
- file_name: &PathBuf,
-) -> Result<(), IoError> {
- let mut file = File::create(file_name)?;
-
- file.write_all(
- "\
-use lazy_static::lazy_static;
-use std::collections::HashMap;
-
-lazy_static! {
- static ref CODEPAGE_NUMBER_TO_NAME: HashMap<i32, &'static str> = {
- let mut map = HashMap::new();
-"
- .as_bytes(),
- )?;
-
- for (&cpnumber, value) in codepages.iter() {
- let source = value.keys().max().unwrap();
- let name = value[source][0];
- writeln!(file, " map.insert({cpnumber}, \"{name}\");")?;
- }
- file.write_all(
- " map
- };
-
- static ref CODEPAGE_NAME_TO_NUMBER: HashMap<&'static str, u32> = {
- let mut map = HashMap::new();
-"
- .as_bytes(),
- )?;
-
- let mut names: BTreeMap<String, BTreeMap<Source, Vec<CodepageNumber>>> = BTreeMap::new();
- for (&cpnumber, value) in codepages.iter() {
- for (&source, value2) in value.iter() {
- for name in value2.iter().map(|name| name.to_ascii_lowercase()) {
- names
- .entry(name)
- .or_default()
- .entry(source)
- .or_default()
- .push(cpnumber);
- }
- }
- }
-
- for (name, value) in names.iter() {
- for (_source, numbers) in value.iter().rev().take(1) {
- writeln!(file, " map.insert(\"{name}\", {});", numbers[0])?;
- }
- }
- file.write_all(
- " map
- };
-}
-"
- .as_bytes(),
- )?;
-
- Ok(())
-}
-
-fn main() -> AnyResult<()> {
- println!("cargo:rerun-if-changed=build.rs");
-
- let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("convrtrs.txt");
- println!("cargo:rerun-if-changed={}", input_file.to_string_lossy());
- let input = read_to_string(&input_file)
- .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?;
-
- let mut codepages: BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>> = BTreeMap::new();
- let mut converter: Vec<&str> = Vec::new();
- for line in input.lines() {
- let line = line
- .find('#')
- .map(|position| &line[..position])
- .unwrap_or(line)
- .trim_end();
- if !line.starts_with([' ', '\t']) {
- process_converter(&converter, &mut codepages);
- converter.clear();
- }
- converter.extend(line.split_whitespace());
- }
- process_converter(&converter, &mut codepages);
-
- let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs");
-
- write_output(&codepages, &output_file_name)
- .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?;
-
- Ok(())
-}
+++ /dev/null
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# ******************************************************************************
-# *
-# * Copyright (C) 1995-2014, International Business Machines
-# * Corporation and others. All Rights Reserved.
-# *
-# ******************************************************************************
-
-# If this converter alias table looks very confusing, a much easier to
-# understand view can be found at this demo:
-# http://demo.icu-project.org/icu-bin/convexp
-
-# IMPORTANT NOTE
-#
-# This file is not read directly by ICU. If you change it, you need to
-# run gencnval, and eventually run pkgdata to update the representation that
-# ICU uses for aliases. The gencnval tool will normally compile this file into
-# cnvalias.icu. The gencnval -v verbose option will help you when you edit
-# this file.
-
-# Please be friendly to the rest of us that edit this table by
-# keeping this table free of tabs.
-
-# This is an alias file used by the character set converter.
-# A lot of converter information can be found in unicode/ucnv.h, but here
-# is more information about this file.
-#
-# If you are adding a new converter to this list and want to include it in the
-# icu data library, please be sure to add an entry to the appropriate ucm*.mk file
-# (see ucmfiles.mk for more information).
-#
-# Here is the file format using BNF-like syntax:
-#
-# converterTable ::= tags { converterLine* }
-# converterLine ::= converterName [ tags ] { taggedAlias* }'\n'
-# taggedAlias ::= alias [ tags ]
-# tags ::= '{' { tag+ } '}'
-# tag ::= standard['*']
-# converterName ::= [0-9a-zA-Z:_'-']+
-# alias ::= converterName
-#
-# Except for the converter name, aliases are case insensitive.
-# Names are separated by whitespace.
-# Line continuation and comment sytax are similar to the GNU make syntax.
-# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL
-# TABULATION) are presumed to be a continuation of the previous line.
-# The # symbol starts a comment and the comment continues till the end of
-# the line.
-#
-# The converter
-#
-# All names can be tagged by including a space-separated list of tags in
-# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
-# some-charset{MIME* IANA*}. The order of tags does not matter, and
-# whitespace is allowed between the tagged name and the tags list.
-#
-# The tags can be used to get standard names using ucnv_getStandardName().
-#
-# The complete list of recognized tags used in this file is defined in
-# the affinity list near the beginning of the file.
-#
-# The * after the standard tag denotes that the previous alias is the
-# preferred (default) charset name for that standard. There can only
-# be one of these default charset names per converter.
-
-
-
-# The world is getting more complicated...
-# Supporting XML parsers, HTML, MIME, and similar applications
-# that mark encodings with a charset name can be difficult.
-# Many of these applications and operating systems will update
-# their codepages over time.
-
-# It means that a new codepage, one that differs from an
-# old one by changing a code point, e.g., to the Euro sign,
-# must not get an old alias, because it would mean that
-# old files with this alias would be interpreted differently.
-
-# If an codepage gets updated by assigning characters to previously
-# unassigned code points, then a new name is not necessary.
-# Also, some codepages map unassigned codepage byte values
-# to the same numbers in Unicode for roundtripping. It may be
-# industry practice to keep the encoding name in such a case, too
-# (example: Windows codepages).
-
-# The aliases listed in the list of character sets
-# that is maintained by the IANA (http://www.iana.org/) must
-# not be changed to mean encodings different from what this
-# list shows. Currently, the IANA list is at
-# http://www.iana.org/assignments/character-sets
-# It should also be mentioned that the exact mapping table used for each
-# IANA names usually isn't specified. This means that some other applications
-# and operating systems are left to interpret the exact mappings for the
-# underspecified aliases. For instance, Shift-JIS on a Solaris platform
-# may be different from Shift-JIS on a Windows platform. This is why
-# some of the aliases can be tagged to differentiate different mapping
-# tables with the same alias. If an alias is given to more than one converter,
-# it is considered to be an ambiguous alias, and the affinity list will
-# choose the converter to use when a standard isn't specified with the alias.
-
-# Name matching is case-insensitive. Also, dashes '-', underscores '_'
-# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1
-# and "cs iso latin 1" are the same).
-# However, the names in the left column are directly file names
-# or names of algorithmic converters, and their case must not
-# be changed - or else code and/or file names must also be changed.
-# For example, the converter ibm-921 is expected to be the file ibm-921.cnv.
-
-
-
-# The immediately following list is the affinity list of supported standard tags.
-# When multiple converters have the same alias under different standards,
-# the standard nearest to the top of this list with that alias will
-# be the first converter that will be opened. The ordering of the aliases
-# after this affinity list does not affect the preferred alias, but it may
-# affect the order of the returned list of aliases for a given converter.
-#
-# The general ordering is from specific and frequently used to more general
-# or rarely used at the bottom.
-{ UTR22 # Name format specified by https://www.unicode.org/reports/tr22/
- # ICU # Can also use ICU_FEATURE
- IBM # The IBM CCSID number is specified by ibm-*
- WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names.
- JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored.
- # GLIBC
- # AIX
- # DB2
- # SOLARIS
- # APPLE
- # HPUX
- IANA # Source: http://www.iana.org/assignments/character-sets
- MIME # Source: http://www.iana.org/assignments/character-sets
- # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface)
- # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag.
- }
-
-
-
-# Fully algorithmic converters
-
-UTF-8 { IANA* MIME* JAVA* WINDOWS }
- ibm-1208 { IBM* } # UTF-8 with IBM PUA
- ibm-1209 { IBM } # UTF-8
- ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA
- ibm-5305 { IBM } # Unicode 2.0, UTF-8
- ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA
- ibm-13497 { IBM } # Unicode 3.0, UTF-8
- ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA
- ibm-17593 { IBM } # Unicode 4.0, UTF-8
- windows-65001 { WINDOWS* }
- cp1208
- x-UTF_8J
- unicode-1-1-utf-8
- unicode-2-0-utf-8
-
-# The ICU 2.2 UTF-16/32 converters detect and write a BOM.
-UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA }
- ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive
- ibm-1205 { IBM } # UTF-16 BOM sensitive
- unicode
- csUnicode
- ucs-2
-# The following Unicode CCSIDs (IBM) are not valid in ICU because they are
-# considered pure DBCS (exactly 2 bytes) of Unicode,
-# and they are a subset of Unicode. ICU does not support their encoding structures.
-# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688
-UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA }
- UnicodeBigUnmarked { JAVA } # java.io name
- ibm-1200 { IBM* } # UTF-16 BE with IBM PUA
- ibm-1201 { IBM } # UTF-16 BE
- ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA
- ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE
- ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA
- ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE
- ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA
- ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE
- ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA
- ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE
- ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA
- ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE
- ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA
- ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA
- windows-1201 { WINDOWS* }
- cp1200
- cp1201
- UTF16_BigEndian
- # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used)
- # iso-10646-ucs-2 { JAVA } # This is ambiguous
- # ibm-61952 is not a valid CCSID because it's Unicode 1.1
- # ibm-61953 is not a valid CCSID because it's Unicode 1.0
-UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA }
- UnicodeLittleUnmarked { JAVA } # java.io name
- ibm-1202 { IBM* } # UTF-16 LE with IBM PUA
- ibm-1203 { IBM } # UTF-16 LE
- ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA
- ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE
- ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA
- ibm-17587 { IBM } # Unicode 3.0, UTF-16 LE
- ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA
- ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE
- ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA
- ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE
- ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA
- ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE
- UTF16_LittleEndian
- windows-1200 { WINDOWS* }
-
-UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA }
- ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive
- ibm-1237 { IBM } # UTF-32 BOM sensitive
- csUCS4
- ucs-4
-UTF-32BE { IANA* } UTF32_BigEndian
- ibm-1232 { IBM* } # UTF-32 BE with IBM PUA
- ibm-1233 { IBM } # UTF-32 BE
- ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA
-UTF-32LE { IANA* } UTF32_LittleEndian
- ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA
- ibm-1235 { IBM } # UTF-32 LE
-
-# ICU-specific names for special uses
-UTF16_PlatformEndian
-UTF16_OppositeEndian
-
-UTF32_PlatformEndian
-UTF32_OppositeEndian
-
-
-# Java-specific, non-Unicode-standard UTF-16 variants.
-# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)".
-# See the "Supported Encodings" at
-# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html
-# or a newer version of this document.
-#
-# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs.
-# Aliases marked with { JAVA } are canonical names for the java.nio API.
-#
-# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific
-# byte sequence for U+FEFF.
-# "Reverse BOM" means the BOM for the sibling encoding scheme with the
-# opposite endianness. (LE<->BE)
-
-# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order,
-# with byte-order mark"
-#
-# From Unicode: Writes BOM.
-# To Unicode: Detects and consumes BOM.
-# If there is a "reverse BOM", Java throws
-# MalformedInputException: Incorrect byte-order mark.
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
-# and a UCNV_ILLEGAL UConverterCallbackReason.
-UTF-16BE,version=1 UnicodeBig { JAVA* }
-
-# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order,
-# with byte-order mark"
-#
-# From Unicode: Writes BOM.
-# To Unicode: Detects and consumes BOM.
-# If there is a "reverse BOM", Java throws
-# MalformedInputException: Incorrect byte-order mark.
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
-# and a UCNV_ILLEGAL UConverterCallbackReason.
-UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA }
-
-# This one is not mentioned on the "Supported Encodings" page
-# but is available in Java.
-# In Java, this is called "Unicode" but we cannot give it that alias
-# because the standard UTF-16 converter already has a "unicode" alias.
-#
-# From Unicode: Writes BOM.
-# To Unicode: Detects and consumes BOM.
-# If there is no BOM, rather than defaulting to BE, Java throws
-# MalformedInputException: Missing byte-order mark.
-# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
-# and a UCNV_ILLEGAL UConverterCallbackReason.
-UTF-16,version=1
-
-# This is the same as standard UTF-16 but always writes a big-endian byte stream,
-# regardless of the platform endianness, as expected by the Java compatibility tests.
-# See the java.nio.charset.Charset API documentation at
-# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
-# or a newer version of this document.
-#
-# From Unicode: Write BE BOM and BE bytes
-# To Unicode: Detects and consumes BOM. Defaults to BE.
-UTF-16,version=2
-
-# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
-# Presumably, these behave analogously to the UTF-16 variants with similar names.
-# UTF_32BE_BOM x-UTF-32BE-BOM
-# UTF_32LE_BOM x-UTF-32LE-BOM
-
-# End of Java-specific, non-Unicode-standard UTF variants.
-
-
-# On UTF-7:
-# RFC 2152 (http://www.imc.org/rfc2152) allows to encode some US-ASCII
-# characters directly or in base64. Especially, the characters in set O
-# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly
-# but are not allowed in, e.g., email headers.
-# By default, the ICU UTF-7 converter encodes set O directly.
-# By choosing the option "version=1", set O will be escaped instead.
-# For example:
-# utf7Converter=ucnv_open("UTF-7,version=1");
-#
-# For details about email headers see RFC 2047.
-UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* }
- unicode-1-1-utf-7
- unicode-2-0-utf-7
-
-# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference.
-#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM }
-
-# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
-# It is a substantially modified UTF-7 encoding. See the specification in:
-#
-# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
-# (http://www.ietf.org/rfc/rfc2060.txt)
-# Section 5.1.3. Mailbox International Naming Convention
-IMAP-mailbox-name
-
-SCSU { IANA* }
- ibm-1212 { IBM } # SCSU with IBM PUA
- ibm-1213 { IBM* } # SCSU
-BOCU-1 { IANA* }
- csBOCU-1 { IANA }
- ibm-1214 { IBM } # BOCU-1 with IBM PUA
- ibm-1215 { IBM* } # BOCU-1
-
-# See https://www.unicode.org/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
-# The Unicode Consortium does not encourage the use of CESU-8
-CESU-8 { IANA* } ibm-9400 { IBM* }
-
-# Standard iso-8859-1, which does not have the Euro update.
-# See iso-8859-15 (latin9) for the Euro update
-ISO-8859-1 { MIME* IANA JAVA* }
- ibm-819 { IBM* JAVA } # This is not truely ibm-819 because it's missing the fallbacks.
- IBM819 { IANA }
- cp819 { IANA JAVA }
- latin1 { IANA JAVA }
- 8859_1 { JAVA }
- csISOLatin1 { IANA JAVA }
- iso-ir-100 { IANA JAVA }
- ISO_8859-1:1987 { IANA* JAVA }
- l1 { IANA JAVA }
- 819 { JAVA }
- # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct.
- # LATIN_1 # Old ICU name
- # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1.
-
-US-ASCII { MIME* IANA JAVA WINDOWS }
- ASCII { JAVA* IANA WINDOWS }
- ANSI_X3.4-1968 { IANA* WINDOWS }
- ANSI_X3.4-1986 { IANA WINDOWS }
- ISO_646.irv:1991 { IANA WINDOWS }
- iso_646.irv:1983 { JAVA }
- ISO646-US { JAVA IANA WINDOWS }
- us { IANA }
- csASCII { IANA WINDOWS }
- iso-ir-6 { IANA }
- cp367 { IANA WINDOWS }
- ascii7 { JAVA }
- 646 { JAVA }
- windows-20127 { WINDOWS* }
- ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truely ibm-367 because it's missing the fallbacks.
-
-# GB 18030 is partly algorithmic, using the MBCS converter
-gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* }
-
-# Table-based interchange codepages
-
-# Central Europe
-ibm-912_P100-1995 { UTR22* }
- ibm-912 { IBM* JAVA }
- ISO-8859-2 { MIME* IANA JAVA* WINDOWS }
- ISO_8859-2:1987 { IANA* WINDOWS JAVA }
- latin2 { IANA WINDOWS JAVA }
- csISOLatin2 { IANA WINDOWS JAVA }
- iso-ir-101 { IANA WINDOWS JAVA }
- l2 { IANA WINDOWS JAVA }
- 8859_2 { JAVA }
- cp912 { JAVA }
- 912 { JAVA }
- windows-28592 { WINDOWS* }
-
-# Maltese Esperanto
-ibm-913_P100-2000 { UTR22* }
- ibm-913 { IBM* JAVA }
- ISO-8859-3 { MIME* IANA WINDOWS JAVA* }
- ISO_8859-3:1988 { IANA* WINDOWS JAVA }
- latin3 { IANA JAVA WINDOWS }
- csISOLatin3 { IANA WINDOWS }
- iso-ir-109 { IANA WINDOWS JAVA }
- l3 { IANA WINDOWS JAVA }
- 8859_3 { JAVA }
- cp913 { JAVA }
- 913 { JAVA }
- windows-28593 { WINDOWS* }
-
-# Baltic
-ibm-914_P100-1995 { UTR22* }
- ibm-914 { IBM* JAVA }
- ISO-8859-4 { MIME* IANA WINDOWS JAVA* }
- latin4 { IANA WINDOWS JAVA }
- csISOLatin4 { IANA WINDOWS JAVA }
- iso-ir-110 { IANA WINDOWS JAVA }
- ISO_8859-4:1988 { IANA* WINDOWS JAVA }
- l4 { IANA WINDOWS JAVA }
- 8859_4 { JAVA }
- cp914 { JAVA }
- 914 { JAVA }
- windows-28594 { WINDOWS* }
-
-# Cyrillic
-ibm-915_P100-1995 { UTR22* }
- ibm-915 { IBM* JAVA }
- ISO-8859-5 { MIME* IANA WINDOWS JAVA* }
- cyrillic { IANA WINDOWS JAVA }
- csISOLatinCyrillic { IANA WINDOWS JAVA }
- iso-ir-144 { IANA WINDOWS JAVA }
- ISO_8859-5:1988 { IANA* WINDOWS JAVA }
- 8859_5 { JAVA }
- cp915 { JAVA }
- 915 { JAVA }
- windows-28595 { WINDOWS* }
-
-glibc-PT154-2.3.3 { UTR22* }
- PTCP154 { IANA* }
- csPTCP154
- PT154
- CP154
- Cyrillic-Asian
-
-# Arabic
-# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently
-# From a narrow mapping point of view, there is no difference.
-# -E means explicit. -I means implicit.
-# -E requires the client to handle the ISO 6429 bidirectional controls
-ibm-1089_P100-1995 { UTR22* }
- ibm-1089 { IBM* JAVA }
- ISO-8859-6 { MIME* IANA WINDOWS JAVA* }
- arabic { IANA WINDOWS JAVA }
- csISOLatinArabic { IANA WINDOWS JAVA }
- iso-ir-127 { IANA WINDOWS JAVA }
- ISO_8859-6:1987 { IANA* WINDOWS JAVA }
- ECMA-114 { IANA JAVA }
- ASMO-708 { IANA JAVA }
- 8859_6 { JAVA }
- cp1089 { JAVA }
- 1089 { JAVA }
- windows-28596 { WINDOWS* }
- ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
- ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
- x-ISO-8859-6S { JAVA }
-
-# ISO Greek (with euro update). This is really ISO_8859-7:2003
-ibm-9005_X110-2007 { UTR22* }
- ibm-9005 { IBM* }
- ISO-8859-7 { MIME* IANA JAVA* WINDOWS }
- 8859_7 { JAVA }
- greek { IANA JAVA WINDOWS }
- greek8 { IANA JAVA WINDOWS }
- ELOT_928 { IANA JAVA WINDOWS }
- ECMA-118 { IANA JAVA WINDOWS }
- csISOLatinGreek { IANA JAVA WINDOWS }
- iso-ir-126 { IANA JAVA WINDOWS }
- ISO_8859-7:1987 { IANA* JAVA WINDOWS }
- windows-28597 { WINDOWS* }
- sun_eu_greek # For Solaris
-
-# ISO Greek (w/o euro update)
-# JDK 1.5 has these aliases.
-ibm-813_P100-1995 { UTR22* }
- ibm-813 { IBM* JAVA* }
- cp813 { JAVA }
- 813 { JAVA }
-
-# hebrew
-# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently
-# From a narrow mapping point of view, there is no difference.
-# -E means explicit. -I means implicit.
-# -E requires the client to handle the ISO 6429 bidirectional controls
-# This matches the official mapping on unicode.org
-ibm-5012_P100-1999 { UTR22* }
- ibm-5012 { IBM* }
- ISO-8859-8 { MIME* IANA WINDOWS JAVA* }
- hebrew { IANA WINDOWS JAVA }
- csISOLatinHebrew { IANA WINDOWS JAVA }
- iso-ir-138 { IANA WINDOWS JAVA }
- ISO_8859-8:1988 { IANA* WINDOWS JAVA }
- ISO-8859-8-I { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied.
- ISO-8859-8-E { IANA MIME } # IANA and Windows considers this alias different and BiDi needs to be applied.
- 8859_8 { JAVA }
- windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings.
- hebrew8 # Reflect HP-UX code page update
-
-# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012
-# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors
-ibm-916_P100-1995 { UTR22* }
- ibm-916 { IBM* JAVA* }
- cp916 { JAVA }
- 916 { JAVA }
-
-# Turkish
-ibm-920_P100-1995 { UTR22* }
- ibm-920 { IBM* JAVA }
- ISO-8859-9 { MIME* IANA WINDOWS JAVA* }
- latin5 { IANA WINDOWS JAVA }
- csISOLatin5 { IANA JAVA }
- iso-ir-148 { IANA WINDOWS JAVA }
- ISO_8859-9:1989 { IANA* WINDOWS }
- l5 { IANA WINDOWS JAVA }
- 8859_9 { JAVA }
- cp920 { JAVA }
- 920 { JAVA }
- windows-28599 { WINDOWS* }
- ECMA-128 # IANA doesn't have this alias 6/24/2002
- turkish8 # Reflect HP-UX codepage update 8/1/2008
- turkish # Reflect HP-UX codepage update 8/1/2008
-
-# Nordic languages
-iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* }
- iso-ir-157 { IANA }
- l6 { IANA }
- ISO_8859-10:1992 { IANA }
- csISOLatin6 { IANA }
- latin6 { IANA }
-
-# Thai
-# Be warned. There several iso-8859-11 codepage variants, and they are all incompatible.
-# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes.
-iso-8859_11-2001 { UTR22* } ISO-8859-11
- thai8 # HP-UX alias. HP-UX says TIS-620, but it's closer to ISO-8859-11.
- x-iso-8859-11 { JAVA* }
-
-# iso-8859-13, PC Baltic (w/o euro update)
-ibm-921_P100-1995 { UTR22* }
- ibm-921 { IBM* }
- ISO-8859-13 { IANA* MIME* JAVA* }
- 8859_13 { JAVA }
- windows-28603 { WINDOWS* }
- cp921
- 921
- x-IBM921 { JAVA }
-
-# Celtic
-iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* }
- iso-ir-199 { IANA }
- ISO_8859-14:1998 { IANA }
- latin8 { IANA }
- iso-celtic { IANA }
- l8 { IANA }
-
-# Latin 9
-ibm-923_P100-1998 { UTR22* }
- ibm-923 { IBM* JAVA }
- ISO-8859-15 { IANA* MIME* WINDOWS JAVA* }
- Latin-9 { IANA WINDOWS }
- l9 { WINDOWS }
- 8859_15 { JAVA }
- latin0 { JAVA }
- csisolatin0 { JAVA }
- csisolatin9 { JAVA }
- iso8859_15_fdis { JAVA }
- cp923 { JAVA }
- 923 { JAVA }
- windows-28605 { WINDOWS* }
-
-# CJK encodings
-
-ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old)
- ibm-942 { IBM* }
- ibm-932 { IBM }
- cp932
- shift_jis78
- sjis78
- ibm-942_VSUB_VPUA
- ibm-932_VSUB_VPUA
- x-IBM942 { JAVA* }
- x-IBM942C { JAVA }
- # Is this "JIS_C6226-1978"?
-
-# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings:
-# - the usual IBM PC control code rotation (1A-1C-7F)
-# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA
-ibm-943_P15A-2003 { UTR22* }
- ibm-943 # Leave untagged because this isn't the default
- Shift_JIS { IANA* MIME* WINDOWS JAVA }
- MS_Kanji { IANA WINDOWS JAVA }
- csShiftJIS { IANA WINDOWS JAVA }
- windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
- csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
- x-sjis { WINDOWS JAVA }
- x-ms-cp932 { WINDOWS }
- cp932 { WINDOWS }
- windows-932 { WINDOWS* }
- cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same.
- IBM-943C #{ AIX* } # Add this tag once AIX aliases becomes available
- ms932
- pck # Probably SOLARIS
- sjis # This might be for ibm-1351
- ibm-943_VSUB_VPUA
- x-MS932_0213 { JAVA }
- x-JISAutoDetect { JAVA }
- # cp943 # This isn't Windows, and no one else uses it.
- # IANA says that Windows-31J is an extension to csshiftjis ibm-932
-ibm-943_P130-1999 { UTR22* }
- ibm-943 { IBM* JAVA }
- Shift_JIS # Leave untagged because this isn't the default
- cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same.
- 943 { JAVA }
- ibm-943_VASCII_VSUB_VPUA
- x-IBM943 { JAVA }
- # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe
-ibm-33722_P12A_P12A-2009_U2 { UTR22* }
- ibm-33722 # Leave untagged because this isn't the default
- ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct
- ibm-33722_VPUA
- IBM-eucJP
-windows-51932-2006 { UTR22* }
- windows-51932 { WINDOWS* }
- CP51932 { IANA* }
- csCP51932
-ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping
- ibm-33722 { IBM* JAVA }
- ibm-5050 { IBM } # Yes this is correct
- cp33722 { JAVA* }
- 33722 { JAVA }
- ibm-33722_VASCII_VPUA
- x-IBM33722 { JAVA }
- x-IBM33722A { JAVA }
- x-IBM33722C { JAVA }
-# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350
-# ibm-1350 seems to be almost a superset of ibm-33722
-# ibm-954 contains more PUA characters than the others.
-ibm-954_P101-2007 { UTR22* }
- ibm-954 { IBM* }
- x-IBM954 { JAVA* }
- x-IBM954C { JAVA }
- # eucJP # This is closest to Solaris EUC-JP.
-euc-jp-2007 { UTR22* }
- EUC-JP { MIME* IANA JAVA* WINDOWS* }
- Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* JAVA WINDOWS }
- csEUCPkdFmtJapanese { IANA JAVA WINDOWS }
- X-EUC-JP { MIME JAVA WINDOWS } # Japan EUC. x-euc-jp is a MIME name
- eucjis {JAVA}
- ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged.
-
-aix-IBM_udcJP-4.3.6 { UTR22* }
- x-IBM-udcJP { JAVA* }
-
-java-euc_jp_linux-1.6_P { UTR22* }
- euc-jp-linux
- x-EUC_JP_LINUX { JAVA* }
-
-java-sjis_0213-1.6_P { UTR22* }
- x-SJIS_0213 { JAVA* }
-
-# Here are various interpretations and extensions of Big5
-ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions
- ibm-1373 { IBM* }
- windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
-windows-950-2000 { UTR22* }
- Big5 { IANA* MIME* JAVA* WINDOWS }
- csBig5 { IANA WINDOWS }
- windows-950 { WINDOWS* }
- x-windows-950 { JAVA }
- x-big5
- ms950
-ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update)
- ibm-950 { IBM* JAVA }
- cp950 { JAVA* }
- 950 { JAVA }
- x-IBM950 { JAVA }
-ibm-1375_P100-2008 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters.
- ibm-1375 { IBM* }
- Big5-HKSCS { IANA* JAVA* }
- big5hk { JAVA }
- HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/
-ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters.
- ibm-5471 { IBM* }
- Big5-HKSCS
- MS950_HKSCS { JAVA* }
- hkbig5 # from HP-UX 11i, which can't handle supplementary characters.
- big5-hkscs:unicode3.0
- x-MS950-HKSCS { JAVA }
- # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not.
- # windows-950_hkscs
-solaris-zh_TW_big5-2.7 { UTR22* }
- Big5_Solaris { JAVA* }
- x-Big5-Solaris { JAVA }
-# GBK
-ibm-1386_P100-2001 { UTR22* }
- ibm-1386 { IBM* }
- cp1386
- windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
- ibm-1386_VSUB_VPUA
-windows-936-2000 { UTR22* }
- GBK { IANA* WINDOWS JAVA* }
- CP936 { IANA JAVA }
- MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split.
- windows-936 { IANA WINDOWS* JAVA }
-
-# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging.
-ibm-1383_P110-1999 { UTR22* } # China EUC.
- ibm-1383 { IBM* JAVA }
- GB2312 { IANA* MIME* }
- csGB2312 { IANA }
- cp1383 { JAVA* }
- 1383 { JAVA }
- EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name
- ibm-eucCN
- hp15CN # From HP-UX?
- ibm-1383_VPUA
- # gb # This is not an IANA name. gb in IANA means Great Britain.
-
-ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022.
- GB_2312-80 { IANA* } # Windows maps this alias incorrectly
- chinese { IANA }
- iso-ir-58 { IANA }
- csISO58GB231280 { IANA }
- gb2312-1980
- GB2312.1980-0 # From X11R6
-
-euc-tw-2014 { UTR22* } # Updated EUC-TW converter based on ibm-964
- EUC-TW
-
-ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name
- ibm-964 { IBM* JAVA }
- ibm-eucTW
- cns11643
- cp964 { JAVA* }
- 964 { JAVA }
- ibm-964_VPUA
- x-IBM964 { JAVA }
-
-# ISO-2022 needs one, and other people may need others.
-ibm-949_P110-1999 { UTR22* }
- ibm-949 { IBM* JAVA }
- cp949 { JAVA* }
- 949 { JAVA }
- ibm-949_VASCII_VSUB_VPUA
- x-IBM949 { JAVA }
-ibm-949_P11A-1999 { UTR22* }
- ibm-949 # Leave untagged because this isn't the default
- cp949c { JAVA* }
- ibm-949_VSUB_VPUA
- x-IBM949C { JAVA }
- IBM-949C { JAVA }
-
-# Korean EUC.
-#
-# <quote from="Jungshik Shin">
-# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR.
-#
-# Although widely spread on MS Windows, using
-# KS C 5601 or related names to denote EUC-KR or
-# windows-949 is very much misleading. KS C 5601-1987
-# is NOT suitable as a designation for MIME charset
-# and MBCS. It's just the name of a 94 x 94 Korean
-# coded character set standard which can be invoked
-# on either GL (with MSB reset) or GR (with MSB set).
-# Note that JOHAB (windows-1361) specified in
-# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3)
-# is a _seprate_ MBCS with a _completely different_
-# mapping.
-# </quote>
-#
-# The following aliases tries to mirror the poor state of alias recognition
-# on these platforms.
-#
-# ibm-970 is almost a subset of ibm-1363.
-# Java, Solaris and AIX use euc-kr to also mean ksc5601.
-# Java has both ibm-970 and EUC-KR as separate converters.
-ibm-970_P110_P110-2006_U2 { UTR22* }
- ibm-970 { IBM* JAVA }
- EUC-KR { IANA* MIME* WINDOWS JAVA }
- KS_C_5601-1987 { JAVA }
- windows-51949 { WINDOWS* }
- csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name
- ibm-eucKR { JAVA }
- KSC_5601 { JAVA } # Needed by iso-2022
- 5601 { JAVA }
- cp970 { JAVA* }
- 970 { JAVA }
- ibm-970_VPUA
- x-IBM970 { JAVA }
-
-# ibm-971 is almost the set of DBCS mappings of ibm-970
-ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA x-IBM971 { JAVA* }
-
-# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too.
-# ibm-1363 is almost a superset of ibm-970.
-ibm-1363_P11B-1998 { UTR22* }
- ibm-1363 # Leave untagged because this isn't the default
- KS_C_5601-1987 { IANA* }
- KS_C_5601-1989 { IANA }
- KSC_5601 { IANA }
- csKSC56011987 { IANA }
- korean { IANA }
- iso-ir-149 { IANA }
- cp1363 { MIME* }
- 5601
- ksc
- windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
- ibm-1363_VSUB_VPUA
- x-IBM1363C { JAVA* }
- # ks_x_1001:1992
- # ksc5601-1992
-
-ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping
- ibm-1363 { IBM* }
- ibm-1363_VASCII_VSUB_VPUA
- x-IBM1363 { JAVA* }
-
-windows-949-2000 { UTR22* }
- windows-949 { JAVA* WINDOWS* }
- KS_C_5601-1987 { WINDOWS }
- KS_C_5601-1989 { WINDOWS }
- KSC_5601 { MIME* WINDOWS } # Needed by iso-2022
- csKSC56011987 { WINDOWS }
- korean { WINDOWS }
- iso-ir-149 { WINDOWS }
- ms949 { JAVA }
- x-KSC5601 { JAVA }
-
-windows-1361-2000 { UTR22* }
- ksc5601_1992
- ms1361
- johab
- x-Johab { JAVA* }
-
-windows-874-2000 { UTR22* } # Thai (w/ euro update)
- TIS-620 { WINDOWS }
- windows-874 { JAVA* WINDOWS* }
- MS874 { JAVA }
- x-windows-874 { JAVA }
- # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match.
-
-ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update).
- ibm-874 { IBM* JAVA }
- ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update.
- cp874 { JAVA* }
- TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table
- tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table
- eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620
- x-IBM874 { JAVA }
-
-ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update)
- ibm-1162 { IBM* }
-
-windows-864-2000 { UTR22* }
- ibm-864s
- cp864s
- x-IBM864S { JAVA* }
-
-# Platform codepages
-# If Java supports the IBM prefix, it should also support the ibm- prefix too.
-ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US
-ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } x-IBM720 { JAVA* } # PC Arabic
-ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } x-IBM737 { JAVA } # PC Greek
-ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic
-ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1
-ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro)
-ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update)
-ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update)
-ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } x-IBM856 { JAVA } # PC Hebrew implicit order
-ibm-857_P100-1995 { UTR22* } ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update)
-ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro
-ibm-860_P100-1995 { UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal
-ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland
-ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update)
-ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French
-ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update)
-ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic
-ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update)
-ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } x-IBM867 { JAVA* } # PC Hebrew (w/ euro update) Updated version of ibm-862
-ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu
-ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update)
-ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet
-ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # PC Baltic (w/ euro update), update of ibm-921
-ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922
-ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } x-IBM922 { JAVA } # PC Estonian (w/o euro update)
-ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same.
-ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813
-
-# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows.
-# cp is usually used to denote IBM in Java, and that is why we don't do that anymore.
-# The windows-* aliases mean windows codepages.
-ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update)
-ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). ANSI1251 is from Solaris
-ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update)
-ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update)
-ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update)
-ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update)
-ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } x-windows-1256S { JAVA } # Windows Arabic (w/ euro update)
-ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update)
-ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update)
-
-# These tables are out of date, and most don't have the Euro
-# Leave the windows- variants untagged. They are alternate tables of the newer ones above.
-ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update)
-ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update)
-ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro
-ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update)
-ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update)
-ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update)
-ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update)
-ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update)
-ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update)
-ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update)
-ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update)
-ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update)
-
-macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1
-macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr x-MacGreek { JAVA* } # Apple Greek
-macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy x-MacCyrillic { JAVA } x-MacUkraine { JAVA* } # Apple Cyrillic
-macos-21-10.5 { UTR22* } x-mac-thai { MIME* } x-MacThai { JAVA* } MacThai { JAVA }
-macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope x-MacCentralEurope { JAVA* } # Apple Central Europe
-macos-33-10.5 { UTR22* } x-mac-symbol { MIME* } x-MacSymbol { JAVA* } MacSymbol { JAVA }
-macos-34-10.2 { UTR22* } x-mac-dingbat { MIME* } x-MacDingbat { JAVA* } MacDingbat { JAVA }
-macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr x-MacTurkish { JAVA* } # Apple Turkish
-macos-36_2-10.2 { UTR22* } x-mac-croatian { MIME* } x-MacCroatian { JAVA* } MacCroatian { JAVA }
-macos-37_5-10.2 { UTR22* } x-mac-iceland { MIME* } x-MacIceland { JAVA* } MacIceland { JAVA }
-macos-38_2-10.2 { UTR22* } x-mac-romania { MIME* } x-MacRomania { JAVA* } MacRomania { JAVA }
-macos-518-10.2 { UTR22* } x-mac-arabic { MIME* } x-MacArabic { JAVA* } MacArabic { JAVA }
-macos-1285-10.2 { UTR22* } x-mac-hebrew { MIME* } x-MacHebrew { JAVA* } MacHebrew { JAVA }
-
-ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } x-roman8 { JAVA* } # HP Latin1
-ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276)
-
-ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } x-IBM1006 { JAVA } # Urdu
-ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } x-IBM1098 { JAVA } # PC Farsi
-ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } x-IBM1124 { JAVA } # ISO Cyrillic Ukraine
-ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC
-ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese
-ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC
-ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao
-
-# GSM 03.38
-gsm-03.38-2009 { UTR22* } GSM0338 # GSM0338 alias is from Perl
-
-# Partially algorithmic converters
-
-# [U_ENABLE_GENERIC_ISO_2022]
-# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8).
-# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file.
-# Language-specific variants of ISO-2022 continue to be available as listed below.
-# ISO_2022 ISO-2022
-
-ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } x-windows-iso2022jp { JAVA } x-windows-50220 { JAVA }
-ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS x-windows-50221 { JAVA* }
-ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA }
-ISO_2022,locale=ja,version=3 JIS7
-ISO_2022,locale=ja,version=4 JIS8
-ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949
-ISO_2022,locale=ko,version=1 ibm-25546 { IBM* }
-ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } x-ISO-2022-CN-GB { JAVA }
-ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* }
-ISO_2022,locale=zh,version=2 ISO-2022-CN-CNS x-ISO-2022-CN-CNS { JAVA* }
-HZ HZ-GB-2312 { IANA* }
-x11-compound-text COMPOUND_TEXT x-compound-text { JAVA* }
-
-ISCII,version=0 x-ISCII91 { JAVA* } x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols.
-ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows.
-ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur
-ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj
-ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori
-ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml
-ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg
-ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd
-ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm
-
-# Lotus specific
-LMBCS-1 lmbcs ibm-65025 { IBM* }
-
-# These Lotus specific converters still work, but they aren't advertised in this alias table.
-# These are almost never used outside of Lotus software,
-# and they take a lot of time when creating the available converter list.
-# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU.
-#LMBCS-2
-#LMBCS-3
-#LMBCS-4
-#LMBCS-5
-#LMBCS-6
-#LMBCS-8
-#LMBCS-11
-#LMBCS-16
-#LMBCS-17
-#LMBCS-18
-#LMBCS-19
-
-# EBCDIC codepages according to the CDRA
-
-# without Euro
-ibm-37_P100-1995 { UTR22* } # EBCDIC US
- ibm-37 { IBM* }
- IBM037 { IANA* JAVA }
- ibm-037 # { JAVA }
- ebcdic-cp-us { IANA JAVA }
- ebcdic-cp-ca { IANA JAVA }
- ebcdic-cp-wt { IANA JAVA }
- ebcdic-cp-nl { IANA JAVA }
- csIBM037 { IANA JAVA }
- cp037 { JAVA* }
- 037 { JAVA }
- cpibm37 { JAVA }
- cp37
-
-ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germanay, Austria
-ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark
-ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden
-ibm-280_P100-1995 { UTR22* } ibm-280 { IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy
-ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain
-ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland
-ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana)
-ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France
-ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 { IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes)
-ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew
-ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1
-ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew
-ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias.
-ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2
-ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland
-ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } x-IBM875 { JAVA } # EBCDIC Greek
-ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu
-ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed.
- ibm-930 { IBM* }
- ibm-5026 { IBM } # Yes this is correct
- IBM930 { JAVA }
- cp930 { JAVA* }
- 930 { JAVA }
- x-IBM930 { JAVA }
- x-IBM930A { JAVA }
-ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } x-IBM933 { JAVA } # Korea EBCDIC MIXED
-ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } x-IBM935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China.
-ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } x-IBM937 { JAVA } # Taiwan EBCDIC MIXED
-ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed.
- ibm-939 { IBM* }
- ibm-931 { IBM } # Yes this is correct
- ibm-5035 { IBM } # Yes this is also correct
- IBM939 { JAVA }
- cp939 { JAVA* }
- 939 { JAVA }
- x-IBM939 { JAVA }
- x-IBM939A { JAVA }
-ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } x-IBM1025 { JAVA } # EBCDIC Cyrillic
-ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey
-ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1
-ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } x-IBM1097 { JAVA } # EBCDIC Farsi
-ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } x-IBM1112 { JAVA } # EBCDIC Baltic
-ibm-1114_P100-2001 { UTR22* } ibm-1114 { IBM* } x-IBM1114 { JAVA* }
-ibm-1115_P100-1995 { UTR22* } ibm-1115 { IBM* } x-IBM1115 { JAVA* }
-ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } x-IBM1122 { JAVA } # EBCDIC Estonia
-ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } x-IBM1123 { JAVA } # EBCDIC Cyrillic Ukraine
-ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese
-ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao
-ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set)
-ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. Update of ibm-421
-
-# with Euro
-ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US
-ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germanay, Austria
-ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark
-ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden
-ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy
-ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain
-ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland
-ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP01147 { IANA JAVA } cp1147 { JAVA* } ebcdic-fr-297+euro { IANA } # EBCDIC France
-ibm-1148_P100-1997 { UTR22* } ibm-1148 { IBM* } IBM01148 { IANA* JAVA } CCSID01148 { IANA JAVA } CP01148 { IANA JAVA } cp1148 { JAVA* } ebcdic-international-500+euro { IANA } # EBCDIC International Latin1
-ibm-1149_P100-1997 { UTR22* } ibm-1149 { IBM* } IBM01149 { IANA* JAVA } CCSID01149 { IANA JAVA } CP01149 { IANA JAVA } cp1149 { JAVA* } ebcdic-is-871+euro { IANA } # EBCDIC Iceland
-ibm-1153_P100-1999 { UTR22* } ibm-1153 { IBM* } IBM1153 { JAVA } x-IBM1153 { JAVA* } # EBCDIC latin 2
-ibm-1154_P100-1999 { UTR22* } ibm-1154 { IBM* } # EBCDIC Cyrillic Multilingual
-ibm-1155_P100-1999 { UTR22* } ibm-1155 { IBM* } # EBCDIC Turkey
-ibm-1156_P100-1999 { UTR22* } ibm-1156 { IBM* } # EBCDIC Baltic Multilingual
-ibm-1157_P100-1999 { UTR22* } ibm-1157 { IBM* } # EBCDIC Estonia
-ibm-1158_P100-1999 { UTR22* } ibm-1158 { IBM* } # EBCDIC Cyrillic Ukraine
-ibm-1160_P100-1999 { UTR22* } ibm-1160 { IBM* } # EBCDIC Thailand
-ibm-1164_P100-1999 { UTR22* } ibm-1164 { IBM* } # EBCDIC Viet Nam
-ibm-1364_P110-2007 { UTR22* } ibm-1364 { IBM* } x-IBM1364 { JAVA* } # Korean Host Mixed
-ibm-1370_P100-1999 { UTR22* } ibm-1370 { IBM* } x-IBM1370 { JAVA* }
-ibm-1371_P100-1999 { UTR22* } ibm-1371 { IBM* } x-IBM1371 { JAVA* } # Taiwan EBCDIC MIXED (Euro update of ibm-937)
-ibm-1388_P103-2001 { UTR22* } ibm-1388 { IBM* } ibm-9580 { IBM } x-IBM1388 { JAVA* } # S-Ch DBCS-Host Data GBK EBCDIC_STATEFUL. Yes ibm-9580 is an alias.
-ibm-1390_P110-2003 { UTR22* } ibm-1390 { IBM* } x-IBM1390 { JAVA* } # Japan EBCDIC MIXED (JIS X 0213)
-ibm-1399_P110-2003 { UTR22* } ibm-1399 { IBM* } x-IBM1399 { JAVA* } # Host MBCS (Latin-Kanji) (JIS X 0213)
-ibm-5123_P100-1999 { UTR22* } ibm-5123 { IBM* } # Host Roman Jis. Euro update of ibm-1027. SBCS portion of ibm-1390.
-ibm-8482_P100-1999 { UTR22* } ibm-8482 { IBM* } # host SBCS (Katakana). Euro update of ibm-290. SBCS portion of ibm-1399.
-# Yes ibm-20780 is the same as ibm-16684
-ibm-16684_P110-2003 { UTR22* } ibm-16684 { IBM* } ibm-20780 { IBM } # DBCS Jis + Roman Jis Host. This is the DBCS portion of ibm-1390 and ibm-1399 (JIS X 0213).
-ibm-4899_P100-1998 { UTR22* } ibm-4899 { IBM* } # Old EBCDIC Hebrew. Update of ibm-803
-ibm-4971_P100-1999 { UTR22* } ibm-4971 { IBM* } # EBCDIC Greek. Update of ibm-875 and superceded by ibm-9067
-ibm-9067_X100-2005 { UTR22* } ibm-9067 { IBM* } # EBCDIC Greek. Update of ibm-875 and ibm-4971
-ibm-12712_P100-1998 { UTR22* } ibm-12712 { IBM* } ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update). Update of ibm-424
-ibm-16804_X110-1999 { UTR22* } ibm-16804 { IBM* } ebcdic-ar # EBCDIC Arabic. Update of ibm-420
-
-java-Cp1399A-1.6_P { UTR22* } x-IBM1399A { JAVA* }
-java-Cp420s-1.6_P { UTR22* } x-IBM420S { JAVA* }
-java-Cp1390A-1.6_P { UTR22* } x-IBM1390A { JAVA* }
-
-# EBCDIC codepages for S/390, with LF and NL codes swapped
-# Starting with ICU 2.4, the swapping is done by modifying the
-# normal tables at runtime instead of at build time.
-# Append UCNV_SWAP_LFNL_OPTION_STRING to the "ibm-CCSID" name to select this.
-#
-# Example: "ibm-1047,swaplfnl" or "ibm-1047" UCNV_SWAP_LFNL_OPTION_STRING
-#
-# This avoids the duplication of all EBCDIC SBCS and mixed-SBCS/DBCS
-# mapping files.
-
-# Some examples below for declaring old-style, obsolete aliases with the "-s390"
-# suffix to map to the new-style, recommended names with the option added.
-# These are listed here for backward compatibility.
-# Do not use these; instead use the normal converter name with the option
-# added as recommended above.
-
-# Note: It is not possible to define an alias (non-initial name in a line here)
-# that itself contains a converter option like this one for swapping LF<->NL.
-# Such names would never be found because ucnv_open() will first parse and strip
-# options before looking up a name in this table.
-# ucnv_open() then parses the lookup result (the canonical name on the left
-# in lines here) as well.
-
-# This also means that it is not necessary to add anything to convrtrs.txt
-# for converter names like "ibm-1026,swaplfnl" to work -
-# they are already covered by the normal option parsing together with the
-# regular, option-less alias elsewhere in this file.
-
-ibm-37_P100-1995,swaplfnl ibm-37-s390 # ibm037-s390 also matches ibm-37-s390
-ibm-924_P100-1998,swaplfnl ibm-924-s390 IBM924_LF { JAVA* }
-ibm-1047_P100-1995,swaplfnl ibm-1047-s390 IBM1047_LF { JAVA* }
-ibm-1140_P100-1997,swaplfnl ibm-1140-s390
-ibm-1141_P100-1997,swaplfnl ibm-1141-s390 IBM1141_LF { JAVA* }
-ibm-1142_P100-1997,swaplfnl ibm-1142-s390
-ibm-1143_P100-1997,swaplfnl ibm-1143-s390
-ibm-1144_P100-1997,swaplfnl ibm-1144-s390
-ibm-1145_P100-1997,swaplfnl ibm-1145-s390
-ibm-1146_P100-1997,swaplfnl ibm-1146-s390
-ibm-1147_P100-1997,swaplfnl ibm-1147-s390
-ibm-1148_P100-1997,swaplfnl ibm-1148-s390
-ibm-1149_P100-1997,swaplfnl ibm-1149-s390
-ibm-1153_P100-1999,swaplfnl ibm-1153-s390
-ibm-12712_P100-1998,swaplfnl ibm-12712-s390
-ibm-16804_X110-1999,swaplfnl ibm-16804-s390
-
-# This is a special version of ibm-1140 that the XML4C (Xerces) parser team
-# requested in 2000.
-# It maps both EBCDIC LF and NL controls to Unicode LF U+000A.
-
-ebcdic-xml-us
-
-# These are not installed by default. They are rarely used.
-# Many of them can be added through the online ICU Data Library Customization tool
-
-ibm-1004_P100-1995 { UTR22* } ibm-1004 { IBM* }
-ibm-1008_P100-1995 { UTR22* } ibm-1008 { IBM* } # cp1008, 8-bit Arabic (w/o euro update)
-ibm-1009_P100-1995 { UTR22* } ibm-1009 { IBM* }
-ibm-1010_P100-1995 { UTR22* } ibm-1010 { IBM* } NF_Z_62-010 { IANA* } iso-ir-69 { IANA } ISO646-FR { IANA } fr { IANA } csISO69French { IANA }
-ibm-1011_P100-1995 { UTR22* } ibm-1011 { IBM* } DIN_66003 { IANA* } iso-ir-21 { IANA } de { IANA } ISO646-DE { IANA } csISO21German { IANA }
-ibm-1012_P100-1995 { UTR22* } ibm-1012 { IBM* } IT { IANA* } iso-ir-15 { IANA } ISO646-IT { IANA } csISO15Italian { IANA }
-ibm-1013_P100-1995 { UTR22* } ibm-1013 { IBM* } BS_4730 { IANA* } iso-ir-4 { IANA } ISO646-GB { IANA } gb { IANA } uk { IANA } csISO4UnitedKingdom { IANA }
-ibm-1014_P100-1995 { UTR22* } ibm-1014 { IBM* } ES2 { IANA* } iso-ir-85 { IANA } ISO646-ES2 { IANA } csISO85Spanish2 { IANA }
-ibm-1015_P100-1995 { UTR22* } ibm-1015 { IBM* } PT2 { IANA* } iso-ir-84 { IANA } ISO646-PT2 { IANA } csISO84Portuguese2 { IANA }
-ibm-1016_P100-1995 { UTR22* } ibm-1016 { IBM* } NS_4551-1 { IANA* } iso-ir-60 { IANA } ISO646-NO { IANA } no { IANA } csISO60DanishNorwegian { IANA } csISO60Norwegian1 { IANA }
-ibm-1017_P100-1995 { UTR22* } ibm-1017 { IBM* }
-ibm-1018_P100-1995 { UTR22* } ibm-1018 { IBM* } SEN_850200_B { IANA* } iso-ir-10 { IANA } FI { IANA } ISO646-FI { IANA } ISO646-SE { IANA } se { IANA } csISO10Swedish { IANA }
-ibm-1019_P100-1995 { UTR22* } ibm-1019 { IBM* }
-ibm-1020_P100-2003 { UTR22* } ibm-1020 { IBM* } CSA_Z243.4-1985-1 { IANA* } iso-ir-121 { IANA } ISO646-CA { IANA } csa7-1 { IANA } ca { IANA } csISO121Canadian1 { IANA }
-ibm-1021_P100-2003 { UTR22* } ibm-1021 { IBM* }
-ibm-1023_P100-2003 { UTR22* } ibm-1023 { IBM* } ES { IANA* } iso-ir-17 { IANA } ISO646-ES { IANA } csISO17Spanish { IANA }
-ibm-1027_P100-1995 { UTR22* } ibm-1027 { IBM* } x-IBM1027 { JAVA* }
-ibm-1041_P100-1995 { UTR22* } ibm-1041 { IBM* } x-IBM1041 { JAVA* }
-ibm-1043_P100-1995 { UTR22* } ibm-1043 { IBM* } x-IBM1043 { JAVA* }
-ibm-1046_X110-1999 { UTR22* } ibm-1046 { IBM* } x-IBM1046 { JAVA* } x-IBM1046S { JAVA } # Arabic
-ibm-1088_P100-1995 { UTR22* } ibm-1088 { IBM* } x-IBM1088 { JAVA* }
-ibm-1100_P100-2003 { UTR22* } ibm-1100 { IBM* } DEC-MCS { IANA* } dec { IANA } csDECMCS { IANA }
-ibm-1101_P100-2003 { UTR22* } ibm-1101 { IBM* }
-ibm-1102_P100-2003 { UTR22* } ibm-1102 { IBM* }
-ibm-1103_P100-2003 { UTR22* } ibm-1103 { IBM* }
-ibm-1104_P100-2003 { UTR22* } ibm-1104 { IBM* } NF_Z_62-010_1973 iso-ir-25 { IANA* } ISO646-FR1 { IANA } csISO25French { IANA } # NF_Z_62-010_(1973) is the real IANA alias, but () aren't invariant characters.
-ibm-1105_P100-2003 { UTR22* } ibm-1105 { IBM* }
-ibm-1106_P100-2003 { UTR22* } ibm-1106 { IBM* }
-ibm-1107_P100-2003 { UTR22* } ibm-1107 { IBM* } DS_2089 { IANA* } ISO646-DK { IANA } dk { IANA } csISO646Danish { IANA }
-ibm-1127_P100-2004 { UTR22* } ibm-1127 { IBM* }
-ibm-1161_P100-1999 { UTR22* } ibm-1161 { IBM* } # Thai (Euro update of ibm-1129)
-ibm-1163_P100-1999 { UTR22* } ibm-1163 { IBM* } # Vietnamese
-ibm-1165_P101-2000 { UTR22* } ibm-1165 { IBM* } # Vietnamese (EBCDIC)
-ibm-1166_P100-2002 { UTR22* } ibm-1166 { IBM* } # Cyrillic for Kazakhstan
-ibm-1167_P100-2002 { UTR22* } ibm-1167 { IBM* } KOI8-RU x-KOI8_RU { JAVA* }
-ibm-1174_X100-2007 { UTR22* } ibm-1174 { IBM* } KZ-1048 { IANA* } STRK1048-2002 { IANA } RK1048 { IANA } csKZ1048 { IANA }
-ibm-1277_P100-1995 { UTR22* } ibm-1277 { IBM* } # Adobe (Postscript) Latin-1
-ibm-13125_P100-1997 { UTR22* } ibm-13125 { IBM* } # S-Ch (DBCS subset of ibm-4933, ibm-1388)
-ibm-13140_P101-2000 { UTR22* } ibm-13140 { IBM* }
-ibm-13218_P100-1996 { UTR22* } ibm-13218 { IBM* } # Japanese (EBCDIC update of ibm-930)
-ibm-1350_P110-1997 { UTR22* } ibm-1350 { IBM* } x-eucJP-Open { JAVA* } eucJP-Open { JAVA } # Japanese (EUC-JP variant)
-ibm-1351_P110-1997 { UTR22* } ibm-1351 { IBM* } x-IBM1351 { JAVA* } # Japanese (DBCS subset of ibm-5039)
-ibm-1362_P110-1999 { UTR22* } ibm-1362 { IBM* } x-IBM1362 { JAVA* } # Korean (DBCS subset of ibm-1363)
-ibm-13676_P102-2001 { UTR22* } ibm-13676 { IBM* } # Simplified Chinese (EBCDIC)
-ibm-1380_P100-1995 { UTR22* } ibm-1380 { IBM* } x-IBM1380 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1381)
-ibm-1381_P110-1999 { UTR22* } ibm-1381 { IBM* JAVA } cp1381 { JAVA* } 1381 { JAVA } x-IBM1381 { JAVA } # Simplified Chinese PC Data mixed (IBM GB)
-ibm-1382_P100-1995 { UTR22* } ibm-1382 { IBM* } x-IBM1382 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1383)
-ibm-17221_P100-2001 { UTR22* } ibm-17221 { IBM* } # Simplified Chinese (EBCDIC)
-ibm-17248_X110-1999 { UTR22* } ibm-17248 { IBM* } # PC Arabic (w/ euro update) Updated version of ibm-864
-ibm-21344_P101-2000 { UTR22* } ibm-21344 { IBM* } # PC Arabic. Updated version of ibm-864
-ibm-21427_P100-1999 { UTR22* } ibm-21427 { IBM* } # Traditional Chinese (DBCS subset of ibm-1370)
-ibm-256_P100-1995 { UTR22* } ibm-256 { IBM* } # Latin 1 EBCDIC
-ibm-259_P100-1995 { UTR22* } ibm-259 { IBM* } IBM-Symbols { IANA* } csIBMSymbols { IANA }
-ibm-274_P100-2000 { UTR22* } ibm-274 { IBM* } IBM274 { IANA* } EBCDIC-BE { IANA } CP274 { IANA } csIBM274 { IANA }
-ibm-275_P100-1995 { UTR22* } ibm-275 { IBM* } IBM275 { IANA* } EBCDIC-BR { IANA } cp275 { IANA } csIBM275 { IANA }
-ibm-286_P100-2003 { UTR22* } ibm-286 { IBM* } EBCDIC-AT-DE-A { IANA* } csEBCDICATDEA { IANA }
-ibm-293_P100-1995 { UTR22* } ibm-293 { IBM* } # APL EBCDIC (APL: A Programming Language)
-ibm-300_P120-2006 { UTR22* } ibm-300 { IBM* } x-IBM300 { JAVA* } # Japanese (DBCS subset of ibm-930 and ibm-939)
-ibm-301_P110-1997 { UTR22* } ibm-301 { IBM* } x-IBM301 { JAVA* } # Japanese (DBCS subset of ibm-943)
-ibm-33058_P100-2000 { UTR22* } ibm-33058 { IBM* } # SBCS (Katakana)
-ibm-425_P101-2000 { UTR22* } ibm-425 { IBM* } # Arabic (EBCDIC)
-ibm-4930_P110-1999 { UTR22* } ibm-4930 { IBM* } # Korean (DBCS subset of ibm-1364)
-ibm-4933_P100-2002 { UTR22* } ibm-4933 { IBM* } # S-Ch (DBCS subset of ibm-1388)
-ibm-4948_P100-1995 { UTR22* } ibm-4948 { IBM* }
-ibm-4951_P100-1995 { UTR22* } ibm-4951 { IBM* }
-ibm-4952_P100-1995 { UTR22* } ibm-4952 { IBM* }
-ibm-4960_P100-1995 { UTR22* } ibm-4960 { IBM* }
-ibm-5039_P11A-1998 { UTR22* } ibm-5039 { IBM* } # Japanese (HP Shift-JIS variant)
-ibm-5048_P100-1995 { UTR22* } ibm-5048 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X208-1990)
-ibm-5049_P100-1995 { UTR22* } ibm-5049 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X212)
-ibm-5067_P100-1995 { UTR22* } ibm-5067 { IBM* } # Korean (DBCS subset of ibm-21450)
-ibm-5104_X110-1999 { UTR22* } ibm-5104 { IBM* } # cp1008, 8-bit Arabic (w/ euro update)
-ibm-5233_P100-2011 { UTR22* } ibm-5233 { IBM* } # Devanagari EBCDIC, including Indian Rupee
-ibm-806_P100-1998 { UTR22* } ibm-806 { IBM* } # Hindi (ISCII variant)
-ibm-808_P100-1999 { UTR22* } ibm-808 { IBM* } x-IBM808 { JAVA* } # Cyrillic
-ibm-833_P100-1995 { UTR22* } ibm-833 { IBM* } x-IBM833 { JAVA* }
-ibm-834_P100-1995 { UTR22* } ibm-834 { IBM* } x-IBM834 { JAVA* } # Korean (DBCS subset of ibm-933)
-ibm-835_P100-1995 { UTR22* } ibm-835 { IBM* } x-IBM835 { JAVA* } # Traditional Chinese (DBCS subset of ibm-5033)
-ibm-836_P100-1995 { UTR22* } ibm-836 { IBM* } x-IBM836 { JAVA* }
-ibm-837_P100-2011 { UTR22* } ibm-837 { IBM* } x-IBM837 { JAVA* } # Simplified Chinese (DBCS subset of ibm-5031)
-ibm-848_P100-1999 { UTR22* } ibm-848 { IBM* } # Cyrillic (euro update of ibm-1125)
-ibm-849_P100-1999 { UTR22* } ibm-849 { IBM* } # Cyrillic Belarus (euro update of ibm-1131)
-ibm-859_P100-1999 { UTR22* } ibm-859 { IBM* } x-IBM859 { JAVA* } # PC Latin 9 (w/ euro update)
-ibm-8612_P100-1995 { UTR22* } ibm-8612 { IBM* } # Arabic (EBCDIC update of ibm-420)
-ibm-872_P100-1999 { UTR22* } ibm-872 { IBM* } # Cyrillic (Euro update of ibm-855)
-ibm-880_P100-1995 { UTR22* } ibm-880 { IBM* } IBM880 { IANA* } cp880 { IANA } EBCDIC-Cyrillic { IANA } csIBM880 { IANA } windows-20880 { WINDOWS* }
-ibm-896_P100-1995 { UTR22* } ibm-896 { IBM* } # SBCS Katakana
-ibm-897_P100-1995 { UTR22* } ibm-897 { IBM* } JIS_X0201 { IANA* } X0201 { IANA } csHalfWidthKatakana { IANA } x-IBM897 { JAVA* }
-ibm-9027_P100-1999 { UTR22* } ibm-9027 { IBM* } # DBCS T-Ch Host. Euro update of ibm-835. DBCS portion of ibm-1371.
-ibm-9048_P100-1998 { UTR22* } ibm-9048 { IBM* } # Hebrew (Euro and Sequel update of ibm-856)
-ibm-905_P100-1995 { UTR22* } ibm-905 { IBM* } IBM905 { IANA* } CP905 { IANA } ebcdic-cp-tr { IANA } csIBM905 { IANA } windows-20905 { WINDOWS* }
-ibm-9056_P100-1995 { UTR22* } ibm-9056 { IBM* } # Arabic
-ibm-9061_P100-1999 { UTR22* } ibm-9061 { IBM* } # Greek (w/ euro update)
-ibm-9145_P110-1997 { UTR22* } ibm-9145 { IBM* } # Japanese (DBCS subset of ibm-5050)
-ibm-9238_X110-1999 { UTR22* } ibm-9238 { IBM* } # cp1046, PC Arabic Extended (w/ euro update)
-ibm-924_P100-1998 { UTR22* } ibm-924 { IBM* } IBM00924 { IANA* } CCSID00924 { IANA } CP00924 { IANA } ebcdic-Latin9--euro { IANA }
-ibm-926_P100-2000 { UTR22* } ibm-926 { IBM* } # Korean (DBCS subset of ibm-944)
-ibm-927_P100-1995 { UTR22* } ibm-927 { IBM* } x-IBM927 { JAVA* } # Traditional Chinese (DBCS subset of ibm-948)
-ibm-928_P100-1995 { UTR22* } ibm-928 { IBM* } # Simplified Chinese (DBCS subset of ibm-936)
-ibm-941_P13A-2001 { UTR22* } ibm-941 { IBM* } # DBCS portion of ibm-943
-ibm-944_P100-1995 { UTR22* } ibm-944 { IBM* } # Korean
-ibm-946_P100-1995 { UTR22* } ibm-946 { IBM* } # Simplified Chinese
-ibm-947_P100-1995 { UTR22* } ibm-947 { IBM* } x-IBM947 { JAVA* } # Traditional Chinese (DBCS subset of ibm-950)
-ibm-948_P110-1999 { UTR22* } ibm-948 { IBM* } x-IBM948 { JAVA* } # Traditional Chinese
-ibm-951_P100-1995 { UTR22* } ibm-951 { IBM* } x-IBM951 { JAVA* } # Korean (DBCS subset of ibm-949)
-ibm-952_P110-1997 { UTR22* } ibm-952 { IBM* } x-JIS0208 # Pure DBCS, Japanese EUC, G1 - JIS X208-1990
-ibm-953_P100-2000 { UTR22* } ibm-953 { IBM* } JIS_X0212-1990 { IANA* } # Pure DBCS, Japanese EUC, G3 - JIS X 0212-1990
-ibm-955_P110-1997 { UTR22* } ibm-955 { IBM* } # Pure DBCS, Japanese EUC, G0 - JIS X208-1978
-ibm-9577_P100-2001 { UTR22* } ibm-9577 { IBM* } ibm-1385 { IBM } x-IBM1385 { JAVA* } # ibm-9577 and ibm-1385 are identical DBCS tables.
-iso-8859_16-2001 { UTR22* } ISO-8859-16 { IANA* } iso-ir-226 { IANA } ISO_8859-16:2001 { IANA } latin10 { IANA } l10 { IANA }
-
-# To be considered for listing at a later date for the data library customization tool
-#ibm-1159_P100-1999 { UTR22* } ibm-1159 { IBM* } # SBCS T-Ch Host. Euro update of ibm-28709. This is used in combination with another CCSID mapping.
-#ibm-960_P100-2000 { UTR22* } ibm-960 { IBM* } # Pure DBCS, CNS11643 plane 1
-#ibm-963_P100-1995 { UTR22* } ibm-963 { IBM* } # Pure DBCS, CNS11643 plane 2 Traditional Chinese (DBCS subset of ibm-965)
+++ /dev/null
-target
-corpus
-artifacts
-coverage
+++ /dev/null
-# This file is automatically @generated by Cargo.
-# It is not intended for manual editing.
-version = 3
-
-[[package]]
-name = "adler"
-version = "1.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
-
-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
-[[package]]
-name = "android_system_properties"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "anstream"
-version = "0.6.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
-dependencies = [
- "anstyle",
- "anstyle-parse",
- "anstyle-query",
- "anstyle-wincon",
- "colorchoice",
- "is_terminal_polyfill",
- "utf8parse",
-]
-
-[[package]]
-name = "anstyle"
-version = "1.0.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
-
-[[package]]
-name = "anstyle-parse"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
-dependencies = [
- "utf8parse",
-]
-
-[[package]]
-name = "anstyle-query"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
-dependencies = [
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "anstyle-wincon"
-version = "3.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
-dependencies = [
- "anstyle",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "anyhow"
-version = "1.0.86"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
-
-[[package]]
-name = "arbitrary"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
-
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "autocfg"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
-
-[[package]]
-name = "bitflags"
-version = "2.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
-
-[[package]]
-name = "bumpalo"
-version = "3.16.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
-
-[[package]]
-name = "cc"
-version = "1.0.106"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2"
-dependencies = [
- "jobserver",
- "libc",
- "once_cell",
-]
-
-[[package]]
-name = "cfg-if"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
-
-[[package]]
-name = "chrono"
-version = "0.4.38"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
-dependencies = [
- "android-tzdata",
- "iana-time-zone",
- "js-sys",
- "num-traits",
- "wasm-bindgen",
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "clap"
-version = "4.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d"
-dependencies = [
- "clap_builder",
- "clap_derive",
-]
-
-[[package]]
-name = "clap_builder"
-version = "4.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708"
-dependencies = [
- "anstream",
- "anstyle",
- "clap_lex",
- "strsim",
- "terminal_size",
-]
-
-[[package]]
-name = "clap_derive"
-version = "4.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
-dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "clap_lex"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
-
-[[package]]
-name = "colorchoice"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
-
-[[package]]
-name = "core-foundation-sys"
-version = "0.8.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
-
-[[package]]
-name = "crc32fast"
-version = "1.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "encoding_rs"
-version = "0.8.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "equivalent"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
-
-[[package]]
-name = "errno"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
-dependencies = [
- "libc",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "finl_unicode"
-version = "1.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
-
-[[package]]
-name = "flate2"
-version = "1.0.30"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
-dependencies = [
- "crc32fast",
- "miniz_oxide",
-]
-
-[[package]]
-name = "float_next_after"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
-
-[[package]]
-name = "hashbrown"
-version = "0.14.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
-
-[[package]]
-name = "heck"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-
-[[package]]
-name = "hermit-abi"
-version = "0.1.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "hexplay"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
-dependencies = [
- "atty",
- "termcolor",
-]
-
-[[package]]
-name = "iana-time-zone"
-version = "0.1.60"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "wasm-bindgen",
- "windows-core",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
-dependencies = [
- "cc",
-]
-
-[[package]]
-name = "indexmap"
-version = "2.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
-dependencies = [
- "equivalent",
- "hashbrown",
-]
-
-[[package]]
-name = "is_terminal_polyfill"
-version = "1.70.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
-
-[[package]]
-name = "jobserver"
-version = "0.1.31"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "js-sys"
-version = "0.3.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
-dependencies = [
- "wasm-bindgen",
-]
-
-[[package]]
-name = "lazy_static"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
-
-[[package]]
-name = "libc"
-version = "0.2.155"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
-
-[[package]]
-name = "libfuzzer-sys"
-version = "0.4.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
-dependencies = [
- "arbitrary",
- "cc",
- "once_cell",
-]
-
-[[package]]
-name = "linux-raw-sys"
-version = "0.4.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
-
-[[package]]
-name = "log"
-version = "0.4.22"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
-
-[[package]]
-name = "miniz_oxide"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
-dependencies = [
- "adler",
-]
-
-[[package]]
-name = "num"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
-dependencies = [
- "num-bigint",
- "num-complex",
- "num-integer",
- "num-iter",
- "num-rational",
- "num-traits",
-]
-
-[[package]]
-name = "num-bigint"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
-dependencies = [
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-complex"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "num-derive"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "num-integer"
-version = "0.1.46"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "num-iter"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
-dependencies = [
- "autocfg",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-rational"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
-dependencies = [
- "num-bigint",
- "num-integer",
- "num-traits",
-]
-
-[[package]]
-name = "num-traits"
-version = "0.2.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
-dependencies = [
- "autocfg",
-]
-
-[[package]]
-name = "once_cell"
-version = "1.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
-
-[[package]]
-name = "ordered-float"
-version = "3.9.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "proc-macro2"
-version = "1.0.86"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
-dependencies = [
- "unicode-ident",
-]
-
-[[package]]
-name = "pspp"
-version = "1.0.0"
-dependencies = [
- "anyhow",
- "bitflags",
- "chrono",
- "clap",
- "encoding_rs",
- "finl_unicode",
- "flate2",
- "float_next_after",
- "hexplay",
- "indexmap",
- "lazy_static",
- "libc",
- "num",
- "num-derive",
- "num-traits",
- "ordered-float",
- "thiserror",
- "unicase",
- "utf8-decode",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "pspp-fuzz"
-version = "0.0.0"
-dependencies = [
- "libfuzzer-sys",
- "pspp",
-]
-
-[[package]]
-name = "quote"
-version = "1.0.36"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
-dependencies = [
- "proc-macro2",
-]
-
-[[package]]
-name = "rustix"
-version = "0.38.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
-dependencies = [
- "bitflags",
- "errno",
- "libc",
- "linux-raw-sys",
- "windows-sys 0.52.0",
-]
-
-[[package]]
-name = "strsim"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
-
-[[package]]
-name = "syn"
-version = "2.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6"
-dependencies = [
- "proc-macro2",
- "quote",
- "unicode-ident",
-]
-
-[[package]]
-name = "termcolor"
-version = "0.3.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
-dependencies = [
- "wincolor",
-]
-
-[[package]]
-name = "terminal_size"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
-dependencies = [
- "rustix",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "thiserror"
-version = "1.0.61"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
-dependencies = [
- "thiserror-impl",
-]
-
-[[package]]
-name = "thiserror-impl"
-version = "1.0.61"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "unicase"
-version = "2.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
-dependencies = [
- "version_check",
-]
-
-[[package]]
-name = "unicode-ident"
-version = "1.0.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
-
-[[package]]
-name = "utf8-decode"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
-
-[[package]]
-name = "utf8parse"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
-
-[[package]]
-name = "version_check"
-version = "0.9.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
-
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
-dependencies = [
- "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
-dependencies = [
- "bumpalo",
- "log",
- "once_cell",
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-backend",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.92"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
-
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
-[[package]]
-name = "wincolor"
-version = "0.1.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
-dependencies = [
- "winapi",
-]
-
-[[package]]
-name = "windows-core"
-version = "0.52.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.48.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
-dependencies = [
- "windows-targets 0.48.5",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.52.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
-dependencies = [
- "windows_aarch64_gnullvm 0.48.5",
- "windows_aarch64_msvc 0.48.5",
- "windows_i686_gnu 0.48.5",
- "windows_i686_msvc 0.48.5",
- "windows_x86_64_gnu 0.48.5",
- "windows_x86_64_gnullvm 0.48.5",
- "windows_x86_64_msvc 0.48.5",
-]
-
-[[package]]
-name = "windows-targets"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
-dependencies = [
- "windows_aarch64_gnullvm 0.52.6",
- "windows_aarch64_msvc 0.52.6",
- "windows_i686_gnu 0.52.6",
- "windows_i686_gnullvm",
- "windows_i686_msvc 0.52.6",
- "windows_x86_64_gnu 0.52.6",
- "windows_x86_64_gnullvm 0.52.6",
- "windows_x86_64_msvc 0.52.6",
-]
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
-
-[[package]]
-name = "windows_aarch64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
-
-[[package]]
-name = "windows_aarch64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
-
-[[package]]
-name = "windows_i686_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
-
-[[package]]
-name = "windows_i686_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
-
-[[package]]
-name = "windows_i686_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
-
-[[package]]
-name = "windows_x86_64_gnu"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
-
-[[package]]
-name = "windows_x86_64_gnullvm"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.48.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
-
-[[package]]
-name = "windows_x86_64_msvc"
-version = "0.52.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+++ /dev/null
-[package]
-name = "pspp-fuzz"
-version = "0.0.0"
-publish = false
-edition = "2021"
-
-[package.metadata]
-cargo-fuzz = true
-
-[dependencies]
-libfuzzer-sys = "0.4"
-
-[dependencies.pspp]
-path = ".."
-
-[[bin]]
-name = "fuzz_target_1"
-path = "fuzz_targets/fuzz_target_1.rs"
-test = false
-doc = false
-bench = false
-
-[[bin]]
-name = "segment"
-path = "fuzz_targets/segment.rs"
-test = false
-doc = false
-bench = false
+++ /dev/null
-#![no_main]
-
-use libfuzzer_sys::fuzz_target;
-
-fuzz_target!(|data: &[u8]| {
- // fuzzed code goes here
-});
+++ /dev/null
-#![no_main]
-
-use libfuzzer_sys::fuzz_target;
-use pspp::lex::segment::{Segmenter, Mode, Type};
-
-fuzz_target!(|data: &[u8]| {
- if let Ok(mut input) = std::str::from_utf8(data) {
- let mut segmenter = Segmenter::new(Mode::Auto, false);
- loop {
- let (rest, type_) = segmenter.push(input, true).unwrap();
- match type_ {
- Type::End => break,
- _ => (),
- }
- input = rest;
- }
- }
-});
--- /dev/null
+[package]
+name = "pspp-lsp"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+env_logger = "0.11.5"
+log = "0.4.22"
+pspp = { version = "1.0.0", path = "../pspp" }
+tokio = { version = "1.39.3", features = ["full"] }
+tower-lsp = "0.20.0"
--- /dev/null
+use std::collections::HashMap;
+
+use tokio::sync::Mutex;
+use tower_lsp::{
+ jsonrpc::Result,
+ lsp_types::*,
+ Client, LanguageServer, LspService, Server,
+};
+
+#[tokio::main]
+async fn main() {
+ env_logger::init();
+
+ let stdin = tokio::io::stdin();
+ let stdout = tokio::io::stdout();
+
+ let (service, socket) = LspService::build(|client| Backend {
+ client,
+ document_map: Mutex::new(HashMap::new()),
+ })
+ .finish();
+
+ Server::new(stdin, stdout, socket).serve(service).await;
+}
+
+#[derive(Debug)]
+struct Backend {
+ client: Client,
+ document_map: Mutex<HashMap<String, String>>,
+}
+
+#[tower_lsp::async_trait]
+impl LanguageServer for Backend {
+ async fn initialize(&self, params: InitializeParams) -> Result<InitializeResult> {
+ Ok(InitializeResult {
+ server_info: None,
+ capabilities: ServerCapabilities {
+ text_document_sync: Some(TextDocumentSyncCapability::Kind(
+ TextDocumentSyncKind::FULL,
+ )),
+ workspace: Some(WorkspaceServerCapabilities {
+ workspace_folders: Some(WorkspaceFoldersServerCapabilities {
+ supported: Some(true),
+ change_notifications: Some(OneOf::Left(true)),
+ }),
+ file_operations: None,
+ }),
+/*
+ semantic_tokens_provider: Some(
+ SemanticTokensServerCapabilities::SemanticTokensRegistrationOptions(
+ SemanticTokensRegistrationOptions {
+ text_document_registration_options: {
+ TextDocumentRegistrationOptions {
+ document_selector: Some(vec![DocumentFilter {
+ language: Some("pspp".to_string()),
+ scheme: Some("file".to_string()),
+ pattern: None,
+ }]),
+ }
+ },
+ semantic_tokens_options: SemanticTokensOptions {
+ work_done_progress_options: WorkDoneProgressOptions::default(),
+ legend: SemanticTokensLegend {
+ token_types: LEGEND_TYPE.into(),
+ token_modifiers: vec![],
+ },
+ range: Some(true),
+ full: Some(SemanticTokensFullOptions::Bool(true)),
+ },
+ static_registration_options: StaticRegistrationOptions::default(),
+ },
+ ),
+ ),
+*/
+ definition_provider: Some(OneOf::Left(true)),
+ references_provider: Some(OneOf::Left(true)),
+ rename_provider: Some(OneOf::Left(true)),
+ ..ServerCapabilities::default()
+ },
+ })
+ }
+
+ async fn shutdown(&self) -> Result<()> {
+ Ok(())
+ }
+}
--- /dev/null
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
+
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi 0.1.19",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitflags"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
+
+[[package]]
+name = "bumpalo"
+version = "3.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+
+[[package]]
+name = "cc"
+version = "1.0.79"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
+[[package]]
+name = "chrono"
+version = "0.4.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "time",
+ "wasm-bindgen",
+ "winapi",
+]
+
+[[package]]
+name = "clap"
+version = "4.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f3061d6db6d8fcbbd4b05e057f2acace52e64e96b498c08c2d7a4e65addd340"
+dependencies = [
+ "bitflags 1.3.2",
+ "clap_derive",
+ "clap_lex",
+ "is-terminal",
+ "once_cell",
+ "strsim",
+ "termcolor 1.2.0",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34d122164198950ba84a918270a3bb3f7ededd25e15f7451673d986f55bd2667"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
+dependencies = [
+ "os_str_bytes",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+
+[[package]]
+name = "crc32fast"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "diff"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
+
+[[package]]
+name = "encoding_rs"
+version = "0.8.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "enum-map"
+version = "2.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9"
+dependencies = [
+ "enum-map-derive",
+]
+
+[[package]]
+name = "enum-map-derive"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "errno"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "errno"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "finl_unicode"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
+
+[[package]]
+name = "flagset"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec"
+
+[[package]]
+name = "flate2"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "float_next_after"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+
+[[package]]
+name = "hexplay"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
+dependencies = [
+ "atty",
+ "termcolor 0.3.6",
+]
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.57"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
+name = "io-lifetimes"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
+dependencies = [
+ "libc",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "is-terminal"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
+dependencies = [
+ "hermit-abi 0.3.1",
+ "io-lifetimes",
+ "rustix 0.36.8",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.64"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.147"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
+
+[[package]]
+name = "log"
+version = "0.4.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
+
+[[package]]
+name = "num"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-derive"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e6a0fd4f737c707bd9086cc16c925f294943eb62eb71499e9fd4cf71f8b9f4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
+dependencies = [
+ "autocfg",
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
+dependencies = [
+ "autocfg",
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.17.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+
+[[package]]
+name = "ordered-float"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "os_str_bytes"
+version = "6.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.66"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "pspp"
+version = "1.0.0"
+dependencies = [
+ "anyhow",
+ "bitflags 2.5.0",
+ "chardetng",
+ "chrono",
+ "clap",
+ "diff",
+ "encoding_rs",
+ "enum-map",
+ "finl_unicode",
+ "flagset",
+ "flate2",
+ "float_next_after",
+ "hexplay",
+ "indexmap",
+ "lazy_static",
+ "libc",
+ "num",
+ "num-derive",
+ "num-traits",
+ "ordered-float",
+ "thiserror",
+ "unicase",
+ "unicode-width",
+ "utf8-decode",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustix"
+version = "0.36.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno 0.2.8",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.1.4",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "rustix"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b24138615de35e32031d041a09032ef3487a616d901ca4db224e7d557efae2"
+dependencies = [
+ "bitflags 1.3.2",
+ "errno 0.3.1",
+ "io-lifetimes",
+ "libc",
+ "linux-raw-sys 0.3.8",
+ "windows-sys 0.45.0",
+]
+
+[[package]]
+name = "strsim"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b60f673f44a8255b9c8c657daf66a596d435f2da81a555b06dc644d080ba45e0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "termcolor"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
+dependencies = [
+ "wincolor",
+]
+
+[[package]]
+name = "termcolor"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
+dependencies = [
+ "rustix 0.37.3",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.39"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
+[[package]]
+name = "time"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
+dependencies = [
+ "libc",
+ "wasi",
+ "winapi",
+]
+
+[[package]]
+name = "unicase"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
+
+[[package]]
+name = "utf8-decode"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "wasi"
+version = "0.10.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.27",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wincolor"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "windows"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f"
+dependencies = [
+ "windows-targets 0.48.1",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+dependencies = [
+ "windows-targets 0.42.1",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.1",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+dependencies = [
+ "windows_aarch64_gnullvm 0.42.1",
+ "windows_aarch64_msvc 0.42.1",
+ "windows_i686_gnu 0.42.1",
+ "windows_i686_msvc 0.42.1",
+ "windows_x86_64_gnu 0.42.1",
+ "windows_x86_64_gnullvm 0.42.1",
+ "windows_x86_64_msvc 0.42.1",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.0",
+ "windows_aarch64_msvc 0.48.0",
+ "windows_i686_gnu 0.48.0",
+ "windows_i686_msvc 0.48.0",
+ "windows_x86_64_gnu 0.48.0",
+ "windows_x86_64_gnullvm 0.48.0",
+ "windows_x86_64_msvc 0.48.0",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.42.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
--- /dev/null
+[package]
+name = "pspp"
+version = "1.0.0"
+edition = "2021"
+authors = [ "Ben Pfaff", "John Darrington" ]
+
+[dependencies]
+anyhow = "1.0.69"
+clap = { version = "4.1.7", features = ["derive", "wrap_help"] }
+encoding_rs = "0.8.32"
+flate2 = "1.0.26"
+float_next_after = "1.0.0"
+hexplay = "0.2.1"
+lazy_static = "1.4.0"
+num = "0.4.0"
+num-derive = "0.4.0"
+num-traits = "0.2.16"
+ordered-float = "3.7.0"
+thiserror = "1.0"
+chrono = "0.4.26"
+finl_unicode = "1.2.0"
+unicase = "2.6.0"
+libc = "0.2.147"
+indexmap = "2.1.0"
+utf8-decode = "1.0.1"
+bitflags = "2.5.0"
+unicode-width = "0.1.13"
+chardetng = "0.1.17"
+enum-map = "2.7.3"
+flagset = "0.4.6"
+
+[target.'cfg(windows)'.dependencies]
+windows-sys = { version = "0.48.0", features = ["Win32_Globalization"] }
+
+[build-dependencies]
+anyhow = "1.0.69"
+
+[[bin]]
+name = "pspp-dump-sav"
+path = "src/main.rs"
+
+[lib]
+path = "src/lib.rs"
+
+[[test]]
+name = "sack"
+path = "tests/sack.rs"
+harness = false
+
+[dev-dependencies]
+diff = "0.1.13"
--- /dev/null
+use anyhow::{anyhow, Result as AnyResult};
+use std::{
+ collections::{BTreeMap, HashSet, VecDeque},
+ env::var_os,
+ fs::{read_to_string, File},
+ io::{Error as IoError, Write},
+ path::{Path, PathBuf},
+};
+
+// Which naming convention a code-page number was extracted from (a "cp*",
+// "ibm-*", or "windows-*" alias). The derived `Ord` is significant: the
+// variant order below is a priority ranking (Codepage < Ibm < Windows), and
+// the output stage selects the preferred source with `keys().max()`.
+#[derive(Copy, Clone, PartialEq, Eq, Ord, PartialOrd)]
+enum Source {
+ Codepage,
+ Ibm,
+ Windows,
+}
+
+// Code page number.
+type CodepageNumber = usize;
+
+/// Processes one converter entry from ICU's `convrtrs.txt`, already split
+/// into whitespace-separated `fields`, and records its alias names in
+/// `codepages`, keyed first by code-page number and then by naming `Source`.
+///
+/// Entries that are empty or that begin with `{` (the affinity-list header
+/// at the top of the file) are ignored.
+fn process_converter<'a>(
+ fields: &Vec<&'a str>,
+ codepages: &mut BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&'a str>>>,
+) {
+ if fields.is_empty() || fields[0] == "{" {
+ return;
+ }
+
+ // Code-page numbers derived from this entry's names, at most one per
+ // `Source`. NOTE(review): if two names share a prefix (e.g. two "cp*"
+ // aliases), the later one overwrites the earlier (`BTreeMap::insert`
+ // replaces) — confirm that "last wins" is intended.
+ let mut cps: BTreeMap<Source, CodepageNumber> = BTreeMap::new();
+ // Names tagged with the IANA standard; the preferred name (`IANA*`) is
+ // pushed to the front so it sorts first.
+ let mut iana = VecDeque::new();
+ // Names tagged with any other standard; a `*`-marked (preferred) name is
+ // pushed to the front.
+ let mut other = VecDeque::new();
+
+ let mut iter = fields.iter().peekable();
+ while let Some(&name) = iter.next() {
+ if iter.next_if(|&&s| s == "{").is_some() {
+ // Collect the standards tags listed between `{` and `}` after the name.
+ let mut standards = HashSet::new();
+ loop {
+ let &standard = iter.next().expect("missing `}` in list of standards");
+ if standard == "}" {
+ break;
+ }
+ standards.insert(standard);
+ }
+
+ if standards.contains("IANA*") {
+ iana.push_front(name);
+ } else if standards.contains("IANA") {
+ iana.push_back(name);
+ } else if standards.iter().any(|&s| s.ends_with('*')) {
+ other.push_front(name);
+ } else {
+ other.push_back(name);
+ }
+ } else {
+ // Untagged names are completely nonstandard.
+ continue;
+ }
+
+ // Derive candidate code-page numbers from conventional name prefixes.
+ if let Some(number) = name.strip_prefix("cp") {
+ if let Ok(number) = number.parse::<CodepageNumber>() {
+ cps.insert(Source::Codepage, number);
+ }
+ }
+
+ if let Some(number) = name.strip_prefix("windows-") {
+ if let Ok(number) = number.parse::<CodepageNumber>() {
+ cps.insert(Source::Windows, number);
+ }
+ }
+
+ if let Some(number) = name.strip_prefix("ibm-") {
+ if let Ok(number) = number.parse::<CodepageNumber>() {
+ cps.insert(Source::Ibm, number);
+ }
+ }
+ }
+
+ // If there are no tagged names then this is completely nonstandard.
+ if iana.is_empty() && other.is_empty() {
+ return;
+ }
+
+ // Final alias list: IANA names first (most authoritative), then the rest,
+ // each group led by its preferred (`*`) name.
+ let all: Vec<&str> = iana.into_iter().chain(other).collect();
+ for (source, number) in cps {
+ codepages
+ .entry(number)
+ .or_default()
+ .insert(source, all.clone());
+ }
+}
+
+/// Writes the generated Rust source file (`encodings.rs`) defining two
+/// `lazy_static` lookup tables built from `codepages`:
+/// `CODEPAGE_NUMBER_TO_NAME` (number -> preferred encoding name) and
+/// `CODEPAGE_NAME_TO_NUMBER` (lowercased name -> number).
+///
+/// Returns any I/O error from creating or writing `file_name`.
+///
+/// NOTE(review): the first generated map is keyed by `i32` but the second
+/// maps to `u32` — confirm with the consuming code whether this asymmetry
+/// is intentional.
+fn write_output(
+ codepages: &BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>>,
+ file_name: &PathBuf,
+) -> Result<(), IoError> {
+ let mut file = File::create(file_name)?;
+
+ // Preamble of the generated file, up through the opening of the
+ // number-to-name map.
+ file.write_all(
+ "\
+use lazy_static::lazy_static;
+use std::collections::HashMap;
+
+lazy_static! {
+ static ref CODEPAGE_NUMBER_TO_NAME: HashMap<i32, &'static str> = {
+ let mut map = HashMap::new();
+"
+ .as_bytes(),
+ )?;
+
+ // For each code-page number, emit the first alias of the highest-priority
+ // source (`keys().max()` picks Windows over Ibm over Codepage, per the
+ // `Source` ordering; the first alias is the preferred name).
+ for (&cpnumber, value) in codepages.iter() {
+ let source = value.keys().max().unwrap();
+ let name = value[source][0];
+ writeln!(file, " map.insert({cpnumber}, \"{name}\");")?;
+ }
+ // Close the first map and open the name-to-number map.
+ file.write_all(
+ " map
+ };
+
+ static ref CODEPAGE_NAME_TO_NUMBER: HashMap<&'static str, u32> = {
+ let mut map = HashMap::new();
+"
+ .as_bytes(),
+ )?;
+
+ // Invert the mapping: lowercased alias name -> (source -> numbers).
+ let mut names: BTreeMap<String, BTreeMap<Source, Vec<CodepageNumber>>> = BTreeMap::new();
+ for (&cpnumber, value) in codepages.iter() {
+ for (&source, value2) in value.iter() {
+ for name in value2.iter().map(|name| name.to_ascii_lowercase()) {
+ names
+ .entry(name)
+ .or_default()
+ .entry(source)
+ .or_default()
+ .push(cpnumber);
+ }
+ }
+ }
+
+ // For each name, `.rev().take(1)` visits only the entry with the greatest
+ // (highest-priority) `Source`; its first recorded number wins.
+ for (name, value) in names.iter() {
+ for (_source, numbers) in value.iter().rev().take(1) {
+ writeln!(file, " map.insert(\"{name}\", {});", numbers[0])?;
+ }
+ }
+ // Close the second map and the lazy_static! block.
+ file.write_all(
+ " map
+ };
+}
+"
+ .as_bytes(),
+ )?;
+
+ Ok(())
+}
+
+/// Build-script entry point: parses `convrtrs.txt` (ICU's converter alias
+/// table, expected next to `Cargo.toml`) and generates `encodings.rs` in
+/// `OUT_DIR` via `write_output`.
+fn main() -> AnyResult<()> {
+ // Cargo build-script protocol: rebuild when this script or the input
+ // data file changes.
+ println!("cargo:rerun-if-changed=build.rs");
+
+ let input_file = Path::new(env!("CARGO_MANIFEST_DIR")).join("convrtrs.txt");
+ println!("cargo:rerun-if-changed={}", input_file.to_string_lossy());
+ let input = read_to_string(&input_file)
+ .map_err(|e| anyhow!("{}: read failed ({e})", input_file.to_string_lossy()))?;
+
+ let mut codepages: BTreeMap<CodepageNumber, BTreeMap<Source, Vec<&str>>> = BTreeMap::new();
+ // Fields accumulated for the converter entry currently being read.
+ let mut converter: Vec<&str> = Vec::new();
+ for line in input.lines() {
+ // Strip a `#` comment (if any) and trailing whitespace.
+ let line = line
+ .find('#')
+ .map(|position| &line[..position])
+ .unwrap_or(line)
+ .trim_end();
+ // In convrtrs.txt, lines starting with whitespace continue the
+ // previous entry; an unindented line starts a new one, so flush the
+ // accumulated entry first.
+ if !line.starts_with([' ', '\t']) {
+ process_converter(&converter, &mut codepages);
+ converter.clear();
+ }
+ converter.extend(line.split_whitespace());
+ }
+ // Flush the final entry, which no following unindented line terminates.
+ process_converter(&converter, &mut codepages);
+
+ let output_file_name = Path::new(&var_os("OUT_DIR").unwrap()).join("encodings.rs");
+
+ write_output(&codepages, &output_file_name)
+ .map_err(|e| anyhow!("{}: write failed ({e})", output_file_name.to_string_lossy()))?;
+
+ Ok(())
+}
--- /dev/null
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# ******************************************************************************
+# *
+# * Copyright (C) 1995-2014, International Business Machines
+# * Corporation and others. All Rights Reserved.
+# *
+# ******************************************************************************
+
+# If this converter alias table looks very confusing, a much easier to
+# understand view can be found at this demo:
+# http://demo.icu-project.org/icu-bin/convexp
+
+# IMPORTANT NOTE
+#
+# This file is not read directly by ICU. If you change it, you need to
+# run gencnval, and eventually run pkgdata to update the representation that
+# ICU uses for aliases. The gencnval tool will normally compile this file into
+# cnvalias.icu. The gencnval -v verbose option will help you when you edit
+# this file.
+
+# Please be friendly to the rest of us that edit this table by
+# keeping this table free of tabs.
+
+# This is an alias file used by the character set converter.
+# A lot of converter information can be found in unicode/ucnv.h, but here
+# is more information about this file.
+#
+# If you are adding a new converter to this list and want to include it in the
+# icu data library, please be sure to add an entry to the appropriate ucm*.mk file
+# (see ucmfiles.mk for more information).
+#
+# Here is the file format using BNF-like syntax:
+#
+# converterTable ::= tags { converterLine* }
+# converterLine ::= converterName [ tags ] { taggedAlias* }'\n'
+# taggedAlias ::= alias [ tags ]
+# tags ::= '{' { tag+ } '}'
+# tag ::= standard['*']
+# converterName ::= [0-9a-zA-Z:_'-']+
+# alias ::= converterName
+#
+# Except for the converter name, aliases are case insensitive.
+# Names are separated by whitespace.
+# Line continuation and comment syntax are similar to the GNU make syntax.
+# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL
+# TABULATION) are presumed to be a continuation of the previous line.
+# The # symbol starts a comment and the comment continues till the end of
+# the line.
+#
+# The converter
+#
+# All names can be tagged by including a space-separated list of tags in
+# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
+# some-charset{MIME* IANA*}. The order of tags does not matter, and
+# whitespace is allowed between the tagged name and the tags list.
+#
+# The tags can be used to get standard names using ucnv_getStandardName().
+#
+# The complete list of recognized tags used in this file is defined in
+# the affinity list near the beginning of the file.
+#
+# The * after the standard tag denotes that the previous alias is the
+# preferred (default) charset name for that standard. There can only
+# be one of these default charset names per converter.
+
+
+
+# The world is getting more complicated...
+# Supporting XML parsers, HTML, MIME, and similar applications
+# that mark encodings with a charset name can be difficult.
+# Many of these applications and operating systems will update
+# their codepages over time.
+
+# It means that a new codepage, one that differs from an
+# old one by changing a code point, e.g., to the Euro sign,
+# must not get an old alias, because it would mean that
+# old files with this alias would be interpreted differently.
+
+# If a codepage gets updated by assigning characters to previously
+# unassigned code points, then a new name is not necessary.
+# Also, some codepages map unassigned codepage byte values
+# to the same numbers in Unicode for roundtripping. It may be
+# industry practice to keep the encoding name in such a case, too
+# (example: Windows codepages).
+
+# The aliases listed in the list of character sets
+# that is maintained by the IANA (http://www.iana.org/) must
+# not be changed to mean encodings different from what this
+# list shows. Currently, the IANA list is at
+# http://www.iana.org/assignments/character-sets
+# It should also be mentioned that the exact mapping table used for each
+# IANA name usually isn't specified. This means that some other applications
+# and operating systems are left to interpret the exact mappings for the
+# underspecified aliases. For instance, Shift-JIS on a Solaris platform
+# may be different from Shift-JIS on a Windows platform. This is why
+# some of the aliases can be tagged to differentiate different mapping
+# tables with the same alias. If an alias is given to more than one converter,
+# it is considered to be an ambiguous alias, and the affinity list will
+# choose the converter to use when a standard isn't specified with the alias.
+
+# Name matching is case-insensitive. Also, dashes '-', underscores '_'
+# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1
+# and "cs iso latin 1" are the same).
+# However, the names in the left column are directly file names
+# or names of algorithmic converters, and their case must not
+# be changed - or else code and/or file names must also be changed.
+# For example, the converter ibm-921 is expected to be the file ibm-921.cnv.
+
+
+
+# The immediately following list is the affinity list of supported standard tags.
+# When multiple converters have the same alias under different standards,
+# the standard nearest to the top of this list with that alias will
+# be the first converter that will be opened. The ordering of the aliases
+# after this affinity list does not affect the preferred alias, but it may
+# affect the order of the returned list of aliases for a given converter.
+#
+# The general ordering is from specific and frequently used to more general
+# or rarely used at the bottom.
+{ UTR22 # Name format specified by https://www.unicode.org/reports/tr22/
+ # ICU # Can also use ICU_FEATURE
+ IBM # The IBM CCSID number is specified by ibm-*
+ WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names.
+ JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored.
+ # GLIBC
+ # AIX
+ # DB2
+ # SOLARIS
+ # APPLE
+ # HPUX
+ IANA # Source: http://www.iana.org/assignments/character-sets
+ MIME # Source: http://www.iana.org/assignments/character-sets
+ # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface)
+ # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag.
+ }
+
+
+
+# Fully algorithmic converters
+
+UTF-8 { IANA* MIME* JAVA* WINDOWS }
+ ibm-1208 { IBM* } # UTF-8 with IBM PUA
+ ibm-1209 { IBM } # UTF-8
+ ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA
+ ibm-5305 { IBM } # Unicode 2.0, UTF-8
+ ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA
+ ibm-13497 { IBM } # Unicode 3.0, UTF-8
+ ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA
+ ibm-17593 { IBM } # Unicode 4.0, UTF-8
+ windows-65001 { WINDOWS* }
+ cp1208
+ x-UTF_8J
+ unicode-1-1-utf-8
+ unicode-2-0-utf-8
+
+# The ICU 2.2 UTF-16/32 converters detect and write a BOM.
+UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA }
+ ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive
+ ibm-1205 { IBM } # UTF-16 BOM sensitive
+ unicode
+ csUnicode
+ ucs-2
+# The following Unicode CCSIDs (IBM) are not valid in ICU because they are
+# considered pure DBCS (exactly 2 bytes) of Unicode,
+# and they are a subset of Unicode. ICU does not support their encoding structures.
+# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688
+UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA }
+ UnicodeBigUnmarked { JAVA } # java.io name
+ ibm-1200 { IBM* } # UTF-16 BE with IBM PUA
+ ibm-1201 { IBM } # UTF-16 BE
+ ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA
+ ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE
+ ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA
+ ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE
+ ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA
+ ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE
+ ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA
+ ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE
+ ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA
+ ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE
+ ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA
+ ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA
+ windows-1201 { WINDOWS* }
+ cp1200
+ cp1201
+ UTF16_BigEndian
+ # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used)
+ # iso-10646-ucs-2 { JAVA } # This is ambiguous
+ # ibm-61952 is not a valid CCSID because it's Unicode 1.1
+ # ibm-61953 is not a valid CCSID because it's Unicode 1.0
+UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA }
+ UnicodeLittleUnmarked { JAVA } # java.io name
+ ibm-1202 { IBM* } # UTF-16 LE with IBM PUA
+ ibm-1203 { IBM } # UTF-16 LE
+ ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA
+ ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE
+ ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA
+ ibm-17587 { IBM } # Unicode 3.0, UTF-16 LE
+ ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA
+ ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE
+ ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA
+ ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE
+ ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA
+ ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE
+ UTF16_LittleEndian
+ windows-1200 { WINDOWS* }
+
+UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA }
+ ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive
+ ibm-1237 { IBM } # UTF-32 BOM sensitive
+ csUCS4
+ ucs-4
+UTF-32BE { IANA* } UTF32_BigEndian
+ ibm-1232 { IBM* } # UTF-32 BE with IBM PUA
+ ibm-1233 { IBM } # UTF-32 BE
+ ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA
+UTF-32LE { IANA* } UTF32_LittleEndian
+ ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA
+ ibm-1235 { IBM } # UTF-32 LE
+
+# ICU-specific names for special uses
+UTF16_PlatformEndian
+UTF16_OppositeEndian
+
+UTF32_PlatformEndian
+UTF32_OppositeEndian
+
+
+# Java-specific, non-Unicode-standard UTF-16 variants.
+# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)".
+# See the "Supported Encodings" at
+# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html
+# or a newer version of this document.
+#
+# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs.
+# Aliases marked with { JAVA } are canonical names for the java.nio API.
+#
+# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific
+# byte sequence for U+FEFF.
+# "Reverse BOM" means the BOM for the sibling encoding scheme with the
+# opposite endianness. (LE<->BE)
+
+# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order,
+# with byte-order mark"
+#
+# From Unicode: Writes BOM.
+# To Unicode: Detects and consumes BOM.
+# If there is a "reverse BOM", Java throws
+# MalformedInputException: Incorrect byte-order mark.
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
+# and a UCNV_ILLEGAL UConverterCallbackReason.
+UTF-16BE,version=1 UnicodeBig { JAVA* }
+
+# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order,
+# with byte-order mark"
+#
+# From Unicode: Writes BOM.
+# To Unicode: Detects and consumes BOM.
+# If there is a "reverse BOM", Java throws
+# MalformedInputException: Incorrect byte-order mark.
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
+# and a UCNV_ILLEGAL UConverterCallbackReason.
+UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA }
+
+# This one is not mentioned on the "Supported Encodings" page
+# but is available in Java.
+# In Java, this is called "Unicode" but we cannot give it that alias
+# because the standard UTF-16 converter already has a "unicode" alias.
+#
+# From Unicode: Writes BOM.
+# To Unicode: Detects and consumes BOM.
+# If there is no BOM, rather than defaulting to BE, Java throws
+# MalformedInputException: Missing byte-order mark.
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value
+# and a UCNV_ILLEGAL UConverterCallbackReason.
+UTF-16,version=1
+
+# This is the same as standard UTF-16 but always writes a big-endian byte stream,
+# regardless of the platform endianness, as expected by the Java compatibility tests.
+# See the java.nio.charset.Charset API documentation at
+# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
+# or a newer version of this document.
+#
+# From Unicode: Write BE BOM and BE bytes
+# To Unicode: Detects and consumes BOM. Defaults to BE.
+UTF-16,version=2
+
+# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
+# Presumably, these behave analogously to the UTF-16 variants with similar names.
+# UTF_32BE_BOM x-UTF-32BE-BOM
+# UTF_32LE_BOM x-UTF-32LE-BOM
+
+# End of Java-specific, non-Unicode-standard UTF variants.
+
+
+# On UTF-7:
+# RFC 2152 (http://www.imc.org/rfc2152) allows to encode some US-ASCII
+# characters directly or in base64. Especially, the characters in set O
+# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly
+# but are not allowed in, e.g., email headers.
+# By default, the ICU UTF-7 converter encodes set O directly.
+# By choosing the option "version=1", set O will be escaped instead.
+# For example:
+# utf7Converter=ucnv_open("UTF-7,version=1");
+#
+# For details about email headers see RFC 2047.
+UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* }
+ unicode-1-1-utf-7
+ unicode-2-0-utf-7
+
+# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference.
+#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM }
+
+# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names.
+# It is a substantially modified UTF-7 encoding. See the specification in:
+#
+# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
+# (http://www.ietf.org/rfc/rfc2060.txt)
+# Section 5.1.3. Mailbox International Naming Convention
+IMAP-mailbox-name
+
+SCSU { IANA* }
+ ibm-1212 { IBM } # SCSU with IBM PUA
+ ibm-1213 { IBM* } # SCSU
+BOCU-1 { IANA* }
+ csBOCU-1 { IANA }
+ ibm-1214 { IBM } # BOCU-1 with IBM PUA
+ ibm-1215 { IBM* } # BOCU-1
+
+# See https://www.unicode.org/reports/tr26 for this Compatibility Encoding Scheme for UTF-16
+# The Unicode Consortium does not encourage the use of CESU-8
+CESU-8 { IANA* } ibm-9400 { IBM* }
+
+# Standard iso-8859-1, which does not have the Euro update.
+# See iso-8859-15 (latin9) for the Euro update
+ISO-8859-1 { MIME* IANA JAVA* }
+ ibm-819 { IBM* JAVA } # This is not truly ibm-819 because it's missing the fallbacks.
+ IBM819 { IANA }
+ cp819 { IANA JAVA }
+ latin1 { IANA JAVA }
+ 8859_1 { JAVA }
+ csISOLatin1 { IANA JAVA }
+ iso-ir-100 { IANA JAVA }
+ ISO_8859-1:1987 { IANA* JAVA }
+ l1 { IANA JAVA }
+ 819 { JAVA }
+ # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct.
+ # LATIN_1 # Old ICU name
+ # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1.
+
+US-ASCII { MIME* IANA JAVA WINDOWS }
+ ASCII { JAVA* IANA WINDOWS }
+ ANSI_X3.4-1968 { IANA* WINDOWS }
+ ANSI_X3.4-1986 { IANA WINDOWS }
+ ISO_646.irv:1991 { IANA WINDOWS }
+ iso_646.irv:1983 { JAVA }
+ ISO646-US { JAVA IANA WINDOWS }
+ us { IANA }
+ csASCII { IANA WINDOWS }
+ iso-ir-6 { IANA }
+ cp367 { IANA WINDOWS }
+ ascii7 { JAVA }
+ 646 { JAVA }
+ windows-20127 { WINDOWS* }
+ ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truly ibm-367 because it's missing the fallbacks.
+
+# GB 18030 is partly algorithmic, using the MBCS converter
+gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* }
+
+# Table-based interchange codepages
+
+# Central Europe
+ibm-912_P100-1995 { UTR22* }
+ ibm-912 { IBM* JAVA }
+ ISO-8859-2 { MIME* IANA JAVA* WINDOWS }
+ ISO_8859-2:1987 { IANA* WINDOWS JAVA }
+ latin2 { IANA WINDOWS JAVA }
+ csISOLatin2 { IANA WINDOWS JAVA }
+ iso-ir-101 { IANA WINDOWS JAVA }
+ l2 { IANA WINDOWS JAVA }
+ 8859_2 { JAVA }
+ cp912 { JAVA }
+ 912 { JAVA }
+ windows-28592 { WINDOWS* }
+
+# Maltese Esperanto
+ibm-913_P100-2000 { UTR22* }
+ ibm-913 { IBM* JAVA }
+ ISO-8859-3 { MIME* IANA WINDOWS JAVA* }
+ ISO_8859-3:1988 { IANA* WINDOWS JAVA }
+ latin3 { IANA JAVA WINDOWS }
+ csISOLatin3 { IANA WINDOWS }
+ iso-ir-109 { IANA WINDOWS JAVA }
+ l3 { IANA WINDOWS JAVA }
+ 8859_3 { JAVA }
+ cp913 { JAVA }
+ 913 { JAVA }
+ windows-28593 { WINDOWS* }
+
+# Baltic
+ibm-914_P100-1995 { UTR22* }
+ ibm-914 { IBM* JAVA }
+ ISO-8859-4 { MIME* IANA WINDOWS JAVA* }
+ latin4 { IANA WINDOWS JAVA }
+ csISOLatin4 { IANA WINDOWS JAVA }
+ iso-ir-110 { IANA WINDOWS JAVA }
+ ISO_8859-4:1988 { IANA* WINDOWS JAVA }
+ l4 { IANA WINDOWS JAVA }
+ 8859_4 { JAVA }
+ cp914 { JAVA }
+ 914 { JAVA }
+ windows-28594 { WINDOWS* }
+
+# Cyrillic
+ibm-915_P100-1995 { UTR22* }
+ ibm-915 { IBM* JAVA }
+ ISO-8859-5 { MIME* IANA WINDOWS JAVA* }
+ cyrillic { IANA WINDOWS JAVA }
+ csISOLatinCyrillic { IANA WINDOWS JAVA }
+ iso-ir-144 { IANA WINDOWS JAVA }
+ ISO_8859-5:1988 { IANA* WINDOWS JAVA }
+ 8859_5 { JAVA }
+ cp915 { JAVA }
+ 915 { JAVA }
+ windows-28595 { WINDOWS* }
+
+glibc-PT154-2.3.3 { UTR22* }
+ PTCP154 { IANA* }
+ csPTCP154
+ PT154
+ CP154
+ Cyrillic-Asian
+
+# Arabic
+# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently
+# From a narrow mapping point of view, there is no difference.
+# -E means explicit. -I means implicit.
+# -E requires the client to handle the ISO 6429 bidirectional controls
+ibm-1089_P100-1995 { UTR22* }
+ ibm-1089 { IBM* JAVA }
+ ISO-8859-6 { MIME* IANA WINDOWS JAVA* }
+ arabic { IANA WINDOWS JAVA }
+ csISOLatinArabic { IANA WINDOWS JAVA }
+ iso-ir-127 { IANA WINDOWS JAVA }
+ ISO_8859-6:1987 { IANA* WINDOWS JAVA }
+ ECMA-114 { IANA JAVA }
+ ASMO-708 { IANA JAVA }
+ 8859_6 { JAVA }
+ cp1089 { JAVA }
+ 1089 { JAVA }
+ windows-28596 { WINDOWS* }
+ ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
+ ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied.
+ x-ISO-8859-6S { JAVA }
+
+# ISO Greek (with euro update). This is really ISO_8859-7:2003
+ibm-9005_X110-2007 { UTR22* }
+ ibm-9005 { IBM* }
+ ISO-8859-7 { MIME* IANA JAVA* WINDOWS }
+ 8859_7 { JAVA }
+ greek { IANA JAVA WINDOWS }
+ greek8 { IANA JAVA WINDOWS }
+ ELOT_928 { IANA JAVA WINDOWS }
+ ECMA-118 { IANA JAVA WINDOWS }
+ csISOLatinGreek { IANA JAVA WINDOWS }
+ iso-ir-126 { IANA JAVA WINDOWS }
+ ISO_8859-7:1987 { IANA* JAVA WINDOWS }
+ windows-28597 { WINDOWS* }
+ sun_eu_greek # For Solaris
+
+# ISO Greek (w/o euro update)
+# JDK 1.5 has these aliases.
+ibm-813_P100-1995 { UTR22* }
+ ibm-813 { IBM* JAVA* }
+ cp813 { JAVA }
+ 813 { JAVA }
+
+# hebrew
+# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently
+# From a narrow mapping point of view, there is no difference.
+# -E means explicit. -I means implicit.
+# -E requires the client to handle the ISO 6429 bidirectional controls
+# This matches the official mapping on unicode.org
+ibm-5012_P100-1999 { UTR22* }
+ ibm-5012 { IBM* }
+ ISO-8859-8 { MIME* IANA WINDOWS JAVA* }
+ hebrew { IANA WINDOWS JAVA }
+ csISOLatinHebrew { IANA WINDOWS JAVA }
+ iso-ir-138 { IANA WINDOWS JAVA }
+ ISO_8859-8:1988 { IANA* WINDOWS JAVA }
+ ISO-8859-8-I { IANA MIME } # IANA and Windows consider this alias different, and BiDi needs to be applied.
+ ISO-8859-8-E { IANA MIME } # IANA and Windows consider this alias different, and BiDi needs to be applied.
+ 8859_8 { JAVA }
+ windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings.
+ hebrew8 # Reflect HP-UX code page update
+
+# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012
+# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors
+ibm-916_P100-1995 { UTR22* }
+ ibm-916 { IBM* JAVA* }
+ cp916 { JAVA }
+ 916 { JAVA }
+
+# Turkish
+ibm-920_P100-1995 { UTR22* }
+ ibm-920 { IBM* JAVA }
+ ISO-8859-9 { MIME* IANA WINDOWS JAVA* }
+ latin5 { IANA WINDOWS JAVA }
+ csISOLatin5 { IANA JAVA }
+ iso-ir-148 { IANA WINDOWS JAVA }
+ ISO_8859-9:1989 { IANA* WINDOWS }
+ l5 { IANA WINDOWS JAVA }
+ 8859_9 { JAVA }
+ cp920 { JAVA }
+ 920 { JAVA }
+ windows-28599 { WINDOWS* }
+ ECMA-128 # IANA doesn't have this alias 6/24/2002
+ turkish8 # Reflect HP-UX codepage update 8/1/2008
+ turkish # Reflect HP-UX codepage update 8/1/2008
+
+# Nordic languages
+iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* }
+ iso-ir-157 { IANA }
+ l6 { IANA }
+ ISO_8859-10:1992 { IANA }
+ csISOLatin6 { IANA }
+ latin6 { IANA }
+
+# Thai
+# Be warned. There are several iso-8859-11 codepage variants, and they are all incompatible.
+# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes.
+iso-8859_11-2001 { UTR22* } ISO-8859-11
+ thai8 # HP-UX alias. HP-UX says TIS-620, but it's closer to ISO-8859-11.
+ x-iso-8859-11 { JAVA* }
+
+# iso-8859-13, PC Baltic (w/o euro update)
+ibm-921_P100-1995 { UTR22* }
+ ibm-921 { IBM* }
+ ISO-8859-13 { IANA* MIME* JAVA* }
+ 8859_13 { JAVA }
+ windows-28603 { WINDOWS* }
+ cp921
+ 921
+ x-IBM921 { JAVA }
+
+# Celtic
+iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* }
+ iso-ir-199 { IANA }
+ ISO_8859-14:1998 { IANA }
+ latin8 { IANA }
+ iso-celtic { IANA }
+ l8 { IANA }
+
+# Latin 9
+ibm-923_P100-1998 { UTR22* }
+ ibm-923 { IBM* JAVA }
+ ISO-8859-15 { IANA* MIME* WINDOWS JAVA* }
+ Latin-9 { IANA WINDOWS }
+ l9 { WINDOWS }
+ 8859_15 { JAVA }
+ latin0 { JAVA }
+ csisolatin0 { JAVA }
+ csisolatin9 { JAVA }
+ iso8859_15_fdis { JAVA }
+ cp923 { JAVA }
+ 923 { JAVA }
+ windows-28605 { WINDOWS* }
+
+# CJK encodings
+
+ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old)
+ ibm-942 { IBM* }
+ ibm-932 { IBM }
+ cp932
+ shift_jis78
+ sjis78
+ ibm-942_VSUB_VPUA
+ ibm-932_VSUB_VPUA
+ x-IBM942 { JAVA* }
+ x-IBM942C { JAVA }
+ # Is this "JIS_C6226-1978"?
+
+# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings:
+# - the usual IBM PC control code rotation (1A-1C-7F)
+# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA
+ibm-943_P15A-2003 { UTR22* }
+ ibm-943 # Leave untagged because this isn't the default
+ Shift_JIS { IANA* MIME* WINDOWS JAVA }
+ MS_Kanji { IANA WINDOWS JAVA }
+ csShiftJIS { IANA WINDOWS JAVA }
+ windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
+ csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13)
+ x-sjis { WINDOWS JAVA }
+ x-ms-cp932 { WINDOWS }
+ cp932 { WINDOWS }
+ windows-932 { WINDOWS* }
+ cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same.
+ IBM-943C #{ AIX* } # Add this tag once AIX aliases become available
+ ms932
+ pck # Probably SOLARIS
+ sjis # This might be for ibm-1351
+ ibm-943_VSUB_VPUA
+ x-MS932_0213 { JAVA }
+ x-JISAutoDetect { JAVA }
+ # cp943 # This isn't Windows, and no one else uses it.
+ # IANA says that Windows-31J is an extension to csshiftjis ibm-932
+ibm-943_P130-1999 { UTR22* }
+ ibm-943 { IBM* JAVA }
+ Shift_JIS # Leave untagged because this isn't the default
+ cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same.
+ 943 { JAVA }
+ ibm-943_VASCII_VSUB_VPUA
+ x-IBM943 { JAVA }
+ # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe
+ibm-33722_P12A_P12A-2009_U2 { UTR22* }
+ ibm-33722 # Leave untagged because this isn't the default
+ ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct
+ ibm-33722_VPUA
+ IBM-eucJP
+windows-51932-2006 { UTR22* }
+ windows-51932 { WINDOWS* }
+ CP51932 { IANA* }
+ csCP51932
+ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping
+ ibm-33722 { IBM* JAVA }
+ ibm-5050 { IBM } # Yes this is correct
+ cp33722 { JAVA* }
+ 33722 { JAVA }
+ ibm-33722_VASCII_VPUA
+ x-IBM33722 { JAVA }
+ x-IBM33722A { JAVA }
+ x-IBM33722C { JAVA }
+# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350
+# ibm-1350 seems to be almost a superset of ibm-33722
+# ibm-954 contains more PUA characters than the others.
+ibm-954_P101-2007 { UTR22* }
+ ibm-954 { IBM* }
+ x-IBM954 { JAVA* }
+ x-IBM954C { JAVA }
+ # eucJP # This is closest to Solaris EUC-JP.
+euc-jp-2007 { UTR22* }
+ EUC-JP { MIME* IANA JAVA* WINDOWS* }
+ Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* JAVA WINDOWS }
+ csEUCPkdFmtJapanese { IANA JAVA WINDOWS }
+ X-EUC-JP { MIME JAVA WINDOWS } # Japan EUC. x-euc-jp is a MIME name
+ eucjis {JAVA}
+ ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged.
+
+aix-IBM_udcJP-4.3.6 { UTR22* }
+ x-IBM-udcJP { JAVA* }
+
+java-euc_jp_linux-1.6_P { UTR22* }
+ euc-jp-linux
+ x-EUC_JP_LINUX { JAVA* }
+
+java-sjis_0213-1.6_P { UTR22* }
+ x-SJIS_0213 { JAVA* }
+
+# Here are various interpretations and extensions of Big5
+ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions
+ ibm-1373 { IBM* }
+ windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+windows-950-2000 { UTR22* }
+ Big5 { IANA* MIME* JAVA* WINDOWS }
+ csBig5 { IANA WINDOWS }
+ windows-950 { WINDOWS* }
+ x-windows-950 { JAVA }
+ x-big5
+ ms950
+ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update)
+ ibm-950 { IBM* JAVA }
+ cp950 { JAVA* }
+ 950 { JAVA }
+ x-IBM950 { JAVA }
+ibm-1375_P100-2008 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters.
+ ibm-1375 { IBM* }
+ Big5-HKSCS { IANA* JAVA* }
+ big5hk { JAVA }
+ HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/
+ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters.
+ ibm-5471 { IBM* }
+ Big5-HKSCS
+ MS950_HKSCS { JAVA* }
+ hkbig5 # from HP-UX 11i, which can't handle supplementary characters.
+ big5-hkscs:unicode3.0
+ x-MS950-HKSCS { JAVA }
+ # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not.
+ # windows-950_hkscs
+solaris-zh_TW_big5-2.7 { UTR22* }
+ Big5_Solaris { JAVA* }
+ x-Big5-Solaris { JAVA }
+# GBK
+ibm-1386_P100-2001 { UTR22* }
+ ibm-1386 { IBM* }
+ cp1386
+ windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+ ibm-1386_VSUB_VPUA
+windows-936-2000 { UTR22* }
+ GBK { IANA* WINDOWS JAVA* }
+ CP936 { IANA JAVA }
+ MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split.
+ windows-936 { IANA WINDOWS* JAVA }
+
+# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging.
+ibm-1383_P110-1999 { UTR22* } # China EUC.
+ ibm-1383 { IBM* JAVA }
+ GB2312 { IANA* MIME* }
+ csGB2312 { IANA }
+ cp1383 { JAVA* }
+ 1383 { JAVA }
+ EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name
+ ibm-eucCN
+ hp15CN # From HP-UX?
+ ibm-1383_VPUA
+ # gb # This is not an IANA name. gb in IANA means Great Britain.
+
+ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022.
+ GB_2312-80 { IANA* } # Windows maps this alias incorrectly
+ chinese { IANA }
+ iso-ir-58 { IANA }
+ csISO58GB231280 { IANA }
+ gb2312-1980
+ GB2312.1980-0 # From X11R6
+
+euc-tw-2014 { UTR22* } # Updated EUC-TW converter based on ibm-964
+ EUC-TW
+
+ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name
+ ibm-964 { IBM* JAVA }
+ ibm-eucTW
+ cns11643
+ cp964 { JAVA* }
+ 964 { JAVA }
+ ibm-964_VPUA
+ x-IBM964 { JAVA }
+
+# ISO-2022 needs one, and other people may need others.
+ibm-949_P110-1999 { UTR22* }
+ ibm-949 { IBM* JAVA }
+ cp949 { JAVA* }
+ 949 { JAVA }
+ ibm-949_VASCII_VSUB_VPUA
+ x-IBM949 { JAVA }
+ibm-949_P11A-1999 { UTR22* }
+ ibm-949 # Leave untagged because this isn't the default
+ cp949c { JAVA* }
+ ibm-949_VSUB_VPUA
+ x-IBM949C { JAVA }
+ IBM-949C { JAVA }
+
+# Korean EUC.
+#
+# <quote from="Jungshik Shin">
+# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR.
+#
+# Although widely spread on MS Windows, using
+# KS C 5601 or related names to denote EUC-KR or
+# windows-949 is very much misleading. KS C 5601-1987
+# is NOT suitable as a designation for MIME charset
+# and MBCS. It's just the name of a 94 x 94 Korean
+# coded character set standard which can be invoked
+# on either GL (with MSB reset) or GR (with MSB set).
+# Note that JOHAB (windows-1361) specified in
+# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3)
+# is a _separate_ MBCS with a _completely different_
+# mapping.
+# </quote>
+#
+# The following aliases try to mirror the poor state of alias recognition
+# on these platforms.
+#
+# ibm-970 is almost a subset of ibm-1363.
+# Java, Solaris and AIX use euc-kr to also mean ksc5601.
+# Java has both ibm-970 and EUC-KR as separate converters.
+ibm-970_P110_P110-2006_U2 { UTR22* }
+ ibm-970 { IBM* JAVA }
+ EUC-KR { IANA* MIME* WINDOWS JAVA }
+ KS_C_5601-1987 { JAVA }
+ windows-51949 { WINDOWS* }
+ csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name
+ ibm-eucKR { JAVA }
+ KSC_5601 { JAVA } # Needed by iso-2022
+ 5601 { JAVA }
+ cp970 { JAVA* }
+ 970 { JAVA }
+ ibm-970_VPUA
+ x-IBM970 { JAVA }
+
+# ibm-971 is almost the set of DBCS mappings of ibm-970
+ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA x-IBM971 { JAVA* }
+
+# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too.
+# ibm-1363 is almost a superset of ibm-970.
+ibm-1363_P11B-1998 { UTR22* }
+ ibm-1363 # Leave untagged because this isn't the default
+ KS_C_5601-1987 { IANA* }
+ KS_C_5601-1989 { IANA }
+ KSC_5601 { IANA }
+ csKSC56011987 { IANA }
+ korean { IANA }
+ iso-ir-149 { IANA }
+ cp1363 { MIME* }
+ 5601
+ ksc
+ windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage.
+ ibm-1363_VSUB_VPUA
+ x-IBM1363C { JAVA* }
+ # ks_x_1001:1992
+ # ksc5601-1992
+
+ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping
+ ibm-1363 { IBM* }
+ ibm-1363_VASCII_VSUB_VPUA
+ x-IBM1363 { JAVA* }
+
+windows-949-2000 { UTR22* }
+ windows-949 { JAVA* WINDOWS* }
+ KS_C_5601-1987 { WINDOWS }
+ KS_C_5601-1989 { WINDOWS }
+ KSC_5601 { MIME* WINDOWS } # Needed by iso-2022
+ csKSC56011987 { WINDOWS }
+ korean { WINDOWS }
+ iso-ir-149 { WINDOWS }
+ ms949 { JAVA }
+ x-KSC5601 { JAVA }
+
+windows-1361-2000 { UTR22* }
+ ksc5601_1992
+ ms1361
+ johab
+ x-Johab { JAVA* }
+
+windows-874-2000 { UTR22* } # Thai (w/ euro update)
+ TIS-620 { WINDOWS }
+ windows-874 { JAVA* WINDOWS* }
+ MS874 { JAVA }
+ x-windows-874 { JAVA }
+ # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match.
+
+ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update).
+ ibm-874 { IBM* JAVA }
+ ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update.
+ cp874 { JAVA* }
+ TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table
+ tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table
+ eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620
+ x-IBM874 { JAVA }
+
+ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update)
+ ibm-1162 { IBM* }
+
+windows-864-2000 { UTR22* }
+ ibm-864s
+ cp864s
+ x-IBM864S { JAVA* }
+
+# Platform codepages
+# If Java supports the IBM prefix, it should also support the ibm- prefix too.
+ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US
+ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } x-IBM720 { JAVA* } # PC Arabic
+ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } x-IBM737 { JAVA } # PC Greek
+ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic
+ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1
+ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro)
+ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update)
+ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update)
+ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } x-IBM856 { JAVA } # PC Hebrew implicit order
+ibm-857_P100-1995 { UTR22* } ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update)
+ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro
+ibm-860_P100-1995 { UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal
+ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland
+ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update)
+ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French
+ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update)
+ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic
+ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update)
+ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } x-IBM867 { JAVA* } # PC Hebrew (w/ euro update) Updated version of ibm-862
+ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu
+ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update)
+ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet
+ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # PC Baltic (w/ euro update), update of ibm-921
+ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922
+ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } x-IBM922 { JAVA } # PC Estonian (w/o euro update)
+ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same.
+ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813
+
+# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows.
+# cp is usually used to denote IBM in Java, and that is why we don't do that anymore.
+# The windows-* aliases mean windows codepages.
+ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update)
+ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). ANSI1251 is from Solaris
+ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update)
+ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update)
+ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update)
+ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update)
+ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } x-windows-1256S { JAVA } # Windows Arabic (w/ euro update)
+ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update)
+ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update)
+
+# These tables are out of date, and most don't have the Euro
+# Leave the windows- variants untagged. They are alternate tables of the newer ones above.
+ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update)
+ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update)
+ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro
+ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update)
+ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update)
+ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update)
+ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update)
+ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update)
+ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update)
+ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update)
+ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update)
+ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update)
+
+macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1
+macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr x-MacGreek { JAVA* } # Apple Greek
+macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy x-MacCyrillic { JAVA } x-MacUkraine { JAVA* } # Apple Cyrillic
+macos-21-10.5 { UTR22* } x-mac-thai { MIME* } x-MacThai { JAVA* } MacThai { JAVA }
+macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope x-MacCentralEurope { JAVA* } # Apple Central Europe
+macos-33-10.5 { UTR22* } x-mac-symbol { MIME* } x-MacSymbol { JAVA* } MacSymbol { JAVA }
+macos-34-10.2 { UTR22* } x-mac-dingbat { MIME* } x-MacDingbat { JAVA* } MacDingbat { JAVA }
+macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr x-MacTurkish { JAVA* } # Apple Turkish
+macos-36_2-10.2 { UTR22* } x-mac-croatian { MIME* } x-MacCroatian { JAVA* } MacCroatian { JAVA }
+macos-37_5-10.2 { UTR22* } x-mac-iceland { MIME* } x-MacIceland { JAVA* } MacIceland { JAVA }
+macos-38_2-10.2 { UTR22* } x-mac-romania { MIME* } x-MacRomania { JAVA* } MacRomania { JAVA }
+macos-518-10.2 { UTR22* } x-mac-arabic { MIME* } x-MacArabic { JAVA* } MacArabic { JAVA }
+macos-1285-10.2 { UTR22* } x-mac-hebrew { MIME* } x-MacHebrew { JAVA* } MacHebrew { JAVA }
+
+ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } x-roman8 { JAVA* } # HP Latin1
+ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276)
+
+ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } x-IBM1006 { JAVA } # Urdu
+ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } x-IBM1098 { JAVA } # PC Farsi
+ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } x-IBM1124 { JAVA } # ISO Cyrillic Ukraine
+ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC
+ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese
+ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC
+ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao
+
+# GSM 03.38
+gsm-03.38-2009 { UTR22* } GSM0338 # GSM0338 alias is from Perl
+
+# Partially algorithmic converters
+
+# [U_ENABLE_GENERIC_ISO_2022]
+# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8).
+# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file.
+# Language-specific variants of ISO-2022 continue to be available as listed below.
+# ISO_2022 ISO-2022
+
+ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } x-windows-iso2022jp { JAVA } x-windows-50220 { JAVA }
+ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS x-windows-50221 { JAVA* }
+ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA }
+ISO_2022,locale=ja,version=3 JIS7
+ISO_2022,locale=ja,version=4 JIS8
+ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949
+ISO_2022,locale=ko,version=1 ibm-25546 { IBM* }
+ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } x-ISO-2022-CN-GB { JAVA }
+ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* }
+ISO_2022,locale=zh,version=2 ISO-2022-CN-CNS x-ISO-2022-CN-CNS { JAVA* }
+HZ HZ-GB-2312 { IANA* }
+x11-compound-text COMPOUND_TEXT x-compound-text { JAVA* }
+
+ISCII,version=0 x-ISCII91 { JAVA* } x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols.
+ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows.
+ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur
+ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj
+ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori
+ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml
+ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg
+ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd
+ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm
+
+# Lotus specific
+LMBCS-1 lmbcs ibm-65025 { IBM* }
+
+# These Lotus specific converters still work, but they aren't advertised in this alias table.
+# These are almost never used outside of Lotus software,
+# and they take a lot of time when creating the available converter list.
+# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU.
+#LMBCS-2
+#LMBCS-3
+#LMBCS-4
+#LMBCS-5
+#LMBCS-6
+#LMBCS-8
+#LMBCS-11
+#LMBCS-16
+#LMBCS-17
+#LMBCS-18
+#LMBCS-19
+
+# EBCDIC codepages according to the CDRA
+
+# without Euro
+ibm-37_P100-1995 { UTR22* } # EBCDIC US
+ ibm-37 { IBM* }
+ IBM037 { IANA* JAVA }
+ ibm-037 # { JAVA }
+ ebcdic-cp-us { IANA JAVA }
+ ebcdic-cp-ca { IANA JAVA }
+ ebcdic-cp-wt { IANA JAVA }
+ ebcdic-cp-nl { IANA JAVA }
+ csIBM037 { IANA JAVA }
+ cp037 { JAVA* }
+ 037 { JAVA }
+ cpibm37 { JAVA }
+ cp37
+
+ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germany, Austria
+ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark
+ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden
+ibm-280_P100-1995 { UTR22* } ibm-280 { IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy
+ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain
+ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland
+ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana)
+ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France
+ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 { IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes)
+ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew
+ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1
+ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew
+ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias.
+ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2
+ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland
+ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } x-IBM875 { JAVA } # EBCDIC Greek
+ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu
+ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed.
+ ibm-930 { IBM* }
+ ibm-5026 { IBM } # Yes this is correct
+ IBM930 { JAVA }
+ cp930 { JAVA* }
+ 930 { JAVA }
+ x-IBM930 { JAVA }
+ x-IBM930A { JAVA }
+ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } x-IBM933 { JAVA } # Korea EBCDIC MIXED
+ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } x-IBM935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China.
+ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } x-IBM937 { JAVA } # Taiwan EBCDIC MIXED
+ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed.
+ ibm-939 { IBM* }
+ ibm-931 { IBM } # Yes this is correct
+ ibm-5035 { IBM } # Yes this is also correct
+ IBM939 { JAVA }
+ cp939 { JAVA* }
+ 939 { JAVA }
+ x-IBM939 { JAVA }
+ x-IBM939A { JAVA }
+ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } x-IBM1025 { JAVA } # EBCDIC Cyrillic
+ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey
+ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1
+ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } x-IBM1097 { JAVA } # EBCDIC Farsi
+ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } x-IBM1112 { JAVA } # EBCDIC Baltic
+ibm-1114_P100-2001 { UTR22* } ibm-1114 { IBM* } x-IBM1114 { JAVA* }
+ibm-1115_P100-1995 { UTR22* } ibm-1115 { IBM* } x-IBM1115 { JAVA* }
+ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } x-IBM1122 { JAVA } # EBCDIC Estonia
+ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } x-IBM1123 { JAVA } # EBCDIC Cyrillic Ukraine
+ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese
+ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao
+ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set)
+ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. Update of ibm-421
+
+# with Euro
+ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US
+ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germany, Austria
+ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark
+ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden
+ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy
+ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain
+ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland
+ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP01147 { IANA JAVA } cp1147 { JAVA* } ebcdic-fr-297+euro { IANA } # EBCDIC France
+ibm-1148_P100-1997 { UTR22* } ibm-1148 { IBM* } IBM01148 { IANA* JAVA } CCSID01148 { IANA JAVA } CP01148 { IANA JAVA } cp1148 { JAVA* } ebcdic-international-500+euro { IANA } # EBCDIC International Latin1
+ibm-1149_P100-1997 { UTR22* } ibm-1149 { IBM* } IBM01149 { IANA* JAVA } CCSID01149 { IANA JAVA } CP01149 { IANA JAVA } cp1149 { JAVA* } ebcdic-is-871+euro { IANA } # EBCDIC Iceland
+ibm-1153_P100-1999 { UTR22* } ibm-1153 { IBM* } IBM1153 { JAVA } x-IBM1153 { JAVA* } # EBCDIC latin 2
+ibm-1154_P100-1999 { UTR22* } ibm-1154 { IBM* } # EBCDIC Cyrillic Multilingual
+ibm-1155_P100-1999 { UTR22* } ibm-1155 { IBM* } # EBCDIC Turkey
+ibm-1156_P100-1999 { UTR22* } ibm-1156 { IBM* } # EBCDIC Baltic Multilingual
+ibm-1157_P100-1999 { UTR22* } ibm-1157 { IBM* } # EBCDIC Estonia
+ibm-1158_P100-1999 { UTR22* } ibm-1158 { IBM* } # EBCDIC Cyrillic Ukraine
+ibm-1160_P100-1999 { UTR22* } ibm-1160 { IBM* } # EBCDIC Thailand
+ibm-1164_P100-1999 { UTR22* } ibm-1164 { IBM* } # EBCDIC Viet Nam
+ibm-1364_P110-2007 { UTR22* } ibm-1364 { IBM* } x-IBM1364 { JAVA* } # Korean Host Mixed
+ibm-1370_P100-1999 { UTR22* } ibm-1370 { IBM* } x-IBM1370 { JAVA* }
+ibm-1371_P100-1999 { UTR22* } ibm-1371 { IBM* } x-IBM1371 { JAVA* } # Taiwan EBCDIC MIXED (Euro update of ibm-937)
+ibm-1388_P103-2001 { UTR22* } ibm-1388 { IBM* } ibm-9580 { IBM } x-IBM1388 { JAVA* } # S-Ch DBCS-Host Data GBK EBCDIC_STATEFUL. Yes ibm-9580 is an alias.
+ibm-1390_P110-2003 { UTR22* } ibm-1390 { IBM* } x-IBM1390 { JAVA* } # Japan EBCDIC MIXED (JIS X 0213)
+ibm-1399_P110-2003 { UTR22* } ibm-1399 { IBM* } x-IBM1399 { JAVA* } # Host MBCS (Latin-Kanji) (JIS X 0213)
+ibm-5123_P100-1999 { UTR22* } ibm-5123 { IBM* } # Host Roman Jis. Euro update of ibm-1027. SBCS portion of ibm-1390.
+ibm-8482_P100-1999 { UTR22* } ibm-8482 { IBM* } # host SBCS (Katakana). Euro update of ibm-290. SBCS portion of ibm-1399.
+# Yes ibm-20780 is the same as ibm-16684
+ibm-16684_P110-2003 { UTR22* } ibm-16684 { IBM* } ibm-20780 { IBM } # DBCS Jis + Roman Jis Host. This is the DBCS portion of ibm-1390 and ibm-1399 (JIS X 0213).
+ibm-4899_P100-1998 { UTR22* } ibm-4899 { IBM* } # Old EBCDIC Hebrew. Update of ibm-803
+ibm-4971_P100-1999 { UTR22* } ibm-4971 { IBM* } # EBCDIC Greek. Update of ibm-875 and superseded by ibm-9067
+ibm-9067_X100-2005 { UTR22* } ibm-9067 { IBM* } # EBCDIC Greek. Update of ibm-875 and ibm-4971
+ibm-12712_P100-1998 { UTR22* } ibm-12712 { IBM* } ebcdic-he # EBCDIC Hebrew (new sheqel, control characters update). Update of ibm-424
+ibm-16804_X110-1999 { UTR22* } ibm-16804 { IBM* } ebcdic-ar # EBCDIC Arabic. Update of ibm-420
+
+java-Cp1399A-1.6_P { UTR22* } x-IBM1399A { JAVA* }
+java-Cp420s-1.6_P { UTR22* } x-IBM420S { JAVA* }
+java-Cp1390A-1.6_P { UTR22* } x-IBM1390A { JAVA* }
+
+# EBCDIC codepages for S/390, with LF and NL codes swapped
+# Starting with ICU 2.4, the swapping is done by modifying the
+# normal tables at runtime instead of at build time.
+# Append UCNV_SWAP_LFNL_OPTION_STRING to the "ibm-CCSID" name to select this.
+#
+# Example: "ibm-1047,swaplfnl" or "ibm-1047" UCNV_SWAP_LFNL_OPTION_STRING
+#
+# This avoids the duplication of all EBCDIC SBCS and mixed-SBCS/DBCS
+# mapping files.
+
+# Some examples below for declaring old-style, obsolete aliases with the "-s390"
+# suffix to map to the new-style, recommended names with the option added.
+# These are listed here for backward compatibility.
+# Do not use these; instead use the normal converter name with the option
+# added as recommended above.
+
+# Note: It is not possible to define an alias (non-initial name in a line here)
+# that itself contains a converter option like this one for swapping LF<->NL.
+# Such names would never be found because ucnv_open() will first parse and strip
+# options before looking up a name in this table.
+# ucnv_open() then parses the lookup result (the canonical name on the left
+# in lines here) as well.
+
+# This also means that it is not necessary to add anything to convrtrs.txt
+# for converter names like "ibm-1026,swaplfnl" to work -
+# they are already covered by the normal option parsing together with the
+# regular, option-less alias elsewhere in this file.
+
+ibm-37_P100-1995,swaplfnl ibm-37-s390 # ibm037-s390 also matches ibm-37-s390
+ibm-924_P100-1998,swaplfnl ibm-924-s390 IBM924_LF { JAVA* }
+ibm-1047_P100-1995,swaplfnl ibm-1047-s390 IBM1047_LF { JAVA* }
+ibm-1140_P100-1997,swaplfnl ibm-1140-s390
+ibm-1141_P100-1997,swaplfnl ibm-1141-s390 IBM1141_LF { JAVA* }
+ibm-1142_P100-1997,swaplfnl ibm-1142-s390
+ibm-1143_P100-1997,swaplfnl ibm-1143-s390
+ibm-1144_P100-1997,swaplfnl ibm-1144-s390
+ibm-1145_P100-1997,swaplfnl ibm-1145-s390
+ibm-1146_P100-1997,swaplfnl ibm-1146-s390
+ibm-1147_P100-1997,swaplfnl ibm-1147-s390
+ibm-1148_P100-1997,swaplfnl ibm-1148-s390
+ibm-1149_P100-1997,swaplfnl ibm-1149-s390
+ibm-1153_P100-1999,swaplfnl ibm-1153-s390
+ibm-12712_P100-1998,swaplfnl ibm-12712-s390
+ibm-16804_X110-1999,swaplfnl ibm-16804-s390
+
+# This is a special version of ibm-1140 that the XML4C (Xerces) parser team
+# requested in 2000.
+# It maps both EBCDIC LF and NL controls to Unicode LF U+000A.
+
+ebcdic-xml-us
+
+# These are not installed by default. They are rarely used.
+# Many of them can be added through the online ICU Data Library Customization tool
+
+ibm-1004_P100-1995 { UTR22* } ibm-1004 { IBM* }
+ibm-1008_P100-1995 { UTR22* } ibm-1008 { IBM* } # cp1008, 8-bit Arabic (w/o euro update)
+ibm-1009_P100-1995 { UTR22* } ibm-1009 { IBM* }
+ibm-1010_P100-1995 { UTR22* } ibm-1010 { IBM* } NF_Z_62-010 { IANA* } iso-ir-69 { IANA } ISO646-FR { IANA } fr { IANA } csISO69French { IANA }
+ibm-1011_P100-1995 { UTR22* } ibm-1011 { IBM* } DIN_66003 { IANA* } iso-ir-21 { IANA } de { IANA } ISO646-DE { IANA } csISO21German { IANA }
+ibm-1012_P100-1995 { UTR22* } ibm-1012 { IBM* } IT { IANA* } iso-ir-15 { IANA } ISO646-IT { IANA } csISO15Italian { IANA }
+ibm-1013_P100-1995 { UTR22* } ibm-1013 { IBM* } BS_4730 { IANA* } iso-ir-4 { IANA } ISO646-GB { IANA } gb { IANA } uk { IANA } csISO4UnitedKingdom { IANA }
+ibm-1014_P100-1995 { UTR22* } ibm-1014 { IBM* } ES2 { IANA* } iso-ir-85 { IANA } ISO646-ES2 { IANA } csISO85Spanish2 { IANA }
+ibm-1015_P100-1995 { UTR22* } ibm-1015 { IBM* } PT2 { IANA* } iso-ir-84 { IANA } ISO646-PT2 { IANA } csISO84Portuguese2 { IANA }
+ibm-1016_P100-1995 { UTR22* } ibm-1016 { IBM* } NS_4551-1 { IANA* } iso-ir-60 { IANA } ISO646-NO { IANA } no { IANA } csISO60DanishNorwegian { IANA } csISO60Norwegian1 { IANA }
+ibm-1017_P100-1995 { UTR22* } ibm-1017 { IBM* }
+ibm-1018_P100-1995 { UTR22* } ibm-1018 { IBM* } SEN_850200_B { IANA* } iso-ir-10 { IANA } FI { IANA } ISO646-FI { IANA } ISO646-SE { IANA } se { IANA } csISO10Swedish { IANA }
+ibm-1019_P100-1995 { UTR22* } ibm-1019 { IBM* }
+ibm-1020_P100-2003 { UTR22* } ibm-1020 { IBM* } CSA_Z243.4-1985-1 { IANA* } iso-ir-121 { IANA } ISO646-CA { IANA } csa7-1 { IANA } ca { IANA } csISO121Canadian1 { IANA }
+ibm-1021_P100-2003 { UTR22* } ibm-1021 { IBM* }
+ibm-1023_P100-2003 { UTR22* } ibm-1023 { IBM* } ES { IANA* } iso-ir-17 { IANA } ISO646-ES { IANA } csISO17Spanish { IANA }
+ibm-1027_P100-1995 { UTR22* } ibm-1027 { IBM* } x-IBM1027 { JAVA* }
+ibm-1041_P100-1995 { UTR22* } ibm-1041 { IBM* } x-IBM1041 { JAVA* }
+ibm-1043_P100-1995 { UTR22* } ibm-1043 { IBM* } x-IBM1043 { JAVA* }
+ibm-1046_X110-1999 { UTR22* } ibm-1046 { IBM* } x-IBM1046 { JAVA* } x-IBM1046S { JAVA } # Arabic
+ibm-1088_P100-1995 { UTR22* } ibm-1088 { IBM* } x-IBM1088 { JAVA* }
+ibm-1100_P100-2003 { UTR22* } ibm-1100 { IBM* } DEC-MCS { IANA* } dec { IANA } csDECMCS { IANA }
+ibm-1101_P100-2003 { UTR22* } ibm-1101 { IBM* }
+ibm-1102_P100-2003 { UTR22* } ibm-1102 { IBM* }
+ibm-1103_P100-2003 { UTR22* } ibm-1103 { IBM* }
+ibm-1104_P100-2003 { UTR22* } ibm-1104 { IBM* } NF_Z_62-010_1973 iso-ir-25 { IANA* } ISO646-FR1 { IANA } csISO25French { IANA } # NF_Z_62-010_(1973) is the real IANA alias, but () aren't invariant characters.
+ibm-1105_P100-2003 { UTR22* } ibm-1105 { IBM* }
+ibm-1106_P100-2003 { UTR22* } ibm-1106 { IBM* }
+ibm-1107_P100-2003 { UTR22* } ibm-1107 { IBM* } DS_2089 { IANA* } ISO646-DK { IANA } dk { IANA } csISO646Danish { IANA }
+ibm-1127_P100-2004 { UTR22* } ibm-1127 { IBM* }
+ibm-1161_P100-1999 { UTR22* } ibm-1161 { IBM* } # Thai (Euro update of ibm-1129)
+ibm-1163_P100-1999 { UTR22* } ibm-1163 { IBM* } # Vietnamese
+ibm-1165_P101-2000 { UTR22* } ibm-1165 { IBM* } # Vietnamese (EBCDIC)
+ibm-1166_P100-2002 { UTR22* } ibm-1166 { IBM* } # Cyrillic for Kazakhstan
+ibm-1167_P100-2002 { UTR22* } ibm-1167 { IBM* } KOI8-RU x-KOI8_RU { JAVA* }
+ibm-1174_X100-2007 { UTR22* } ibm-1174 { IBM* } KZ-1048 { IANA* } STRK1048-2002 { IANA } RK1048 { IANA } csKZ1048 { IANA }
+ibm-1277_P100-1995 { UTR22* } ibm-1277 { IBM* } # Adobe (Postscript) Latin-1
+ibm-13125_P100-1997 { UTR22* } ibm-13125 { IBM* } # S-Ch (DBCS subset of ibm-4933, ibm-1388)
+ibm-13140_P101-2000 { UTR22* } ibm-13140 { IBM* }
+ibm-13218_P100-1996 { UTR22* } ibm-13218 { IBM* } # Japanese (EBCDIC update of ibm-930)
+ibm-1350_P110-1997 { UTR22* } ibm-1350 { IBM* } x-eucJP-Open { JAVA* } eucJP-Open { JAVA } # Japanese (EUC-JP variant)
+ibm-1351_P110-1997 { UTR22* } ibm-1351 { IBM* } x-IBM1351 { JAVA* } # Japanese (DBCS subset of ibm-5039)
+ibm-1362_P110-1999 { UTR22* } ibm-1362 { IBM* } x-IBM1362 { JAVA* } # Korean (DBCS subset of ibm-1363)
+ibm-13676_P102-2001 { UTR22* } ibm-13676 { IBM* } # Simplified Chinese (EBCDIC)
+ibm-1380_P100-1995 { UTR22* } ibm-1380 { IBM* } x-IBM1380 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1381)
+ibm-1381_P110-1999 { UTR22* } ibm-1381 { IBM* JAVA } cp1381 { JAVA* } 1381 { JAVA } x-IBM1381 { JAVA } # Simplified Chinese PC Data mixed (IBM GB)
+ibm-1382_P100-1995 { UTR22* } ibm-1382 { IBM* } x-IBM1382 { JAVA* } # Simplified Chinese (DBCS subset of ibm-1383)
+ibm-17221_P100-2001 { UTR22* } ibm-17221 { IBM* } # Simplified Chinese (EBCDIC)
+ibm-17248_X110-1999 { UTR22* } ibm-17248 { IBM* } # PC Arabic (w/ euro update) Updated version of ibm-864
+ibm-21344_P101-2000 { UTR22* } ibm-21344 { IBM* } # PC Arabic. Updated version of ibm-864
+ibm-21427_P100-1999 { UTR22* } ibm-21427 { IBM* } # Traditional Chinese (DBCS subset of ibm-1370)
+ibm-256_P100-1995 { UTR22* } ibm-256 { IBM* } # Latin 1 EBCDIC
+ibm-259_P100-1995 { UTR22* } ibm-259 { IBM* } IBM-Symbols { IANA* } csIBMSymbols { IANA }
+ibm-274_P100-2000 { UTR22* } ibm-274 { IBM* } IBM274 { IANA* } EBCDIC-BE { IANA } CP274 { IANA } csIBM274 { IANA }
+ibm-275_P100-1995 { UTR22* } ibm-275 { IBM* } IBM275 { IANA* } EBCDIC-BR { IANA } cp275 { IANA } csIBM275 { IANA }
+ibm-286_P100-2003 { UTR22* } ibm-286 { IBM* } EBCDIC-AT-DE-A { IANA* } csEBCDICATDEA { IANA }
+ibm-293_P100-1995 { UTR22* } ibm-293 { IBM* } # APL EBCDIC (APL: A Programming Language)
+ibm-300_P120-2006 { UTR22* } ibm-300 { IBM* } x-IBM300 { JAVA* } # Japanese (DBCS subset of ibm-930 and ibm-939)
+ibm-301_P110-1997 { UTR22* } ibm-301 { IBM* } x-IBM301 { JAVA* } # Japanese (DBCS subset of ibm-943)
+ibm-33058_P100-2000 { UTR22* } ibm-33058 { IBM* } # SBCS (Katakana)
+ibm-425_P101-2000 { UTR22* } ibm-425 { IBM* } # Arabic (EBCDIC)
+ibm-4930_P110-1999 { UTR22* } ibm-4930 { IBM* } # Korean (DBCS subset of ibm-1364)
+ibm-4933_P100-2002 { UTR22* } ibm-4933 { IBM* } # S-Ch (DBCS subset of ibm-1388)
+ibm-4948_P100-1995 { UTR22* } ibm-4948 { IBM* }
+ibm-4951_P100-1995 { UTR22* } ibm-4951 { IBM* }
+ibm-4952_P100-1995 { UTR22* } ibm-4952 { IBM* }
+ibm-4960_P100-1995 { UTR22* } ibm-4960 { IBM* }
+ibm-5039_P11A-1998 { UTR22* } ibm-5039 { IBM* } # Japanese (HP Shift-JIS variant)
+ibm-5048_P100-1995 { UTR22* } ibm-5048 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X208-1990)
+ibm-5049_P100-1995 { UTR22* } ibm-5049 { IBM* } # Japanese (DBCS subset of ibm-1350, JIS X212)
+ibm-5067_P100-1995 { UTR22* } ibm-5067 { IBM* } # Korean (DBCS subset of ibm-21450)
+ibm-5104_X110-1999 { UTR22* } ibm-5104 { IBM* } # cp1008, 8-bit Arabic (w/ euro update)
+ibm-5233_P100-2011 { UTR22* } ibm-5233 { IBM* } # Devanagari EBCDIC, including Indian Rupee
+ibm-806_P100-1998 { UTR22* } ibm-806 { IBM* } # Hindi (ISCII variant)
+ibm-808_P100-1999 { UTR22* } ibm-808 { IBM* } x-IBM808 { JAVA* } # Cyrillic
+ibm-833_P100-1995 { UTR22* } ibm-833 { IBM* } x-IBM833 { JAVA* }
+ibm-834_P100-1995 { UTR22* } ibm-834 { IBM* } x-IBM834 { JAVA* } # Korean (DBCS subset of ibm-933)
+ibm-835_P100-1995 { UTR22* } ibm-835 { IBM* } x-IBM835 { JAVA* } # Traditional Chinese (DBCS subset of ibm-5033)
+ibm-836_P100-1995 { UTR22* } ibm-836 { IBM* } x-IBM836 { JAVA* }
+ibm-837_P100-2011 { UTR22* } ibm-837 { IBM* } x-IBM837 { JAVA* } # Simplified Chinese (DBCS subset of ibm-5031)
+ibm-848_P100-1999 { UTR22* } ibm-848 { IBM* } # Cyrillic (euro update of ibm-1125)
+ibm-849_P100-1999 { UTR22* } ibm-849 { IBM* } # Cyrillic Belarus (euro update of ibm-1131)
+ibm-859_P100-1999 { UTR22* } ibm-859 { IBM* } x-IBM859 { JAVA* } # PC Latin 9 (w/ euro update)
+ibm-8612_P100-1995 { UTR22* } ibm-8612 { IBM* } # Arabic (EBCDIC update of ibm-420)
+ibm-872_P100-1999 { UTR22* } ibm-872 { IBM* } # Cyrillic (Euro update of ibm-855)
+ibm-880_P100-1995 { UTR22* } ibm-880 { IBM* } IBM880 { IANA* } cp880 { IANA } EBCDIC-Cyrillic { IANA } csIBM880 { IANA } windows-20880 { WINDOWS* }
+ibm-896_P100-1995 { UTR22* } ibm-896 { IBM* } # SBCS Katakana
+ibm-897_P100-1995 { UTR22* } ibm-897 { IBM* } JIS_X0201 { IANA* } X0201 { IANA } csHalfWidthKatakana { IANA } x-IBM897 { JAVA* }
+ibm-9027_P100-1999 { UTR22* } ibm-9027 { IBM* } # DBCS T-Ch Host. Euro update of ibm-835. DBCS portion of ibm-1371.
+ibm-9048_P100-1998 { UTR22* } ibm-9048 { IBM* } # Hebrew (Euro and Sequel update of ibm-856)
+ibm-905_P100-1995 { UTR22* } ibm-905 { IBM* } IBM905 { IANA* } CP905 { IANA } ebcdic-cp-tr { IANA } csIBM905 { IANA } windows-20905 { WINDOWS* }
+ibm-9056_P100-1995 { UTR22* } ibm-9056 { IBM* } # Arabic
+ibm-9061_P100-1999 { UTR22* } ibm-9061 { IBM* } # Greek (w/ euro update)
+ibm-9145_P110-1997 { UTR22* } ibm-9145 { IBM* } # Japanese (DBCS subset of ibm-5050)
+ibm-9238_X110-1999 { UTR22* } ibm-9238 { IBM* } # cp1046, PC Arabic Extended (w/ euro update)
+ibm-924_P100-1998 { UTR22* } ibm-924 { IBM* } IBM00924 { IANA* } CCSID00924 { IANA } CP00924 { IANA } ebcdic-Latin9--euro { IANA }
+ibm-926_P100-2000 { UTR22* } ibm-926 { IBM* } # Korean (DBCS subset of ibm-944)
+ibm-927_P100-1995 { UTR22* } ibm-927 { IBM* } x-IBM927 { JAVA* } # Traditional Chinese (DBCS subset of ibm-948)
+ibm-928_P100-1995 { UTR22* } ibm-928 { IBM* } # Simplified Chinese (DBCS subset of ibm-936)
+ibm-941_P13A-2001 { UTR22* } ibm-941 { IBM* } # DBCS portion of ibm-943
+ibm-944_P100-1995 { UTR22* } ibm-944 { IBM* } # Korean
+ibm-946_P100-1995 { UTR22* } ibm-946 { IBM* } # Simplified Chinese
+ibm-947_P100-1995 { UTR22* } ibm-947 { IBM* } x-IBM947 { JAVA* } # Traditional Chinese (DBCS subset of ibm-950)
+ibm-948_P110-1999 { UTR22* } ibm-948 { IBM* } x-IBM948 { JAVA* } # Traditional Chinese
+ibm-951_P100-1995 { UTR22* } ibm-951 { IBM* } x-IBM951 { JAVA* } # Korean (DBCS subset of ibm-949)
+ibm-952_P110-1997 { UTR22* } ibm-952 { IBM* } x-JIS0208 # Pure DBCS, Japanese EUC, G1 - JIS X208-1990
+ibm-953_P100-2000 { UTR22* } ibm-953 { IBM* } JIS_X0212-1990 { IANA* } # Pure DBCS, Japanese EUC, G3 - JIS X 0212-1990
+ibm-955_P110-1997 { UTR22* } ibm-955 { IBM* } # Pure DBCS, Japanese EUC, G0 - JIS X208-1978
+ibm-9577_P100-2001 { UTR22* } ibm-9577 { IBM* } ibm-1385 { IBM } x-IBM1385 { JAVA* } # ibm-9577 and ibm-1385 are identical DBCS tables.
+iso-8859_16-2001 { UTR22* } ISO-8859-16 { IANA* } iso-ir-226 { IANA } ISO_8859-16:2001 { IANA } latin10 { IANA } l10 { IANA }
+
+# To be considered for listing at a later date for the data library customization tool
+#ibm-1159_P100-1999 { UTR22* } ibm-1159 { IBM* } # SBCS T-Ch Host. Euro update of ibm-28709. This is used in combination with another CCSID mapping.
+#ibm-960_P100-2000 { UTR22* } ibm-960 { IBM* } # Pure DBCS, CNS11643 plane 1
+#ibm-963_P100-1995 { UTR22* } ibm-963 { IBM* } # Pure DBCS, CNS11643 plane 2 Traditional Chinese (DBCS subset of ibm-965)
--- /dev/null
+target
+corpus
+artifacts
+coverage
--- /dev/null
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anstream"
+version = "0.6.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
+dependencies = [
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.86"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
+
+[[package]]
+name = "arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
+
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
+
+[[package]]
+name = "bitflags"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+
+[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
+
+[[package]]
+name = "cc"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2"
+dependencies = [
+ "jobserver",
+ "libc",
+ "once_cell",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chrono"
+version = "0.4.38"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
+dependencies = [
+ "android-tzdata",
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "wasm-bindgen",
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "clap"
+version = "4.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
+
+[[package]]
+name = "crc32fast"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "encoding_rs"
+version = "0.8.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
+[[package]]
+name = "errno"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "finl_unicode"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6"
+
+[[package]]
+name = "flate2"
+version = "1.0.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "float_next_after"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hexplay"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898"
+dependencies = [
+ "atty",
+ "termcolor",
+]
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "wasm-bindgen",
+ "windows-core",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
+
+[[package]]
+name = "jobserver"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "libc"
+version = "0.2.155"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
+
+[[package]]
+name = "libfuzzer-sys"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
+dependencies = [
+ "arbitrary",
+ "cc",
+ "once_cell",
+]
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
+
+[[package]]
+name = "log"
+version = "0.4.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
+dependencies = [
+ "adler",
+]
+
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-derive"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
+[[package]]
+name = "ordered-float"
+version = "3.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.86"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "pspp"
+version = "1.0.0"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "chrono",
+ "clap",
+ "encoding_rs",
+ "finl_unicode",
+ "flate2",
+ "float_next_after",
+ "hexplay",
+ "indexmap",
+ "lazy_static",
+ "libc",
+ "num",
+ "num-derive",
+ "num-traits",
+ "ordered-float",
+ "thiserror",
+ "unicase",
+ "utf8-decode",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "pspp-fuzz"
+version = "0.0.0"
+dependencies = [
+ "libfuzzer-sys",
+ "pspp",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustix"
+version = "0.38.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "syn"
+version = "2.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "termcolor"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83"
+dependencies = [
+ "wincolor",
+]
+
+[[package]]
+name = "terminal_size"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
+dependencies = [
+ "rustix",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.61"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.61"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "unicase"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
+dependencies = [
+ "version_check",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
+[[package]]
+name = "utf8-decode"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wincolor"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
+dependencies = [
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- /dev/null
+# Fuzzing harness crate for `pspp`, managed by `cargo fuzz`.
+[package]
+name = "pspp-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+# Marks this crate as a cargo-fuzz target directory.
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+# The crate under test lives one directory up.
+[dependencies.pspp]
+path = ".."
+
+# Each fuzz target is its own binary; test/doc/bench are disabled because
+# these binaries are only meant to be run by the fuzzer.
+[[bin]]
+name = "fuzz_target_1"
+path = "fuzz_targets/fuzz_target_1.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "segment"
+path = "fuzz_targets/segment.rs"
+test = false
+doc = false
+bench = false
--- /dev/null
+// Placeholder libFuzzer target: accepts arbitrary bytes and does nothing yet.
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    // fuzzed code goes here
+});
--- /dev/null
+// Fuzz target that drives the lexer's segmenter over arbitrary UTF-8 input.
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pspp::lex::segment::{Segmenter, Mode, Type};
+
+fuzz_target!(|data: &[u8]| {
+    // Only valid UTF-8 is interesting to the segmenter; skip other inputs.
+    if let Ok(mut input) = std::str::from_utf8(data) {
+        let mut segmenter = Segmenter::new(Mode::Auto, false);
+        loop {
+            // `true` marks end-of-input. NOTE(review): the `unwrap` assumes
+            // `push` cannot fail when given complete input — confirm against
+            // the `Segmenter::push` API before fuzzing long-term.
+            let (rest, type_) = segmenter.push(input, true).unwrap();
+            match type_ {
+                Type::End => break,
+                _ => (),
+            }
+            // Continue segmenting the unconsumed remainder.
+            input = rest;
+        }
+    }
+});
--- /dev/null
+use std::{fmt::Write, sync::OnceLock};
+
+use flagset::{flags, FlagSet};
+
+use crate::{
+ integer::ToInteger,
+ lex::{
+ command_name::CommandMatcher,
+ lexer::Lexer,
+ token::{Punct, Token},
+ },
+ message::Diagnostic,
+};
+
+// Parser states, used as `FlagSet` members to declare in which states a
+// command is allowed to run (see `Command::allowed_states`).
+flags! {
+    enum State: u8 {
+        /// No active dataset yet defined.
+        Initial,
+
+        /// Active dataset has been defined.
+        Data,
+
+        /// Inside `INPUT PROGRAM`.
+        InputProgram,
+
+        /// Inside `FILE TYPE`.
+        FileType,
+
+        /// State nested inside `LOOP` or `DO IF`, inside [State::Data].
+        NestedData,
+
+        /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram].
+        NestedInputProgram,
+    }
+}
+
+/// A single command implementation plus the conditions under which it is
+/// available.
+struct Command {
+    // Parser states in which this command is legal.
+    allowed_states: FlagSet<State>,
+    // NOTE(review): the next three flags are not read anywhere in this file;
+    // presumably they gate availability elsewhere — confirm.
+    enhanced_only: bool,
+    testing_only: bool,
+    no_abbrev: bool,
+    name: &'static str,
+    // Action executed once the command name has been consumed.
+    run: Box<dyn Fn(&Context) + Send + Sync>,
+}
+
+/// Returns the global command table, built once on first use.
+///
+/// `OnceLock` makes the lazy initialization thread-safe; the table lives
+/// for the rest of the program.
+fn commands() -> &'static [Command] {
+    fn new_commands() -> Vec<Command> {
+        vec![Command {
+            allowed_states: State::Initial | State::Data,
+            enhanced_only: false,
+            testing_only: false,
+            no_abbrev: false,
+            name: "ECHO",
+            run: Box::new(|_context| {
+                println!("hi");
+            }),
+        }]
+    }
+
+    static COMMANDS: OnceLock<Vec<Command>> = OnceLock::new();
+    // Pass the function item directly rather than wrapping it in a closure
+    // (clippy::redundant_closure).
+    COMMANDS.get_or_init(new_commands).as_slice()
+}
+
+/// Appends the token at lookahead offset `n` to the accumulated command
+/// name `s`, returning true if the token can be part of a command name.
+///
+/// Words are joined with a space unless the previous character is `-`,
+/// in which case they join directly.
+fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool {
+    let separator = match s.chars().next_back() {
+        Some(c) if c != '-' => " ",
+        _ => "",
+    };
+
+    // NOTE(review): `lexer.next(n)` is assumed to be a non-consuming
+    // lookahead by token offset — confirm against `Lexer`'s API.
+    match lexer.next(n) {
+        Token::Punct(Punct::Dash) => {
+            s.push('-');
+            true
+        }
+        Token::Id(id) => {
+            write!(s, "{separator}{id}").unwrap();
+            true
+        }
+        // Non-negative integers may appear in command names; anything that
+        // is negative or not an exact integer ends the name.
+        Token::Number(number) if number.is_sign_positive() => {
+            if let Some(integer) = number.to_exact_usize() {
+                write!(s, "{separator}{integer}").unwrap();
+                true
+            } else {
+                false
+            }
+        }
+        _ => false,
+    }
+}
+
+/// Matches `s` against every known command name, returning the best match
+/// (if any) together with an `isize` that callers interpret as the number
+/// of words still missing for a complete match.
+fn find_best_match(s: &str) -> (Option<&'static Command>, isize) {
+    let mut cm = CommandMatcher::new(s);
+    for command in commands() {
+        cm.add(command.name, command);
+    }
+    cm.get_match()
+}
+
+/// Parses a (possibly multi-word) command name from the token stream.
+///
+/// On success, returns the matched command and the number of tokens that
+/// make up its name. On failure, reports a diagnostic through `error` and
+/// returns `Err(())`.
+fn parse_command_name(
+    lexer: &mut Lexer,
+    error: &Box<dyn Fn(Diagnostic)>,
+) -> Result<(&'static Command, isize), ()> {
+    let mut s = String::new();
+    let mut word = 0;
+    let mut missing_words = 0;
+    let mut command = None;
+    while parse_command_word(lexer, &mut s, word) {
+        (command, missing_words) = find_best_match(&s);
+        if missing_words <= 0 {
+            break;
+        }
+        word += 1;
+    }
+    if command.is_none() && missing_words > 0 {
+        // Retry once with a terminator appended, then restore `s`.
+        s.push_str(" .");
+        (command, missing_words) = find_best_match(&s);
+        s.truncate(s.len() - 2);
+    }
+
+    match command {
+        Some(command) => Ok((command, (word + 1) + missing_words)),
+        None => {
+            if s.is_empty() {
+                error(lexer.error("Syntax error expecting command name"))
+            } else {
+                // Bug fix: `{s}` was inside a plain string literal, so the
+                // message printed "{s}" verbatim; interpolate via `format!`.
+                error(lexer.error(&format!("Unknown command `{s}`.")))
+            };
+            Err(())
+        }
+    }
+}
+
+/// Outcome reported after a command is parsed.
+pub enum Success {
+    // Command parsed and handled normally.
+    Success,
+    // NOTE(review): `Eof` and `Finish` are never constructed in this file;
+    // their precise semantics must come from callers — confirm.
+    Eof,
+    Finish,
+}
+
+/// Checks that the current token ends the command.
+///
+/// Emits a diagnostic through the context and returns `Err(())` if further
+/// tokens remain before the command terminator.
+pub fn end_of_command(context: &Context) -> Result<Success, ()> {
+    match context.lexer.token() {
+        Token::EndCommand | Token::End => Ok(Success::Success),
+        _ => {
+            context.error(
+                context
+                    .lexer
+                    .error("Syntax error expecting end of command."),
+            );
+            Err(())
+        }
+    }
+}
+
+/// Parses and runs a single command while in parser state `_state`.
+///
+/// Unknown commands are reported through `error`; in every case the lexer
+/// is resynchronized past the remainder of the current command.
+fn parse_in_state(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>, _state: State) {
+    match lexer.token() {
+        Token::End | Token::EndCommand => (),
+        _ => {
+            if let Ok((command, n_tokens)) = parse_command_name(lexer, error) {
+                // Consume the tokens that formed the command name.
+                for _ in 0..n_tokens {
+                    lexer.get();
+                }
+                let context = Context {
+                    error,
+                    lexer,
+                    command_name: Some(command.name),
+                };
+                (command.run)(&context);
+                // Bug fix: `end_of_command` returns a `Result` that was
+                // silently ignored. It already reports its own diagnostic,
+                // so discarding the value is intentional; `let _ =` records
+                // that decision and silences the unused-`Result` warning.
+                let _ = end_of_command(&context);
+            }
+            lexer.interactive_reset();
+            lexer.discard_rest_of_command();
+        }
+    }
+    // Skip any trailing command terminators.
+    while let Token::EndCommand = lexer.token() {
+        lexer.get();
+    }
+}
+
+/// Entry point: parses commands starting from the initial (no active
+/// dataset) state.
+pub fn parse(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>) {
+    parse_in_state(lexer, error, State::Initial)
+}
+
+/// Everything a command implementation needs while it runs.
+pub struct Context<'a> {
+    // Diagnostic sink shared with the parser.
+    error: &'a Box<dyn Fn(Diagnostic)>,
+    lexer: &'a mut Lexer,
+    // Name of the command currently executing, if any.
+    command_name: Option<&'static str>,
+}
+
+impl<'a> Context<'a> {
+    /// Forwards `diagnostic` to the registered error callback.
+    pub fn error(&self, diagnostic: Diagnostic) {
+        (self.error)(diagnostic);
+    }
+}
--- /dev/null
+use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
+
+use crate::{
+ dictionary::{Dictionary, VarWidth, Variable},
+ encoding::Error as EncodingError,
+ endian::Endian,
+ format::{Error as FormatError, Format, UncheckedFormat},
+ identifier::{Error as IdError, Identifier},
+ raw::{
+ self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
+ FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
+ LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
+ NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord,
+ VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord,
+ VeryLongStringsRecord, ZHeader, ZTrailer,
+ },
+};
+use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
+use encoding_rs::Encoding;
+use num::Integer;
+use thiserror::Error as ThisError;
+
+pub use crate::raw::{CategoryLabels, Compression};
+
+/// Errors and warnings that can arise while decoding system-file headers.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Missing header record")]
+    MissingHeaderRecord,
+
+    // XXX this is an internal error
+    #[error("More than one file header record")]
+    DuplicateHeaderRecord,
+
+    #[error("{0}")]
+    EncodingError(EncodingError),
+
+    #[error("Using default encoding {0}.")]
+    UsingDefaultEncoding(String),
+
+    #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
+    InvalidVariableWidth { offsets: Range<u64>, width: i32 },
+
+    #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
+    InvalidLongMissingValueFormat,
+
+    // Bug fix: the message previously read "...in the expected format
+    // \"DD MMM YY\" format." -- the word "format" was duplicated.
+    #[error("File creation date {creation_date} is not in the expected \"DD MMM YY\" format. Using 01 Jan 1970.")]
+    InvalidCreationDate { creation_date: String },
+
+    // Bug fix: same duplicated "format" as above.
+    #[error("File creation time {creation_time} is not in the expected \"HH:MM:SS\" format. Using midnight.")]
+    InvalidCreationTime { creation_time: String },
+
+    #[error("{id_error} Renaming variable to {new_name}.")]
+    InvalidVariableName {
+        id_error: IdError,
+        new_name: Identifier,
+    },
+
+    #[error(
+        "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
+    )]
+    InvalidPrintFormat {
+        new_spec: Format,
+        variable: Identifier,
+        format_error: FormatError,
+    },
+
+    #[error(
+        "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
+    )]
+    InvalidWriteFormat {
+        new_spec: Format,
+        variable: Identifier,
+        format_error: FormatError,
+    },
+
+    #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
+    DuplicateVariableName {
+        duplicate_name: Identifier,
+        new_name: Identifier,
+    },
+
+    #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
+    InvalidDictIndex { dict_index: usize, max_index: usize },
+
+    #[error("Dictionary index {0} refers to a long string continuation.")]
+    DictIndexIsContinuation(usize),
+
+    #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
+    LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
+
+    #[error(
+        "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
+    )]
+    InvalidLongStringValueLabels {
+        offsets: Range<u64>,
+        variables: Vec<Identifier>,
+    },
+
+    #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
+    ValueLabelsDifferentTypes {
+        numeric_var: Identifier,
+        string_var: Identifier,
+    },
+
+    #[error("Invalid multiple response set name. {0}")]
+    InvalidMrSetName(IdError),
+
+    #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
+    UnknownMrSetVariable {
+        mr_set: Identifier,
+        short_name: Identifier,
+    },
+
+    #[error("Multiple response set {0} has no variables.")]
+    EmptyMrSet(Identifier),
+
+    #[error("Multiple response set {0} has only one variable.")]
+    OneVarMrSet(Identifier),
+
+    #[error("Multiple response set {0} contains both string and numeric variables.")]
+    MixedMrSet(Identifier),
+
+    #[error(
+        "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
+    )]
+    InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
+
+    #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
+    TooWideMDGroupCountedValue {
+        mr_set: Identifier,
+        value: String,
+        width: usize,
+        max_width: u16,
+    },
+
+    #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
+    InvalidLongValueLabelWidth {
+        name: Identifier,
+        width: u32,
+        min_width: u16,
+        max_width: u16,
+    },
+
+    #[error("Invalid attribute name. {0}")]
+    InvalidAttributeName(IdError),
+
+    #[error("Invalid short name in long variable name record. {0}")]
+    InvalidShortName(IdError),
+
+    #[error("Invalid name in long variable name record. {0}")]
+    InvalidLongName(IdError),
+
+    #[error("Invalid variable name in very long string record. {0}")]
+    InvalidLongStringName(IdError),
+
+    #[error("Invalid variable name in long string value label record. {0}")]
+    InvalidLongStringValueLabelName(IdError),
+
+    #[error("Invalid variable name in attribute record. {0}")]
+    InvalidAttributeVariableName(IdError),
+
+    // XXX This is risky because `text` might be arbitrarily long.
+    #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+    MalformedString { encoding: String, text: String },
+
+    #[error("Details TBD")]
+    TBD,
+}
+
+type DictIndex = usize;
+
+/// All records decoded from a system file, grouped by record type.
+///
+/// Record types that may repeat are `Vec`s; types expected at most once
+/// are `Option`s (see `Headers::new` for how duplicates are reduced).
+#[derive(Clone, Debug)]
+pub struct Headers {
+    pub header: HeaderRecord<String>,
+    pub variable: Vec<VariableRecord<String, String>>,
+    pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
+    pub document: Vec<DocumentRecord<String>>,
+    pub integer_info: Option<IntegerInfoRecord>,
+    pub float_info: Option<FloatInfoRecord>,
+    pub var_display: Option<VarDisplayRecord>,
+    pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
+    pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
+    pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
+    pub encoding: Option<EncodingRecord>,
+    pub number_of_cases: Option<NumberOfCasesRecord>,
+    pub variable_sets: Vec<VariableSetRecord>,
+    pub product_info: Option<ProductInfoRecord>,
+    pub long_names: Vec<LongNamesRecord>,
+    pub very_long_strings: Vec<VeryLongStringsRecord>,
+    pub file_attributes: Vec<FileAttributeRecord>,
+    pub variable_attributes: Vec<VariableAttributeRecord>,
+    pub other_extension: Vec<Extension>,
+    pub end_of_headers: Option<u32>,
+    pub z_header: Option<ZHeader>,
+    pub z_trailer: Option<ZTrailer>,
+    // Shared, mutable handle to the case data, if any was read.
+    pub cases: Option<Rc<RefCell<Cases>>>,
+}
+
+/// Returns the first element of `vec`, if any, invoking `more_than_one`
+/// first when the vector unexpectedly holds more than one element.
+fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
+where
+    F: FnOnce(),
+{
+    if vec.len() > 1 {
+        more_than_one();
+    }
+    vec.drain(..).next()
+}
+
+impl Headers {
+    /// Sorts raw decoded records into a `Headers` value.
+    ///
+    /// Exactly one file header record is required (`MissingHeaderRecord`
+    /// otherwise). Records expected at most once are reduced with
+    /// `take_first`, reporting duplicates through `warn` without failing.
+    pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
+        let mut file_header = Vec::new();
+        let mut variable = Vec::new();
+        let mut value_label = Vec::new();
+        let mut document = Vec::new();
+        let mut integer_info = Vec::new();
+        let mut float_info = Vec::new();
+        let mut var_display = Vec::new();
+        let mut multiple_response = Vec::new();
+        let mut long_string_value_labels = Vec::new();
+        let mut long_string_missing_values = Vec::new();
+        let mut encoding = Vec::new();
+        let mut number_of_cases = Vec::new();
+        let mut variable_sets = Vec::new();
+        let mut product_info = Vec::new();
+        let mut long_names = Vec::new();
+        let mut very_long_strings = Vec::new();
+        let mut file_attributes = Vec::new();
+        let mut variable_attributes = Vec::new();
+        let mut other_extension = Vec::new();
+        let mut end_of_headers = Vec::new();
+        let mut z_header = Vec::new();
+        let mut z_trailer = Vec::new();
+        let mut cases = Vec::new();
+
+        // Bucket every record by its type.
+        for header in headers {
+            match header {
+                DecodedRecord::Header(record) => {
+                    file_header.push(record);
+                }
+                DecodedRecord::Variable(record) => {
+                    variable.push(record);
+                }
+                DecodedRecord::ValueLabel(record) => {
+                    value_label.push(record);
+                }
+                DecodedRecord::Document(record) => {
+                    document.push(record);
+                }
+                DecodedRecord::IntegerInfo(record) => {
+                    integer_info.push(record);
+                }
+                DecodedRecord::FloatInfo(record) => {
+                    float_info.push(record);
+                }
+                DecodedRecord::VariableSets(record) => {
+                    variable_sets.push(record);
+                }
+                DecodedRecord::VarDisplay(record) => {
+                    var_display.push(record);
+                }
+                DecodedRecord::MultipleResponse(record) => {
+                    multiple_response.push(record);
+                }
+                DecodedRecord::LongStringValueLabels(record) => {
+                    long_string_value_labels.push(record)
+                }
+                DecodedRecord::LongStringMissingValues(record) => {
+                    long_string_missing_values.push(record);
+                }
+                DecodedRecord::Encoding(record) => {
+                    encoding.push(record);
+                }
+                DecodedRecord::NumberOfCases(record) => {
+                    number_of_cases.push(record);
+                }
+                DecodedRecord::ProductInfo(record) => {
+                    product_info.push(record);
+                }
+                DecodedRecord::LongNames(record) => {
+                    long_names.push(record);
+                }
+                DecodedRecord::VeryLongStrings(record) => {
+                    very_long_strings.push(record);
+                }
+                DecodedRecord::FileAttributes(record) => {
+                    file_attributes.push(record);
+                }
+                DecodedRecord::VariableAttributes(record) => {
+                    variable_attributes.push(record);
+                }
+                DecodedRecord::OtherExtension(record) => {
+                    other_extension.push(record);
+                }
+                DecodedRecord::EndOfHeaders(record) => {
+                    end_of_headers.push(record);
+                }
+                DecodedRecord::ZHeader(record) => {
+                    z_header.push(record);
+                }
+                DecodedRecord::ZTrailer(record) => {
+                    z_trailer.push(record);
+                }
+                DecodedRecord::Cases(record) => {
+                    cases.push(record);
+                }
+            }
+        }
+
+        // The file header is mandatory; a duplicate is a warning only.
+        let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
+        else {
+            return Err(Error::MissingHeaderRecord);
+        };
+
+        // NOTE(review): duplicate singleton records currently warn with the
+        // placeholder `Error::TBD` — replace with specific variants later.
+        Ok(Headers {
+            header: file_header,
+            variable,
+            value_label,
+            document,
+            integer_info: take_first(integer_info, || warn(Error::TBD)),
+            float_info: take_first(float_info, || warn(Error::TBD)),
+            var_display: take_first(var_display, || warn(Error::TBD)),
+            multiple_response,
+            long_string_value_labels,
+            long_string_missing_values,
+            encoding: take_first(encoding, || warn(Error::TBD)),
+            number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
+            variable_sets,
+            product_info: take_first(product_info, || warn(Error::TBD)),
+            long_names,
+            very_long_strings,
+            file_attributes,
+            variable_attributes,
+            other_extension,
+            end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
+            z_header: take_first(z_header, || warn(Error::TBD)),
+            z_trailer: take_first(z_trailer, || warn(Error::TBD)),
+            cases: take_first(cases, || warn(Error::TBD)),
+        })
+    }
+}
+
/// Data file info that doesn't fit in [Dictionary].
pub struct Metadata {
    /// File creation date and time, parsed from the header record.
    creation: NaiveDateTime,
    /// Byte order of the source file.
    endian: Endian,
    /// Compression scheme in use, if any.
    compression: Option<Compression>,
    /// Number of cases claimed by the header, if it recorded one.
    n_cases: Option<u64>,
    /// Product name taken from the header's eye-catcher string.
    product: String,
    /// Extra product info from the product-info record, if present,
    /// with line endings normalized.
    product_ext: Option<String>,
    /// Version triple from the integer-info record, if present.
    version: Option<(i32, i32, i32)>,
}
+
+impl Metadata {
+ fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
+ let header = &headers.header;
+ let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: header.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: header.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ let creation = NaiveDateTime::new(creation_date, creation_time);
+
+ let product = header
+ .eye_catcher
+ .trim_start_matches("@(#) SPSS DATA FILE")
+ .trim_end()
+ .to_string();
+
+ Self {
+ creation,
+ endian: header.endian,
+ compression: header.compression,
+ n_cases: header.n_cases.map(|n| n as u64),
+ product,
+ product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
+ version: headers.integer_info.as_ref().map(|ii| ii.version),
+ }
+ }
+}
+
/// State needed while decoding raw records into a [Dictionary].
struct Decoder {
    //pub raw: raw::Decoder,
    /// Character encoding used for all identifiers and strings in the file.
    pub encoding: &'static Encoding,
    //pub variables: HashMap<DictIndex, Variable>,
    //pub var_names: HashMap<Identifier, DictIndex>,
    //pub dictionary: Dictionary,
    //n_dict_indexes: usize,
    /// Counter used by `generate_name` to synthesize `VARnnn` names.
    n_generated_names: usize,
}
+
+impl Decoder {
+ fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
+ loop {
+ self.n_generated_names += 1;
+ let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding)
+ .unwrap();
+ if !dictionary.variables.contains(&name.0) {
+ return name;
+ }
+ assert!(self.n_generated_names < usize::MAX);
+ }
+ }
+}
+
/// Decodes raw `headers` into a [Dictionary] plus the file-level [Metadata]
/// that doesn't fit in a dictionary, reporting recoverable problems through
/// `warn`.
pub fn decode(
    mut headers: Headers,
    encoding: &'static Encoding,
    warn: impl Fn(Error),
) -> Result<(Dictionary, Metadata), Error> {
    let mut dictionary = Dictionary::new(encoding);

    // A file label padded entirely with spaces means "no label".
    let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
    if !file_label.is_empty() {
        dictionary.file_label = Some(file_label);
    }

    // Merge every file-attributes record into the dictionary's attribute map.
    for attributes in headers.file_attributes.drain(..) {
        dictionary.attributes.extend(attributes.0 .0.into_iter())
    }

    // Concatenate all the document records (really there should only be one)
    // and trim off the trailing spaces that pad them to 80 bytes.
    dictionary.documents = headers
        .document
        .drain(..)
        .flat_map(|record| record.lines)
        .map(trim_end_spaces)
        .collect();

    // XXX warn for weird integer format
    // XXX warn for weird floating-point format, etc.

    let mut decoder = Decoder {
        encoding,
        n_generated_names: 0,
    };

    // Build the dictionary's variables.  `var_index_map` maps the 0-based
    // position of each variable record in the file (long strings occupy one
    // record per 8-byte segment) to the variable's index in `dictionary`.
    // Note: the same iterator is advanced both here and inside the loop body
    // (to skip continuation records), so the two must stay in sync.
    let mut header_vars = headers.variable.iter().enumerate();
    let mut var_index_map = HashMap::new();
    while let Some((value_index, input)) = header_vars.next() {
        let name = trim_end_spaces(input.name.to_string());
        // Use the recorded name unless it is invalid or collides with a
        // variable already added; in either case fall back to a generated
        // name and warn.
        let name = match Identifier::from_encoding(&name, encoding) {
            Ok(name) => {
                if !dictionary.variables.contains(&name.0) {
                    name
                } else {
                    let new_name = decoder.generate_name(&dictionary);
                    warn(Error::DuplicateVariableName {
                        duplicate_name: name.clone(),
                        new_name: new_name.clone(),
                    });
                    new_name
                }
            }
            Err(id_error) => {
                let new_name = decoder.generate_name(&dictionary);
                warn(Error::InvalidVariableName {
                    id_error,
                    new_name: new_name.clone(),
                });
                new_name
            }
        };
        let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());

        // Set the short name the same as the long name (even if we renamed it).
        variable.short_names = vec![name];

        variable.label = input.label.clone();

        variable.missing_values = input.missing_values.clone();

        variable.print_format = decode_format(
            input.print_format,
            variable.width,
            |new_spec, format_error| {
                warn(Error::InvalidPrintFormat {
                    new_spec,
                    variable: variable.name.clone(),
                    format_error,
                })
            },
        );
        variable.write_format = decode_format(
            input.write_format,
            variable.width,
            |new_spec, format_error| {
                warn(Error::InvalidWriteFormat {
                    new_spec,
                    variable: variable.name.clone(),
                    format_error,
                })
            },
        );

        // Skip long string continuation records.  A string of width w takes
        // ceil(w / 8) records; every record after the first must report
        // width -1.
        if input.width > 0 {
            #[allow(unstable_name_collisions)]
            for _ in 1..input.width.div_ceil(&8) {
                if let Some((_, continuation)) = header_vars.next() {
                    if continuation.width == -1 {
                        continue;
                    }
                }
                // Continuation record missing or not marked as one.
                return Err(Error::TBD);
            }
        }

        let dict_index = dictionary.add_var(variable).unwrap();
        // Each record position must map to exactly one dictionary index.
        assert_eq!(var_index_map.insert(value_index, dict_index), None);
    }

    // Validate each value-label record against the variables it names.
    for record in headers.value_label.drain(..) {
        let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
        let mut continuation_indexes = Vec::new();
        let mut long_string_variables = Vec::new();
        for value_index in record.dict_indexes.iter() {
            // Record indexes in the file are 1-based; an index that isn't in
            // `var_index_map` points into a long-string continuation record.
            if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) {
                let variable = &dictionary.variables[*dict_index];
                if variable.width.is_long_string() {
                    // Long strings get their labels from a separate record
                    // type, so labels here are invalid.
                    long_string_variables.push(variable.name.clone());
                } else {
                    dict_indexes.push(*dict_index);
                }
            } else {
                continuation_indexes.push(*value_index);
            }
        }
        if !continuation_indexes.is_empty() {
            warn(Error::LongStringContinuationIndexes {
                offset: record.offsets.start,
                indexes: continuation_indexes,
            });
        }
        if !long_string_variables.is_empty() {
            warn(Error::InvalidLongStringValueLabels {
                offsets: record.offsets.clone(),
                variables: long_string_variables,
            });
        }

        for dict_index in dict_indexes {
            let mut variable = &dictionary.variables[dict_index];
            for ValueLabel { value, label } in record.labels.iter().cloned() {
                // TODO(review): unfinished — the labels are iterated but never
                // attached to `variable` (note the unused bindings and unused
                // `mut`); presumably each (value, label) pair should be stored
                // on the variable.  Confirm intended behavior.
            }
        }
    }

    let metadata = Metadata::decode(&headers, warn);
    Ok((dictionary, metadata))
}
+
/// Removes trailing ASCII spaces (only spaces, not other whitespace) from `s`
/// in place, reusing its allocation, and returns it.
fn trim_end_spaces(mut s: String) -> String {
    while s.ends_with(' ') {
        s.pop();
    }
    s
}
+
/// Returns a copy of `s` in which all lone CR and CR LF pairs have been
/// replaced by LF.
///
/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
/// files that use CR-only line ends in the file label and extra product info.)
fn fix_line_ends(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find('\r') {
        // Copy everything up to the CR, then emit a single LF for the CR (and
        // for the LF that may immediately follow it).
        out.push_str(&rest[..pos]);
        out.push('\n');
        rest = &rest[pos + 1..];
        rest = rest.strip_prefix('\n').unwrap_or(rest);
    }
    out.push_str(rest);
    out
}
+
+fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
+ UncheckedFormat::try_from(raw)
+ .and_then(Format::try_from)
+ .and_then(|x| x.check_width_compatibility(width))
+ .unwrap_or_else(|error| {
+ let new_format = Format::default_for_width(width);
+ warn(new_format, error);
+ new_format
+ })
+}
+
+/*
+impl Decoder {
+ fn generate_name(&mut self) -> Identifier {
+ loop {
+ self.n_generated_names += 1;
+ let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
+ .unwrap();
+ if !self.var_names.contains_key(&name) {
+ return name;
+ }
+ assert!(self.n_generated_names < usize::MAX);
+ }
+ }
+ fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
+ let (output, malformed) = self.encoding.decode_without_bom_handling(input);
+ if malformed {
+ warn(Error::MalformedString {
+ encoding: self.encoding.name().into(),
+ text: output.clone().into(),
+ });
+ }
+ output
+ }
+ fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
+ self.decode_string_cow(input, warn).into()
+ }
+ pub fn decode_identifier(
+ &self,
+ input: &[u8],
+ warn: &impl Fn(Error),
+ ) -> Result<Identifier, IdError> {
+ let s = self.decode_string_cow(input, warn);
+ Identifier::new(&s, self.encoding)
+ }
+ fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
+ let max_index = self.n_dict_indexes;
+ if dict_index == 0 || dict_index > max_index {
+ return Err(Error::InvalidDictIndex {
+ dict_index,
+ max_index,
+ });
+ }
+ let Some(variable) = self.variables.get(&(dict_index - 1)) else {
+ return Err(Error::DictIndexIsContinuation(dict_index));
+ };
+ Ok(variable)
+ }
+
+ /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+ /// re-encoding the result back into `self.encoding` will have exactly the
+ /// same length in bytes.
+ ///
+ /// XXX warn about errors?
+ fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+ if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+ // This is the common case. Usually there will be no errors.
+ s
+ } else {
+ // Unusual case. Don't bother to optimize it much.
+ let mut decoder = self.encoding.new_decoder_without_bom_handling();
+ let mut output = String::with_capacity(
+ decoder
+ .max_utf8_buffer_length_without_replacement(input.len())
+ .unwrap(),
+ );
+ let mut rest = input;
+ while !rest.is_empty() {
+ match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+ (DecoderResult::InputEmpty, _) => break,
+ (DecoderResult::OutputFull, _) => unreachable!(),
+ (DecoderResult::Malformed(a, b), consumed) => {
+ let skipped = a as usize + b as usize;
+ output.extend(repeat('?').take(skipped));
+ rest = &rest[consumed..];
+ }
+ }
+ }
+ assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+ output.into()
+ }
+ }
+}
+
+pub trait TryDecode: Sized {
+ type Input<'a>;
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error>;
+}
+
+pub trait Decode<Input>: Sized {
+ fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
+}
+
+impl<const N: usize> Decode<RawStr<N>> for String {
+ fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
+ decoder.decode_string(&input.0, &warn)
+ }
+}
+*/
+/*
+#[derive(Clone, Debug)]
+pub struct HeaderRecord {
+ pub eye_catcher: String,
+ pub weight_index: Option<usize>,
+ pub n_cases: Option<u64>,
+ pub creation: NaiveDateTime,
+ pub file_label: String,
+}
+
+fn trim_end_spaces(mut s: String) -> String {
+ s.truncate(s.trim_end_matches(' ').len());
+ s
+}
+
+/// Data file info that doesn't fit in [Dictionary].
+pub struct Metadata {
+ creation: NaiveDateTime,
+ endian: Endian,
+ compression: Option<Compression>,
+ n_cases: Option<u64>,
+ product: String,
+ product_ext: Option<String>,
+ version: Option<(i32, i32, i32)>,
+}
+
+impl Metadata {
+ fn decode(
+ header: &crate::raw::HeaderRecord<Cow<str>>,
+ integer_info: Option<&IntegerInfoRecord>,
+ product_ext: Option<&ProductInfoRecord>,
+ warn: impl Fn(Error),
+ ) -> Self {
+ let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: header.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: header.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ let creation = NaiveDateTime::new(creation_date, creation_time);
+
+ let product = header
+ .eye_catcher
+ .trim_start_matches("@(#) SPSS DATA FILE")
+ .trim_end()
+ .to_string();
+
+ Self {
+ creation,
+ endian: header.endian,
+ compression: header.compression,
+ n_cases: header.n_cases.map(|n| n as u64),
+ product,
+ product_ext: product_ext.map(|pe| pe.0.clone()),
+ version: integer_info.map(|ii| ii.version),
+ }
+ }
+}
+
+impl TryDecode for HeaderRecord {
+ type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
+
+ fn try_decode(
+ _decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
+ let file_label = trim_end_spaces(input.file_label.to_string());
+ let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationDate {
+ creation_date: input.creation_date.to_string(),
+ });
+ Default::default()
+ });
+ let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
+ .unwrap_or_else(|_| {
+ warn(Error::InvalidCreationTime {
+ creation_time: input.creation_time.to_string(),
+ });
+ Default::default()
+ });
+ Ok(Some(HeaderRecord {
+ eye_catcher,
+ weight_index: input.weight_index.map(|n| n as usize),
+ n_cases: input.n_cases.map(|n| n as u64),
+ creation: NaiveDateTime::new(creation_date, creation_time),
+ file_label,
+ }))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableRecord {
+ pub width: VarWidth,
+ pub name: Identifier,
+ pub print_format: Spec,
+ pub write_format: Spec,
+ pub missing_values: MissingValues<String>,
+ pub label: Option<String>,
+}
+
+
+fn parse_variable_record(
+ decoder: &mut Decoder,
+ input: &raw::VariableRecord<Cow<str>, String>,
+ warn: impl Fn(Error),
+) -> Result<(), Error> {
+ let width = match input.width {
+ 0 => VarWidth::Numeric,
+ w @ 1..=255 => VarWidth::String(w as u16),
+ -1 => return Ok(()),
+ _ => {
+ return Err(Error::InvalidVariableWidth {
+ offsets: input.offsets.clone(),
+ width: input.width,
+ })
+ }
+ };
+ let name = trim_end_spaces(input.name.to_string());
+ let name = match Identifier::new(&name, decoder.encoding) {
+ Ok(name) => {
+ if !decoder.var_names.contains_key(&name) {
+ name
+ } else {
+ let new_name = decoder.generate_name();
+ warn(Error::DuplicateVariableName {
+ duplicate_name: name.clone(),
+ new_name: new_name.clone(),
+ });
+ new_name
+ }
+ }
+ Err(id_error) => {
+ let new_name = decoder.generate_name();
+ warn(Error::InvalidVariableName {
+ id_error,
+ new_name: new_name.clone(),
+ });
+ new_name
+ }
+ };
+ let variable = Variable {
+ dict_index: decoder.n_dict_indexes,
+ short_name: name.clone(),
+ long_name: None,
+ width,
+ };
+ decoder.n_dict_indexes += width.n_dict_indexes();
+ assert!(decoder
+ .var_names
+ .insert(name.clone(), variable.dict_index)
+ .is_none());
+ assert!(decoder
+ .variables
+ .insert(variable.dict_index, variable)
+ .is_none());
+
+ let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
+ warn(Error::InvalidPrintFormat {
+ new_spec,
+ variable: name.clone(),
+ format_error,
+ })
+ });
+ let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
+ warn(Error::InvalidWriteFormat {
+ new_spec,
+ variable: name.clone(),
+ format_error,
+ })
+ });
+ let mut variable = dictionary::Variable::new(name, width);
+ variable.print_format = print_format;
+ variable.write_format = write_format;
+ variable.missing_values = input.missing_values.clone();
+ if let Some(ref label) = input.label {
+ variable.label = Some(label.to_string());
+ }
+ decoder.dictionary.add_var(variable).unwrap();
+ Ok(())
+}
+
+#[derive(Clone, Debug)]
+pub struct DocumentRecord(Vec<String>);
+
+impl TryDecode for DocumentRecord {
+ type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
+
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ Ok(Some(DocumentRecord(
+ input
+ .lines
+ .iter()
+ .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
+ .collect(),
+ )))
+ }
+}
+
+trait TextRecord
+where
+ Self: Sized,
+{
+ const NAME: &'static str;
+ fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+ pub name: String,
+ pub vars: Vec<String>,
+}
+
+impl VariableSet {
+ fn parse(input: &str) -> Result<Self, Error> {
+ let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
+ let vars = input.split_ascii_whitespace().map(String::from).collect();
+ Ok(VariableSet {
+ name: name.into(),
+ vars,
+ })
+ }
+}
+
+trait WarnOnError<T> {
+ fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
+}
+impl<T> WarnOnError<T> for Result<T, Error> {
+ fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct ValueLabel {
+ pub value: Value,
+ pub label: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct ValueLabelRecord {
+ pub var_type: VarType,
+ pub labels: Vec<ValueLabel>,
+ pub variables: Vec<Identifier>,
+}
+
+impl TryDecode for ValueLabelRecord {
+ type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<ValueLabelRecord>, Error> {
+ let variables: Vec<&Variable> = input
+ .dict_indexes
+ .iter()
+ .filter_map(|&dict_index| {
+ decoder
+ .get_var_by_index(dict_index as usize)
+ .warn_on_error(&warn)
+ })
+ .filter(|&variable| match variable.width {
+ VarWidth::String(width) if width > 8 => {
+ warn(Error::InvalidLongStringValueLabel(
+ variable.short_name.clone(),
+ ));
+ false
+ }
+ _ => true,
+ })
+ .collect();
+ let mut i = variables.iter();
+ let Some(&first_var) = i.next() else {
+ return Ok(None);
+ };
+ let var_type: VarType = first_var.width.into();
+ for &variable in i {
+ let this_type: VarType = variable.width.into();
+ if var_type != this_type {
+ let (numeric_var, string_var) = match var_type {
+ VarType::Numeric => (first_var, variable),
+ VarType::String => (variable, first_var),
+ };
+ warn(Error::ValueLabelsDifferentTypes {
+ numeric_var: numeric_var.short_name.clone(),
+ string_var: string_var.short_name.clone(),
+ });
+ return Ok(None);
+ }
+ }
+ let labels = input
+ .labels
+ .iter()
+ .map(|raw::ValueLabel { value, label }| {
+ let label = decoder.decode_string(&label.0, &warn);
+ let value = Value::decode(value, decoder);
+ ValueLabel { value, label }
+ })
+ .collect();
+ let variables = variables
+ .iter()
+ .map(|&variable| variable.short_name.clone())
+ .collect();
+ Ok(Some(ValueLabelRecord {
+ var_type,
+ labels,
+ variables,
+ }))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord(Vec<VariableSet>);
+
+impl TextRecord for VariableSetRecord {
+ const NAME: &'static str = "variable set";
+ fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut sets = Vec::new();
+ for line in input.lines() {
+ if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
+ sets.push(set)
+ }
+ }
+ Ok(VariableSetRecord(sets))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongName {
+ pub short_name: Identifier,
+ pub long_name: Identifier,
+}
+
+impl LongName {
+ fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
+ let short_name =
+ Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
+ let long_name =
+ Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
+ Ok(LongName {
+ short_name,
+ long_name,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongNameRecord(Vec<LongName>);
+
+impl LongNameRecord {
+ pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut names = Vec::new();
+ for pair in input.split('\t').filter(|s| !s.is_empty()) {
+ if let Some((short_name, long_name)) = pair.split_once('=') {
+ if let Some(long_name) =
+ LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
+ {
+ names.push(long_name);
+ }
+ } else {
+ warn(Error::TBD)
+ }
+ }
+ Ok(LongNameRecord(names))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+ pub short_name: Identifier,
+ pub length: u16,
+}
+
+impl VeryLongString {
+ fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
+ let Some((short_name, length)) = input.split_once('=') else {
+ return Err(Error::TBD);
+ };
+ let short_name =
+ Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
+ let length: u16 = length.parse().map_err(|_| Error::TBD)?;
+ if length > VarWidth::MAX_STRING {
+ return Err(Error::TBD);
+ }
+ Ok(VeryLongString { short_name, length })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongStringRecord(Vec<VeryLongString>);
+
+impl VeryLongStringRecord {
+ pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut very_long_strings = Vec::new();
+ for tuple in input
+ .split('\0')
+ .map(|s| s.trim_end_matches('\t'))
+ .filter(|s| !s.is_empty())
+ {
+ if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
+ very_long_strings.push(vls)
+ }
+ }
+ Ok(VeryLongStringRecord(very_long_strings))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Attribute {
+ pub name: Identifier,
+ pub values: Vec<String>,
+}
+
+impl Attribute {
+ fn parse<'a>(
+ decoder: &Decoder,
+ input: &'a str,
+ warn: &impl Fn(Error),
+ ) -> Result<(Option<Attribute>, &'a str), Error> {
+ let Some((name, mut input)) = input.split_once('(') else {
+ return Err(Error::TBD);
+ };
+ let mut values = Vec::new();
+ loop {
+ let Some((value, rest)) = input.split_once('\n') else {
+ return Err(Error::TBD);
+ };
+ if let Some(stripped) = value
+ .strip_prefix('\'')
+ .and_then(|value| value.strip_suffix('\''))
+ {
+ values.push(stripped.into());
+ } else {
+ warn(Error::TBD);
+ values.push(value.into());
+ }
+ if let Some(rest) = rest.strip_prefix(')') {
+ let attribute = Identifier::new(name, decoder.encoding)
+ .map_err(Error::InvalidAttributeName)
+ .warn_on_error(warn)
+ .map(|name| Attribute { name, values });
+ return Ok((attribute, rest));
+ };
+ input = rest;
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct AttributeSet(pub Vec<Attribute>);
+
+impl AttributeSet {
+ fn parse<'a>(
+ decoder: &Decoder,
+ mut input: &'a str,
+ sentinel: Option<char>,
+ warn: &impl Fn(Error),
+ ) -> Result<(AttributeSet, &'a str), Error> {
+ let mut attributes = Vec::new();
+ let rest = loop {
+ match input.chars().next() {
+ None => break input,
+ c if c == sentinel => break &input[1..],
+ _ => {
+ let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
+ if let Some(attribute) = attribute {
+ attributes.push(attribute);
+ }
+ input = rest;
+ }
+ }
+ };
+ Ok((AttributeSet(attributes), rest))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct FileAttributeRecord(AttributeSet);
+
+impl FileAttributeRecord {
+ pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
+ if !rest.is_empty() {
+ warn(Error::TBD);
+ }
+ Ok(FileAttributeRecord(set))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+ pub long_var_name: Identifier,
+ pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+ fn parse<'a>(
+ decoder: &Decoder,
+ input: &'a str,
+ warn: &impl Fn(Error),
+ ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
+ let Some((long_var_name, rest)) = input.split_once(':') else {
+ return Err(Error::TBD);
+ };
+ let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
+ let var_attribute = Identifier::new(long_var_name, decoder.encoding)
+ .map_err(Error::InvalidAttributeVariableName)
+ .warn_on_error(warn)
+ .map(|name| VarAttributeSet {
+ long_var_name: name,
+ attributes,
+ });
+ Ok((var_attribute, rest))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
+
+impl VariableAttributeRecord {
+ pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
+ let mut var_attribute_sets = Vec::new();
+ while !input.is_empty() {
+ let Some((var_attribute, rest)) =
+ VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
+ else {
+ break;
+ };
+ if let Some(var_attribute) = var_attribute {
+ var_attribute_sets.push(var_attribute);
+ }
+ input = rest;
+ }
+ Ok(VariableAttributeRecord(var_attribute_sets))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+ MultipleDichotomy {
+ value: Value,
+ labels: CategoryLabels,
+ },
+ MultipleCategory,
+}
+
+impl MultipleResponseType {
+ fn decode(
+ decoder: &Decoder,
+ mr_set: &Identifier,
+ input: &raw::MultipleResponseType,
+ min_width: VarWidth,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let mr_type = match input {
+ raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
+ let value = decoder.decode_string_cow(&value.0, warn);
+ let value = match min_width {
+ VarWidth::Numeric => {
+ let number: f64 = value.trim().parse().map_err(|_| {
+ Error::InvalidMDGroupCountedValue {
+ mr_set: mr_set.clone(),
+ number: value.into(),
+ }
+ })?;
+ Value::Number(Some(number.into()))
+ }
+ VarWidth::String(max_width) => {
+ let value = value.trim_end_matches(' ');
+ let width = value.len();
+ if width > max_width as usize {
+ return Err(Error::TooWideMDGroupCountedValue {
+ mr_set: mr_set.clone(),
+ value: value.into(),
+ width,
+ max_width,
+ });
+ };
+ Value::String(value.into())
+ }
+ };
+ MultipleResponseType::MultipleDichotomy {
+ value,
+ labels: *labels,
+ }
+ }
+ raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
+ };
+ Ok(mr_type)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+ pub name: Identifier,
+ pub min_width: VarWidth,
+ pub max_width: VarWidth,
+ pub label: String,
+ pub mr_type: MultipleResponseType,
+ pub dict_indexes: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+ fn decode(
+ decoder: &Decoder,
+ input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let mr_set_name = input.name.clone();
+ let mut dict_indexes = Vec::with_capacity(input.short_names.len());
+ for short_name in input.short_names.iter() {
+ let Some(&dict_index) = decoder.var_names.get(&short_name) else {
+ warn(Error::UnknownMrSetVariable {
+ mr_set: mr_set_name.clone(),
+ short_name: short_name.clone(),
+ });
+ continue;
+ };
+ dict_indexes.push(dict_index);
+ }
+
+ match dict_indexes.len() {
+ 0 => return Err(Error::EmptyMrSet(mr_set_name)),
+ 1 => return Err(Error::OneVarMrSet(mr_set_name)),
+ _ => (),
+ }
+
+ let Some((Some(min_width), Some(max_width))) = dict_indexes
+ .iter()
+ .map(|dict_index| decoder.variables[dict_index].width)
+ .map(|w| (Some(w), Some(w)))
+ .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
+ else {
+ return Err(Error::MixedMrSet(mr_set_name));
+ };
+
+ let mr_type =
+ MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
+
+ Ok(MultipleResponseSet {
+ name: mr_set_name,
+ min_width,
+ max_width,
+ label: input.label.to_string(),
+ mr_type,
+ dict_indexes,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
+
+impl TryDecode for MultipleResponseRecord {
+ type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
+
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let mut sets = Vec::with_capacity(input.0.len());
+ for set in &input.0 {
+ match MultipleResponseSet::decode(decoder, set, &warn) {
+ Ok(set) => sets.push(set),
+ Err(error) => warn(error),
+ }
+ }
+ Ok(Some(MultipleResponseRecord(sets)))
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabels {
+ pub var_name: Identifier,
+ pub width: VarWidth,
+ pub labels: Vec<ValueLabel>,
+}
+
+impl LongStringValueLabels {
+ fn decode(
+ decoder: &Decoder,
+ input: &raw::LongStringValueLabels<RawString>,
+ warn: &impl Fn(Error),
+ ) -> Result<Self, Error> {
+ let var_name = decoder.decode_string(&input.var_name.0, warn);
+ let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
+ .map_err(Error::InvalidLongStringValueLabelName)?;
+
+ let min_width = 9;
+ let max_width = VarWidth::MAX_STRING;
+ if input.width < 9 || input.width > max_width as u32 {
+ return Err(Error::InvalidLongValueLabelWidth {
+ name: var_name,
+ width: input.width,
+ min_width,
+ max_width,
+ });
+ }
+ let width = input.width as u16;
+
+ let mut labels = Vec::with_capacity(input.labels.len());
+ for (value, label) in input.labels.iter() {
+ let value = Value::String(decoder.decode_exact_length(&value.0).into());
+ let label = decoder.decode_string(&label.0, warn);
+ labels.push(ValueLabel { value, label });
+ }
+
+ Ok(LongStringValueLabels {
+ var_name,
+ width: VarWidth::String(width),
+ labels,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
+
+impl TryDecode for LongStringValueLabelRecord {
+ type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
+
+ fn try_decode(
+ decoder: &mut Decoder,
+ input: &Self::Input<'_>,
+ warn: impl Fn(Error),
+ ) -> Result<Option<Self>, Error> {
+ let mut labels = Vec::with_capacity(input.0.len());
+ for label in &input.0 {
+ match LongStringValueLabels::decode(decoder, label, &warn) {
+ Ok(set) => labels.push(set),
+ Err(error) => warn(error),
+ }
+ }
+ Ok(Some(LongStringValueLabelRecord(labels)))
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use encoding_rs::WINDOWS_1252;
+
+ #[test]
+ fn test() {
+ let mut s = String::new();
+ s.push(char::REPLACEMENT_CHARACTER);
+ let encoded = WINDOWS_1252.encode(&s).0;
+ let decoded = WINDOWS_1252.decode(&encoded[..]).0;
+ println!("{:?}", decoded);
+ }
+
+ #[test]
+ fn test2() {
+ let charset: Vec<u8> = (0..=255).collect();
+ println!("{}", charset.len());
+ let decoded = WINDOWS_1252.decode(&charset[..]).0;
+ println!("{}", decoded.len());
+ let encoded = WINDOWS_1252.encode(&decoded[..]).0;
+ println!("{}", encoded.len());
+ assert_eq!(&charset[..], &encoded[..]);
+ }
+}
+*/
--- /dev/null
+use std::{
+ cmp::Ordering,
+ collections::{HashMap, HashSet},
+ fmt::Debug,
+ ops::{Bound, RangeBounds},
+};
+
+use encoding_rs::Encoding;
+use indexmap::IndexSet;
+use num::integer::div_ceil;
+use ordered_float::OrderedFloat;
+use unicase::UniCase;
+
+use crate::{
+ format::Format,
+ identifier::{ByIdentifier, HasIdentifier, Identifier},
+ raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
+};
+
/// Position of a variable within a dictionary's variable list.
pub type DictIndex = usize;
+
/// The width of a variable: numeric, or a string of a particular length.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VarWidth {
    /// A numeric variable.
    Numeric,
    /// A string variable with the given width.
    String(u16),
}
+
+impl PartialOrd for VarWidth {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ match (self, other) {
+ (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
+ (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
+ _ => None,
+ }
+ }
+}
+
+impl VarWidth {
+    /// Maximum width of a string variable, in bytes.
+    pub const MAX_STRING: u16 = 32767;
+
+    /// Returns the number of dictionary indexes occupied by a variable of
+    /// this width: 1 for a numeric variable, one per 8 bytes (rounded up)
+    /// for a string.
+    pub fn n_dict_indexes(self) -> usize {
+        match self {
+            VarWidth::Numeric => 1,
+            VarWidth::String(w) => div_ceil(w as usize, 8),
+        }
+    }
+
+    /// Combines two optional widths with `f` when both are strings; a pair
+    /// of numeric widths stays numeric; any other combination yields `None`.
+    fn width_predicate(
+        a: Option<VarWidth>,
+        b: Option<VarWidth>,
+        f: impl Fn(u16, u16) -> u16,
+    ) -> Option<VarWidth> {
+        match (a, b) {
+            (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
+            (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
+                Some(VarWidth::String(f(a, b)))
+            }
+            _ => None,
+        }
+    }
+
+    /// Returns the wider of `self` and `other`:
+    /// - Numerical variable widths are equally wide.
+    /// - Longer strings are wider than shorter strings.
+    /// - Numerical and string types are incomparable, so result in `None`.
+    /// - Any `None` in the input yields `None` in the output.
+    pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+        Self::width_predicate(a, b, |a, b| a.max(b))
+    }
+
+    /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
+    pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
+        Self::width_predicate(a, b, |a, b| a.min(b))
+    }
+
+    /// Returns the default display width in columns: 8 for a numeric
+    /// variable, and the string width capped at 32 for a string variable.
+    pub fn default_display_width(&self) -> u32 {
+        match self {
+            VarWidth::Numeric => 8,
+            VarWidth::String(width) => *width.min(&32) as u32,
+        }
+    }
+
+    /// Converts a raw width value (0 for numeric, 1..=255 for a string of
+    /// that many bytes) into a `VarWidth`.  Any other value yields `Err(())`.
+    pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
+        let raw: i32 = raw.into();
+        match raw {
+            0 => Ok(Self::Numeric),
+            1..=255 => Ok(Self::String(raw as u16)),
+            _ => Err(()),
+        }
+    }
+
+    /// Returns true if this is a string wider than 8 bytes (a "long string").
+    pub fn is_long_string(&self) -> bool {
+        if let Self::String(width) = self {
+            *width > 8
+        } else {
+            false
+        }
+    }
+}
+
+impl From<VarWidth> for VarType {
+    /// Collapses a width to its coarse type: any string width becomes
+    /// `VarType::String`, and a numeric width becomes `VarType::Numeric`.
+    fn from(source: VarWidth) -> Self {
+        if let VarWidth::String(_) = source {
+            VarType::String
+        } else {
+            VarType::Numeric
+        }
+    }
+}
+
+/// A data value: a number (possibly absent) or a string.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Value {
+    /// A numeric value, `None` if absent.
+    Number(Option<OrderedFloat<f64>>),
+    /// A string value.
+    String(String),
+}
+
+impl Value {
+    /// Decodes a raw 8-byte value into a `Value`, converting a raw string
+    /// with `decoder`'s character encoding.
+    pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
+        match raw {
+            raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
+            raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
+        }
+    }
+}
+
+/// A data dictionary: the set of variables plus file-level metadata.
+#[derive(Clone, Debug)]
+pub struct Dictionary {
+    /// The variables, addressed by `DictIndex`, with unique
+    /// (case-insensitive) names.
+    pub variables: IndexSet<ByIdentifier<Variable>>,
+    /// Dictionary indexes of the split-file variables, if any.
+    pub split_file: Vec<DictIndex>,
+    /// Dictionary index of the weighting variable, if any.
+    pub weight: Option<DictIndex>,
+    /// Dictionary index of the filter variable, if any.
+    pub filter: Option<DictIndex>,
+    /// Maximum number of cases to read, if limited.
+    pub case_limit: Option<u64>,
+    /// File label, if any.
+    pub file_label: Option<String>,
+    /// Document lines attached to the file.
+    pub documents: Vec<String>,
+    /// Named vectors of variables.
+    pub vectors: HashSet<ByIdentifier<Vector>>,
+    /// File-level attributes, keyed by attribute name.
+    pub attributes: HashMap<Identifier, Vec<String>>,
+    /// Multiple-response sets.
+    pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
+    /// Named variable sets.
+    pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
+    /// Character encoding of the dictionary's text.
+    pub encoding: &'static Encoding,
+}
+
+/// Error returned by [`Dictionary::add_var`] when the new variable's name
+/// collides (case-insensitively) with an existing variable.
+#[derive(Debug)]
+pub struct DuplicateVariableName;
+
+impl Dictionary {
+    /// Creates an empty dictionary whose text uses `encoding`.
+    pub fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            variables: IndexSet::new(),
+            split_file: Vec::new(),
+            weight: None,
+            filter: None,
+            case_limit: None,
+            file_label: None,
+            documents: Vec::new(),
+            vectors: HashSet::new(),
+            attributes: HashMap::new(),
+            mrsets: HashSet::new(),
+            variable_sets: HashSet::new(),
+            encoding,
+        }
+    }
+
+    /// Appends `variable` to the dictionary and returns its dictionary
+    /// index, or `Err(DuplicateVariableName)` if a variable with the same
+    /// (case-insensitive) name already exists.
+    pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
+        let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
+        if inserted {
+            Ok(index)
+        } else {
+            Err(DuplicateVariableName)
+        }
+    }
+
+    /// Moves the variable at `from_index` so that it ends up at `to_index`,
+    /// shifting the variables in between, and rewrites every stored
+    /// dictionary index (weight, filter, split file, vectors, MR sets,
+    /// variable sets) to match.
+    pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
+        if from_index != to_index {
+            self.variables.move_index(from_index, to_index);
+            self.update_dict_indexes(&|index| {
+                #[allow(clippy::collapsible_else_if)]
+                if index == from_index {
+                    Some(to_index)
+                } else if from_index < to_index {
+                    // Moving forward: indexes in `(from_index, to_index]`
+                    // shift down by one.
+                    if index > from_index && index <= to_index {
+                        Some(index - 1)
+                    } else {
+                        Some(index)
+                    }
+                } else {
+                    // Moving backward: indexes in `[to_index, from_index)`
+                    // shift up by one.
+                    if index >= to_index && index < from_index {
+                        Some(index + 1)
+                    } else {
+                        Some(index)
+                    }
+                }
+            })
+        }
+    }
+
+    /// Deletes every variable for which `keep` returns false, rewriting all
+    /// stored dictionary indexes to account for the deletions.
+    pub fn retain_vars<F>(&mut self, keep: F)
+    where
+        F: Fn(&Variable) -> bool,
+    {
+        let mut deleted = Vec::new();
+        let mut index = 0;
+        self.variables.retain(|var_by_id| {
+            let keep = keep(&var_by_id.0);
+            if !keep {
+                deleted.push(index);
+            }
+            index += 1;
+            keep
+        });
+        if !deleted.is_empty() {
+            // `deleted` is sorted, so for a surviving variable the `Err`
+            // insertion point is the number of deleted variables that
+            // preceded it, which is exactly how far its index shifts down.
+            // (Returning `position` itself would map every survivor to its
+            // deletion count instead of its new index.)
+            self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
+                Ok(_) => None,
+                Err(position) => Some(index - position),
+            })
+        }
+    }
+
+    /// Deletes the variables whose dictionary indexes fall within `range`,
+    /// rewriting all stored dictionary indexes to account for the deletions.
+    pub fn delete_vars<R>(&mut self, range: R)
+    where
+        R: RangeBounds<DictIndex>,
+    {
+        let start = match range.start_bound() {
+            Bound::Included(&start) => start,
+            Bound::Excluded(&start) => start + 1,
+            Bound::Unbounded => 0,
+        };
+        let end = match range.end_bound() {
+            Bound::Included(&end) => end + 1,
+            Bound::Excluded(&end) => end,
+            Bound::Unbounded => self.variables.len(),
+        };
+        if end > start {
+            self.variables.drain(start..end);
+            self.update_dict_indexes(&|index| {
+                if index < start {
+                    Some(index)
+                } else if index < end {
+                    None
+                } else {
+                    // Survivors past the deleted range shift down by the
+                    // number of deleted variables, `end - start`.  (The
+                    // previous `index - end - start` subtracted `start`
+                    // instead of adding it back.)
+                    Some(index - (end - start))
+                }
+            })
+        }
+    }
+
+    /// Applies `f` to every dictionary index stored outside `variables`
+    /// itself, dropping any vector, MR set, or variable set left empty (or,
+    /// for MR sets, with fewer than two members) after the update.
+    fn update_dict_indexes<F>(&mut self, f: &F)
+    where
+        F: Fn(DictIndex) -> Option<DictIndex>,
+    {
+        update_dict_index_vec(&mut self.split_file, f);
+        self.weight = self.weight.and_then(f);
+        self.filter = self.filter.and_then(f);
+        self.vectors = self
+            .vectors
+            .drain()
+            .filter_map(|vector_by_id| {
+                vector_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(ByIdentifier::new)
+            })
+            .collect();
+        self.mrsets = self
+            .mrsets
+            .drain()
+            .filter_map(|mrset_by_id| {
+                mrset_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(ByIdentifier::new)
+            })
+            .collect();
+        self.variable_sets = self
+            .variable_sets
+            .drain()
+            .filter_map(|var_set_by_id| {
+                var_set_by_id
+                    .0
+                    .with_updated_dict_indexes(f)
+                    .map(ByIdentifier::new)
+            })
+            .collect();
+    }
+}
+
+/// Applies `f` to every dictionary index in `dict_indexes`, replacing each
+/// index with the value `f` returns and removing any index for which `f`
+/// returns `None`.
+fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
+where
+    F: Fn(DictIndex) -> Option<DictIndex>,
+{
+    dict_indexes.retain_mut(|index| match f(*index) {
+        Some(new) => {
+            *index = new;
+            true
+        }
+        None => false,
+    });
+}
+
+/// The role a variable plays in analysis.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
+pub enum Role {
+    Input,
+    Target,
+    Both,
+    None,
+    Partition,
+    Split,
+}
+
+impl Default for Role {
+    /// The default role is `Input`.
+    fn default() -> Self {
+        Self::Input
+    }
+}
+
+/// The class of a variable, determined by the first character of its name.
+pub enum DictClass {
+    /// An ordinary user variable.
+    Ordinary,
+    /// A system variable, whose name begins with `$`.
+    System,
+    /// A scratch variable, whose name begins with `#`.
+    Scratch,
+}
+
+impl DictClass {
+    /// Classifies `id` by its first character: `$` for system, `#` for
+    /// scratch, anything else for ordinary.
+    pub fn from_identifier(id: &Identifier) -> Self {
+        if id.0.starts_with('$') {
+            Self::System
+        } else if id.0.starts_with('#') {
+            Self::Scratch
+        } else {
+            Self::Ordinary
+        }
+    }
+
+    /// Returns true if variables of this class default to being left
+    /// unchanged between cases (only scratch variables do).
+    pub fn must_leave(self) -> bool {
+        match self {
+            DictClass::Ordinary => false,
+            DictClass::System => false,
+            DictClass::Scratch => true,
+        }
+    }
+}
+
+/// A variable in a [`Dictionary`].
+#[derive(Clone, Debug)]
+pub struct Variable {
+    /// Variable name, unique (case-insensitively) within its dictionary.
+    pub name: Identifier,
+    /// Numeric or string width.
+    pub width: VarWidth,
+    /// User-missing value definitions.
+    pub missing_values: MissingValues,
+    /// Format used to display values.
+    pub print_format: Format,
+    /// Format used to write values.
+    pub write_format: Format,
+    /// Labels for individual values.
+    pub value_labels: HashMap<Value, String>,
+    /// Descriptive label, if any.
+    pub label: Option<String>,
+    /// Measurement level, if known.
+    pub measure: Option<Measure>,
+    /// Role in analysis.
+    pub role: Role,
+    /// Display width in columns.
+    pub display_width: u32,
+    /// Display alignment.
+    pub alignment: Alignment,
+    /// Whether the value is left unchanged between cases.
+    pub leave: bool,
+    /// Short (8-byte era) names used in system files, if any.
+    pub short_names: Vec<Identifier>,
+    /// Variable-level attributes.
+    pub attributes: HashSet<ByIdentifier<Attribute>>,
+}
+
+impl Variable {
+    /// Creates a variable named `name` with width `width`, with all other
+    /// properties set to the defaults for that width (default print/write
+    /// formats, display width, alignment, and a `leave` flag derived from
+    /// the name's dictionary class).
+    pub fn new(name: Identifier, width: VarWidth) -> Self {
+        let var_type = VarType::from_width(width);
+        let leave = DictClass::from_identifier(&name).must_leave();
+        Self {
+            name,
+            width,
+            missing_values: MissingValues::default(),
+            print_format: Format::default_for_width(width),
+            write_format: Format::default_for_width(width),
+            value_labels: HashMap::new(),
+            label: None,
+            measure: Measure::default_for_type(var_type),
+            role: Role::default(),
+            display_width: width.default_display_width(),
+            alignment: Alignment::default_for_type(var_type),
+            leave,
+            short_names: Vec::new(),
+            attributes: HashSet::new(),
+        }
+    }
+}
+
+impl HasIdentifier for Variable {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// A named, ordered collection of variables, referenced by dictionary index.
+#[derive(Clone, Debug)]
+pub struct Vector {
+    pub name: Identifier,
+    pub variables: Vec<DictIndex>,
+}
+
+impl Vector {
+    /// Rewrites this vector's dictionary indexes with `f`, returning `None`
+    /// if no members remain.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for Vector {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// A named attribute with zero or more string values.
+#[derive(Clone, Debug)]
+pub struct Attribute {
+    pub name: Identifier,
+    pub values: Vec<String>,
+}
+
+impl HasIdentifier for Attribute {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// A multiple-response set: a named group of variables analyzed together.
+#[derive(Clone, Debug)]
+pub struct MultipleResponseSet {
+    pub name: Identifier,
+    pub label: String,
+    pub mr_type: MultipleResponseType,
+    /// Dictionary indexes of the member variables.
+    pub variables: Vec<DictIndex>,
+}
+
+impl MultipleResponseSet {
+    /// Rewrites this set's dictionary indexes with `f`, returning `None` if
+    /// fewer than two members remain (an MR set needs at least two).
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (self.variables.len() > 1).then_some(self)
+    }
+}
+
+impl HasIdentifier for MultipleResponseSet {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+/// The kind of multiple-response set.
+#[derive(Clone, Debug)]
+pub enum MultipleResponseType {
+    /// A multiple-dichotomy set, counting `value` in each member variable.
+    MultipleDichotomy {
+        value: Value,
+        labels: CategoryLabels,
+    },
+    /// A multiple-category set.
+    MultipleCategory,
+}
+
+/// A named set of variables, referenced by dictionary index.
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+    pub name: Identifier,
+    pub variables: Vec<DictIndex>,
+}
+
+impl VariableSet {
+    /// Rewrites this set's dictionary indexes with `f`, returning `None` if
+    /// no members remain.
+    fn with_updated_dict_indexes(
+        mut self,
+        f: impl Fn(DictIndex) -> Option<DictIndex>,
+    ) -> Option<Self> {
+        update_dict_index_vec(&mut self.variables, f);
+        (!self.variables.is_empty()).then_some(self)
+    }
+}
+
+impl HasIdentifier for VariableSet {
+    fn identifier(&self) -> &UniCase<String> {
+        &self.name.0
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::collections::HashSet;
+
+    use unicase::UniCase;
+
+    use crate::identifier::Identifier;
+
+    use super::{ByIdentifier, HasIdentifier};
+
+    /// A minimal stand-in for the real `Variable`, carrying just a name and
+    /// a distinguishing value.
+    #[derive(PartialEq, Eq, Debug, Clone)]
+    struct Variable {
+        name: Identifier,
+        value: i32,
+    }
+
+    impl HasIdentifier for Variable {
+        fn identifier(&self) -> &UniCase<String> {
+            &self.name.0
+        }
+    }
+
+    /// Checks that `ByIdentifier` compares and hashes by name only, so a
+    /// `HashSet` keyed by it deduplicates same-named variables and keeps the
+    /// first insertion.
+    #[test]
+    fn test() {
+        // Variables should not be the same if their values differ.
+        let abcd = Identifier::new("abcd").unwrap();
+        let abcd1 = Variable {
+            name: abcd.clone(),
+            value: 1,
+        };
+        let abcd2 = Variable {
+            name: abcd,
+            value: 2,
+        };
+        assert_ne!(abcd1, abcd2);
+
+        // But `ByName` should treat them the same.
+        let abcd1_by_name = ByIdentifier::new(abcd1);
+        let abcd2_by_name = ByIdentifier::new(abcd2);
+        assert_eq!(abcd1_by_name, abcd2_by_name);
+
+        // And a `HashSet` of `ByName` should also treat them the same.
+        let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
+        assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
+        assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
+        assert_eq!(
+            vars.get(&UniCase::new(String::from("abcd")))
+                .unwrap()
+                .0
+                .value,
+            1
+        );
+    }
+}
--- /dev/null
+use crate::locale_charset::locale_charset;
+use encoding_rs::{Encoding, UTF_8};
+
+include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
+
+/// Looks up the Windows code-page number for `encoding`, matched
+/// case-insensitively against the generated `CODEPAGE_NAME_TO_NUMBER` table.
+pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
+    CODEPAGE_NAME_TO_NUMBER
+        .get(encoding.to_ascii_lowercase().as_str())
+        .copied()
+}
+
+use thiserror::Error as ThisError;
+
+/// Errors that can occur while determining a system file's encoding.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
+    NoEncoding,
+
+    #[error("This system file encodes text strings with unknown code page {0}.")]
+    UnknownCodepage(i32),
+
+    #[error("This system file encodes text strings with unknown encoding {0}.")]
+    UnknownEncoding(String),
+
+    #[error("This system file is encoded in EBCDIC, which is not supported.")]
+    Ebcdic,
+}
+
+/// Returns the encoding named by the system locale's character set, falling
+/// back to UTF-8 when that name is not a recognized encoding label.  The
+/// result is computed once and cached for the lifetime of the process.
+///
+/// Uses `std::sync::OnceLock` instead of the `lazy_static!` macro, which was
+/// not imported in this module.
+pub fn default_encoding() -> &'static Encoding {
+    static DEFAULT_ENCODING: std::sync::OnceLock<&'static Encoding> = std::sync::OnceLock::new();
+    DEFAULT_ENCODING
+        .get_or_init(|| Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8))
+}
+
+/// Resolves a system file's character encoding from an explicit encoding
+/// name (preferred) or a numeric character code.  Codepage 1 (EBCDIC) is
+/// rejected outright; 2 and 3 are treated as "no encoding declared"; 4 maps
+/// to Shift-JIS (`MS_KANJI`); anything else is looked up in the generated
+/// codepage table.  Fails with [`Error::NoEncoding`] when neither input is
+/// given.
+pub fn get_encoding(
+    encoding: Option<&str>,
+    character_code: Option<i32>,
+) -> Result<&'static Encoding, Error> {
+    let label = if let Some(encoding) = encoding {
+        encoding
+    } else if let Some(codepage) = character_code {
+        match codepage {
+            1 => return Err(Error::Ebcdic),
+            2 | 3 => {
+                // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+                // respectively. However, many files have character code 2 but
+                // data which are clearly not ASCII. Therefore, ignore these
+                // values.
+                return Err(Error::NoEncoding);
+            }
+            4 => "MS_KANJI",
+            _ => CODEPAGE_NUMBER_TO_NAME
+                .get(&codepage)
+                .copied()
+                .ok_or(Error::UnknownCodepage(codepage))?,
+        }
+    } else {
+        return Err(Error::NoEncoding);
+    };
+
+    Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
+}
--- /dev/null
+/// The endianness for integer and floating-point numbers in SPSS system files.
+///
+/// SPSS system files can declare IBM 370 and DEC VAX floating-point
+/// representations, but no file that uses either of these has ever been found
+/// in the wild, so this code does not handle them.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Endian {
+    /// Big-endian: MSB at lowest address.
+    Big,
+
+    /// Little-endian: LSB at lowest address.
+    Little,
+}
+
+impl Endian {
+    /// The byte order of the machine this code runs on.
+    #[cfg(target_endian = "big")]
+    pub const NATIVE: Endian = Endian::Big;
+    #[cfg(target_endian = "little")]
+    pub const NATIVE: Endian = Endian::Little;
+
+    /// Infers a byte order by checking which interpretation of `bytes` as a
+    /// `u32` equals `expected_value`.  Returns `None` when neither, or both,
+    /// interpretations match.
+    pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
+        let big_matches = u32::from_be_bytes(bytes) == expected_value;
+        let little_matches = u32::from_le_bytes(bytes) == expected_value;
+        match (big_matches, little_matches) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None,
+        }
+    }
+
+    /// Infers a byte order by checking which interpretation of `bytes` as an
+    /// `f64` equals `expected_value`.  Returns `None` when neither, or both,
+    /// interpretations match.
+    pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
+        let big_matches = f64::from_be_bytes(bytes) == expected_value;
+        let little_matches = f64::from_le_bytes(bytes) == expected_value;
+        match (big_matches, little_matches) {
+            (true, false) => Some(Endian::Big),
+            (false, true) => Some(Endian::Little),
+            _ => None,
+        }
+    }
+}
+
+/// Converts a native value of type `T` into its `N`-byte representation in
+/// the chosen byte order.
+pub trait ToBytes<T, const N: usize> {
+    /// Returns `value` serialized in `self`'s byte order.
+    fn to_bytes(self, value: T) -> [u8; N];
+}
+impl ToBytes<i64, 8> for Endian {
+    fn to_bytes(self, value: i64) -> [u8; 8] {
+        match self {
+            Endian::Big => i64::to_be_bytes(value),
+            Endian::Little => i64::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<u32, 4> for Endian {
+    fn to_bytes(self, value: u32) -> [u8; 4] {
+        match self {
+            Endian::Big => u32::to_be_bytes(value),
+            Endian::Little => u32::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<i32, 4> for Endian {
+    fn to_bytes(self, value: i32) -> [u8; 4] {
+        match self {
+            Endian::Big => i32::to_be_bytes(value),
+            Endian::Little => i32::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<u16, 2> for Endian {
+    fn to_bytes(self, value: u16) -> [u8; 2] {
+        match self {
+            Endian::Big => u16::to_be_bytes(value),
+            Endian::Little => u16::to_le_bytes(value),
+        }
+    }
+}
+impl ToBytes<u8, 1> for Endian {
+    // A single byte has the same representation in either byte order.
+    fn to_bytes(self, value: u8) -> [u8; 1] {
+        [value]
+    }
+}
+impl ToBytes<f64, 8> for Endian {
+    fn to_bytes(self, value: f64) -> [u8; 8] {
+        match self {
+            Endian::Big => f64::to_be_bytes(value),
+            Endian::Little => f64::to_le_bytes(value),
+        }
+    }
+}
+
+/// Parses an `N`-byte slice in one of the supported formats into native format
+/// as type `T`.
+pub trait Parse<T, const N: usize> {
+    /// Given 'bytes', returns `T`.
+    fn parse(self, bytes: [u8; N]) -> T;
+}
+impl Parse<u64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> u64 {
+        match self {
+            Endian::Big => u64::from_be_bytes(bytes),
+            Endian::Little => u64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> u32 {
+        match self {
+            Endian::Big => u32::from_be_bytes(bytes),
+            Endian::Little => u32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> u16 {
+        match self {
+            Endian::Big => u16::from_be_bytes(bytes),
+            Endian::Little => u16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<u8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> u8 {
+        match self {
+            Endian::Big => u8::from_be_bytes(bytes),
+            Endian::Little => u8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> i64 {
+        match self {
+            Endian::Big => i64::from_be_bytes(bytes),
+            Endian::Little => i64::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i32, 4> for Endian {
+    fn parse(self, bytes: [u8; 4]) -> i32 {
+        match self {
+            Endian::Big => i32::from_be_bytes(bytes),
+            Endian::Little => i32::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i16, 2> for Endian {
+    fn parse(self, bytes: [u8; 2]) -> i16 {
+        match self {
+            Endian::Big => i16::from_be_bytes(bytes),
+            Endian::Little => i16::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<i8, 1> for Endian {
+    fn parse(self, bytes: [u8; 1]) -> i8 {
+        match self {
+            Endian::Big => i8::from_be_bytes(bytes),
+            Endian::Little => i8::from_le_bytes(bytes),
+        }
+    }
+}
+impl Parse<f64, 8> for Endian {
+    fn parse(self, bytes: [u8; 8]) -> f64 {
+        match self {
+            Endian::Big => f64::from_be_bytes(bytes),
+            Endian::Little => f64::from_le_bytes(bytes),
+        }
+    }
+}
--- /dev/null
+use crate::{
+ command::parse,
+ lex::{lexer::{Lexer, Source}, token::Token},
+ message::Diagnostic,
+};
+
+/// A minimal driver that feeds syntax sources through a [`Lexer`] and parses
+/// the resulting token stream one command at a time.
+pub struct Engine {
+    lexer: Lexer,
+}
+
+impl Engine {
+    /// Creates an engine whose lexer reports errors by printing them to
+    /// stdout.
+    fn new() -> Self {
+        Self {
+            lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))),
+        }
+    }
+    /// Appends `source` to the lexer and parses commands until the token
+    /// stream is exhausted, printing each diagnostic to stdout.
+    fn run(&mut self, source: Source) {
+        self.lexer.append(source);
+        // NOTE(review): `get()` appears to advance to the first token so
+        // `parse` sees a valid current token — confirm against `Lexer::get`.
+        self.lexer.get();
+        while self.lexer.token() != &Token::End {
+            let error: Box<dyn Fn(Diagnostic)> = Box::new(|diagnostic| {
+                println!("{diagnostic}");
+            });
+            parse(&mut self.lexer, &error);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use encoding_rs::UTF_8;
+
+    use crate::lex::{
+        lexer::{ErrorHandling, Source},
+        segment::Mode,
+    };
+
+    use super::Engine;
+
+    /// Smoke test: runs two ECHO commands through the engine end to end.
+    /// (Asserts nothing; passes as long as lexing and parsing do not panic.)
+    #[test]
+    fn test_echo() {
+        let mut engine = Engine::new();
+        engine.run(Source::for_file_contents(
+            "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
+            Some("test.sps".to_string()),
+            UTF_8,
+            Mode::default(),
+            ErrorHandling::default(),
+        ));
+    }
+}
--- /dev/null
+use std::{
+ fmt::{Display, Formatter, Result as FmtResult},
+ ops::RangeInclusive,
+};
+
+use enum_map::{Enum, EnumMap};
+use thiserror::Error as ThisError;
+
+use crate::{
+ dictionary::VarWidth,
+ raw::{self, VarType},
+};
+
+/// Errors arising from invalid or incompatible output formats.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Unknown format type {value}.")]
+    UnknownFormat { value: u16 },
+
+    #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)]
+    OddWidthNotAllowed(UncheckedFormat),
+
+    #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())]
+    BadWidth(UncheckedFormat),
+
+    #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)]
+    DecimalsNotAllowedForFormat(UncheckedFormat),
+
+    #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)]
+    DecimalsNotAllowedForWidth(UncheckedFormat),
+
+    #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)]
+    TooManyDecimalsForWidth {
+        spec: UncheckedFormat,
+        max_d: Decimals,
+    },
+
+    #[error("String variable is not compatible with numeric format {0}.")]
+    UnnamedVariableNotCompatibleWithNumericFormat(Type),
+
+    #[error("Numeric variable is not compatible with string format {0}.")]
+    UnnamedVariableNotCompatibleWithStringFormat(Type),
+
+    #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
+    NamedStringVariableBadSpecWidth {
+        variable: String,
+        width: Width,
+        bad_spec: Format,
+        good_spec: Format,
+    },
+
+    #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
+    UnnamedStringVariableBadSpecWidth {
+        width: Width,
+        bad_spec: Format,
+        good_spec: Format,
+    },
+}
+
+/// Broad families of format types.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum Category {
+    // Numeric formats.
+    Basic,
+    Custom,
+    Legacy,
+    Binary,
+    Hex,
+    Date,
+    Time,
+    DateComponent,
+
+    // String formats.
+    String,
+}
+
+impl From<Type> for Category {
+    /// Maps each format type to its [`Category`].
+    fn from(source: Type) -> Self {
+        match source {
+            Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic,
+            Type::CC(_) => Self::Custom,
+            Type::N | Type::Z => Self::Legacy,
+            Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary,
+            Type::PIBHex | Type::RBHex => Self::Hex,
+            Type::Date
+            | Type::ADate
+            | Type::EDate
+            | Type::JDate
+            | Type::SDate
+            | Type::QYr
+            | Type::MoYr
+            | Type::WkYr
+            | Type::DateTime
+            | Type::YMDHMS => Self::Date,
+            Type::MTime | Type::Time | Type::DTime => Self::Time,
+            Type::WkDay | Type::Month => Self::DateComponent,
+            Type::A | Type::AHex => Self::String,
+        }
+    }
+}
+
+/// One of the five custom currency formats, `CCA` through `CCE`.
+#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash)]
+pub enum CC {
+    A,
+    B,
+    C,
+    D,
+    E,
+}
+
+impl Display for CC {
+    /// Writes the custom currency letter, `A` through `E`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        f.write_str(match self {
+            CC::A => "A",
+            CC::B => "B",
+            CC::C => "C",
+            CC::D => "D",
+            CC::E => "E",
+        })
+    }
+}
+
+/// An output format type, e.g. `F` for standard decimal numbers or `A` for
+/// strings.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub enum Type {
+    // Basic numeric formats.
+    F,
+    Comma,
+    Dot,
+    Dollar,
+    Pct,
+    E,
+
+    // Custom currency formats.
+    CC(CC),
+
+    // Legacy numeric formats.
+    N,
+    Z,
+
+    // Binary and hexadecimal formats.
+    P,
+    PK,
+    IB,
+    PIB,
+    PIBHex,
+    RB,
+    RBHex,
+
+    // Time and date formats.
+    Date,
+    ADate,
+    EDate,
+    JDate,
+    SDate,
+    QYr,
+    MoYr,
+    WkYr,
+    DateTime,
+    YMDHMS,
+    MTime,
+    Time,
+    DTime,
+
+    // Date component formats.
+    WkDay,
+    Month,
+
+    // String formats.
+    A,
+    AHex,
+}
+
+/// A format width, in columns (or bytes, for binary formats).
+pub type Width = u16;
+/// A signed counterpart of [`Width`], used for intermediate arithmetic that
+/// may go negative.
+pub type SignedWidth = i16;
+
+/// A count of decimal places in a format.
+pub type Decimals = u8;
+
+impl Type {
+    /// Returns the maximum valid width for this format type.
+    pub fn max_width(self) -> Width {
+        match self {
+            Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16,
+            Self::IB | Self::PIB | Self::RB => 8,
+            Self::A => 32767,
+            Self::AHex => 32767 * 2,
+            _ => 40,
+        }
+    }
+
+    /// Returns the minimum valid width for this format type.
+    pub fn min_width(self) -> Width {
+        match self {
+            // Basic numeric formats.
+            Self::F => 1,
+            Self::Comma => 1,
+            Self::Dot => 1,
+            Self::Dollar => 2,
+            Self::Pct => 2,
+            Self::E => 6,
+
+            // Custom currency formats.
+            Self::CC(_) => 2,
+
+            // Legacy numeric formats.
+            Self::N => 1,
+            Self::Z => 1,
+
+            // Binary and hexadecimal formats.
+            Self::P => 1,
+            Self::PK => 1,
+            Self::IB => 1,
+            Self::PIB => 1,
+            Self::PIBHex => 2,
+            Self::RB => 2,
+            Self::RBHex => 4,
+
+            // Time and date formats.
+            Self::Date => 9,
+            Self::ADate => 8,
+            Self::EDate => 8,
+            Self::JDate => 5,
+            Self::SDate => 8,
+            Self::QYr => 6,
+            Self::MoYr => 6,
+            Self::WkYr => 8,
+            Self::DateTime => 17,
+            Self::YMDHMS => 16,
+            Self::MTime => 5,
+            Self::Time => 5,
+            Self::DTime => 8,
+
+            // Date component formats.
+            Self::WkDay => 2,
+            Self::Month => 3,
+
+            // String formats.
+            Self::A => 1,
+            Self::AHex => 2,
+        }
+    }
+
+    /// Returns the full range of valid widths for this format type.
+    pub fn width_range(self) -> RangeInclusive<Width> {
+        self.min_width()..=self.max_width()
+    }
+
+    /// Returns the maximum number of decimal places this format type allows
+    /// at width `width` (0..=16; 0 for types that take no decimals).
+    pub fn max_decimals(self, width: Width) -> Decimals {
+        // Signed arithmetic below may go negative before the final clamp.
+        let width = width.clamp(1, 40) as SignedWidth;
+        let max = match self {
+            Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1,
+            Self::Dollar | Self::Pct => width - 2,
+            Self::E => width - 7,
+            Self::N | Self::Z => width,
+            Self::P => width * 2 - 1,
+            Self::PK => width * 2,
+            Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth,
+            Self::PIBHex => 0,
+            Self::RB | Self::RBHex => 16,
+            Self::Date
+            | Self::ADate
+            | Self::EDate
+            | Self::JDate
+            | Self::SDate
+            | Self::QYr
+            | Self::MoYr
+            | Self::WkYr => 0,
+            Self::DateTime => width - 21,
+            Self::YMDHMS => width - 20,
+            Self::MTime => width - 6,
+            Self::Time => width - 9,
+            Self::DTime => width - 12,
+            Self::WkDay | Self::Month | Self::A | Self::AHex => 0,
+        };
+        max.clamp(0, 16) as Decimals
+    }
+
+    /// Returns true if this format type can ever take decimal places (at any
+    /// width).
+    pub fn takes_decimals(self) -> bool {
+        self.max_decimals(Width::MAX) > 0
+    }
+
+    /// Returns this type's [`Category`].
+    pub fn category(self) -> Category {
+        self.into()
+    }
+
+    /// Returns the granularity of valid widths: 2 for hex formats and
+    /// `AHEX` (which encode one byte per two columns), otherwise 1.
+    pub fn width_step(self) -> Width {
+        if self.category() == Category::Hex || self == Self::AHex {
+            2
+        } else {
+            1
+        }
+    }
+
+    /// Clamps `width` into this type's valid range, rounding down to the
+    /// type's width step.
+    pub fn clamp_width(self, width: Width) -> Width {
+        let (min, max) = self.width_range().into_inner();
+        let width = width.clamp(min, max);
+        if self.width_step() == 2 {
+            width / 2 * 2
+        } else {
+            width
+        }
+    }
+
+    /// Returns whether this format applies to string or numeric variables.
+    pub fn var_type(self) -> VarType {
+        match self {
+            Self::A | Self::AHex => VarType::String,
+            _ => VarType::Numeric,
+        }
+    }
+
+    /// Checks whether this format is valid for a variable with the given
+    /// `var_type`.
+    pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
+        let my_type = self.var_type();
+        match (my_type, var_type) {
+            (VarType::Numeric, VarType::String) => {
+                Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
+            }
+            (VarType::String, VarType::Numeric) => {
+                Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
+            }
+            _ => Ok(()),
+        }
+    }
+}
+
+impl Display for Type {
+    /// Writes the format type's syntax keyword, e.g. `COMMA` or `DATETIME`.
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        let s = match self {
+            Self::F => "F",
+            Self::Comma => "COMMA",
+            Self::Dot => "DOT",
+            Self::Dollar => "DOLLAR",
+            Self::Pct => "PCT",
+            Self::E => "E",
+            Self::CC(cc) => return write!(f, "{}", cc),
+            Self::N => "N",
+            Self::Z => "Z",
+            Self::P => "P",
+            Self::PK => "PK",
+            Self::IB => "IB",
+            Self::PIB => "PIB",
+            Self::PIBHex => "PIBHEX",
+            Self::RB => "RB",
+            Self::RBHex => "RBHEX",
+            Self::Date => "DATE",
+            Self::ADate => "ADATE",
+            Self::EDate => "EDATE",
+            Self::JDate => "JDATE",
+            Self::SDate => "SDATE",
+            Self::QYr => "QYR",
+            Self::MoYr => "MOYR",
+            Self::WkYr => "WKYR",
+            Self::DateTime => "DATETIME",
+            Self::YMDHMS => "YMDHMS",
+            Self::MTime => "MTIME",
+            Self::Time => "TIME",
+            Self::DTime => "DTIME",
+            Self::WkDay => "WKDAY",
+            Self::Month => "MONTH",
+            Self::A => "A",
+            Self::AHex => "AHEX",
+        };
+        write!(f, "{}", s)
+    }
+}
+
+/// Returns the number of decimal digits that a binary integer of `bytes`
+/// bytes can always represent (capped at 20 for anything wider than 7 bytes).
+fn max_digits_for_bytes(bytes: usize) -> usize {
+    const DIGITS: [usize; 8] = [0, 3, 5, 8, 10, 13, 15, 17];
+    DIGITS.get(bytes).copied().unwrap_or(20)
+}
+
+/// A validated output format: a type, width, and decimal count known to be
+/// mutually consistent (construct via `TryFrom<UncheckedFormat>` or
+/// [`Format::fixed_from`]).
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct Format {
+    type_: Type,
+    w: Width,
+    d: Decimals,
+}
+
+impl Format {
+    /// `F40.0`, the widest basic numeric format.
+    pub const F40: Format = Format {
+        type_: Type::F,
+        w: 40,
+        d: 0,
+    };
+
+    /// `F8.2`, the default numeric format.
+    pub const F8_2: Format = Format {
+        type_: Type::F,
+        w: 8,
+        d: 2,
+    };
+
+    /// Returns the format type.
+    pub fn format(self) -> Type {
+        self.type_
+    }
+    /// Returns the width.
+    pub fn w(self) -> Width {
+        self.w
+    }
+    /// Returns the number of decimal places.
+    pub fn d(self) -> Decimals {
+        self.d
+    }
+
+    /// Returns the default format for a variable of width `var_width`:
+    /// `F8.2` for numeric, `Aw` for a string of width `w`.
+    pub fn default_for_width(var_width: VarWidth) -> Self {
+        match var_width {
+            VarWidth::Numeric => Format {
+                type_: Type::F,
+                w: 8,
+                d: 2,
+            },
+            VarWidth::String(w) => Format {
+                type_: Type::A,
+                w,
+                d: 0,
+            },
+        }
+    }
+
+    /// Builds a valid `Format` from `source` by adjusting it rather than
+    /// failing: the width is clamped into range, then widened if necessary
+    /// to accommodate the requested decimals, and the decimal count is
+    /// clamped to what the final width allows.
+    pub fn fixed_from(source: &UncheckedFormat) -> Self {
+        let UncheckedFormat {
+            type_: format,
+            w,
+            d,
+        } = *source;
+        let (min, max) = format.width_range().into_inner();
+        let mut w = w.clamp(min, max);
+        if d <= format.max_decimals(Width::MAX) {
+            // The decimals are attainable at some width; widen until they fit.
+            while d > format.max_decimals(w) {
+                w += 1;
+                assert!(w <= 40);
+            }
+        }
+        let d = d.clamp(0, format.max_decimals(w));
+        Self {
+            type_: format,
+            w,
+            d,
+        }
+    }
+
+    /// Returns the variable width implied by this format: `A` formats are
+    /// one byte per column, `AHEX` one byte per two columns, and all other
+    /// formats are numeric.
+    pub fn var_width(self) -> VarWidth {
+        match self.type_ {
+            Type::A => VarWidth::String(self.w),
+            Type::AHex => VarWidth::String(self.w / 2),
+            _ => VarWidth::Numeric,
+        }
+    }
+
+    /// Returns whether this format applies to string or numeric variables.
+    pub fn var_type(self) -> VarType {
+        self.type_.var_type()
+    }
+
+    /// Checks whether this format specification is valid for a variable with
+    /// width `var_width`.
+    pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
+        // Verify that the format is right for the variable's type.
+        self.type_.check_type_compatibility(var_width.into())?;
+
+        if let VarWidth::String(w) = var_width {
+            if var_width != self.var_width() {
+                let bad_spec = self;
+                // Suggest the same format type resized to fit the variable.
+                let good_spec = if self.type_ == Type::A {
+                    Format { w, ..self }
+                } else {
+                    Format { w: w * 2, ..self }
+                };
+                return Err(Error::UnnamedStringVariableBadSpecWidth {
+                    width: w,
+                    bad_spec,
+                    good_spec,
+                });
+            }
+        }
+
+        Ok(self)
+    }
+}
+
+impl Display for Format {
+    /// Writes the format as syntax, e.g. `F8.2` or `A10` (the decimal count
+    /// is omitted for types that never take decimals when it is zero).
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        write!(f, "{}{}", self.type_, self.w)?;
+        if self.type_.takes_decimals() || self.d > 0 {
+            write!(f, ".{}", self.d)?;
+        }
+        Ok(())
+    }
+}
+
+impl TryFrom<UncheckedFormat> for Format {
+    type Error = Error;
+
+    /// Validates `source`: its width must be a multiple of the type's width
+    /// step and within the type's range, and its decimal count must not
+    /// exceed the maximum for that type and width.
+    fn try_from(source: UncheckedFormat) -> Result<Self, Self::Error> {
+        let UncheckedFormat {
+            type_: format,
+            w,
+            d,
+        } = source;
+        let max_d = format.max_decimals(w);
+        if w % format.width_step() != 0 {
+            Err(Error::OddWidthNotAllowed(source))
+        } else if !format.width_range().contains(&w) {
+            Err(Error::BadWidth(source))
+        } else if d > max_d {
+            // Pick the most specific decimals error.  (This condition was
+            // previously inverted: the "never takes decimals" error applies
+            // exactly when the type does NOT take decimals.)
+            if !format.takes_decimals() {
+                // This type never allows decimals at any width.
+                Err(Error::DecimalsNotAllowedForFormat(source))
+            } else if max_d > 0 {
+                // Decimals are allowed, just not this many at this width.
+                Err(Error::TooManyDecimalsForWidth {
+                    spec: source,
+                    max_d,
+                })
+            } else {
+                // Decimals are allowed in general, but not at width `w`.
+                Err(Error::DecimalsNotAllowedForWidth(source))
+            }
+        } else {
+            Ok(Format {
+                type_: format,
+                w,
+                d,
+            })
+        }
+    }
+}
+
+impl TryFrom<u16> for Type {
+    type Error = Error;
+
+    /// Converts a system-file numeric format code into a format [`Type`];
+    /// unassigned codes yield [`Error::UnknownFormat`].
+    fn try_from(source: u16) -> Result<Self, Self::Error> {
+        match source {
+            1 => Ok(Self::A),
+            2 => Ok(Self::AHex),
+            3 => Ok(Self::Comma),
+            4 => Ok(Self::Dollar),
+            5 => Ok(Self::F),
+            6 => Ok(Self::IB),
+            7 => Ok(Self::PIBHex),
+            8 => Ok(Self::P),
+            9 => Ok(Self::PIB),
+            10 => Ok(Self::PK),
+            11 => Ok(Self::RB),
+            12 => Ok(Self::RBHex),
+            15 => Ok(Self::Z),
+            16 => Ok(Self::N),
+            17 => Ok(Self::E),
+            20 => Ok(Self::Date),
+            21 => Ok(Self::Time),
+            22 => Ok(Self::DateTime),
+            23 => Ok(Self::ADate),
+            24 => Ok(Self::JDate),
+            25 => Ok(Self::DTime),
+            26 => Ok(Self::WkDay),
+            27 => Ok(Self::Month),
+            28 => Ok(Self::MoYr),
+            29 => Ok(Self::QYr),
+            30 => Ok(Self::WkYr),
+            31 => Ok(Self::Pct),
+            32 => Ok(Self::Dot),
+            33 => Ok(Self::CC(CC::A)),
+            34 => Ok(Self::CC(CC::B)),
+            35 => Ok(Self::CC(CC::C)),
+            36 => Ok(Self::CC(CC::D)),
+            37 => Ok(Self::CC(CC::E)),
+            38 => Ok(Self::EDate),
+            39 => Ok(Self::SDate),
+            40 => Ok(Self::MTime),
+            41 => Ok(Self::YMDHMS),
+            _ => Err(Error::UnknownFormat { value: source }),
+        }
+    }
+}
+
+/// A format as read from a file, not yet validated: the type, width, and
+/// decimal count may be mutually inconsistent.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct UncheckedFormat {
+    pub type_: Type,
+
+    pub w: Width,
+
+    pub d: Decimals,
+}
+
+impl TryFrom<raw::Spec> for UncheckedFormat {
+    type Error = Error;
+
+    /// Unpacks a raw format spec word: bits 16.. hold the format type code,
+    /// bits 8..16 the width, and bits 0..8 the decimal count.
+    fn try_from(raw: raw::Spec) -> Result<Self, Self::Error> {
+        let raw = raw.0;
+        let raw_format = (raw >> 16) as u16;
+        let format = raw_format.try_into()?;
+        let w = ((raw >> 8) & 0xff) as Width;
+        let d = (raw & 0xff) as Decimals;
+        Ok(Self {
+            type_: format,
+            w,
+            d,
+        })
+    }
+}
+
+impl Display for UncheckedFormat {
+    /// Writes the format as syntax, in the same form as [`Format`].
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        write!(f, "{}{}", self.type_, self.w)?;
+        if self.type_.takes_decimals() || self.d > 0 {
+            write!(f, ".{}", self.d)?;
+        }
+        Ok(())
+    }
+}
+
+/// Settings that influence how values are formatted.
+pub struct Settings {
+    // NOTE(review): presumably the base year for interpreting 2-digit years
+    // — confirm where `epoch` is consumed.
+    epoch: Option<i32>,
+
+    /// Either `'.'` or `','`.
+    decimal: char,
+
+    /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5`
+    /// instead of `.5`)?
+    include_leading_zero: bool,
+
+    /// Custom currency styles.
+    ccs: EnumMap<CC, Option<NumberStyle>>,
+}
+
+impl Default for Settings {
+    /// No epoch, `.` as the decimal point, no leading zero, and no custom
+    /// currency styles.
+    fn default() -> Self {
+        Self {
+            epoch: None,
+            decimal: '.',
+            include_leading_zero: false,
+            ccs: Default::default(),
+        }
+    }
+}
+
+/// A numeric output style. This can express numeric formats in
+/// [Category::Basic] and [Category::Custom].
+pub struct NumberStyle {
+    neg_prefix: Affix,
+    prefix: Affix,
+    suffix: Affix,
+    neg_suffix: Affix,
+
+    /// Decimal point: `'.'` or `','`.
+    decimal: char,
+
+    /// Grouping character: `'.'` or `','` or `None`.
+    grouping: Option<char>,
+
+    /// Format as `.5` or `0.5`?
+    include_leading_zero: bool,
+
+    /// An `Affix` may require more bytes than its display width; for example,
+    /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column.
+    /// This member is the sum of the number of bytes required by all of the
+    /// `Affix` members in this struct, minus their display widths. Thus, it
+    /// can be used to size memory allocations: for example, the formatted
+    /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in
+    /// UTF-8.
+    extra_bytes: usize,
+}
+
+/// A prefix or suffix attached to a formatted number.
+pub struct Affix {
+    /// String contents of affix.
+    s: String,
+
+    /// Display width in columns (see [unicode_width])
+    width: usize,
+}
--- /dev/null
+use num::Float;
+use std::{num::FpCategory, fmt::{Display, Formatter, Result}};
+
+/// Wrapper whose `Display` implementation writes a float in C99-style
+/// hexadecimal notation, e.g. `0x1.ecp6`.
+pub struct HexFloat<T: Float>(pub T);
+
+impl<T: Float> Display for HexFloat<T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
+        let sign = if self.0.is_sign_negative() { "-" } else { "" };
+        // Special values have no hex representation.
+        match self.0.classify() {
+            FpCategory::Nan => return write!(f, "NaN"),
+            FpCategory::Infinite => return write!(f, "{sign}Infinity"),
+            FpCategory::Zero => return write!(f, "{sign}0.0"),
+            _ => (),
+        };
+        let (significand, mut exponent, _) = self.0.integer_decode();
+        let mut hex_sig = format!("{:x}", significand);
+        // Strip trailing zero hex digits; each one dropped scales the
+        // significand down by 16, so the exponent grows by 4 to compensate.
+        while hex_sig.ends_with('0') {
+            hex_sig.pop();
+            exponent += 4;
+        }
+        match hex_sig.len() {
+            0 => write!(f, "{sign}0.0"),
+            1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
+            // Place the hex point after the first digit; the digits moved to
+            // the right of the point add 4 to the exponent apiece.
+            len => write!(
+                f,
+                "{sign}0x{}.{}p{}",
+                hex_sig.chars().next().unwrap(),
+                &hex_sig[1..],
+                exponent + 4 * (len as i16 - 1)
+            ),
+        }
+    }
+}
+
+#[cfg(test)]
+mod hex_float_tests {
+ use crate::HexFloat;
+ use num::Float;
+
+ // Spot-checks normal values, powers of two, infinities, NaN, and
+ // signed zeroes.
+ #[test]
+ fn test() {
+ assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
+ assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
+ assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
+ assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
+ assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
+ assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
+ assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
+ assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
+ }
+}
+
--- /dev/null
+use std::{
+ borrow::Borrow,
+ cmp::Ordering,
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
+ hash::{Hash, Hasher},
+ ops::Deref,
+};
+
+use encoding_rs::{EncoderResult, Encoding, UTF_8};
+use finl_unicode::categories::{CharacterCategories, MajorCategory};
+use thiserror::Error as ThisError;
+use unicase::UniCase;
+
+/// Classification of characters with respect to identifier syntax.
+pub trait IdentifierChar {
+ /// Returns true if `self` is an ASCII character that may be the first
+ /// character in an identifier.
+ fn ascii_may_start_id(self) -> bool;
+
+ /// Returns true if `self` may be the first character in an identifier.
+ fn may_start_id(self) -> bool;
+
+ /// Returns true if `self` is an ASCII character that may be a second or
+ /// subsequent character in an identifier.
+ fn ascii_may_continue_id(self) -> bool;
+
+ /// Returns true if `self` may be a second or subsequent character in an
+ /// identifier.
+ fn may_continue_id(self) -> bool;
+}
+
+impl IdentifierChar for char {
+ fn ascii_may_start_id(self) -> bool {
+ matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!')
+ }
+
+ fn may_start_id(self) -> bool {
+ if self < '\u{0080}' {
+ self.ascii_may_start_id()
+ } else {
+ use MajorCategory::*;
+
+ // Non-ASCII: letters, marks, and symbols may start an
+ // identifier, except U+FFFD (which marks decoding errors).
+ [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
+ }
+ }
+
+ fn ascii_may_continue_id(self) -> bool {
+ matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_')
+ }
+
+ fn may_continue_id(self) -> bool {
+ if self < '\u{0080}' {
+ self.ascii_may_continue_id()
+ } else {
+ use MajorCategory::*;
+
+ // Continuation characters additionally allow numbers (N).
+ [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
+ }
+ }
+}
+
+/// An error constructing or validating an [Identifier].
+#[derive(Clone, Debug, ThisError)]
+pub enum Error {
+ #[error("Identifier cannot be empty string.")]
+ Empty,
+
+ #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
+ Reserved(String),
+
+ #[error("\"!\" is not a valid identifier.")]
+ Bang,
+
+ #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")]
+ BadFirstCharacter(String, char),
+
+ #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")]
+ BadLaterCharacter(String, char),
+
+ #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
+ TooLong {
+ id: String,
+ length: usize,
+ encoding: &'static str,
+ max: usize,
+ },
+
+ #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")]
+ NotEncodable {
+ id: String,
+ encoding: &'static str,
+ c: char,
+ },
+}
+
+/// A word that may not be used as an identifier.
+pub enum ReservedWord {
+ And,
+ Or,
+ Not,
+ Eq,
+ Ge,
+ Gt,
+ Le,
+ Lt,
+ Ne,
+ All,
+ By,
+ To,
+ With,
+}
+
+impl TryFrom<&str> for ReservedWord {
+ type Error = ();
+
+ /// Parses `source` as a reserved word, ignoring ASCII case.
+ fn try_from(source: &str) -> Result<Self, Self::Error> {
+ match source.to_ascii_uppercase().as_str() {
+ "ALL" => Ok(Self::All),
+ "AND" => Ok(Self::And),
+ "BY" => Ok(Self::By),
+ "EQ" => Ok(Self::Eq),
+ "GE" => Ok(Self::Ge),
+ "GT" => Ok(Self::Gt),
+ "LE" => Ok(Self::Le),
+ "LT" => Ok(Self::Lt),
+ "NE" => Ok(Self::Ne),
+ "NOT" => Ok(Self::Not),
+ "OR" => Ok(Self::Or),
+ "TO" => Ok(Self::To),
+ "WITH" => Ok(Self::With),
+ _ => Err(()),
+ }
+ }
+}
+
+/// Returns true if `s` is a reserved word, without regard to case.
+pub fn is_reserved_word(s: &str) -> bool {
+ ReservedWord::try_from(s).is_ok()
+}
+
+/// An identifier, compared, ordered, and hashed case-insensitively.
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Identifier(pub UniCase<String>);
+
+impl Identifier {
+ /// Maximum length of an identifier, in bytes. The limit applies in the
+ /// encoding used by the dictionary, not in UTF-8.
+ pub const MAX_LEN: usize = 64;
+
+ /// Constructs an identifier from UTF-8 string `s`.
+ pub fn new(s: &str) -> Result<Self, Error> {
+ Self::from_encoding(s, UTF_8)
+ }
+
+ /// Constructs an identifier from `s`, validating both its syntax and its
+ /// representability in `encoding`.
+ pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
+ Self::is_plausible(s)?;
+ let identifier = Identifier(s.into());
+ identifier.check_encoding(encoding)?;
+ Ok(identifier)
+ }
+
+ /// Checks whether this is a valid identifier in the given `encoding`. An
+ /// identifier that is valid in one encoding might be invalid in another
+ /// because some characters are unencodable or because it is too long.
+ pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> {
+ let s = self.0.as_str();
+ let (_encoded, _, unencodable) = encoding.encode(s);
+ if unencodable {
+ // Encode again without replacement to find the first character
+ // that `encoding` cannot represent, for the error message.
+ let mut encoder = encoding.new_encoder();
+ let mut buf = Vec::with_capacity(
+ encoder
+ .max_buffer_length_from_utf8_without_replacement(s.len())
+ .unwrap(),
+ );
+ let EncoderResult::Unmappable(c) = encoder
+ .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true)
+ .0
+ else {
+ unreachable!();
+ };
+ return Err(Error::NotEncodable {
+ id: s.into(),
+ encoding: encoding.name(),
+ c,
+ });
+ }
+ // NOTE(review): the `MAX_LEN` check below is disabled — confirm
+ // whether length enforcement should be restored.
+ /*
+ if encoded.len() > Self::MAX_LEN {
+ return Err(Error::TooLong {
+ id: s.into(),
+ length: encoded.len(),
+ encoding: encoding.name(),
+ max: Self::MAX_LEN,
+ });
+ }*/
+ Ok(())
+ }
+
+ /// Checks whether `s` is syntactically plausible as an identifier,
+ /// without regard to any particular encoding.
+ pub fn is_plausible(s: &str) -> Result<(), Error> {
+ if s.is_empty() {
+ return Err(Error::Empty);
+ }
+ if is_reserved_word(s) {
+ return Err(Error::Reserved(s.into()));
+ }
+ if s == "!" {
+ return Err(Error::Bang);
+ }
+
+ let mut i = s.chars();
+ let first = i.next().unwrap();
+ if !first.may_start_id() {
+ return Err(Error::BadFirstCharacter(s.into(), first));
+ }
+ for c in i {
+ if !c.may_continue_id() {
+ return Err(Error::BadLaterCharacter(s.into(), c));
+ }
+ }
+ Ok(())
+ }
+
+ /// Returns true if this identifier is a case-insensitive match for
+ /// `keyword`.
+ ///
+ /// They match if they are identical, or if this identifier is at
+ /// least 3 characters long and those characters are identical to `keyword`
+ /// or differ only in case.
+ ///
+ /// `keyword` must be ASCII.
+ pub fn matches_keyword(&self, keyword: &str) -> bool {
+ id_match_n_nonstatic(keyword, self.0.as_str(), 3)
+ }
+
+ /// Returns true if this identifier is a case-insensitive match for at
+ /// least the first `n` characters of `keyword`.
+ ///
+ /// `keyword` must be ASCII.
+ pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
+ id_match_n_nonstatic(keyword, self.0.as_str(), n)
+ }
+}
+
+impl PartialEq<str> for Identifier {
+ fn eq(&self, other: &str) -> bool {
+ // Wrap `other` in `UniCase` so the comparison is case-insensitive.
+ self.0.eq(&UniCase::new(other))
+ }
+}
+
+/// Returns true if `token` is a case-insensitive match for `keyword`.
+///
+/// `keyword` and `token` match if they are identical, or if `token` is at
+/// least 3 characters long and those characters are identical to `keyword` or
+/// differ only in case.
+///
+/// `keyword` must be ASCII. It's normally a constant string, so it's declared
+/// as `&'static str` to make it harder to reverse the argument order. But
+/// there's no reason that a non-static string won't work, so use
+/// [`id_match_n_nonstatic`] instead if you need it.
+pub fn id_match(keyword: &'static str, token: &str) -> bool {
+ id_match_n(keyword, token, 3)
+}
+
+/// Returns true if `token` is a case-insensitive match for at least the first
+/// `n` characters of `keyword`.
+///
+/// `keyword` must be ASCII. It's normally a constant string, so it's declared
+/// as `&'static str` to make it harder to reverse the argument order. But
+/// there's no reason that a non-static string won't work, so use
+/// [`id_match_n_nonstatic`] instead if you need it.
+pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool {
+ id_match_n_nonstatic(keyword, token, n)
+}
+
+/// Returns true if `token` is a case-insensitive match for at least the first
+/// `n` characters of `keyword`.
+///
+/// `keyword` must be ASCII.
+pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool {
+ debug_assert!(keyword.is_ascii());
+ // If `token` is a valid abbreviation (at least `n` bytes long but shorter
+ // than `keyword`), compare it to the same-length prefix of `keyword`;
+ // otherwise require a full-length comparison.
+ let keyword_prefix = if (n..keyword.len()).contains(&token.len()) {
+ &keyword[..token.len()]
+ } else {
+ keyword
+ };
+ keyword_prefix.eq_ignore_ascii_case(token)
+}
+
+impl Display for Identifier {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ // Displays with the original (not case-folded) spelling.
+ write!(f, "{}", self.0)
+ }
+}
+
+impl Debug for Identifier {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ // Debug output is the same as Display: the plain spelling, unquoted.
+ write!(f, "{}", self.0)
+ }
+}
+
+/// A type that has a case-insensitive identifier.
+pub trait HasIdentifier {
+ fn identifier(&self) -> &UniCase<String>;
+}
+
+/// Wrapper that compares, orders, and hashes the inner value solely by its
+/// identifier (see [HasIdentifier]).
+pub struct ByIdentifier<T>(pub T)
+where
+ T: HasIdentifier;
+
+impl<T> ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ pub fn new(inner: T) -> Self {
+ Self(inner)
+ }
+}
+
+// All of the following impls delegate to `T`'s identifier, so that a
+// `ByIdentifier<T>` in a sorted or hashed container is keyed purely by
+// identifier.  The `Borrow<UniCase<String>>` impl lets such a container be
+// probed with a bare identifier.
+impl<T> PartialEq for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn eq(&self, other: &Self) -> bool {
+ self.0.identifier().eq(other.0.identifier())
+ }
+}
+
+impl<T> Eq for ByIdentifier<T> where T: HasIdentifier {}
+
+impl<T> PartialOrd for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl<T> Ord for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn cmp(&self, other: &Self) -> Ordering {
+ self.0.identifier().cmp(other.0.identifier())
+ }
+}
+
+impl<T> Hash for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.0.identifier().hash(state)
+ }
+}
+
+impl<T> Borrow<UniCase<String>> for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ fn borrow(&self) -> &UniCase<String> {
+ self.0.identifier()
+ }
+}
+
+// `Debug` and `Clone`, by contrast, delegate to the inner value itself.
+impl<T> Debug for ByIdentifier<T>
+where
+ T: HasIdentifier + Debug,
+{
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ self.0.fmt(f)
+ }
+}
+
+impl<T> Clone for ByIdentifier<T>
+where
+ T: HasIdentifier + Clone,
+{
+ fn clone(&self) -> Self {
+ Self(self.0.clone())
+ }
+}
+
+// Dereferencing does not require `T: Clone`; the extra bound in the original
+// needlessly excluded non-`Clone` inner types.
+impl<T> Deref for ByIdentifier<T>
+where
+ T: HasIdentifier,
+{
+ type Target = T;
+
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
--- /dev/null
+/// Conversion of a floating-point value into an integer type, succeeding only
+/// when the conversion is exact.
+pub trait ToInteger {
+ /// Returns this value as a `T` if it is representable exactly.
+ fn to_exact_integer<T>(&self) -> Option<T>
+ where
+ T: FromFloat;
+ fn to_exact_usize(&self) -> Option<usize> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u8(&self) -> Option<u8> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u16(&self) -> Option<u16> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u32(&self) -> Option<u32> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u64(&self) -> Option<u64> {
+ self.to_exact_integer()
+ }
+ fn to_exact_u128(&self) -> Option<u128> {
+ self.to_exact_integer()
+ }
+ // Fixed: this previously returned `Option<usize>`, which contradicted its
+ // name and made it a duplicate of `to_exact_usize`.
+ fn to_exact_isize(&self) -> Option<isize> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i8(&self) -> Option<i8> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i16(&self) -> Option<i16> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i32(&self) -> Option<i32> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i64(&self) -> Option<i64> {
+ self.to_exact_integer()
+ }
+ fn to_exact_i128(&self) -> Option<i128> {
+ self.to_exact_integer()
+ }
+}
+
+impl ToInteger for f64 {
+ fn to_exact_integer<T>(&self) -> Option<T>
+ where
+ T: FromFloat,
+ {
+ T::from_float(*self)
+ }
+}
+
+/// Conversion from `f64`, succeeding only when the conversion is exact.
+pub trait FromFloat {
+ fn from_float(x: f64) -> Option<Self>
+ where
+ Self: Sized;
+}
+
+/// Implements [FromFloat] for the given integer type.
+macro_rules! impl_from_float {
+ ($T:ident) => {
+ impl FromFloat for $T {
+ fn from_float(x: f64) -> Option<Self>
+ where
+ Self: Sized,
+ {
+ // `$T::MAX as f64` rounds *up* for integer types wider than
+ // f64's 53-bit mantissa (e.g. `i64::MAX as f64 == 2^63`), so
+ // an inclusive `x <= $T::MAX as f64` would accept 2^63 and the
+ // saturating `as` cast would silently return `i64::MAX`.
+ // `$T::MIN as f64` is zero or a power of two, hence exact, and
+ // `$T::MAX as f64 + 1.0` is an exact power of two for every
+ // integer type, so this half-open range is exact everywhere.
+ if x.trunc() == x && x >= $T::MIN as f64 && x < $T::MAX as f64 + 1.0 {
+ Some(x as Self)
+ } else {
+ None
+ }
+ }
+ }
+ };
+}
+
+impl_from_float!(usize);
+impl_from_float!(u8);
+impl_from_float!(u16);
+impl_from_float!(u32);
+impl_from_float!(u64);
+impl_from_float!(u128);
+impl_from_float!(isize);
+impl_from_float!(i8);
+impl_from_float!(i16);
+impl_from_float!(i32);
+impl_from_float!(i64);
+impl_from_float!(i128);
--- /dev/null
+use crate::identifier::id_match_n_nonstatic;
+
+/// The result of matching a user string against a command name.
+pub struct Match {
+ /// True if every word the string supplied was an exact (unabbreviated)
+ /// match, modulo case.
+ pub exact: bool,
+
+ /// Number of words in the command name that the string did not supply
+ /// (negative if the string supplied extra words).
+ pub missing_words: isize,
+}
+
+/// Returns the number of whitespace-separated words in `s`, as `isize` for
+/// signed word-count arithmetic.
+fn count_words(s: &str) -> isize {
+ s.split_whitespace().count() as isize
+}
+
+/// Compares `string` obtained from the user against the full name of a `command`,
+/// using this algorithm:
+///
+/// 1. Divide `command` into words `c[0]` through `c[n - 1]`.
+///
+/// 2. Divide `string` into words `s[0]` through `s[m - 1]`.
+///
+/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the
+/// keyword matching algorithm implemented by [id_match]. If any of them fail
+/// to match, then `string` does not match `command` and the function returns
+/// `None`.
+///
+/// 4. Otherwise, `string` and `command` match. Set [Match::missing_words] to
+/// `n - m`. Set [Match::exact] to false if any of the `s[i]` were found to
+/// be abbreviated in the comparisons done in step 3, or to true if they were
+/// all exactly equal (modulo case). Return `Some(match)`.
+pub fn command_match(command: &str, string: &str) -> Option<Match> {
+ let mut command_words = command.split_whitespace();
+ let mut string_words = string.split_whitespace();
+ let mut exact = true;
+ loop {
+ let Some(cw) = command_words.next() else {
+ // Command name exhausted; any remaining user words count as
+ // negative missing words.
+ return Some(Match {
+ exact,
+ missing_words: -(string_words.count() as isize),
+ });
+ };
+ let Some(sw) = string_words.next() else {
+ // User string exhausted; `cw` plus the rest of the command name
+ // are missing.
+ return Some(Match {
+ exact,
+ missing_words: 1 + command_words.count() as isize,
+ });
+ };
+ if !id_match_n_nonstatic(cw, sw, 3) {
+ return None;
+ }
+ // A shorter user word that still matched is an abbreviation.
+ if sw.len() < cw.len() {
+ exact = false;
+ }
+ }
+}
+
+/// Matches a string against a collection of command names.
+pub struct CommandMatcher<'a, T> {
+ string: &'a str,
+ extensible: bool,
+ exact_match: Option<T>,
+ n_matches: usize,
+ match_: Option<T>,
+ match_missing_words: isize,
+}
+
+impl<'a, T> CommandMatcher<'a, T> {
+ /// Creates a matcher for `string` with no candidates yet.
+ pub fn new(string: &'a str) -> Self {
+ Self {
+ string,
+ extensible: false,
+ exact_match: None,
+ n_matches: 0,
+ match_: None,
+ match_missing_words: 0,
+ }
+ }
+
+ /// Consider `command` as a candidate for the command name being parsed. If
+ /// `command` is the correct command name, then [Self::get_match] will
+ /// return `aux` later.
+ pub fn add(&mut self, command: &str, aux: T) {
+ if let Some(Match {
+ missing_words,
+ exact,
+ }) = command_match(command, self.string)
+ {
+ if missing_words > 0 {
+ // The string is a prefix of `command`: more words could
+ // still be read to disambiguate.
+ self.extensible = true;
+ } else if exact && missing_words == 0 {
+ self.exact_match = Some(aux);
+ } else {
+ // Partial (abbreviated or over-long) match: keep the
+ // candidate closest to a complete match, counting ties.
+ if missing_words > self.match_missing_words {
+ self.n_matches = 0;
+ }
+ if missing_words >= self.match_missing_words || self.n_matches == 0 {
+ self.n_matches += 1;
+ self.match_ = Some(aux);
+ self.match_missing_words = missing_words;
+ }
+ }
+ }
+ }
+
+ /// Returns `(aux, missing_words)` for the best match, or `(None, _)` when
+ /// the match is ambiguous, extensible, or absent.
+ pub fn get_match(self) -> (Option<T>, isize) {
+ if self.extensible {
+ (None, 1)
+ } else if let Some(exact_match) = self.exact_match {
+ (Some(exact_match), 0)
+ } else if self.n_matches == 1 {
+ (self.match_, self.match_missing_words)
+ } else {
+ (None, self.match_missing_words)
+ }
+ }
+}
+
+/// Names of all the known commands.
+// `&[&str]` in a `const` is already `'static`; the explicit lifetimes were
+// redundant (clippy: `redundant_static_lifetimes`).
+pub const COMMAND_NAMES: &[&str] = &[
+ "2SLS",
+ "ACF",
+ "ADD DOCUMENT",
+ "ADD FILES",
+ "ADD VALUE LABELS",
+ "AGGREGATE",
+ "ALSCAL",
+ "ANACOR",
+ "ANOVA",
+ "APPLY DICTIONARY",
+ "AUTORECODE",
+ "BEGIN DATA",
+ "BREAK",
+ "CACHE",
+ "CASEPLOT",
+ "CASESTOVARS",
+ "CATPCA",
+ "CATREG",
+ "CCF",
+ "CD",
+ "CLEAR TRANSFORMATIONS",
+ "CLOSE FILE HANDLE",
+ "CLUSTER",
+ "COMPUTE",
+ "CONJOINT",
+ "CORRELATIONS",
+ "CORRESPONDENCE",
+ "COUNT",
+ "COXREG",
+ "CREATE",
+ "CROSSTABS",
+ "CSDESCRIPTIVES",
+ "CSGLM",
+ "CSLOGISTIC",
+ "CSPLAN",
+ "CSSELECT",
+ "CSTABULATE",
+ "CTABLES",
+ "CURVEFIT",
+ "DATA LIST",
+ "DATAFILE ATTRIBUTE",
+ "DATASET ACTIVATE",
+ "DATASET CLOSE",
+ "DATASET COPY",
+ "DATASET DECLARE",
+ "DATASET DISPLAY",
+ "DATASET NAME",
+ "DATE",
+ "DEBUG EVALUATE",
+ "DEBUG EXPAND",
+ "DEBUG FLOAT FORMAT",
+ "DEBUG FORMAT GUESSER",
+ "DEBUG MATRIX READ",
+ "DEBUG MOMENTS",
+ "DEBUG PAPER SIZE",
+ "DEBUG POOL",
+ "DEBUG XFORM FAIL",
+ "DEFINE",
+ "DELETE VARIABLES",
+ "DESCRIPTIVES",
+ "DETECTANOMALY",
+ "DISCRIMINANT",
+ "DISPLAY MACROS",
+ "DISPLAY VARIABLE SETS",
+ "DISPLAY",
+ "DO IF",
+ "DO REPEAT",
+ "DOCUMENT",
+ "DROP DOCUMENTS",
+ "ECHO",
+ "EDIT",
+ "ELSE IF",
+ "ELSE",
+ "END CASE",
+ "END FILE TYPE",
+ "END FILE",
+ "END IF",
+ "END LOOP",
+ "END REPEAT",
+ "ERASE",
+ "EXAMINE",
+ "EXECUTE",
+ "EXIT",
+ "EXPORT",
+ "FACTOR",
+ "FILE HANDLE",
+ "FILE LABEL",
+ "FILE TYPE",
+ "FILTER",
+ "FINISH",
+ "FIT",
+ "FLIP",
+ "FORMATS",
+ "FREQUENCIES",
+ "GENLOG",
+ "GET DATA",
+ "GET TRANSLATE",
+ "GET",
+ "GGRAPH",
+ "GLM",
+ "GRAPH",
+ "HILOGLINEAR",
+ "HOMALS",
+ "HOST",
+ "IF",
+ "IGRAPH",
+ "IMPORT",
+ "INCLUDE",
+ "INFO",
+ "INPUT PROGRAM",
+ "INSERT",
+ "KEYED DATA LIST",
+ "KM",
+ "LEAVE",
+ "LIST",
+ "LOGISTIC REGRESSION",
+ "LOGLINEAR",
+ "LOOP",
+ "MANOVA",
+ "MAPS",
+ "MATCH FILES",
+ "MATRIX DATA",
+ "MATRIX",
+ "MCONVERT",
+ "MEANS",
+ "MISSING VALUES",
+ "MIXED",
+ "MODEL CLOSE",
+ "MODEL HANDLE",
+ "MODEL LIST",
+ "MODEL NAME",
+ "MRSETS",
+ "MULT RESPONSE",
+ "MULTIPLE CORRESPONDENCE",
+ "MVA",
+ "N OF CASES",
+ "N",
+ "NAIVEBAYES",
+ "NEW FILE",
+ "NLR",
+ "NOMREG",
+ "NONPAR CORR",
+ "NPAR TESTS",
+ "NUMBERED",
+ "NUMERIC",
+ "OLAP CUBES",
+ "OMS",
+ "ONEWAY",
+ "ORTHOPLAN",
+ "OUTPUT MODIFY",
+ "OVERALS",
+ "PACF",
+ "PARTIAL CORR",
+ "PEARSON CORRELATIONS",
+ "PERMISSIONS",
+ "PLANCARDS",
+ "PLUM",
+ "POINT",
+ "PPLOT",
+ "PREDICT",
+ "PREFSCAL",
+ "PRESERVE",
+ "PRINCALS",
+ "PRINT EJECT",
+ "PRINT FORMATS",
+ "PRINT SPACE",
+ "PRINT",
+ "PROBIT",
+ "PROCEDURE OUTPUT",
+ "PROXIMITIES",
+ "PROXSCAL",
+ "Q",
+ "QUICK CLUSTER",
+ "QUIT",
+ "RANK",
+ "RATIO STATISTICS",
+ "READ MODEL",
+ "RECODE",
+ "RECORD TYPE",
+ "REFORMAT",
+ "REGRESSION",
+ "RELIABILITY",
+ "RENAME VARIABLES",
+ "REPEATING DATA",
+ "REPORT",
+ "REREAD",
+ "RESTORE",
+ "RMV",
+ "ROC",
+ "SAMPLE",
+ "SAVE DATA COLLECTION",
+ "SAVE TRANSLATE",
+ "SAVE",
+ "SCRIPT",
+ "SEASON",
+ "SELECT IF",
+ "SELECTPRED",
+ "SET",
+ "SHOW",
+ "SORT CASES",
+ "SORT VARIABLES",
+ "SPCHART",
+ "SPECTRA",
+ "SPLIT FILE",
+ "STEMLEAF",
+ "STRING",
+ "SUBTITLE",
+ "SUMMARIZE",
+ "SURVIVAL",
+ "SYSFILE INFO",
+ "T-TEST",
+ "TDISPLAY",
+ "TEMPORARY",
+ "TITLE",
+ "TREE",
+ "TSAPPLY",
+ "TSET",
+ "TSHOW",
+ "TSMODEL",
+ "TSPLOT",
+ "TWOSTEP CLUSTER",
+ "UNIANOVA",
+ "UNNUMBERED",
+ "UPDATE",
+ "USE",
+ "VALIDATEDATA",
+ "VALUE LABELS",
+ "VARCOMP",
+ "VARIABLE ALIGNMENT",
+ "VARIABLE ATTRIBUTE",
+ "VARIABLE LABELS",
+ "VARIABLE LEVEL",
+ "VARIABLE ROLE",
+ "VARIABLE WIDTH",
+ "VARSTOCASES",
+ "VECTOR",
+ "VERIFY",
+ "WEIGHT",
+ "WLS",
+ "WRITE FORMATS",
+ "WRITE",
+ "XEXPORT",
+ "XGRAPH",
+ "XSAVE",
+];
--- /dev/null
+use std::{
+ borrow::{Borrow, Cow},
+ collections::{HashMap, VecDeque},
+ fmt::Write,
+ fs,
+ io::Result as IoResult,
+ mem,
+ ops::{Range, RangeInclusive},
+ path::Path,
+ sync::Arc,
+};
+
+use chardetng::EncodingDetector;
+use encoding_rs::{Encoding, UTF_8};
+use thiserror::Error as ThisError;
+use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
+
+use crate::{
+ macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
+ message::{Category, Diagnostic, Location, Point, Severity},
+ prompt::PromptStyle,
+ settings::Settings,
+};
+
+use super::{
+ scan::{MergeResult, ScanError, ScanToken},
+ segment::{Mode, Segment, Segmenter},
+ token::Token,
+};
+
+/// Error handling for a [`Reader`].
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+pub enum ErrorHandling {
+ /// Discard input line and continue reading.
+ Terminal,
+
+ /// Continue to next command, except for cascading failures.
+ #[default]
+ Continue,
+
+ /// Continue, even for cascading failures.
+ Ignore,
+
+ /// Stop processing.
+ Stop,
+}
+
+/// # Token pipeline
+///
+/// Tokens pass through a pipeline with the following stages. Each token
+/// eventually made available to the parser passes through each of these
+/// stages. The stages are named after the processing that happens in each
+/// one.
+///
+/// Initially, tokens come from the segmenter and scanner to `pp`:
+///
+/// - `pp`: Tokens that need to pass through the macro preprocessor to end up
+/// in `merge`.
+///
+/// - `merge`: Tokens that need to pass through
+/// [`super::scan::ScanToken::merge`] to end up in `parse`.
+///
+/// - `parse`: Tokens available to the client for parsing.
+///
+/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
+/// Tokens then live in `parse` until the command is fully consumed, at which
+/// time they are freed together.
+pub struct Source {
+ /// Error-handling mode.
+ error_handling: ErrorHandling,
+
+ /// Encoding.
+ encoding: &'static Encoding,
+
+ /// `None` if this reader is not associated with a file.
+ file_name: Option<Arc<String>>,
+
+ /// True if we've reached EOF already.
+ eof: bool,
+
+ /// Read some input from the source. If successful, returns the input that
+ /// was read. At end of file or on error, returns an empty string.
+ ///
+ /// `prompt` provides a hint to interactive readers as to what kind of
+ /// syntax is being read right now.
+ read: Box<dyn Fn(PromptStyle) -> String>,
+
+ /// Source file contents.
+ buffer: String,
+
+ /// 0-based line number of the first line not yet written to the journal.
+ journal_line: usize,
+
+ /// Byte offset of first character not yet scanned as token.
+ seg_pos: usize,
+
+ /// Byte offsets into `buffer` of starts of lines. The first element is 0.
+ lines: Vec<usize>,
+
+ /// Tokens that need to pass through the macro preprocessor to end up in
+ /// `merge`.
+ pp: VecDeque<LexToken>,
+
+ /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
+ /// end up in `parse`.
+ merge: VecDeque<LexToken>,
+
+ /// Tokens available to the client for parsing.
+ parse: Vec<LexToken>,
+
+ /// Offset in `parse` of the current token.
+ parse_ofs: usize,
+
+ /// Segmenter for dividing `buffer` into lexical segments.
+ segmenter: Segmenter,
+
+ /// True if the newline after an `EndCommand` segment should not count as
+ /// a journal line of its own.
+ suppress_next_newline: bool,
+}
+
+impl Default for Source {
+ // The default source has no input: `read` always returns "", so EOF is
+ // reached immediately.
+ fn default() -> Self {
+ Self {
+ error_handling: ErrorHandling::default(),
+ encoding: UTF_8,
+ file_name: None,
+ eof: false,
+ read: Box::new(|_| String::new()),
+ buffer: String::new(),
+ journal_line: 0,
+ seg_pos: 0,
+ lines: vec![0],
+ pp: VecDeque::new(),
+ merge: VecDeque::new(),
+ parse: Vec::new(),
+ parse_ofs: 0,
+ segmenter: Segmenter::new(Mode::default(), false),
+ suppress_next_newline: false,
+ }
+ }
+}
+
+impl Source {
+ /// Creates a `Source` that reads syntax from the file at `path`.
+ ///
+ /// If `encoding` is `None`, the encoding is guessed from the file
+ /// contents; a byte-order mark, if present, is removed.
+ pub fn for_file<P>(
+ path: P,
+ encoding: Option<&'static Encoding>,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> IoResult<Self>
+ where
+ P: AsRef<Path>,
+ {
+ let bytes = fs::read(path.as_ref())?;
+ let encoding = encoding.unwrap_or_else(|| {
+ let mut encoding_detector = EncodingDetector::new();
+ encoding_detector.feed(&bytes, true);
+ encoding_detector.guess(None, true)
+ });
+ let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
+ Ok(Self::for_file_contents(
+ contents.to_string(),
+ Some(path.as_ref().to_string_lossy().to_string()),
+ encoding,
+ syntax,
+ error_handling,
+ ))
+ }
+
+ /// Creates a `Source` for `contents`, recording `file_name` as the file
+ /// it came from (for diagnostics).
+ pub fn for_file_contents(
+ contents: String,
+ file_name: Option<String>,
+ encoding: &'static Encoding,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> Self {
+ Self {
+ buffer: contents,
+ file_name: file_name.map(Arc::new),
+ encoding,
+ error_handling,
+ segmenter: Segmenter::new(syntax, false),
+ ..Self::default()
+ }
+ }
+
+ /// Creates a `Source` for a string of syntax not associated with a file,
+ /// using default syntax mode and error handling.
+ pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
+ Self {
+ buffer: contents,
+ encoding,
+ ..Self::default()
+ }
+ }
+
+ /// Creates a `Source` that obtains input on demand from `read`, e.g. for
+ /// interactive use.
+ pub fn for_function(
+ read: Box<dyn Fn(PromptStyle) -> String>,
+ file_name: Option<String>,
+ encoding: &'static Encoding,
+ syntax: Mode,
+ error_handling: ErrorHandling,
+ ) -> Self {
+ Self {
+ read,
+ file_name: file_name.map(Arc::new),
+ encoding,
+ segmenter: Segmenter::new(syntax, false),
+ error_handling,
+ ..Self::default()
+ }
+ }
+
+ /// Appends input from the `read` callback to `buffer` until the unscanned
+ /// part of the buffer contains a newline, or until the callback reports
+ /// end of input (an empty string), which sets `eof`.
+ fn read(&mut self) {
+ loop {
+ let prompt = self.segmenter.prompt();
+ let s = (self.read)(prompt);
+ if s.is_empty() {
+ self.eof = true;
+ return;
+ }
+ self.buffer.push_str(&s);
+ if self.buffer[self.seg_pos..].contains('\n') {
+ return;
+ }
+ }
+ }
+ /// Scans one segment from `buffer` and, if it yields a token, pushes it
+ /// onto `pp`. Returns true if a token was produced. Scan errors are
+ /// reported through `context.error` and produce no token.
+ fn try_get_pp(&mut self, context: &Context) -> bool {
+ // The segmenter returns `Err` when it needs more input to make
+ // progress; keep reading until it succeeds (EOF guarantees progress).
+ let (seg_len, seg_type) = loop {
+ if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
+ break result;
+ }
+
+ debug_assert!(!self.eof);
+ self.read();
+ };
+
+ let pos = self.seg_pos..self.seg_pos + seg_len;
+ self.seg_pos += seg_len;
+ if seg_type == Segment::Newline {
+ self.lines.push(self.seg_pos);
+ }
+
+ let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
+
+ // Count how many whole lines just became available for the journal.
+ // The newline following an `EndCommand` is folded into the command's
+ // own line via `suppress_next_newline`.
+ let n_lines = match (seg_type, self.suppress_next_newline) {
+ (Segment::EndCommand, false) => {
+ self.suppress_next_newline = true;
+ 1
+ }
+ (Segment::Newline, true) => {
+ self.suppress_next_newline = false;
+ 0
+ }
+ (Segment::Newline, false) => 1,
+ _ => 0,
+ };
+ for line_num in self.journal_line..self.journal_line + n_lines {
+ let start_ofs = self.lines[line_num];
+ let end_ofs = self
+ .lines
+ .get(line_num + 1)
+ .copied()
+ .unwrap_or(self.buffer.len());
+ let line = &self.buffer[start_ofs..end_ofs];
+ let _line = line
+ .strip_suffix("\r\n")
+ .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
+ // XXX submit the line as syntax
+ }
+ self.journal_line += n_lines;
+
+ let pos = pos.start..pos.end;
+ match scan_token {
+ None => false,
+ // `Token::End` marks the end of all input: queue a final
+ // `EndCommand` and set `eof`.
+ Some(ScanToken::Token(Token::End)) => {
+ self.pp.push_back(LexToken {
+ token: Token::EndCommand,
+ pos,
+ macro_rep: None,
+ });
+ self.eof = true;
+ true
+ }
+ Some(ScanToken::Token(token)) => {
+ self.pp.push_back(LexToken {
+ token,
+ pos,
+ macro_rep: None,
+ });
+ true
+ }
+ Some(ScanToken::Error(error)) => {
+ (context.error)(
+ Location {
+ file_name: self.file_name.clone(),
+ span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)),
+ omit_underlines: false,
+ },
+ error.into(),
+ );
+ false
+ }
+ }
+ }
+
+ /// Attempts to add at least one token to `pp`. Returns true on success,
+ /// false if input was exhausted first.
+ fn get_pp(&mut self, context: &Context) -> bool {
+ while !self.eof {
+ if self.try_get_pp(context) {
+ return true;
+ }
+ }
+ false
+ }
+
+ /// Attempts to move at least one token from `pp` to `merge`, expanding
+ /// macros along the way when macro expansion is enabled. Returns true if
+ /// any tokens were moved.
+ fn try_get_merge(&mut self, context: &Context) -> bool {
+ if self.pp.is_empty() && !self.get_pp(context) {
+ return false;
+ }
+
+ if !Settings::global().macros.expand {
+ self.merge.append(&mut self.pp);
+ return true;
+ }
+
+ // Now pass tokens one-by-one to the macro expander.
+ let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else {
+ // Common case where there is no macro to expand.
+ self.merge.push_back(self.pp.pop_front().unwrap());
+ return true;
+ };
+ for ofs in 1.. {
+ if self.pp.len() <= ofs && !self.get_pp(context) {
+ // This should not be reachable because we always get a
+ // `Token::EndCommand` at the end of an input file, which should
+ // always terminate macro expansion.
+ unreachable!();
+ }
+ let token = &self.pp[ofs];
+ if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| {
+ println!("{e:?}")
+ }) == ParseStatus::Complete
+ {
+ break;
+ }
+ }
+ let call = parser.finish();
+ if call.len() == 0 {
+ // False alarm: no macro to expand after all.
+ self.merge.push_back(self.pp.pop_front().unwrap());
+ return true;
+ }
+
+ // Expand the tokens.
+ let c0 = &self.pp[0];
+ let c1 = &self.pp[call.len() - 1];
+ let mut expansion = Vec::new();
+ call.expand(
+ self.segmenter.mode(),
+ self.token_location(c0..=c1),
+ &mut expansion,
+ |e| println!("{e:?}"),
+ );
+ let retval = !expansion.is_empty();
+
+ if Settings::global().macros.print_expansions {
+ // XXX
+ }
+
+ // Append the macro expansion tokens to the lookahead. Each expanded
+ // token records the whole macro call's position in `buffer` plus its
+ // own span within the expansion's textual representation.
+ let mut macro_rep = String::new();
+ let mut pos = Vec::with_capacity(expansion.len());
+ for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) {
+ macro_rep.push_str(prefix);
+ let len = macro_rep.len();
+ // NOTE(review): `token.len() - 1` underflows if an expanded
+ // token's syntax can be empty — confirm that it cannot.
+ pos.push(len..=len + token.len() - 1);
+ }
+ let macro_rep = Arc::new(macro_rep);
+ for (index, token) in expansion.into_iter().enumerate() {
+ let lt = LexToken {
+ token: token.token,
+ pos: c0.pos.start..c1.pos.end,
+ macro_rep: Some(MacroRepresentation {
+ expansion: Arc::clone(&macro_rep),
+ pos: pos[index].clone(),
+ }),
+ };
+ self.merge.push_back(lt);
+ }
+ self.pp.drain(..call.len());
+ retval
+ }
+
+ /// Attempts to obtain at least one new token into `self.merge`.
+ ///
+ /// Returns true if successful, false on failure. In the latter case, this
+ /// source is exhausted and `self.eof` is now true.
+ fn get_merge(&mut self, context: &Context) -> bool {
+ while !self.eof {
+ if self.try_get_merge(context) {
+ return true;
+ }
+ }
+ false
+ }
+
+ /// Moves tokens from `merge` into `parse` until one complete parse token
+ /// is produced (possibly merging several scan tokens into one). Returns
+ /// true on success, false when input is exhausted.
+ fn get_parse__(&mut self, context: &Context) -> bool {
+ for i in 0.. {
+ if self.merge.len() <= i && !self.get_merge(context) {
+ // We always get a `Token::EndCommand` at the end of an input
+ // file and the merger should return `Some(...)` for that token.
+ debug_assert_eq!(self.merge.len(), 0);
+ return false;
+ }
+
+ match ScanToken::merge(&self.merge) {
+ // Needs more lookahead: loop to fetch another token.
+ None => (),
+ Some(MergeResult::Copy) => {
+ self.parse.push(self.merge.pop_front().unwrap());
+ return true;
+ }
+ Some(MergeResult::Expand { n, token }) => {
+ // `n` scan tokens merge into one parse token spanning them
+ // all. Keep the macro representation only if both ends
+ // come from the same expansion.
+ let first = &self.merge[0];
+ let last = &self.merge[n - 1];
+ self.parse.push(LexToken {
+ token,
+ pos: first.pos.start..last.pos.end,
+ macro_rep: match (&first.macro_rep, &last.macro_rep) {
+ (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
+ Some(MacroRepresentation {
+ expansion: a.expansion.clone(),
+ pos: *a.pos.start()..=*b.pos.end(),
+ })
+ }
+ _ => None,
+ },
+ });
+ self.merge.drain(..n);
+ return true;
+ }
+ }
+ }
+ unreachable!();
+ }
+
+ /// Public-facing wrapper around [Self::get_parse__].
+ fn get_parse(&mut self, context: &Context) -> bool {
+ // XXX deal with accumulated messages
+ self.get_parse__(context)
+ }
+
+ /// Converts byte offset `offset` in `buffer` into a 1-based line/column
+ /// [Point]. The column is a display-width column, not a byte offset.
+ fn offset_to_point(&self, offset: usize) -> Point {
+ // `partition_point` yields the number of line starts at or before
+ // `offset`, which is the 1-based line number.
+ let line = self
+ .lines
+ .partition_point(|&line_start| line_start <= offset);
+ Point {
+ line: line as i32,
+ column: Some(
+ self.buffer
+ .get(self.lines[line - 1]..offset)
+ .unwrap_or_default()
+ .width() as i32
+ + 1,
+ ),
+ }
+ }
+
+ /// Returns the syntax for 1-based line-number `line_number`, without its
+ /// line-ending; an empty string if the line number is out of range.
+ fn get_line(&self, line_number: i32) -> &str {
+ if (1..=self.lines.len() as i32).contains(&line_number) {
+ let line_number = line_number as usize;
+ let start = self.lines[line_number - 1];
+ // For the last (still unterminated) line, fall back to searching
+ // for a newline or the end of the buffer.
+ let end = self.lines.get(line_number).copied().unwrap_or(
+ self.buffer[start..]
+ .find('\n')
+ .map(|ofs| ofs + start)
+ .unwrap_or(self.buffer.len()),
+ );
+ let line = &self.buffer[start..end];
+ line.strip_suffix("\r\n")
+ .unwrap_or(line.strip_suffix('\n').unwrap_or(line))
+ } else {
+ ""
+ }
+ }
+
+ /// Returns the [Location] spanning the given inclusive range of tokens.
+ fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
+ Location {
+ file_name: self.file_name.clone(),
+ span: Some(
+ self.offset_to_point(range.start().pos.start)
+ ..self.offset_to_point(range.end().pos.end),
+ ),
+ omit_underlines: false,
+ }
+ }
+
+ /// Returns the [Location] spanning the tokens at the given inclusive
+ /// offsets in `parse`, or a span-less location if the range is invalid.
+ fn ofs_location(&self, range: RangeInclusive<usize>) -> Location {
+ if *range.start() <= *range.end() && *range.end() < self.parse.len() {
+ self.token_location(&self.parse[*range.start()]..=&self.parse[*range.end()])
+ } else {
+ Location {
+ file_name: self.file_name.clone(),
+ span: None,
+ omit_underlines: false,
+ }
+ }
+ }
+
+ fn token(&self) -> &Token {
+ &self.parse[self.parse_ofs].token
+ }
+
+ fn next(&mut self, offset: isize, context: &Context) -> &Token {
+ let Some(index) = offset.checked_add(self.parse_ofs as isize) else {
+ return &Token::EndCommand;
+ };
+ let Ok(index) = usize::try_from(index) else {
+ return &Token::EndCommand;
+ };
+
+ while index >= self.parse.len() {
+ if let Some(token) = self.parse.last() {
+ match token.token {
+ Token::End => return &Token::End,
+ Token::EndCommand => return &Token::EndCommand,
+ _ => (),
+ }
+ }
+ self.get_parse(context);
+ }
+ &self.parse[index].token
+ }
+
+ /// If the token range `ofs` contains a macro call, this returns the raw
+ /// syntax for the macro call (not for the expansion) and for any other
+ /// tokens included in that range. The syntax is encoded in UTF-8 and in
+ /// the original form supplied to the lexer so that, for example, it may
+ /// include comments, spaces, and new-lines if it spans multiple tokens.
+ ///
+ /// Returns `None` if the token range doesn't include a macro call.
+ fn get_macro_call(&self, ofs: RangeInclusive<usize>) -> Option<&str> {
+ if self
+ .parse
+ .get(ofs.clone())
+ .unwrap_or_default()
+ .iter()
+ .all(|token| token.macro_rep.is_none())
+ {
+ return None;
+ }
+
+ let token0 = &self.parse[*ofs.start()];
+ let token1 = &self.parse[*ofs.end()];
+ Some(&self.buffer[token0.pos.start..token1.pos.end])
+ }
+
+ fn is_empty(&self) -> bool {
+ self.buffer.is_empty() && self.eof
+ }
+
+ fn diagnostic(
+ &self,
+ severity: Severity,
+ ofs: RangeInclusive<usize>,
+ text: String,
+ ) -> Diagnostic {
+ let mut s = String::with_capacity(text.len() + 16);
+ if self.is_empty() {
+ s.push_str("At end of input: ");
+ } else if let Some(call) = self.get_macro_call(ofs.clone()) {
+ write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap();
+ }
+
+ if !text.is_empty() {
+ s.push_str(&text);
+ } else {
+ s.push_str("Syntax error.");
+ }
+
+ if !s.ends_with('.') {
+ s.push('.');
+ }
+
+ let location = self.ofs_location(ofs);
+ let mut source = Vec::new();
+ if let Some(Range {
+ start: Point { line: l0, .. },
+ end: Point { line: l1, .. },
+ }) = location.span
+ {
+ let lines = if l1 - l0 > 3 {
+ vec![l0, l0 + 1, l1]
+ } else {
+ (l0..=l1).collect()
+ };
+ for line_number in lines {
+ source.push((line_number, self.get_line(line_number).to_string()));
+ }
+ }
+
+ Diagnostic {
+ category: Category::Syntax,
+ severity,
+ location,
+ source,
+ stack: Vec::new(),
+ command_name: None, // XXX
+ text: s,
+ }
+ }
+
+ fn interactive_reset(&mut self) {
+ if self.error_handling == ErrorHandling::Terminal {
+ let Source {
+ error_handling,
+ encoding,
+ read,
+ ..
+ } = mem::take(self);
+ *self = Self {
+ error_handling,
+ encoding,
+ read,
+ ..Source::default()
+ };
+ }
+ }
+}
+
+fn ellipsize(s: &str) -> Cow<str> {
+ if s.width() > 64 {
+ let mut out = String::new();
+ let mut width = 0;
+ for c in s.chars() {
+ out.push(c);
+ width += c.width().unwrap_or(0);
+ if width > 64 {
+ break;
+ }
+ }
+ out.push_str("...");
+ Cow::from(out)
+ } else {
+ Cow::from(s)
+ }
+}
+
+/// A token in a [`Source`].
+struct LexToken {
+ /// The regular token.
+ token: Token,
+
+ /// For a token obtained through the lexer in an ordinary way, this is the
+ /// location of the token in the [`Source`]'s buffer.
+ ///
+ /// For a token produced through macro expansion, this is the entire macro
+ /// call.
+ pos: Range<usize>,
+
+ /// For a token obtained through macro expansion, the part of the macro
+ /// expansion that represents this token.
+ ///
+ /// For a token obtained through the lexer in an ordinary way, this is
+ /// `None`.
+ macro_rep: Option<MacroRepresentation>,
+}
+
+impl Borrow<Token> for LexToken {
+ fn borrow(&self) -> &Token {
+ &self.token
+ }
+}
+
+struct MacroRepresentation {
+ /// An entire macro expansion.
+ expansion: Arc<String>,
+
+ /// The substring of `expansion` that represents a single token.
+ pos: RangeInclusive<usize>,
+}
+
+pub struct Lexer {
+ source: Source,
+ stack: Vec<Source>,
+ macros: MacroSet,
+ error: Box<dyn Fn(Location, Error)>,
+}
+
+struct Context<'a> {
+ macros: &'a MacroSet,
+ error: &'a Box<dyn Fn(Location, Error)>,
+}
+
+impl Lexer {
+ pub fn new(error: Box<dyn Fn(Location, Error)>) -> Self {
+ Self {
+ source: Source::default(),
+ stack: Vec::new(),
+ macros: HashMap::new(),
+ error,
+ }
+ }
+
+ pub fn get(&mut self) -> &Token {
+ if self.source.parse_ofs < self.source.parse.len() {
+ if let Token::EndCommand = self.source.token() {
+ self.source.parse.clear();
+ self.source.parse_ofs = 0;
+ } else {
+ self.source.parse_ofs += 1;
+ }
+ }
+
+ while self.source.parse_ofs == self.source.parse.len() {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ if !self.source.get_parse(&context) && !self.pop_stack() {
+ return &Token::End;
+ }
+ }
+ self.source.token()
+ }
+
+ fn pop_stack(&mut self) -> bool {
+ if let Some(new_source) = self.stack.pop() {
+ self.source = new_source;
+ true
+ } else {
+ self.source = Source::default();
+ self.source.parse.push(LexToken {
+ token: Token::End,
+ pos: 0..0,
+ macro_rep: None,
+ });
+ false
+ }
+ }
+
+ /// Inserts `source` so that the next token comes from it. This is only
+ /// permitted when the lexer is either empty or at `Token::EndCommand`.
+ pub fn include(&mut self, mut source: Source) {
+ // XXX what's the right assertion?
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ source.get_parse(&context);
+ let old_source = mem::replace(&mut self.source, source);
+ self.stack.push(old_source);
+ }
+
+ /// Inserts `source` so that it will be read after all the other sources.
+ pub fn append(&mut self, mut source: Source) {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ source.get_parse(&context);
+ self.stack.insert(0, source);
+ }
+
+ pub fn token(&self) -> &Token {
+ self.source.token()
+ }
+
+ pub fn next(&mut self, offset: isize) -> &Token {
+ let context = Context {
+ macros: &self.macros,
+ error: &self.error,
+ };
+ self.source.next(offset, &context)
+ }
+
+ pub fn error<S>(&self, text: S) -> Diagnostic
+ where
+ S: ToString,
+ {
+ self.diagnostic(
+ Severity::Error,
+ self.source.parse_ofs..=self.source.parse_ofs,
+ text,
+ )
+ }
+
+ pub fn diagnostic<S>(
+ &self,
+ severity: Severity,
+ ofs: RangeInclusive<usize>,
+ text: S,
+ ) -> Diagnostic
+ where
+ S: ToString,
+ {
+ self.source.diagnostic(severity, ofs, text.to_string())
+ }
+
+ pub fn error_handling(&self) -> ErrorHandling {
+ self.source.error_handling
+ }
+
+ /// Discards all lookahead tokens, then discards all input sources
+ /// until it encounters one with error mode [ErrorHandling::Terminal] or until it
+ /// runs out of input sources.
+ pub fn discard_noninteractive(&mut self) {
+ while self.source.error_handling != ErrorHandling::Ignore {
+ self.source.pp.clear();
+ self.source.merge.clear();
+ self.source.parse.clear();
+ self.source.parse_ofs = 0;
+
+ if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() {
+ return;
+ }
+ }
+ }
+
+ /// If the source that the lexer is currently reading has error mode
+ /// [ErrorHandling::Terminal], discards all buffered input and tokens, so
+ /// that the next token to be read comes directly from whatever is next read
+ /// from the stream.
+ ///
+ /// It makes sense to call this function after encountering an error in a
+ /// command entered on the console, because usually the user would prefer
+ /// not to have cascading errors.
+ pub fn interactive_reset(&mut self) {
+ self.source.interactive_reset()
+ }
+
+ /// Advances past any tokens up to [Token::EndCommand] or [Token::End].
+ pub fn discard_rest_of_command(&mut self) {
+ while !matches!(self.token(), Token::EndCommand | Token::End) {
+ self.get();
+ }
+ }
+}
+
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum Error {
+ /// Error forming tokens from the input.
+ #[error("{0}")]
+ TokenError(#[from] ScanError),
+}
+
+#[cfg(test)]
+mod tests {
+ use encoding_rs::UTF_8;
+
+ use crate::lex::{segment::Mode, token::Token};
+
+ use super::{ErrorHandling, Lexer, Source};
+
+ #[test]
+ fn test() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_string(
+ String::from(
+ r#"#! /usr/local/bin/pspp
+DATA LIST LIST NOTABLE /a.
+BEGIN DATA.
+1
+2
+END DATA.
+LIST.
+"#,
+ ),
+ UTF_8,
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+
+ #[test]
+ fn test_scan_errors() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_file_contents(
+ String::from(
+ r#"x'123'
+x'1x'
+u''
+u'012345678'
+u'd800'
+u'110000'
+'foo
+'very long unterminated string that be ellipsized in its error message
+1e .x
+^
+�
+"#,
+ ),
+ Some(String::from("syntax.sps")),
+ UTF_8,
+ Mode::default(),
+ ErrorHandling::default(),
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+
+ #[test]
+ fn test_null_byte() {
+ let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
+ lexer.include(Source::for_file_contents(
+ String::from(
+ "datA dist list notable file='input.txt'/a b c.
+lis|.\0",
+ ),
+ Some(String::from("syntax.sps")),
+ UTF_8,
+ Mode::default(),
+ ErrorHandling::default(),
+ ));
+ loop {
+ lexer.get();
+ let token = lexer.token();
+ println!("{token:?}");
+ if let Token::End = token {
+ break;
+ }
+ }
+ }
+}
--- /dev/null
+//! PSPP lexical analysis.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". [segment] implements the segmentation phase and [scan]
+//! the scanning phase.
+//!
+//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
+//! labeled with a segment type. It outputs a stream of "scan tokens", which
+//! are the same as the tokens used by the PSPP parser with a few additional
+//! types.
+
+pub mod segment;
+pub mod scan;
+pub mod command_name;
+pub mod token;
+pub mod lexer;
--- /dev/null
+//! PSPP syntax scanning.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". [super::segment] implements the segmentation phase and
+//! this module the scanning phase.
+//!
+//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
+//! labeled with a segment type. It outputs a stream of "scan tokens", which
+//! are the same as the tokens used by the PSPP parser with a few additional
+//! types.
+
+use crate::identifier::{Identifier, ReservedWord};
+
+use super::{
+ segment::{Mode, Segment, Segmenter},
+ token::{Punct, Token},
+};
+use std::{borrow::Borrow, collections::VecDeque};
+use thiserror::Error as ThisError;
+
+#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
+pub enum ScanError {
+ /// Unterminated string constant.
+ #[error("Unterminated string constant.")]
+ ExpectedQuote,
+
+ /// Missing exponent.
+ #[error("Missing exponent following `{0}`")]
+ ExpectedExponent(String),
+
+ /// Odd length hex string.
+ #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
+ OddLengthHexString(usize),
+
+ /// Invalid hex digit.
+ #[error("Invalid hex digit {0:?}.")]
+ BadHexDigit(char),
+
+ /// Incomplete UTF-8 sequence.
+ #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ IncompleteUtf8 { substring: String, offset: usize },
+
+ /// Bad UTF-8 sequence.
+ #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
+ BadUtf8 { substring: String, offset: usize },
+
+ /// Invalid length Unicode string.
+ #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
+ BadLengthUnicodeString(usize),
+
+ /// Invalid code point.
+ #[error("U+{0:04X} is not a valid Unicode code point.")]
+ BadCodePoint(u32),
+
+ /// Expected hexadecimal Unicode code point
+ #[error("Expected hexadecimal Unicode code point.")]
+ ExpectedCodePoint,
+
+ /// `DO REPEAT` nested too deeply.
+ #[error("`DO REPEAT` nested too deeply.")]
+ DoRepeatOverflow,
+
+ /// Unexpected character.
+ #[error("Unexpected character {0:?} in input.")]
+ UnexpectedChar(char),
+}
+
+/// The input or output to token merging.
+#[derive(Clone, Debug, PartialEq)]
+pub enum ScanToken {
+ Token(Token),
+ Error(ScanError),
+}
+
+/// The result of merging tokens.
+#[derive(Clone, Debug)]
+pub enum MergeResult {
+ /// Copy one token literally from input to output.
+ Copy,
+
+ /// Expand `n` tokens from the input into `token` in the output.
+ Expand {
+ /// Number of tokens to expand.
+ n: usize,
+
+ /// Replacement token.
+ token: Token,
+ },
+}
+
+impl ScanToken {
+ pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
+ match segment {
+ Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))),
+ Segment::QuotedString => {
+ // Trim quote mark from front and back.
+ let mut chars = s.chars();
+ let quote = chars.next().unwrap();
+ let s = chars.as_str().strip_suffix(quote).unwrap();
+
+ // Replace doubled quotes by single ones.
+ let (single_quote, double_quote) = match quote {
+ '\'' => ("'", "''"),
+ '"' => ("\"", "\"\""),
+ _ => unreachable!(),
+ };
+ Some(Self::Token(Token::String(
+ s.replace(double_quote, single_quote),
+ )))
+ }
+ Segment::HexString => {
+ // Strip `X"` prefix and `"` suffix (or variations).
+ let s = &s[2..s.len() - 1];
+ for c in s.chars() {
+ if !c.is_ascii_hexdigit() {
+ return Some(Self::Error(ScanError::BadHexDigit(c)));
+ }
+ }
+ if s.len() % 2 != 0 {
+ return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
+ }
+ let bytes = s
+ .as_bytes()
+ .chunks_exact(2)
+ .map(|pair| {
+ let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
+ let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
+ hi * 16 + lo
+ })
+ .collect::<Vec<_>>();
+ match String::from_utf8(bytes) {
+ Ok(string) => Some(Self::Token(Token::String(string))),
+ Err(error) => {
+ let details = error.utf8_error();
+ let offset = details.valid_up_to() * 2;
+ let end = details
+ .error_len()
+ .map(|len| offset + len * 2)
+ .unwrap_or(s.len());
+ let substring = String::from(&s[offset..end]);
+ Some(Self::Error(if details.error_len().is_some() {
+ ScanError::BadUtf8 { substring, offset }
+ } else {
+ ScanError::IncompleteUtf8 { substring, offset }
+ }))
+ }
+ }
+ }
+ Segment::UnicodeString => {
+ // Strip `U"` prefix and `"` suffix (or variations).
+ let s = &s[2..s.len() - 1];
+ if !(1..=8).contains(&s.len()) {
+ return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len())));
+ }
+ let Ok(code_point) = u32::from_str_radix(s, 16) else {
+ return Some(Self::Error(ScanError::ExpectedCodePoint));
+ };
+ let Some(c) = char::from_u32(code_point) else {
+ return Some(Self::Error(ScanError::BadCodePoint(code_point)));
+ };
+ Some(Self::Token(Token::String(String::from(c))))
+ }
+
+ Segment::UnquotedString
+ | Segment::DoRepeatCommand
+ | Segment::InlineData
+ | Segment::Document
+ | Segment::MacroBody
+ | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
+
+ Segment::Identifier => {
+ if let Ok(reserved_word) = ReservedWord::try_from(s) {
+ match reserved_word {
+ ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))),
+ ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))),
+ ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))),
+ ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))),
+ ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))),
+ ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))),
+ ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))),
+ ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))),
+ ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))),
+ ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))),
+ ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))),
+ ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))),
+ ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
+ }
+ } else {
+ Some(Self::Token(Token::Id(Identifier::new(s).unwrap())))
+ }
+ }
+ Segment::Punct => match s {
+ "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
+ ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
+ "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
+ "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
+ "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
+ "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
+ "," => Some(Self::Token(Token::Punct(Punct::Comma))),
+ "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
+ "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
+ "&" => Some(Self::Token(Token::Punct(Punct::And))),
+ "|" => Some(Self::Token(Token::Punct(Punct::Or))),
+ "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
+ "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
+ "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
+ "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
+ ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
+ "~" => Some(Self::Token(Token::Punct(Punct::Not))),
+ ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
+ ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
+ "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
+ "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
+ "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
+ "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
+ ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
+ "!" => Some(Self::Token(Token::Punct(Punct::Bang))),
+ "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
+ "?" => Some(Self::Token(Token::Punct(Punct::Question))),
+ "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
+ "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
+ "." => Some(Self::Token(Token::Punct(Punct::Dot))),
+ "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
+ _ => unreachable!("bad punctuator {s:?}"),
+ },
+ Segment::Shbang
+ | Segment::Spaces
+ | Segment::Comment
+ | Segment::Newline
+ | Segment::CommentCommand => None,
+ Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
+ Segment::StartDocument => {
+ Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
+ }
+ Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
+ Some(Self::Token(Token::EndCommand))
+ }
+ Segment::End => Some(Self::Token(Token::End)),
+ Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
+ Segment::ExpectedExponent => {
+ Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
+ }
+ Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
+ s.chars().next().unwrap(),
+ ))),
+ }
+ }
+
+ /// Attempts to merge a sequence of tokens together into a single token. The
+ /// tokens are taken from the beginning of `input`. If successful, removes one
+ /// or more tokens from the beginning of `input` and returns the merged
+ /// token. More input tokens might be needed; if so, leaves `input` alone and
+ /// returns `None`. In the latter case, the caller should add more tokens to the
+ /// input ([Token::End] or [Token::EndCommand] is always sufficient).
+ ///
+ /// This performs two different kinds of token merging:
+ ///
+ /// - String concatenation, where syntax like `"a" + "b"` is converted into a
+ /// single string token. This is definitely needed because the parser relies
+ /// on it.
+ ///
+ /// - Negative number merging, where syntax like `-5` is converted from a pair
+ /// of tokens (a dash and a positive number) into a single token (a negative
+ /// number). This might not be needed anymore because the segmenter
+ /// directly treats a dash followed by a number, with optional intervening
+ /// white space, as a negative number. It's only needed if we want
+ /// intervening comments to be allowed or for part of the negative number
+ /// token to be produced by macro expansion.
+ pub fn merge<T>(tokens: &T) -> Option<MergeResult>
+ where
+ T: Tokens,
+ {
+ match tokens.get(0)? {
+ Token::Punct(Punct::Dash) => match tokens.get(1)? {
+ Token::Number(number) if number.is_sign_positive() => {
+ let number = *number;
+ return Some(MergeResult::Expand {
+ n: 2,
+ token: Token::Number(-number),
+ });
+ }
+ _ => Some(MergeResult::Copy),
+ },
+ Token::String(_) => {
+ let mut i = 0;
+ while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
+ && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
+ {
+ i += 1;
+ }
+ if i == 0 {
+ Some(MergeResult::Copy)
+ } else {
+ let mut output = String::new();
+ for i in 0..=i {
+ let Token::String(s) = tokens.get(i * 2).unwrap() else {
+ unreachable!()
+ };
+ output.push_str(&s);
+ }
+ Some(MergeResult::Expand {
+ n: i * 2 + 1,
+ token: Token::String(output),
+ })
+ }
+ }
+ _ => Some(MergeResult::Copy),
+ }
+ }
+}
+
+pub trait Tokens {
+ fn get(&self, index: usize) -> Option<&Token>;
+}
+
+impl<T> Tokens for VecDeque<T>
+where
+ T: Borrow<Token>,
+{
+ fn get(&self, index: usize) -> Option<&Token> {
+ self.get(index).map(|token| token.borrow())
+ }
+}
+
+pub struct StringSegmenter<'a> {
+ input: &'a str,
+ segmenter: Segmenter,
+}
+
+impl<'a> StringSegmenter<'a> {
+ pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ input,
+ segmenter: Segmenter::new(mode, is_snippet),
+ }
+ }
+}
+
+impl<'a> Iterator for StringSegmenter<'a> {
+ type Item = (&'a str, ScanToken);
+
+ fn next(&mut self) -> Option<Self::Item> {
+ loop {
+ let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+ if seg_type == Segment::End {
+ return None;
+ }
+ let (s, rest) = self.input.split_at(seg_len);
+ self.input = rest;
+
+ if let Some(token) = ScanToken::from_segment(s, seg_type) {
+ return Some((s, token));
+ }
+ }
+ }
+}
+
+pub struct StringScanner<'a> {
+ input: &'a str,
+ segmenter: Segmenter,
+ tokens: VecDeque<Token>,
+}
+
+impl<'a> StringScanner<'a> {
+ pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ input,
+ segmenter: Segmenter::new(mode, is_snippet),
+ tokens: VecDeque::with_capacity(1),
+ }
+ }
+
+ fn merge(&mut self) -> Option<ScanToken> {
+ let result = ScanToken::merge(&self.tokens)?;
+ match result {
+ MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
+ MergeResult::Expand { n, token } => {
+ self.tokens.drain(..n);
+ Some(ScanToken::Token(token))
+ }
+ }
+ }
+}
+
+impl<'a> Iterator for StringScanner<'a> {
+ type Item = ScanToken;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if let Some(token) = self.merge() {
+ return Some(token);
+ }
+ loop {
+ let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
+ if seg_type == Segment::End && self.tokens.is_empty() {
+ return None;
+ }
+ let (s, rest) = self.input.split_at(seg_len);
+ self.input = rest;
+
+ match ScanToken::from_segment(s, seg_type) {
+ Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
+ Some(ScanToken::Token(token)) => {
+ self.tokens.push_back(token);
+ if let Some(token) = self.merge() {
+ return Some(token);
+ }
+ }
+ None => (),
+ }
+ }
+ }
+}
+
+#[cfg(test)]
+mod test;
--- /dev/null
+use crate::{identifier::Identifier, lex::{
+ segment::Mode,
+ token::{Punct, Token},
+}};
+
+use super::{ScanError, ScanToken, StringScanner};
+
+fn print_token(token: &Token) {
+ match token {
+ Token::End => print!("Token::End"),
+ Token::Id(s) => print!("Token::Id(String::from({s:?}))"),
+ Token::Number(number) => print!("Token::Number({number:?})"),
+ Token::String(s) => print!("Token::String(String::from({s:?}))"),
+ Token::EndCommand => print!("Token::EndCommand"),
+ Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
+ }
+}
+
+fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) {
+ let tokens = StringScanner::new(input, mode, false).collect::<Vec<_>>();
+
+ if &tokens != expected {
+ for token in &tokens {
+ match token {
+ ScanToken::Token(token) => {
+ print!("ScanToken::Token(");
+ print_token(token);
+ print!(")");
+ }
+ ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"),
+ }
+ println!(",");
+ }
+
+ eprintln!("tokens differ from expected:");
+ let difference = diff::slice(expected, &tokens);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+}
+
+#[test]
+fn test_identifiers() {
+ check_scan(
+ r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z.
+abcd. abcd.
+QRSTUV./* end of line comment */
+QrStUv./* end of line comment */
+WXYZ. /* unterminated end of line comment
+�. /* U+FFFD is not valid in an identifier
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
+ ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Dot)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Underscore)),
+ ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Error(ScanError::UnexpectedChar('�')),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_reserved_words() {
+ check_scan(
+ r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::Eq)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::All)),
+ ScanToken::Token(Token::Punct(Punct::By)),
+ ScanToken::Token(Token::Punct(Punct::To)),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::Eq)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::All)),
+ ScanToken::Token(Token::Punct(Punct::By)),
+ ScanToken::Token(Token::Punct(Punct::To)),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_punctuation() {
+ check_scan(
+ r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
+~&|=>=><=<~=<>(),-+*/[]**
+% : ; ? _ ` { } ~
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::Punct(Punct::Asterisk)),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Punct(Punct::LSquare)),
+ ScanToken::Token(Token::Punct(Punct::RSquare)),
+ ScanToken::Token(Token::Punct(Punct::Exp)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ScanToken::Token(Token::Punct(Punct::And)),
+ ScanToken::Token(Token::Punct(Punct::Or)),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Punct(Punct::Ge)),
+ ScanToken::Token(Token::Punct(Punct::Gt)),
+ ScanToken::Token(Token::Punct(Punct::Le)),
+ ScanToken::Token(Token::Punct(Punct::Lt)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::Ne)),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::Punct(Punct::Asterisk)),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Punct(Punct::LSquare)),
+ ScanToken::Token(Token::Punct(Punct::RSquare)),
+ ScanToken::Token(Token::Punct(Punct::Exp)),
+ ScanToken::Token(Token::Punct(Punct::Percent)),
+ ScanToken::Token(Token::Punct(Punct::Colon)),
+ ScanToken::Token(Token::Punct(Punct::Semicolon)),
+ ScanToken::Token(Token::Punct(Punct::Question)),
+ ScanToken::Token(Token::Punct(Punct::Underscore)),
+ ScanToken::Token(Token::Punct(Punct::Backtick)),
+ ScanToken::Token(Token::Punct(Punct::LCurly)),
+ ScanToken::Token(Token::Punct(Punct::RCurly)),
+ ScanToken::Token(Token::Punct(Punct::Not)),
+ ],
+ );
+}
+
+#[test]
+fn test_positive_numbers() {
+ check_scan(
+ r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e-
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Number(0.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(123.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::Number(0.1)),
+ ScanToken::Token(Token::Number(0.1)),
+ ScanToken::Token(Token::Number(0.1)),
+ ScanToken::Token(Token::Number(50.0)),
+ ScanToken::Token(Token::Number(0.6)),
+ ScanToken::Token(Token::Number(70.0)),
+ ScanToken::Token(Token::Number(60.0)),
+ ScanToken::Token(Token::Number(0.006)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(30.0)),
+ ScanToken::Token(Token::Number(0.04)),
+ ScanToken::Token(Token::Number(5.0)),
+ ScanToken::Token(Token::Number(6.0)),
+ ScanToken::Token(Token::Number(0.0007)),
+ ScanToken::Token(Token::Number(12.3)),
+ ScanToken::Token(Token::Number(4.56)),
+ ScanToken::Token(Token::Number(789.0)),
+ ScanToken::Token(Token::Number(999.0)),
+ ScanToken::Token(Token::Number(0.0112)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))),
+ ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))),
+ ],
+ );
+}
+
+#[test]
+fn test_negative_numbers() {
+ check_scan(
+ r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Number(-0.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(-123.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-0.1)),
+ ScanToken::Token(Token::Number(-50.0)),
+ ScanToken::Token(Token::Number(-0.6)),
+ ScanToken::Token(Token::Number(-70.0)),
+ ScanToken::Token(Token::Number(-60.0)),
+ ScanToken::Token(Token::Number(-0.006)),
+ ScanToken::Token(Token::Number(-3.0)),
+ ScanToken::Token(Token::Number(-0.04)),
+ ScanToken::Token(Token::Number(-5.0)),
+ ScanToken::Token(Token::Number(-6.0)),
+ ScanToken::Token(Token::Number(-0.0007)),
+ ScanToken::Token(Token::Number(-12.3)),
+ ScanToken::Token(Token::Number(-4.56)),
+ ScanToken::Token(Token::Number(-789.0)),
+ ScanToken::Token(Token::Number(-999.0)),
+ ScanToken::Token(Token::Number(-0.0112)),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Punct(Punct::Dot)),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
+ ScanToken::Token(Token::Punct(Punct::Dash)),
+ ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))),
+ ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))),
+ ScanToken::Token(Token::Number(-1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_strings() {
+ check_scan(
+ r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' "" '''' """"
+'missing end quote
+"missing double quote
+'x' + "y"
++ 'z' +
+'a' /* abc */ + "b" /*
++ 'c' +/* */"d"/* */+'e'
+'foo'
++ /* special case: + in column 0 would ordinarily start a new command
+'bar'
+'foo'
+ +
+'bar'
+'foo'
++
+
+'bar'
+
++
+x"4142"+'5152'
+"4142"+
+x'5152'
+x"4142"
++u'304a'
+"�あいうえお"
+"abc"+U"FFFD"+u'3048'+"xyz"
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::String(String::from("x"))),
+ ScanToken::Token(Token::String(String::from("y"))),
+ ScanToken::Token(Token::String(String::from("abc"))),
+ ScanToken::Token(Token::String(String::from("Don't"))),
+ ScanToken::Token(Token::String(String::from("Can't"))),
+ ScanToken::Token(Token::String(String::from("Won't"))),
+ ScanToken::Token(Token::String(String::from("\"quoted\""))),
+ ScanToken::Token(Token::String(String::from("\"quoted\""))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from("'"))),
+ ScanToken::Token(Token::String(String::from("\""))),
+ ScanToken::Error(ScanError::ExpectedQuote),
+ ScanToken::Error(ScanError::ExpectedQuote),
+ ScanToken::Token(Token::String(String::from("xyzabcde"))),
+ ScanToken::Token(Token::String(String::from("foobar"))),
+ ScanToken::Token(Token::String(String::from("foobar"))),
+ ScanToken::Token(Token::String(String::from("foo"))),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from("bar"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Punct(Punct::Plus)),
+ ScanToken::Token(Token::String(String::from("AB5152"))),
+ ScanToken::Token(Token::String(String::from("4142QR"))),
+ ScanToken::Token(Token::String(String::from("ABお"))),
+ ScanToken::Token(Token::String(String::from("�あいうえお"))),
+ ScanToken::Token(Token::String(String::from("abc�えxyz"))),
+ ScanToken::Token(Token::End),
+ ],
+ );
+}
+
+#[test]
+fn test_shbang() {
+ check_scan(
+ r#"#! /usr/bin/pspp
+#! /usr/bin/pspp
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("#").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Bang)),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())),
+ ],
+ );
+}
+
+#[test]
+fn test_comments() {
+ check_scan(
+ r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("com").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("is").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::With)),
+ ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("next").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_document() {
+ check_scan(
+ r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ ScanToken::Token(Token::String(String::from("DOCUMENT one line."))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ ScanToken::Token(Token::String(String::from("DOC more"))),
+ ScanToken::Token(Token::String(String::from(" than"))),
+ ScanToken::Token(Token::String(String::from(" one"))),
+ ScanToken::Token(Token::String(String::from(" line."))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
+ ScanToken::Token(Token::String(String::from("docu"))),
+ ScanToken::Token(Token::String(String::from("first.paragraph"))),
+ ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from("second paragraph."))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_file_label() {
+ check_scan(
+ r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("label").unwrap())),
+ ScanToken::Token(Token::String(String::from("isn't quoted"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
+ ScanToken::Token(Token::String(String::from("is quoted"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
+ ScanToken::Token(Token::String(String::from("not quoted here either"))),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_begin_data() {
+ check_scan(
+ r#"begin data.
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from("123"))),
+ ScanToken::Token(Token::String(String::from("xxx"))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())),
+ ScanToken::Token(Token::String(String::from("5 6 7 /* x"))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from("end data"))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_do_repeat() {
+ check_scan(
+ r#"do repeat x=a b c
+ y=d e f.
+ do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))),
+ ScanToken::Token(Token::String(String::from("another command."))),
+ ScanToken::Token(Token::String(String::from("second command"))),
+ ScanToken::Token(Token::String(String::from("+ third command."))),
+ ScanToken::Token(Token::String(String::from(
+ "end /* x */ /* y */ repeat print.",
+ ))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+#[test]
+fn test_do_repeat_batch() {
+ check_scan(
+ r#"do repeat x=a b c
+ y=d e f
+do repeat a=1 thru 5
+another command
+second command
++ third command
+end /* x */ /* y */ repeat print
+end
+ repeat
+do
+ repeat #a=1
+
+ inner command
+end repeat
+"#,
+ Mode::Batch,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))),
+ ScanToken::Token(Token::String(String::from("another command"))),
+ ScanToken::Token(Token::String(String::from("second command"))),
+ ScanToken::Token(Token::String(String::from("+ third command"))),
+ ScanToken::Token(Token::String(String::from(
+ "end /* x */ /* y */ repeat print",
+ ))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Equals)),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::String(String::from(" inner command"))),
+ ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
+ ],
+ );
+}
+
+#[test]
+fn test_batch_mode() {
+ check_scan(
+ r#"first command
+ another line of first command
++ second command
+third command
+
+fourth command.
+ fifth command.
+"#,
+ Mode::Batch,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("another").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("line").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("of").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("second").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("third").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+}
+
+mod define {
+ use crate::{identifier::Identifier, lex::{
+ scan::ScanToken,
+ segment::Mode,
+ token::{Punct, Token},
+ }};
+
+ use super::check_scan;
+
+ #[test]
+ fn test_simple() {
+ check_scan(
+ r#"define !macro1()
+var1 var2 var3
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_no_newline_after_parentheses() {
+ check_scan(
+ r#"define !macro1() var1 var2 var3
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_no_newline_before_enddefine() {
+ check_scan(
+ r#"define !macro1()
+var1 var2 var3!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_all_on_one_line() {
+ check_scan(
+ r#"define !macro1()var1 var2 var3!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_empty() {
+ check_scan(
+ r#"define !macro1()
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_blank_lines() {
+ check_scan(
+ r#"define !macro1()
+
+
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::String(String::from(""))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_arguments() {
+ check_scan(
+ r#"define !macro1(a(), b(), c())
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_multiline_arguments() {
+ check_scan(
+ r#"define !macro1(
+ a(), b(
+ ),
+ c()
+)
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_arguments_start_on_second_line() {
+ check_scan(
+ r#"define !macro1
+(x,y,z
+)
+content 1
+content 2
+!enddefine.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Comma)),
+ ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("content 1"))),
+ ScanToken::Token(Token::String(String::from("content 2"))),
+ ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_1() {
+ check_scan(
+ r#"define !macro1.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_2() {
+ check_scan(
+ r#"define !macro1
+x.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_3() {
+ check_scan(
+ r#"define !macro1(.
+x.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_early_end_of_command_4() {
+ // Notice the command terminator at the end of the DEFINE command,
+ // which should not be there and ends it early.
+ check_scan(
+ r#"define !macro1.
+data list /x 1.
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::EndCommand),
+ ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
+ ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
+ ScanToken::Token(Token::Punct(Punct::Slash)),
+ ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
+ ScanToken::Token(Token::Number(1.0)),
+ ScanToken::Token(Token::EndCommand),
+ ],
+ );
+ }
+
+ #[test]
+ fn test_missing_enddefine() {
+ check_scan(
+ r#"define !macro1()
+content line 1
+content line 2
+"#,
+ Mode::Auto,
+ &[
+ ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
+ ScanToken::Token(Token::String(String::from("!macro1"))),
+ ScanToken::Token(Token::Punct(Punct::LParen)),
+ ScanToken::Token(Token::Punct(Punct::RParen)),
+ ScanToken::Token(Token::String(String::from("content line 1"))),
+ ScanToken::Token(Token::String(String::from("content line 2"))),
+ ScanToken::Token(Token::End),
+ ],
+ );
+ }
+}
--- /dev/null
+//! Syntax segmentation.
+//!
+//! PSPP divides traditional "lexical analysis" or "tokenization" into two
+//! phases: a lower-level phase called "segmentation" and a higher-level phase
+//! called "scanning". This module implements the segmentation phase.
+//! [`super::scan`] contains declarations for the scanning phase.
+//!
+//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
+//! (a segment type) for each byte or contiguous sequence of bytes in the input.
+//! It also, in a few corner cases, outputs zero-width segments that label the
+//! boundary between a pair of bytes in the input.
+//!
+//! Some segment types correspond directly to tokens; for example, an
+//! "identifier" segment ([`Segment::Identifier`]) becomes an identifier
+//! token later in lexical analysis.  Other segments contribute to tokens but
+//! do not correspond directly; for example, multiple quoted string segments
+//! ([`Segment::QuotedString`]) separated by spaces ([`Segment::Spaces`]) and
+//! "+" punctuators ([`Segment::Punct`]) may combine into one string token.
+//! Still other segments are ignored (e.g. [`Segment::Spaces`]) or trigger
+//! special behavior such as error messages (e.g. [`Segment::ExpectedQuote`]).
+
+use crate::{
+ identifier::{id_match, id_match_n, IdentifierChar},
+ prompt::PromptStyle,
+};
+use bitflags::bitflags;
+
+use super::command_name::{command_match, COMMAND_NAMES};
+
+/// Segmentation mode.
+///
+/// PSPP syntax is written in one of two modes which are broadly defined as
+/// follows:
+///
+/// - In interactive mode, commands end with a period at the end of the line
+/// or with a blank line.
+///
+/// - In batch mode, the second and subsequent lines of a command are indented
+/// from the left margin.
+///
+/// The segmenter can also try to automatically detect the mode in use, using a
+/// heuristic that is usually correct.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum Mode {
+ /// Try to interpret input correctly regardless of whether it is written
+ /// for interactive or batch mode.
+ #[default]
+ Auto,
+
+ /// Interactive syntax mode.
+ Interactive,
+
+ /// Batch syntax mode.
+ Batch,
+}
+
+/// The type of a segment.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Segment {
+ Number,
+ QuotedString,
+ HexString,
+ UnicodeString,
+ UnquotedString,
+ Identifier,
+ Punct,
+ Shbang,
+ Spaces,
+ Comment,
+ Newline,
+ CommentCommand,
+ DoRepeatCommand,
+ DoRepeatOverflow,
+ InlineData,
+ MacroName,
+ MacroBody,
+ StartDocument,
+ Document,
+ StartCommand,
+ SeparateCommands,
+ EndCommand,
+ End,
+ ExpectedQuote,
+ ExpectedExponent,
+ UnexpectedChar,
+}
+
+bitflags! {
+ #[derive(Copy, Clone, Debug)]
+ pub struct Substate: u8 {
+ const START_OF_LINE = 1;
+ const START_OF_COMMAND = 2;
+ }
+}
+
+#[derive(Copy, Clone)]
+pub struct Segmenter {
+ state: (State, Substate),
+ nest: u8,
+ mode: Mode,
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct Incomplete;
+
+impl Segmenter {
+ /// Returns a segmenter with the given syntax `mode`.
+ ///
+ /// If `is_snippet` is false, then the segmenter will parse as if it's being
+ /// given a whole file. This means, for example, that it will interpret `-`
+ /// or `+` at the beginning of the syntax as a separator between commands
+ /// (since `-` or `+` at the beginning of a line has this meaning).
+ ///
+ /// If `is_snippet` is true, then the segmenter will parse as if it's being
+ /// given an isolated piece of syntax. This means that, for example, that
+ /// it will interpret `-` or `+` at the beginning of the syntax as an
+ /// operator token or (if followed by a digit) as part of a number.
+ pub fn new(mode: Mode, is_snippet: bool) -> Self {
+ Self {
+ state: if is_snippet {
+ (State::General, Substate::empty())
+ } else {
+ (State::Shbang, Substate::empty())
+ },
+ mode,
+ nest: 0,
+ }
+ }
+
+ pub fn mode(&self) -> Mode {
+ self.mode
+ }
+
+ fn start_of_line(&self) -> bool {
+ self.state.1.contains(Substate::START_OF_LINE)
+ }
+
+ fn start_of_command(&self) -> bool {
+ self.state.1.contains(Substate::START_OF_COMMAND)
+ }
+
+ /// Returns the style of command prompt to display to an interactive user
+ /// for input in the current state.. The return value is most accurate in
+ /// mode `Mode::Interactive` and at the beginning of a line (that is, if
+ /// [`Segmenter::push`] consumed as much as possible of the input up to a
+ /// new-line).
+ pub fn prompt(&self) -> PromptStyle {
+ match self.state.0 {
+ State::Shbang => PromptStyle::First,
+ State::General => {
+ if self.start_of_command() {
+ PromptStyle::First
+ } else {
+ PromptStyle::Later
+ }
+ }
+ State::Comment1 | State::Comment2 => PromptStyle::Comment,
+ State::Document1 | State::Document2 => PromptStyle::Document,
+ State::Document3 => PromptStyle::First,
+ State::FileLabel1 => PromptStyle::Later,
+ State::FileLabel2 | State::FileLabel3 => PromptStyle::First,
+ State::DoRepeat1 | State::DoRepeat2 => {
+ if self.start_of_command() {
+ PromptStyle::First
+ } else {
+ PromptStyle::Later
+ }
+ }
+ State::DoRepeat3 => PromptStyle::DoRepeat,
+ State::DoRepeat4 => PromptStyle::DoRepeat,
+ State::Define1 | State::Define2 | State::Define3 => {
+ if self.start_of_command() {
+ PromptStyle::First
+ } else {
+ PromptStyle::Later
+ }
+ }
+ State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
+ State::BeginData1 => PromptStyle::First,
+ State::BeginData2 => PromptStyle::Later,
+ State::BeginData3 | State::BeginData4 => PromptStyle::Data,
+ }
+ }
+
+ /// Attempts to label a prefix of the remaining input with a segment type.
+ /// The caller supplies a prefix of the remaining input as `input`. If
+ /// `eof` is true, then `input` is the entire (remainder) of the input; if
+ /// `eof` is false, then further input is potentially available.
+ ///
+ /// The input may contain '\n' or '\r\n' line ends in any combination.
+ ///
+ /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
+ /// in the segment at the beginning of `input` (a number in
+ /// `0..=input.len()`) and the type of that segment. The next call should
+ /// not include those bytes in `input`, because they have (figuratively)
+ /// been consumed by the segmenter.
+ ///
+ /// Segments can have zero length, including segment types `Segment::End`,
+ /// `Segment::SeparateCommands`, `Segment::StartDocument`, `Segment::InlineData`, and
+ /// `Segment::Spaces`.
+ ///
+ /// Failure occurs only if the segment type of the bytes in `input` cannot
+ /// yet be determined. In this case, this function returns `Err(Incomplete)`. If
+ /// more input is available, the caller should obtain some more, then call
+ /// again with a longer `input`. If this is not enough, the process might
+ /// need to repeat again and again. If input is exhausted, then the caller
+ /// may call again setting `eof` to true. This function will never return
+ /// `Err(Incomplete)` when `eof` is true.
+ ///
+ /// The caller must not, in a sequence of calls, supply contradictory input.
+ /// That is, bytes provided as part of `input` in one call, but not
+ /// consumed, must not be provided with *different* values on subsequent
+ /// calls. This is because the function must often make decisions based on
+ /// looking ahead beyond the bytes that it consumes.
+ fn push_rest<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ if input.is_empty() {
+ if eof {
+ return Ok((input, Segment::End));
+ } else {
+ return Err(Incomplete);
+ };
+ }
+
+ match self.state.0 {
+ State::Shbang => return self.parse_shbang(input, eof),
+ State::General => {
+ if self.start_of_line() {
+ self.parse_start_of_line(input, eof)
+ } else {
+ self.parse_mid_line(input, eof)
+ }
+ }
+ State::Comment1 => self.parse_comment_1(input, eof),
+ State::Comment2 => self.parse_comment_2(input, eof),
+ State::Document1 => self.parse_document_1(input, eof),
+ State::Document2 => self.parse_document_2(input, eof),
+ State::Document3 => self.parse_document_3(input, eof),
+ State::FileLabel1 => self.parse_file_label_1(input, eof),
+ State::FileLabel2 => self.parse_file_label_2(input, eof),
+ State::FileLabel3 => self.parse_file_label_3(input, eof),
+ State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
+ State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
+ State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
+ State::DoRepeat4 => self.parse_do_repeat_4(input),
+ State::Define1 => self.parse_define_1_2(input, eof),
+ State::Define2 => self.parse_define_1_2(input, eof),
+ State::Define3 => self.parse_define_3(input, eof),
+ State::Define4 => self.parse_define_4_5(input, eof),
+ State::Define5 => self.parse_define_4_5(input, eof),
+ State::Define6 => self.parse_define_6(input, eof),
+ State::BeginData1 => self.parse_begin_data_1(input, eof),
+ State::BeginData2 => self.parse_begin_data_2(input, eof),
+ State::BeginData3 => self.parse_begin_data_3(input, eof),
+ State::BeginData4 => self.parse_begin_data_4(input, eof),
+ }
+ }
+
+ pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
+ let (rest, seg_type) = self.push_rest(input, eof)?;
+ Ok((input.len() - rest.len(), seg_type))
+ }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+enum State {
+ Shbang,
+ General,
+ Comment1,
+ Comment2,
+ Document1,
+ Document2,
+ Document3,
+ FileLabel1,
+ FileLabel2,
+ FileLabel3,
+ DoRepeat1,
+ DoRepeat2,
+ DoRepeat3,
+ DoRepeat4,
+ Define1,
+ Define2,
+ Define3,
+ Define4,
+ Define5,
+ Define6,
+ BeginData1,
+ BeginData2,
+ BeginData3,
+ BeginData4,
+}
+
+fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
+ let mut iter = input.chars();
+ match iter.next() {
+ None if !eof => Err(Incomplete),
+ c => Ok((c, iter.as_str())),
+ }
+}
+
+fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(input);
+ };
+ match c {
+ '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input),
+ '*' => {
+ if let (Some('/'), rest) = take(rest, eof)? {
+ return Ok(rest);
+ }
+ }
+ _ => (),
+ };
+ input = rest;
+ }
+}
+
+fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
+where
+ F: Fn(char) -> bool,
+{
+ let input = input.trim_start_matches(f);
+ if input.is_empty() && !eof {
+ Err(Incomplete)
+ } else {
+ Ok(input)
+ }
+}
+
+fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
+where
+ F: Fn(char) -> bool,
+{
+ if let (Some(c), rest) = take(input, eof)? {
+ if f(c) {
+ return Ok(Some(rest));
+ }
+ }
+ Ok(None)
+}
+
+fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(input);
+ };
+ match c {
+ '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+ c if c.is_whitespace() => (),
+ _ => return Ok(input),
+ }
+ input = rest;
+ }
+}
+
+fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
+ skip_matching(|c| c.is_ascii_digit(), input, eof)
+}
+
+fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(input);
+ };
+ match c {
+ '/' => {
+ let (c, rest2) = take(rest, eof)?;
+ match c {
+ Some('*') => input = skip_comment(rest2, eof)?,
+ Some(_) | None => return Ok(rest),
+ }
+ }
+ '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
+ c if c.is_whitespace() => input = rest,
+ _ => return Ok(input),
+ };
+ }
+}
+
+fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(false);
+ };
+ match c {
+ 'x' | 'X' | 'u' | 'U' => {
+ let (c, _rest) = take(rest, eof)?;
+ Ok(c == Some('\'') || c == Some('"'))
+ }
+ '\'' | '"' => Ok(true),
+ '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
+ _ => Ok(false),
+ }
+}
+
+fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(true);
+ };
+ Ok(match c {
+ '\n' => true,
+ '\r' => take(rest, eof)?.0 == Some('\n'),
+ _ => false,
+ })
+}
+
+fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ is_end_of_line(skip_spaces_and_comments(input, eof)?, eof)
+}
+
+fn first(s: &str) -> char {
+ s.chars().next().unwrap()
+}
+fn get_command_name_candidates(target: &str) -> &[&'static str] {
+ if target.is_empty() {
+ return &[];
+ }
+ let target_first = first(target).to_ascii_uppercase();
+ let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first);
+ let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first);
+ &COMMAND_NAMES[low..high]
+}
+
+fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let command_name = input
+ .split(|c: char| {
+ !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-')
+ })
+ .next()
+ .unwrap();
+ if !eof && command_name.len() == input.len() {
+ return Err(Incomplete);
+ }
+ let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
+ for command in get_command_name_candidates(command_name) {
+ if let Some(m) = command_match(command, command_name) {
+ if m.missing_words <= 0 {
+ return Ok(true);
+ }
+ }
+ }
+ Ok(false)
+}
+
+impl Segmenter {
+ fn parse_shbang<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ if let (Some('#'), rest) = take(input, eof)? {
+ if let (Some('!'), rest) = take(rest, eof)? {
+ let rest = self.parse_full_line(rest, eof)?;
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((rest, Segment::Shbang));
+ }
+ }
+
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ self.push_rest(input, eof)
+ }
+ fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
+ match self.mode {
+ Mode::Auto => detect_command_name(input, eof),
+ Mode::Interactive => Ok(false),
+ Mode::Batch => Ok(true),
+ }
+ }
+ fn parse_start_of_line<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ debug_assert_eq!(self.state.0, State::General);
+ debug_assert!(self.start_of_line());
+ debug_assert!(!input.is_empty());
+
+ let (Some(c), rest) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ match c {
+ '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
+ // This `+` is punctuation that may separate pieces of a string.
+ self.state = (State::General, Substate::empty());
+ return Ok((rest, Segment::Punct));
+ }
+ '+' | '-' | '.' => {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((rest, Segment::StartCommand));
+ }
+ _ if c.is_whitespace() => {
+ if at_end_of_line(input, eof)? {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::SeparateCommands));
+ }
+ }
+ _ => {
+ if self.at_command_start(input, eof)?
+ && !self.state.1.contains(Substate::START_OF_COMMAND)
+ {
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::StartCommand));
+ }
+ }
+ }
+ self.state.1 = Substate::START_OF_COMMAND;
+ self.parse_mid_line(input, eof)
+ }
+ fn parse_mid_line<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ debug_assert!(self.state.0 == State::General);
+ debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
+ let (Some(c), rest) = take(input, eof)? else {
+ unreachable!()
+ };
+ match c {
+ '\r' | '\n' if is_end_of_line(input, eof)? => {
+ self.state.1 |= Substate::START_OF_LINE;
+ Ok((
+ self.parse_newline(input, eof).unwrap().unwrap(),
+ Segment::Newline,
+ ))
+ }
+ '/' => {
+ if let (Some('*'), rest) = take(rest, eof)? {
+ let rest = skip_comment(rest, eof)?;
+ return Ok((rest, Segment::Comment));
+ } else {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::Punct));
+ }
+ }
+ '-' => {
+ let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
+ match c {
+ Some(c) if c.is_ascii_digit() => {
+ return self.parse_number(rest, eof);
+ }
+ Some('.') => {
+ if let (Some(c), _rest) = take(rest2, eof)? {
+ if c.is_ascii_digit() {
+ return self.parse_number(rest, eof);
+ }
+ }
+ }
+ None | Some(_) => (),
+ }
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::Punct));
+ }
+ '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::Punct));
+ }
+ '*' => {
+ if self.state.1.contains(Substate::START_OF_COMMAND) {
+ self.state = (State::Comment1, Substate::empty());
+ self.parse_comment_1(input, eof)
+ } else {
+ self.parse_digraph(&['*'], rest, eof)
+ }
+ }
+ '<' => self.parse_digraph(&['=', '>'], rest, eof),
+ '>' => self.parse_digraph(&['='], rest, eof),
+ '~' => self.parse_digraph(&['='], rest, eof),
+ '.' if at_end_of_line(rest, eof)? => {
+ self.state.1 = Substate::START_OF_COMMAND;
+ Ok((rest, Segment::EndCommand))
+ }
+ '.' => match take(rest, eof)? {
+ (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
+ _ => Ok((rest, Segment::Punct)),
+ },
+ '0'..='9' => self.parse_number(input, eof),
+ 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof),
+ 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof),
+ '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof),
+ '!' => {
+ let (c, rest2) = take(rest, eof)?;
+ match c {
+ Some('*') => Ok((rest2, Segment::Punct)),
+ Some(_) => self.parse_id(input, eof),
+ None => Ok((rest, Segment::Punct)),
+ }
+ }
+ c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)),
+ c if c.may_start_id() => self.parse_id(input, eof),
+ '#'..='~' if c != '\\' && c != '^' => {
+ self.state.1 = Substate::empty();
+ Ok((rest, Segment::Punct))
+ }
+ _ => {
+ self.state.1 = Substate::empty();
+ Ok((rest, Segment::UnexpectedChar))
+ }
+ }
+ }
+ fn parse_string<'a>(
+ &mut self,
+ segment: Segment,
+ quote: char,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ while let (Some(c), rest) = take(input, eof)? {
+ match c {
+ _ if c == quote => {
+ let (c, rest2) = take(rest, eof)?;
+ if c != Some(quote) {
+ self.state.1 = Substate::empty();
+ return Ok((rest, segment));
+ }
+ input = rest2;
+ }
+ '\r' | '\n' if is_end_of_line(input, eof)? => break,
+ _ => input = rest,
+ }
+ }
+ self.state.1 = Substate::empty();
+ Ok((input, Segment::ExpectedQuote))
+ }
+ fn maybe_parse_string<'a>(
+ &mut self,
+ segment: Segment,
+ input: (&'a str, &'a str),
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ match take(input.1, eof)? {
+ (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof),
+ _ => self.parse_id(input.0, eof),
+ }
+ }
+ fn next_id_in_command<'a>(
+ &self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, &'a str), Incomplete> {
+ let mut sub = Segmenter::new(self.mode, true);
+ loop {
+ let (seg_len, seg_type) = sub.push(input, eof)?;
+ let (segment, rest) = input.split_at(seg_len);
+ match seg_type {
+ Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
+
+ Segment::Identifier => return Ok((segment, rest)),
+
+ Segment::Number
+ | Segment::QuotedString
+ | Segment::HexString
+ | Segment::UnicodeString
+ | Segment::UnquotedString
+ | Segment::Punct
+ | Segment::CommentCommand
+ | Segment::DoRepeatCommand
+ | Segment::DoRepeatOverflow
+ | Segment::InlineData
+ | Segment::MacroName
+ | Segment::MacroBody
+ | Segment::StartDocument
+ | Segment::Document
+ | Segment::StartCommand
+ | Segment::SeparateCommands
+ | Segment::EndCommand
+ | Segment::End
+ | Segment::ExpectedQuote
+ | Segment::ExpectedExponent
+ | Segment::UnexpectedChar => return Ok(("", rest)),
+ }
+ input = rest;
+ }
+ }
+ fn parse_id<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (Some(_), mut end) = take(input, eof).unwrap() else {
+ unreachable!()
+ };
+ while let (Some(c), rest) = take(end, eof)? {
+ if !c.may_continue_id() {
+ break;
+ };
+ end = rest;
+ }
+ let identifier = &input[..input.len() - end.len()];
+ let identifier = match identifier.strip_suffix('.') {
+ Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
+ _ => identifier,
+ };
+ let rest = &input[identifier.len()..];
+
+ if self.state.1.contains(Substate::START_OF_COMMAND) {
+ if id_match_n("COMMENT", identifier, 4) {
+ self.state = (State::Comment1, Substate::empty());
+ return self.parse_comment_1(input, eof);
+ } else if id_match("DOCUMENT", identifier) {
+ self.state = (State::Document1, Substate::empty());
+ return Ok((input, Segment::StartDocument));
+ } else if id_match_n("DEFINE", identifier, 6) {
+ self.state = (State::Define1, Substate::empty());
+ } else if id_match("FILE", identifier) {
+ if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
+ self.state = (State::FileLabel1, Substate::empty());
+ return Ok((rest, Segment::Identifier));
+ }
+ } else if id_match("DO", identifier) {
+ if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
+ self.state = (State::DoRepeat1, Substate::empty());
+ return Ok((rest, Segment::Identifier));
+ }
+ } else if id_match("BEGIN", identifier) {
+ let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
+ if id_match("DATA", next_id) {
+ let rest2 = skip_spaces_and_comments(rest2, eof)?;
+ let rest2 = if let Some(s) = rest2.strip_prefix('.') {
+ skip_spaces_and_comments(s, eof)?
+ } else {
+ rest2
+ };
+ if is_end_of_line(rest2, eof)? {
+ let s = &input[..input.len() - rest2.len()];
+ self.state = (
+ if s.contains('\n') {
+ State::BeginData1
+ } else {
+ State::BeginData2
+ },
+ Substate::empty(),
+ );
+ return Ok((rest, Segment::Identifier));
+ }
+ }
+ }
+ }
+
+ self.state.1 = Substate::empty();
+ Ok((
+ rest,
+ if identifier != "!" {
+ Segment::Identifier
+ } else {
+ Segment::Punct
+ },
+ ))
+ }
+ fn parse_digraph<'a>(
+ &mut self,
+ seconds: &[char],
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (c, rest) = take(input, eof)?;
+ self.state.1 = Substate::empty();
+ Ok((
+ match c {
+ Some(c) if seconds.contains(&c) => rest,
+ _ => input,
+ },
+ Segment::Punct,
+ ))
+ }
+ fn parse_number<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut input = skip_digits(input, eof)?;
+ if let Some(rest) = match_char(|c| c == '.', input, eof)? {
+ let rest2 = skip_digits(rest, eof)?;
+ if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
+ input = rest2;
+ }
+ };
+ if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
+ let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
+ let rest2 = skip_digits(rest, eof)?;
+ if rest2.len() == rest.len() {
+ self.state.1 = Substate::empty();
+ return Ok((rest, Segment::ExpectedExponent));
+ }
+ input = rest2;
+ }
+ self.state.1 = Substate::empty();
+ Ok((input, Segment::Number))
+ }
+ fn parse_comment_1<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ enum CommentState<'a> {
+ Blank,
+ NotBlank,
+ Period(&'a str),
+ }
+ let mut state = CommentState::Blank;
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ // End of file.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::SeparateCommands));
+ };
+ match c {
+ '.' => state = CommentState::Period(input),
+ '\n' | '\r' if is_end_of_line(input, eof)? => {
+ match state {
+ CommentState::Blank => {
+ // Blank line ends comment command.
+ self.state = (State::General, Substate::START_OF_COMMAND);
+ return Ok((input, Segment::SeparateCommands));
+ }
+ CommentState::Period(period) => {
+ // '.' at end of line ends comment command.
+ self.state = (State::General, Substate::empty());
+ return Ok((period, Segment::CommentCommand));
+ }
+ CommentState::NotBlank => {
+ // Comment continues onto next line.
+ self.state = (State::Comment2, Substate::empty());
+ return Ok((input, Segment::CommentCommand));
+ }
+ }
+ }
+ c if c.is_whitespace() => (),
+ _ => state = CommentState::NotBlank,
+ }
+ input = rest;
+ }
+ }
+ fn parse_comment_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+
+ let new_command = match take(rest, eof)?.0 {
+ Some('+') | Some('-') | Some('.') => true,
+ Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
+ None | Some(_) => false,
+ };
+ if new_command {
+ self.state = (
+ State::General,
+ Substate::START_OF_LINE | Substate::START_OF_COMMAND,
+ );
+ } else {
+ self.state = (State::Comment1, Substate::empty());
+ }
+ Ok((rest, Segment::Newline))
+ }
+ fn parse_document_1<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut end_cmd = false;
+ loop {
+ let (Some(c), rest) = take(input, eof)? else {
+ self.state = (State::Document3, Substate::empty());
+ return Ok((input, Segment::Document));
+ };
+ match c {
+ '.' => end_cmd = true,
+ '\n' | '\r' if is_end_of_line(input, eof)? => {
+ self.state.0 = if end_cmd {
+ State::Document3
+ } else {
+ State::Document2
+ };
+ return Ok((input, Segment::Document));
+ }
+ c if !c.is_whitespace() => end_cmd = false,
+ _ => (),
+ }
+ input = rest;
+ }
+ }
+ fn parse_document_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+ self.state = (State::Document1, Substate::empty());
+ Ok((rest, Segment::Newline))
+ }
+ fn parse_document_3<'a>(
+ &mut self,
+ input: &'a str,
+ _eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ Ok((input, Segment::EndCommand))
+ }
+ fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
+ let input = skip_spaces_and_comments(input, eof)?;
+ match take(input, eof)?.0 {
+ Some('\'') | Some('"') | Some('\n') => Ok(true),
+ _ => Ok(false),
+ }
+ }
+ fn parse_file_label_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut sub = Segmenter {
+ state: (State::General, self.state.1),
+ ..*self
+ };
+ let (rest, segment) = sub.push_rest(input, eof)?;
+ if segment == Segment::Identifier {
+ let id = &input[..input.len() - rest.len()];
+ debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
+ if Self::quoted_file_label(rest, eof)? {
+ *self = sub;
+ } else {
+ self.state.0 = State::FileLabel2;
+ }
+ } else {
+ self.state.1 = sub.state.1;
+ }
+ Ok((rest, segment))
+ }
+ fn parse_file_label_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let input = skip_spaces(input, eof)?;
+ self.state = (State::FileLabel3, Substate::empty());
+ Ok((input, Segment::Spaces))
+ }
+ fn parse_file_label_3<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut end_cmd = None;
+ loop {
+ let (c, rest) = take(input, eof)?;
+ match c {
+ None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
+ self.state = (State::General, Substate::empty());
+ return Ok((end_cmd.unwrap_or(input), Segment::UnquotedString));
+ }
+ None => unreachable!(),
+ Some('.') => end_cmd = Some(input),
+ Some(c) if !c.is_whitespace() => end_cmd = None,
+ Some(_) => (),
+ }
+ input = rest;
+ }
+ }
+ fn subparse<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let mut sub = Segmenter {
+ mode: self.mode,
+ state: (State::General, self.state.1),
+ nest: 0,
+ };
+ let result = sub.push_rest(input, eof)?;
+ self.state.1 = sub.state.1;
+ Ok(result)
+ }
+ /// We are segmenting a `DO REPEAT` command, currently reading the syntax
+ /// that defines the stand-in variables (the head) before the lines of
+ /// syntax to be repeated (the body).
+ fn parse_do_repeat_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::SeparateCommands {
+ // We reached a blank line that separates the head from the body.
+ self.state.0 = State::DoRepeat2;
+ } else if segment == Segment::EndCommand || segment == Segment::StartCommand {
+ // We reached the body.
+ self.state.0 = State::DoRepeat3;
+ self.nest = 1;
+ }
+ Ok((rest, segment))
+ }
+ /// We are segmenting a `DO REPEAT` command, currently reading a blank line
+ /// that separates the head from the body.
+ fn parse_do_repeat_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::Newline {
+ // We reached the body.
+ self.state.0 = State::DoRepeat3;
+ self.nest = 1;
+ }
+ Ok((rest, segment))
+ }
+ fn parse_newline<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<Option<&'a str>, Incomplete> {
+ let (Some(c), rest) = take(input, eof)? else {
+ return Ok(None);
+ };
+ match c {
+ '\n' => Ok(Some(rest)),
+ '\r' => {
+ if let (Some('\n'), rest) = take(rest, eof)? {
+ Ok(Some(rest))
+ } else {
+ Ok(None)
+ }
+ }
+ _ => Ok(None),
+ }
+ }
+
+ fn parse_full_line<'a>(
+ &mut self,
+ mut input: &'a str,
+ eof: bool,
+ ) -> Result<&'a str, Incomplete> {
+ loop {
+ if is_end_of_line(input, eof)? {
+ return Ok(input);
+ }
+ input = take(input, eof).unwrap().1;
+ }
+ }
+ fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<isize, Incomplete> {
+ let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
+ let (id1, input) = self.next_id_in_command(input, eof)?;
+ if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) {
+ Ok(1)
+ } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0)
+ {
+ Ok(-1)
+ } else {
+ Ok(0)
+ }
+ }
+ /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
+ /// are to be repeated. Report each line of syntax as a single
+ /// [`Segment::DoRepeatCommand`].
+ ///
+ /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
+ /// blocks inside the lines we're segmenting. `self.nest` counts the
+ /// nesting level, starting at 1.
+ fn parse_do_repeat_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ if let Some(rest) = self.parse_newline(input, eof)? {
+ return Ok((rest, Segment::Newline));
+ }
+ let rest = self.parse_full_line(input, eof)?;
+ let direction = self.check_repeat_command(input, eof)?;
+ if direction > 0 {
+ if let Some(nest) = self.nest.checked_add(1) {
+ self.nest = nest;
+ } else {
+ self.state.0 = State::DoRepeat4;
+ }
+ } else if direction < 0 {
+ self.nest -= 1;
+ if self.nest == 0 {
+ // Nesting level dropped to 0, so we've finished reading the `DO
+ // REPEAT` body.
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ return self.push_rest(input, eof);
+ }
+ }
+ return Ok((rest, Segment::DoRepeatCommand));
+ }
+ fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Segment), Incomplete> {
+ self.state.0 = State::DoRepeat3;
+ Ok((input, Segment::DoRepeatOverflow))
+ }
+ /// We are segmenting a `DEFINE` command, which consists of:
+ ///
+ /// - The `DEFINE` keyword.
+ ///
+ /// - An identifier. We transform this into `Segment::MacroName` instead of
+ /// `Segment::Identifier` because this identifier must never be macro-expanded.
+ ///
+ /// - Anything but `(`.
+ ///
+ /// - `(` followed by a sequence of tokens possibly including balanced
+ /// parentheses up to a final `)`.
+ ///
+ /// - A sequence of any number of lines, one string per line, ending with
+ /// `!ENDDEFINE`. The first line is usually blank (that is, a newline
+ /// follows the `(`). The last line usually just has `!ENDDEFINE.` on
+ /// it, but it can start with other tokens. The whole
+ /// DEFINE...!ENDDEFINE can be on a single line, even.
+ fn parse_define_1_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ match segment {
+ Segment::Identifier if self.state.0 == State::Define1 => {
+ self.state.0 = State::Define2;
+ return Ok((rest, Segment::MacroName));
+ }
+ Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
+ // The DEFINE command is malformed because we reached its end
+ // without ever hitting a `(` token. Transition back to general
+ // parsing.
+ self.state.0 = State::General;
+ }
+ Segment::Punct if input.starts_with('(') => {
+ self.state.0 = State::Define3;
+ self.nest = 1;
+ }
+ _ => (),
+ }
+ Ok((rest, segment))
+ }
+ fn parse_define_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ match segment {
+ Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
+ // The DEFINE command is malformed because we reached its end
+ // before the parenthesized argument list was closed by `)`.
+ // Transition back to general parsing.
+ self.state.0 = State::General;
+ }
+ Segment::Punct if input.starts_with('(') => {
+ self.nest += 1;
+ }
+ Segment::Punct if input.starts_with(')') => {
+ self.nest -= 1;
+ if self.nest == 0 {
+ self.state = (State::Define4, Substate::empty());
+ }
+ }
+ _ => (),
+ }
+ Ok((rest, segment))
+ }
+ fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
+ loop {
+ input = skip_spaces_and_comments(input, true).unwrap();
+ let (Some(c), rest) = take(input, true).unwrap() else {
+ return None;
+ };
+ match c {
+ '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
+ return Some(input)
+ }
+ '\'' | '"' => {
+ let index = rest.find(c)?;
+ input = &rest[index + 1..];
+ }
+ _ => input = rest,
+ }
+ }
+ }
+
+ /// We are in the body of a macro definition, looking for additional lines
+ /// of the body or `!ENDDEFINE`.
+ ///
+ /// In `State::Define4`, we're parsing the first line of the macro body (the
+ /// same line as the closing parenthesis in the argument definition). In
+ /// `State::Define5`, we're on a later line.
+ fn parse_define_4_5<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_full_line(input, eof)?;
+ let line = &input[..input.len() - rest.len()];
+ if let Some(end) = Self::find_enddefine(line) {
+ // Macro ends at the !ENDDEFINE on this line.
+ self.state = (State::General, Substate::empty());
+ let (prefix, rest) = input.split_at(line.len() - end.len());
+ if prefix.is_empty() {
+ // Line starts with `!ENDDEFINE`.
+ self.push_rest(input, eof)
+ } else if prefix.trim_start().is_empty() {
+ // Line starts with spaces followed by `!ENDDEFINE`.
+ Ok((rest, Segment::Spaces))
+ } else {
+ // Line starts with some content followed by `!ENDDEFINE`.
+ Ok((rest, Segment::MacroBody))
+ }
+ } else {
+ // No `!ENDDEFINE`. We have a full line of macro body.
+ //
+ // If the first line of the macro body is blank, we just report it
+ // as spaces, or not at all if there are no spaces, because it's not
+ // significant.
+ //
+ // However, if it's a later line, we need to report it because blank
+ // lines can have significance.
+ let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() {
+ if line.is_empty() {
+ return self.parse_define_6(input, eof);
+ }
+ Segment::Spaces
+ } else {
+ Segment::MacroBody
+ };
+ self.state.0 = State::Define6;
+ Ok((rest, segment))
+ }
+ }
+ fn parse_define_6<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+ self.state.0 = State::Define5;
+ Ok((rest, Segment::Newline))
+ }
+ fn parse_begin_data_1<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::Newline {
+ self.state.0 = State::BeginData2;
+ }
+ Ok((rest, segment))
+ }
+ fn parse_begin_data_2<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let (rest, segment) = self.subparse(input, eof)?;
+ if segment == Segment::Newline {
+ self.state.0 = State::BeginData3;
+ }
+ Ok((rest, segment))
+ }
+ fn is_end_data(line: &str) -> bool {
+ let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
+ return false;
+ };
+ let (Some(c), rest) = take(rest, true).unwrap() else {
+ return false;
+ };
+ if !c.is_whitespace() {
+ return false;
+ };
+ let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
+ return false;
+ };
+
+ let mut endcmd = false;
+ for c in rest.chars() {
+ match c {
+ '.' if endcmd => return false,
+ '.' => endcmd = true,
+ c if c.is_whitespace() => (),
+ _ => return false,
+ }
+ }
+ true
+ }
+ fn parse_begin_data_3<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_full_line(input, eof)?;
+ let line = &input[..input.len() - rest.len()];
+ if Self::is_end_data(line) {
+ self.state = (
+ State::General,
+ Substate::START_OF_COMMAND | Substate::START_OF_LINE,
+ );
+ self.push_rest(input, eof)
+ } else {
+ self.state.0 = State::BeginData4;
+ Ok((rest, Segment::InlineData))
+ }
+ }
+ fn parse_begin_data_4<'a>(
+ &mut self,
+ input: &'a str,
+ eof: bool,
+ ) -> Result<(&'a str, Segment), Incomplete> {
+ let rest = self.parse_newline(input, eof)?.unwrap();
+ self.state.0 = State::BeginData3;
+ Ok((rest, Segment::Newline))
+ }
+}
+
+fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
+ line.get(..pattern.len())
+ .map(|prefix| {
+ prefix
+ .eq_ignore_ascii_case(pattern)
+ .then(|| &line[pattern.len()..])
+ })
+ .flatten()
+}
+
+#[cfg(test)]
+mod test;
--- /dev/null
+use crate::prompt::PromptStyle;
+
+use super::{Mode, Segment, Segmenter};
+
+fn push_segment<'a>(
+ segmenter: &mut Segmenter,
+ input: &'a str,
+ one_byte: bool,
+) -> (usize, Segment) {
+ if one_byte {
+ for len in input.char_indices().map(|(pos, _c)| pos) {
+ if let Ok(result) = segmenter.push(&input[..len], false) {
+ return result;
+ }
+ }
+ }
+ segmenter.push(input, true).unwrap()
+}
+
+fn _check_segmentation(
+ mut input: &str,
+ mode: Mode,
+ expect_segments: &[(Segment, &str)],
+ expect_prompts: &[PromptStyle],
+ one_byte: bool,
+) {
+ let mut segments = Vec::with_capacity(expect_segments.len());
+ let mut prompts = Vec::new();
+ let mut segmenter = Segmenter::new(mode, false);
+ loop {
+ let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
+ let (token, rest) = input.split_at(seg_len);
+ segments.push((seg_type, token));
+ match seg_type {
+ Segment::End => break,
+ Segment::Newline => prompts.push(segmenter.prompt()),
+ _ => (),
+ }
+ input = rest;
+ }
+
+ if &segments != expect_segments {
+ eprintln!("segments differ from expected:");
+ let difference = diff::slice(expect_segments, &segments);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+
+ if &prompts != expect_prompts {
+ eprintln!("prompts differ from expected:");
+ let difference = diff::slice(expect_prompts, &prompts);
+ for result in difference {
+ match result {
+ diff::Result::Left(left) => eprintln!("-{left:?}"),
+ diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
+ diff::Result::Right(right) => eprintln!("+{right:?}"),
+ }
+ }
+ panic!();
+ }
+}
+
+fn check_segmentation(
+ input: &str,
+ mode: Mode,
+ expect_segments: &[(Segment, &str)],
+ expect_prompts: &[PromptStyle],
+) {
+ for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] {
+ println!("running {one_byte_name} segmentation test with LF newlines...");
+ _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte);
+
+ println!("running {one_byte_name} segmentation test with CRLF newlines...");
+ _check_segmentation(
+ &input.replace('\n', "\r\n"),
+ mode,
+ &expect_segments
+ .iter()
+ .map(|(segment, s)| match *segment {
+ Segment::Newline => (Segment::Newline, "\r\n"),
+ _ => (*segment, *s),
+ })
+ .collect::<Vec<_>>(),
+ expect_prompts,
+ one_byte,
+ );
+
+ if let Some(input) = input.strip_suffix('\n') {
+ println!("running {one_byte_name} segmentation test without final newline...");
+ let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect();
+ assert_eq!(expect_segments.pop(), Some((Segment::End, "")));
+ assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n")));
+ while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) =
+ expect_segments.last()
+ {
+ expect_segments.pop();
+ }
+ expect_segments.push((Segment::End, ""));
+ _check_segmentation(
+ input,
+ mode,
+ &expect_segments,
+ &expect_prompts[..expect_prompts.len() - 1],
+ one_byte,
+ );
+ }
+ }
+}
+
+#[allow(dead_code)]
+fn print_segmentation(mut input: &str) {
+ let mut segmenter = Segmenter::new(Mode::Interactive, false);
+ loop {
+ let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
+ let (token, rest) = input.split_at(seg_len);
+ print!("{seg_type:?} {token:?}");
+ match seg_type {
+ Segment::Newline => print!(" ({:?})", segmenter.prompt()),
+ Segment::End => break,
+ _ => (),
+ }
+ println!();
+ input = rest;
+ }
+}
+
+#[test]
+fn test_identifiers() {
+ check_segmentation(
+ r#"a ab abc abcd !abcd
+A AB ABC ABCD !ABCD
+aB aBC aBcD !aBcD
+$x $y $z !$z
+grève Ângstrom poté
+#a #b #c ## #d !#d
+@efg @ @@. @#@ !@
+## # #12345 #.#
+f@#_.#6
+GhIjK
+.x 1y _z
+!abc abc!
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Identifier, "a"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ab"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abc"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abcd"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!abcd"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "A"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "AB"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ABC"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ABCD"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!ABCD"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "aB"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "aBC"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "aBcD"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!aBcD"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "$x"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "$y"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "$z"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!$z"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "grève"),
+ (Segment::Spaces, "\u{00a0}"),
+ (Segment::Identifier, "Ângstrom"),
+ (Segment::Spaces, "\u{00a0}"),
+ (Segment::Identifier, "poté"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#a"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#b"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#c"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "##"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#d"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!#d"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "@efg"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@@."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@#@"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "!@"),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "##"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#12345"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#.#"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "f@#_.#6"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "GhIjK"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Identifier, "x"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::Identifier, "y"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "_"),
+ (Segment::Identifier, "z"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "!abc"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abc"),
+ (Segment::Punct, "!"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test]
+fn test_identifiers_ending_in_dot() {
+ check_segmentation(
+ r#"abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD.
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+.
+LMNOP.
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Identifier, "abcd."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "abcd"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "ABCD."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ABCD"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "aBcD."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "aBcD"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "$y."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "$z."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "あいうえお"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#c."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "#d."),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "@@."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "@@..."),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#.#"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#abcd"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "LMNOP"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "QRSTUV"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* end of line comment */"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "qrstuv"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* end of line comment */"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "QrStUv"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* end of line comment */"),
+ (Segment::Spaces, " "),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "wxyz"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* unterminated end of line comment"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "WXYZ"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* unterminated end of line comment"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "WxYz"),
+ (Segment::EndCommand, "."),
+ (Segment::Comment, "/* unterminated end of line comment "),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_reserved_words() {
+ check_segmentation(
+ r#"and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Identifier, "and"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "or"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "not"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "eq"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ge"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "gt"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "le"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lt"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ne"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "all"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "by"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "to"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "with"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "AND"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "OR"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "NOT"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "EQ"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "GE"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "GT"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "LE"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "LT"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "NE"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ALL"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "BY"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "TO"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "WITH"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "andx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "orx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "notx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "eqx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "gex"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "gtx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lex"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ltx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "nex"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "allx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "byx"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "tox"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "withx"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "and."),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "with"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_punctuation() {
+ check_segmentation(
+ r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
+~&|=>=><=<~=<>(),-+*/[]**!*
+% : ; ? _ ` { } ~ !*
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Punct, "~"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "&"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "|"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ">="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ">"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "<="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "<"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "~="),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "<>"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "("),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ")"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ","),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "*"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "/"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "["),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "]"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "**"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "~"),
+ (Segment::Punct, "&"),
+ (Segment::Punct, "|"),
+ (Segment::Punct, "="),
+ (Segment::Punct, ">="),
+ (Segment::Punct, ">"),
+ (Segment::Punct, "<="),
+ (Segment::Punct, "<"),
+ (Segment::Punct, "~="),
+ (Segment::Punct, "<>"),
+ (Segment::Punct, "("),
+ (Segment::Punct, ")"),
+ (Segment::Punct, ","),
+ (Segment::Punct, "-"),
+ (Segment::Punct, "+"),
+ (Segment::Punct, "*"),
+ (Segment::Punct, "/"),
+ (Segment::Punct, "["),
+ (Segment::Punct, "]"),
+ (Segment::Punct, "**"),
+ (Segment::Punct, "!*"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "%"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ":"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, ";"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "?"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "_"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "`"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "{"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "}"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "~"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "!*"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later],
+ );
+}
+
+#[test]
+fn test_positive_numbers() {
+ check_segmentation(
+ r#"0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e- 1.
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Number, "0"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "001."),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Number, "123"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 1 */"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 2 */"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Number, "1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "0.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "00.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "00.10"),
+ (Segment::Newline, "\n"),
+ (Segment::Number, "5e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "7e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "6E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "6e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Number, "3E1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".4e-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".5E+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".6e+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, ".7E-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Number, "1.23e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "45.6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "78.9e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "99.9E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "11.2e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "1e"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "e1"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "1e+"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "1e-"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_negative_numbers() {
+ check_segmentation(
+ r#" -0 -1 -01 -001. -1.
+ -123. /* comment 1 */ /* comment 2 */
+ -.1 -0.1 -00.1 -00.10
+ -5e1 -6E-1 -7e+1 -6E+01 -6e-03
+ -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
+ -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
+ -/**/1
+ -. -1e -e1 -1e+ -1e- -1.
+"#,
+ Mode::Auto,
+ &[
+ (Segment::Spaces, " "),
+ (Segment::Number, "-0"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-001."),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-123"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 1 */"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment 2 */"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-0.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-00.1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-00.10"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-5e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-7e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-6E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-6e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.3E1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.4e-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.5E+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.6e+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-.7E-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1.23e1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-45.6E-1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-78.9e+1"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-99.9E+01"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-11.2e-03"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Comment, "/**/"),
+ (Segment::Number, "1"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Punct, "."),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "-1e"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "-"),
+ (Segment::Identifier, "e1"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "-1e+"),
+ (Segment::Spaces, " "),
+ (Segment::ExpectedExponent, "-1e-"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "-1"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_strings() {
+ check_segmentation(
+ r#"'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+"#,
+ Mode::Auto,
+ &[
+ (Segment::QuotedString, "'x'"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"y\""),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'abc'"),
+ (Segment::Newline, "\n"),
+ (Segment::QuotedString, "'Don''t'"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"Can't\""),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'Won''t'"),
+ (Segment::Newline, "\n"),
+ (Segment::QuotedString, "\"\"\"quoted\"\"\""),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'\"quoted\"'"),
+ (Segment::Newline, "\n"),
+ (Segment::QuotedString, "''"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"\""),
+ (Segment::Newline, "\n"),
+ (Segment::ExpectedQuote, "'missing end quote"),
+ (Segment::Newline, "\n"),
+ (Segment::ExpectedQuote, "\"missing double quote"),
+ (Segment::Newline, "\n"),
+ (Segment::HexString, "x\"4142\""),
+ (Segment::Spaces, " "),
+ (Segment::HexString, "X'5152'"),
+ (Segment::Newline, "\n"),
+ (Segment::UnicodeString, "u'fffd'"),
+ (Segment::Spaces, " "),
+ (Segment::UnicodeString, "U\"041\""),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "new"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "command"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* comment */"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'string continuation'"),
+ (Segment::Newline, "\n"),
+ (Segment::Punct, "+"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/* also a punctuator on blank line"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "-"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'new command'"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ ],
+ );
+}
+
+#[test]
+fn test_shbang() {
+ check_segmentation(
+ r#"#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::Shbang, "#! /usr/bin/pspp"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "title"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "my"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "title"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "#"),
+ (Segment::Punct, "!"),
+ (Segment::Spaces, " "),
+ (Segment::Punct, "/"),
+ (Segment::Identifier, "usr"),
+ (Segment::Punct, "/"),
+ (Segment::Identifier, "bin"),
+ (Segment::Punct, "/"),
+ (Segment::Identifier, "pspp"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[PromptStyle::First, PromptStyle::First, PromptStyle::Later],
+ );
+}
+
+#[test]
+fn test_comment_command() {
+ check_segmentation(
+ r#"* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::CommentCommand, "* Comment commands \"don't"),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "have to contain valid tokens"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "** Check ambiguity with ** token"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "****************"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "comment keyword works too"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "COMM also"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "com"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "is"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "ambiguous"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "with"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "COMPUTE"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (
+ Segment::CommentCommand,
+ "* Comment need not start at left margin",
+ ),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::CommentCommand, "* Comment ends with blank line"),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "next"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "command"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Comment,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Comment,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_document_command() {
+ check_segmentation(
+ r#"DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::StartDocument, ""),
+ (Segment::Document, "DOCUMENT one line."),
+ (Segment::EndCommand, ""),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::StartDocument, ""),
+ (Segment::Document, "DOC more"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, " than"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, " one"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, " line."),
+ (Segment::EndCommand, ""),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::StartDocument, ""),
+ (Segment::Document, "docu"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, "first.paragraph"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, "isn't parsed as tokens"),
+ (Segment::Newline, "\n"),
+ (Segment::Document, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Document, "second paragraph."),
+ (Segment::EndCommand, ""),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::First,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::Document,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_file_label_command() {
+ check_segmentation(
+ r#"FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::Identifier, "FIL"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "label"),
+ (Segment::Spaces, " "),
+ (Segment::UnquotedString, "isn't quoted"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "FILE"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lab"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "'is quoted'"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "FILE"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/*"),
+ (Segment::Newline, "\n"),
+ (Segment::Comment, "/**/"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "lab"),
+ (Segment::Spaces, " "),
+ (Segment::UnquotedString, "not quoted here either"),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Later,
+ PromptStyle::First,
+ ],
+ );
+}
+
+#[test]
+fn test_begin_data() {
+ check_segmentation(
+ r#"begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+"#,
+ Mode::Interactive,
+ &[
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/*"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "123"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "xxx"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "BEG"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/**/"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "DAT"),
+ (Segment::Spaces, " "),
+ (Segment::Comment, "/*"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "5 6 7 /* x"),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, ""),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "end data"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Newline, "\n"),
+ (Segment::StartCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Newline, "\n"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::InlineData, "data"),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "end"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::SeparateCommands, ""),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Spaces, " "),
+ (Segment::QuotedString, "\"xxx\""),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "begin"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Spaces, " "),
+ (Segment::Number, "123"),
+ (Segment::EndCommand, "."),
+ (Segment::Newline, "\n"),
+ (Segment::Identifier, "not"),
+ (Segment::Spaces, " "),
+ (Segment::Identifier, "data"),
+ (Segment::Newline, "\n"),
+ (Segment::End, ""),
+ ],
+ &[
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::Later,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ PromptStyle::Data,
+ PromptStyle::Data,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::First,
+ PromptStyle::Later,
+ ],
+ );
+}
+
/// Interactive-mode segmentation of `DO REPEAT`: the body of a
/// `DO REPEAT`...`END REPEAT` range is passed through as whole
/// `DoRepeatCommand` lines (nested ranges included), and `END REPEAT`
/// only closes the range at the outermost nesting level.
#[test]
fn test_do_repeat() {
    check_segmentation(
        r#"do repeat x=a b c
 y=d e f.
 do repeat a=1 thru 5.
another command.
second command
+ third command.
end /* x */ /* y */ repeat print.
end
 repeat.
do
 repeat #a=1.
 inner command.
end repeat.
"#,
        Mode::Interactive,
        &[
            // The outer `do repeat` is tokenized normally...
            (Segment::Identifier, "do"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "x"),
            (Segment::Punct, "="),
            (Segment::Identifier, "a"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "b"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "c"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "y"),
            (Segment::Punct, "="),
            (Segment::Identifier, "d"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "e"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "f"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            // ...but everything inside the range, including the nested
            // `do repeat`, comes back as opaque whole lines.
            (Segment::DoRepeatCommand, " do repeat a=1 thru 5."),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "another command."),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "second command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "+ third command."),
            (Segment::Newline, "\n"),
            (
                Segment::DoRepeatCommand,
                "end /* x */ /* y */ repeat print.",
            ),
            (Segment::Newline, "\n"),
            // Outermost `end repeat` is tokenized normally again.
            (Segment::Identifier, "end"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "do"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "#a"),
            (Segment::Punct, "="),
            (Segment::Number, "1"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, " inner command."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "end"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::First,
        ],
    );
}
+
/// Deeply nested `DO REPEAT`: levels past 255 are flagged with a
/// `DoRepeatOverflow` segment (NOTE(review): 255 matches the `i >= 255`
/// threshold below — confirm against the segmenter's documented limit),
/// and after unwinding, the surplus `end repeat` lines are tokenized as
/// ordinary commands.
#[test]
fn test_do_repeat_overflow() {
    const N: usize = 257;
    let do_repeat: Vec<String> = (0..N)
        .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5))
        .collect();
    let end_repeat: Vec<String> = (0..N)
        .rev()
        .map(|i| format!("end repeat. /* {i}\n"))
        .collect();

    let s: String = do_repeat
        .iter()
        .chain(end_repeat.iter())
        .map(String::as_str)
        .collect();

    // The first `do repeat` is tokenized normally; the nested ones come
    // back as whole `DoRepeatCommand` lines.
    let mut expect_output = vec![
        (Segment::Identifier, "do"),
        (Segment::Spaces, " "),
        (Segment::Identifier, "repeat"),
        (Segment::Spaces, " "),
        (Segment::Identifier, "v0"),
        (Segment::Punct, "="),
        (Segment::Number, "0"),
        (Segment::Spaces, " "),
        (Segment::Identifier, "thru"),
        (Segment::Spaces, " "),
        (Segment::Number, "5"),
        (Segment::EndCommand, "."),
        (Segment::Newline, "\n"),
    ];
    for (i, line) in do_repeat.iter().enumerate().skip(1) {
        // `trim_end()` already yields `&str`; the original code took an
        // extra `&` here, relying on `&&str -> &str` deref coercion.
        expect_output.push((Segment::DoRepeatCommand, line.trim_end()));
        if i >= 255 {
            // Past the nesting limit the segmenter reports overflow.
            expect_output.push((Segment::DoRepeatOverflow, ""));
        }
        expect_output.push((Segment::Newline, "\n"));
    }
    // The first 254 `end repeat` lines just unwind nesting levels...
    for line in &end_repeat[..254] {
        expect_output.push((Segment::DoRepeatCommand, line.trim_end()));
        expect_output.push((Segment::Newline, "\n"));
    }
    // ...and the remaining ones are tokenized as ordinary commands with a
    // trailing comment.
    let comments: Vec<String> = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect();
    for comment in &comments {
        expect_output.extend([
            (Segment::Identifier, "end"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::EndCommand, "."),
            (Segment::Spaces, " "),
            (Segment::Comment, comment),
            (Segment::Newline, "\n"),
        ]);
    }
    expect_output.push((Segment::End, ""));

    let expect_prompts: Vec<_> = (0..N * 2 - 3)
        .map(|_| PromptStyle::DoRepeat)
        .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First])
        .collect();
    check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts);
}
+
/// Batch-mode segmentation of `DO REPEAT`: commands end at the next
/// non-indented line instead of requiring `.` terminators, and the body
/// of the range is still passed through as whole `DoRepeatCommand` lines.
#[test]
fn test_do_repeat_batch() {
    check_segmentation(
        r#"do repeat x=a b c
 y=d e f
do repeat a=1 thru 5
another command
second command
+ third command
end /* x */ /* y */ repeat print
end
 repeat
do
 repeat #a=1

 inner command
end repeat
"#,
        Mode::Batch,
        &[
            (Segment::Identifier, "do"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "x"),
            (Segment::Punct, "="),
            (Segment::Identifier, "a"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "b"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "c"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "y"),
            (Segment::Punct, "="),
            (Segment::Identifier, "d"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "e"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "f"),
            (Segment::Newline, "\n"),
            // In batch mode a non-indented line starts a new command.
            (Segment::StartCommand, ""),
            (Segment::DoRepeatCommand, "do repeat a=1 thru 5"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "another command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "second command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "+ third command"),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print"),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "end"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Identifier, "do"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "#a"),
            (Segment::Punct, "="),
            (Segment::Number, "1"),
            (Segment::Newline, "\n"),
            // A blank line also separates commands in batch mode.
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::DoRepeatCommand, " inner command"),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "end"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "repeat"),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::DoRepeat,
            PromptStyle::DoRepeat,
            PromptStyle::Later,
        ],
    );
}
+
/// Tests for segmentation of `DEFINE`...`!ENDDEFINE` macro definitions.
mod define {
    use crate::{
        lex::segment::{Mode, Segment},
        prompt::PromptStyle,
    };

    use super::check_segmentation;

    /// Body lines pass through verbatim, even text that only looks like
    /// `!enddefine` inside a quoted string.
    #[test]
    fn test_simple() {
        check_segmentation(
            r#"define !macro1()
var1 var2 var3 "!enddefine"
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First],
        );
    }

    /// The body may start on the same line as the parameter list; a
    /// comment-opener in the body is taken literally.
    #[test]
    fn test_no_newline_after_parentheses() {
        check_segmentation(
            r#"define !macro1() var1 var2 var3 /* !enddefine
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::MacroBody, " var1 var2 var3 /* !enddefine"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// `!enddefine` is recognized even with no whitespace before it.
    #[test]
    fn test_no_newline_before_enddefine() {
        check_segmentation(
            r#"define !macro1()
var1 var2 var3!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "var1 var2 var3"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// A complete definition squeezed onto a single line.
    #[test]
    fn test_all_on_one_line() {
        check_segmentation(
            r#"define !macro1()var1 var2 var3!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::MacroBody, "var1 var2 var3"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First],
        );
    }

    /// A macro with an empty body produces no `MacroBody` segment.
    #[test]
    fn test_empty() {
        check_segmentation(
            r#"define !macro1()
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// Blank lines inside the body become empty `MacroBody` segments,
    /// not command separators.
    #[test]
    fn test_blank_lines() {
        check_segmentation(
            r#"define !macro1()


!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, ""),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, ""),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::First,
            ],
        );
    }

    /// Macro parameters are tokenized individually inside the outer
    /// parentheses.
    #[test]
    fn test_arguments() {
        check_segmentation(
            r#"define !macro1(a(), b(), c())
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Identifier, "a"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Spaces, " "),
                (Segment::Identifier, "b"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Spaces, " "),
                (Segment::Identifier, "c"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define, PromptStyle::First],
        );
    }

    /// The parameter list may span several lines; prompts stay `Later`
    /// until the closing parenthesis.
    #[test]
    fn test_multiline_arguments() {
        check_segmentation(
            r#"define !macro1(
 a(), b(
 ),
 c()
)
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Newline, "\n"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "a"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Spaces, " "),
                (Segment::Identifier, "b"),
                (Segment::Punct, "("),
                (Segment::Newline, "\n"),
                (Segment::Spaces, " "),
                (Segment::Punct, ")"),
                (Segment::Punct, ","),
                (Segment::Newline, "\n"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "c"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Define,
                PromptStyle::First,
            ],
        );
    }

    /// The parameter list may start on the line after the macro name.
    #[test]
    fn test_arguments_start_on_second_line() {
        check_segmentation(
            r#"define !macro1
(x,y,z
)
content 1
content 2
!enddefine.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Newline, "\n"),
                (Segment::Punct, "("),
                (Segment::Identifier, "x"),
                (Segment::Punct, ","),
                (Segment::Identifier, "y"),
                (Segment::Punct, ","),
                (Segment::Identifier, "z"),
                (Segment::Newline, "\n"),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content 1"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content 2"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "!enddefine"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Later,
                PromptStyle::Later,
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::First,
            ],
        );
    }

    /// A `.` right after the macro name ends the `DEFINE` command early;
    /// the next line is an ordinary command.
    #[test]
    fn test_early_end_of_command_1() {
        check_segmentation(
            r#"define !macro1.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First, PromptStyle::First],
        );
    }

    /// A `.` on the (would-be) parameter-list line also ends `DEFINE`
    /// early.
    #[test]
    fn test_early_end_of_command_2() {
        check_segmentation(
            r#"define !macro1
x.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "x"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Later, PromptStyle::First, PromptStyle::First],
        );
    }

    /// A `.` inside an unclosed parameter list ends `DEFINE` early too.
    #[test]
    fn test_early_end_of_command_3() {
        check_segmentation(
            r#"define !macro1(.
x.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "x"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First, PromptStyle::First, PromptStyle::First],
        );
    }

    // NOTE(review): this case uses the same input as
    // test_early_end_of_command_1 — presumably kept for parity with the
    // upstream C test suite; confirm whether it can be merged.
    #[test]
    fn test_early_end_of_command_4() {
        // Notice the command terminator at the end of the `DEFINE` command,
        // which should not be there and ends it early.
        check_segmentation(
            r#"define !macro1.
data list /x 1.
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::Identifier, "data"),
                (Segment::Spaces, " "),
                (Segment::Identifier, "list"),
                (Segment::Spaces, " "),
                (Segment::Punct, "/"),
                (Segment::Identifier, "x"),
                (Segment::Spaces, " "),
                (Segment::Number, "1"),
                (Segment::EndCommand, "."),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::First, PromptStyle::First],
        );
    }

    /// Input ending inside a macro body: prompts stay `Define` to the end.
    #[test]
    fn test_missing_enddefine() {
        check_segmentation(
            r#"define !macro1()
content line 1
content line 2
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content line 1"),
                (Segment::Newline, "\n"),
                (Segment::MacroBody, "content line 2"),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[
                PromptStyle::Define,
                PromptStyle::Define,
                PromptStyle::Define,
            ],
        );
    }

    /// Input ending right after the parameter list.
    #[test]
    fn test_missing_enddefine_2() {
        check_segmentation(
            r#"define !macro1()
"#,
            Mode::Interactive,
            &[
                (Segment::Identifier, "define"),
                (Segment::Spaces, " "),
                (Segment::MacroName, "!macro1"),
                (Segment::Punct, "("),
                (Segment::Punct, ")"),
                (Segment::Newline, "\n"),
                (Segment::End, ""),
            ],
            &[PromptStyle::Define],
        );
    }
}
+
/// Batch-mode command separation: a command continues across indented
/// lines and ends at the next non-indented line, a leading `+`, a blank
/// line, or a `.` terminator.
#[test]
fn test_batch_mode() {
    check_segmentation(
        r#"first command
 another line of first command
+ second command
third command

fourth command.
 fifth command.
"#,
        Mode::Batch,
        &[
            (Segment::Identifier, "first"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "line"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "of"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "first"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            // `+` in the first column explicitly starts a new command.
            (Segment::StartCommand, "+"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "second"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            // A non-indented line implicitly starts a new command.
            (Segment::StartCommand, ""),
            (Segment::Identifier, "third"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            // A blank line separates commands.
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "fourth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "fifth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::First,
        ],
    );
}
+
/// Auto mode: behaves like interactive mode until a line gives batch-mode
/// evidence (a line starting with a digit or `+`, or a known
/// command-opening keyword), after which batch-style command starts are
/// recognized — NOTE(review): exact trigger rules inferred from the
/// expectations below; confirm against the segmenter's docs.
#[test]
fn test_auto_mode() {
    check_segmentation(
        r#"command
 another line of command
2sls
+ another command
another line of second command
data list /x 1
aggregate.
print eject.
twostep cluster


fourth command.
 fifth command.
"#,
        Mode::Auto,
        &[
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "line"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "of"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Number, "2"),
            (Segment::Identifier, "sls"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, "+"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "another"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "line"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "of"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "second"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Identifier, "data"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "list"),
            (Segment::Spaces, " "),
            (Segment::Punct, "/"),
            (Segment::Identifier, "x"),
            (Segment::Spaces, " "),
            (Segment::Number, "1"),
            (Segment::Newline, "\n"),
            (Segment::StartCommand, ""),
            (Segment::Identifier, "aggregate"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "print"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "eject"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "twostep"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "cluster"),
            (Segment::Newline, "\n"),
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::SeparateCommands, ""),
            (Segment::Newline, "\n"),
            (Segment::Identifier, "fourth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "fifth"),
            (Segment::Spaces, " "),
            (Segment::Identifier, "command"),
            (Segment::EndCommand, "."),
            (Segment::Newline, "\n"),
            (Segment::End, ""),
        ],
        &[
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::Later,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::First,
            PromptStyle::First,
        ],
    );
}
--- /dev/null
+use std::fmt::{Display, Formatter, Result as FmtResult};
+
+use crate::identifier::Identifier;
+
/// A lexical token produced by the lexer.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// End of input.
    End,

    /// Identifier.
    Id(Identifier),

    /// Number.
    ///
    /// All numeric values are carried as `f64`.
    Number(f64),

    /// Quoted string.
    String(String),

    /// Command terminator or separator.
    ///
    /// Usually this is `.`, but a blank line also separates commands, and in
    /// batch mode any line that begins with a non-blank starts a new command.
    EndCommand,

    /// Operators, punctuators, and reserved words.
    Punct(Punct),
}
+
+impl Token {
+ pub fn id(&self) -> Option<&Identifier> {
+ match self {
+ Self::Id(identifier) => Some(identifier),
+ _ => None,
+ }
+ }
+}
+
/// Reports whether `c` may appear literally in quoted-string output:
/// any non-control character, plus the whitespace controls tab,
/// carriage return, and newline.
fn is_printable(c: char) -> bool {
    matches!(c, '\t' | '\r' | '\n') || !c.is_control()
}
+
/// Writes `s` to `f` wrapped in `quote` characters, doubling every
/// occurrence of `quote` inside `s` (SQL-style quote escaping).
fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult {
    write!(f, "{quote}")?;
    for c in s.chars() {
        if c == quote {
            // An embedded quote is escaped by writing it twice.
            write!(f, "{quote}{quote}")?;
        } else {
            write!(f, "{c}")?;
        }
    }
    write!(f, "{quote}")
}
+
+impl Display for Token {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ match self {
+ Token::End => Ok(()),
+ Token::Id(s) => write!(f, "{s}"),
+ Token::Number(number) => {
+ if number.is_sign_negative() {
+ write!(f, "-{}", number.abs())
+ } else {
+ write!(f, "{number}")
+ }
+ }
+ Token::String(s) => {
+ if s.chars().all(|c| is_printable(c)) {
+ if s.contains('"') {
+ string_representation(s, '\'', f)
+ } else {
+ string_representation(s, '"', f)
+ }
+ } else {
+ write!(f, "X\"")?;
+ for byte in s.bytes() {
+ let c1 = char::from_digit((byte >> 4) as u32, 16)
+ .unwrap()
+ .to_ascii_uppercase();
+ let c2 = char::from_digit((byte & 0xf) as u32, 16)
+ .unwrap()
+ .to_ascii_uppercase()
+ .to_ascii_lowercase();
+ write!(f, "{c1}{c2}")?;
+ }
+ write!(f, "\"")
+ }
+ }
+ Token::EndCommand => write!(f, "."),
+ Token::Punct(punct) => punct.fmt(f),
+ }
+ }
+}
+
/// Tests for [Token]'s `Display` implementation.
#[cfg(test)]
mod test {
    use crate::lex::token::Token;

    /// Printable strings use the quoted form; strings with unprintable
    /// characters use the hex `X"..."` form (uppercase digits).
    #[test]
    fn test_string() {
        assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\"");
        assert_eq!(
            Token::String(String::from("\u{0080}")).to_string(),
            "X\"C280\""
        );
    }

    /// Check that all negative numbers, even -0, get formatted with a
    /// leading `-`.
    #[test]
    fn test_neg0() {
        assert_eq!(Token::Number(-0.0).to_string(), "-0");
    }
}
+
/// Operators, punctuators, and reserved words.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Punct {
    /// `+`.
    Plus,

    /// `-`.
    Dash,

    /// `*`.
    Asterisk,

    /// `/`.
    Slash,

    /// `=`.
    Equals,

    /// `(`.
    LParen,

    /// `)`.
    RParen,

    /// `[`.
    LSquare,

    /// `]`.
    RSquare,

    /// `{`.
    LCurly,

    /// `}`.
    RCurly,

    /// `,`.
    Comma,

    /// `;`.
    Semicolon,

    /// `:`.
    Colon,

    /// `AND` or `&`.
    And,

    /// `OR` or `|`.
    Or,

    /// `NOT` or `~`.
    Not,

    /// `EQ` or `=`.
    Eq,

    /// `GE` or `>=`.
    Ge,

    /// `GT` or `>`.
    Gt,

    /// `LE` or `<=`.
    Le,

    /// `LT` or `<`.
    Lt,

    /// `NE` or `~=` or `<>`.
    Ne,

    /// `ALL`.
    All,

    /// `BY`.
    By,

    /// `TO`.
    To,

    /// `WITH`.
    With,

    /// `**`.
    Exp,

    /// `!` (only appears in macros).
    Bang,

    /// `%` (only appears in macros).
    Percent,

    /// `?` (only appears in macros).
    Question,

    /// `` ` `` (only appears in macros).
    Backtick,

    /// `.`.
    ///
    /// This represents a dot in the middle of a line by itself, where it does not end a command.
    Dot,

    /// `_` (only appears in macros).
    ///
    /// Although underscores may appear within identifiers, they can't be the
    /// first character, so this represents an underscore found on its own.
    Underscore,

    /// `!*` (only appears in macros).
    BangAsterisk,
}
+
impl Punct {
    /// Returns the canonical textual form of this punctuator.
    ///
    /// Tokens that are reserved words (`AND`, `OR`, `NOT`, `EQ`, `ALL`,
    /// `BY`, `TO`, `WITH`) render as the uppercase word even when a
    /// symbolic spelling also exists, while the relational operators
    /// `Ge`..`Ne` render symbolically — NOTE(review): this asymmetry looks
    /// deliberate but is worth confirming against the lexer's output
    /// conventions.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Plus => "+",
            Self::Dash => "-",
            Self::Asterisk => "*",
            Self::Slash => "/",
            Self::Equals => "=",
            Self::LParen => "(",
            Self::RParen => ")",
            Self::LSquare => "[",
            Self::RSquare => "]",
            Self::LCurly => "{",
            Self::RCurly => "}",
            Self::Comma => ",",
            Self::Semicolon => ";",
            Self::Colon => ":",
            Self::And => "AND",
            Self::Or => "OR",
            Self::Not => "NOT",
            Self::Eq => "EQ",
            Self::Ge => ">=",
            Self::Gt => ">",
            Self::Le => "<=",
            Self::Lt => "<",
            Self::Ne => "~=",
            Self::All => "ALL",
            Self::By => "BY",
            Self::To => "TO",
            Self::With => "WITH",
            Self::Exp => "**",
            Self::Bang => "!",
            Self::Percent => "%",
            Self::Question => "?",
            Self::Backtick => "`",
            Self::Dot => ".",
            Self::Underscore => "_",
            Self::BangAsterisk => "!*",
        }
    }
}
+impl Display for Punct {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ write!(f, "{}", self.as_str())
+ }
+}
--- /dev/null
+#[allow(unused_variables, unused_mut, dead_code)]
+pub mod cooked;
+pub mod dictionary;
+pub mod encoding;
+pub mod endian;
+pub mod format;
+pub mod identifier;
+pub mod locale_charset;
+pub mod output;
+#[allow(unused_variables, unused_mut, dead_code)]
+pub mod raw;
+pub mod sack;
+pub mod lex;
+pub mod prompt;
+pub mod message;
+pub mod macros;
+pub mod settings;
+pub mod command;
+pub mod integer;
+pub mod engine;
--- /dev/null
+// Determine a canonical name for the current locale's character encoding.
+//
+// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc.
+//
+// This file is free software: you can redistribute it and/or modify it under
+// the terms of the GNU Lesser General Public License as published by the Free
+// Software Foundation; either version 2.1 of the License, or (at your option)
+// any later version.
+//
+// This file is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+//
+// Written by Bruno Haible <bruno@clisp.org>. Translated to Rust by Ben Pfaff
+// <blp@cs.stanford.edu>.
+
+use lazy_static::lazy_static;
+
/// Maps a platform-specific character-encoding name `s` to a canonical
/// name, following the alias tables from gnulib's `localcharset` (see the
/// file header).  Only the table for the compile-time target OS is built;
/// names with no alias entry are returned unchanged.
fn map_aliases(s: &str) -> &'static str {
    #[cfg(target_os = "freebsd")]
    match s {
        "ARMSCII-8" => return "ARMSCII-8",
        "Big5" => return "BIG5",
        "C" => return "ASCII",
        "CP1131" => return "CP1131",
        "CP1251" => return "CP1251",
        "CP866" => return "CP866",
        "GB18030" => return "GB18030",
        "GB2312" => return "GB2312",
        "GBK" => return "GBK",
        // "?" — no canonical name exists for this encoding in the table.
        "ISCII-DEV" => return "?",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "ISO8859-9" => return "ISO-8859-9",
        "KOI8-R" => return "KOI8-R",
        "KOI8-U" => return "KOI8-U",
        "SJIS" => return "SHIFT_JIS",
        "US-ASCII" => return "ASCII",
        "eucCN" => return "GB2312",
        "eucJP" => return "EUC-JP",
        "eucKR" => return "EUC-KR",
        _ => (),
    };

    #[cfg(target_os = "netbsd")]
    match s {
        "646" => return "ASCII",
        "ARMSCII-8" => return "ARMSCII-8",
        "BIG5" => return "BIG5",
        "Big5-HKSCS" => return "BIG5-HKSCS",
        "CP1251" => return "CP1251",
        "CP866" => return "CP866",
        "GB18030" => return "GB18030",
        "GB2312" => return "GB2312",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-4" => return "ISO-8859-4",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "KOI8-R" => return "KOI8-R",
        "KOI8-U" => return "KOI8-U",
        "PT154" => return "PT154",
        "SJIS" => return "SHIFT_JIS",
        "eucCN" => return "GB2312",
        "eucJP" => return "EUC-JP",
        "eucKR" => return "EUC-KR",
        "eucTW" => return "EUC-TW",
        _ => (),
    };

    #[cfg(target_os = "openbsd")]
    match s {
        "646" => return "ASCII",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-4" => return "ISO-8859-4",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "US-ASCII" => return "ASCII",
        _ => (),
    };

    /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
    useless:
    - It returns the empty string when LANG is set to a locale of the
      form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
      LC_CTYPE file.
    - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
      the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
    - The documentation says:
        "... all code that calls BSD system routines should ensure
         that the const *char parameters of these routines are in UTF-8
         encoding. All BSD system functions expect their string
         parameters to be in UTF-8 encoding and nothing else."
      It also says
        "An additional caveat is that string parameters for files,
         paths, and other file-system entities must be in canonical
         UTF-8. In a canonical UTF-8 Unicode string, all decomposable
         characters are decomposed ..."
      but this is not true: You can pass non-decomposed UTF-8 strings
      to file system functions, and it is the OS which will convert
      them to decomposed UTF-8 before accessing the file system.
    - The Apple Terminal application displays UTF-8 by default.
    - However, other applications are free to use different encodings:
      - xterm uses ISO-8859-1 by default.
      - TextEdit uses MacRoman by default.
    We prefer UTF-8 over decomposed UTF-8-MAC because one should
    minimize the use of decomposed Unicode. Unfortunately, through the
    Darwin file system, decomposed UTF-8 strings are leaked into user
    space nevertheless.
    Then there are also the locales with encodings other than US-ASCII
    and UTF-8. These locales can be occasionally useful to users (e.g.
    when grepping through ISO-8859-1 encoded text files), when all their
    file names are in US-ASCII.
    */

    #[cfg(target_os = "macos")]
    match s {
        "ARMSCII-8" => return "ARMSCII-8",
        "Big5" => return "BIG5",
        "Big5HKSCS" => return "BIG5-HKSCS",
        "CP1131" => return "CP1131",
        "CP1251" => return "CP1251",
        "CP866" => return "CP866",
        "CP949" => return "CP949",
        "GB18030" => return "GB18030",
        "GB2312" => return "GB2312",
        "GBK" => return "GBK",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-13" => return "ISO-8859-13",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-4" => return "ISO-8859-4",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-7" => return "ISO-8859-7",
        "ISO8859-9" => return "ISO-8859-9",
        "KOI8-R" => return "KOI8-R",
        "KOI8-U" => return "KOI8-U",
        "PT154" => return "PT154",
        "SJIS" => return "SHIFT_JIS",
        "eucCN" => return "GB2312",
        "eucJP" => return "EUC-JP",
        "eucKR" => return "EUC-KR",
        _ => (),
    };

    #[cfg(target_os = "aix")]
    match s {
        "GBK" => return "GBK",
        "IBM-1046" => return "CP1046",
        "IBM-1124" => return "CP1124",
        "IBM-1129" => return "CP1129",
        "IBM-1252" => return "CP1252",
        "IBM-850" => return "CP850",
        "IBM-856" => return "CP856",
        "IBM-921" => return "ISO-8859-13",
        "IBM-922" => return "CP922",
        "IBM-932" => return "CP932",
        "IBM-943" => return "CP943",
        "IBM-eucCN" => return "GB2312",
        "IBM-eucJP" => return "EUC-JP",
        "IBM-eucKR" => return "EUC-KR",
        "IBM-eucTW" => return "EUC-TW",
        "ISO8859-1" => return "ISO-8859-1",
        "ISO8859-15" => return "ISO-8859-15",
        "ISO8859-2" => return "ISO-8859-2",
        "ISO8859-5" => return "ISO-8859-5",
        "ISO8859-6" => return "ISO-8859-6",
        "ISO8859-7" => return "ISO-8859-7",
        "ISO8859-8" => return "ISO-8859-8",
        "ISO8859-9" => return "ISO-8859-9",
        "TIS-620" => return "TIS-620",
        "UTF-8" => return "UTF-8",
        "big5" => return "BIG5",
        _ => (),
    };

    // On Windows the input is a "CPnnnn" code-page name.
    #[cfg(windows)]
    match s {
        "CP1361" => return "JOHAB",
        "CP20127" => return "ASCII",
        "CP20866" => return "KOI8-R",
        "CP20936" => return "GB2312",
        "CP21866" => return "KOI8-RU",
        "CP28591" => return "ISO-8859-1",
        "CP28592" => return "ISO-8859-2",
        "CP28593" => return "ISO-8859-3",
        "CP28594" => return "ISO-8859-4",
        "CP28595" => return "ISO-8859-5",
        "CP28596" => return "ISO-8859-6",
        "CP28597" => return "ISO-8859-7",
        "CP28598" => return "ISO-8859-8",
        "CP28599" => return "ISO-8859-9",
        "CP28605" => return "ISO-8859-15",
        "CP38598" => return "ISO-8859-8",
        "CP51932" => return "EUC-JP",
        "CP51936" => return "GB2312",
        "CP51949" => return "EUC-KR",
        "CP51950" => return "EUC-TW",
        "CP54936" => return "GB18030",
        "CP65001" => return "UTF-8",
        "CP936" => return "GBK",
        _ => (),
    };

    // No alias matched (or none of the per-OS tables above is compiled in):
    // return the name unchanged.  Leaking the copy yields the required
    // `&'static str`.  NOTE(review): this leaks one small allocation per
    // call with an unknown name — presumably this is only invoked once per
    // process to determine the locale charset; confirm callers don't loop.
    String::from(s).leak()
}
+
+#[cfg(unix)]
+mod inner {
+ use std::{
+ ffi::{c_int, CStr, CString},
+ ptr::null,
+ };
+
+ use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE};
+
+ unsafe fn string_from_pointer(s: *const i8) -> Option<String> {
+ if s.is_null() {
+ None
+ } else {
+ Some(CStr::from_ptr(s).to_string_lossy().into())
+ }
+ }
+
+ fn set_locale(category: c_int, locale: Option<&str>) -> Option<String> {
+ unsafe {
+ let locale = locale.map(|s| CString::new(s).unwrap());
+ let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr());
+ string_from_pointer(setlocale(category, locale_ptr))
+ }
+ }
+
+ pub fn locale_charset() -> Option<String> {
+ unsafe {
+ let saved_locale = set_locale(LC_CTYPE, None);
+ set_locale(LC_CTYPE, Some(""));
+ let codeset = string_from_pointer(nl_langinfo(CODESET));
+ set_locale(LC_CTYPE, saved_locale.as_deref());
+ codeset
+ }
+ }
+}
+
#[cfg(windows)]
mod inner {
    use libc::{setlocale, LC_CTYPE};
    use std::ffi::{CStr, CString};
    use windows_sys::Win32::Globalization::GetACP;

    /// Queries the name of the current locale (after selecting the locale
    /// configured in the environment), or returns `None` on failure.
    fn current_locale() -> Option<String> {
        let empty = CString::new("").unwrap();
        unsafe {
            let locale = setlocale(LC_CTYPE, empty.as_ptr());
            (!locale.is_null()).then(|| CStr::from_ptr(locale).to_string_lossy().into())
        }
    }

    /// Returns the code-page-based character set name for the configured
    /// locale, e.g. `CP1252`, normalizing the UTF-8 code page to `UTF-8`.
    pub fn locale_charset() -> Option<String> {
        let locale = current_locale()?;
        // A locale name like `English_United States.1252` carries the code
        // page after the dot; otherwise fall back to the ANSI code page.
        let codepage = match locale.rsplit_once('.') {
            Some((_, suffix)) => format!("CP{suffix}"),
            None => format!("CP{}", unsafe { GetACP() }),
        };
        match codepage.as_str() {
            "CP65001" | "CPutf8" => Some(String::from("UTF-8")),
            _ => Some(codepage),
        }
    }
}
+
#[cfg(not(any(unix, windows)))]
mod inner {
    /// Fallback for platforms without a queryable locale: assume UTF-8.
    ///
    /// The function was previously misspelled `locale_charse` and returned a
    /// bare `String`, so this module did not satisfy the
    /// `inner::locale_charset()` call (which expects `Option<String>`) and
    /// failed to compile on these platforms.
    pub fn locale_charset() -> Option<String> {
        Some(String::from("UTF-8"))
    }
}
+
/// Returns the character set used by the locale configured in the operating
/// system.
///
/// The platform-specific lookup runs only once; the result is cached for the
/// lifetime of the process.  Platform spellings are normalized through
/// `map_aliases`, and `UTF-8` is used when the character set cannot be
/// determined.
pub fn locale_charset() -> &'static str {
    lazy_static! {
        static ref LOCALE_CHARSET: &'static str =
            map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8")));
    }
    &LOCALE_CHARSET
}
--- /dev/null
+use lazy_static::lazy_static;
+use num::Integer;
+use std::{
+ cell::RefCell,
+ cmp::Ordering,
+ collections::{BTreeMap, HashMap, HashSet},
+ mem::take,
+ num::NonZeroUsize,
+ ops::RangeInclusive,
+};
+use thiserror::Error as ThisError;
+use unicase::UniCase;
+
+use crate::{
+ identifier::Identifier,
+ lex::{
+ scan::{ScanError, ScanToken, StringScanner, StringSegmenter},
+ segment::Mode,
+ token::{Punct, Token},
+ },
+ message::Location,
+ settings::Settings,
+};
+
+#[derive(Clone, Debug, ThisError)]
+pub enum MacroError {
+ /// Expected more tokens.
+ #[error(
+ "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}."
+ )]
+ ExpectedMoreTokens {
+ n: usize,
+ arg: Identifier,
+ macro_: Identifier,
+ },
+
+ /// Expected a particular token at end of command.
+ #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
+ ExpectedToken {
+ token: String,
+ arg: Identifier,
+ macro_: Identifier,
+ },
+
+ /// Expected a particular token, got a different one.
+ #[error(
+ "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}."
+ )]
+ UnexpectedToken {
+ actual: String,
+ expected: String,
+ arg: Identifier,
+ macro_: Identifier,
+ },
+
+ /// Argument specified multiple times,
+ #[error("Argument {arg} specified multiple times in call to macro {macro_}.")]
+ DuplicateArg { arg: Identifier, macro_: Identifier },
+
+ /// Maximum nesting limit exceeded.
+ #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")]
+ TooDeep { limit: usize },
+
+ /// Invalid `!*`.
+ #[error("`!*` may only be used within the expansion of a macro.")]
+ InvalidBangAsterisk,
+
+ /// Error tokenizing during expansion.
+ #[error(transparent)]
+ ScanError(ScanError),
+
+ /// Expecting `)` in macro expression.
+ #[error("Expecting `)` in macro expression.")]
+ ExpectingRParen,
+
+ /// Expecting literal.
+ #[error("Expecting literal or function invocation in macro expression.")]
+ ExpectingLiteral,
+
+ /// Expecting `!THEN`.
+ #[error("`!THEN` expected in macro `!IF` construct.")]
+ ExpectingThen,
+
+ /// Expecting `!ELSE` or `!THEN`.
+ #[error("`!ELSE` or `!THEN` expected in macro `!IF` construct.")]
+ ExpectingElseOrIfEnd,
+
+ /// Expecting `!IFEND`.
+ #[error("`!IFEND` expected in macro `!IF` construct.")]
+ ExpectingIfEnd,
+
+ /// Expecting macro variable name.
+ #[error("Expecting macro variable name following `{0}`.")]
+ ExpectingMacroVarName(&'static str),
+
+ /// Invalid macro variable name.
+ #[error("Cannot use argument name or macro keyword {name} as `{construct}` variable name.")]
+ BadMacroVarName {
+ name: Identifier,
+ construct: &'static str,
+ },
+
+ /// Expecting `=` following `!LET`.
+ #[error("Expecting `=` following `!LET`.")]
+ ExpectingEquals,
+
+ /// Expecting `=` or `!IN` in `!DO` loop.
+ #[error("Expecting `=` or `!IN` in `!DO` loop.")]
+ ExpectingEqualsOrIn,
+
+ /// Missing `!DOEND`.
+ #[error("Missing `!DOEND`.")]
+ MissingDoEnd,
+
+ /// Bad numberic macro expression.
+ #[error("Macro expression must evaluate to a number (not {0:?})")]
+ BadNumericMacroExpression(String),
+
+ /// Too many iteration for list-based loop.
+ #[error("`!DO` loop over list exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
+ MiterateList(usize),
+
+ /// Too many iteration for numerical loop.
+ #[error("Numerical `!DO` loop exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
+ MiterateNumeric(usize),
+
+ /// Expecting `!TO` in numerical `!DO` loop.
+ #[error("Expecting `!TO` in numerical `!DO` loop.")]
+ ExpectingTo,
+
+ /// `!BY` value cannot be zero.
+ #[error("`!BY` value cannot be zero.")]
+ ZeroBy,
+
+ /// `!BREAK` outside `!DO`.
+ #[error("`!BREAK` outside `!DO`.")]
+ BreakOutsideDo,
+
+ /// `,` or `)` expected in call to macro function.
+ #[error("`,` or `)` expected in call to macro function `{0}`.")]
+ ExpectingCommaOrRParen(Identifier),
+
+ /// Macro function takes one argument.
+ #[error("Macro function `{name}` takes one argument (not {n_args}).")]
+ ExpectingOneArg { name: Identifier, n_args: usize },
+
+ /// Macro function takes two arguments.
+ #[error("Macro function `{name}` takes two arguments (not {n_args}).")]
+ ExpectingTwoArgs { name: Identifier, n_args: usize },
+
+ /// Macro function takes two or three arguments.
+ #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")]
+ ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize },
+
+ /// Macro function needs at least one argument).
+ #[error("Macro function `{name}` needs at least one argument).")]
+ ExpectingOneOrMoreArgs { name: Identifier },
+
+ /// Argument to `!BLANKS` must be non-negative integer (not `{0}`).
+ #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")]
+ InvalidBlanks(String),
+
+ /// Second argument of `!SUBSTR` must be positive integer (not `{0}`).
+ #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")]
+ InvalidSubstr2(String),
+
+ /// Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).
+ #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")]
+ InvalidSubstr3(String),
+}
+
/// A PSPP macro as defined with `!DEFINE`.
pub struct Macro {
    /// The macro's name. This is an ordinary identifier except that it is
    /// allowed (but not required) to begin with `!`.
    pub name: Identifier,

    /// Source code location of macro definition, for error reporting.
    pub location: Location,

    /// Parameters, in the order they were declared.  Empty for a macro that
    /// takes no arguments.
    parameters: Vec<Parameter>,

    /// Body, as a sequence of pre-tokenized macro tokens.
    body: Vec<MacroToken>,
}
+
+impl Macro {
+ fn initial_state(&self) -> ParserState {
+ if self.parameters.is_empty() {
+ ParserState::Finished
+ } else if self.parameters[0].is_positional() {
+ ParserState::Keyword
+ } else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
+ ParserState::Enclose
+ } else {
+ ParserState::Arg
+ }
+ }
+
+ fn find_parameter(&self, name: &Identifier) -> Option<usize> {
+ self.parameters.iter().position(|param| ¶m.name == name)
+ }
+}
+
/// One parameter of a [Macro].
struct Parameter {
    /// `!name` or `!1`.
    name: Identifier,

    /// Default value.
    ///
    /// The tokens don't include white space, etc. between them.
    default: Vec<MacroToken>,

    /// Macro-expand the value?
    expand_value: bool,

    /// How the argument is specified.
    arg: ValueType,
}
+
+impl Parameter {
+ /// Returns true if this is a positional parameter. Positional parameters
+ /// are expanded by index (position) rather than by name.
+ fn is_positional(&self) -> bool {
+ self.name.0.as_bytes()[1].is_ascii_digit()
+ }
+}
+
/// How a macro argument is delimited in a macro call.
enum ValueType {
    /// Argument consists of `.0` tokens.
    NTokens(usize),

    /// Argument runs until token `.0`.
    CharEnd(Token),

    /// Argument starts with token `.0` and ends with token `.1`.
    Enclose(Token, Token),

    /// Argument runs until the end of the command.
    CmdEnd,
}

/// A token and the syntax that was tokenized to produce it. The syntax allows
/// the token to be turned back into syntax accurately.
#[derive(Clone)]
pub struct MacroToken {
    /// The token.
    pub token: Token,

    /// The syntax that produces `token`.
    pub syntax: String,
}
+
/// Tokenizes `s` in tokenization mode `mode`, appending the resulting macro
/// tokens to `output`.  Scan errors are reported through `error` and the
/// offending input is skipped rather than aborting the whole tokenization.
fn tokenize_string_into(
    s: &str,
    mode: Mode,
    error: &impl Fn(MacroError),
    output: &mut Vec<MacroToken>,
) {
    for (syntax, token) in StringSegmenter::new(s, mode, true) {
        match token {
            ScanToken::Token(token) => output.push(MacroToken {
                token,
                syntax: String::from(syntax),
            }),
            ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)),
        }
    }
}

/// Tokenizes `s` in tokenization mode `mode` and returns the resulting macro
/// tokens, reporting any scan errors through `error`.
fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec<MacroToken> {
    let mut tokens = Vec::new();
    tokenize_string_into(s, mode, error, &mut tokens);
    tokens
}
+
+fn try_unquote_string(input: &String, mode: Mode) -> Option<String> {
+ let mut scanner = StringScanner::new(input, mode, true);
+ let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
+ return None;
+ };
+ let None = scanner.next() else { return None };
+ return Some(unquoted);
+}
+
/// Returns the unquoted content of `input` if it is a single quoted string;
/// otherwise returns `input` unchanged.
fn unquote_string(input: String, mode: Mode) -> String {
    try_unquote_string(&input, mode).unwrap_or(input)
}

/// A slice of macro tokens, consumed from the front as parsing proceeds.
#[derive(Clone)]
struct MacroTokens<'a>(&'a [MacroToken]);
+
+impl<'a> MacroTokens<'a> {
+ fn is_empty(&self) -> bool {
+ self.0.is_empty()
+ }
+ fn match_(&mut self, s: &str) -> bool {
+ if let Some((first, rest)) = self.0.split_first() {
+ if first.syntax.eq_ignore_ascii_case(s) {
+ self.0 = rest;
+ return true;
+ }
+ }
+ false
+ }
+ fn take_relop(&mut self) -> Option<RelOp> {
+ if let Some((first, rest)) = self.0.split_first() {
+ if let Ok(relop) = first.syntax.as_str().try_into() {
+ self.0 = rest;
+ return Some(relop);
+ }
+ }
+ None
+ }
+ fn macro_id(&self) -> Option<&Identifier> {
+ self.0.get(0).map(|mt| mt.token.macro_id()).flatten()
+ }
+ fn take_macro_id(&mut self) -> Option<&Identifier> {
+ let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten();
+ if result.is_some() {
+ self.advance();
+ }
+ result
+ }
+ fn take(&mut self) -> Option<&MacroToken> {
+ match self.0.split_first() {
+ Some((first, rest)) => {
+ self.0 = rest;
+ Some(first)
+ }
+ None => None,
+ }
+ }
+ fn advance(&mut self) -> &MacroToken {
+ let (first, rest) = self.0.split_first().unwrap();
+ self.0 = rest;
+ first
+ }
+}
+
/// Spacing class of a token, used to decide what separator belongs between
/// two adjacent tokens when turning macro tokens back into syntax.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum TokenClass {
    /// No space before or after (new-line after).
    EndCommand,

    /// Space on both sides.
    BinaryOperator,

    /// Space afterward.
    Comma,

    /// Don't need spaces except sequentially.
    Id,

    /// Don't need spaces except sequentially.
    Punct,
}
+
+impl TokenClass {
+ fn separator(prev: Self, next: Self) -> &'static str {
+ match (prev, next) {
+ // Don't need a separator before the end of a command, but we
+ // need a new-line afterward.
+ (_, Self::EndCommand) => "",
+ (Self::EndCommand, _) => "\n",
+
+ // Binary operators always have a space on both sides, and a comma always has a space afterward.
+ (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
+
+ // Otherwise, `prev` is `Self::Punct`, which only need a space if
+ // there are two or them in a row.
+ (Self::Punct, Self::Punct) => " ",
+ _ => "",
+ }
+ }
+}
+
impl From<&Token> for TokenClass {
    /// Classifies `source` for separator purposes.
    fn from(source: &Token) -> Self {
        match source {
            Token::End => Self::Punct,
            // Numbers and strings behave like identifiers for spacing.
            Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id,
            Token::EndCommand => Self::EndCommand,
            Token::Punct(punct) => match punct {
                // Brackets need no surrounding space.
                Punct::LParen
                | Punct::RParen
                | Punct::LSquare
                | Punct::RSquare
                | Punct::LCurly
                | Punct::RCurly => Self::Punct,

                // NOTE(review): `Dot`, `Underscore`, `Bang`, etc. are grouped
                // with the binary operators here; confirm this matches the
                // upstream classification.
                Punct::Plus
                | Punct::Dash
                | Punct::Asterisk
                | Punct::Slash
                | Punct::Equals
                | Punct::Colon
                | Punct::And
                | Punct::Or
                | Punct::Not
                | Punct::Eq
                | Punct::Ge
                | Punct::Gt
                | Punct::Le
                | Punct::Lt
                | Punct::Ne
                | Punct::All
                | Punct::By
                | Punct::To
                | Punct::With
                | Punct::Exp
                | Punct::Bang
                | Punct::Percent
                | Punct::Question
                | Punct::Backtick
                | Punct::Dot
                | Punct::Underscore
                | Punct::BangAsterisk => Self::BinaryOperator,

                Punct::Comma | Punct::Semicolon => Self::Comma,
            },
        }
    }
}

/// Converts `input` back into syntax, yielding for each token a two-element
/// array: the separator that must precede the token, then the token's own
/// syntax.  Concatenating every element of every array reproduces syntax that
/// retokenizes to the same tokens.
pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = [&str; 2]> {
    input
        .iter()
        .take(1)
        .map(|token| ["", token.syntax.as_str()])
        .chain(input.windows(2).map(|w| {
            let c0 = (&w[0].token).into();
            let c1 = (&w[1].token).into();
            [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
        }))
}
+
/// Extension trait for recognizing macro identifiers.
trait MacroId {
    /// Returns the identifier if this token is a macro identifier (an
    /// identifier beginning with `!`), otherwise `None`.
    fn macro_id(&self) -> Option<&Identifier>;
}

impl MacroId for Token {
    fn macro_id(&self) -> Option<&Identifier> {
        let id = self.id()?;
        id.0.starts_with('!').then_some(id)
    }
}

/// A relational operator in a macro expression (`=`, `!EQ`, `<`, `!LT`, ...).
enum RelOp {
    Eq,
    Ne,
    Lt,
    Gt,
    Le,
    Ge,
}
+
+impl TryFrom<&str> for RelOp {
+ type Error = ();
+
+ fn try_from(source: &str) -> Result<Self, Self::Error> {
+ match source {
+ "=" => Ok(Self::Eq),
+ "~=" | "<>" => Ok(Self::Ne),
+ "<" => Ok(Self::Lt),
+ ">" => Ok(Self::Gt),
+ "<=" => Ok(Self::Le),
+ ">=" => Ok(Self::Ge),
+ _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match (
+ source.as_bytes()[0].to_ascii_uppercase(),
+ source.as_bytes()[1].to_ascii_uppercase(),
+ ) {
+ (b'E', b'Q') => Ok(Self::Eq),
+ (b'N', b'E') => Ok(Self::Ne),
+ (b'L', b'T') => Ok(Self::Lt),
+ (b'L', b'E') => Ok(Self::Le),
+ (b'G', b'T') => Ok(Self::Gt),
+ (b'G', b'E') => Ok(Self::Ge),
+ _ => Err(()),
+ },
+ _ => Err(()),
+ }
+ }
+}
+
+impl RelOp {
+ fn evaluate(&self, cmp: Ordering) -> bool {
+ match self {
+ RelOp::Eq => cmp == Ordering::Equal,
+ RelOp::Ne => cmp != Ordering::Equal,
+ RelOp::Lt => cmp == Ordering::Less,
+ RelOp::Gt => cmp == Ordering::Greater,
+ RelOp::Le => cmp != Ordering::Greater,
+ RelOp::Ge => cmp != Ordering::Less,
+ }
+ }
+}
+
/// All defined macros, indexed by case-insensitive name.
pub type MacroSet = HashMap<UniCase<String>, Macro>;

/// State of the macro call parser FSM.
enum ParserState {
    /// Accumulating tokens toward the end of any type of argument.
    Arg,

    /// Expecting the opening delimiter of an ARG_ENCLOSE argument.
    Enclose,

    /// Expecting a keyword for a keyword argument.
    Keyword,

    /// Expecting an equal sign for a keyword argument.
    Equals,

    /// Macro fully parsed and ready for expansion.
    Finished,
}

/// Macro call parser FSM.
pub struct Parser<'a> {
    macros: &'a MacroSet,
    macro_: &'a Macro,
    state: ParserState,
    /// One slot per parameter; `None` until the argument is specified.
    args: Box<[Option<Vec<MacroToken>>]>,
    /// Index of the parameter currently being parsed.
    arg_index: usize,

    /// Length of macro call so far.
    n_tokens: usize,
}

/// Result of feeding one token to the [Parser].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParseStatus {
    Complete,
    Incomplete,
}
+
+impl<'a> Parser<'a> {
+ pub fn new(macros: &'a MacroSet, token: &Token) -> Option<Self> {
+ let macro_ = macros.get(&token.id()?.0)?;
+ Some(Self {
+ macros,
+ macro_,
+ state: macro_.initial_state(),
+ args: (0..macro_.parameters.len()).map(|_| None).collect(),
+ arg_index: 0,
+ n_tokens: 1,
+ })
+ }
+
+ fn finished(&mut self) {
+ self.state = ParserState::Finished;
+ for (i, arg) in self.args.iter_mut().enumerate() {
+ if arg.is_none() {
+ *arg = Some(self.macro_.parameters[i].default.clone());
+ }
+ }
+ self.state = ParserState::Finished;
+ }
+
+ fn next_arg(&mut self) {
+ if self.macro_.parameters.is_empty() {
+ self.finished()
+ } else {
+ let param = &self.macro_.parameters[self.arg_index];
+ if param.is_positional() {
+ self.arg_index += 1;
+ if self.arg_index >= self.args.len() {
+ self.finished()
+ } else {
+ let param = &self.macro_.parameters[self.arg_index];
+ self.state = if !param.is_positional() {
+ ParserState::Keyword
+ } else if let ValueType::Enclose(_, _) = param.arg {
+ ParserState::Enclose
+ } else {
+ ParserState::Arg
+ };
+ }
+ } else {
+ if self.args.iter().any(|arg| arg.is_none()) {
+ self.state = ParserState::Keyword;
+ } else {
+ self.finished();
+ }
+ }
+ }
+ }
+
+ fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+ let param = &self.macro_.parameters[self.args.len() - 1];
+ if let Token::EndCommand | Token::End = token {
+ if let Some(arg) = &self.args[self.arg_index] {
+ let param = &self.macro_.parameters[self.args.len() - 1];
+
+ match ¶m.arg {
+ ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
+ n: n - arg.len(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ }),
+ ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+ error(MacroError::ExpectedToken {
+ token: end.to_string(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ })
+ }
+ ValueType::CmdEnd => {
+ // This is OK, it's the expected way to end the argument.
+ }
+ }
+ }
+ self.finished();
+ }
+
+ self.n_tokens += 1;
+ let arg = self.args[self.arg_index].get_or_insert(Vec::new());
+ let (
+ add_token, // Should we add `mt` to the current arg?
+ next_arg, // Should we advance to the next arg?
+ ) = match ¶m.arg {
+ ValueType::NTokens(n) => (arg.len() + 1 >= *n, true),
+ ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
+ let at_end = token == end;
+ (at_end, !at_end)
+ }
+ ValueType::CmdEnd => (false, true),
+ };
+ if add_token {
+ if true
+ // !macro_expand_arg (&mt->token, mc->me, *argp)
+ {
+ arg.push(MacroToken {
+ token: token.clone(),
+ syntax: String::from(syntax),
+ });
+ }
+ }
+ if next_arg {
+ self.next_arg()
+ }
+ }
+
+ fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+ let param = &self.macro_.parameters[self.arg_index];
+ let ValueType::Enclose(start, _) = ¶m.arg else {
+ unreachable!()
+ };
+ if token == start {
+ self.n_tokens += 1;
+ self.args[self.arg_index].get_or_insert(Vec::new());
+ self.state = ParserState::Arg;
+ } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
+ self.finished();
+ } else {
+ error(MacroError::UnexpectedToken {
+ actual: String::from(syntax),
+ expected: start.to_string(),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ self.finished();
+ }
+ }
+
+ fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) {
+ let Some(id) = token.id() else {
+ return self.finished();
+ };
+ let Some(arg_index) = self.macro_.find_parameter(id) else {
+ return self.finished();
+ };
+ self.arg_index = arg_index;
+ if self.args[arg_index].is_some() {
+ error(MacroError::DuplicateArg {
+ arg: id.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ }
+ self.args[arg_index] = Some(Vec::new());
+ }
+
+ fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
+ let param = &self.macro_.parameters[self.arg_index];
+ if let Token::Punct(Punct::Eq) = token {
+ self.n_tokens += 1;
+ self.state = if let ValueType::Enclose(_, _) = param.arg {
+ ParserState::Enclose
+ } else {
+ ParserState::Arg
+ };
+ } else {
+ error(MacroError::UnexpectedToken {
+ actual: syntax.into(),
+ expected: String::from("="),
+ arg: param.name.clone(),
+ macro_: self.macro_.name.clone(),
+ });
+ self.finished()
+ }
+ }
+
+ /// Adds `token`, which has the given `syntax`, to the collection of tokens
+ /// in `self` that potentially need to be macro expanded.
+ ///
+ /// Returns [ParseStatus::Incomplete] if the macro expander needs more
+ /// tokens, for macro arguments or to decide whether this is actually a
+ /// macro invocation. The caller should call `push` again with the next
+ /// token.
+ ///
+ /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
+ /// The caller should call [`Self::finish()`] to obtain the expansion.
+ pub fn push(
+ &mut self,
+ token: &Token,
+ syntax: &str,
+ error: &impl Fn(MacroError),
+ ) -> ParseStatus {
+ match self.state {
+ ParserState::Arg => self.push_arg(token, syntax, error),
+ ParserState::Enclose => self.push_enclose(token, syntax, error),
+ ParserState::Keyword => self.push_keyword(token, syntax, error),
+ ParserState::Equals => self.push_equals(token, syntax, error),
+ ParserState::Finished => (),
+ }
+ if let ParserState::Finished = self.state {
+ ParseStatus::Complete
+ } else {
+ ParseStatus::Incomplete
+ }
+ }
+
+ pub fn finish(self) -> Call<'a> {
+ let ParserState::Finished = self.state else {
+ panic!()
+ };
+ Call(self)
+ }
+}
+
/// Expansion stack entry.
struct Frame {
    /// A macro name or `!IF`, `!DO`, etc.
    name: Option<Identifier>,

    /// Source location, if available.
    location: Option<Location>,
}

/// Context for one level of macro expansion.
struct Expander<'a> {
    /// Macros to expand recursively.
    macros: &'a MacroSet,

    /// Error reporting callback.
    ///
    /// NOTE(review): `&Box<dyn Fn…>` could be thinned to `&dyn Fn…`; kept
    /// as-is since construction sites may be outside this view.
    error: &'a Box<dyn Fn(MacroError) + 'a>,

    /// Tokenization mode.
    mode: Mode,

    /// Remaining nesting levels.
    nesting_countdown: usize,

    /// Stack for error reporting.
    stack: Vec<Frame>,

    /// May macro calls be expanded?
    expand: &'a RefCell<bool>,

    /// Variables from `!DO` and `!LET`.
    vars: &'a RefCell<BTreeMap<Identifier, String>>,

    /// Only set if inside a `!DO` loop. If true, break out of the loop.
    break_: Option<&'a mut bool>,

    /// Only set if expanding a macro (and not, say, a macro argument).
    macro_: Option<&'a Macro>,

    /// Only set if expanding a macro (and not, say, a macro argument).
    args: Option<&'a [Option<Vec<MacroToken>>]>,
}
+
/// Renders `b` the way macro expressions represent truth: `"1"` for true and
/// `"0"` for false.
fn bool_to_string(b: bool) -> String {
    String::from(if b { "1" } else { "0" })
}
+
/// Which clause terminated the `!THEN` part of a `!IF` construct.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum IfEndClause {
    Else,
    IfEnd,
}
+
+fn macro_keywords() -> HashSet<Identifier> {
+ let mut keywords = HashSet::new();
+ for kw in [
+ "!BREAK",
+ "!CHAREND",
+ "!CMDEND",
+ "!DEFAULT",
+ "!DO",
+ "!DOEND",
+ "!ELSE",
+ "!ENCLOSE",
+ "!ENDDEFINE",
+ "!IF",
+ "!IFEND",
+ "!IN",
+ "!LET",
+ "!NOEXPAND",
+ "!OFFEXPAND",
+ "!ONEXPAND",
+ "!POSITIONAL",
+ "!THEN",
+ "!TOKENS",
+ ] {
+ keywords.insert(Identifier::new(kw).unwrap());
+ }
+ keywords
+}
+
/// Returns true if `s` is a reserved macro keyword (see [macro_keywords]).
/// The keyword set is built once and cached.
fn is_macro_keyword(s: &Identifier) -> bool {
    lazy_static! {
        static ref KEYWORDS: HashSet<Identifier> = macro_keywords();
    }
    KEYWORDS.contains(s)
}
+
/// The source of values for a `!DO` loop: either an explicit list of items or
/// a numeric range climbing up or counting down.
enum DoInput {
    /// Items stored in reverse so iteration can pop from the back.
    List(Vec<String>),
    Up { first: f64, last: f64, by: f64 },
    Down { first: f64, last: f64, by: f64 },
    /// Loop that yields no iterations at all.
    Empty,
}
+
+impl DoInput {
+ fn from_list(items: Vec<MacroToken>) -> Self {
+ Self::List(
+ items
+ .into_iter()
+ .rev()
+ .take(Settings::global().macros.max_iterations + 1)
+ .map(|mt| mt.syntax)
+ .collect(),
+ )
+ }
+
+ fn from_by(first: f64, last: f64, by: f64) -> Self {
+ if by > 0.0 && first <= last {
+ Self::Up { first, last, by }
+ } else if by > 0.0 && first <= last {
+ Self::Down { first, last, by }
+ } else {
+ Self::Empty
+ }
+ }
+}
+
impl Iterator for DoInput {
    type Item = String;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            // The list is stored reversed, so popping from the back yields
            // the items in their original order.
            DoInput::List(vec) => vec.pop(),
            DoInput::Up { first, last, by } => {
                if first <= last {
                    let value = *first;
                    *first += *by;
                    Some(format!("{value}"))
                } else {
                    None
                }
            }
            DoInput::Down { first, last, by } => {
                if first >= last {
                    let value = *first;
                    // `by` is negative for a descending loop, so `+=` steps
                    // downward.
                    *first += *by;
                    Some(format!("{value}"))
                } else {
                    None
                }
            }
            DoInput::Empty => None,
        }
    }
}
+
+impl<'a> Expander<'a> {
    /// Returns true if macro calls may currently be expanded (i.e.
    /// `!ONEXPAND` is in effect).
    fn may_expand(&self) -> bool {
        *self.expand.borrow()
    }

    /// Returns true if a `!BREAK` has been executed in the enclosing `!DO`.
    fn should_break(&self) -> bool {
        self.break_.as_ref().map(|b| **b).unwrap_or(false)
    }

    /// Expands all of `input` into `output`, stopping early on `!BREAK` or
    /// when the nesting limit is exhausted (in which case the remaining
    /// input is copied through unexpanded).
    fn expand(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
        if self.nesting_countdown == 0 {
            (self.error)(MacroError::TooDeep {
                limit: Settings::global().macros.max_nest,
            });
            output.extend(take(&mut input.0).iter().cloned());
        } else {
            while !input.is_empty() && !self.should_break() {
                self.expand__(input, output);
            }
        }
    }
+
    /// Appends the expansion of macro argument `param_idx` to `output`.
    ///
    /// The argument is re-expanded in a child context (with its own `!DO`/
    /// `!LET` variables) when expansion is enabled and the parameter was
    /// declared with value expansion; otherwise its tokens are copied
    /// through verbatim.
    fn expand_arg(&mut self, param_idx: usize, output: &mut Vec<MacroToken>) {
        let param = &self.macro_.unwrap().parameters[param_idx];
        let arg = &self.args.unwrap()[param_idx].as_ref().unwrap();
        if self.may_expand() && param.expand_value {
            let vars = RefCell::new(BTreeMap::new());
            // Temporarily move the stack into the subexpander; it is
            // restored (minus the pushed frame) afterward.
            let mut stack = take(&mut self.stack);
            stack.push(Frame {
                name: Some(param.name.clone()),
                location: None,
            });
            let mut subexpander = Expander {
                stack,
                vars: &vars,
                break_: None,
                macro_: None,
                args: None,
                ..*self
            };
            let mut arg_tokens = MacroTokens(&arg);
            subexpander.expand(&mut arg_tokens, output);
            self.stack = subexpander.stack;
            self.stack.pop();
        } else {
            output.extend(arg.iter().cloned());
        }
    }
    /// Parses the parenthesized argument list of macro function `function`
    /// from `input`, which must be positioned at the function name with `(`
    /// following (the caller checks this before calling).  Returns the
    /// argument strings, or `None` (after reporting an error) on a malformed
    /// list.
    fn parse_function_args(
        &mut self,
        function: &Identifier,
        input: &mut MacroTokens,
    ) -> Option<Vec<String>> {
        // Skip the function name and the `(`.
        input.advance();
        input.advance();
        let mut args = Vec::new();
        if input.match_(")") {
            return Some(args);
        }
        loop {
            args.push(self.parse_function_arg(input)?);
            match input.take() {
                Some(MacroToken {
                    token: Token::Punct(Punct::Comma),
                    ..
                }) => (),
                Some(MacroToken {
                    token: Token::Punct(Punct::RParen),
                    ..
                }) => return Some(args),
                _ => {
                    (self.error)(MacroError::ExpectingCommaOrRParen(function.clone()));
                    return None;
                }
            }
        }
    }
+
+ fn expand_blanks(e: &mut Expander, args: Vec<String>) -> Option<String> {
+ let Ok(n) = args[0].trim().parse::<usize>() else {
+ (e.error)(MacroError::InvalidBlanks(args[0].clone()));
+ return None;
+ };
+ Some(std::iter::repeat(' ').take(n).collect())
+ }
+
    /// Implements `!CONCAT`: concatenates all of the arguments, unquoting
    /// each one that is a quoted string.
    fn expand_concat(e: &mut Expander, args: Vec<String>) -> Option<String> {
        Some(
            args.into_iter()
                .map(|arg| unquote_string(arg, e.mode))
                .collect(),
        )
    }

    /// Implements `!EVAL`: tokenizes `args[0]`, macro-expands the result in
    /// a child context, and returns the expansion as syntax.
    fn expand_eval(e: &mut Expander, args: Vec<String>) -> Option<String> {
        let tokens = tokenize_string(&args[0], e.mode, e.error);
        // Move the stack into the subexpander for the nested expansion and
        // restore it (minus the `!EVAL` frame) afterward.
        let mut stack = take(&mut e.stack);
        stack.push(Frame {
            name: Some(Identifier::new("!EVAL").unwrap()),
            location: None,
        });
        let mut break_ = false;
        let mut subexpander = Expander {
            break_: Some(&mut break_),
            stack,
            vars: e.vars,
            ..*e
        };
        let mut output = Vec::new();
        subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
        subexpander.stack.pop();
        e.stack = subexpander.stack;
        Some(macro_tokens_to_syntax(&output).flatten().collect())
    }

    /// Implements `!HEAD`: returns the syntax of the first token of the
    /// (unquoted) argument, or the empty string if it has no tokens.
    fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
        let arg = unquote_string(args.remove(0), e.mode);
        let mut output = tokenize_string(&arg, e.mode, e.error);
        if output.is_empty() {
            Some(String::new())
        } else {
            // `swap_remove(0)` cheaply extracts the first token (order of
            // the rest doesn't matter; it is discarded).
            Some(output.swap_remove(0).syntax)
        }
    }
+
+ fn expand_index(_e: &mut Expander, args: Vec<String>) -> Option<String> {
+ let haystack = &args[0];
+ let needle = &args[1];
+ let position = haystack.find(needle);
+ Some(format!(
+ "{}",
+ position.map_or(0, |position| &haystack[0..position].chars().count() + 1)
+ ))
+ }
+
+ fn expand_length(_e: &mut Expander, args: Vec<String>) -> Option<String> {
+ Some(format!("{}", args[0].chars().count()))
+ }
+
+ fn expand_quote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+ let arg = args.remove(0);
+ if try_unquote_string(&arg, e.mode).is_some() {
+ Some(arg)
+ } else {
+ let mut output = String::with_capacity(arg.len() + 2);
+ output.push('\'');
+ for c in arg.chars() {
+ if c == '"' {
+ output.push('\'');
+ }
+ output.push(c);
+ }
+ output.push('\'');
+ Some(output)
+ }
+ }
+
+ fn expand_substr(e: &mut Expander, args: Vec<String>) -> Option<String> {
+ let Ok(start) = args[1].trim().parse::<NonZeroUsize>() else {
+ (e.error)(MacroError::InvalidSubstr3(args[0].clone()));
+ return None;
+ };
+ let start = start.get();
+ let Ok(count) = args[2].trim().parse::<usize>() else {
+ (e.error)(MacroError::InvalidSubstr2(args[0].clone()));
+ return None;
+ };
+
+ Some(args[0].chars().skip(start - 1).take(count).collect())
+ }
+
+ fn expand_tail(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
+ let arg = unquote_string(args.remove(0), e.mode);
+ let mut output = tokenize_string(&arg, e.mode, e.error);
+ Some(
+ output
+ .pop()
+ .map_or_else(|| String::new(), |tail| tail.syntax),
+ )
+ }
+
    /// Implements `!UNQUOTE`: removes quoting from `args[0]` if it is a
    /// quoted string, otherwise returns it unchanged.
    fn expand_unquote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
        Some(unquote_string(args.remove(0), e.mode))
    }

    /// Implements `!UPCASE`: the (unquoted) argument converted to uppercase.
    fn expand_upcase(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
        Some(unquote_string(args.remove(0), e.mode).to_uppercase())
    }
+
    /// If `orig_input` begins with a macro function invocation, expands it,
    /// consumes it from `orig_input`, and returns the expansion.  Returns
    /// `None` (leaving `orig_input` unchanged) if no macro function is
    /// present, or if an invocation is malformed (after reporting an error).
    fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option<String> {
        // Work on a copy so `orig_input` is only advanced on success.
        let mut input = orig_input.clone();
        let name = input.macro_id()?;
        if name == "!NULL" {
            return Some(String::new());
        }
        // Every other macro function requires a parenthesized argument list.
        if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) {
            return None;
        }

        /// Table entry describing one macro function: its name, allowed
        /// argument-count range, and implementation.
        struct MacroFunction {
            name: Identifier,
            args: RangeInclusive<usize>,
            parser: fn(&mut Expander, Vec<String>) -> Option<String>,
        }
        impl MacroFunction {
            fn new(
                name: &str,
                args: RangeInclusive<usize>,
                parser: fn(&mut Expander, Vec<String>) -> Option<String>,
            ) -> Self {
                Self {
                    name: Identifier::new(name).unwrap(),
                    args,
                    parser,
                }
            }
        }
        lazy_static! {
            static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [
                MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks),
                MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat),
                MacroFunction::new("!HEAD", 1..=1, Expander::expand_head),
                MacroFunction::new("!INDEX", 2..=2, Expander::expand_index),
                MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length),
                MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote),
                MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr),
                MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail),
                MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote),
                MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase),
                MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval),
            ];
        }

        let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?;

        let args = self.parse_function_args(&function.name, &mut input)?;

        // Enforce each function's arity, mapping the declared range to the
        // matching wrong-argument-count error.
        let n_args = args.len();
        if !function.args.contains(&n_args) {
            let name = function.name.clone();
            let error = match &function.args {
                x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args },
                x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args },
                x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args },
                x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name },
                _ => unreachable!(),
            };
            (self.error)(error);
            return None;
        }

        // Success: commit the consumed tokens and run the function.
        *orig_input = input;
        (function.parser)(self, args)
    }
+
    /// Parses one function argument from `input`. Each argument to a macro
    /// function is one of:
    ///
    /// - A quoted string or other single literal token.
    ///
    /// - An argument to the macro being expanded, e.g. `!1` or a named
    ///   argument.
    ///
    /// - `!*`.
    ///
    /// - A function invocation.
    ///
    /// Each function invocation yields a character sequence to be turned into a
    /// sequence of tokens. The case where that character sequence is a single
    /// quoted string is an important special case.
    fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option<String> {
        if let Some(macro_) = self.macro_ {
            match &input.0.get(0)?.token {
                Token::Id(id) if id.0.starts_with('!') => {
                    // A parameter of the macro being expanded: expand to the
                    // corresponding argument's syntax.
                    if let Some(param_idx) = macro_.find_parameter(id) {
                        input.advance();
                        return Some(
                            macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
                                .flatten()
                                .collect(),
                        );
                    }
                    // A variable created by `!DO` or `!LET`.
                    if let Some(value) = self.vars.borrow().get(id) {
                        return Some(value.clone());
                    }

                    // Otherwise, try it as a nested macro function call.
                    if let Some(output) = self.expand_macro_function(input) {
                        return Some(output);
                    }
                }
                Token::Punct(Punct::BangAsterisk) => {
                    // `!*`: all the positional arguments, space-separated.
                    let mut arg = String::new();
                    for i in 0..macro_.parameters.len() {
                        // Positional parameters precede keyword parameters,
                        // so stop at the first non-positional one.
                        if !macro_.parameters[i].is_positional() {
                            break;
                        }
                        if i > 0 {
                            arg.push(' ')
                        }
                        arg.extend(
                            macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap())
                                .flatten(),
                        );
                    }
                    input.advance();
                    return Some(arg);
                }
                _ => (),
            }
        }
        // Fallback: a single literal token, taken as its syntax.
        Some(input.advance().syntax.clone())
    }
+
+ fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option<String> {
+ if input.match_("(") {
+ let value = self.evaluate_or(input)?;
+ if input.match_(")") {
+ Some(value)
+ } else {
+ (self.error)(MacroError::ExpectingRParen);
+ None
+ }
+ } else if input.match_(")") {
+ (self.error)(MacroError::ExpectingLiteral);
+ None
+ } else {
+ Some(unquote_string(self.parse_function_arg(input)?, self.mode))
+ }
+ }
+
+ fn evaluate_relational(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let lhs = self.evaluate_literal(input)?;
+ let Some(relop) = input.take_relop() else {
+ return Some(lhs);
+ };
+ let rhs = self.evaluate_literal(input)?;
+ let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode));
+ Some(bool_to_string(relop.evaluate(cmp)))
+ }
+
+ fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let mut negations = 0;
+ while input.match_("!AND") || input.match_("&") {
+ negations += 1;
+ }
+
+ let operand = self.evaluate_relational(input)?;
+ if negations == 0 {
+ return Some(operand);
+ }
+
+ let mut b = operand != "0";
+ if negations.is_odd() {
+ b = !b;
+ }
+ Some(bool_to_string(b))
+ }
+
+ fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let mut lhs = self.evaluate_not(input)?;
+ while input.match_("!AND") || input.match_("&") {
+ let rhs = self.evaluate_not(input)?;
+ lhs = bool_to_string(lhs != "0" && rhs != "0");
+ }
+ Some(lhs)
+ }
+ fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option<String> {
+ let mut lhs = self.evaluate_and(input)?;
+ while input.match_("!OR") || input.match_("|") {
+ let rhs = self.evaluate_and(input)?;
+ lhs = bool_to_string(lhs != "0" || rhs != "0");
+ }
+ Some(lhs)
+ }
+
    /// Evaluates a complete macro expression.  `!OR` binds loosest, so an
    /// expression is a disjunction.
    fn evaluate_expression(&mut self, input: &mut MacroTokens) -> Option<String> {
        self.evaluate_or(input)
    }
+
+ fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option<f64> {
+ let s = self.evaluate_expression(input)?;
+ let tokens = tokenize_string(&s, self.mode, self.error);
+ let (
+ Some(MacroToken {
+ token: Token::Number(number),
+ ..
+ }),
+ 1,
+ ) = (tokens.get(0), tokens.len())
+ else {
+ (self.error)(MacroError::BadNumericMacroExpression(s));
+ return None;
+ };
+
+ Some(*number)
+ }
+
+ fn find_ifend_clause<'b>(
+ input: &mut MacroTokens<'b>,
+ ) -> Option<(MacroTokens<'b>, IfEndClause)> {
+ let input_copy = input.clone();
+ let mut nesting = 0;
+ while !input.is_empty() {
+ if input.match_("!IF") {
+ nesting += 1;
+ } else if input.match_("!IFEND") {
+ if nesting == 0 {
+ return Some((
+ MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
+ IfEndClause::IfEnd,
+ ));
+ }
+ nesting -= 1;
+ } else if input.match_("!ELSE") && nesting == 0 {
+ return Some((
+ MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
+ IfEndClause::Else,
+ ));
+ } else {
+ input.advance();
+ }
+ }
+ return None;
+ }
    /// Expands `!IF expr !THEN ... [!ELSE ...] !IFEND`, appending the chosen
    /// branch's expansion to `output`.  Returns true if the construct was
    /// recognized and consumed; on a malformed construct, reports an error
    /// and returns false, leaving `orig_input` unchanged.
    fn expand_if(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
        let mut input = orig_input.clone();
        if !input.match_("!IF") {
            return false;
        }
        let Some(result) = self.evaluate_expression(&mut input) else {
            return false;
        };
        if !input.match_("!THEN") {
            (self.error)(MacroError::ExpectingThen);
            return false;
        }

        // Collect the !THEN branch, up to !ELSE or !IFEND.
        let Some((if_tokens, clause)) = Self::find_ifend_clause(&mut input) else {
            (self.error)(MacroError::ExpectingElseOrIfEnd);
            return false;
        };

        // If there was an !ELSE, collect its branch, which must end in
        // !IFEND (the refutable let-else pattern enforces that).
        let else_tokens = match clause {
            IfEndClause::Else => {
                let Some((else_tokens, IfEndClause::IfEnd)) = Self::find_ifend_clause(&mut input)
                else {
                    (self.error)(MacroError::ExpectingIfEnd);
                    return false;
                };
                Some(else_tokens)
            }
            IfEndClause::IfEnd => None,
        };

        // "0" is false, anything else is true.
        let subinput = match result.as_str() {
            "0" => else_tokens,
            _ => Some(if_tokens),
        };
        if let Some(mut subinput) = subinput {
            // Expand the chosen branch inside an `!IF` stack frame for
            // error reporting.
            self.stack.push(Frame {
                name: Some(Identifier::new("!IF").unwrap()),
                location: None,
            });
            self.expand(&mut subinput, output);
            self.stack.pop();
        }
        *orig_input = input;
        true
    }
+
+ fn take_macro_var_name(
+ &mut self,
+ input: &mut MacroTokens,
+ construct: &'static str,
+ ) -> Option<Identifier> {
+ let Some(var_name) = input.take_macro_id() else {
+ (self.error)(MacroError::ExpectingMacroVarName(construct));
+ return None;
+ };
+ if is_macro_keyword(var_name)
+ || self
+ .macro_
+ .map(|m| m.find_parameter(var_name))
+ .flatten()
+ .is_some()
+ {
+ (self.error)(MacroError::BadMacroVarName {
+ name: var_name.clone(),
+ construct,
+ });
+ None
+ } else {
+ Some(var_name.clone())
+ }
+ }
+
    /// Expands `!LET !var = expression`, storing the value in `vars`.
    /// Returns true if `orig_input` began with `!LET`; on a malformed
    /// construct, reports an error and returns false without consuming
    /// anything from `orig_input`.
    fn expand_let(&mut self, orig_input: &mut MacroTokens) -> bool {
        let mut input = orig_input.clone();
        if !input.match_("!LET") {
            return false;
        }

        let Some(var_name) = self.take_macro_var_name(&mut input, "!LET") else {
            return false;
        };
        // NOTE(review): `expand_do` calls `take_macro_var_name` with no
        // following `advance()`; one of the two call sites looks wrong.
        // Confirm whether `take_macro_id` consumes the name token.
        input.advance();

        if !input.match_("=") {
            (self.error)(MacroError::ExpectingEquals);
            return false;
        }

        // The right-hand side is a full macro expression.
        let Some(value) = self.evaluate_expression(&mut input) else {
            return false;
        };
        self.vars.borrow_mut().insert(var_name.clone(), value);
        // Commit the consumed tokens only on success.
        *orig_input = input;
        true
    }
+
+ fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option<MacroTokens<'b>> {
+ let input_copy = input.clone();
+ let mut nesting = 0;
+ while !input.is_empty() {
+ if input.match_("!DO") {
+ nesting += 1;
+ } else if input.match_("!DOEND") {
+ if nesting == 0 {
+ return Some(MacroTokens(
+ &input_copy.0[..input_copy.0.len() - input.0.len() - 1],
+ ));
+ }
+ nesting -= 1;
+ } else {
+ input.advance();
+ }
+ }
+ (self.error)(MacroError::MissingDoEnd);
+ return None;
+ }
+
+ fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
+ let mut input = orig_input.clone();
+ if !input.match_("!DO") {
+ return false;
+ }
+
+ let Some(var_name) = self.take_macro_var_name(&mut input, "!DO") else {
+ return false;
+ };
+
+ let (items, miterate_error) = if input.match_("!IN") {
+ let Some(list) = self.evaluate_expression(&mut input) else {
+ return false;
+ };
+ let items = tokenize_string(list.as_str(), self.mode, &self.error);
+ (
+ DoInput::from_list(items),
+ MacroError::MiterateList(Settings::global().macros.max_iterations),
+ )
+ } else if input.match_("=") {
+ let Some(first) = self.evaluate_number(&mut input) else {
+ return false;
+ };
+ if !input.match_("!TO") {
+ (self.error)(MacroError::ExpectingTo);
+ return false;
+ }
+ let Some(last) = self.evaluate_number(&mut input) else {
+ return false;
+ };
+ let by = if input.match_("!BY") {
+ let Some(by) = self.evaluate_number(&mut input) else {
+ return false;
+ };
+ if by == 0.0 {
+ (self.error)(MacroError::ZeroBy);
+ return false;
+ }
+ by
+ } else {
+ 1.0
+ };
+ (
+ DoInput::from_by(first, last, by),
+ MacroError::MiterateNumeric(Settings::global().macros.max_iterations),
+ )
+ } else {
+ (self.error)(MacroError::ExpectingEqualsOrIn);
+ return false;
+ };
+
+ let Some(body) = self.find_doend(&mut input) else {
+ return false;
+ };
+
+ let mut stack = take(&mut self.stack);
+ stack.push(Frame {
+ name: Some(Identifier::new("!DO").unwrap()),
+ location: None,
+ });
+ let mut break_ = false;
+ let mut subexpander = Expander {
+ break_: Some(&mut break_),
+ stack,
+ vars: self.vars,
+ ..*self
+ };
+
+ for (i, item) in items.enumerate() {
+ if subexpander.should_break() {
+ break;
+ }
+ if i >= Settings::global().macros.max_iterations {
+ (self.error)(miterate_error);
+ break;
+ }
+ let mut vars = self.vars.borrow_mut();
+ if let Some(value) = vars.get_mut(&var_name) {
+ *value = item;
+ } else {
+ vars.insert(var_name.clone(), item);
+ }
+ subexpander.expand(&mut body.clone(), output);
+ }
+ *orig_input = input;
+ true
+ }
+
    /// Expands the leading token or construct of `input`, appending the
    /// expansion to `output` and advancing `input` past whatever was
    /// consumed.  `input` must not be empty (token 0 is indexed directly).
    fn expand__(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
        // Recursive macro calls.
        if self.may_expand() {
            if let Some(call) = Call::for_tokens(self.macros, &input.0, &self.error) {
                // Expand the called macro in a child expander with one less
                // level of allowed nesting and a fresh variable scope.
                let vars = RefCell::new(BTreeMap::new());
                let mut stack = take(&mut self.stack);
                stack.push(Frame {
                    name: Some(call.0.macro_.name.clone()),
                    location: Some(call.0.macro_.location.clone()),
                });
                let mut subexpander = Expander {
                    break_: None,
                    vars: &vars,
                    nesting_countdown: self.nesting_countdown.saturating_sub(1),
                    stack,
                    ..*self
                };
                let mut body = MacroTokens(call.0.macro_.body.as_slice());
                subexpander.expand(&mut body, output);
                // Take the stack back and drop the frame pushed above.
                self.stack = subexpander.stack;
                self.stack.pop();
                input.0 = &input.0[call.len()..];
                return;
            }
        }

        // Only identifiers beginning with `!` receive further processing.
        let id = match &input.0[0].token {
            Token::Id(id) if id.0.starts_with('!') => id,
            Token::Punct(Punct::BangAsterisk) => {
                // `!*` expands the macro's arguments in order.
                // NOTE(review): unlike the `!*` handling in
                // `parse_function_arg`, this does not stop at the first
                // non-positional parameter — confirm which is intended.
                if let Some(macro_) = self.macro_ {
                    for i in 0..macro_.parameters.len() {
                        self.expand_arg(i, output);
                    }
                } else {
                    (self.error)(MacroError::InvalidBangAsterisk);
                }
                input.advance();
                return;
            }
            _ => {
                // Ordinary token: pass it through unchanged.
                output.push(input.advance().clone());
                return;
            }
        };

        // Macro arguments.
        if let Some(macro_) = self.macro_ {
            if let Some(param_idx) = macro_.find_parameter(id) {
                self.expand_arg(param_idx, output);
                input.advance();
                return;
            }
        }

        // Variables set by `!DO` or `!LET`.
        if let Some(value) = self.vars.borrow().get(id) {
            tokenize_string_into(value.as_str(), self.mode, &self.error, output);
            input.advance();
            return;
        }

        // Macro constructs: `!IF`, `!LET`, `!DO`.
        if self.expand_if(input, output) {
            return;
        }
        if self.expand_let(input) {
            return;
        }
        if self.expand_do(input, output) {
            return;
        }

        // `!BREAK` terminates the innermost `!DO` loop.
        if input.match_("!BREAK") {
            if let Some(ref mut break_) = self.break_ {
                **break_ = true;
            } else {
                (self.error)(MacroError::BreakOutsideDo);
            }
            return;
        }

        // `!ONEXPAND`/`!OFFEXPAND` toggle macro expansion; anything else
        // passes through unchanged.
        if input.match_("!ONEXPAND") {
            *self.expand.borrow_mut() = true;
        } else if input.match_("!OFFEXPAND") {
            *self.expand.borrow_mut() = false;
        } else {
            output.push(input.advance().clone());
        }
    }
+}
+
/// A parsed macro call, ready to be expanded with [Call::expand].
pub struct Call<'a>(Parser<'a>);
+
+impl<'a> Call<'a> {
+ pub fn for_tokens<F>(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option<Self>
+ where
+ F: Fn(MacroError),
+ {
+ let mut parser = Parser::new(macros, &tokens.get(0)?.token)?;
+ for token in tokens[1..].iter().chain(&[MacroToken {
+ token: Token::EndCommand,
+ syntax: String::from(""),
+ }]) {
+ if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete {
+ return Some(parser.finish());
+ }
+ }
+ return None;
+ }
+
+ pub fn expand<F>(&self, mode: Mode, call_loc: Location, output: &mut Vec<MacroToken>, error: F)
+ where
+ F: Fn(MacroError) + 'a,
+ {
+ let error: Box<dyn Fn(MacroError) + 'a> = Box::new(error);
+ let vars = RefCell::new(BTreeMap::new());
+ let expand = RefCell::new(true);
+ let mut me = Expander {
+ macros: self.0.macros,
+ error: &error,
+ macro_: Some(self.0.macro_),
+ args: Some(&self.0.args),
+ mode,
+ nesting_countdown: Settings::global().macros.max_nest,
+ stack: vec![
+ Frame {
+ name: None,
+ location: Some(call_loc),
+ },
+ Frame {
+ name: Some(self.0.macro_.name.clone()),
+ location: Some(self.0.macro_.location.clone()),
+ },
+ ],
+ vars: &vars,
+ break_: None,
+ expand: &expand,
+ };
+ let mut body = MacroTokens(&self.0.macro_.body);
+ me.expand(&mut body, output);
+ }
+
+ /// Returns the number of tokens consumed from the input for the macro
+ /// invocation. If the result is 0, then there was no macro invocation and
+ /// the expansion will be empty.
+ pub fn len(&self) -> usize {
+ self.0.n_tokens
+ }
+}
--- /dev/null
+/* PSPP - a program for statistical analysis.
+ * Copyright (C) 2023 Free Software Foundation, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+use anyhow::Result;
+use clap::{Parser, ValueEnum};
+use encoding_rs::Encoding;
+use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
+use std::fs::File;
+use std::io::BufReader;
+use std::path::{Path, PathBuf};
+use std::str;
+use thiserror::Error as ThisError;
+
/// A utility to dissect SPSS system files.
// Field-level `//` comments are deliberate: clap turns `///` doc comments
// into --help text, so adding them would change runtime output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Maximum number of cases to print.
    // Exposed on the command line as `--data`; the default of 0 prints no
    // case data.
    #[arg(long = "data", default_value_t = 0)]
    max_cases: u64,

    /// Files to dissect.
    #[arg(required = true)]
    files: Vec<PathBuf>,

    /// How to dissect the file.
    #[arg(short, long, value_enum, default_value_t)]
    mode: Mode,

    /// The encoding to use.
    // When omitted, `dissect` derives the encoding from the file's headers.
    #[arg(long, value_parser = parse_encoding)]
    encoding: Option<&'static Encoding>,
}
+
/// Error for an encoding label that `encoding_rs` does not recognize.
#[derive(ThisError, Debug)]
#[error("{0}: unknown encoding")]
struct UnknownEncodingError(String);
+
+fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> {
+ match Encoding::for_label_no_replacement(arg.as_bytes()) {
+ Some(encoding) => Ok(encoding),
+ None => Err(UnknownEncodingError(arg.to_string())),
+ }
+}
+
/// What to print about each input file.
// Variant comments use `//` because clap's ValueEnum turns `///` doc
// comments into --help text.
#[derive(Clone, Copy, Debug, Default, ValueEnum)]
enum Mode {
    // Print only the file variant identified by the header's magic number.
    Identify,
    // Dump raw records, without character decoding.
    Raw,
    // Dump records after decoding with the detected (or given) encoding.
    Decoded,
    // Fully interpreted output (currently a stub; see `dissect`).
    #[default]
    Cooked,
}
+
+fn main() -> Result<()> {
+ let Args {
+ max_cases,
+ files,
+ mode,
+ encoding,
+ } = Args::parse();
+
+ for file in files {
+ dissect(&file, max_cases, mode, encoding)?;
+ }
+ Ok(())
+}
+
/// Dissects one system file, printing the result to stdout per `mode`.
///
/// `max_cases` bounds how many cases of data are printed (where case
/// printing is implemented); `encoding`, if given, overrides the encoding
/// detected from the file's headers.
fn dissect(
    file_name: &Path,
    max_cases: u64,
    mode: Mode,
    encoding: Option<&'static Encoding>,
) -> Result<()> {
    let reader = File::open(file_name)?;
    let reader = BufReader::new(reader);
    // Raw record reader; warnings are printed as they are encountered.
    let mut reader = Reader::new(reader, |warning| println!("{warning}"))?;

    match mode {
        Mode::Identify => {
            // The first record is always the header; its magic number
            // identifies the file variant.
            let Record::Header(header) = reader.next().unwrap()? else {
                unreachable!()
            };
            match header.magic {
                Magic::Sav => println!("SPSS System File"),
                Magic::Zsav => println!("SPSS System File with Zlib compression"),
                Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
            }
            return Ok(());
        }
        Mode::Raw => {
            // Dump records as read, without character decoding; print up to
            // `max_cases` cases of raw data.
            for header in reader {
                let header = header?;
                println!("{:?}", header);
                if let Record::Cases(cases) = header {
                    let mut cases = cases.borrow_mut();
                    for _ in 0..max_cases {
                        let Some(Ok(record)) = cases.next() else {
                            break;
                        };
                        println!("{:?}", record);
                    }
                }
            }
        }
        Mode::Decoded => {
            // Read every record first, then decode them all using the
            // detected (or user-supplied) character encoding.
            let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
            let encoding = match encoding {
                Some(encoding) => encoding,
                None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?,
            };
            let decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
            for header in headers {
                let header = header.decode(&decoder);
                println!("{:?}", header);
                // Case printing for decoded records is not implemented yet.
                /*
                if let Record::Cases(cases) = header {
                    let mut cases = cases.borrow_mut();
                    for _ in 0..max_cases {
                        let Some(Ok(record)) = cases.next() else {
                            break;
                        };
                        println!("{:?}", record);
                    }
                }
                */
            }
        }
        Mode::Cooked => {
            // Fully interpreted output is not implemented yet.
            /*
            let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
            let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?;
            let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?;
            for header in headers {
                println!("{header:?}");
            }
            */
        }
    }

    Ok(())
}
--- /dev/null
+use std::{
+ cmp::{max, min},
+ fmt::{Display, Formatter, Result as FmtResult},
+ ops::Range,
+ sync::Arc,
+};
+
+use enum_map::Enum;
+use unicode_width::UnicodeWidthStr;
+
/// A line number and optional column number within a source file.
///
/// Ordering is derived, so points compare first by line, then by column.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Point {
    /// 1-based line number.
    pub line: i32,

    /// 1-based column number.
    ///
    /// Column numbers are measured according to the width of characters as
    /// shown in a typical fixed-width font, in which CJK characters have width
    /// 2 and combining characters have width 0, as measured by the
    /// `unicode_width` crate.
    pub column: Option<i32>,
}
+
+impl Point {
+ /// Takes `point`, adds to it the syntax in `syntax`, incrementing the line
+ /// number for each new-line in `syntax` and the column number for each
+ /// column, and returns the result.
+ pub fn advance(&self, syntax: &str) -> Self {
+ let mut result = *self;
+ for line in syntax.split_inclusive('\n') {
+ if line.ends_with('\n') {
+ result.line += 1;
+ result.column = Some(1);
+ } else {
+ result.column = result.column.map(|column| column + line.width() as i32);
+ }
+ }
+ result
+ }
+
+ pub fn without_column(&self) -> Self {
+ Self {
+ line: self.line,
+ column: None,
+ }
+ }
+}
+
/// Location relevant to a diagnostic message.
#[derive(Clone, Debug)]
pub struct Location {
    /// File name, if any.
    pub file_name: Option<Arc<String>>,

    /// Starting and ending point, if any.
    pub span: Option<Range<Point>>,

    /// Normally, if `span` contains column information, then displaying the
    /// message will underline the location. Setting this to true disables
    /// displaying underlines.
    pub omit_underlines: bool,
}
+
impl Display for Location {
    /// Formats the location compiler-style: `file`, `file:l1[-l2]`, or
    /// `file:l1.c1-[l2.]c2` depending on what information is present.
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        if let Some(file_name) = &self.file_name {
            write!(f, "{}", file_name)?;
        }

        if let Some(span) = &self.span {
            if self.file_name.is_some() {
                write!(f, ":")?;
            }
            let l1 = span.start.line;
            let l2 = span.end.line;
            if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) {
                // `c2 - 1`: apparently the end column is exclusive and is
                // displayed as the last included column — confirm against
                // the producers of `Point` spans.
                if l2 > l1 {
                    write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?;
                } else {
                    write!(f, "{l1}.{c1}-{}", c2 - 1)?;
                }
            } else {
                if l2 > l1 {
                    write!(f, "{l1}-{l2}")?;
                } else {
                    write!(f, "{l1}")?;
                }
            }
        }
        Ok(())
    }
}
+
+impl Location {
+ pub fn without_columns(&self) -> Self {
+ Self {
+ file_name: self.file_name.clone(),
+ span: self
+ .span
+ .as_ref()
+ .map(|span| span.start.without_column()..span.end.without_column()),
+ omit_underlines: self.omit_underlines,
+ }
+ }
+ pub fn merge(a: Option<Self>, b: &Option<Self>) -> Option<Self> {
+ let Some(a) = a else { return b.clone() };
+ let Some(b) = b else { return Some(a) };
+ if a.file_name != b.file_name {
+ // Failure.
+ return Some(a);
+ }
+ let span = match (&a.span, &b.span) {
+ (None, None) => None,
+ (Some(r), None) | (None, Some(r)) => Some(r.clone()),
+ (Some(ar), Some(br)) => {
+ Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone())
+ }
+ };
+ Some(Self {
+ file_name: a.file_name,
+ span,
+ omit_underlines: a.omit_underlines || b.omit_underlines,
+ })
+ }
+ pub fn is_empty(&self) -> bool {
+ self.file_name.is_none() && self.span.is_none()
+ }
+}
+
/// Severity of a [Diagnostic].
#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)]
pub enum Severity {
    Error,
    Warning,
    Note,
}
+
impl Severity {
    /// Returns the lowercase keyword used when displaying this severity.
    fn as_str(&self) -> &'static str {
        match self {
            Severity::Error => "error",
            Severity::Warning => "warning",
            Severity::Note => "note",
        }
    }
}
+
+impl Display for Severity {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ write!(f, "{}", self.as_str())
+ }
+}
+
/// Broad category of a [Diagnostic].
// `General` suppresses the location prefix and `Syntax` adds the command
// name when the diagnostic is displayed (see `Display for Diagnostic`).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum Category {
    General,
    Syntax,
    Data,
}
+
/// One frame of context (e.g. a macro expansion) attached to a [Diagnostic].
pub struct Stack {
    /// Where this frame arose.
    location: Location,
    /// Human-readable description of the frame.
    description: String,
}
+
/// A diagnostic message ready to be displayed to the user.
pub struct Diagnostic {
    pub severity: Severity,
    pub category: Category,
    /// Primary location of the problem.
    pub location: Location,
    /// Source lines to quote, as `(line number, line text)` pairs.
    pub source: Vec<(i32, String)>,
    /// Outer context frames, e.g. macro expansions.
    pub stack: Vec<Stack>,
    /// Name of the command being parsed, if any.
    pub command_name: Option<&'static str>,
    /// The message text itself.
    pub text: String,
}
+
+impl Display for Diagnostic {
+ fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
+ for Stack {
+ location,
+ description,
+ } in &self.stack
+ {
+ if !!location.is_empty() {
+ write!(f, "{location}: ")?;
+ }
+ writeln!(f, "{description}")?;
+ }
+ if self.category != Category::General && !self.location.is_empty() {
+ write!(f, "{}: ", self.location)?;
+ }
+
+ write!(f, "{}: ", self.severity)?;
+
+ match self.command_name {
+ Some(command_name) if self.category == Category::Syntax => {
+ write!(f, "{command_name}: ")?
+ }
+ _ => (),
+ }
+
+ write!(f, "{}", self.text)?;
+
+ if let Some(Range {
+ start: Point {
+ line: l0,
+ column: Some(c0),
+ },
+ end: Point {
+ line: l1,
+ column: Some(c1),
+ },
+ }) = self.location.span
+ {
+ let mut prev_line_number = None;
+ for (line_number, line) in &self.source {
+ if let Some(prev_line_number) = prev_line_number {
+ if *line_number != prev_line_number + 1 {
+ write!(f, "\n ... |")?;
+ }
+ }
+ prev_line_number = Some(line_number);
+
+ write!(f, "\n{line_number:5} | {line}")?;
+
+ if !self.location.omit_underlines {
+ let c0 = if *line_number == l0 { c0 } else { 1 };
+ let c1 = if *line_number == l1 {
+ c1
+ } else {
+ line.width() as i32
+ };
+ write!(f, "\n |")?;
+ for _ in 0..c0 {
+ f.write_str(" ")?;
+ }
+ if *line_number == l0 {
+ f.write_str("^")?;
+ for _ in c0..c1 {
+ f.write_str("~")?;
+ }
+ } else {
+ for _ in c0..=c1 {
+ f.write_str("~")?;
+ }
+ }
+ }
+ }
+ }
+ Ok(())
+ }
+}
--- /dev/null
+use std::sync::Arc;
+
+use self::pivot::Value;
+
+pub mod pivot;
+
/// A single output item.
pub struct Item {
    /// The localized label for the item that appears in the outline pane in the
    /// output viewer and in PDF outlines. This is `None` if no label has been
    /// explicitly set.
    label: Option<String>,

    /// A locale-invariant identifier for the command that produced the output,
    /// which may be `None` if unknown or if a command did not produce this
    /// output.
    command_name: Option<String>,

    /// For a group item, this is true if the group's subtree should
    /// be expanded in an outline view, false otherwise.
    ///
    /// For other kinds of output items, this is true to show the item's
    /// content, false to hide it. The item's label is always shown in an
    /// outline view.
    show: bool,

    /// Item details.
    details: Details,
}
+
/// The kind-specific content of an [Item].
// NOTE(review): the unit variants carry no payload yet and appear to be
// placeholders to be fleshed out later.
pub enum Details {
    Chart,
    Image,
    /// A group of child items, e.g. all the output of a single command.
    Group(Vec<Arc<Item>>),
    Message,
    Table,
    /// A block of text.
    Text(Text),
}
+
/// A text output item.
pub struct Text {
    /// What kind of text this is.
    type_: TextType,

    /// The text content.
    content: Value,
}
+
/// The kind of text carried by a [Text] item.
pub enum TextType {
    /// `TITLE` and `SUBTITLE` commands.
    PageTitle,

    /// Title.
    Title,

    /// Syntax printback logging.
    Syntax,

    /// Other logging.
    Log,
}
--- /dev/null
+//! Pivot tables.
+//!
+//! Pivot tables are PSPP's primary form of output. They are analogous to the
+//! pivot tables you might be familiar with from spreadsheets and databases.
+//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to
+//! the overall concept of a pivot table.
+//!
+//! In PSPP, the most important internal pieces of a pivot table are:
+//!
+//! - Title. Every pivot table has a title that is displayed above it. It also
+//! has an optional caption (displayed below it) and corner text (displayed in
+//! the upper left corner).
+//!
+//! - Dimensions. A dimension consists of zero or more categories. A category
+//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The
+//! categories are the leaves of a tree whose non-leaf nodes form groups of
+//! categories. The tree always has a root group whose label is the name of
+//! the dimension.
+//!
+//! - Axes. A table has three axes: column, row, and layer. Each dimension is
+//! assigned to an axis, and each axis has zero or more dimensions. When an
+//! axis has more than one dimension, they are ordered from innermost to
+//! outermost.
+//!
+//! - Data. A table's data consists of zero or more cells. Each cell maps from
+//! a category for each dimension to a value, which is commonly a number but
+//! could also be a variable name or an arbitrary text string.
+//!
+//! Creating a pivot table usually consists of the following steps:
+//!
+//! 1. Create the table with pivot_table_create(), passing in the title.
+//!
+//! 2. Optionally, set the format to use for "count" values with
+//! pivot_table_set_weight_var() or pivot_table_set_weight_format().
+//!
+//! 3. Create each dimension with pivot_dimension_create() and populate it with
+//! categories and, possibly, with groups that contain the categories. This
+//! call also assigns the dimension to an axis.
+//!
+//! In simple cases, only a call to pivot_dimension_create() is needed.
+//! Other functions such as pivot_category_create_group() can be used for
+//! hierarchies of categories.
+//!
+//! Sometimes it's easier to create categories in tandem with inserting data,
+//! for example by adding a category for a variable just before inserting the
+//! first cell for that variable. In that case, creating categories and
+//! inserting data can be interleaved.
+//!
+//! 4. Insert data. For each cell, supply the category indexes, which are
+//! assigned starting from 0 in the order in which the categories were
+//! created in step 3, and the value to go in the cell. If the table has a
+//! small, fixed number of dimensions, functions like, e.g.
+//! pivot_table_put3() for 3 dimensions, can be used. The general function
+//! pivot_table_put() works for other cases.
+//!
+//! 5. Output the table for user consumption. Use pivot_table_submit().
+
+use std::{
+ collections::HashMap,
+ ops::Range,
+ sync::{Arc, OnceLock},
+};
+
+use chrono::NaiveDateTime;
+use enum_map::{enum_map, Enum, EnumMap};
+
+use crate::format::{Format, Settings as FormatSettings};
+
/// Areas of a pivot table for styling purposes.
#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)]
pub enum Area {
    Title,
    Caption,

    /// Footnotes.
    Footer,

    /// Top-left corner.
    Corner,

    ColumnLabels,
    RowLabels,
    Data,

    /// Layer indication.
    Layers,
}
+
/// Table borders for styling purposes.
#[derive(Debug, Enum)]
pub enum Border {
    Title,
    OuterFrame(BoxBorder),
    InnerFrame(BoxBorder),
    Dimensions(RowColBorder),
    Categories(RowColBorder),
    /// Border to the left of the data area.
    DataLeft,
    /// Border above the data area.
    DataTop,
}
+
/// The borders on a box: one variant per side.
#[derive(Debug, Enum)]
pub enum BoxBorder {
    Left,
    Top,
    Right,
    Bottom,
}
+
/// Borders between rows and columns.
// NOTE(review): presumably RowHorz/RowVert are the horizontal/vertical
// borders in row-label regions and ColHorz/ColVert in column-label regions —
// confirm with the rendering code.
#[derive(Debug, Enum, PartialEq, Eq)]
pub enum RowColBorder {
    RowHorz,
    RowVert,
    ColHorz,
    ColVert,
}
+
/// Sizing for rows or columns of a rendered table.
///
/// The comments below talk about columns and their widths but they apply
/// equally to rows and their heights.
#[derive(Default)]
pub struct Sizing {
    /// Specific column widths, in 1/96" units.
    widths: Vec<i32>,

    /// Specific page breaks: 0-based columns after which a page break must
    /// occur, e.g. a value of 1 requests a break after the second column.
    breaks: Vec<usize>,

    /// Keeps: columns to keep together on a page if possible.
    keeps: Vec<Range<usize>>,
}
+
/// The three axes of a pivot table: column (X), row (Y), and layer (Z).
#[derive(Enum)]
pub enum Axis3 {
    X,
    Y,
    Z,
}
+
/// An axis within a pivot table.
#[derive(Default)]
pub struct TableAxis {
    /// `dimensions[0]` is the innermost dimension.
    dimensions: Vec<Dimension>,

    /// The number of rows or columns along the axis, that is, the product of
    /// `dimensions[*].n_leaves`. It is 0 if any dimension has 0 leaves.
    extent: usize,

    /// Sum of `dimensions[*].label_depth`.
    label_depth: usize,
}
+
/// Dimensions.
///
/// A [Dimension] identifies the categories associated with a single dimension
/// within a multidimensional pivot table.
///
/// A dimension contains a collection of categories, which are the leaves in a
/// tree of groups.
///
/// (A dimension or a group can contain zero categories, but this is unusual.
/// If a dimension contains no categories, then its table cannot contain any
/// data.)
pub struct Dimension {
    /// The axis this dimension is assigned to.
    axis_type: Axis3,
    // NOTE(review): presumably the position of this dimension along its
    // axis, and the data-index offset of its first leaf — confirm.
    level: usize,

    top_index: usize,

    /// Hierarchy of categories within the dimension. The groups and categories
    /// are sorted in the order that should be used for display. This might be
    /// different from the original order produced for output if the user
    /// adjusted it.
    ///
    /// The root must always be a group, although it is allowed to have no
    /// subcategories.
    root: Group,

    /// All of the leaves reachable via the root.
    ///
    /// The indexing for `presentation_leaves` is presentation order, thus
    /// `presentation_leaves[i].presentation_index == i`. This order is the
    /// same as would be produced by an in-order traversal of the groups. It
    /// is the order into which the user reordered or sorted the categories.
    ///
    /// The indexing for `data_leaves` is that used for `idx` in [Cell], thus
    /// `data_leaves[i].data_index == i`. This might differ from what an
    /// in-order traversal of `root` would yield, if the user reordered
    /// categories.
    data_leaves: Vec<Arc<Leaf>>,
    presentation_leaves: Vec<Arc<Leaf>>,

    /// Display.
    hide_all_labels: bool,

    /// Number of rows or columns needed to express the labels.
    label_depth: usize,
}
+
/// A non-leaf node in a dimension's category tree.
pub struct Group {
    /// The group's label.
    name: Value,
    label_depth: usize,
    extra_depth: usize,

    /// The child categories.
    ///
    /// A group usually has multiple children, but it is allowed to have
    /// only one or even (pathologically) none.
    children: Vec<Category>,

    /// Display a label for the group itself?
    show_label: bool,

    show_label_in_corner: bool,
}
+
/// A leaf node (an actual category) in a dimension's category tree.
pub struct Leaf {
    /// The category's label.
    name: Value,
    label_depth: usize,
    extra_depth: usize,

    group_index: usize,
    data_index: usize,
    presentation_index: usize,

    /// Default format for values in this category.
    format: Format,

    /// Honor [Table]'s `small` setting?
    honor_small: bool,
}
+
+/// A pivot_category is a leaf (a category) or a group.
+pub enum Category {
+    /// A (possibly nested) group of categories.
+    Group(Arc<Group>),
+    /// A single category.
+    Leaf(Arc<Leaf>),
+}
+
+/// Accessors common to [Group], [Leaf], and [Category].
+trait CategoryTrait {
+    /// The name used to label the category.
+    fn name(&self) -> &Value;
+    /// The category's `label_depth` field.
+    fn label_depth(&self) -> usize;
+    /// The category's `extra_depth` field.
+    fn extra_depth(&self) -> usize;
+}
+
+// Trivial field accessors.
+impl CategoryTrait for Group {
+    fn name(&self) -> &Value {
+        &self.name
+    }
+
+    fn label_depth(&self) -> usize {
+        self.label_depth
+    }
+
+    fn extra_depth(&self) -> usize {
+        self.extra_depth
+    }
+}
+
+// Trivial field accessors.
+impl CategoryTrait for Leaf {
+    fn name(&self) -> &Value {
+        &self.name
+    }
+
+    fn label_depth(&self) -> usize {
+        self.label_depth
+    }
+
+    fn extra_depth(&self) -> usize {
+        self.extra_depth
+    }
+}
+
+// Delegates each accessor to the underlying group or leaf.
+impl CategoryTrait for Category {
+    fn name(&self) -> &Value {
+        match self {
+            Category::Group(group) => group.name(),
+            Category::Leaf(leaf) => leaf.name(),
+        }
+    }
+
+    fn label_depth(&self) -> usize {
+        match self {
+            Category::Group(group) => group.label_depth(),
+            Category::Leaf(leaf) => leaf.label_depth(),
+        }
+    }
+
+    fn extra_depth(&self) -> usize {
+        match self {
+            Category::Group(group) => group.extra_depth(),
+            Category::Leaf(leaf) => leaf.extra_depth(),
+        }
+    }
+}
+
+/// Styling for a pivot table.
+///
+/// The division between this and the style information in [Table] seems fairly
+/// arbitrary. The ultimate reason for the division is simply because that's
+/// how SPSS documentation and file formats do it.
+struct Look {
+    /// The look's name, if it has one.
+    name: Option<String>,
+
+    /// Omit empty rows and columns?
+    omit_empty: bool,
+    /// Display row labels in the table corner?
+    row_labels_in_corner: bool,
+
+    /// Range of column widths for columns in the row headings and corner, in
+    /// 1/96" units.
+    row_heading_widths: Range<usize>,
+
+    /// Range of column widths for columns in the column headings, in 1/96"
+    /// units.
+    col_heading_widths: Range<usize>,
+
+    /// Kind of markers to use for footnotes.
+    footnote_marker_type: FootnoteMarkerType,
+
+    /// Where to put the footnote markers.
+    footnote_marker_position: FootnoteMarkerPosition,
+
+    /// Styles for areas of the pivot table.
+    areas: EnumMap<Area, AreaStyle>,
+
+    /// Styles for borders in the pivot table.
+    borders: EnumMap<Border, BorderStyle>,
+
+    /// Print all of the layers rather than only the current one?
+    print_all_layers: bool,
+
+    /// Put each layer on a separate page when printing?
+    paginate_layers: bool,
+
+    /// Shrink the table to fit, per axis?
+    shrink_to_fit: EnumMap<Axis2, bool>,
+
+    /// Display continuation text at the top of continued pages?
+    top_continuation: bool,
+
+    /// Display continuation text at the bottom of continued pages?
+    bottom_continuation: bool,
+
+    /// The continuation text itself, if any.
+    continuation: Option<String>,
+
+    // NOTE(review): presumably the minimum number of lines to keep together
+    // when breaking a table across pages — confirm.
+    n_orphan_lines: usize,
+}
+
+impl Default for Look {
+    /// Returns a [Look] with conventional defaults: empty rows and columns
+    /// omitted, alphabetic subscript footnote markers, 9-point "Sans Serif"
+    /// text (bold for the title), and visible strokes only around the frame,
+    /// the data edges, and dimension/category separators.
+    fn default() -> Self {
+        Self {
+            name: None,
+            omit_empty: true,
+            row_labels_in_corner: true,
+            row_heading_widths: 36..72,
+            col_heading_widths: 36..120,
+            footnote_marker_type: FootnoteMarkerType::Alphabetic,
+            footnote_marker_position: FootnoteMarkerPosition::Subscript,
+            areas: EnumMap::from_fn(|area| {
+                use HorzAlign::*;
+                use VertAlign::*;
+                // Alignment and margins ([left, right] and [top, bottom], in
+                // 1/96" units) vary by area; the fonts are otherwise uniform.
+                let (halign, valign, hmargins, vmargins) = match area {
+                    Area::Title => (Center, Middle, [8, 11], [1, 8]),
+                    Area::Caption => (Left, Top, [8, 11], [1, 1]),
+                    Area::Footer => (Left, Top, [11, 8], [2, 3]),
+                    Area::Corner => (Left, Bottom, [8, 11], [1, 1]),
+                    Area::ColumnLabels => (Left, Top, [8, 11], [1, 3]),
+                    Area::RowLabels => (Left, Top, [8, 11], [1, 3]),
+                    Area::Data => (Mixed, Top, [8, 11], [1, 1]),
+                    Area::Layers => (Left, Bottom, [8, 11], [1, 3]),
+                };
+                AreaStyle {
+                    cell_style: CellStyle {
+                        horz_align: halign,
+                        vert_align: valign,
+                        margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins },
+                    },
+                    font_style: FontStyle {
+                        bold: area == Area::Title,
+                        italic: false,
+                        underline: false,
+                        markup: false,
+                        font: String::from("Sans Serif"),
+                        fg: [Color::BLACK; 2],
+                        bg: [Color::WHITE; 2],
+                        size: 9,
+                    },
+                }
+            }),
+            borders: EnumMap::from_fn(|border| {
+                // Only the inner frame, the edges of the data area, and the
+                // dimension/category separators get visible strokes.
+                let stroke = match border {
+                    Border::InnerFrame(_) | Border::DataLeft | Border::DataTop => Stroke::Thick,
+                    Border::Dimensions(side) if side != RowColBorder::RowVert => Stroke::Solid,
+                    Border::Categories(RowColBorder::ColHorz | RowColBorder::ColVert) => {
+                        Stroke::Solid
+                    }
+                    _ => Stroke::None,
+                };
+                BorderStyle {
+                    stroke,
+                    color: Color::BLACK,
+                }
+            }),
+            print_all_layers: false,
+            paginate_layers: false,
+            shrink_to_fit: EnumMap::from_fn(|_| false),
+            top_continuation: false,
+            bottom_continuation: false,
+            continuation: None,
+            n_orphan_lines: 0,
+        }
+    }
+}
+
+impl Look {
+    /// Returns a shared reference to a lazily-created, cached default [Look],
+    /// avoiding constructing a fresh one for every table.
+    fn shared_default() -> Arc<Look> {
+        static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
+        LOOK.get_or_init(|| Arc::new(Look::default())).clone()
+    }
+}
+
+/// Style for one area of a pivot table.
+pub struct AreaStyle {
+    /// Alignment and margins.
+    cell_style: CellStyle,
+    /// Font and colors.
+    font_style: FontStyle,
+}
+
+/// Alignment and margins for table cells.
+pub struct CellStyle {
+    /// Horizontal alignment.
+    horz_align: HorzAlign,
+    /// Vertical alignment.
+    vert_align: VertAlign,
+
+    /// Margins in 1/96" units.
+    ///
+    /// `margins[Axis2::X][0]` is the left margin.
+    /// `margins[Axis2::X][1]` is the right margin.
+    /// `margins[Axis2::Y][0]` is the top margin.
+    /// `margins[Axis2::Y][1]` is the bottom margin.
+    margins: EnumMap<Axis2, [i32; 2]>,
+}
+
+/// Horizontal alignment of content within a cell.
+pub enum HorzAlign {
+    /// Right aligned.
+    Right,
+
+    /// Left aligned.
+    Left,
+
+    /// Centered.
+    Center,
+
+    /// Align strings to the left, other formats to the right.
+    Mixed,
+
+    /// Align the decimal point at the specified position.
+    Decimal {
+        /// Decimal offset from the right side of the cell, in 1/96" units.
+        offset: f64,
+
+        /// Decimal character: either `b'.'` or `b','`.
+        c: char,
+    },
+}
+
+/// Vertical alignment of content within a cell.
+pub enum VertAlign {
+    /// Top alignment.
+    Top,
+
+    /// Centered.
+    Middle,
+
+    /// Bottom alignment.
+    Bottom,
+}
+
+/// Font styling for text in a table area.
+pub struct FontStyle {
+    bold: bool,
+    italic: bool,
+    underline: bool,
+    // NOTE(review): presumably whether the text is interpreted as markup —
+    // confirm.
+    markup: bool,
+    /// Font family name.
+    font: String,
+    // NOTE(review): two foreground and two background colors — possibly for
+    // alternating rows; confirm.
+    fg: [Color; 2],
+    bg: [Color; 2],
+
+    /// In 1/72" units.
+    size: i32,
+}
+
+/// An RGB color with an alpha channel.
+pub struct Color {
+    /// Alpha channel ([Color::new] uses 255).
+    alpha: u8,
+    r: u8,
+    g: u8,
+    b: u8,
+}
+
+impl Color {
+    /// Black, with full (255) alpha.
+    const BLACK: Color = Color::new(0, 0, 0);
+    /// White, with full (255) alpha.
+    const WHITE: Color = Color::new(255, 255, 255);
+
+    /// Creates a color from RGB components with full (255) alpha.
+    const fn new(r: u8, g: u8, b: u8) -> Self {
+        Self {
+            alpha: 255,
+            r,
+            g,
+            b,
+        }
+    }
+}
+
+/// The style of one border in a pivot table.
+pub struct BorderStyle {
+    /// Line style.
+    stroke: Stroke,
+    /// Line color.
+    color: Color,
+}
+
+/// A line style for drawing a border.
+pub enum Stroke {
+    None,
+    Solid,
+    Dashed,
+    Thick,
+    Thin,
+    Double,
+}
+
+/// An axis of a flat table.
+#[derive(Debug, Enum)]
+pub enum Axis2 {
+    X,
+    Y,
+}
+
+/// The kind of markers used to identify footnotes.
+pub enum FootnoteMarkerType {
+    /// a, b, c, ...
+    Alphabetic,
+
+    /// 1, 2, 3, ...
+    Numeric,
+}
+
+/// How footnote markers are rendered.
+pub enum FootnoteMarkerPosition {
+    /// Subscripts.
+    Subscript,
+
+    /// Superscripts.
+    Superscript,
+}
+
+/// A pivot table.
+pub struct Table {
+    /// Styling (usually shared between tables; see [Look]).
+    look: Arc<Look>,
+
+    /// Rotate the labels of the innermost column dimension?
+    rotate_inner_column_labels: bool,
+
+    /// Rotate the labels of the outermost row dimension?
+    rotate_outer_row_labels: bool,
+
+    /// Display grid lines?
+    show_grid_lines: bool,
+
+    /// Display the table's title?
+    show_title: bool,
+
+    /// Display the table's caption?
+    show_caption: bool,
+
+    /// Default for whether values display their value, label, or both, if set.
+    show_value: Option<ValueShow>,
+
+    /// Default for whether variables display their name, label, or both, if set.
+    show_variables: Option<ValueShow>,
+
+    /// Display format for case weights.
+    weight_format: Format,
+
+    /// Current layer indexes, with axes[PIVOT_AXIS_LAYER].n_dimensions
+    /// elements. current_layer[i] is an offset into
+    /// axes[PIVOT_AXIS_LAYER].dimensions[i]->data_leaves[], EXCEPT that a
+    /// dimension can have zero leaves, in which case current_layer[i] is zero
+    /// and there's no corresponding leaf.
+    current_layer: Vec<usize>,
+
+    /// Column and row sizing and page breaks.
+    sizing: EnumMap<Axis2, Sizing>,
+
+    /// Format settings.
+    settings: FormatSettings,
+
+    /// Numeric grouping character (usually `.` or `,`).
+    grouping: Option<char>,
+
+    // NOTE(review): presumably the threshold below which values get special
+    // formatting when a leaf's `honor_small` is set — confirm semantics.
+    small: f64,
+
+    // Provenance metadata for the table.
+    command_local: Option<String>,
+    command_c: Option<String>,
+    language: Option<String>,
+    locale: Option<String>,
+    dataset: Option<String>,
+    datafile: Option<String>,
+    date: Option<NaiveDateTime>,
+    /// Footnotes that the table's values may reference by index.
+    footnotes: Vec<Footnote>,
+    title: Option<Value>,
+    subtype: Option<Value>,
+    corner_text: Option<Value>,
+    caption: Option<Value>,
+    notes: Option<String>,
+    /// All of the table's dimensions, across all axes.
+    dimensions: Vec<Dimension>,
+    /// Assignment of dimensions to the layer, row, and column axes.
+    axes: EnumMap<Axis3, TableAxis>,
+    // NOTE(review): the `u64` key presumably encodes the per-dimension leaf
+    // data indexes of a cell — confirm against the cell lookup code.
+    cells: HashMap<u64, Value>,
+}
+
+impl Table {
+    /// Creates an empty table with default styling and settings.
+    fn new() -> Self {
+        Self {
+            look: Look::shared_default(),
+            rotate_inner_column_labels: false,
+            rotate_outer_row_labels: false,
+            show_grid_lines: false,
+            show_title: true,
+            show_caption: true,
+            show_value: None,
+            show_variables: None,
+            weight_format: Format::F40,
+            current_layer: Vec::new(),
+            sizing: EnumMap::default(),
+            settings: FormatSettings::default(), // XXX from settings
+            grouping: None,
+            small: 0.0001, // XXX from settings.
+            command_local: None,
+            command_c: None, // XXX from current command name.
+            language: None,
+            locale: None,
+            dataset: None,
+            datafile: None,
+            date: None,
+            footnotes: Vec::new(),
+            subtype: None,
+            title: None,
+            corner_text: None,
+            caption: None,
+            notes: None,
+            dimensions: Vec::new(),
+            axes: EnumMap::default(),
+            cells: HashMap::new(),
+        }
+    }
+}
+
+/// Whether to show variable or value labels or the underlying value or variable name.
+pub enum ValueShow {
+    /// Value or variable name only.
+    Value,
+
+    /// Label only.
+    Label,
+
+    /// Value and label.
+    Both,
+}
+
+/// A footnote attached to a table or one of its values.
+pub struct Footnote {
+    /// The text of the footnote.
+    content: Value,
+    /// The marker displayed to reference the footnote.
+    marker: Value,
+    /// Display the footnote?
+    show: bool,
+}
+
+/// The content of a single pivot table cell.
+///
+/// A [Value] is also a pivot table's title, caption, footnote marker and
+/// contents, and so on.
+///
+/// A given [Value] is one of:
+///
+/// 1. A number resulting from a calculation.
+///
+///    A number has an associated display format (usually [F] or [Pct]). This
+///    format can be set directly, but that is not usually the easiest way.
+///    Instead, it is usually true that all of the values in a single category
+///    should have the same format (e.g. all "Significance" values might use
+///    format `F40.3`), so PSPP makes it easy to set the default format for a
+///    category while creating the category. See pivot_dimension_create() for
+///    more details.
+///
+///    [F]: crate::format::Format::F
+///    [Pct]: crate::format::Format::Pct
+///
+/// 2. A numeric or string value obtained from data (PIVOT_VALUE_NUMERIC or
+///    PIVOT_VALUE_STRING). If such a value corresponds to a variable, then the
+///    variable's name can be attached to the pivot_value. If the value has a
+///    value label, then that can also be attached. When a label is present,
+///    the user can control whether to show the value or the label or both.
+///
+/// 3. A variable name (PIVOT_VALUE_VARIABLE). The variable label, if any, can
+///    be attached too, and again the user can control whether to show the value
+///    or the label or both.
+///
+/// 4. A text string (PIVOT_VALUE_TEXT). The value stores the string in English
+///    and translated into the output language (localized). Use
+///    pivot_value_new_text() or pivot_value_new_text_format() for those cases.
+///    In some cases, only an English or a localized version is available for
+///    one reason or another, although this is regrettable; in those cases, use
+///    pivot_value_new_user_text() or pivot_value_new_user_text_nocopy().
+///
+/// 5. A template. PSPP doesn't create these itself yet, but it can read and
+///    interpret those created by SPSS.
+pub struct Value {
+    /// Optional custom styling.
+    styling: Option<Box<ValueStyle>>,
+    /// The value's content.
+    inner: ValueInner,
+}
+
+/// The content of a [Value]; see [Value] for a description of each case.
+pub enum ValueInner {
+    /// A number (case 1 or 2 in [Value]'s docs).
+    Number {
+        /// Show the value, its label, or both.
+        show: ValueShow,
+        /// Display format.
+        format: Format,
+        /// Honor [Table]'s `small` setting?
+        honor_small: bool,
+        /// The number itself.
+        value: f64,
+        /// Name of the variable the number came from, if any.
+        var_name: Option<String>,
+        /// The number's value label, if any.
+        value_label: Option<String>,
+    },
+    /// A string datum (case 2 in [Value]'s docs).
+    String {
+        /// Show the value, its label, or both.
+        show: ValueShow,
+        // NOTE(review): presumably whether to display as hex digits — confirm.
+        hex: bool,
+        // NOTE(review): unclear when this can be `None` — confirm.
+        s: Option<String>,
+        /// Name of the variable the string came from, if any.
+        var_name: Option<String>,
+        /// The string's value label, if any.
+        value_label: Option<String>,
+    },
+    /// A variable name (case 3 in [Value]'s docs).
+    Variable {
+        /// Show the name, its label, or both.
+        show: ValueShow,
+        // NOTE(review): `Option` is surprising here — confirm when the
+        // variable name can be absent.
+        var_name: Option<String>,
+        /// The variable's label, if any.
+        value_label: Option<String>,
+    },
+    /// A text string (case 4 in [Value]'s docs).
+    Text {
+        /// Distinguishes user-provided from generated text.
+        user_provided: bool,
+        /// Localized.
+        local: String,
+        /// English.
+        c: String,
+        /// Identifier.
+        id: String,
+    },
+    /// A template (case 5 in [Value]'s docs).
+    Template {
+        /// Substitution arguments.
+        args: Vec<Vec<Value>>,
+        /// Localized template text.
+        local: String,
+        /// Identifier.
+        id: String,
+    },
+}
+
+/// Styling that can be attached to a [Value].
+pub struct ValueStyle {
+    /// Font and colors.
+    font_style: FontStyle,
+    /// Alignment and margins.
+    cell_style: CellStyle,
+    /// Subscript text for the value.
+    subscripts: Vec<String>,
+    /// Indexes into the table's footnotes.
+    footnote_indexes: Vec<usize>,
+}
--- /dev/null
+/// The style of prompt to display, reflecting the state of command parsing.
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
+pub enum PromptStyle {
+    /// First line of command.
+    First,
+
+    /// Second or later line of command.
+    Later,
+
+    /// Between `BEGIN DATA` and `END DATA`.
+    Data,
+
+    /// `COMMENT` or `*` command.
+    Comment,
+
+    /// DOCUMENT command.
+    Document,
+
+    /// `DO REPEAT` command.
+    DoRepeat,
+
+    /// `DEFINE` command.
+    Define,
+}
+
+impl PromptStyle {
+    /// Returns a fixed name for this prompt style.
+    ///
+    /// NOTE(review): an inherent `to_string` shadows the blanket
+    /// [ToString::to_string] and trips Clippy's `inherent_to_string` lint;
+    /// consider renaming to `as_str` (or implementing [std::fmt::Display]) in
+    /// a follow-up. The mixed casing of the returned names appears deliberate,
+    /// so it is preserved here.
+    pub fn to_string(&self) -> &'static str {
+        match self {
+            PromptStyle::First => "first",
+            PromptStyle::Later => "later",
+            PromptStyle::Data => "data",
+            PromptStyle::Comment => "COMMENT",
+            PromptStyle::Document => "DOCUMENT",
+            PromptStyle::DoRepeat => "DO REPEAT",
+            PromptStyle::Define => "DEFINE",
+        }
+    }
+}
--- /dev/null
+use crate::{
+ dictionary::VarWidth,
+ encoding::{default_encoding, get_encoding, Error as EncodingError},
+ endian::{Endian, Parse, ToBytes},
+ identifier::{Error as IdError, Identifier},
+};
+
+use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
+use flate2::read::ZlibDecoder;
+use num::Integer;
+use std::{
+ borrow::Cow,
+ cell::RefCell,
+ cmp::Ordering,
+ collections::{HashMap, VecDeque},
+ fmt::{Debug, Display, Formatter, Result as FmtResult},
+ io::{Error as IoError, Read, Seek, SeekFrom},
+ iter::repeat,
+ mem::take,
+ ops::Range,
+ rc::Rc,
+ str::from_utf8,
+};
+use thiserror::Error as ThisError;
+
+/// An error encountered while reading a system file.
+#[derive(ThisError, Debug)]
+pub enum Error {
+    #[error("Not an SPSS system file")]
+    NotASystemFile,
+
+    #[error("Invalid magic number {0:?}")]
+    BadMagic([u8; 4]),
+
+    #[error("I/O error ({0})")]
+    Io(#[from] IoError),
+
+    #[error("Invalid SAV compression code {0}")]
+    InvalidSavCompression(u32),
+
+    #[error("Invalid ZSAV compression code {0}")]
+    InvalidZsavCompression(u32),
+
+    #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
+    BadDocumentLength { offset: u64, n: usize, max: usize },
+
+    #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
+    BadRecordType { offset: u64, rec_type: u32 },
+
+    #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
+    BadVariableWidth { start_offset: u64, width: i32 },
+
+    #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
+    BadVariableLabelCode {
+        start_offset: u64,
+        code_offset: u64,
+        code: u32,
+    },
+
+    #[error(
+        "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
+    )]
+    BadNumericMissingValueCode { offset: u64, code: i32 },
+
+    #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
+    BadStringMissingValueCode { offset: u64, code: i32 },
+
+    #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
+    BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
+
+    #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
+    ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
+
+    #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
+    TooManyVarIndexes { offset: u64, n: u32, max: u32 },
+
+    #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
+    ExtensionRecordTooLarge {
+        offset: u64,
+        subtype: u32,
+        size: u32,
+        count: u32,
+    },
+
+    #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
+    EofInCase {
+        offset: u64,
+        case_ofs: u64,
+        case_len: usize,
+    },
+
+    #[error(
+        "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
+    )]
+    EofInCompressedCase { offset: u64, case_ofs: u64 },
+
+    #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
+    PartialCompressedCase { offset: u64, case_ofs: u64 },
+
+    #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
+    CompressedNumberExpected { offset: u64, case_ofs: u64 },
+
+    #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
+    CompressedStringExpected { offset: u64, case_ofs: u64 },
+
+    #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
+    BadZlibTrailerNBlocks {
+        offset: u64,
+        n_blocks: u32,
+        expected_n_blocks: u64,
+        ztrailer_len: u64,
+    },
+
+    #[error("{0}")]
+    EncodingError(EncodingError),
+}
+
+/// A recoverable problem encountered while reading a system file, reported
+/// through a warning callback rather than aborting the read.
+#[derive(ThisError, Debug)]
+pub enum Warning {
+    #[error("Unexpected end of data inside extension record.")]
+    UnexpectedEndOfData,
+
+    #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
+    NoVarIndexes { offset: u64 },
+
+    #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
+    MixedVarTypes {
+        offset: u64,
+        var_type: VarType,
+        wrong_types: Vec<u32>,
+    },
+
+    #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
+    InvalidVarIndexes {
+        offset: u64,
+        max: usize,
+        invalid: Vec<u32>,
+    },
+
+    #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
+    BadRecordSize {
+        offset: u64,
+        record: String,
+        size: u32,
+        expected_size: u32,
+    },
+
+    #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
+    BadRecordCount {
+        offset: u64,
+        record: String,
+        count: u32,
+        expected_count: u32,
+    },
+
+    #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
+    BadLongMissingValueLength {
+        record_offset: u64,
+        offset: u64,
+        value_len: u32,
+    },
+
+    #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
+    BadEncodingName { offset: u64 },
+
+    // XXX This is risky because `text` might be arbitrarily long.
+    #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
+    MalformedString { encoding: String, text: String },
+
+    #[error("Invalid variable measurement level value {0}")]
+    InvalidMeasurement(u32),
+
+    #[error("Invalid variable display alignment value {0}")]
+    InvalidAlignment(u32),
+
+    #[error("Invalid attribute name. {0}")]
+    InvalidAttributeName(IdError),
+
+    #[error("Invalid variable name in attribute record. {0}")]
+    InvalidAttributeVariableName(IdError),
+
+    #[error("Invalid short name in long variable name record. {0}")]
+    InvalidShortName(IdError),
+
+    #[error("Invalid name in long variable name record. {0}")]
+    InvalidLongName(IdError),
+
+    #[error("Invalid variable name in very long string record. {0}")]
+    InvalidLongStringName(IdError),
+
+    #[error("Invalid variable name in variable set record. {0}")]
+    InvalidVariableSetName(IdError),
+
+    #[error("Invalid multiple response set name. {0}")]
+    InvalidMrSetName(IdError),
+
+    #[error("Invalid multiple response set variable name. {0}")]
+    InvalidMrSetVariableName(IdError),
+
+    #[error("Invalid variable name in long string missing values record. {0}")]
+    InvalidLongStringMissingValueVariableName(IdError),
+
+    #[error("Invalid variable name in long string value label record. {0}")]
+    InvalidLongStringValueLabelName(IdError),
+
+    #[error("{0}")]
+    EncodingError(EncodingError),
+
+    #[error("Details TBD")]
+    TBD,
+}
+
+// Any read failure inside an extension record is treated as an unexpected end
+// of data.
+impl From<IoError> for Warning {
+    fn from(_source: IoError) -> Self {
+        Self::UnexpectedEndOfData
+    }
+}
+
+/// A record of a system file as read from disk, with strings still in the
+/// file's encoding.
+#[derive(Clone, Debug)]
+pub enum Record {
+    Header(HeaderRecord<RawString>),
+    Variable(VariableRecord<RawString, RawStr<8>>),
+    ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
+    Document(DocumentRecord<RawDocumentLine>),
+    IntegerInfo(IntegerInfoRecord),
+    FloatInfo(FloatInfoRecord),
+    VarDisplay(VarDisplayRecord),
+    MultipleResponse(MultipleResponseRecord<RawString, RawString>),
+    LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
+    LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
+    Encoding(EncodingRecord),
+    NumberOfCases(NumberOfCasesRecord),
+    Text(TextRecord),
+    OtherExtension(Extension),
+    EndOfHeaders(u32),
+    ZHeader(ZHeader),
+    ZTrailer(ZTrailer),
+    Cases(Rc<RefCell<Cases>>),
+}
+
+/// A [Record] with its strings decoded into UTF-8 (see [Record::decode]).
+#[derive(Clone, Debug)]
+pub enum DecodedRecord {
+    Header(HeaderRecord<String>),
+    Variable(VariableRecord<String, String>),
+    ValueLabel(ValueLabelRecord<RawStr<8>, String>),
+    Document(DocumentRecord<String>),
+    IntegerInfo(IntegerInfoRecord),
+    FloatInfo(FloatInfoRecord),
+    VarDisplay(VarDisplayRecord),
+    MultipleResponse(MultipleResponseRecord<Identifier, String>),
+    LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
+    LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
+    Encoding(EncodingRecord),
+    NumberOfCases(NumberOfCasesRecord),
+    // The following variants have no direct counterpart in [Record];
+    // presumably they are produced by decoding `Record::Text` — confirm.
+    VariableSets(VariableSetRecord),
+    ProductInfo(ProductInfoRecord),
+    LongNames(LongNamesRecord),
+    VeryLongStrings(VeryLongStringsRecord),
+    FileAttributes(FileAttributeRecord),
+    VariableAttributes(VariableAttributeRecord),
+    OtherExtension(Extension),
+    EndOfHeaders(u32),
+    ZHeader(ZHeader),
+    ZTrailer(ZTrailer),
+    Cases(Rc<RefCell<Cases>>),
+}
+
+impl Record {
+    /// Reads the next record from `reader`.
+    ///
+    /// `var_types` gives the types of the variables seen so far, which some
+    /// record types need for interpretation. Recoverable problems are
+    /// reported through `warn`.
+    ///
+    /// NOTE(review): `Ok(None)` is propagated from the value-label and
+    /// extension readers — presumably for records to be skipped; confirm.
+    fn read<R>(
+        reader: &mut R,
+        endian: Endian,
+        var_types: &[VarType],
+        warn: &dyn Fn(Warning),
+    ) -> Result<Option<Record>, Error>
+    where
+        R: Read + Seek,
+    {
+        let rec_type: u32 = endian.parse(read_bytes(reader)?);
+        match rec_type {
+            // Record type 2: variable record.
+            2 => Ok(Some(VariableRecord::read(reader, endian)?)),
+            // Record type 3: value label record (the type-4 variable index
+            // record that follows it is read as part of it).
+            3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
+            // Record type 6: document record.
+            6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
+            // Record type 7: extension records, distinguished by subtype.
+            7 => Extension::read(reader, endian, var_types.len(), warn),
+            // Record type 999: end of headers (carries one trailing word).
+            999 => Ok(Some(Record::EndOfHeaders(
+                endian.parse(read_bytes(reader)?),
+            ))),
+            _ => Err(Error::BadRecordType {
+                offset: reader.stream_position()?,
+                rec_type,
+            }),
+        }
+    }
+
+    /// Decodes this record's strings from the file's encoding into UTF-8
+    /// using `decoder`, producing the corresponding [DecodedRecord].
+    pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
+        Ok(match self {
+            Record::Header(record) => record.decode(decoder),
+            Record::Variable(record) => record.decode(decoder),
+            Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
+            Record::Document(record) => record.decode(decoder),
+            Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
+            Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
+            Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
+            Record::MultipleResponse(record) => record.decode(decoder),
+            Record::LongStringValueLabels(record) => {
+                DecodedRecord::LongStringValueLabels(record.decode(decoder))
+            }
+            Record::LongStringMissingValues(record) => {
+                DecodedRecord::LongStringMissingValues(record.decode(decoder))
+            }
+            Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
+            Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
+            Record::Text(record) => record.decode(decoder),
+            Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
+            Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
+            Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
+            Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
+            Record::Cases(record) => DecodedRecord::Cases(record.clone()),
+        })
+    }
+}
+
+/// Determines the character encoding of a system file from its header
+/// `headers`, based on the encoding record and/or the integer info record's
+/// character code, whichever is present.
+///
+/// If the encoding cannot be determined, falls back to the default encoding
+/// after reporting the problem to `warn`, except that an EBCDIC encoding is a
+/// hard error.
+pub fn encoding_from_headers(
+    headers: &[Record],
+    warn: &impl Fn(Warning),
+) -> Result<&'static Encoding, Error> {
+    // Use the last record of each kind, if there are somehow duplicates.
+    let mut encoding_record = None;
+    let mut integer_info_record = None;
+    for record in headers {
+        match record {
+            Record::Encoding(record) => encoding_record = Some(record),
+            Record::IntegerInfo(record) => integer_info_record = Some(record),
+            _ => (),
+        }
+    }
+    let encoding = encoding_record.map(|record| record.0.as_str());
+    let character_code = integer_info_record.map(|record| record.character_code);
+    match get_encoding(encoding, character_code) {
+        Ok(encoding) => Ok(encoding),
+        Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
+        Err(err) => {
+            // Warn, then fall back to the default encoding.
+            warn(Warning::EncodingError(err));
+            Ok(default_encoding())
+        }
+    }
+}
+
+/// Decodes `s` as UTF-8 when it is valid UTF-8; otherwise decodes it as
+/// Latin-1 (that is, each byte is interpreted directly as a Unicode code
+/// point).
+fn default_decode(s: &[u8]) -> Cow<str> {
+    match from_utf8(s) {
+        Ok(utf8) => Cow::from(utf8),
+        Err(_) => decode_latin1(s),
+    }
+}
+
+/// The compression scheme used for data in a system file.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum Compression {
+    /// Simple compression (compression code 1 in a non-ZSAV file).
+    Simple,
+    /// ZLIB compression (compression code 2 in a `$FL3` file).
+    ZLib,
+}
+
+/// A record that knows the range of file offsets it was read from.
+trait Header {
+    /// The range of file offsets occupied by the record.
+    fn offsets(&self) -> Range<u64>;
+}
+
+/// The header record at the start of a system file.
+///
+/// `S` is the string type: [RawString] before decoding, [String] after (see
+/// [HeaderRecord::decode]).
+#[derive(Clone)]
+pub struct HeaderRecord<S>
+where
+    S: Debug,
+{
+    /// Offset in file.
+    pub offsets: Range<u64>,
+
+    /// Magic number.
+    pub magic: Magic,
+
+    /// Eye-catcher string, product name, in the file's encoding. Padded
+    /// on the right with spaces.
+    pub eye_catcher: S,
+
+    /// Layout code, normally either 2 or 3.
+    pub layout_code: u32,
+
+    /// Number of variable positions, or `None` if the value in the file is
+    /// questionably trustworthy.
+    pub nominal_case_size: Option<u32>,
+
+    /// Compression type, if any.
+    pub compression: Option<Compression>,
+
+    /// 1-based variable index of the weight variable, or `None` if the file is
+    /// unweighted.
+    pub weight_index: Option<u32>,
+
+    /// Claimed number of cases, if known.
+    pub n_cases: Option<u32>,
+
+    /// Compression bias, usually 100.0.
+    pub bias: f64,
+
+    /// `dd mmm yy` in the file's encoding.
+    pub creation_date: S,
+
+    /// `HH:MM:SS` in the file's encoding.
+    pub creation_time: S,
+
+    /// File label, in the file's encoding. Padded on the right with spaces.
+    pub file_label: S,
+
+    /// Endianness of the data in the file header.
+    pub endian: Endian,
+}
+
+impl<S> HeaderRecord<S>
+where
+    S: Debug,
+{
+    /// Writes one `name: value` line for [Debug] output, right-justifying the
+    /// name so that the values line up.
+    fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
+    where
+        T: Debug,
+    {
+        writeln!(f, "{name:>17}: {:?}", value)
+    }
+}
+
+// Multi-line, human-readable dump of every header field.
+impl<S> Debug for HeaderRecord<S>
+where
+    S: Debug,
+{
+    fn fmt(&self, f: &mut Formatter) -> FmtResult {
+        writeln!(f, "File header record:")?;
+        self.debug_field(f, "Magic", self.magic)?;
+        self.debug_field(f, "Product name", &self.eye_catcher)?;
+        self.debug_field(f, "Layout code", self.layout_code)?;
+        self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
+        self.debug_field(f, "Compression", self.compression)?;
+        self.debug_field(f, "Weight index", self.weight_index)?;
+        self.debug_field(f, "Number of cases", self.n_cases)?;
+        self.debug_field(f, "Compression bias", self.bias)?;
+        self.debug_field(f, "Creation date", &self.creation_date)?;
+        self.debug_field(f, "Creation time", &self.creation_time)?;
+        self.debug_field(f, "File label", &self.file_label)?;
+        self.debug_field(f, "Endianness", self.endian)
+    }
+}
+
+impl HeaderRecord<RawString> {
+    /// Reads a file header record from `r`, leaving strings in the file's
+    /// encoding.
+    ///
+    /// The file's endianness is detected from the layout code field, which is
+    /// normally 2 or 3, so both are tried.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `r` cannot be read, if the magic number or layout code is
+    /// unrecognized, or if the compression code is invalid for the file's
+    /// variant.
+    fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
+        let start = r.stream_position()?;
+
+        let magic: [u8; 4] = read_bytes(r)?;
+        let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
+
+        let eye_catcher = RawString(read_vec(r, 60)?);
+        let layout_code: [u8; 4] = read_bytes(r)?;
+        // Identify endianness from the layout code, trying both of its
+        // normal values.  (Previously the fallback retried 2 instead of
+        // trying 3, making the `or_else` a no-op.)
+        let endian = Endian::identify_u32(2, layout_code)
+            .or_else(|| Endian::identify_u32(3, layout_code))
+            .ok_or(Error::NotASystemFile)?;
+        let layout_code = endian.parse(layout_code);
+
+        // Distrust implausibly large nominal case sizes.
+        let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
+        let nominal_case_size =
+            (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
+
+        // ZSAV files must declare ZLIB compression; other files may be
+        // uncompressed (0) or simple-compressed (1).
+        let compression_code: u32 = endian.parse(read_bytes(r)?);
+        let compression = match (magic, compression_code) {
+            (Magic::Zsav, 2) => Some(Compression::ZLib),
+            (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
+            (_, 0) => None,
+            (_, 1) => Some(Compression::Simple),
+            (_, code) => return Err(Error::InvalidSavCompression(code)),
+        };
+
+        // A weight index of 0 means the file is unweighted.
+        let weight_index: u32 = endian.parse(read_bytes(r)?);
+        let weight_index = (weight_index > 0).then_some(weight_index);
+
+        // Distrust implausibly large case counts (e.g. -1 read as unsigned).
+        let n_cases: u32 = endian.parse(read_bytes(r)?);
+        let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
+
+        let bias: f64 = endian.parse(read_bytes(r)?);
+
+        let creation_date = RawString(read_vec(r, 9)?);
+        let creation_time = RawString(read_vec(r, 8)?);
+        let file_label = RawString(read_vec(r, 64)?);
+        // Three unused bytes.
+        let _: [u8; 3] = read_bytes(r)?;
+
+        Ok(HeaderRecord {
+            offsets: start..r.stream_position()?,
+            magic,
+            layout_code,
+            nominal_case_size,
+            compression,
+            weight_index,
+            n_cases,
+            bias,
+            creation_date,
+            creation_time,
+            eye_catcher,
+            file_label,
+            endian,
+        })
+    }
+
+    /// Decodes the header's strings from the file's encoding into UTF-8.
+    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
+        let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
+        let file_label = decoder.decode(&self.file_label).to_string();
+        let creation_date = decoder.decode(&self.creation_date).to_string();
+        let creation_time = decoder.decode(&self.creation_time).to_string();
+        DecodedRecord::Header(HeaderRecord {
+            eye_catcher,
+            weight_index: self.weight_index,
+            n_cases: self.n_cases,
+            file_label,
+            offsets: self.offsets.clone(),
+            magic: self.magic,
+            layout_code: self.layout_code,
+            nominal_case_size: self.nominal_case_size,
+            compression: self.compression,
+            bias: self.bias,
+            creation_date,
+            creation_time,
+            endian: self.endian,
+        })
+    }
+}
+
+/// Decodes strings from a system file's encoding into UTF-8, reporting
+/// recoverable problems through a warning callback.
+pub struct Decoder {
+    /// The system file's character encoding.
+    pub encoding: &'static Encoding,
+    /// Callback for recoverable problems.
+    pub warn: Box<dyn Fn(Warning)>,
+}
+
+impl Decoder {
+    /// Creates a decoder for `encoding` that reports problems to `warn`.
+    pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
+    where
+        F: Fn(Warning) + 'static,
+    {
+        Self {
+            encoding,
+            warn: Box::new(warn),
+        }
+    }
+    /// Reports `warning` through the warning callback.
+    fn warn(&self, warning: Warning) {
+        (self.warn)(warning)
+    }
+    /// Decodes `input` from `self.encoding`, warning about (and keeping the
+    /// replacement characters for) any malformed sequences.
+    fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+        let (output, malformed) = self.encoding.decode_without_bom_handling(input);
+        if malformed {
+            self.warn(Warning::MalformedString {
+                encoding: self.encoding.name().into(),
+                text: output.clone().into(),
+            });
+        }
+        output
+    }
+
+    /// Decodes `input` from `self.encoding` (see [Decoder::decode_slice]).
+    fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
+        self.decode_slice(input.0.as_slice())
+    }
+
+    /// Returns `input` decoded from `self.encoding` into UTF-8 such that
+    /// re-encoding the result back into `self.encoding` will have exactly the
+    /// same length in bytes.
+    ///
+    /// XXX warn about errors?
+    pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
+        if let (s, false) = self.encoding.decode_without_bom_handling(input) {
+            // This is the common case. Usually there will be no errors.
+            s
+        } else {
+            // Unusual case. Don't bother to optimize it much.
+            let mut decoder = self.encoding.new_decoder_without_bom_handling();
+            let mut output = String::with_capacity(
+                decoder
+                    .max_utf8_buffer_length_without_replacement(input.len())
+                    .unwrap(),
+            );
+            let mut rest = input;
+            while !rest.is_empty() {
+                match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
+                    (DecoderResult::InputEmpty, _) => break,
+                    (DecoderResult::OutputFull, _) => unreachable!(),
+                    (DecoderResult::Malformed(a, b), consumed) => {
+                        // Substitute one '?' per malformed byte so that the
+                        // re-encoded length matches the input length (checked
+                        // by the assertion below).
+                        let skipped = a as usize + b as usize;
+                        output.extend(repeat('?').take(skipped));
+                        rest = &rest[consumed..];
+                    }
+                }
+            }
+            assert_eq!(self.encoding.encode(&output).0.len(), input.len());
+            output.into()
+        }
+    }
+
+    /// Decodes `input` and validates it as an [Identifier] for
+    /// `self.encoding`.
+    pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
+        self.new_identifier(&self.decode(input))
+    }
+
+    /// Validates `name` as an [Identifier] for `self.encoding`.
+    pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
+        Identifier::from_encoding(name, self.encoding)
+    }
+}
+
+// The header record reports the offset range recorded when it was read.
+impl<S> Header for HeaderRecord<S>
+where
+    S: Debug,
+{
+    fn offsets(&self) -> Range<u64> {
+        self.offsets.clone()
+    }
+}
+
+/// The magic number that identifies a system file and its variant.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub enum Magic {
+    /// Regular system file.
+    Sav,
+
+    /// System file with Zlib-compressed data.
+    Zsav,
+
+    /// EBCDIC-encoded system file.
+    Ebcdic,
+}
+
+impl Magic {
+    /// Magic number for a regular system file.
+    pub const SAV: [u8; 4] = *b"$FL2";
+
+    /// Magic number for a system file that contains zlib-compressed data.
+    pub const ZSAV: [u8; 4] = *b"$FL3";
+
+    /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
+    /// in EBCDIC.
+    pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
+}
+
+impl Debug for Magic {
+ fn fmt(&self, f: &mut Formatter) -> FmtResult {
+ let s = match *self {
+ Magic::Sav => "$FL2",
+ Magic::Zsav => "$FL3",
+ Magic::Ebcdic => "($FL2 in EBCDIC)",
+ };
+ write!(f, "{s}")
+ }
+}
+
+impl TryFrom<[u8; 4]> for Magic {
+ type Error = Error;
+
+ fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
+ match value {
+ Magic::SAV => Ok(Magic::Sav),
+ Magic::ZSAV => Ok(Magic::Zsav),
+ Magic::EBCDIC => Ok(Magic::Ebcdic),
+ _ => Err(Error::BadMagic(value)),
+ }
+ }
+}
+
/// The type of a variable: numeric or string.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum VarType {
    Numeric,
    String,
}

impl VarType {
    /// Returns the type implied by `width` (zero width is numeric, nonzero is
    /// string).
    pub fn from_width(width: VarWidth) -> VarType {
        match width {
            VarWidth::Numeric => Self::Numeric,
            VarWidth::String(_) => Self::String,
        }
    }

    /// Returns the other type.
    pub fn opposite(self) -> VarType {
        match self {
            Self::Numeric => Self::String,
            Self::String => Self::Numeric,
        }
    }
}

impl Display for VarType {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        match self {
            VarType::Numeric => write!(f, "numeric"),
            VarType::String => write!(f, "string"),
        }
    }
}
+
/// A single datum: either a number (where `None` is the system-missing
/// value, printed as `SYSMIS`) or a string.
#[derive(Copy, Clone)]
pub enum Value<S>
where
    S: Debug,
{
    Number(Option<f64>),
    String(S),
}

/// A value as read from the file, where strings are raw 8-byte chunks in the
/// file's own encoding.
type RawValue = Value<RawStr<8>>;

impl<S> Debug for Value<S>
where
    S: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        match self {
            Value::Number(Some(number)) => write!(f, "{number:?}"),
            Value::Number(None) => write!(f, "SYSMIS"),
            Value::String(s) => write!(f, "{:?}", s),
        }
    }
}
+
impl RawValue {
    /// Reads one 8-byte value of type `var_type` from `r`.
    fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
        Ok(Self::from_raw(
            &UntypedValue(read_bytes(r)?),
            var_type,
            endian,
        ))
    }

    /// Interprets the 8 raw bytes of `raw` as a value of type `var_type`.
    /// For numeric values, `-f64::MAX` is the system-missing value and maps
    /// to `Value::Number(None)`.
    pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
        match var_type {
            VarType::String => Value::String(RawStr(raw.0)),
            VarType::Numeric => {
                let number: f64 = endian.parse(raw.0);
                Value::Number((number != -f64::MAX).then_some(number))
            }
        }
    }

    /// Reads one uncompressed case: one 8-byte value per entry in
    /// `var_types`.  Returns `Ok(None)` on a clean end of file (EOF before
    /// the first value of a case); EOF mid-case is an error.
    fn read_case<R: Read + Seek>(
        reader: &mut R,
        var_types: &[VarType],
        endian: Endian,
    ) -> Result<Option<Vec<Self>>, Error> {
        let case_start = reader.stream_position()?;
        let mut values = Vec::with_capacity(var_types.len());
        for (i, &var_type) in var_types.iter().enumerate() {
            let Some(raw) = try_read_bytes(reader)? else {
                if i == 0 {
                    // EOF on a case boundary: no more cases.
                    return Ok(None);
                } else {
                    let offset = reader.stream_position()?;
                    return Err(Error::EofInCase {
                        offset,
                        case_ofs: offset - case_start,
                        case_len: var_types.len() * 8,
                    });
                }
            };
            values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
        }
        Ok(Some(values))
    }

    /// Reads one compressed case.  The compressed stream consists of 1-byte
    /// opcodes, buffered 8 at a time through `codes`, where:
    ///
    /// * 0 is padding and is skipped;
    /// * 1..=251 encodes the number `code - bias` (for a string variable,
    ///   that number's byte representation);
    /// * 252 marks the end of the data;
    /// * 253 means the value is the next 8-byte literal in the data stream;
    /// * 254 encodes a string of 8 spaces;
    /// * 255 encodes the system-missing value.
    fn read_compressed_case<R: Read + Seek>(
        reader: &mut R,
        var_types: &[VarType],
        codes: &mut VecDeque<u8>,
        endian: Endian,
        bias: f64,
    ) -> Result<Option<Vec<Self>>, Error> {
        let case_start = reader.stream_position()?;
        let mut values = Vec::with_capacity(var_types.len());
        for (i, &var_type) in var_types.iter().enumerate() {
            let value = loop {
                // Refill the opcode buffer from the data stream when empty.
                let Some(code) = codes.pop_front() else {
                    let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
                        if i == 0 {
                            return Ok(None);
                        } else {
                            let offset = reader.stream_position()?;
                            return Err(Error::EofInCompressedCase {
                                offset,
                                case_ofs: offset - case_start,
                            });
                        }
                    };
                    codes.extend(new_codes.into_iter());
                    continue;
                };
                match code {
                    0 => (),
                    1..=251 => match var_type {
                        VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
                        VarType::String => {
                            break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
                        }
                    },
                    252 => {
                        // End of data.  Only valid on a case boundary.
                        if i == 0 {
                            return Ok(None);
                        } else {
                            let offset = reader.stream_position()?;
                            return Err(Error::PartialCompressedCase {
                                offset,
                                case_ofs: offset - case_start,
                            });
                        }
                    }
                    253 => {
                        break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
                    }
                    254 => match var_type {
                        VarType::String => break Self::String(RawStr(*b"        ")), // XXX EBCDIC
                        VarType::Numeric => {
                            return Err(Error::CompressedStringExpected {
                                offset: case_start,
                                case_ofs: reader.stream_position()? - case_start,
                            })
                        }
                    },
                    255 => match var_type {
                        VarType::Numeric => break Self::Number(None),
                        VarType::String => {
                            return Err(Error::CompressedNumberExpected {
                                offset: case_start,
                                case_ofs: reader.stream_position()? - case_start,
                            })
                        }
                    },
                }
            };
            values.push(value);
        }
        Ok(Some(values))
    }

    /// Decodes any string content from the file's encoding into UTF-8,
    /// preserving the encoded length (see `Decoder::decode_exact_length`).
    pub fn decode(self, decoder: &Decoder) -> Value<String> {
        match self {
            Self::Number(x) => Value::Number(x),
            Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
        }
    }
}
+
/// A reader that decompresses a sequence of concatenated zlib streams: when
/// one stream ends, decompression restarts on the following bytes.
///
/// `reader` is only `Option` so that the inner reader can be moved out
/// temporarily in `read`; it is always `Some` between calls.
struct ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    reader: Option<ZlibDecoder<R>>,
}

impl<R> ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    fn new(reader: R) -> ZlibDecodeMultiple<R> {
        ZlibDecodeMultiple {
            reader: Some(ZlibDecoder::new(reader)),
        }
    }
}

impl<R> Read for ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
        loop {
            match self.reader.as_mut().unwrap().read(buf)? {
                0 => {
                    // Current zlib stream is exhausted; start a fresh decoder
                    // on the remaining underlying bytes.
                    let inner = self.reader.take().unwrap().into_inner();
                    self.reader = Some(ZlibDecoder::new(inner));
                }
                n => return Ok(n),
            };
        }
    }
}

impl<R> Seek for ZlibDecodeMultiple<R>
where
    R: Read + Seek,
{
    /// Seeks the *underlying* reader, bypassing the decompressor.
    fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
        self.reader.as_mut().unwrap().get_mut().seek(pos)
    }
}
+
/// Progress of a [`Reader`] through the parts of a system file.
enum ReaderState {
    /// Header record not yet emitted.
    Start,
    /// Emitting the remaining header records.
    Headers,
    /// Next item is the ZLIB header (compressed files only).
    ZlibHeader,
    /// Next item is the ZLIB trailer, located via the ZLIB header.
    ZlibTrailer {
        ztrailer_offset: u64,
        ztrailer_len: u64,
    },
    /// Next item is the case data.
    Cases,
    /// Nothing further to emit.
    End,
}

/// An iterator over the records of a system file.
pub struct Reader<R>
where
    R: Read + Seek + 'static,
{
    // `Option` so the reader can be moved out and handed to `Cases`.
    reader: Option<R>,
    warn: Box<dyn Fn(Warning)>,

    header: HeaderRecord<RawString>,
    // One entry per variable record seen so far; needed to parse later
    // records and the case data.
    var_types: Vec<VarType>,

    state: ReaderState,
}
+
impl<R> Reader<R>
where
    R: Read + Seek + 'static,
{
    /// Creates a new `Reader`, immediately reading the file header from
    /// `reader` and reporting any warnings via `warn`.
    pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
    where
        F: Fn(Warning) + 'static,
    {
        let header = HeaderRecord::read(&mut reader)?;
        Ok(Self {
            reader: Some(reader),
            warn: Box::new(warn),
            header,
            var_types: Vec::new(),
            state: ReaderState::Start,
        })
    }
    /// Moves the underlying reader and accumulated variable types into a
    /// `Cases` iterator; this `Reader` transitions to its final state.
    fn cases(&mut self) -> Cases {
        self.state = ReaderState::End;
        Cases::new(
            self.reader.take().unwrap(),
            take(&mut self.var_types),
            &self.header,
        )
    }
    /// One step of the record state machine: header record first, then the
    /// remaining headers, then (for ZLIB files) the zlib header and trailer,
    /// and finally a single `Record::Cases`.
    fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
        match self.state {
            ReaderState::Start => {
                self.state = ReaderState::Headers;
                Some(Ok(Record::Header(self.header.clone())))
            }
            ReaderState::Headers => {
                // `Record::read` may consume input without producing a
                // record (`Ok(None)`); loop until we get one.
                let record = loop {
                    match Record::read(
                        self.reader.as_mut().unwrap(),
                        self.header.endian,
                        self.var_types.as_slice(),
                        &self.warn,
                    ) {
                        Ok(Some(record)) => break record,
                        Ok(None) => (),
                        Err(error) => return Some(Err(error)),
                    }
                };
                match record {
                    Record::Variable(VariableRecord { width, .. }) => {
                        // Track each variable's type for later records and
                        // for parsing the case data.
                        self.var_types.push(if width == 0 {
                            VarType::Numeric
                        } else {
                            VarType::String
                        });
                    }
                    Record::EndOfHeaders(_) => {
                        self.state = if let Some(Compression::ZLib) = self.header.compression {
                            ReaderState::ZlibHeader
                        } else {
                            ReaderState::Cases
                        };
                    }
                    _ => (),
                };
                Some(Ok(record))
            }
            ReaderState::ZlibHeader => {
                let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
                {
                    Ok(zheader) => zheader,
                    Err(error) => return Some(Err(error)),
                };
                self.state = ReaderState::ZlibTrailer {
                    ztrailer_offset: zheader.ztrailer_offset,
                    ztrailer_len: zheader.ztrailer_len,
                };
                Some(Ok(Record::ZHeader(zheader)))
            }
            ReaderState::ZlibTrailer {
                ztrailer_offset,
                ztrailer_len,
            } => {
                // A missing trailer is tolerated: skip straight to cases.
                match ZTrailer::read(
                    self.reader.as_mut().unwrap(),
                    self.header.endian,
                    ztrailer_offset,
                    ztrailer_len,
                ) {
                    Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
                    Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
                    Err(error) => Some(Err(error)),
                }
            }
            ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
            ReaderState::End => None,
        }
    }
}

impl<R> Iterator for Reader<R>
where
    R: Read + Seek + 'static,
{
    type Item = Result<Record, Error>;

    /// Wraps `_next` so that any error is sticky: after an `Err` the
    /// iterator is finished.
    fn next(&mut self) -> Option<Self::Item> {
        let retval = self._next();
        if matches!(retval, Some(Err(_))) {
            self.state = ReaderState::End;
        }
        retval
    }
}
+
/// Object-safe combination of `Read + Seek`, implemented for everything that
/// implements both.
trait ReadSeek: Read + Seek {}
impl<T> ReadSeek for T where T: Read + Seek {}

/// An iterator over the cases (rows) of a system file.
pub struct Cases {
    reader: Box<dyn ReadSeek>,
    var_types: Vec<VarType>,
    compression: Option<Compression>,
    bias: f64,
    endian: Endian,
    // Buffer of pending compression opcodes (see `read_compressed_case`).
    codes: VecDeque<u8>,
    // Set after EOF or an error so iteration stops permanently.
    eof: bool,
}

impl Debug for Cases {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        write!(f, "Cases")
    }
}

impl Cases {
    /// Creates a case iterator from `reader` positioned at the case data,
    /// using settings from `header`.  For ZLIB-compressed files, the reader
    /// is wrapped to decompress the concatenated zlib streams.
    fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
    where
        R: Read + Seek + 'static,
    {
        Self {
            reader: if header.compression == Some(Compression::ZLib) {
                Box::new(ZlibDecodeMultiple::new(reader))
            } else {
                Box::new(reader)
            },
            var_types,
            compression: header.compression,
            bias: header.bias,
            endian: header.endian,
            codes: VecDeque::with_capacity(8),
            eof: false,
        }
    }
}

impl Iterator for Cases {
    type Item = Result<Vec<RawValue>, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.eof {
            return None;
        }

        let retval = if self.compression.is_some() {
            Value::read_compressed_case(
                &mut self.reader,
                &self.var_types,
                &mut self.codes,
                self.endian,
                self.bias,
            )
            .transpose()
        } else {
            Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
        };
        // End of data and errors are both terminal.
        self.eof = matches!(retval, None | Some(Err(_)));
        retval
    }
}
+
/// A raw output format specification packed into 32 bits as
/// `type << 16 | width << 8 | decimals`.
#[derive(Copy, Clone, PartialEq, Eq, Hash)]
pub struct Spec(pub u32);

impl Debug for Spec {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        // Unpack the three bit-fields: high 16 bits are the format type,
        // the middle byte the field width, the low byte the decimal count.
        let format_type = self.0 >> 16;
        let field_width = (self.0 >> 8) & 0xff;
        let decimals = self.0 & 0xff;
        write!(
            f,
            "{:06x} ({}{field_width}.{decimals})",
            self.0,
            format_name(format_type)
        )
    }
}

/// Returns the name of format type `type_` (e.g. `"F"` for 5), or a
/// `<unknown format …>` placeholder for an unrecognized type.
fn format_name(type_: u32) -> Cow<'static, str> {
    const NAMES: &[(u32, &str)] = &[
        (1, "A"),
        (2, "AHEX"),
        (3, "COMMA"),
        (4, "DOLLAR"),
        (5, "F"),
        (6, "IB"),
        (7, "PIBHEX"),
        (8, "P"),
        (9, "PIB"),
        (10, "PK"),
        (11, "RB"),
        (12, "RBHEX"),
        (15, "Z"),
        (16, "N"),
        (17, "E"),
        (20, "DATE"),
        (21, "TIME"),
        (22, "DATETIME"),
        (23, "ADATE"),
        (24, "JDATE"),
        (25, "DTIME"),
        (26, "WKDAY"),
        (27, "MONTH"),
        (28, "MOYR"),
        (29, "QYR"),
        (30, "WKYR"),
        (31, "PCT"),
        (32, "DOT"),
        (33, "CCA"),
        (34, "CCB"),
        (35, "CCC"),
        (36, "CCD"),
        (37, "CCE"),
        (38, "EDATE"),
        (39, "SDATE"),
        (40, "MTIME"),
        (41, "YMDHMS"),
    ];
    match NAMES.iter().find(|(code, _)| *code == type_) {
        Some((_, name)) => Cow::Borrowed(name),
        None => Cow::Owned(format!("<unknown format {type_}>")),
    }
}
+
/// User-missing values for a variable.
#[derive(Clone)]
pub struct MissingValues<S = String>
where
    S: Debug,
{
    /// Individual missing values, up to 3 of them.
    pub values: Vec<Value<S>>,

    /// Optional range of missing values.
    pub range: Option<(Value<S>, Value<S>)>,
}

impl<S> Debug for MissingValues<S>
where
    S: Debug,
{
    /// Formats as a comma-separated list, with a range rendered as
    /// `LOW THRU HIGH`, or `none` if there are no missing values.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        for (i, value) in self.values.iter().enumerate() {
            if i > 0 {
                write!(f, ", ")?;
            }
            write!(f, "{value:?}")?;
        }

        if let Some((low, high)) = &self.range {
            if !self.values.is_empty() {
                write!(f, ", ")?;
            }
            write!(f, "{low:?} THRU {high:?}")?;
        }

        if self.is_empty() {
            write!(f, "none")?;
        }

        Ok(())
    }
}

impl<S> MissingValues<S>
where
    S: Debug,
{
    fn is_empty(&self) -> bool {
        self.values.is_empty() && self.range.is_none()
    }
}

impl<S> Default for MissingValues<S>
where
    S: Debug,
{
    fn default() -> Self {
        Self {
            values: Vec::new(),
            range: None,
        }
    }
}

impl MissingValues<RawStr<8>> {
    /// Reads the missing values for a variable of the given `width` at file
    /// offset `offset`, according to `code` from the variable record:
    ///
    /// * `0..=3`: that many individual missing values;
    /// * `-2` (numeric only): a low/high range;
    /// * `-3` (numeric only): a range plus one individual value.
    fn read<R: Read + Seek>(
        r: &mut R,
        offset: u64,
        width: i32,
        code: i32,
        endian: Endian,
    ) -> Result<Self, Error> {
        let (n_values, has_range) = match (width, code) {
            (_, 0..=3) => (code, false),
            (0, -2) => (0, true),
            (0, -3) => (1, true),
            (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
            (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
        };

        let var_type = if width == 0 {
            VarType::Numeric
        } else {
            VarType::String
        };

        let mut values = Vec::new();
        for _ in 0..n_values {
            values.push(RawValue::read(r, var_type, endian)?);
        }
        let range = if has_range {
            let low = RawValue::read(r, var_type, endian)?;
            let high = RawValue::read(r, var_type, endian)?;
            Some((low, high))
        } else {
            None
        };
        Ok(Self { values, range })
    }
    /// Decodes any string missing values into UTF-8.
    fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
        MissingValues {
            values: self
                .values
                .iter()
                .map(|value| value.decode(decoder))
                .collect(),
            range: self
                .range
                .as_ref()
                .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
        }
    }
}
+
/// A variable record from a system file.
#[derive(Clone)]
pub struct VariableRecord<S, V>
where
    S: Debug,
    V: Debug,
{
    /// Range of offsets in file.
    pub offsets: Range<u64>,

    /// Variable width, in the range -1..=255.
    pub width: i32,

    /// Variable name, padded on the right with spaces.
    pub name: S,

    /// Print format.
    pub print_format: Spec,

    /// Write format.
    pub write_format: Spec,

    /// Missing values.
    pub missing_values: MissingValues<V>,

    /// Optional variable label.
    pub label: Option<S>,
}

impl<S, V> Debug for VariableRecord<S, V>
where
    S: Debug,
    V: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        writeln!(
            f,
            "Width: {} ({})",
            self.width,
            // Negative width marks a continuation of a preceding long
            // string variable.
            match self.width.cmp(&0) {
                Ordering::Greater => "string",
                Ordering::Equal => "numeric",
                Ordering::Less => "long string continuation record",
            }
        )?;
        writeln!(f, "Print format: {:?}", self.print_format)?;
        writeln!(f, "Write format: {:?}", self.write_format)?;
        writeln!(f, "Name: {:?}", &self.name)?;
        writeln!(f, "Variable label: {:?}", self.label)?;
        writeln!(f, "Missing values: {:?}", self.missing_values)
    }
}

impl VariableRecord<RawString, RawStr<8>> {
    /// Reads one variable record from `r`: width, label flag, missing-value
    /// code, print/write formats, 8-byte name, then the optional label and
    /// missing values.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
        let start_offset = r.stream_position()?;
        let width: i32 = endian.parse(read_bytes(r)?);
        if !(-1..=255).contains(&width) {
            return Err(Error::BadVariableWidth {
                start_offset,
                width,
            });
        }
        // Offset of the `has_variable_label` field, used in error reports.
        let code_offset = r.stream_position()?;
        let has_variable_label: u32 = endian.parse(read_bytes(r)?);
        let missing_value_code: i32 = endian.parse(read_bytes(r)?);
        let print_format = Spec(endian.parse(read_bytes(r)?));
        let write_format = Spec(endian.parse(read_bytes(r)?));
        let name = RawString(read_vec(r, 8)?);

        let label = match has_variable_label {
            0 => None,
            1 => {
                let len: u32 = endian.parse(read_bytes(r)?);
                // Cap the label we keep at 65535 bytes.
                // NOTE(review): if len > 65535 the bytes between read_len
                // and len are never consumed, only the padding to the next
                // multiple of 4 — confirm this is intended for (presumably
                // corrupt) oversized labels.
                let read_len = len.min(65535) as usize;
                let label = RawString(read_vec(r, read_len)?);

                // Labels are padded to a multiple of 4 bytes in the file.
                let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
                let _ = read_vec(r, padding_bytes as usize)?;

                Some(label)
            }
            _ => {
                return Err(Error::BadVariableLabelCode {
                    start_offset,
                    code_offset,
                    code: has_variable_label,
                })
            }
        };

        let missing_values =
            MissingValues::read(r, start_offset, width, missing_value_code, endian)?;

        let end_offset = r.stream_position()?;

        Ok(Record::Variable(VariableRecord {
            offsets: start_offset..end_offset,
            width,
            name,
            print_format,
            write_format,
            missing_values,
            label,
        }))
    }

    /// Decodes the name, label, and missing values into UTF-8.
    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
        DecodedRecord::Variable(VariableRecord {
            offsets: self.offsets.clone(),
            width: self.width,
            name: decoder.decode(&self.name).to_string(),
            print_format: self.print_format,
            write_format: self.write_format,
            missing_values: self.missing_values.decode(decoder),
            label: self
                .label
                .as_ref()
                .map(|label| decoder.decode(label).to_string()),
        })
    }
}
+
/// 8 bytes read from a system file whose interpretation (number or string)
/// is not yet known.
#[derive(Copy, Clone)]
pub struct UntypedValue(pub [u8; 8]);

impl Debug for UntypedValue {
    /// Shows both plausible interpretations: the bytes as an `f64` (using
    /// whichever endianness gives the shorter rendering) followed by the
    /// leading printable portion decoded as a string.
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        let little: f64 = Endian::Little.parse(self.0);
        let little = format!("{:?}", little);
        let big: f64 = Endian::Big.parse(self.0);
        let big = format!("{:?}", big);
        let number = if little.len() <= big.len() {
            little
        } else {
            big
        };
        write!(f, "{number}")?;

        let string = default_decode(&self.0);
        // Keep only the part up to the first NUL or control character.
        let string = string
            .split(|c: char| c == '\0' || c.is_control())
            .next()
            .unwrap();
        write!(f, "{string:?}")?;
        Ok(())
    }
}

/// A variable-length string in the file's own (not yet decoded) encoding.
#[derive(Clone)]
pub struct RawString(pub Vec<u8>);

impl From<Vec<u8>> for RawString {
    fn from(source: Vec<u8>) -> Self {
        Self(source)
    }
}

impl From<&[u8]> for RawString {
    fn from(source: &[u8]) -> Self {
        Self(source.into())
    }
}

impl Debug for RawString {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        write!(f, "{:?}", default_decode(self.0.as_slice()))
    }
}

/// A fixed-length string in the file's own (not yet decoded) encoding.
#[derive(Copy, Clone)]
pub struct RawStr<const N: usize>(pub [u8; N]);

impl<const N: usize> From<[u8; N]> for RawStr<N> {
    fn from(source: [u8; N]) -> Self {
        Self(source)
    }
}

impl<const N: usize> Debug for RawStr<N> {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        write!(f, "{:?}", default_decode(&self.0))
    }
}
+
/// One value-to-label mapping.
#[derive(Clone, Debug)]
pub struct ValueLabel<V, S>
where
    V: Debug,
    S: Debug,
{
    pub value: Value<V>,
    pub label: S,
}

/// A value label record, pairing labels with the variables they apply to.
#[derive(Clone)]
pub struct ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    /// Range of offsets in file.
    pub offsets: Range<u64>,

    /// The labels.
    pub labels: Vec<ValueLabel<V, S>>,

    /// The 1-based indexes of the variables the labels apply to.
    pub dict_indexes: Vec<u32>,

    /// The types of the variables.
    pub var_type: VarType,
}

impl<V, S> Debug for ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        writeln!(f, "labels: ")?;
        for label in self.labels.iter() {
            writeln!(f, "{label:?}")?;
        }
        write!(f, "apply to {} variables", self.var_type)?;
        for dict_index in self.dict_indexes.iter() {
            write!(f, " #{dict_index}")?;
        }
        Ok(())
    }
}

impl<V, S> Header for ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    fn offsets(&self) -> Range<u64> {
        self.offsets.clone()
    }
}

impl<V, S> ValueLabelRecord<V, S>
where
    V: Debug,
    S: Debug,
{
    /// Maximum number of value labels in a record.
    pub const MAX_LABELS: u32 = u32::MAX / 8;

    /// Maximum number of variable indexes in a record.
    pub const MAX_INDEXES: u32 = u32::MAX / 8;
}
+
+impl ValueLabelRecord<RawStr<8>, RawString> {
+ fn read<R: Read + Seek>(
+ r: &mut R,
+ endian: Endian,
+ var_types: &[VarType],
+ warn: &dyn Fn(Warning),
+ ) -> Result<Option<Record>, Error> {
+ let label_offset = r.stream_position()?;
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > Self::MAX_LABELS {
+ return Err(Error::BadNumberOfValueLabels {
+ offset: label_offset,
+ n,
+ max: Self::MAX_LABELS,
+ });
+ }
+
+ let mut labels = Vec::new();
+ for _ in 0..n {
+ let value = UntypedValue(read_bytes(r)?);
+ let label_len: u8 = endian.parse(read_bytes(r)?);
+ let label_len = label_len as usize;
+ let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
+
+ let mut label = read_vec(r, padded_len - 1)?;
+ label.truncate(label_len);
+ labels.push((value, RawString(label)));
+ }
+
+ let index_offset = r.stream_position()?;
+ let rec_type: u32 = endian.parse(read_bytes(r)?);
+ if rec_type != 4 {
+ return Err(Error::ExpectedVarIndexRecord {
+ offset: index_offset,
+ rec_type,
+ });
+ }
+
+ let n: u32 = endian.parse(read_bytes(r)?);
+ if n > Self::MAX_INDEXES {
+ return Err(Error::TooManyVarIndexes {
+ offset: index_offset,
+ n,
+ max: Self::MAX_INDEXES,
+ });
+ } else if n == 0 {
+ warn(Warning::NoVarIndexes {
+ offset: index_offset,
+ });
+ return Ok(None);
+ }
+
+ let index_offset = r.stream_position()?;
+ let mut dict_indexes = Vec::with_capacity(n as usize);
+ let mut invalid_indexes = Vec::new();
+ for _ in 0..n {
+ let index: u32 = endian.parse(read_bytes(r)?);
+ if index == 0 || index as usize > var_types.len() {
+ dict_indexes.push(index);
+ } else {
+ invalid_indexes.push(index);
+ }
+ }
+ if !invalid_indexes.is_empty() {
+ warn(Warning::InvalidVarIndexes {
+ offset: index_offset,
+ max: var_types.len(),
+ invalid: invalid_indexes,
+ });
+ }
+
+ let Some(&first_index) = dict_indexes.first() else {
+ return Ok(None);
+ };
+ let var_type = var_types[first_index as usize - 1];
+ let mut wrong_type_indexes = Vec::new();
+ dict_indexes.retain(|&index| {
+ if var_types[index as usize - 1] != var_type {
+ wrong_type_indexes.push(index);
+ false
+ } else {
+ true
+ }
+ });
+ if !wrong_type_indexes.is_empty() {
+ warn(Warning::MixedVarTypes {
+ offset: index_offset,
+ var_type,
+ wrong_types: wrong_type_indexes,
+ });
+ }
+
+ let labels = labels
+ .into_iter()
+ .map(|(value, label)| ValueLabel {
+ value: Value::from_raw(&value, var_type, endian),
+ label,
+ })
+ .collect();
+
+ let end_offset = r.stream_position()?;
+ Ok(Some(Record::ValueLabel(ValueLabelRecord {
+ offsets: label_offset..end_offset,
+ labels,
+ dict_indexes,
+ var_type,
+ })))
+ }
+
+ fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, String> {
+ let labels = self
+ .labels
+ .iter()
+ .map(|ValueLabel { value, label }| ValueLabel {
+ value: *value,
+ label: decoder.decode(label).to_string(),
+ })
+ .collect();
+ ValueLabelRecord {
+ offsets: self.offsets.clone(),
+ labels,
+ dict_indexes: self.dict_indexes.clone(),
+ var_type: self.var_type,
+ }
+ }
+}
+
/// A document record: free-form text attached to the file's dictionary.
#[derive(Clone, Debug)]
pub struct DocumentRecord<S>
where
    S: Debug,
{
    pub offsets: Range<u64>,

    /// The document, as an array of lines. Raw lines are exactly 80 bytes long
    /// and are right-padded with spaces without any new-line termination.
    pub lines: Vec<S>,
}

pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;

/// Length of a line in a document. Document lines are fixed-length and
/// padded on the right with spaces.
pub const DOC_LINE_LEN: usize = 80;

impl DocumentRecord<RawDocumentLine> {
    /// Maximum number of lines we will accept in a document. This is simply
    /// the maximum number that will fit in a 32-bit space.
    pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;

    /// Reads a document record: a line count followed by that many 80-byte
    /// lines.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
        let start_offset = r.stream_position()?;
        let n: u32 = endian.parse(read_bytes(r)?);
        let n = n as usize;
        if n > Self::MAX_LINES {
            Err(Error::BadDocumentLength {
                offset: start_offset,
                n,
                max: Self::MAX_LINES,
            })
        } else {
            let mut lines = Vec::with_capacity(n);
            for _ in 0..n {
                lines.push(RawStr(read_bytes(r)?));
            }
            let end_offset = r.stream_position()?;
            Ok(Record::Document(DocumentRecord {
                offsets: start_offset..end_offset,
                lines,
            }))
        }
    }

    /// Decodes each line into UTF-8.
    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
        DecodedRecord::Document(DocumentRecord {
            offsets: self.offsets.clone(),
            lines: self
                .lines
                .iter()
                .map(|s| decoder.decode_slice(&s.0).to_string())
                .collect(),
        })
    }
}

impl<S> Header for DocumentRecord<S>
where
    S: Debug,
{
    fn offsets(&self) -> Range<u64> {
        self.offsets.clone()
    }
}
+
/// An extension record (record type 7) parser.  `SIZE`/`COUNT` are the
/// expected element size and count (`None` means unchecked), used by
/// `Extension::check_size`.
trait ExtensionRecord {
    const SUBTYPE: u32;
    const SIZE: Option<u32>;
    const COUNT: Option<u32>;
    const NAME: &'static str;
    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
}

/// Machine integer info record (subtype 3): eight 32-bit integers describing
/// the writing machine.
#[derive(Clone, Debug)]
pub struct IntegerInfoRecord {
    pub offsets: Range<u64>,
    pub version: (i32, i32, i32),
    pub machine_code: i32,
    pub floating_point_rep: i32,
    pub compression_code: i32,
    pub endianness: i32,
    pub character_code: i32,
}

impl ExtensionRecord for IntegerInfoRecord {
    const SUBTYPE: u32 = 3;
    const SIZE: Option<u32> = Some(4);
    const COUNT: Option<u32> = Some(8);
    const NAME: &'static str = "integer record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        // Size/count were checked above, so the 8 reads cannot fail.
        let mut input = &ext.data[..];
        let data: Vec<i32> = (0..8)
            .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
            .collect();
        Ok(Record::IntegerInfo(IntegerInfoRecord {
            offsets: ext.offsets.clone(),
            version: (data[0], data[1], data[2]),
            machine_code: data[3],
            floating_point_rep: data[4],
            compression_code: data[5],
            endianness: data[6],
            character_code: data[7],
        }))
    }
}

/// Machine float info record (subtype 4): the writing machine's special
/// floating-point values.
#[derive(Clone, Debug)]
pub struct FloatInfoRecord {
    pub sysmis: f64,
    pub highest: f64,
    pub lowest: f64,
}

impl ExtensionRecord for FloatInfoRecord {
    const SUBTYPE: u32 = 4;
    const SIZE: Option<u32> = Some(8);
    const COUNT: Option<u32> = Some(3);
    const NAME: &'static str = "floating point record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        // Size/count were checked above, so the 3 reads cannot fail.
        let mut input = &ext.data[..];
        let data: Vec<f64> = (0..3)
            .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
            .collect();
        Ok(Record::FloatInfo(FloatInfoRecord {
            sysmis: data[0],
            highest: data[1],
            lowest: data[2],
        }))
    }
}
+
/// How a multiple-dichotomy set's category labels are derived.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum CategoryLabels {
    VarLabels,
    CountedValues,
}

/// The kind of a multiple response set.
#[derive(Clone, Debug)]
pub enum MultipleResponseType {
    MultipleDichotomy {
        value: RawString,
        labels: CategoryLabels,
    },
    MultipleCategory,
}

impl MultipleResponseType {
    /// Parses the type tag at the start of `input`: `C` (multiple category),
    /// `D<counted-string>` (dichotomy, labels from variable labels), or
    /// `E 1 `/`E 11 ` followed by a counted string (dichotomy with an
    /// explicit label source).  Returns the type and the remaining input.
    fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
        let (mr_type, input) = match input.split_first() {
            Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
            Some((b'D', input)) => {
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy {
                        value,
                        labels: CategoryLabels::VarLabels,
                    },
                    input,
                )
            }
            Some((b'E', input)) => {
                // " 1 " selects counted-value labels; " 11 " variable labels.
                let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
                    (CategoryLabels::CountedValues, rest)
                } else if let Some(rest) = input.strip_prefix(b" 11 ") {
                    (CategoryLabels::VarLabels, rest)
                } else {
                    return Err(Warning::TBD);
                };
                let (value, input) = parse_counted_string(input)?;
                (
                    MultipleResponseType::MultipleDichotomy { value, labels },
                    input,
                )
            }
            _ => return Err(Warning::TBD),
        };
        Ok((mr_type, input))
    }
}
+
/// One multiple response set: a named group of variables with a label and a
/// set type.
#[derive(Clone, Debug)]
pub struct MultipleResponseSet<I, S>
where
    I: Debug,
    S: Debug,
{
    pub name: I,
    pub label: S,
    pub mr_type: MultipleResponseType,
    pub short_names: Vec<I>,
}
+
+impl MultipleResponseSet<RawString, RawString> {
+ fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
+ let Some(equals) = input.iter().position(|&b| b == b'=') else {
+ return Err(Warning::TBD);
+ };
+ let (name, input) = input.split_at(equals);
+ let (mr_type, input) = MultipleResponseType::parse(input)?;
+ let Some(input) = input.strip_prefix(b" ") else {
+ return Err(Warning::TBD);
+ };
+ let (label, mut input) = parse_counted_string(input)?;
+ let mut vars = Vec::new();
+ while input.first() != Some(&b'\n') {
+ match input.split_first() {
+ Some((b' ', rest)) => {
+ let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
+ return Err(Warning::TBD);
+ };
+ let (var, rest) = rest.split_at(length);
+ if !var.is_empty() {
+ vars.push(var.into());
+ }
+ input = rest;
+ }
+ _ => return Err(Warning::TBD),
+ }
+ }
+ while input.first() == Some(&b'\n') {
+ input = &input[1..];
+ }
+ Ok((
+ MultipleResponseSet {
+ name: name.into(),
+ label,
+ mr_type,
+ short_names: vars,
+ },
+ input,
+ ))
+ }
+
+ fn decode(
+ &self,
+ decoder: &Decoder,
+ ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
+ let mut short_names = Vec::with_capacity(self.short_names.len());
+ for short_name in self.short_names.iter() {
+ if let Some(short_name) = decoder
+ .decode_identifier(short_name)
+ .map_err(Warning::InvalidMrSetName)
+ .issue_warning(&decoder.warn)
+ {
+ short_names.push(short_name);
+ }
+ }
+ Ok(MultipleResponseSet {
+ name: decoder
+ .decode_identifier(&self.name)
+ .map_err(Warning::InvalidMrSetVariableName)?,
+ label: decoder.decode(&self.label).to_string(),
+ mr_type: self.mr_type.clone(),
+ short_names,
+ })
+ }
+}
+
/// Multiple response sets record (extension subtype 7).
#[derive(Clone, Debug)]
pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
where
    I: Debug,
    S: Debug;

impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
    const SUBTYPE: u32 = 7;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "multiple response set record";

    /// Parses the record as a sequence of back-to-back set definitions.
    fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let mut sets = Vec::new();
        while !input.is_empty() {
            let (set, rest) = MultipleResponseSet::parse(input)?;
            sets.push(set);
            input = rest;
        }
        Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
    }
}

impl MultipleResponseRecord<RawString, RawString> {
    /// Decodes every set; sets that fail to decode are dropped with a
    /// warning.
    fn decode(self, decoder: &Decoder) -> DecodedRecord {
        let mut sets = Vec::new();
        for set in self.0.iter() {
            if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) {
                sets.push(set);
            }
        }
        DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
    }
}

/// Parses a counted string: a decimal length, a space, then that many bytes.
/// Returns the string and the remaining input.
fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
    let Some(space) = input.iter().position(|&b| b == b' ') else {
        return Err(Warning::TBD);
    };
    let Ok(length) = from_utf8(&input[..space]) else {
        return Err(Warning::TBD);
    };
    let Ok(length): Result<usize, _> = length.parse() else {
        return Err(Warning::TBD);
    };

    let input = &input[space + 1..];
    if input.len() < length {
        return Err(Warning::TBD);
    };

    let (string, rest) = input.split_at(length);
    Ok((string.into(), rest))
}
+
/// A variable's measurement level.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Measure {
    Nominal,
    Ordinal,
    Scale,
}

impl Measure {
    /// Default measurement level for a variable of `var_type` (none for
    /// numeric variables, nominal for strings).
    pub fn default_for_type(var_type: VarType) -> Option<Measure> {
        match var_type {
            VarType::Numeric => None,
            VarType::String => Some(Self::Nominal),
        }
    }

    /// Decodes the on-disk code: 0 means unspecified; other valid codes map
    /// to a level; anything else is a warning.
    fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
        match source {
            0 => Ok(None),
            1 => Ok(Some(Measure::Nominal)),
            2 => Ok(Some(Measure::Ordinal)),
            3 => Ok(Some(Measure::Scale)),
            _ => Err(Warning::InvalidMeasurement(source)),
        }
    }
}

/// A variable's display alignment.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Alignment {
    Left,
    Right,
    Center,
}

impl Alignment {
    /// Decodes the on-disk code: 0 means unspecified; other valid codes map
    /// to an alignment; anything else is a warning.
    fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
        match source {
            0 => Ok(None),
            1 => Ok(Some(Alignment::Left)),
            2 => Ok(Some(Alignment::Right)),
            3 => Ok(Some(Alignment::Center)),
            _ => Err(Warning::InvalidAlignment(source)),
        }
    }

    /// Default alignment for a variable of `var_type`.
    pub fn default_for_type(var_type: VarType) -> Self {
        match var_type {
            VarType::Numeric => Self::Right,
            VarType::String => Self::Left,
        }
    }
}

/// Display settings for one variable.
#[derive(Clone, Debug)]
pub struct VarDisplay {
    pub measure: Option<Measure>,
    pub width: Option<u32>,
    pub alignment: Option<Alignment>,
}

/// Variable display record (extension subtype 11): one `VarDisplay` per
/// variable.
#[derive(Clone, Debug)]
pub struct VarDisplayRecord(pub Vec<VarDisplay>);
+
impl VarDisplayRecord {
    const SUBTYPE: u32 = 11;

    /// Parses the variable display record.  Each variable contributes either
    /// 3 fields (measure, width, alignment) or 2 (measure, alignment); which
    /// layout is in use is inferred from `ext.count` relative to `n_vars`.
    /// Invalid measure/alignment codes become warnings, not errors.
    fn parse(
        ext: &Extension,
        n_vars: usize,
        endian: Endian,
        warn: &dyn Fn(Warning),
    ) -> Result<Record, Warning> {
        if ext.size != 4 {
            return Err(Warning::BadRecordSize {
                offset: ext.offsets.start,
                record: String::from("variable display record"),
                size: ext.size,
                expected_size: 4,
            });
        }

        let has_width = if ext.count as usize == 3 * n_vars {
            true
        } else if ext.count as usize == 2 * n_vars {
            false
        } else {
            return Err(Warning::TBD);
        };

        let mut var_displays = Vec::new();
        let mut input = &ext.data[..];
        for _ in 0..n_vars {
            let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&warn)
                .flatten();
            let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
            let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
                .issue_warning(&warn)
                .flatten();
            var_displays.push(VarDisplay {
                measure,
                width,
                alignment,
            });
        }
        Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
    }
}
+
/// Missing values for one long string variable.
///
/// `N` is the variable-name type and `V` the value type: raw records use
/// `<RawString, RawStr<8>>`, and decoding produces `<Identifier, String>`.
#[derive(Clone, Debug)]
pub struct LongStringMissingValues<N, V>
where
    N: Debug,
    V: Debug,
{
    /// Variable name.
    pub var_name: N,

    /// Missing values.
    pub missing_values: MissingValues<V>,
}

impl LongStringMissingValues<RawString, RawStr<8>> {
    /// Decodes the raw variable name and missing values using the file's
    /// character encoding.  Fails only when the variable name is not a valid
    /// identifier.
    fn decode(
        &self,
        decoder: &Decoder,
    ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
        Ok(LongStringMissingValues {
            var_name: decoder.decode_identifier(&self.var_name)?,
            missing_values: self.missing_values.decode(decoder),
        })
    }
}
+
/// Extension record (subtype 22) listing missing values for long string
/// variables, which do not fit in the main variable records.
#[derive(Clone, Debug)]
pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
where
    N: Debug,
    V: Debug;

impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
    const SUBTYPE: u32 = 22;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "long string missing values record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let mut missing_value_set = Vec::new();
        // Entries are packed back to back until the record's data runs out.
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
            let value_len: u32 = endian.parse(read_bytes(&mut input)?);
            // Only 8-byte missing values are supported here.
            if value_len != 8 {
                // NOTE(review): `- 8` rewinds past more than the 4-byte
                // `value_len` field just consumed — confirm the intended
                // field this offset should point at.
                let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
                return Err(Warning::BadLongMissingValueLength {
                    record_offset: ext.offsets.start,
                    offset,
                    value_len,
                });
            }
            let mut values = Vec::new();
            for i in 0..n_missing_values {
                let value: [u8; 8] = read_bytes(&mut input)?;
                let numeric_value: u64 = endian.parse(value);
                let value = if i > 0 && numeric_value == 8 {
                    // Tolerate files written by old, buggy versions of PSPP
                    // where we believed that the value_length was repeated
                    // before each missing value.
                    read_bytes(&mut input)?
                } else {
                    value
                };
                values.push(Value::String(RawStr(value)));
            }
            let missing_values = MissingValues {
                values,
                range: None,
            };
            missing_value_set.push(LongStringMissingValues {
                var_name,
                missing_values,
            });
        }
        Ok(Record::LongStringMissingValues(
            LongStringMissingValueRecord(missing_value_set),
        ))
    }
}
+
+impl LongStringMissingValueRecord<RawString, RawStr<8>> {
+ pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier, String> {
+ let mut mvs = Vec::with_capacity(self.0.len());
+ for mv in self.0.iter() {
+ if let Some(mv) = mv
+ .decode(decoder)
+ .map_err(Warning::InvalidLongStringMissingValueVariableName)
+ .issue_warning(&decoder.warn)
+ {
+ mvs.push(mv);
+ }
+ }
+ LongStringMissingValueRecord(mvs)
+ }
+}
+
/// Extension record (subtype 20) naming the character encoding of the file's
/// text, e.g. "UTF-8".
#[derive(Clone, Debug)]
pub struct EncodingRecord(pub String);

impl ExtensionRecord for EncodingRecord {
    const SUBTYPE: u32 = 20;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "encoding record";

    /// The payload is the encoding name itself, which must be valid UTF-8.
    fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        Ok(Record::Encoding(EncodingRecord(
            String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
                offset: ext.offsets.start,
            })?,
        )))
    }
}
+
/// Extension record (subtype 16) carrying a 64-bit case count, for files
/// whose case count does not fit the header's 32-bit field.
#[derive(Clone, Debug)]
pub struct NumberOfCasesRecord {
    /// Always observed as 1.
    pub one: u64,

    /// Number of cases.
    pub n_cases: u64,
}

impl ExtensionRecord for NumberOfCasesRecord {
    const SUBTYPE: u32 = 16;
    const SIZE: Option<u32> = Some(8);
    const COUNT: Option<u32> = Some(2);
    const NAME: &'static str = "extended number of cases record";

    /// Parses the two 64-bit fields; sizes were validated by `check_size`.
    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let one = endian.parse(read_bytes(&mut input)?);
        let n_cases = endian.parse(read_bytes(&mut input)?);

        Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
    }
}
+
/// An extension record whose payload is text in the file's encoding; its
/// interpretation is deferred until the encoding is known.
#[derive(Clone, Debug)]
pub struct TextRecord {
    pub offsets: Range<u64>,

    /// Type of record.
    pub rec_type: TextRecordType,

    /// The text content of the record.
    pub text: RawString,
}

/// Which text-based extension record a `TextRecord` holds, determined by the
/// extension subtype at read time.
#[derive(Clone, Copy, Debug)]
pub enum TextRecordType {
    VariableSets,
    ProductInfo,
    LongNames,
    VeryLongStrings,
    FileAttributes,
    VariableAttributes,
}

impl TextRecord {
    /// Wraps an extension record's raw payload as text of the given type.
    fn new(extension: Extension, rec_type: TextRecordType) -> Self {
        Self {
            offsets: extension.offsets,
            rec_type,
            text: extension.data.into(),
        }
    }
    /// Dispatches to the type-specific decoder, now that the file's
    /// character encoding is available through `decoder`.
    pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
        match self.rec_type {
            TextRecordType::VariableSets => {
                DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder))
            }
            TextRecordType::ProductInfo => {
                DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder))
            }
            TextRecordType::LongNames => {
                DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder))
            }
            TextRecordType::VeryLongStrings => {
                DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder))
            }
            TextRecordType::FileAttributes => {
                DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder))
            }
            TextRecordType::VariableAttributes => {
                DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder))
            }
        }
    }
}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongString {
+ pub short_name: Identifier,
+ pub length: u16,
+}
+
+impl VeryLongString {
+ fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
+ let Some((short_name, length)) = input.split_once('=') else {
+ return Err(Warning::TBD);
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .map_err(Warning::InvalidLongStringName)?;
+ let length = length.parse().map_err(|_| Warning::TBD)?;
+ Ok(VeryLongString { short_name, length })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VeryLongStringsRecord(Vec<VeryLongString>);
+
+impl VeryLongStringsRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+ let input = decoder.decode(&source.text);
+ let mut very_long_strings = Vec::new();
+ for tuple in input
+ .split('\0')
+ .map(|s| s.trim_end_matches('\t'))
+ .filter(|s| !s.is_empty())
+ {
+ if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
+ very_long_strings.push(vls)
+ }
+ }
+ VeryLongStringsRecord(very_long_strings)
+ }
+}
+
/// A custom attribute: a name plus one or more string values.
#[derive(Clone, Debug)]
pub struct Attribute {
    pub name: Identifier,
    pub values: Vec<String>,
}

impl Attribute {
    /// Parses one attribute of the form `name('value'⏎'value'⏎...)`, where
    /// `⏎` is a newline, returning the attribute and the text following the
    /// closing `)`.
    fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
        let Some((name, mut input)) = input.split_once('(') else {
            return Err(Warning::TBD);
        };
        let name = decoder
            .new_identifier(name)
            .map_err(Warning::InvalidAttributeName)?;
        let mut values = Vec::new();
        loop {
            // Each value occupies one newline-terminated line.
            let Some((value, rest)) = input.split_once('\n') else {
                return Err(Warning::TBD);
            };
            // Values are conventionally single-quoted; accept, but warn
            // about, an unquoted value.
            if let Some(stripped) = value
                .strip_prefix('\'')
                .and_then(|value| value.strip_suffix('\''))
            {
                values.push(stripped.into());
            } else {
                decoder.warn(Warning::TBD);
                values.push(value.into());
            }
            // A `)` immediately after the newline ends the value list.
            if let Some(rest) = rest.strip_prefix(')') {
                let attribute = Attribute { name, values };
                return Ok((attribute, rest));
            };
            input = rest;
        }
    }
}
+
/// A collection of named attributes, as found in file-attribute and
/// variable-attribute records.
#[derive(Clone, Debug, Default)]
pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);

impl AttributeSet {
    /// Parses attributes until end of input or until the next character
    /// equals `sentinel` (`None` means only end of input stops the loop).
    /// Returns the set plus the text following the sentinel.
    fn parse<'a>(
        decoder: &Decoder,
        mut input: &'a str,
        sentinel: Option<char>,
    ) -> Result<(AttributeSet, &'a str), Warning> {
        let mut attributes = HashMap::new();
        let rest = loop {
            match input.chars().next() {
                None => break input,
                // NOTE(review): `&input[1..]` assumes a one-byte (ASCII)
                // sentinel, which holds for the `'/'` that callers pass.
                c if c == sentinel => break &input[1..],
                _ => {
                    let (attribute, rest) = Attribute::parse(decoder, input)?;
                    // XXX report duplicate name
                    attributes.insert(attribute.name, attribute.values);
                    input = rest;
                }
            }
        };
        Ok((AttributeSet(attributes), rest))
    }
}
+
/// Data-file attributes from a subtype-17 text record.
#[derive(Clone, Debug, Default)]
pub struct FileAttributeRecord(pub AttributeSet);

impl FileAttributeRecord {
    /// Decodes the entire payload as one attribute set.  Trailing text draws
    /// a warning; a parse failure yields an empty record (after warning).
    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
        let input = decoder.decode(&source.text);
        match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) {
            Some((set, rest)) => {
                if !rest.is_empty() {
                    decoder.warn(Warning::TBD);
                }
                FileAttributeRecord(set)
            }
            None => FileAttributeRecord::default(),
        }
    }
}
+
+#[derive(Clone, Debug)]
+pub struct VarAttributeSet {
+ pub long_var_name: Identifier,
+ pub attributes: AttributeSet,
+}
+
+impl VarAttributeSet {
+ fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
+ let Some((long_var_name, rest)) = input.split_once(':') else {
+ return Err(Warning::TBD);
+ };
+ let long_var_name = decoder
+ .new_identifier(long_var_name)
+ .map_err(Warning::InvalidAttributeVariableName)?;
+ let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
+ let var_attribute = VarAttributeSet {
+ long_var_name,
+ attributes,
+ };
+ Ok((var_attribute, rest))
+ }
+}
+
/// Per-variable attributes from a subtype-18 text record.
#[derive(Clone, Debug)]
pub struct VariableAttributeRecord(Vec<VarAttributeSet>);

impl VariableAttributeRecord {
    /// Decodes as many variable attribute sets as possible.  Decoding stops
    /// at the first set that fails to parse (after issuing a warning), since
    /// the remaining text cannot be reliably resynchronized.
    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
        let decoded = decoder.decode(&source.text);
        let mut input = decoded.as_ref();
        let mut var_attribute_sets = Vec::new();
        while !input.is_empty() {
            let Some((var_attribute, rest)) =
                VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn)
            else {
                break;
            };
            var_attribute_sets.push(var_attribute);
            input = rest;
        }
        VariableAttributeRecord(var_attribute_sets)
    }
}
+
+#[derive(Clone, Debug)]
+pub struct LongName {
+ pub short_name: Identifier,
+ pub long_name: Identifier,
+}
+
+impl LongName {
+ fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+ let Some((short_name, long_name)) = input.split_once('=') else {
+ return Err(Warning::TBD);
+ };
+ let short_name = decoder
+ .new_identifier(short_name)
+ .map_err(Warning::InvalidShortName)?;
+ let long_name = decoder
+ .new_identifier(long_name)
+ .map_err(Warning::InvalidLongName)?;
+ Ok(LongName {
+ short_name,
+ long_name,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct LongNamesRecord(Vec<LongName>);
+
+impl LongNamesRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
+ let input = decoder.decode(&source.text);
+ let mut names = Vec::new();
+ for pair in input.split('\t').filter(|s| !s.is_empty()) {
+ if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) {
+ names.push(long_name);
+ }
+ }
+ LongNamesRecord(names)
+ }
+}
+
/// Free-text extra product information (subtype 10), decoded into a `String`
/// using the file's character encoding.
#[derive(Clone, Debug)]
pub struct ProductInfoRecord(pub String);

impl ProductInfoRecord {
    /// Decodes the raw record text; this cannot fail.
    fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
        Self(decoder.decode(&source.text).into())
    }
}
+#[derive(Clone, Debug)]
+pub struct VariableSet {
+ pub name: String,
+ pub vars: Vec<Identifier>,
+}
+
+impl VariableSet {
+ fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
+ let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
+ let mut vars = Vec::new();
+ for var in input.split_ascii_whitespace() {
+ if let Some(identifier) = decoder
+ .new_identifier(var)
+ .map_err(Warning::InvalidVariableSetName)
+ .issue_warning(&decoder.warn)
+ {
+ vars.push(identifier);
+ }
+ }
+ Ok(VariableSet {
+ name: name.into(),
+ vars,
+ })
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct VariableSetRecord {
+ pub offsets: Range<u64>,
+ pub sets: Vec<VariableSet>,
+}
+
+impl VariableSetRecord {
+ fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
+ let mut sets = Vec::new();
+ let input = decoder.decode(&source.text);
+ for line in input.lines() {
+ if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
+ sets.push(set)
+ }
+ }
+ VariableSetRecord {
+ offsets: source.offsets.clone(),
+ sets,
+ }
+ }
+}
+
+trait IssueWarning<T> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning);
+}
+impl<T> IssueWarning<T> for Result<T, Warning> {
+ fn issue_warning<F>(self, warn: &F) -> Option<T>
+ where
+ F: Fn(Warning),
+ {
+ match self {
+ Ok(result) => Some(result),
+ Err(error) => {
+ warn(error);
+ None
+ }
+ }
+ }
+}
+
/// A raw extension (type 7) record: a subtype plus an uninterpreted payload
/// of `size * count` bytes.
#[derive(Clone, Debug)]
pub struct Extension {
    /// File offsets spanned by the record's data payload (the
    /// subtype/size/count header words precede `offsets.start`).
    pub offsets: Range<u64>,

    /// Record subtype.
    pub subtype: u32,

    /// Size of each data element.
    pub size: u32,

    /// Number of data elements.
    pub count: u32,

    /// `size * count` bytes of data.
    pub data: Vec<u8>,
}
+
impl Extension {
    /// Validates this record's element `size` and `count` against extension
    /// type `E`'s compile-time expectations (`None` means "don't care"),
    /// returning a warning on mismatch.
    fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
        if let Some(expected_size) = E::SIZE {
            if self.size != expected_size {
                return Err(Warning::BadRecordSize {
                    offset: self.offsets.start,
                    record: E::NAME.into(),
                    size: self.size,
                    expected_size,
                });
            }
        }
        if let Some(expected_count) = E::COUNT {
            if self.count != expected_count {
                return Err(Warning::BadRecordCount {
                    offset: self.offsets.start,
                    record: E::NAME.into(),
                    count: self.count,
                    expected_count,
                });
            }
        }
        Ok(())
    }

    /// Reads one extension record and dispatches on its subtype.  Unknown
    /// subtypes are preserved verbatim as `Record::OtherExtension`; a known
    /// subtype that fails to parse is reported through `warn` and dropped
    /// (`Ok(None)`), so a bad record does not abort reading the file.
    fn read<R: Read + Seek>(
        r: &mut R,
        endian: Endian,
        n_vars: usize,
        warn: &dyn Fn(Warning),
    ) -> Result<Option<Record>, Error> {
        let subtype = endian.parse(read_bytes(r)?);
        let header_offset = r.stream_position()?;
        let size: u32 = endian.parse(read_bytes(r)?);
        let count = endian.parse(read_bytes(r)?);
        // Guard against `size * count` overflowing u32 before allocating.
        let Some(product) = size.checked_mul(count) else {
            return Err(Error::ExtensionRecordTooLarge {
                offset: header_offset,
                subtype,
                size,
                count,
            });
        };
        let start_offset = r.stream_position()?;
        let data = read_vec(r, product as usize)?;
        let end_offset = start_offset + product as u64;
        let extension = Extension {
            offsets: start_offset..end_offset,
            subtype,
            size,
            count,
            data,
        };
        let result = match subtype {
            IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
            FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
            VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
            // Subtype 19 is handled by the same multiple-response parser.
            MultipleResponseRecord::SUBTYPE | 19 => {
                MultipleResponseRecord::parse(&extension, endian)
            }
            LongStringValueLabelRecord::SUBTYPE => {
                LongStringValueLabelRecord::parse(&extension, endian)
            }
            EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
            NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
            // The remaining known subtypes are text records whose parsing is
            // deferred until the character encoding is known.
            5 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::VariableSets,
            ))),
            10 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::ProductInfo,
            ))),
            13 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::LongNames,
            ))),
            14 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::VeryLongStrings,
            ))),
            17 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::FileAttributes,
            ))),
            18 => Ok(Record::Text(TextRecord::new(
                extension,
                TextRecordType::VariableAttributes,
            ))),
            _ => Ok(Record::OtherExtension(extension)),
        };
        // Parse failures are downgraded to warnings.
        match result {
            Ok(result) => Ok(Some(result)),
            Err(error) => {
                warn(error);
                Ok(None)
            }
        }
    }
}
+
/// Header of the ZLIB-compressed data section of a system file.
#[derive(Clone, Debug)]
pub struct ZHeader {
    /// File offset to the start of the record.
    pub offset: u64,

    /// File offset to the ZLIB data header.
    pub zheader_offset: u64,

    /// File offset to the ZLIB trailer.
    pub ztrailer_offset: u64,

    /// Length of the ZLIB trailer in bytes.
    pub ztrailer_len: u64,
}

impl ZHeader {
    /// Reads the three 64-bit header fields at the current file position.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
        let offset = r.stream_position()?;
        let zheader_offset: u64 = endian.parse(read_bytes(r)?);
        let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
        let ztrailer_len: u64 = endian.parse(read_bytes(r)?);

        Ok(ZHeader {
            offset,
            zheader_offset,
            ztrailer_offset,
            ztrailer_len,
        })
    }
}
+
/// ZLIB trailer: global compression parameters plus one descriptor per
/// compressed data block.
#[derive(Clone, Debug)]
pub struct ZTrailer {
    /// File offset to the start of the record.
    pub offset: u64,

    /// Compression bias as a negative integer, e.g. -100.
    pub int_bias: i64,

    /// Always observed as zero.
    pub zero: u64,

    /// Uncompressed size of each block, except possibly the last. Only
    /// `0x3ff000` has been observed so far.
    pub block_size: u32,

    /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
    pub blocks: Vec<ZBlock>,
}
+
/// Descriptor for one ZLIB-compressed data block.
#[derive(Clone, Debug)]
pub struct ZBlock {
    /// Offset of block of data if simple compression were used.
    pub uncompressed_ofs: u64,

    /// Actual offset within the file of the compressed data block.
    pub compressed_ofs: u64,

    /// The number of bytes in this data block after decompression. This is
    /// `block_size` in every data block but the last, which may be smaller.
    pub uncompressed_size: u32,

    /// The number of bytes in this data block, as stored compressed in this
    /// file.
    pub compressed_size: u32,
}

impl ZBlock {
    /// Reads one 24-byte block descriptor at the current position.
    fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
        Ok(ZBlock {
            uncompressed_ofs: endian.parse(read_bytes(r)?),
            compressed_ofs: endian.parse(read_bytes(r)?),
            uncompressed_size: endian.parse(read_bytes(r)?),
            compressed_size: endian.parse(read_bytes(r)?),
        })
    }
}
+
+impl ZTrailer {
+ fn read<R: Read + Seek>(
+ reader: &mut R,
+ endian: Endian,
+ ztrailer_ofs: u64,
+ ztrailer_len: u64,
+ ) -> Result<Option<ZTrailer>, Error> {
+ let start_offset = reader.stream_position()?;
+ if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
+ return Ok(None);
+ }
+ let int_bias = endian.parse(read_bytes(reader)?);
+ let zero = endian.parse(read_bytes(reader)?);
+ let block_size = endian.parse(read_bytes(reader)?);
+ let n_blocks: u32 = endian.parse(read_bytes(reader)?);
+ let expected_n_blocks = (ztrailer_len - 24) / 24;
+ if n_blocks as u64 != expected_n_blocks {
+ return Err(Error::BadZlibTrailerNBlocks {
+ offset: ztrailer_ofs,
+ n_blocks,
+ expected_n_blocks,
+ ztrailer_len,
+ });
+ }
+ let blocks = (0..n_blocks)
+ .map(|_| ZBlock::read(reader, endian))
+ .collect::<Result<Vec<_>, _>>()?;
+ reader.seek(SeekFrom::Start(start_offset))?;
+ Ok(Some(ZTrailer {
+ offset: ztrailer_ofs,
+ int_bias,
+ zero,
+ block_size,
+ blocks,
+ }))
+ }
+}
+
/// Reads exactly `N` bytes, except that a clean end-of-stream before the
/// first byte yields `Ok(None)`.  End-of-stream after at least one byte is an
/// error (from `read_exact`).
fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
    let mut buf = [0; N];
    match r.read(&mut buf)? {
        0 => Ok(None),
        // `read_exact` on the already-filled case (`n == N`) is a no-op.
        n => {
            r.read_exact(&mut buf[n..])?;
            Ok(Some(buf))
        }
    }
}
+
/// Reads exactly `N` bytes into a fixed-size array, failing (with
/// `UnexpectedEof`) on short input.
fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
    let mut buf = [0u8; N];
    r.read_exact(&mut buf).map(|()| buf)
}

/// Reads exactly `n` bytes into a freshly allocated vector.
fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
    let mut vec = vec![0u8; n];
    r.read_exact(&mut vec).map(|()| vec)
}
+
/// Reads a length-prefixed string: a 32-bit byte count followed by that many
/// bytes, returned as an (encoding-agnostic) `RawString`.
fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
    let length: u32 = endian.parse(read_bytes(r)?);
    Ok(read_vec(r, length as usize)?.into())
}
+
/// Value labels for one long string variable, whose labels do not fit in the
/// ordinary value-label record.
#[derive(Clone, Debug)]
pub struct LongStringValueLabels<N, S>
where
    S: Debug,
{
    pub var_name: N,
    pub width: u32,

    /// `(value, label)` pairs, where each value is `width` bytes.
    pub labels: Vec<(S, S)>,
}

impl LongStringValueLabels<RawString, RawString> {
    /// Decodes the variable name, values, and labels using the file's
    /// character encoding.  Fails only if the (trimmed) variable name is not
    /// a valid identifier.
    fn decode(
        &self,
        decoder: &Decoder,
    ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
        let var_name = decoder.decode(&self.var_name);
        let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
            .map_err(Warning::InvalidLongStringValueLabelName)?;

        let mut labels = Vec::with_capacity(self.labels.len());
        for (value, label) in self.labels.iter() {
            // Values are fixed-width fields; labels are ordinary strings.
            let value = decoder.decode_exact_length(&value.0).to_string();
            let label = decoder.decode(label).to_string();
            labels.push((value, label));
        }

        Ok(LongStringValueLabels {
            var_name,
            width: self.width,
            labels,
        })
    }
}
+
/// Extension record (subtype 21) holding long string value labels.
#[derive(Clone, Debug)]
pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
where
    N: Debug,
    S: Debug;

impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
    const SUBTYPE: u32 = 21;
    const SIZE: Option<u32> = Some(1);
    const COUNT: Option<u32> = None;
    const NAME: &'static str = "long string value labels record";

    fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
        ext.check_size::<Self>()?;

        let mut input = &ext.data[..];
        let mut label_set = Vec::new();
        // Entries are packed back to back until the record's data runs out.
        while !input.is_empty() {
            let var_name = read_string(&mut input, endian)?;
            let width: u32 = endian.parse(read_bytes(&mut input)?);
            let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
            let mut labels = Vec::new();
            for _ in 0..n_labels {
                // Each label is a length-prefixed value followed by a
                // length-prefixed label string.
                let value = read_string(&mut input, endian)?;
                let label = read_string(&mut input, endian)?;
                labels.push((value, label));
            }
            label_set.push(LongStringValueLabels {
                var_name,
                width,
                labels,
            })
        }
        Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
            label_set,
        )))
    }
}
+
+impl LongStringValueLabelRecord<RawString, RawString> {
+ fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
+ let mut labels = Vec::with_capacity(self.0.len());
+ for label in &self.0 {
+ match label.decode(decoder) {
+ Ok(set) => labels.push(set),
+ Err(error) => decoder.warn(error),
+ }
+ }
+ LongStringValueLabelRecord(labels)
+ }
+}
--- /dev/null
+use float_next_after::NextAfter;
+use num::{Bounded, Zero};
+use ordered_float::OrderedFloat;
+use std::{
+ collections::{hash_map::Entry, HashMap},
+ error::Error as StdError,
+ fmt::{Display, Formatter, Result as FmtResult},
+ iter::repeat,
+};
+
+use crate::endian::{Endian, ToBytes};
+
+pub type Result<T, F = Error> = std::result::Result<T, F>;
+
/// An error from assembling sack input, tagged with whatever source location
/// information (file, line, offending token) is known.
#[derive(Debug)]
pub struct Error {
    pub file_name: Option<String>,
    pub line_number: Option<usize>,
    pub token: Option<String>,
    pub message: String,
}

impl Error {
    /// Builds an error, copying the optional borrowed location strings into
    /// owned form.
    fn new(
        file_name: Option<&str>,
        line_number: Option<usize>,
        token: Option<&str>,
        message: String,
    ) -> Error {
        Error {
            file_name: file_name.map(ToOwned::to_owned),
            line_number,
            token: token.map(ToOwned::to_owned),
            message,
        }
    }
}

impl StdError for Error {}

impl Display for Error {
    /// Formats as `FILE:LINE: at 'TOKEN': MESSAGE`, omitting whichever
    /// location parts are unknown (`line N:` when only the line is known).
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        if let Some(file_name) = &self.file_name {
            match self.line_number {
                Some(line_number) => write!(f, "{file_name}:{line_number}: ")?,
                None => write!(f, "{file_name}: ")?,
            }
        } else if let Some(line_number) = self.line_number {
            write!(f, "line {line_number}: ")?;
        }
        if let Some(token) = &self.token {
            write!(f, "at '{token}': ")?;
        }
        write!(f, "{}", self.message)
    }
}
+
+pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
+ let mut symbol_table = HashMap::new();
+ let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
+ let output = if !symbol_table.is_empty() {
+ for (k, v) in symbol_table.iter() {
+ println!("{k} => {v:?}");
+ }
+ for (k, v) in symbol_table.iter() {
+ if v.is_none() {
+ Err(Error::new(
+ input_file_name,
+ None,
+ None,
+ format!("label {k} used but never defined"),
+ ))?
+ }
+ }
+ _sack(input, input_file_name, endian, &mut symbol_table)?
+ } else {
+ output
+ };
+ Ok(output)
+}
+
/// One assembly pass: lexes `input` and emits data items into a byte vector,
/// recording label definitions and resolving label references through
/// `symbol_table`.
fn _sack(
    input: &str,
    input_file_name: Option<&str>,
    endian: Endian,
    symbol_table: &mut HashMap<String, Option<u32>>,
) -> Result<Vec<u8>> {
    let mut lexer = Lexer::new(input, input_file_name, endian)?;
    let mut output = Vec::new();
    while parse_data_item(&mut lexer, &mut output, symbol_table)? {}
    Ok(output)
}
+
+fn parse_data_item(
+ lexer: &mut Lexer,
+ output: &mut Vec<u8>,
+ symbol_table: &mut HashMap<String, Option<u32>>,
+) -> Result<bool> {
+ if lexer.token.is_none() {
+ return Ok(false);
+ };
+
+ let initial_len = output.len();
+ match lexer.take()? {
+ Token::Integer(integer) => {
+ if let Ok(integer) = TryInto::<i32>::try_into(integer) {
+ output.extend_from_slice(&lexer.endian.to_bytes(integer));
+ } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
+ output.extend_from_slice(&lexer.endian.to_bytes(integer));
+ } else {
+ Err(lexer.error(format!(
+ "{integer} is not in the valid range [{},{}]",
+ i32::min_value(),
+ u32::max_value()
+ )))?;
+ };
+ }
+ Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
+ Token::PcSysmis => {
+ output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
+ }
+ Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
+ Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
+ Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
+ Token::String(string) => output.extend_from_slice(string.as_bytes()),
+ Token::S(size) => {
+ let Some((Token::String(ref string), _)) = lexer.token else {
+ Err(lexer.error(format!("string expected after 's{size}'")))?
+ };
+ let len = string.len();
+ if len > size {
+ Err(lexer.error(format!(
+ "{len}-byte string is longer than pad length {size}"
+ )))?
+ }
+ output.extend_from_slice(string.as_bytes());
+ output.extend(repeat(b' ').take(size - len));
+ lexer.get()?;
+ }
+ Token::LParen => {
+ while !matches!(lexer.token, Some((Token::RParen, _))) {
+ parse_data_item(lexer, output, symbol_table)?;
+ }
+ lexer.get()?;
+ }
+ Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
+ Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
+ Token::Hex => {
+ let Some((Token::String(ref string), _)) = lexer.token else {
+ Err(lexer.error(String::from("string expected after 'hex'")))?
+ };
+ let mut string = &string[..];
+ loop {
+ string = string.trim_start();
+ if string.is_empty() {
+ break;
+ };
+
+ let mut i = string.chars();
+ let Some(c0) = i.next() else { return Ok(true) };
+ let Some(c1) = i.next() else {
+ Err(lexer.error(String::from("hex string has odd number of characters")))?
+ };
+
+ let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
+ Err(lexer.error(String::from("invalid digit in hex string")))?
+ };
+ let byte = digit0 * 16 + digit1;
+ output.push(byte as u8);
+
+ string = i.as_str();
+ }
+ lexer.get()?;
+ }
+ Token::Label(name) => {
+ println!("define {name}");
+ let value = output.len() as u32;
+ match symbol_table.entry(name.clone()) {
+ Entry::Vacant(v) => {
+ v.insert(Some(value));
+ }
+ Entry::Occupied(mut o) => {
+ match o.get() {
+ Some(v) => {
+ if *v != value {
+ Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
+ }
+ }
+ None => drop(o.insert(Some(value))),
+ }
+ }
+ };
+ return Ok(true);
+ }
+ Token::At(name) => {
+ let mut value = *symbol_table.entry(name.clone()).or_insert(None);
+ loop {
+ let plus = match lexer.token {
+ Some((Token::Plus, _)) => true,
+ Some((Token::Minus, _)) => false,
+ _ => break,
+ };
+ lexer.get()?;
+
+ let operand = match lexer.token {
+ Some((Token::At(ref name), _)) => {
+ *symbol_table.entry(name.clone()).or_insert(None)
+ }
+ Some((Token::Integer(integer), _)) => Some(
+ integer
+ .try_into()
+ .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
+ ),
+ _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
+ };
+ lexer.get()?;
+
+ value = match (value, operand) {
+ (Some(a), Some(b)) => Some(
+ if plus {
+ a.checked_add(b)
+ } else {
+ a.checked_sub(b)
+ }
+ .ok_or_else(|| {
+ lexer.error(String::from("overflow in offset arithmetic"))
+ })?,
+ ),
+ _ => None,
+ };
+ }
+ let value = value.unwrap_or(0);
+ output.extend_from_slice(&lexer.endian.to_bytes(value));
+ }
+ _ => (),
+ };
+ if let Some((Token::Asterisk, _)) = lexer.token {
+ lexer.get()?;
+ let Token::Integer(count) = lexer.take()? else {
+ Err(lexer.error(String::from("positive integer expected after '*'")))?
+ };
+ if count < 1 {
+ Err(lexer.error(String::from("positive integer expected after '*'")))?
+ };
+ let final_len = output.len();
+ for _ in 1..count {
+ output.extend_from_within(initial_len..final_len);
+ }
+ }
+ match lexer.token {
+ Some((Token::Semicolon, _)) => {
+ lexer.get()?;
+ }
+ Some((Token::RParen, _)) => (),
+ _ => Err(lexer.error(String::from("';' expected")))?,
+ }
+ Ok(true)
+}
+
/// Implements `COUNT`/`COUNT8`: emits a zero placeholder of type `T`, parses
/// the following parenthesized group of items, then backpatches the
/// placeholder with the number of bytes the group produced.
fn put_counted_items<T, const N: usize>(
    lexer: &mut Lexer,
    name: &str,
    output: &mut Vec<u8>,
    symbol_table: &mut HashMap<String, Option<u32>>,
) -> Result<()>
where
    T: Zero + TryFrom<usize>,
    Endian: ToBytes<T, N>,
{
    // Reserve space for the count; it is rewritten once the size is known.
    let old_size = output.len();
    output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
    let start = output.len();
    if !matches!(lexer.token, Some((Token::LParen, _))) {
        Err(lexer.error(format!("'(' expected after '{name}'")))?
    }
    lexer.get()?;
    while !matches!(lexer.token, Some((Token::RParen, _))) {
        parse_data_item(lexer, output, symbol_table)?;
    }
    lexer.get()?;
    // The count must fit in `T` (e.g. 255 for COUNT8).
    let delta = output.len() - start;
    let Ok(delta): Result<T, _> = delta.try_into() else {
        Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
    };
    // Backpatch the reserved bytes with the actual count.
    let dest = &mut output[old_size..old_size + N];
    dest.copy_from_slice(&lexer.endian.to_bytes(delta));
    Ok(())
}
+
+fn put_integers<T, const N: usize>(
+ lexer: &mut Lexer,
+ name: &str,
+ output: &mut Vec<u8>,
+) -> Result<()>
+where
+ T: Bounded + Display + TryFrom<i64> + Copy,
+ Endian: ToBytes<T, N>,
+{
+ println!("put_integers {:?}", lexer.token);
+ let mut n = 0;
+ while let Some(integer) = lexer.take_if(|t| match t {
+ Token::Integer(integer) => Some(*integer),
+ _ => None,
+ })? {
+ println!("got integer {integer}");
+ let Ok(integer) = integer.try_into() else {
+ Err(lexer.error(format!(
+ "{integer} is not in the valid range [{},{}]",
+ T::min_value(),
+ T::max_value()
+ )))?
+ };
+ output.extend_from_slice(&lexer.endian.to_bytes(integer));
+ n += 1;
+ }
+ println!("put_integers {:?} {n}", lexer.token);
+ if n == 0 {
+ Err(lexer.error(format!("integer expected after '{name}'")))?
+ }
+ Ok(())
+}
+
/// Lexical token of the sack input language.
#[derive(PartialEq, Eq, Clone, Debug)]
enum Token {
    Integer(i64),
    Float(OrderedFloat<f64>),
    PcSysmis,      // The PC+ system-missing value constant.
    String(String),
    Semicolon,
    Asterisk,
    LParen,
    RParen,
    I8,            // Keywords selecting the width of integer items.
    I16,
    I64,
    S(usize),      // `sN`: string space-padded to N bytes.
    Count,         // 32-bit byte count of the following group.
    Count8,        // 8-bit byte count of the following group.
    Hex,           // Hex-encoded string data.
    Label(String), // `name:` label definition.
    At(String),    // `@name` label reference.
    Minus,
    Plus,
}

/// Hand-written single-lookahead lexer over the sack input text.
struct Lexer<'a> {
    input: &'a str,                  // Unconsumed input.
    token: Option<(Token, &'a str)>, // Lookahead token plus its source text
                                     // (quoted in error messages).
    input_file_name: Option<&'a str>,
    line_number: usize,              // 1-based, for error messages.
    endian: Endian,
}
+
/// Skips leading whitespace, blank lines, and `#`-to-end-of-line comments,
/// returning the remaining text and the number of newlines crossed (so the
/// caller can keep its line count accurate).  The characters `<` and `>` are
/// also treated as ignorable filler.
fn skip_comments(mut s: &str) -> (&str, usize) {
    let mut newlines = 0;
    loop {
        s = s.trim_start_matches([' ', '\t', '\r', '<', '>']);
        if let Some(comment) = s.strip_prefix('#') {
            // A comment runs to the newline; an unterminated one ends input.
            match comment.split_once('\n') {
                Some((_, rest)) => {
                    s = rest;
                    newlines += 1;
                }
                None => return ("", newlines),
            }
        } else if let Some(rest) = s.strip_prefix('\n') {
            s = rest;
            newlines += 1;
        } else {
            return (s, newlines);
        }
    }
}
+
+impl<'a> Lexer<'a> {
    /// Creates a lexer over `input` and primes the one-token lookahead.
    fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
        let mut lexer = Lexer {
            input,
            token: None,
            input_file_name,
            line_number: 1,
            endian,
        };
        lexer.token = lexer.next()?;
        Ok(lexer)
    }
    /// Builds an `Error` at the current line, quoting the current token's
    /// source text (if any) as context.
    fn error(&self, message: String) -> Error {
        let repr = self.token.as_ref().map(|(_, repr)| *repr);
        Error::new(self.input_file_name, Some(self.line_number), repr, message)
    }
    /// Consumes and returns the current token; fails at end of input.
    fn take(&mut self) -> Result<Token> {
        let Some(token) = self.token.take() else {
            Err(self.error(String::from("unexpected end of input")))?
        };
        self.token = self.next()?;
        Ok(token.0)
    }
    /// Consumes the current token only when `condition` maps it to `Some`,
    /// returning the mapped value; otherwise the token stays in place.
    fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
    where
        F: FnOnce(&Token) -> Option<T>,
    {
        let Some(ref token) = self.token else {
            return Ok(None);
        };
        match condition(&token.0) {
            Some(value) => {
                self.token = self.next()?;
                Ok(Some(value))
            }
            None => Ok(None),
        }
    }
    /// Advances past the current token (which must exist) and returns a
    /// reference to the new current token, or `None` at end of input.
    fn get(&mut self) -> Result<Option<&Token>> {
        if self.token.is_none() {
            Err(self.error(String::from("unexpected end of input")))?
        } else {
            self.token = self.next()?;
            match self.token {
                Some((ref token, _)) => Ok(Some(token)),
                None => Ok(None),
            }
        }
    }
+
+ fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
+ // Get the first character of the token, skipping past white space and
+ // comments.
+ let (s, n_newlines) = skip_comments(self.input);
+ self.line_number += n_newlines;
+ self.input = s;
+
+ let start = s;
+ let mut iter = s.chars();
+ let Some(c) = iter.next() else {
+ return Ok(None);
+ };
+ let (token, rest) = match c {
+ c if c.is_ascii_digit() || c == '-' => {
+ let len = s
+ .find(|c: char| {
+ !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
+ })
+ .unwrap_or(s.len());
+ let (number, rest) = s.split_at(len);
+ let token = if number == "-" {
+ Token::Minus
+ } else if let Some(digits) = number.strip_prefix("0x") {
+ Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
+ self.error(format!("bad integer literal '{number}' ({msg})"))
+ })?)
+ } else if !number.contains('.') {
+ Token::Integer(number.parse().map_err(|msg| {
+ self.error(format!("bad integer literal '{number}' ({msg})"))
+ })?)
+ } else {
+ Token::Float(number.parse().map_err(|msg| {
+ self.error(format!("bad float literal '{number}' ({msg})"))
+ })?)
+ };
+ (token, rest)
+ }
+ '"' => {
+ let s = iter.as_str();
+ let Some(len) = s.find(['\n', '"']) else {
+ Err(self.error(String::from("end-of-file inside string")))?
+ };
+ let (string, rest) = s.split_at(len);
+ let Some(rest) = rest.strip_prefix('"') else {
+ Err(self.error(format!("new-line inside string ({string}...{rest})")))?
+ };
+ (Token::String(string.into()), rest)
+ }
+ ';' => (Token::Semicolon, iter.as_str()),
+ '*' => (Token::Asterisk, iter.as_str()),
+ '+' => (Token::Plus, iter.as_str()),
+ '(' => (Token::LParen, iter.as_str()),
+ ')' => (Token::RParen, iter.as_str()),
+ c if c.is_alphabetic() || c == '@' || c == '_' => {
+ let len = s
+ .find(|c: char| {
+ !(c.is_ascii_digit()
+ || c.is_alphabetic()
+ || c == '@'
+ || c == '.'
+ || c == '_')
+ })
+ .unwrap_or(s.len());
+ let (s, rest) = s.split_at(len);
+ if let Some(rest) = rest.strip_prefix(':') {
+ (Token::Label(s.into()), rest)
+ } else if let Some(name) = s.strip_prefix('@') {
+ (Token::At(name.into()), rest)
+ } else if let Some(count) = s.strip_prefix('s') {
+ let token =
+ Token::S(count.parse().map_err(|msg| {
+ self.error(format!("bad counted string '{s}' ({msg})"))
+ })?);
+ (token, rest)
+ } else {
+ let token = match s {
+ "i8" => Token::I8,
+ "i16" => Token::I16,
+ "i64" => Token::I64,
+ "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
+ "PCSYSMIS" => Token::PcSysmis,
+ "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
+ "HIGHEST" => Token::Float(f64::MAX.into()),
+ "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
+ "COUNT" => Token::Count,
+ "COUNT8" => Token::Count8,
+ "hex" => Token::Hex,
+ _ => Err(self.error(format!("invalid token '{s}'")))?,
+ };
+ (token, rest)
+ }
+ }
+ _ => Err(self.error(format!("invalid input byte '{c}'")))?,
+ };
+ self.input = rest;
+ let repr = &start[..start.len() - rest.len()];
+ println!("{token:?} {repr}");
+ Ok(Some((token, repr)))
+ }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::endian::Endian;
+    use crate::sack::sack;
+    use anyhow::Result;
+    use hexplay::HexView;
+
+    // NOTE(review): both tests are smoke tests -- they check that `sack`
+    // succeeds and print a hex dump, but do not assert on the output bytes.
+
+    /// Assembles the start of a system-file header from sack syntax.
+    #[test]
+    fn basic_sack() -> Result<()> {
+        let input = r#"
+"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; # Layout code
+28; # Nominal case size
+0; # Not compressed
+0; # Not weighted
+1; # 1 case.
+100.0; # Bias.
+"01 Jan 11"; "20:53:52";
+"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
+i8 0 *3;
+"#;
+        let output = sack(input, None, Endian::Big)?;
+        HexView::new(&output).print()?;
+        Ok(())
+    }
+
+    /// Assembles a complete SPSS/PC+ file, exercising labels (`NAME:`),
+    /// `@NAME` references and arithmetic, `COUNT8`, and repeat counts.
+    #[test]
+    fn pcp_sack() -> Result<()> {
+        let input = r#"
+# File header.
+2; 0;
+@MAIN; @MAIN_END - @MAIN;
+@VARS; @VARS_END - @VARS;
+@LABELS; @LABELS_END - @LABELS;
+@DATA; @DATA_END - @DATA;
+(0; 0) * 11;
+i8 0 * 128;
+
+MAIN:
+  i16 1; # Fixed.
+  s62 "PCSPSS PSPP synthetic test product";
+  PCSYSMIS;
+  0; 0; i16 1; # Fixed.
+  i16 0;
+  i16 15;
+  1;
+  i16 0; # Fixed.
+  1;
+  s8 "11/28/14";
+  s8 "15:11:00";
+  s64 "PSPP synthetic test file";
+MAIN_END:
+
+VARS:
+  0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
+  0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
+  0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
+
+  # Numeric variable, no label or missing values.
+  0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
+
+  # Numeric variable, variable label.
+  0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
+
+  # Numeric variable with missing value.
+  0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
+
+  # Numeric variable, variable label and missing value.
+  0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
+
+  # String variable, no label or missing values.
+  0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
+
+  # String variable, variable label.
+  0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
+
+  # String variable with missing value.
+  0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
+
+  # String variable, variable label and missing value.
+  0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
+
+  # Long string variable
+  0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
+  0 * 8;
+
+  # Long string variable with variable label
+  0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
+  0 * 8;
+VARS_END:
+
+LABELS:
+  3; i8 0 0 0; LABELS_OFS: i8 0;
+  NUM2_LABEL: COUNT8("Numeric variable 2's label");
+  NUM4_LABEL: COUNT8("Another numeric variable label");
+  STR2_LABEL: COUNT8("STR2's variable label");
+  STR4_LABEL: COUNT8("STR4's variable label");
+  STR6_LABEL: COUNT8("Another string variable's label");
+LABELS_END:
+
+DATA:
+  0.0; "11/28/14"; 1.0;
+  0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
+  s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
+DATA_END:
+"#;
+        let output = sack(input, None, Endian::Big)?;
+        HexView::new(&output).print()?;
+        Ok(())
+    }
+}
--- /dev/null
+use std::sync::OnceLock;
+
+use enum_map::EnumMap;
+
+use crate::{
+ endian::Endian,
+ format::{Format, Settings as FormatSettings},
+ message::Severity,
+};
+
+/// Global user-visible settings (presumably the state behind the `SET`
+/// command -- TODO confirm against the command implementation).
+pub struct Settings {
+    /// Byte order for reading binary integers.
+    pub input_integer_format: Endian,
+    /// Byte order for reading binary floating-point numbers.
+    pub input_float_format: Endian,
+    /// Byte order for writing binary integers.
+    pub output_integer_format: Endian,
+    /// Byte order for writing binary floating-point numbers.
+    pub output_float_format: Endian,
+
+    /// `MDISPLAY`: how to display matrices in `MATRIX`...`END MATRIX`.
+    pub matrix_display: MatrixDisplay,
+
+    /// Screen height in lines (defaults to 24).
+    pub view_length: usize,
+    /// Screen width in columns (defaults to 79).
+    pub view_width: usize,
+    /// Disable unsafe operations?  (presumably `SET SAFER` -- confirm)
+    pub safer: bool,
+    /// NOTE(review): presumably controls `INCLUDE`-style file handling --
+    /// confirm.
+    pub include: bool,
+    /// Route error messages to the terminal?
+    pub route_errors_to_terminal: bool,
+    /// Route error messages to the listing file?
+    pub route_errors_to_listing: bool,
+    /// Compress system files on output?  (TODO confirm semantics)
+    pub scompress: bool,
+    /// NOTE(review): semantics not visible here -- confirm.
+    pub undefined: bool,
+    /// Value substituted for blank numeric input fields; `None` presumably
+    /// means system-missing -- confirm.
+    pub blanks: Option<f64>,
+    /// Maximum number of messages to report, per severity.
+    pub max_messages: EnumMap<Severity, usize>,
+    /// Print input back to the output?  (TODO confirm)
+    pub printback: bool,
+    /// Macro expansion settings.
+    pub macros: MacroSettings,
+    /// Maximum loop iterations (defaults to 40).
+    pub max_loops: usize,
+    /// Workspace size in bytes (defaults to 64 MiB).
+    pub workspace: usize,
+    /// Default numeric output format (defaults to F8.2).
+    pub default_format: Format,
+    /// Enable testing-only behavior?
+    pub testing: bool,
+    /// NOTE(review): presumably `SET FUZZBITS` -- semantics not visible here.
+    pub fuzz_bits: usize,
+    /// NOTE(review): presumably minimum scale for output -- confirm.
+    pub scale_min: usize,
+    /// Compatibility level for command interpretation.
+    pub commands: Compatibility,
+    /// Overall compatibility level.
+    pub global: Compatibility,
+    /// Compatibility level for syntax interpretation.
+    pub syntax: Compatibility,
+    /// Settings for parsing and displaying data formats.
+    pub formats: FormatSettings,
+    /// "Small" number threshold (defaults to 0.0001); exact use is not
+    /// visible here -- TODO confirm.
+    pub small: f64,
+}
+
+impl Default for Settings {
+    /// Returns the out-of-the-box settings: native byte orders, a 24x79
+    /// screen, 100 messages per severity, 64 MiB workspace, F8.2 default
+    /// format, and enhanced compatibility everywhere.
+    fn default() -> Self {
+        Self {
+            input_integer_format: Endian::NATIVE,
+            input_float_format: Endian::NATIVE,
+            output_integer_format: Endian::NATIVE,
+            output_float_format: Endian::NATIVE,
+            matrix_display: MatrixDisplay::default(),
+            view_length: 24,
+            view_width: 79,
+            safer: false,
+            include: true,
+            route_errors_to_terminal: true,
+            route_errors_to_listing: true,
+            scompress: true,
+            undefined: true,
+            blanks: None,
+            max_messages: EnumMap::from_fn(|_| 100),
+            printback: true,
+            macros: MacroSettings::default(),
+            max_loops: 40,
+            workspace: 64 * 1024 * 1024,
+            default_format: Format::F8_2,
+            testing: false,
+            fuzz_bits: 6,
+            scale_min: 24,
+            commands: Compatibility::Enhanced,
+            global: Compatibility::Enhanced,
+            syntax: Compatibility::Enhanced,
+            formats: FormatSettings::default(),
+            small: 0.0001,
+        }
+    }
+}
+
+impl Settings {
+    /// Returns the process-wide shared [`Settings`], initialized to
+    /// [`Settings::default`] on first use.
+    pub fn global() -> &'static Settings {
+        static GLOBAL: OnceLock<Settings> = OnceLock::new();
+        // `get_or_init` already returns `&'static Settings`; the previous
+        // extra `&` relied on deref coercion, and the closure was redundant.
+        GLOBAL.get_or_init(Settings::default)
+    }
+}
+
+/// Degree of compatibility to maintain (presumably with the original SPSS
+/// behavior -- TODO confirm where this is consulted).
+pub enum Compatibility {
+    /// Favor compatible behavior.
+    Compatible,
+    /// Allow enhanced behavior (the default used in [`Settings::default`]).
+    Enhanced,
+}
+
+/// Settings that control macro expansion.
+pub struct MacroSettings {
+    /// Expand macros?
+    pub expand: bool,
+
+    /// Print macro expansions?
+    pub print_expansions: bool,
+
+    /// Maximum iterations of `!FOR`.
+    pub max_iterations: usize,
+
+    /// Maximum nested macro expansion levels.
+    pub max_nest: usize,
+}
+
+impl Default for MacroSettings {
+    /// Default macro behavior: expand silently, up to 1000 `!FOR`
+    /// iterations and 50 nesting levels.
+    fn default() -> Self {
+        Self {
+            expand: true,
+            print_expansions: false,
+            max_iterations: 1000,
+            max_nest: 50,
+        }
+    }
+}
+
+/// How to display matrices in `MATRIX`...`END MATRIX`.
+#[derive(Default)]
+pub enum MatrixDisplay {
+    /// Output matrices as text (the default).
+    #[default]
+    Text,
+
+    /// Output matrices as pivot tables.
+    Tables,
+}
+
+/// Broad categories of output (presumably used for routing or filtering
+/// messages -- the consumer is not visible here).
+pub enum OutputType {
+    /// Errors and warnings.
+    Error,
+
+    /// Notes.
+    Notes,
+
+    /// Syntax printback.
+    Syntax,
+
+    /// Everything else.
+    Other,
+}
--- /dev/null
+use std::fs::read_to_string;
+use std::path::PathBuf;
+
+use anyhow::{anyhow, Result};
+use clap::Parser;
+use pspp::endian::Endian;
+use pspp::sack::sack;
+
+/// SAv Construction Kit
+///
+/// The input is a sequence of data items, each followed by a semicolon.  Each
+/// data item is converted to the output format and written on stdout.  A data
+/// item is one of the following:
+///
+/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal
+///   prefixed by `0`.  Output as a 32-bit binary integer.
+// NOTE(review): the lexer visible in this patch parses only decimal and
+// `0x`-prefixed hex; a `0`-prefixed literal is parsed as decimal.  Either
+// implement octal or drop the claim above -- TODO confirm.
+///
+/// - A floating-point number.  Output in 64-bit IEEE 754 format.
+///
+/// - A string enclosed in double quotes.  Output literally.  There is no
+///   syntax for "escapes".  Strings may not contain new-lines.
+///
+/// - A literal of the form `s<number>` followed by a quoted string as above.
+///   Output as the string's contents followed by enough spaces to fill up
+///   `<number>` bytes.  For example, `s8 "foo"` is output as `foo` followed
+///   by 5 spaces.
+///
+/// - The literal `i8`, `i16`, or `i64` followed by an integer.  Output
+///   as a binary integer with the specified number of bits.
+///
+/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`.  Output as a
+///   64-bit IEEE 754 float of the appropriate PSPP value.
+///
+/// - `PCSYSMIS`.  Output as SPSS/PC+ system-missing value.
+///
+/// - The literal `ENDIAN`.  Output as a 32-bit binary integer, either with
+///   value 1 if `--be` is in effect or 2 if `--le` is in effect.
+///
+/// - A pair of parentheses enclosing a sequence of data items, each followed
+///   by a semicolon (the last semicolon is optional).  Output as the enclosed
+///   data items in sequence.
+///
+/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized
+///   data items, as above.  Output as a 32-bit or 8-bit binary integer whose
+///   value is the number of bytes enclosed within the parentheses, followed
+///   by the enclosed data items themselves.
+///
+/// Any data item may be optionally followed by an asterisk and a positive
+/// integer, which specifies a repeat count for the data item.
+#[derive(Parser, Debug)]
+struct Args {
+    /// Big-endian output format (default)
+    #[arg(long = "be")]
+    be: bool,
+
+    /// Little-endian output format
+    #[arg(long = "le")]
+    le: bool,
+
+    /// Input file.
+    #[arg(required = true, name = "input")]
+    input_file_name: PathBuf,
+
+    /// Output file.
+    #[arg(required = true, name = "output")]
+    output_file_name: PathBuf,
+}
+
+/// Entry point: parses the command line, reads the sack source file,
+/// assembles it with [`sack`], and writes the binary result to the output
+/// file.
+fn main() -> Result<()> {
+    let Args {
+        be,
+        le,
+        input_file_name,
+        output_file_name,
+    } = Args::parse();
+    // Big-endian is the default when neither flag is given; giving both is
+    // an error.
+    let endian = match (be, le) {
+        (false, false) | (true, false) => Endian::Big,
+        (false, true) => Endian::Little,
+        (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")),
+    };
+
+    let input_file_str = input_file_name.to_string_lossy();
+    let input = read_to_string(&input_file_name)
+        .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?;
+
+    let output = sack(&input, Some(&input_file_str), endian)?;
+
+    let output_file_str = output_file_name.to_string_lossy();
+    std::fs::write(&output_file_name, output)
+        .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?;
+
+    Ok(())
+}
+++ /dev/null
-use std::{fmt::Write, sync::OnceLock};
-
-use flagset::{flags, FlagSet};
-
-use crate::{
- integer::ToInteger,
- lex::{
- command_name::CommandMatcher,
- lexer::Lexer,
- token::{Punct, Token},
- },
- message::Diagnostic,
-};
-
-flags! {
- enum State: u8 {
- /// No active dataset yet defined.
- Initial,
-
- /// Active dataset has been defined.
- Data,
-
- /// Inside `INPUT PROGRAM`.
- InputProgram,
-
- /// Inside `FILE TYPE`.
- FileType,
-
- /// State nested inside `LOOP` or `DO IF`, inside [State::Data].
- NestedData,
-
- /// State nested inside `LOOP` or `DO IF`, inside [State::InputProgram].
- NestedInputProgram,
- }
-}
-
-struct Command {
- allowed_states: FlagSet<State>,
- enhanced_only: bool,
- testing_only: bool,
- no_abbrev: bool,
- name: &'static str,
- run: Box<dyn Fn(&Context) + Send + Sync>,
-}
-
-fn commands() -> &'static [Command] {
- fn new_commands() -> Vec<Command> {
- vec![Command {
- allowed_states: State::Initial | State::Data,
- enhanced_only: false,
- testing_only: false,
- no_abbrev: false,
- name: "ECHO",
- run: Box::new(|_context| {
- println!("hi");
- }),
- }]
- }
-
- static COMMANDS: OnceLock<Vec<Command>> = OnceLock::new();
- COMMANDS.get_or_init(|| new_commands()).as_slice()
-}
-
-fn parse_command_word(lexer: &mut Lexer, s: &mut String, n: isize) -> bool {
- let separator = match s.chars().next_back() {
- Some(c) if c != '-' => " ",
- _ => "",
- };
-
- match lexer.next(n) {
- Token::Punct(Punct::Dash) => {
- s.push('-');
- true
- }
- Token::Id(id) => {
- write!(s, "{separator}{id}").unwrap();
- true
- }
- Token::Number(number) if number.is_sign_positive() => {
- if let Some(integer) = number.to_exact_usize() {
- write!(s, "{separator}{integer}").unwrap();
- true
- } else {
- false
- }
- }
- _ => false,
- }
-}
-
-fn find_best_match(s: &str) -> (Option<&'static Command>, isize) {
- let mut cm = CommandMatcher::new(s);
- for command in commands() {
- cm.add(command.name, command);
- }
- cm.get_match()
-}
-
-fn parse_command_name(
- lexer: &mut Lexer,
- error: &Box<dyn Fn(Diagnostic)>,
-) -> Result<(&'static Command, isize), ()> {
- let mut s = String::new();
- let mut word = 0;
- let mut missing_words = 0;
- let mut command = None;
- while parse_command_word(lexer, &mut s, word) {
- (command, missing_words) = find_best_match(&s);
- if missing_words <= 0 {
- break;
- }
- word += 1;
- }
- if command.is_none() && missing_words > 0 {
- s.push_str(" .");
- (command, missing_words) = find_best_match(&s);
- s.truncate(s.len() - 2);
- }
-
- match command {
- Some(command) => Ok((command, (word + 1) + missing_words)),
- None => {
- if s.is_empty() {
- error(lexer.error("Syntax error expecting command name"))
- } else {
- error(lexer.error("Unknown command `{s}`."))
- };
- Err(())
- }
- }
-}
-
-pub enum Success {
- Success,
- Eof,
- Finish,
-}
-
-pub fn end_of_command(context: &Context) -> Result<Success, ()> {
- match context.lexer.token() {
- Token::EndCommand | Token::End => Ok(Success::Success),
- _ => {
- context.error(
- context
- .lexer
- .error("Syntax error expecting end of command."),
- );
- Err(())
- }
- }
-}
-
-fn parse_in_state(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>, _state: State) {
- match lexer.token() {
- Token::End | Token::EndCommand => (),
- _ => {
- if let Ok((command, n_tokens)) = parse_command_name(lexer, error) {
- for _ in 0..n_tokens {
- lexer.get();
- }
- let context = Context {
- error,
- lexer,
- command_name: Some(command.name),
- };
- (command.run)(&context);
- end_of_command(&context);
- }
- lexer.interactive_reset();
- lexer.discard_rest_of_command();
- }
- }
- while let Token::EndCommand = lexer.token() {
- lexer.get();
- }
-}
-
-pub fn parse(lexer: &mut Lexer, error: &Box<dyn Fn(Diagnostic)>) {
- parse_in_state(lexer, error, State::Initial)
-}
-
-pub struct Context<'a> {
- error: &'a Box<dyn Fn(Diagnostic)>,
- lexer: &'a mut Lexer,
- command_name: Option<&'static str>,
-}
-
-impl<'a> Context<'a> {
- pub fn error(&self, diagnostic: Diagnostic) {
- (self.error)(diagnostic);
- }
-}
+++ /dev/null
-use std::{cell::RefCell, collections::HashMap, ops::Range, rc::Rc};
-
-use crate::{
- dictionary::{Dictionary, VarWidth, Variable},
- encoding::Error as EncodingError,
- endian::Endian,
- format::{Error as FormatError, Format, UncheckedFormat},
- identifier::{Error as IdError, Identifier},
- raw::{
- self, Cases, DecodedRecord, DocumentRecord, EncodingRecord, Extension, FileAttributeRecord,
- FloatInfoRecord, HeaderRecord, IntegerInfoRecord, LongNamesRecord,
- LongStringMissingValueRecord, LongStringValueLabelRecord, MultipleResponseRecord,
- NumberOfCasesRecord, ProductInfoRecord, RawStr, ValueLabel, ValueLabelRecord,
- VarDisplayRecord, VariableAttributeRecord, VariableRecord, VariableSetRecord,
- VeryLongStringsRecord, ZHeader, ZTrailer,
- },
-};
-use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
-use encoding_rs::Encoding;
-use num::Integer;
-use thiserror::Error as ThisError;
-
-pub use crate::raw::{CategoryLabels, Compression};
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Missing header record")]
- MissingHeaderRecord,
-
- // XXX this is an internal error
- #[error("More than one file header record")]
- DuplicateHeaderRecord,
-
- #[error("{0}")]
- EncodingError(EncodingError),
-
- #[error("Using default encoding {0}.")]
- UsingDefaultEncoding(String),
-
- #[error("Variable record from offset {:x} to {:x} specifies width {width} not in valid range [-1,255).", offsets.start, offsets.end)]
- InvalidVariableWidth { offsets: Range<u64>, width: i32 },
-
- #[error("This file has corrupted metadata written by a buggy version of PSPP. To ensure that other software can read it correctly, save a new copy of the file.")]
- InvalidLongMissingValueFormat,
-
- #[error("File creation date {creation_date} is not in the expected format \"DD MMM YY\" format. Using 01 Jan 1970.")]
- InvalidCreationDate { creation_date: String },
-
- #[error("File creation time {creation_time} is not in the expected format \"HH:MM:SS\" format. Using midnight.")]
- InvalidCreationTime { creation_time: String },
-
- #[error("{id_error} Renaming variable to {new_name}.")]
- InvalidVariableName {
- id_error: IdError,
- new_name: Identifier,
- },
-
- #[error(
- "Substituting {new_spec} for invalid print format on variable {variable}. {format_error}"
- )]
- InvalidPrintFormat {
- new_spec: Format,
- variable: Identifier,
- format_error: FormatError,
- },
-
- #[error(
- "Substituting {new_spec} for invalid write format on variable {variable}. {format_error}"
- )]
- InvalidWriteFormat {
- new_spec: Format,
- variable: Identifier,
- format_error: FormatError,
- },
-
- #[error("Renaming variable with duplicate name {duplicate_name} to {new_name}.")]
- DuplicateVariableName {
- duplicate_name: Identifier,
- new_name: Identifier,
- },
-
- #[error("Dictionary index {dict_index} is outside valid range [1,{max_index}].")]
- InvalidDictIndex { dict_index: usize, max_index: usize },
-
- #[error("Dictionary index {0} refers to a long string continuation.")]
- DictIndexIsContinuation(usize),
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels referred to long string continuation records: {indexes:?}")]
- LongStringContinuationIndexes { offset: u64, indexes: Vec<u32> },
-
- #[error(
- "At offsets {:#x}...{:#x}, record types 3 and 4 may not add value labels to one or more long string variables: {variables:?}", .offsets.start, .offsets.end
- )]
- InvalidLongStringValueLabels {
- offsets: Range<u64>,
- variables: Vec<Identifier>,
- },
-
- #[error("Variables associated with value label are not all of identical type. Variable {numeric_var} is numeric, but variable {string_var} is string.")]
- ValueLabelsDifferentTypes {
- numeric_var: Identifier,
- string_var: Identifier,
- },
-
- #[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
-
- #[error("Multiple response set {mr_set} includes unknown variable {short_name}.")]
- UnknownMrSetVariable {
- mr_set: Identifier,
- short_name: Identifier,
- },
-
- #[error("Multiple response set {0} has no variables.")]
- EmptyMrSet(Identifier),
-
- #[error("Multiple response set {0} has only one variable.")]
- OneVarMrSet(Identifier),
-
- #[error("Multiple response set {0} contains both string and numeric variables.")]
- MixedMrSet(Identifier),
-
- #[error(
- "Invalid numeric format for counted value {number} in multiple response set {mr_set}."
- )]
- InvalidMDGroupCountedValue { mr_set: Identifier, number: String },
-
- #[error("Counted value {value} has width {width}, but it must be no wider than {max_width}, the width of the narrowest variable in multiple response set {mr_set}.")]
- TooWideMDGroupCountedValue {
- mr_set: Identifier,
- value: String,
- width: usize,
- max_width: u16,
- },
-
- #[error("Long string value label for variable {name} has width {width}, which is not in the valid range [{min_width},{max_width}].")]
- InvalidLongValueLabelWidth {
- name: Identifier,
- width: u32,
- min_width: u16,
- max_width: u16,
- },
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
- MalformedString { encoding: String, text: String },
-
- #[error("Details TBD")]
- TBD,
-}
-
-type DictIndex = usize;
-
-#[derive(Clone, Debug)]
-pub struct Headers {
- pub header: HeaderRecord<String>,
- pub variable: Vec<VariableRecord<String, String>>,
- pub value_label: Vec<ValueLabelRecord<RawStr<8>, String>>,
- pub document: Vec<DocumentRecord<String>>,
- pub integer_info: Option<IntegerInfoRecord>,
- pub float_info: Option<FloatInfoRecord>,
- pub var_display: Option<VarDisplayRecord>,
- pub multiple_response: Vec<MultipleResponseRecord<Identifier, String>>,
- pub long_string_value_labels: Vec<LongStringValueLabelRecord<Identifier, String>>,
- pub long_string_missing_values: Vec<LongStringMissingValueRecord<Identifier, String>>,
- pub encoding: Option<EncodingRecord>,
- pub number_of_cases: Option<NumberOfCasesRecord>,
- pub variable_sets: Vec<VariableSetRecord>,
- pub product_info: Option<ProductInfoRecord>,
- pub long_names: Vec<LongNamesRecord>,
- pub very_long_strings: Vec<VeryLongStringsRecord>,
- pub file_attributes: Vec<FileAttributeRecord>,
- pub variable_attributes: Vec<VariableAttributeRecord>,
- pub other_extension: Vec<Extension>,
- pub end_of_headers: Option<u32>,
- pub z_header: Option<ZHeader>,
- pub z_trailer: Option<ZTrailer>,
- pub cases: Option<Rc<RefCell<Cases>>>,
-}
-
-fn take_first<T, F>(mut vec: Vec<T>, more_than_one: F) -> Option<T>
-where
- F: FnOnce(),
-{
- if vec.len() > 1 {
- more_than_one();
- }
- vec.drain(..).next()
-}
-
-impl Headers {
- pub fn new(headers: Vec<raw::DecodedRecord>, warn: &impl Fn(Error)) -> Result<Headers, Error> {
- let mut file_header = Vec::new();
- let mut variable = Vec::new();
- let mut value_label = Vec::new();
- let mut document = Vec::new();
- let mut integer_info = Vec::new();
- let mut float_info = Vec::new();
- let mut var_display = Vec::new();
- let mut multiple_response = Vec::new();
- let mut long_string_value_labels = Vec::new();
- let mut long_string_missing_values = Vec::new();
- let mut encoding = Vec::new();
- let mut number_of_cases = Vec::new();
- let mut variable_sets = Vec::new();
- let mut product_info = Vec::new();
- let mut long_names = Vec::new();
- let mut very_long_strings = Vec::new();
- let mut file_attributes = Vec::new();
- let mut variable_attributes = Vec::new();
- let mut other_extension = Vec::new();
- let mut end_of_headers = Vec::new();
- let mut z_header = Vec::new();
- let mut z_trailer = Vec::new();
- let mut cases = Vec::new();
-
- for header in headers {
- match header {
- DecodedRecord::Header(record) => {
- file_header.push(record);
- }
- DecodedRecord::Variable(record) => {
- variable.push(record);
- }
- DecodedRecord::ValueLabel(record) => {
- value_label.push(record);
- }
- DecodedRecord::Document(record) => {
- document.push(record);
- }
- DecodedRecord::IntegerInfo(record) => {
- integer_info.push(record);
- }
- DecodedRecord::FloatInfo(record) => {
- float_info.push(record);
- }
- DecodedRecord::VariableSets(record) => {
- variable_sets.push(record);
- }
- DecodedRecord::VarDisplay(record) => {
- var_display.push(record);
- }
- DecodedRecord::MultipleResponse(record) => {
- multiple_response.push(record);
- }
- DecodedRecord::LongStringValueLabels(record) => {
- long_string_value_labels.push(record)
- }
- DecodedRecord::LongStringMissingValues(record) => {
- long_string_missing_values.push(record);
- }
- DecodedRecord::Encoding(record) => {
- encoding.push(record);
- }
- DecodedRecord::NumberOfCases(record) => {
- number_of_cases.push(record);
- }
- DecodedRecord::ProductInfo(record) => {
- product_info.push(record);
- }
- DecodedRecord::LongNames(record) => {
- long_names.push(record);
- }
- DecodedRecord::VeryLongStrings(record) => {
- very_long_strings.push(record);
- }
- DecodedRecord::FileAttributes(record) => {
- file_attributes.push(record);
- }
- DecodedRecord::VariableAttributes(record) => {
- variable_attributes.push(record);
- }
- DecodedRecord::OtherExtension(record) => {
- other_extension.push(record);
- }
- DecodedRecord::EndOfHeaders(record) => {
- end_of_headers.push(record);
- }
- DecodedRecord::ZHeader(record) => {
- z_header.push(record);
- }
- DecodedRecord::ZTrailer(record) => {
- z_trailer.push(record);
- }
- DecodedRecord::Cases(record) => {
- cases.push(record);
- }
- }
- }
-
- let Some(file_header) = take_first(file_header, || warn(Error::DuplicateHeaderRecord))
- else {
- return Err(Error::MissingHeaderRecord);
- };
-
- Ok(Headers {
- header: file_header,
- variable,
- value_label,
- document,
- integer_info: take_first(integer_info, || warn(Error::TBD)),
- float_info: take_first(float_info, || warn(Error::TBD)),
- var_display: take_first(var_display, || warn(Error::TBD)),
- multiple_response,
- long_string_value_labels,
- long_string_missing_values,
- encoding: take_first(encoding, || warn(Error::TBD)),
- number_of_cases: take_first(number_of_cases, || warn(Error::TBD)),
- variable_sets,
- product_info: take_first(product_info, || warn(Error::TBD)),
- long_names,
- very_long_strings,
- file_attributes,
- variable_attributes,
- other_extension,
- end_of_headers: take_first(end_of_headers, || warn(Error::TBD)),
- z_header: take_first(z_header, || warn(Error::TBD)),
- z_trailer: take_first(z_trailer, || warn(Error::TBD)),
- cases: take_first(cases, || warn(Error::TBD)),
- })
- }
-}
-
-pub struct Metadata {
- creation: NaiveDateTime,
- endian: Endian,
- compression: Option<Compression>,
- n_cases: Option<u64>,
- product: String,
- product_ext: Option<String>,
- version: Option<(i32, i32, i32)>,
-}
-
-impl Metadata {
- fn decode(headers: &Headers, warn: impl Fn(Error)) -> Self {
- let header = &headers.header;
- let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: header.creation_date.to_string(),
- });
- Default::default()
- });
- let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: header.creation_time.to_string(),
- });
- Default::default()
- });
- let creation = NaiveDateTime::new(creation_date, creation_time);
-
- let product = header
- .eye_catcher
- .trim_start_matches("@(#) SPSS DATA FILE")
- .trim_end()
- .to_string();
-
- Self {
- creation,
- endian: header.endian,
- compression: header.compression,
- n_cases: header.n_cases.map(|n| n as u64),
- product,
- product_ext: headers.product_info.as_ref().map(|pe| fix_line_ends(&pe.0)),
- version: headers.integer_info.as_ref().map(|ii| ii.version),
- }
- }
-}
-
-struct Decoder {
- //pub raw: raw::Decoder,
- pub encoding: &'static Encoding,
- //pub variables: HashMap<DictIndex, Variable>,
- //pub var_names: HashMap<Identifier, DictIndex>,
- //pub dictionary: Dictionary,
- //n_dict_indexes: usize,
- n_generated_names: usize,
-}
-
-impl Decoder {
- fn generate_name(&mut self, dictionary: &Dictionary) -> Identifier {
- loop {
- self.n_generated_names += 1;
- let name = Identifier::from_encoding(&format!("VAR{:03}", self.n_generated_names), self.encoding)
- .unwrap();
- if !dictionary.variables.contains(&name.0) {
- return name;
- }
- assert!(self.n_generated_names < usize::MAX);
- }
- }
-}
-
-pub fn decode(
- mut headers: Headers,
- encoding: &'static Encoding,
- warn: impl Fn(Error),
-) -> Result<(Dictionary, Metadata), Error> {
- let mut dictionary = Dictionary::new(encoding);
-
- let file_label = fix_line_ends(headers.header.file_label.trim_end_matches(' '));
- if !file_label.is_empty() {
- dictionary.file_label = Some(file_label);
- }
-
- for attributes in headers.file_attributes.drain(..) {
- dictionary.attributes.extend(attributes.0 .0.into_iter())
- }
-
- // Concatenate all the document records (really there should only be one)
- // and trim off the trailing spaces that pad them to 80 bytes.
- dictionary.documents = headers
- .document
- .drain(..)
- .flat_map(|record| record.lines)
- .map(trim_end_spaces)
- .collect();
-
- // XXX warn for weird integer format
- // XXX warn for weird floating-point format, etc.
-
- let mut decoder = Decoder {
- encoding,
- n_generated_names: 0,
- };
-
- let mut header_vars = headers.variable.iter().enumerate();
- let mut var_index_map = HashMap::new();
- while let Some((value_index, input)) = header_vars.next() {
- let name = trim_end_spaces(input.name.to_string());
- let name = match Identifier::from_encoding(&name, encoding) {
- Ok(name) => {
- if !dictionary.variables.contains(&name.0) {
- name
- } else {
- let new_name = decoder.generate_name(&dictionary);
- warn(Error::DuplicateVariableName {
- duplicate_name: name.clone(),
- new_name: new_name.clone(),
- });
- new_name
- }
- }
- Err(id_error) => {
- let new_name = decoder.generate_name(&dictionary);
- warn(Error::InvalidVariableName {
- id_error,
- new_name: new_name.clone(),
- });
- new_name
- }
- };
- let mut variable = Variable::new(name.clone(), VarWidth::from_raw(input.width).unwrap());
-
- // Set the short name the same as the long name (even if we renamed it).
- variable.short_names = vec![name];
-
- variable.label = input.label.clone();
-
- variable.missing_values = input.missing_values.clone();
-
- variable.print_format = decode_format(
- input.print_format,
- variable.width,
- |new_spec, format_error| {
- warn(Error::InvalidPrintFormat {
- new_spec,
- variable: variable.name.clone(),
- format_error,
- })
- },
- );
- variable.write_format = decode_format(
- input.write_format,
- variable.width,
- |new_spec, format_error| {
- warn(Error::InvalidWriteFormat {
- new_spec,
- variable: variable.name.clone(),
- format_error,
- })
- },
- );
-
- // Skip long string continuation records.
- if input.width > 0 {
- #[allow(unstable_name_collisions)]
- for _ in 1..input.width.div_ceil(&8) {
- if let Some((_, continuation)) = header_vars.next() {
- if continuation.width == -1 {
- continue;
- }
- }
- return Err(Error::TBD);
- }
- }
-
- let dict_index = dictionary.add_var(variable).unwrap();
- assert_eq!(var_index_map.insert(value_index, dict_index), None);
- }
-
- for record in headers.value_label.drain(..) {
- let mut dict_indexes = Vec::with_capacity(record.dict_indexes.len());
- let mut continuation_indexes = Vec::new();
- let mut long_string_variables = Vec::new();
- for value_index in record.dict_indexes.iter() {
- if let Some(dict_index) = var_index_map.get(&(*value_index as usize - 1)) {
- let variable = &dictionary.variables[*dict_index];
- if variable.width.is_long_string() {
- long_string_variables.push(variable.name.clone());
- } else {
- dict_indexes.push(*dict_index);
- }
- } else {
- continuation_indexes.push(*value_index);
- }
- }
- if !continuation_indexes.is_empty() {
- warn(Error::LongStringContinuationIndexes {
- offset: record.offsets.start,
- indexes: continuation_indexes,
- });
- }
- if !long_string_variables.is_empty() {
- warn(Error::InvalidLongStringValueLabels {
- offsets: record.offsets.clone(),
- variables: long_string_variables,
- });
- }
-
- for dict_index in dict_indexes {
- let mut variable = &dictionary.variables[dict_index];
- for ValueLabel { value, label } in record.labels.iter().cloned() {
-
- }
- }
- }
-
- let metadata = Metadata::decode(&headers, warn);
- Ok((dictionary, metadata))
-}
-
-fn trim_end_spaces(mut s: String) -> String {
- s.truncate(s.trim_end_matches(' ').len());
- s
-}
-
-/// Returns a copy of `s` in which all lone CR and CR LF pairs have been
-/// replaced by LF.
-///
-/// (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
-/// files that use CR-only line ends in the file label and extra product info.)
-fn fix_line_ends(s: &str) -> String {
- let mut out = String::with_capacity(s.len());
- let mut s = s.chars().peekable();
- while let Some(c) = s.next() {
- match c {
- '\r' => {
- s.next_if_eq(&'\n');
- out.push('\n')
- }
- c => out.push(c),
- }
- }
- out
-}
-
-fn decode_format(raw: raw::Spec, width: VarWidth, warn: impl Fn(Format, FormatError)) -> Format {
- UncheckedFormat::try_from(raw)
- .and_then(Format::try_from)
- .and_then(|x| x.check_width_compatibility(width))
- .unwrap_or_else(|error| {
- let new_format = Format::default_for_width(width);
- warn(new_format, error);
- new_format
- })
-}
-
-/*
-impl Decoder {
- fn generate_name(&mut self) -> Identifier {
- loop {
- self.n_generated_names += 1;
- let name = Identifier::new(&format!("VAR{:03}", self.n_generated_names), self.encoding)
- .unwrap();
- if !self.var_names.contains_key(&name) {
- return name;
- }
- assert!(self.n_generated_names < usize::MAX);
- }
- }
- fn decode_string_cow<'a>(&self, input: &'a [u8], warn: &impl Fn(Error)) -> Cow<'a, str> {
- let (output, malformed) = self.encoding.decode_without_bom_handling(input);
- if malformed {
- warn(Error::MalformedString {
- encoding: self.encoding.name().into(),
- text: output.clone().into(),
- });
- }
- output
- }
- fn decode_string(&self, input: &[u8], warn: &impl Fn(Error)) -> String {
- self.decode_string_cow(input, warn).into()
- }
- pub fn decode_identifier(
- &self,
- input: &[u8],
- warn: &impl Fn(Error),
- ) -> Result<Identifier, IdError> {
- let s = self.decode_string_cow(input, warn);
- Identifier::new(&s, self.encoding)
- }
- fn get_var_by_index(&self, dict_index: usize) -> Result<&Variable, Error> {
- let max_index = self.n_dict_indexes;
- if dict_index == 0 || dict_index > max_index {
- return Err(Error::InvalidDictIndex {
- dict_index,
- max_index,
- });
- }
- let Some(variable) = self.variables.get(&(dict_index - 1)) else {
- return Err(Error::DictIndexIsContinuation(dict_index));
- };
- Ok(variable)
- }
-
- /// Returns `input` decoded from `self.encoding` into UTF-8 such that
- /// re-encoding the result back into `self.encoding` will have exactly the
- /// same length in bytes.
- ///
- /// XXX warn about errors?
- fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- if let (s, false) = self.encoding.decode_without_bom_handling(input) {
- // This is the common case. Usually there will be no errors.
- s
- } else {
- // Unusual case. Don't bother to optimize it much.
- let mut decoder = self.encoding.new_decoder_without_bom_handling();
- let mut output = String::with_capacity(
- decoder
- .max_utf8_buffer_length_without_replacement(input.len())
- .unwrap(),
- );
- let mut rest = input;
- while !rest.is_empty() {
- match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
- (DecoderResult::InputEmpty, _) => break,
- (DecoderResult::OutputFull, _) => unreachable!(),
- (DecoderResult::Malformed(a, b), consumed) => {
- let skipped = a as usize + b as usize;
- output.extend(repeat('?').take(skipped));
- rest = &rest[consumed..];
- }
- }
- }
- assert_eq!(self.encoding.encode(&output).0.len(), input.len());
- output.into()
- }
- }
-}
-
-pub trait TryDecode: Sized {
- type Input<'a>;
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error>;
-}
-
-pub trait Decode<Input>: Sized {
- fn decode(decoder: &Decoder, input: &Input, warn: impl Fn(Error)) -> Self;
-}
-
-impl<const N: usize> Decode<RawStr<N>> for String {
- fn decode(decoder: &Decoder, input: &RawStr<N>, warn: impl Fn(Error)) -> Self {
- decoder.decode_string(&input.0, &warn)
- }
-}
-*/
-/*
-#[derive(Clone, Debug)]
-pub struct HeaderRecord {
- pub eye_catcher: String,
- pub weight_index: Option<usize>,
- pub n_cases: Option<u64>,
- pub creation: NaiveDateTime,
- pub file_label: String,
-}
-
-fn trim_end_spaces(mut s: String) -> String {
- s.truncate(s.trim_end_matches(' ').len());
- s
-}
-
-/// Data file info that doesn't fit in [Dictionary].
-pub struct Metadata {
- creation: NaiveDateTime,
- endian: Endian,
- compression: Option<Compression>,
- n_cases: Option<u64>,
- product: String,
- product_ext: Option<String>,
- version: Option<(i32, i32, i32)>,
-}
-
-impl Metadata {
- fn decode(
- header: &crate::raw::HeaderRecord<Cow<str>>,
- integer_info: Option<&IntegerInfoRecord>,
- product_ext: Option<&ProductInfoRecord>,
- warn: impl Fn(Error),
- ) -> Self {
- let creation_date = NaiveDate::parse_from_str(&header.creation_date, "%e %b %Y")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: header.creation_date.to_string(),
- });
- Default::default()
- });
- let creation_time = NaiveTime::parse_from_str(&header.creation_time, "%H:%M:%S")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: header.creation_time.to_string(),
- });
- Default::default()
- });
- let creation = NaiveDateTime::new(creation_date, creation_time);
-
- let product = header
- .eye_catcher
- .trim_start_matches("@(#) SPSS DATA FILE")
- .trim_end()
- .to_string();
-
- Self {
- creation,
- endian: header.endian,
- compression: header.compression,
- n_cases: header.n_cases.map(|n| n as u64),
- product,
- product_ext: product_ext.map(|pe| pe.0.clone()),
- version: integer_info.map(|ii| ii.version),
- }
- }
-}
-
-impl TryDecode for HeaderRecord {
- type Input<'a> = crate::raw::HeaderRecord<Cow<'a, str>>;
-
- fn try_decode(
- _decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- let eye_catcher = trim_end_spaces(input.eye_catcher.to_string());
- let file_label = trim_end_spaces(input.file_label.to_string());
- let creation_date = NaiveDate::parse_from_str(&input.creation_date, "%e %b %Y")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationDate {
- creation_date: input.creation_date.to_string(),
- });
- Default::default()
- });
- let creation_time = NaiveTime::parse_from_str(&input.creation_time, "%H:%M:%S")
- .unwrap_or_else(|_| {
- warn(Error::InvalidCreationTime {
- creation_time: input.creation_time.to_string(),
- });
- Default::default()
- });
- Ok(Some(HeaderRecord {
- eye_catcher,
- weight_index: input.weight_index.map(|n| n as usize),
- n_cases: input.n_cases.map(|n| n as u64),
- creation: NaiveDateTime::new(creation_date, creation_time),
- file_label,
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableRecord {
- pub width: VarWidth,
- pub name: Identifier,
- pub print_format: Spec,
- pub write_format: Spec,
- pub missing_values: MissingValues<String>,
- pub label: Option<String>,
-}
-
-
-fn parse_variable_record(
- decoder: &mut Decoder,
- input: &raw::VariableRecord<Cow<str>, String>,
- warn: impl Fn(Error),
-) -> Result<(), Error> {
- let width = match input.width {
- 0 => VarWidth::Numeric,
- w @ 1..=255 => VarWidth::String(w as u16),
- -1 => return Ok(()),
- _ => {
- return Err(Error::InvalidVariableWidth {
- offsets: input.offsets.clone(),
- width: input.width,
- })
- }
- };
- let name = trim_end_spaces(input.name.to_string());
- let name = match Identifier::new(&name, decoder.encoding) {
- Ok(name) => {
- if !decoder.var_names.contains_key(&name) {
- name
- } else {
- let new_name = decoder.generate_name();
- warn(Error::DuplicateVariableName {
- duplicate_name: name.clone(),
- new_name: new_name.clone(),
- });
- new_name
- }
- }
- Err(id_error) => {
- let new_name = decoder.generate_name();
- warn(Error::InvalidVariableName {
- id_error,
- new_name: new_name.clone(),
- });
- new_name
- }
- };
- let variable = Variable {
- dict_index: decoder.n_dict_indexes,
- short_name: name.clone(),
- long_name: None,
- width,
- };
- decoder.n_dict_indexes += width.n_dict_indexes();
- assert!(decoder
- .var_names
- .insert(name.clone(), variable.dict_index)
- .is_none());
- assert!(decoder
- .variables
- .insert(variable.dict_index, variable)
- .is_none());
-
- let print_format = decode_format(input.print_format, width, |new_spec, format_error| {
- warn(Error::InvalidPrintFormat {
- new_spec,
- variable: name.clone(),
- format_error,
- })
- });
- let write_format = decode_format(input.write_format, width, |new_spec, format_error| {
- warn(Error::InvalidWriteFormat {
- new_spec,
- variable: name.clone(),
- format_error,
- })
- });
- let mut variable = dictionary::Variable::new(name, width);
- variable.print_format = print_format;
- variable.write_format = write_format;
- variable.missing_values = input.missing_values.clone();
- if let Some(ref label) = input.label {
- variable.label = Some(label.to_string());
- }
- decoder.dictionary.add_var(variable).unwrap();
- Ok(())
-}
-
-#[derive(Clone, Debug)]
-pub struct DocumentRecord(Vec<String>);
-
-impl TryDecode for DocumentRecord {
- type Input<'a> = crate::raw::DocumentRecord<RawDocumentLine>;
-
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- Ok(Some(DocumentRecord(
- input
- .lines
- .iter()
- .map(|s| trim_end_spaces(decoder.decode_string(&s.0, &warn)))
- .collect(),
- )))
- }
-}
-
-trait TextRecord
-where
- Self: Sized,
-{
- const NAME: &'static str;
- fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error>;
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: String,
- pub vars: Vec<String>,
-}
-
-impl VariableSet {
- fn parse(input: &str) -> Result<Self, Error> {
- let (name, input) = input.split_once('=').ok_or(Error::TBD)?;
- let vars = input.split_ascii_whitespace().map(String::from).collect();
- Ok(VariableSet {
- name: name.into(),
- vars,
- })
- }
-}
-
-trait WarnOnError<T> {
- fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T>;
-}
-impl<T> WarnOnError<T> for Result<T, Error> {
- fn warn_on_error<F: Fn(Error)>(self, warn: &F) -> Option<T> {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabel {
- pub value: Value,
- pub label: String,
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabelRecord {
- pub var_type: VarType,
- pub labels: Vec<ValueLabel>,
- pub variables: Vec<Identifier>,
-}
-
-impl TryDecode for ValueLabelRecord {
- type Input<'a> = crate::raw::ValueLabelRecord<RawStr<8>, RawString>;
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<ValueLabelRecord>, Error> {
- let variables: Vec<&Variable> = input
- .dict_indexes
- .iter()
- .filter_map(|&dict_index| {
- decoder
- .get_var_by_index(dict_index as usize)
- .warn_on_error(&warn)
- })
- .filter(|&variable| match variable.width {
- VarWidth::String(width) if width > 8 => {
- warn(Error::InvalidLongStringValueLabel(
- variable.short_name.clone(),
- ));
- false
- }
- _ => true,
- })
- .collect();
- let mut i = variables.iter();
- let Some(&first_var) = i.next() else {
- return Ok(None);
- };
- let var_type: VarType = first_var.width.into();
- for &variable in i {
- let this_type: VarType = variable.width.into();
- if var_type != this_type {
- let (numeric_var, string_var) = match var_type {
- VarType::Numeric => (first_var, variable),
- VarType::String => (variable, first_var),
- };
- warn(Error::ValueLabelsDifferentTypes {
- numeric_var: numeric_var.short_name.clone(),
- string_var: string_var.short_name.clone(),
- });
- return Ok(None);
- }
- }
- let labels = input
- .labels
- .iter()
- .map(|raw::ValueLabel { value, label }| {
- let label = decoder.decode_string(&label.0, &warn);
- let value = Value::decode(value, decoder);
- ValueLabel { value, label }
- })
- .collect();
- let variables = variables
- .iter()
- .map(|&variable| variable.short_name.clone())
- .collect();
- Ok(Some(ValueLabelRecord {
- var_type,
- labels,
- variables,
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord(Vec<VariableSet>);
-
-impl TextRecord for VariableSetRecord {
- const NAME: &'static str = "variable set";
- fn parse(input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut sets = Vec::new();
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line).warn_on_error(&warn) {
- sets.push(set)
- }
- }
- Ok(VariableSetRecord(sets))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn new(decoder: &mut Decoder, short_name: &str, long_name: &str) -> Result<LongName, Error> {
- let short_name =
- Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidShortName)?;
- let long_name =
- Identifier::new(long_name, decoder.encoding).map_err(Error::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNameRecord(Vec<LongName>);
-
-impl LongNameRecord {
- pub fn parse(decoder: &mut Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some((short_name, long_name)) = pair.split_once('=') {
- if let Some(long_name) =
- LongName::new(decoder, short_name, long_name).warn_on_error(&warn)
- {
- names.push(long_name);
- }
- } else {
- warn(Error::TBD)
- }
- }
- Ok(LongNameRecord(names))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Error> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Error::TBD);
- };
- let short_name =
- Identifier::new(short_name, decoder.encoding).map_err(Error::InvalidLongStringName)?;
- let length: u16 = length.parse().map_err(|_| Error::TBD)?;
- if length > VarWidth::MAX_STRING {
- return Err(Error::TBD);
- }
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringRecord(Vec<VeryLongString>);
-
-impl VeryLongStringRecord {
- pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_end_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) = VeryLongString::parse(decoder, tuple).warn_on_error(&warn) {
- very_long_strings.push(vls)
- }
- }
- Ok(VeryLongStringRecord(very_long_strings))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(
- decoder: &Decoder,
- input: &'a str,
- warn: &impl Fn(Error),
- ) -> Result<(Option<Attribute>, &'a str), Error> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Error::TBD);
- };
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Error::TBD);
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- warn(Error::TBD);
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Identifier::new(name, decoder.encoding)
- .map_err(Error::InvalidAttributeName)
- .warn_on_error(warn)
- .map(|name| Attribute { name, values });
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct AttributeSet(pub Vec<Attribute>);
-
-impl AttributeSet {
- fn parse<'a>(
- decoder: &Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- warn: &impl Fn(Error),
- ) -> Result<(AttributeSet, &'a str), Error> {
- let mut attributes = Vec::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input, &warn)?;
- if let Some(attribute) = attribute {
- attributes.push(attribute);
- }
- input = rest;
- }
- }
- };
- Ok((AttributeSet(attributes), rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FileAttributeRecord(AttributeSet);
-
-impl FileAttributeRecord {
- pub fn parse(decoder: &Decoder, input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let (set, rest) = AttributeSet::parse(decoder, input, None, &warn)?;
- if !rest.is_empty() {
- warn(Error::TBD);
- }
- Ok(FileAttributeRecord(set))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributeSet {
- pub long_var_name: Identifier,
- pub attributes: AttributeSet,
-}
-
-impl VarAttributeSet {
- fn parse<'a>(
- decoder: &Decoder,
- input: &'a str,
- warn: &impl Fn(Error),
- ) -> Result<(Option<VarAttributeSet>, &'a str), Error> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(Error::TBD);
- };
- let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'), warn)?;
- let var_attribute = Identifier::new(long_var_name, decoder.encoding)
- .map_err(Error::InvalidAttributeVariableName)
- .warn_on_error(warn)
- .map(|name| VarAttributeSet {
- long_var_name: name,
- attributes,
- });
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
-
-impl VariableAttributeRecord {
- pub fn parse(decoder: &Decoder, mut input: &str, warn: impl Fn(Error)) -> Result<Self, Error> {
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributeSet::parse(decoder, input, &warn).warn_on_error(&warn)
- else {
- break;
- };
- if let Some(var_attribute) = var_attribute {
- var_attribute_sets.push(var_attribute);
- }
- input = rest;
- }
- Ok(VariableAttributeRecord(var_attribute_sets))
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: Value,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn decode(
- decoder: &Decoder,
- mr_set: &Identifier,
- input: &raw::MultipleResponseType,
- min_width: VarWidth,
- warn: &impl Fn(Error),
- ) -> Result<Self, Error> {
- let mr_type = match input {
- raw::MultipleResponseType::MultipleDichotomy { value, labels } => {
- let value = decoder.decode_string_cow(&value.0, warn);
- let value = match min_width {
- VarWidth::Numeric => {
- let number: f64 = value.trim().parse().map_err(|_| {
- Error::InvalidMDGroupCountedValue {
- mr_set: mr_set.clone(),
- number: value.into(),
- }
- })?;
- Value::Number(Some(number.into()))
- }
- VarWidth::String(max_width) => {
- let value = value.trim_end_matches(' ');
- let width = value.len();
- if width > max_width as usize {
- return Err(Error::TooWideMDGroupCountedValue {
- mr_set: mr_set.clone(),
- value: value.into(),
- width,
- max_width,
- });
- };
- Value::String(value.into())
- }
- };
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: *labels,
- }
- }
- raw::MultipleResponseType::MultipleCategory => MultipleResponseType::MultipleCategory,
- };
- Ok(mr_type)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: Identifier,
- pub min_width: VarWidth,
- pub max_width: VarWidth,
- pub label: String,
- pub mr_type: MultipleResponseType,
- pub dict_indexes: Vec<DictIndex>,
-}
-
-impl MultipleResponseSet {
- fn decode(
- decoder: &Decoder,
- input: &raw::MultipleResponseSet<Identifier, Cow<str>>,
- warn: &impl Fn(Error),
- ) -> Result<Self, Error> {
- let mr_set_name = input.name.clone();
- let mut dict_indexes = Vec::with_capacity(input.short_names.len());
- for short_name in input.short_names.iter() {
- let Some(&dict_index) = decoder.var_names.get(&short_name) else {
- warn(Error::UnknownMrSetVariable {
- mr_set: mr_set_name.clone(),
- short_name: short_name.clone(),
- });
- continue;
- };
- dict_indexes.push(dict_index);
- }
-
- match dict_indexes.len() {
- 0 => return Err(Error::EmptyMrSet(mr_set_name)),
- 1 => return Err(Error::OneVarMrSet(mr_set_name)),
- _ => (),
- }
-
- let Some((Some(min_width), Some(max_width))) = dict_indexes
- .iter()
- .map(|dict_index| decoder.variables[dict_index].width)
- .map(|w| (Some(w), Some(w)))
- .reduce(|(na, wa), (nb, wb)| (VarWidth::narrower(na, nb), VarWidth::wider(wa, wb)))
- else {
- return Err(Error::MixedMrSet(mr_set_name));
- };
-
- let mr_type =
- MultipleResponseType::decode(decoder, &mr_set_name, &input.mr_type, min_width, warn)?;
-
- Ok(MultipleResponseSet {
- name: mr_set_name,
- min_width,
- max_width,
- label: input.label.to_string(),
- mr_type,
- dict_indexes,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord(pub Vec<MultipleResponseSet>);
-
-impl TryDecode for MultipleResponseRecord {
- type Input<'a> = raw::MultipleResponseRecord<Identifier, Cow<'a, str>>;
-
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- let mut sets = Vec::with_capacity(input.0.len());
- for set in &input.0 {
- match MultipleResponseSet::decode(decoder, set, &warn) {
- Ok(set) => sets.push(set),
- Err(error) => warn(error),
- }
- }
- Ok(Some(MultipleResponseRecord(sets)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels {
- pub var_name: Identifier,
- pub width: VarWidth,
- pub labels: Vec<ValueLabel>,
-}
-
-impl LongStringValueLabels {
- fn decode(
- decoder: &Decoder,
- input: &raw::LongStringValueLabels<RawString>,
- warn: &impl Fn(Error),
- ) -> Result<Self, Error> {
- let var_name = decoder.decode_string(&input.var_name.0, warn);
- let var_name = Identifier::new(var_name.trim_end(), decoder.encoding)
- .map_err(Error::InvalidLongStringValueLabelName)?;
-
- let min_width = 9;
- let max_width = VarWidth::MAX_STRING;
- if input.width < 9 || input.width > max_width as u32 {
- return Err(Error::InvalidLongValueLabelWidth {
- name: var_name,
- width: input.width,
- min_width,
- max_width,
- });
- }
- let width = input.width as u16;
-
- let mut labels = Vec::with_capacity(input.labels.len());
- for (value, label) in input.labels.iter() {
- let value = Value::String(decoder.decode_exact_length(&value.0).into());
- let label = decoder.decode_string(&label.0, warn);
- labels.push(ValueLabel { value, label });
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: VarWidth::String(width),
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord(pub Vec<LongStringValueLabels>);
-
-impl TryDecode for LongStringValueLabelRecord {
- type Input<'a> = raw::LongStringValueLabelRecord<RawString>;
-
- fn try_decode(
- decoder: &mut Decoder,
- input: &Self::Input<'_>,
- warn: impl Fn(Error),
- ) -> Result<Option<Self>, Error> {
- let mut labels = Vec::with_capacity(input.0.len());
- for label in &input.0 {
- match LongStringValueLabels::decode(decoder, label, &warn) {
- Ok(set) => labels.push(set),
- Err(error) => warn(error),
- }
- }
- Ok(Some(LongStringValueLabelRecord(labels)))
- }
-}
-
-#[cfg(test)]
-mod test {
- use encoding_rs::WINDOWS_1252;
-
- #[test]
- fn test() {
- let mut s = String::new();
- s.push(char::REPLACEMENT_CHARACTER);
- let encoded = WINDOWS_1252.encode(&s).0;
- let decoded = WINDOWS_1252.decode(&encoded[..]).0;
- println!("{:?}", decoded);
- }
-
- #[test]
- fn test2() {
- let charset: Vec<u8> = (0..=255).collect();
- println!("{}", charset.len());
- let decoded = WINDOWS_1252.decode(&charset[..]).0;
- println!("{}", decoded.len());
- let encoded = WINDOWS_1252.encode(&decoded[..]).0;
- println!("{}", encoded.len());
- assert_eq!(&charset[..], &encoded[..]);
- }
-}
-*/
+++ /dev/null
-use std::{
- cmp::Ordering,
- collections::{HashMap, HashSet},
- fmt::Debug,
- ops::{Bound, RangeBounds},
-};
-
-use encoding_rs::Encoding;
-use indexmap::IndexSet;
-use num::integer::div_ceil;
-use ordered_float::OrderedFloat;
-use unicase::UniCase;
-
-use crate::{
- format::Format,
- identifier::{ByIdentifier, HasIdentifier, Identifier},
- raw::{self, Alignment, CategoryLabels, Decoder, Measure, MissingValues, RawStr, VarType},
-};
-
-pub type DictIndex = usize;
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum VarWidth {
- Numeric,
- String(u16),
-}
-
-impl PartialOrd for VarWidth {
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- match (self, other) {
- (VarWidth::Numeric, VarWidth::Numeric) => Some(Ordering::Equal),
- (VarWidth::String(a), VarWidth::String(b)) => Some(a.cmp(b)),
- _ => None,
- }
- }
-}
-
-impl VarWidth {
- pub const MAX_STRING: u16 = 32767;
-
- pub fn n_dict_indexes(self) -> usize {
- match self {
- VarWidth::Numeric => 1,
- VarWidth::String(w) => div_ceil(w as usize, 8),
- }
- }
-
- fn width_predicate(
- a: Option<VarWidth>,
- b: Option<VarWidth>,
- f: impl Fn(u16, u16) -> u16,
- ) -> Option<VarWidth> {
- match (a, b) {
- (Some(VarWidth::Numeric), Some(VarWidth::Numeric)) => Some(VarWidth::Numeric),
- (Some(VarWidth::String(a)), Some(VarWidth::String(b))) => {
- Some(VarWidth::String(f(a, b)))
- }
- _ => None,
- }
- }
-
- /// Returns the wider of `self` and `other`:
- /// - Numerical variable widths are equally wide.
- /// - Longer strings are wider than shorter strings.
- /// - Numerical and string types are incomparable, so result in `None`.
- /// - Any `None` in the input yields `None` in the output.
- pub fn wider(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
- Self::width_predicate(a, b, |a, b| a.max(b))
- }
-
- /// Returns the narrower of `self` and `other` (see [`Self::wider`]).
- pub fn narrower(a: Option<VarWidth>, b: Option<VarWidth>) -> Option<VarWidth> {
- Self::width_predicate(a, b, |a, b| a.min(b))
- }
-
- pub fn default_display_width(&self) -> u32 {
- match self {
- VarWidth::Numeric => 8,
- VarWidth::String(width) => *width.min(&32) as u32,
- }
- }
-
- pub fn from_raw(raw: impl Into<i32>) -> Result<Self, ()> {
- let raw: i32 = raw.into();
- match raw {
- 0 => Ok(Self::Numeric),
- 1..=255 => Ok(Self::String(raw as u16)),
- _ => Err(()),
- }
- }
-
- pub fn is_long_string(&self) -> bool {
- if let Self::String(width) = self {
- *width > 8
- } else {
- false
- }
- }
-}
-
-impl From<VarWidth> for VarType {
- fn from(source: VarWidth) -> Self {
- match source {
- VarWidth::Numeric => VarType::Numeric,
- VarWidth::String(_) => VarType::String,
- }
- }
-}
-
-#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Value {
- Number(Option<OrderedFloat<f64>>),
- String(String),
-}
-
-impl Value {
- pub fn decode(raw: &raw::Value<RawStr<8>>, decoder: &Decoder) -> Self {
- match raw {
- raw::Value::Number(x) => Value::Number(x.map(|x| x.into())),
- raw::Value::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Dictionary {
- pub variables: IndexSet<ByIdentifier<Variable>>,
- pub split_file: Vec<DictIndex>,
- pub weight: Option<DictIndex>,
- pub filter: Option<DictIndex>,
- pub case_limit: Option<u64>,
- pub file_label: Option<String>,
- pub documents: Vec<String>,
- pub vectors: HashSet<ByIdentifier<Vector>>,
- pub attributes: HashMap<Identifier, Vec<String>>,
- pub mrsets: HashSet<ByIdentifier<MultipleResponseSet>>,
- pub variable_sets: HashSet<ByIdentifier<VariableSet>>,
- pub encoding: &'static Encoding,
-}
-
-#[derive(Debug)]
-pub struct DuplicateVariableName;
-
-impl Dictionary {
- pub fn new(encoding: &'static Encoding) -> Self {
- Self {
- variables: IndexSet::new(),
- split_file: Vec::new(),
- weight: None,
- filter: None,
- case_limit: None,
- file_label: None,
- documents: Vec::new(),
- vectors: HashSet::new(),
- attributes: HashMap::new(),
- mrsets: HashSet::new(),
- variable_sets: HashSet::new(),
- encoding,
- }
- }
-
- pub fn add_var(&mut self, variable: Variable) -> Result<usize, DuplicateVariableName> {
- let (index, inserted) = self.variables.insert_full(ByIdentifier::new(variable));
- if inserted {
- Ok(index)
- } else {
- Err(DuplicateVariableName)
- }
- }
-
- pub fn reorder_var(&mut self, from_index: DictIndex, to_index: DictIndex) {
- if from_index != to_index {
- self.variables.move_index(from_index, to_index);
- self.update_dict_indexes(&|index| {
- #[allow(clippy::collapsible_else_if)]
- if index == from_index {
- Some(to_index)
- } else if from_index < to_index {
- if index > from_index && index <= to_index {
- Some(index - 1)
- } else {
- Some(index)
- }
- } else {
- if index >= to_index && index < from_index {
- Some(index + 1)
- } else {
- Some(index)
- }
- }
- })
- }
- }
-
- pub fn retain_vars<F>(&mut self, keep: F)
- where
- F: Fn(&Variable) -> bool,
- {
- let mut deleted = Vec::new();
- let mut index = 0;
- self.variables.retain(|var_by_id| {
- let keep = keep(&var_by_id.0);
- if !keep {
- deleted.push(index);
- }
- index += 1;
- keep
- });
- if !deleted.is_empty() {
- self.update_dict_indexes(&|index| match deleted.binary_search(&index) {
- Ok(_) => None,
- Err(position) => Some(position),
- })
- }
- }
-
- pub fn delete_vars<R>(&mut self, range: R)
- where
- R: RangeBounds<DictIndex>,
- {
- let start = match range.start_bound() {
- Bound::Included(&start) => start,
- Bound::Excluded(&start) => start + 1,
- Bound::Unbounded => 0,
- };
- let end = match range.end_bound() {
- Bound::Included(&end) => end + 1,
- Bound::Excluded(&end) => end,
- Bound::Unbounded => self.variables.len(),
- };
- if end > start {
- self.variables.drain(start..end);
- self.update_dict_indexes(&|index| {
- if index < start {
- Some(index)
- } else if index < end {
- None
- } else {
- Some(index - end - start)
- }
- })
- }
- }
-
- fn update_dict_indexes<F>(&mut self, f: &F)
- where
- F: Fn(DictIndex) -> Option<DictIndex>,
- {
- update_dict_index_vec(&mut self.split_file, f);
- self.weight = self.weight.and_then(f);
- self.filter = self.filter.and_then(f);
- self.vectors = self
- .vectors
- .drain()
- .filter_map(|vector_by_id| {
- vector_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(ByIdentifier::new)
- })
- .collect();
- self.mrsets = self
- .mrsets
- .drain()
- .filter_map(|mrset_by_id| {
- mrset_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(ByIdentifier::new)
- })
- .collect();
- self.variable_sets = self
- .variable_sets
- .drain()
- .filter_map(|var_set_by_id| {
- var_set_by_id
- .0
- .with_updated_dict_indexes(f)
- .map(ByIdentifier::new)
- })
- .collect();
- }
-}
-
-fn update_dict_index_vec<F>(dict_indexes: &mut Vec<DictIndex>, f: F)
-where
- F: Fn(DictIndex) -> Option<DictIndex>,
-{
- dict_indexes.retain_mut(|index| {
- if let Some(new) = f(*index) {
- *index = new;
- true
- } else {
- false
- }
- });
-}
-
-#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
-pub enum Role {
- Input,
- Target,
- Both,
- None,
- Partition,
- Split,
-}
-
-impl Default for Role {
- fn default() -> Self {
- Self::Input
- }
-}
-
-pub enum DictClass {
- Ordinary,
- System,
- Scratch,
-}
-
-impl DictClass {
- pub fn from_identifier(id: &Identifier) -> Self {
- if id.0.starts_with('$') {
- Self::System
- } else if id.0.starts_with('#') {
- Self::Scratch
- } else {
- Self::Ordinary
- }
- }
-
- pub fn must_leave(self) -> bool {
- match self {
- DictClass::Ordinary => false,
- DictClass::System => false,
- DictClass::Scratch => true,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Variable {
- pub name: Identifier,
- pub width: VarWidth,
- pub missing_values: MissingValues,
- pub print_format: Format,
- pub write_format: Format,
- pub value_labels: HashMap<Value, String>,
- pub label: Option<String>,
- pub measure: Option<Measure>,
- pub role: Role,
- pub display_width: u32,
- pub alignment: Alignment,
- pub leave: bool,
- pub short_names: Vec<Identifier>,
- pub attributes: HashSet<ByIdentifier<Attribute>>,
-}
-
-impl Variable {
- pub fn new(name: Identifier, width: VarWidth) -> Self {
- let var_type = VarType::from_width(width);
- let leave = DictClass::from_identifier(&name).must_leave();
- Self {
- name,
- width,
- missing_values: MissingValues::default(),
- print_format: Format::default_for_width(width),
- write_format: Format::default_for_width(width),
- value_labels: HashMap::new(),
- label: None,
- measure: Measure::default_for_type(var_type),
- role: Role::default(),
- display_width: width.default_display_width(),
- alignment: Alignment::default_for_type(var_type),
- leave,
- short_names: Vec::new(),
- attributes: HashSet::new(),
- }
- }
-}
-
-impl HasIdentifier for Variable {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Vector {
- pub name: Identifier,
- pub variables: Vec<DictIndex>,
-}
-
-impl Vector {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (!self.variables.is_empty()).then_some(self)
- }
-}
-
-impl HasIdentifier for Vector {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl HasIdentifier for Attribute {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet {
- pub name: Identifier,
- pub label: String,
- pub mr_type: MultipleResponseType,
- pub variables: Vec<DictIndex>,
-}
-
-impl MultipleResponseSet {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (self.variables.len() > 1).then_some(self)
- }
-}
-
-impl HasIdentifier for MultipleResponseSet {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: Value,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: Identifier,
- pub variables: Vec<DictIndex>,
-}
-
-impl VariableSet {
- fn with_updated_dict_indexes(
- mut self,
- f: impl Fn(DictIndex) -> Option<DictIndex>,
- ) -> Option<Self> {
- update_dict_index_vec(&mut self.variables, f);
- (!self.variables.is_empty()).then_some(self)
- }
-}
-
-impl HasIdentifier for VariableSet {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
-}
-
-#[cfg(test)]
-mod test {
- use std::collections::HashSet;
-
- use unicase::UniCase;
-
- use crate::identifier::Identifier;
-
- use super::{ByIdentifier, HasIdentifier};
-
- #[derive(PartialEq, Eq, Debug, Clone)]
- struct Variable {
- name: Identifier,
- value: i32,
- }
-
- impl HasIdentifier for Variable {
- fn identifier(&self) -> &UniCase<String> {
- &self.name.0
- }
- }
-
- #[test]
- fn test() {
- // Variables should not be the same if their values differ.
- let abcd = Identifier::new("abcd").unwrap();
- let abcd1 = Variable {
- name: abcd.clone(),
- value: 1,
- };
- let abcd2 = Variable {
- name: abcd,
- value: 2,
- };
- assert_ne!(abcd1, abcd2);
-
- // But `ByName` should treat them the same.
- let abcd1_by_name = ByIdentifier::new(abcd1);
- let abcd2_by_name = ByIdentifier::new(abcd2);
- assert_eq!(abcd1_by_name, abcd2_by_name);
-
- // And a `HashSet` of `ByName` should also treat them the same.
- let mut vars: HashSet<ByIdentifier<Variable>> = HashSet::new();
- assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
- assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
- assert_eq!(
- vars.get(&UniCase::new(String::from("abcd")))
- .unwrap()
- .0
- .value,
- 1
- );
- }
-}
+++ /dev/null
-use crate::locale_charset::locale_charset;
-use encoding_rs::{Encoding, UTF_8};
-
-include!(concat!(env!("OUT_DIR"), "/encodings.rs"));
-
-pub fn codepage_from_encoding(encoding: &str) -> Option<u32> {
- CODEPAGE_NAME_TO_NUMBER
- .get(encoding.to_ascii_lowercase().as_str())
- .copied()
-}
-
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("This system file does not indicate its own character encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=\"DETECT\" to analyze the possible encodings.")]
- NoEncoding,
-
- #[error("This system file encodes text strings with unknown code page {0}.")]
- UnknownCodepage(i32),
-
- #[error("This system file encodes text strings with unknown encoding {0}.")]
- UnknownEncoding(String),
-
- #[error("This system file is encoded in EBCDIC, which is not supported.")]
- Ebcdic,
-}
-
-pub fn default_encoding() -> &'static Encoding {
- lazy_static! {
- static ref DEFAULT_ENCODING: &'static Encoding =
- Encoding::for_label(locale_charset().as_bytes()).unwrap_or(UTF_8);
- }
- &DEFAULT_ENCODING
-}
-
-pub fn get_encoding(
- encoding: Option<&str>,
- character_code: Option<i32>,
-) -> Result<&'static Encoding, Error> {
- let label = if let Some(encoding) = encoding {
- encoding
- } else if let Some(codepage) = character_code {
- match codepage {
- 1 => return Err(Error::Ebcdic),
- 2 | 3 => {
- // These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
- // respectively. However, many files have character code 2 but
- // data which are clearly not ASCII. Therefore, ignore these
- // values.
- return Err(Error::NoEncoding);
- }
- 4 => "MS_KANJI",
- _ => CODEPAGE_NUMBER_TO_NAME
- .get(&codepage)
- .copied()
- .ok_or(Error::UnknownCodepage(codepage))?,
- }
- } else {
- return Err(Error::NoEncoding);
- };
-
- Encoding::for_label(label.as_bytes()).ok_or(Error::UnknownEncoding(label.into()))
-}
+++ /dev/null
-/// The endianness for integer and floating-point numbers in SPSS system files.
-///
-/// SPSS system files can declare IBM 370 and DEC VAX floating-point
-/// representations, but no file that uses either of these has ever been found
-/// in the wild, so this code does not handle them.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Endian {
- /// Big-endian: MSB at lowest address.
- Big,
-
- /// Little-endian: LSB at lowest address.
- Little,
-}
-
-impl Endian {
- #[cfg(target_endian = "big")]
- pub const NATIVE: Endian = Endian::Big;
- #[cfg(target_endian = "little")]
- pub const NATIVE: Endian = Endian::Little;
-
- pub fn identify_u32(expected_value: u32, bytes: [u8; 4]) -> Option<Self> {
- let as_big: u32 = Endian::Big.parse(bytes);
- let as_little: u32 = Endian::Little.parse(bytes);
- match (as_big == expected_value, as_little == expected_value) {
- (true, false) => Some(Endian::Big),
- (false, true) => Some(Endian::Little),
- _ => None,
- }
- }
-
- pub fn identify_f64(expected_value: f64, bytes: [u8; 8]) -> Option<Self> {
- let as_big: f64 = Endian::Big.parse(bytes);
- let as_little: f64 = Endian::Little.parse(bytes);
- match (as_big == expected_value, as_little == expected_value) {
- (true, false) => Some(Endian::Big),
- (false, true) => Some(Endian::Little),
- _ => None,
- }
- }
-}
-
-pub trait ToBytes<T, const N: usize> {
- fn to_bytes(self, value: T) -> [u8; N];
-}
-impl ToBytes<i64, 8> for Endian {
- fn to_bytes(self, value: i64) -> [u8; 8] {
- match self {
- Endian::Big => i64::to_be_bytes(value),
- Endian::Little => i64::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<u32, 4> for Endian {
- fn to_bytes(self, value: u32) -> [u8; 4] {
- match self {
- Endian::Big => u32::to_be_bytes(value),
- Endian::Little => u32::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<i32, 4> for Endian {
- fn to_bytes(self, value: i32) -> [u8; 4] {
- match self {
- Endian::Big => i32::to_be_bytes(value),
- Endian::Little => i32::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<u16, 2> for Endian {
- fn to_bytes(self, value: u16) -> [u8; 2] {
- match self {
- Endian::Big => u16::to_be_bytes(value),
- Endian::Little => u16::to_le_bytes(value),
- }
- }
-}
-impl ToBytes<u8, 1> for Endian {
- fn to_bytes(self, value: u8) -> [u8; 1] {
- [value]
- }
-}
-impl ToBytes<f64, 8> for Endian {
- fn to_bytes(self, value: f64) -> [u8; 8] {
- match self {
- Endian::Big => f64::to_be_bytes(value),
- Endian::Little => f64::to_le_bytes(value),
- }
- }
-}
-
-/// Parses an `N`-byte slice in one of the supported formats into native format
-/// as type `T`.
-pub trait Parse<T, const N: usize> {
- /// Given 'bytes', returns `T`.
- fn parse(self, bytes: [u8; N]) -> T;
-}
-impl Parse<u64, 8> for Endian {
- fn parse(self, bytes: [u8; 8]) -> u64 {
- match self {
- Endian::Big => u64::from_be_bytes(bytes),
- Endian::Little => u64::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<u32, 4> for Endian {
- fn parse(self, bytes: [u8; 4]) -> u32 {
- match self {
- Endian::Big => u32::from_be_bytes(bytes),
- Endian::Little => u32::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<u16, 2> for Endian {
- fn parse(self, bytes: [u8; 2]) -> u16 {
- match self {
- Endian::Big => u16::from_be_bytes(bytes),
- Endian::Little => u16::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<u8, 1> for Endian {
- fn parse(self, bytes: [u8; 1]) -> u8 {
- match self {
- Endian::Big => u8::from_be_bytes(bytes),
- Endian::Little => u8::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i64, 8> for Endian {
- fn parse(self, bytes: [u8; 8]) -> i64 {
- match self {
- Endian::Big => i64::from_be_bytes(bytes),
- Endian::Little => i64::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i32, 4> for Endian {
- fn parse(self, bytes: [u8; 4]) -> i32 {
- match self {
- Endian::Big => i32::from_be_bytes(bytes),
- Endian::Little => i32::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i16, 2> for Endian {
- fn parse(self, bytes: [u8; 2]) -> i16 {
- match self {
- Endian::Big => i16::from_be_bytes(bytes),
- Endian::Little => i16::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<i8, 1> for Endian {
- fn parse(self, bytes: [u8; 1]) -> i8 {
- match self {
- Endian::Big => i8::from_be_bytes(bytes),
- Endian::Little => i8::from_le_bytes(bytes),
- }
- }
-}
-impl Parse<f64, 8> for Endian {
- fn parse(self, bytes: [u8; 8]) -> f64 {
- match self {
- Endian::Big => f64::from_be_bytes(bytes),
- Endian::Little => f64::from_le_bytes(bytes),
- }
- }
-}
+++ /dev/null
-use crate::{
- command::parse,
- lex::{lexer::{Lexer, Source}, token::Token},
- message::Diagnostic,
-};
-
-pub struct Engine {
- lexer: Lexer,
-}
-
-impl Engine {
- fn new() -> Self {
- Self {
- lexer: Lexer::new(Box::new(|location, error| println!("{location}: {error}"))),
- }
- }
- fn run(&mut self, source: Source) {
- self.lexer.append(source);
- self.lexer.get();
- while self.lexer.token() != &Token::End {
- let error: Box<dyn Fn(Diagnostic)> = Box::new(|diagnostic| {
- println!("{diagnostic}");
- });
- parse(&mut self.lexer, &error);
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use encoding_rs::UTF_8;
-
- use crate::lex::{
- lexer::{ErrorHandling, Source},
- segment::Mode,
- };
-
- use super::Engine;
-
- #[test]
- fn test_echo() {
- let mut engine = Engine::new();
- engine.run(Source::for_file_contents(
- "ECHO 'hi there'.\nECHO 'bye there'.\n".to_string(),
- Some("test.sps".to_string()),
- UTF_8,
- Mode::default(),
- ErrorHandling::default(),
- ));
- }
-}
+++ /dev/null
-use std::{
- fmt::{Display, Formatter, Result as FmtResult},
- ops::RangeInclusive,
-};
-
-use enum_map::{Enum, EnumMap};
-use thiserror::Error as ThisError;
-
-use crate::{
- dictionary::VarWidth,
- raw::{self, VarType},
-};
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Unknown format type {value}.")]
- UnknownFormat { value: u16 },
-
- #[error("Output format {0} specifies width {}, but {} requires an even width.", .0.w, .0.type_)]
- OddWidthNotAllowed(UncheckedFormat),
-
- #[error("Output format {0} specifies width {}, but {} requires a width between {} and {}.", .0.w, .0.type_, .0.type_.min_width(), .0.type_.max_width())]
- BadWidth(UncheckedFormat),
-
- #[error("Output format {0} specifies decimal places, but {} format does not allow any decimals.", .0.type_)]
- DecimalsNotAllowedForFormat(UncheckedFormat),
-
- #[error("Output format {0} specifies {} decimal places, but with a width of {}, {} does not allow any decimal places.", .0.d, .0.w, .0.type_)]
- DecimalsNotAllowedForWidth(UncheckedFormat),
-
- #[error("Output format {spec} specifies {} decimal places but, with a width of {}, {} allows at most {max_d} decimal places.", .spec.d, .spec.w, .spec.type_)]
- TooManyDecimalsForWidth {
- spec: UncheckedFormat,
- max_d: Decimals,
- },
-
- #[error("String variable is not compatible with numeric format {0}.")]
- UnnamedVariableNotCompatibleWithNumericFormat(Type),
-
- #[error("Numeric variable is not compatible with string format {0}.")]
- UnnamedVariableNotCompatibleWithStringFormat(Type),
-
- #[error("String variable {variable} with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
- NamedStringVariableBadSpecWidth {
- variable: String,
- width: Width,
- bad_spec: Format,
- good_spec: Format,
- },
-
- #[error("String variable with width {width} is not compatible with format {bad_spec}. Use format {good_spec} instead.")]
- UnnamedStringVariableBadSpecWidth {
- width: Width,
- bad_spec: Format,
- good_spec: Format,
- },
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub enum Category {
- // Numeric formats.
- Basic,
- Custom,
- Legacy,
- Binary,
- Hex,
- Date,
- Time,
- DateComponent,
-
- // String formats.
- String,
-}
-
-impl From<Type> for Category {
- fn from(source: Type) -> Self {
- match source {
- Type::F | Type::Comma | Type::Dot | Type::Dollar | Type::Pct | Type::E => Self::Basic,
- Type::CC(_) => Self::Custom,
- Type::N | Type::Z => Self::Legacy,
- Type::P | Type::PK | Type::IB | Type::PIB | Type::RB => Self::Binary,
- Type::PIBHex | Type::RBHex => Self::Hex,
- Type::Date
- | Type::ADate
- | Type::EDate
- | Type::JDate
- | Type::SDate
- | Type::QYr
- | Type::MoYr
- | Type::WkYr
- | Type::DateTime
- | Type::YMDHMS => Self::Date,
- Type::MTime | Type::Time | Type::DTime => Self::Time,
- Type::WkDay | Type::Month => Self::DateComponent,
- Type::A | Type::AHex => Self::String,
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq, Hash)]
-pub enum CC {
- A,
- B,
- C,
- D,
- E,
-}
-
-impl Display for CC {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match self {
- CC::A => "A",
- CC::B => "B",
- CC::C => "C",
- CC::D => "D",
- CC::E => "E",
- };
- write!(f, "{}", s)
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub enum Type {
- // Basic numeric formats.
- F,
- Comma,
- Dot,
- Dollar,
- Pct,
- E,
-
- // Custom currency formats.
- CC(CC),
-
- // Legacy numeric formats.
- N,
- Z,
-
- // Binary and hexadecimal formats.
- P,
- PK,
- IB,
- PIB,
- PIBHex,
- RB,
- RBHex,
-
- // Time and date formats.
- Date,
- ADate,
- EDate,
- JDate,
- SDate,
- QYr,
- MoYr,
- WkYr,
- DateTime,
- YMDHMS,
- MTime,
- Time,
- DTime,
-
- // Date component formats.
- WkDay,
- Month,
-
- // String formats.
- A,
- AHex,
-}
-
-pub type Width = u16;
-pub type SignedWidth = i16;
-
-pub type Decimals = u8;
-
-impl Type {
- pub fn max_width(self) -> Width {
- match self {
- Self::P | Self::PK | Self::PIBHex | Self::RBHex => 16,
- Self::IB | Self::PIB | Self::RB => 8,
- Self::A => 32767,
- Self::AHex => 32767 * 2,
- _ => 40,
- }
- }
-
- pub fn min_width(self) -> Width {
- match self {
- // Basic numeric formats.
- Self::F => 1,
- Self::Comma => 1,
- Self::Dot => 1,
- Self::Dollar => 2,
- Self::Pct => 2,
- Self::E => 6,
-
- // Custom currency formats.
- Self::CC(_) => 2,
-
- // Legacy numeric formats.
- Self::N => 1,
- Self::Z => 1,
-
- // Binary and hexadecimal formats.
- Self::P => 1,
- Self::PK => 1,
- Self::IB => 1,
- Self::PIB => 1,
- Self::PIBHex => 2,
- Self::RB => 2,
- Self::RBHex => 4,
-
- // Time and date formats.
- Self::Date => 9,
- Self::ADate => 8,
- Self::EDate => 8,
- Self::JDate => 5,
- Self::SDate => 8,
- Self::QYr => 6,
- Self::MoYr => 6,
- Self::WkYr => 8,
- Self::DateTime => 17,
- Self::YMDHMS => 16,
- Self::MTime => 5,
- Self::Time => 5,
- Self::DTime => 8,
-
- // Date component formats.
- Self::WkDay => 2,
- Self::Month => 3,
-
- // String formats.
- Self::A => 1,
- Self::AHex => 2,
- }
- }
-
- pub fn width_range(self) -> RangeInclusive<Width> {
- self.min_width()..=self.max_width()
- }
-
- pub fn max_decimals(self, width: Width) -> Decimals {
- let width = width.clamp(1, 40) as SignedWidth;
- let max = match self {
- Self::F | Self::Comma | Self::Dot | Self::CC(_) => width - 1,
- Self::Dollar | Self::Pct => width - 2,
- Self::E => width - 7,
- Self::N | Self::Z => width,
- Self::P => width * 2 - 1,
- Self::PK => width * 2,
- Self::IB | Self::PIB => max_digits_for_bytes(width as usize) as SignedWidth,
- Self::PIBHex => 0,
- Self::RB | Self::RBHex => 16,
- Self::Date
- | Self::ADate
- | Self::EDate
- | Self::JDate
- | Self::SDate
- | Self::QYr
- | Self::MoYr
- | Self::WkYr => 0,
- Self::DateTime => width - 21,
- Self::YMDHMS => width - 20,
- Self::MTime => width - 6,
- Self::Time => width - 9,
- Self::DTime => width - 12,
- Self::WkDay | Self::Month | Self::A | Self::AHex => 0,
- };
- max.clamp(0, 16) as Decimals
- }
-
- pub fn takes_decimals(self) -> bool {
- self.max_decimals(Width::MAX) > 0
- }
-
- pub fn category(self) -> Category {
- self.into()
- }
-
- pub fn width_step(self) -> Width {
- if self.category() == Category::Hex || self == Self::AHex {
- 2
- } else {
- 1
- }
- }
-
- pub fn clamp_width(self, width: Width) -> Width {
- let (min, max) = self.width_range().into_inner();
- let width = width.clamp(min, max);
- if self.width_step() == 2 {
- width / 2 * 2
- } else {
- width
- }
- }
-
- pub fn var_type(self) -> VarType {
- match self {
- Self::A | Self::AHex => VarType::String,
- _ => VarType::Numeric,
- }
- }
-
- /// Checks whether this format is valid for a variable with the given
- /// `var_type`.
- pub fn check_type_compatibility(self, var_type: VarType) -> Result<(), Error> {
- let my_type = self.var_type();
- match (my_type, var_type) {
- (VarType::Numeric, VarType::String) => {
- Err(Error::UnnamedVariableNotCompatibleWithNumericFormat(self))
- }
- (VarType::String, VarType::Numeric) => {
- Err(Error::UnnamedVariableNotCompatibleWithStringFormat(self))
- }
- _ => Ok(()),
- }
- }
-}
-
-impl Display for Type {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match self {
- Self::F => "F",
- Self::Comma => "COMMA",
- Self::Dot => "DOT",
- Self::Dollar => "DOLLAR",
- Self::Pct => "PCT",
- Self::E => "E",
- Self::CC(cc) => return write!(f, "{}", cc),
- Self::N => "N",
- Self::Z => "Z",
- Self::P => "P",
- Self::PK => "PK",
- Self::IB => "IB",
- Self::PIB => "PIB",
- Self::PIBHex => "PIBHEX",
- Self::RB => "RB",
- Self::RBHex => "RBHEX",
- Self::Date => "DATE",
- Self::ADate => "ADATE",
- Self::EDate => "EDATE",
- Self::JDate => "JDATE",
- Self::SDate => "SDATE",
- Self::QYr => "QYR",
- Self::MoYr => "MOYR",
- Self::WkYr => "WKYR",
- Self::DateTime => "DATETIME",
- Self::YMDHMS => "YMDHMS",
- Self::MTime => "MTIME",
- Self::Time => "TIME",
- Self::DTime => "DTIME",
- Self::WkDay => "WKDAY",
- Self::Month => "MONTH",
- Self::A => "A",
- Self::AHex => "AHEX",
- };
- write!(f, "{}", s)
- }
-}
-
-fn max_digits_for_bytes(bytes: usize) -> usize {
- *[0, 3, 5, 8, 10, 13, 15, 17].get(bytes).unwrap_or(&20)
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct Format {
- type_: Type,
- w: Width,
- d: Decimals,
-}
-
-impl Format {
- pub const F40: Format = Format {
- type_: Type::F,
- w: 40,
- d: 0,
- };
-
- pub const F8_2: Format = Format {
- type_: Type::F,
- w: 8,
- d: 2,
- };
-
- pub fn format(self) -> Type {
- self.type_
- }
- pub fn w(self) -> Width {
- self.w
- }
- pub fn d(self) -> Decimals {
- self.d
- }
-
- pub fn default_for_width(var_width: VarWidth) -> Self {
- match var_width {
- VarWidth::Numeric => Format {
- type_: Type::F,
- w: 8,
- d: 2,
- },
- VarWidth::String(w) => Format {
- type_: Type::A,
- w,
- d: 0,
- },
- }
- }
-
- pub fn fixed_from(source: &UncheckedFormat) -> Self {
- let UncheckedFormat {
- type_: format,
- w,
- d,
- } = *source;
- let (min, max) = format.width_range().into_inner();
- let mut w = w.clamp(min, max);
- if d <= format.max_decimals(Width::MAX) {
- while d > format.max_decimals(w) {
- w += 1;
- assert!(w <= 40);
- }
- }
- let d = d.clamp(0, format.max_decimals(w));
- Self {
- type_: format,
- w,
- d,
- }
- }
-
- pub fn var_width(self) -> VarWidth {
- match self.type_ {
- Type::A => VarWidth::String(self.w),
- Type::AHex => VarWidth::String(self.w / 2),
- _ => VarWidth::Numeric,
- }
- }
-
- pub fn var_type(self) -> VarType {
- self.type_.var_type()
- }
-
- /// Checks whether this format specification is valid for a variable with
- /// width `var_width`.
- pub fn check_width_compatibility(self, var_width: VarWidth) -> Result<Self, Error> {
- // Verify that the format is right for the variable's type.
- self.type_.check_type_compatibility(var_width.into())?;
-
- if let VarWidth::String(w) = var_width {
- if var_width != self.var_width() {
- let bad_spec = self;
- let good_spec = if self.type_ == Type::A {
- Format { w, ..self }
- } else {
- Format { w: w * 2, ..self }
- };
- return Err(Error::UnnamedStringVariableBadSpecWidth {
- width: w,
- bad_spec,
- good_spec,
- });
- }
- }
-
- Ok(self)
- }
-}
-
-impl Display for Format {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{}{}", self.type_, self.w)?;
- if self.type_.takes_decimals() || self.d > 0 {
- write!(f, ".{}", self.d)?;
- }
- Ok(())
- }
-}
-
-impl TryFrom<UncheckedFormat> for Format {
- type Error = Error;
-
- fn try_from(source: UncheckedFormat) -> Result<Self, Self::Error> {
- let UncheckedFormat {
- type_: format,
- w,
- d,
- } = source;
- let max_d = format.max_decimals(w);
- if w % format.width_step() != 0 {
- Err(Error::OddWidthNotAllowed(source))
- } else if !format.width_range().contains(&w) {
- Err(Error::BadWidth(source))
- } else if d > max_d {
- if format.takes_decimals() {
- Err(Error::DecimalsNotAllowedForFormat(source))
- } else if max_d > 0 {
- Err(Error::TooManyDecimalsForWidth {
- spec: source,
- max_d,
- })
- } else {
- Err(Error::DecimalsNotAllowedForWidth(source))
- }
- } else {
- Ok(Format {
- type_: format,
- w,
- d,
- })
- }
- }
-}
-
-impl TryFrom<u16> for Type {
- type Error = Error;
-
- fn try_from(source: u16) -> Result<Self, Self::Error> {
- match source {
- 1 => Ok(Self::A),
- 2 => Ok(Self::AHex),
- 3 => Ok(Self::Comma),
- 4 => Ok(Self::Dollar),
- 5 => Ok(Self::F),
- 6 => Ok(Self::IB),
- 7 => Ok(Self::PIBHex),
- 8 => Ok(Self::P),
- 9 => Ok(Self::PIB),
- 10 => Ok(Self::PK),
- 11 => Ok(Self::RB),
- 12 => Ok(Self::RBHex),
- 15 => Ok(Self::Z),
- 16 => Ok(Self::N),
- 17 => Ok(Self::E),
- 20 => Ok(Self::Date),
- 21 => Ok(Self::Time),
- 22 => Ok(Self::DateTime),
- 23 => Ok(Self::ADate),
- 24 => Ok(Self::JDate),
- 25 => Ok(Self::DTime),
- 26 => Ok(Self::WkDay),
- 27 => Ok(Self::Month),
- 28 => Ok(Self::MoYr),
- 29 => Ok(Self::QYr),
- 30 => Ok(Self::WkYr),
- 31 => Ok(Self::Pct),
- 32 => Ok(Self::Dot),
- 33 => Ok(Self::CC(CC::A)),
- 34 => Ok(Self::CC(CC::B)),
- 35 => Ok(Self::CC(CC::C)),
- 36 => Ok(Self::CC(CC::D)),
- 37 => Ok(Self::CC(CC::E)),
- 38 => Ok(Self::EDate),
- 39 => Ok(Self::SDate),
- 40 => Ok(Self::MTime),
- 41 => Ok(Self::YMDHMS),
- _ => Err(Error::UnknownFormat { value: source }),
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
-pub struct UncheckedFormat {
- pub type_: Type,
-
- pub w: Width,
-
- pub d: Decimals,
-}
-
-impl TryFrom<raw::Spec> for UncheckedFormat {
- type Error = Error;
-
- fn try_from(raw: raw::Spec) -> Result<Self, Self::Error> {
- let raw = raw.0;
- let raw_format = (raw >> 16) as u16;
- let format = raw_format.try_into()?;
- let w = ((raw >> 8) & 0xff) as Width;
- let d = (raw & 0xff) as Decimals;
- Ok(Self {
- type_: format,
- w,
- d,
- })
- }
-}
-
-impl Display for UncheckedFormat {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{}{}", self.type_, self.w)?;
- if self.type_.takes_decimals() || self.d > 0 {
- write!(f, ".{}", self.d)?;
- }
- Ok(())
- }
-}
-
-pub struct Settings {
- epoch: Option<i32>,
-
- /// Either `'.'` or `','`.
- decimal: char,
-
- /// Format `F`, `E`, `COMMA`, and `DOT` with leading zero (e.g. `0.5`
- /// instead of `.5`)?
- include_leading_zero: bool,
-
- /// Custom currency styles.
- ccs: EnumMap<CC, Option<NumberStyle>>,
-}
-
-impl Default for Settings {
- fn default() -> Self {
- Self {
- epoch: None,
- decimal: '.',
- include_leading_zero: false,
- ccs: Default::default(),
- }
- }
-}
-
-/// A numeric output style. This can express numeric formats in
-/// [Category::Basic] and [Category::Custom].
-pub struct NumberStyle {
- neg_prefix: Affix,
- prefix: Affix,
- suffix: Affix,
- neg_suffix: Affix,
-
- /// Decimal point: `'.'` or `','`.
- decimal: char,
-
- /// Grouping character: `'.'` or `','` or `None`.
- grouping: Option<char>,
-
- /// Format as `.5` or `0.5`?
- include_leading_zero: bool,
-
- /// An `Affix` may require more bytes than its display width; for example,
- /// U+00A5 (¥) is 2 bytes in UTF-8 but occupies only one display column.
- /// This member is the sum of the number of bytes required by all of the
- /// `Affix` members in this struct, minus their display widths. Thus, it
- /// can be used to size memory allocations: for example, the formatted
- /// result of `CCA20.5` requires no more than `(20 + extra_bytes)` bytes in
- /// UTF-8.
- extra_bytes: usize,
-}
-
-pub struct Affix {
- /// String contents of affix.
- s: String,
-
- /// Display width in columns (see [unicode_width])
- width: usize,
-}
+++ /dev/null
-use num::Float;
-use std::{num::FpCategory, fmt::{Display, Formatter, Result}};
-
-pub struct HexFloat<T: Float>(pub T);
-
-impl<T: Float> Display for HexFloat<T> {
- fn fmt(&self, f: &mut Formatter<'_>) -> Result {
- let sign = if self.0.is_sign_negative() { "-" } else { "" };
- match self.0.classify() {
- FpCategory::Nan => return write!(f, "NaN"),
- FpCategory::Infinite => return write!(f, "{sign}Infinity"),
- FpCategory::Zero => return write!(f, "{sign}0.0"),
- _ => (),
- };
- let (significand, mut exponent, _) = self.0.integer_decode();
- let mut hex_sig = format!("{:x}", significand);
- while hex_sig.ends_with('0') {
- hex_sig.pop();
- exponent += 4;
- }
- match hex_sig.len() {
- 0 => write!(f, "{sign}0.0"),
- 1 => write!(f, "{sign}0x{hex_sig}.0p{exponent}"),
- len => write!(
- f,
- "{sign}0x{}.{}p{}",
- hex_sig.chars().next().unwrap(),
- &hex_sig[1..],
- exponent + 4 * (len as i16 - 1)
- ),
- }
- }
-}
-
-#[cfg(test)]
-mod hex_float_tests {
- use crate::HexFloat;
- use num::Float;
-
- #[test]
- fn test() {
- assert_eq!(format!("{}", HexFloat(1.0)), "0x1.0p0");
- assert_eq!(format!("{}", HexFloat(123.0)), "0x1.ecp6");
- assert_eq!(format!("{}", HexFloat(1.0 / 16.0)), "0x1.0p-4");
- assert_eq!(format!("{}", HexFloat(f64::infinity())), "Infinity");
- assert_eq!(format!("{}", HexFloat(f64::neg_infinity())), "-Infinity");
- assert_eq!(format!("{}", HexFloat(f64::nan())), "NaN");
- assert_eq!(format!("{}", HexFloat(0.0)), "0.0");
- assert_eq!(format!("{}", HexFloat(f64::neg_zero())), "-0.0");
- }
-}
-
+++ /dev/null
-use std::{
- borrow::Borrow,
- cmp::Ordering,
- fmt::{Debug, Display, Formatter, Result as FmtResult},
- hash::{Hash, Hasher},
- ops::Deref,
-};
-
-use encoding_rs::{EncoderResult, Encoding, UTF_8};
-use finl_unicode::categories::{CharacterCategories, MajorCategory};
-use thiserror::Error as ThisError;
-use unicase::UniCase;
-
-pub trait IdentifierChar {
- /// Returns true if `self` is an ASCII character that may be the first
- /// character in an identifier.
- fn ascii_may_start_id(self) -> bool;
-
- /// Returns true if `self` may be the first character in an identifier.
- fn may_start_id(self) -> bool;
-
- /// Returns true if `self` is an ASCII character that may be a second or
- /// subsequent character in an identifier.
- fn ascii_may_continue_id(self) -> bool;
-
- /// Returns true if `self` may be a second or subsequent character in an
- /// identifier.
- fn may_continue_id(self) -> bool;
-}
-
-impl IdentifierChar for char {
- fn ascii_may_start_id(self) -> bool {
- matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!')
- }
-
- fn may_start_id(self) -> bool {
- if self < '\u{0080}' {
- self.ascii_may_start_id()
- } else {
- use MajorCategory::*;
-
- [L, M, S].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
- }
- }
-
- fn ascii_may_continue_id(self) -> bool {
- matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_')
- }
-
- fn may_continue_id(self) -> bool {
- if self < '\u{0080}' {
- self.ascii_may_continue_id()
- } else {
- use MajorCategory::*;
-
- [L, M, S, N].contains(&self.get_major_category()) && self != char::REPLACEMENT_CHARACTER
- }
- }
-}
-
-#[derive(Clone, Debug, ThisError)]
-pub enum Error {
- #[error("Identifier cannot be empty string.")]
- Empty,
-
- #[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
- Reserved(String),
-
- #[error("\"!\" is not a valid identifier.")]
- Bang,
-
- #[error("\"{0}\" may not be used as an identifier because it begins with disallowed character \"{1}\".")]
- BadFirstCharacter(String, char),
-
- #[error("\"{0}\" may not be used as an identifier because it contains disallowed character \"{1}\".")]
- BadLaterCharacter(String, char),
-
- #[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
- TooLong {
- id: String,
- length: usize,
- encoding: &'static str,
- max: usize,
- },
-
- #[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")]
- NotEncodable {
- id: String,
- encoding: &'static str,
- c: char,
- },
-}
-
-pub enum ReservedWord {
- And,
- Or,
- Not,
- Eq,
- Ge,
- Gt,
- Le,
- Lt,
- Ne,
- All,
- By,
- To,
- With,
-}
-
-impl TryFrom<&str> for ReservedWord {
- type Error = ();
-
- fn try_from(source: &str) -> Result<Self, Self::Error> {
- if !(2..=4).contains(&source.len()) {
- Err(())
- } else {
- let b = source.as_bytes();
- let c0 = b[0].to_ascii_uppercase();
- let c1 = b[1].to_ascii_uppercase();
- match (source.len(), c0, c1) {
- (2, b'B', b'Y') => Ok(Self::By),
- (2, b'E', b'Q') => Ok(Self::Eq),
- (2, b'G', b'T') => Ok(Self::Gt),
- (2, b'G', b'E') => Ok(Self::Ge),
- (2, b'L', b'T') => Ok(Self::Lt),
- (2, b'L', b'E') => Ok(Self::Le),
- (2, b'N', b'E') => Ok(Self::Ne),
- (3, b'N', b'O') if b[2].to_ascii_uppercase() == b'T' => Ok(Self::Not),
- (2, b'O', b'R') => Ok(Self::Or),
- (2, b'T', b'O') => Ok(Self::To),
- (3, b'A', b'L') if b[2].to_ascii_uppercase() == b'L' => Ok(Self::All),
- (3, b'A', b'N') if b[2].to_ascii_uppercase() == b'D' => Ok(Self::And),
- (4, b'W', b'I')
- if b[2].to_ascii_uppercase() == b'T' && b[3].to_ascii_uppercase() == b'H' =>
- {
- Ok(Self::With)
- }
- _ => Err(()),
- }
- }
- }
-}
-
-pub fn is_reserved_word(s: &str) -> bool {
- ReservedWord::try_from(s).is_ok()
-}
-
-#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Identifier(pub UniCase<String>);
-
-impl Identifier {
- /// Maximum length of an identifier, in bytes. The limit applies in the
- /// encoding used by the dictionary, not in UTF-8.
- pub const MAX_LEN: usize = 64;
-
- pub fn new(s: &str) -> Result<Self, Error> {
- Self::from_encoding(s, UTF_8)
- }
- pub fn from_encoding(s: &str, encoding: &'static Encoding) -> Result<Identifier, Error> {
- Self::is_plausible(s)?;
- let identifier = Identifier(s.into());
- identifier.check_encoding(encoding)?;
- Ok(identifier)
- }
-
- /// Checks whether this is a valid identifier in the given `encoding`. An
- /// identifier that is valid in one encoding might be invalid in another
- /// because some characters are unencodable or because it is too long.
- pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> {
- let s = self.0.as_str();
- let (_encoded, _, unencodable) = encoding.encode(s);
- if unencodable {
- let mut encoder = encoding.new_encoder();
- let mut buf = Vec::with_capacity(
- encoder
- .max_buffer_length_from_utf8_without_replacement(s.len())
- .unwrap(),
- );
- let EncoderResult::Unmappable(c) = encoder
- .encode_from_utf8_to_vec_without_replacement(s, &mut buf, true)
- .0
- else {
- unreachable!();
- };
- return Err(Error::NotEncodable {
- id: s.into(),
- encoding: encoding.name(),
- c,
- });
- }
- /*
- if encoded.len() > Self::MAX_LEN {
- return Err(Error::TooLong {
- id: s.into(),
- length: encoded.len(),
- encoding: encoding.name(),
- max: Self::MAX_LEN,
- });
- }*/
- Ok(())
- }
- pub fn is_plausible(s: &str) -> Result<(), Error> {
- if s.is_empty() {
- return Err(Error::Empty);
- }
- if is_reserved_word(s) {
- return Err(Error::Reserved(s.into()));
- }
- if s == "!" {
- return Err(Error::Bang);
- }
-
- let mut i = s.chars();
- let first = i.next().unwrap();
- if !first.may_start_id() {
- return Err(Error::BadFirstCharacter(s.into(), first));
- }
- for c in i {
- if !c.may_continue_id() {
- return Err(Error::BadLaterCharacter(s.into(), c));
- }
- }
- Ok(())
- }
-
- /// Returns true if `token` is a case-insensitive match for `keyword`.
- ///
- /// Keywords match `keyword` and `token` are identical, or `token` is at
- /// least 3 characters long and those characters are identical to `keyword`
- /// or differ only in case.
- ///
- /// `keyword` must be ASCII.
- pub fn matches_keyword(&self, keyword: &str) -> bool {
- id_match_n_nonstatic(keyword, self.0.as_str(), 3)
- }
-
- /// Returns true if `token` is a case-insensitive match for at least the
- /// first `n` characters of `keyword`.
- ///
- /// `keyword` must be ASCII.
- pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
- id_match_n_nonstatic(keyword, self.0.as_str(), n)
- }
-}
-
-impl PartialEq<str> for Identifier {
- fn eq(&self, other: &str) -> bool {
- self.0.eq(&UniCase::new(other))
- }
-}
-
-/// Returns true if `token` is a case-insensitive match for `keyword`.
-///
-/// Keywords match `keyword` and `token` are identical, or `token` is at least 3
-/// characters long and those characters are identical to `keyword` or differ
-/// only in case.
-///
-/// `keyword` must be ASCII. It's normally a constant string, so it's declared
-/// as `&'static str` to make it harder to reverse the argument order. But
-/// there's no reason that a non-static string won't work, so use
-/// [`id_match_n_nonstatic`] instead if you need it.
-pub fn id_match(keyword: &'static str, token: &str) -> bool {
- id_match_n(keyword, token, 3)
-}
-
-/// Returns true if `token` is a case-insensitive match for at least the first
-/// `n` characters of `keyword`.
-///
-/// `keyword` must be ASCII. It's normally a constant string, so it's declared
-/// as `&'static str` to make it harder to reverse the argument order. But
-/// there's no reason that a non-static string won't work, so use
-/// [`id_match_n_nonstatic`] instead if you need it.
-pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool {
- id_match_n_nonstatic(keyword, token, n)
-}
-
-/// Returns true if `token` is a case-insensitive match for at least the first
-/// `n` characters of `keyword`.
-///
-/// `keyword` must be ASCII.
-pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool {
- debug_assert!(keyword.is_ascii());
- let keyword_prefix = if (n..keyword.len()).contains(&token.len()) {
- &keyword[..token.len()]
- } else {
- keyword
- };
- keyword_prefix.eq_ignore_ascii_case(token)
-}
-
-impl Display for Identifier {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{}", self.0)
- }
-}
-
-impl Debug for Identifier {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.0)
- }
-}
-
-pub trait HasIdentifier {
- fn identifier(&self) -> &UniCase<String>;
-}
-
-pub struct ByIdentifier<T>(pub T)
-where
- T: HasIdentifier;
-
-impl<T> ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- pub fn new(inner: T) -> Self {
- Self(inner)
- }
-}
-
-impl<T> PartialEq for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn eq(&self, other: &Self) -> bool {
- self.0.identifier().eq(other.0.identifier())
- }
-}
-
-impl<T> Eq for ByIdentifier<T> where T: HasIdentifier {}
-
-impl<T> PartialOrd for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
- Some(self.cmp(other))
- }
-}
-
-impl<T> Ord for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn cmp(&self, other: &Self) -> Ordering {
- self.0.identifier().cmp(other.0.identifier())
- }
-}
-
-impl<T> Hash for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn hash<H: Hasher>(&self, state: &mut H) {
- self.0.identifier().hash(state)
- }
-}
-
-impl<T> Borrow<UniCase<String>> for ByIdentifier<T>
-where
- T: HasIdentifier,
-{
- fn borrow(&self) -> &UniCase<String> {
- self.0.identifier()
- }
-}
-
-impl<T> Debug for ByIdentifier<T>
-where
- T: HasIdentifier + Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- self.0.fmt(f)
- }
-}
-
-impl<T> Clone for ByIdentifier<T>
-where
- T: HasIdentifier + Clone,
-{
- fn clone(&self) -> Self {
- Self(self.0.clone())
- }
-}
-
-impl<T> Deref for ByIdentifier<T>
-where
- T: HasIdentifier + Clone,
-{
- type Target = T;
-
- fn deref(&self) -> &Self::Target {
- &self.0
- }
-}
+++ /dev/null
-pub trait ToInteger {
- fn to_exact_integer<T>(&self) -> Option<T>
- where
- T: FromFloat;
- fn to_exact_usize(&self) -> Option<usize> {
- self.to_exact_integer()
- }
- fn to_exact_u8(&self) -> Option<u8> {
- self.to_exact_integer()
- }
- fn to_exact_u16(&self) -> Option<u16> {
- self.to_exact_integer()
- }
- fn to_exact_u32(&self) -> Option<u32> {
- self.to_exact_integer()
- }
- fn to_exact_u64(&self) -> Option<u64> {
- self.to_exact_integer()
- }
- fn to_exact_u128(&self) -> Option<u128> {
- self.to_exact_integer()
- }
- fn to_exact_isize(&self) -> Option<usize> {
- self.to_exact_integer()
- }
- fn to_exact_i8(&self) -> Option<i8> {
- self.to_exact_integer()
- }
- fn to_exact_i16(&self) -> Option<i16> {
- self.to_exact_integer()
- }
- fn to_exact_i32(&self) -> Option<i32> {
- self.to_exact_integer()
- }
- fn to_exact_i64(&self) -> Option<i64> {
- self.to_exact_integer()
- }
- fn to_exact_i128(&self) -> Option<i128> {
- self.to_exact_integer()
- }
-}
-
-impl ToInteger for f64 {
- fn to_exact_integer<T>(&self) -> Option<T>
- where
- T: FromFloat,
- {
- T::from_float(*self)
- }
-}
-
-pub trait FromFloat {
- fn from_float(x: f64) -> Option<Self>
- where
- Self: Sized;
-}
-
-macro_rules! impl_from_float {
- ($T:ident) => {
- impl FromFloat for $T {
- fn from_float(x: f64) -> Option<Self>
- where
- Self: Sized,
- {
- if x.trunc() == x && x >= $T::MIN as f64 && x <= $T::MAX as f64 {
- Some(x as Self)
- } else {
- None
- }
- }
- }
- };
-}
-
-impl_from_float!(usize);
-impl_from_float!(u8);
-impl_from_float!(u16);
-impl_from_float!(u32);
-impl_from_float!(u64);
-impl_from_float!(u128);
-impl_from_float!(isize);
-impl_from_float!(i8);
-impl_from_float!(i16);
-impl_from_float!(i32);
-impl_from_float!(i64);
-impl_from_float!(i128);
+++ /dev/null
-use crate::identifier::id_match_n_nonstatic;
-
-pub struct Match {
- pub exact: bool,
- pub missing_words: isize,
-}
-
-fn count_words(s: &str) -> isize {
- s.split_whitespace().count() as isize
-}
-
-/// Compares `string` obtained from the user against the full name of a `command`,
-/// using this algorithm:
-///
-/// 1. Divide `command` into words `c[0]` through `c[n - 1]`.
-///
-/// 2. Divide `string` into words `s[0]` through `s[m - 1]`.
-///
-/// 3. Compare word `c[i]` against `s[i]` for `0 <= i < min(n, m)`, using the keyword
-/// matching algorithm implemented by lex_id_match(). If any of them fail to
-/// match, then `string` does not match `command` and the function returns false.
-///
-/// 4. Otherwise, `string` and `command` match. Set *MISSING_WORDS to n - m. Set
-/// *EXACT to false if any of the S[i] were found to be abbreviated in the
-/// comparisons done in step 3, or to true if they were all exactly equal
-/// (modulo case). Return true.
-pub fn command_match(command: &str, string: &str) -> Option<Match> {
- let mut command_words = command.split_whitespace();
- let mut string_words = string.split_whitespace();
- let mut exact = true;
- loop {
- let Some(cw) = command_words.next() else {
- return Some(Match {
- exact,
- missing_words: -(string_words.count() as isize),
- });
- };
- let Some(sw) = string_words.next() else {
- return Some(Match {
- exact,
- missing_words: 1 + command_words.count() as isize,
- });
- };
- if !id_match_n_nonstatic(cw, sw, 3) {
- return None;
- }
- if sw.len() < cw.len() {
- exact = false;
- }
- }
-}
-
-/// Matches a string against a collection of command names.
-pub struct CommandMatcher<'a, T> {
- string: &'a str,
- extensible: bool,
- exact_match: Option<T>,
- n_matches: usize,
- match_: Option<T>,
- match_missing_words: isize,
-}
-
-impl<'a, T> CommandMatcher<'a, T> {
- pub fn new(string: &'a str) -> Self {
- Self {
- string,
- extensible: false,
- exact_match: None,
- n_matches: 0,
- match_: None,
- match_missing_words: 0,
- }
- }
-
- /// Consider `command` as a candidate for the command name being parsed. If
- /// `command` is the correct command name, then [Self::get_match] will
- /// return `aux` later.
- pub fn add(&mut self, command: &str, aux: T) {
- if let Some(Match {
- missing_words,
- exact,
- }) = command_match(command, self.string)
- {
- if missing_words > 0 {
- self.extensible = true;
- } else if exact && missing_words == 0 {
- self.exact_match = Some(aux);
- } else {
- if missing_words > self.match_missing_words {
- self.n_matches = 0;
- }
- if missing_words >= self.match_missing_words || self.n_matches == 0 {
- self.n_matches += 1;
- self.match_ = Some(aux);
- self.match_missing_words = missing_words;
- }
- }
- }
- }
-
- pub fn get_match(self) -> (Option<T>, isize) {
- if self.extensible {
- (None, 1)
- } else if let Some(exact_match) = self.exact_match {
- (Some(exact_match), 0)
- } else if self.n_matches == 1 {
- (self.match_, self.match_missing_words)
- } else {
- (None, self.match_missing_words)
- }
- }
-}
-
-pub const COMMAND_NAMES: &'static [&'static str] = &[
- "2SLS",
- "ACF",
- "ADD DOCUMENT",
- "ADD FILES",
- "ADD VALUE LABELS",
- "AGGREGATE",
- "ALSCAL",
- "ANACOR",
- "ANOVA",
- "APPLY DICTIONARY",
- "AUTORECODE",
- "BEGIN DATA",
- "BREAK",
- "CACHE",
- "CASEPLOT",
- "CASESTOVARS",
- "CATPCA",
- "CATREG",
- "CCF",
- "CD",
- "CLEAR TRANSFORMATIONS",
- "CLOSE FILE HANDLE",
- "CLUSTER",
- "COMPUTE",
- "CONJOINT",
- "CORRELATIONS",
- "CORRESPONDENCE",
- "COUNT",
- "COXREG",
- "CREATE",
- "CROSSTABS",
- "CSDESCRIPTIVES",
- "CSGLM",
- "CSLOGISTIC",
- "CSPLAN",
- "CSSELECT",
- "CSTABULATE",
- "CTABLES",
- "CURVEFIT",
- "DATA LIST",
- "DATAFILE ATTRIBUTE",
- "DATASET ACTIVATE",
- "DATASET CLOSE",
- "DATASET COPY",
- "DATASET DECLARE",
- "DATASET DISPLAY",
- "DATASET NAME",
- "DATE",
- "DEBUG EVALUATE",
- "DEBUG EXPAND",
- "DEBUG FLOAT FORMAT",
- "DEBUG FORMAT GUESSER",
- "DEBUG MATRIX READ",
- "DEBUG MOMENTS",
- "DEBUG PAPER SIZE",
- "DEBUG POOL",
- "DEBUG XFORM FAIL",
- "DEFINE",
- "DELETE VARIABLES",
- "DESCRIPTIVES",
- "DETECTANOMALY",
- "DISCRIMINANT",
- "DISPLAY MACROS",
- "DISPLAY VARIABLE SETS",
- "DISPLAY",
- "DO IF",
- "DO REPEAT",
- "DOCUMENT",
- "DROP DOCUMENTS",
- "ECHO",
- "EDIT",
- "ELSE IF",
- "ELSE",
- "END CASE",
- "END FILE TYPE",
- "END FILE",
- "END IF",
- "END LOOP",
- "END REPEAT",
- "ERASE",
- "EXAMINE",
- "EXECUTE",
- "EXIT",
- "EXPORT",
- "FACTOR",
- "FILE HANDLE",
- "FILE LABEL",
- "FILE TYPE",
- "FILTER",
- "FINISH",
- "FIT",
- "FLIP",
- "FORMATS",
- "FREQUENCIES",
- "GENLOG",
- "GET DATA",
- "GET TRANSLATE",
- "GET",
- "GGRAPH",
- "GLM",
- "GRAPH",
- "HILOGLINEAR",
- "HOMALS",
- "HOST",
- "IF",
- "IGRAPH",
- "IMPORT",
- "INCLUDE",
- "INFO",
- "INPUT PROGRAM",
- "INSERT",
- "KEYED DATA LIST",
- "KM",
- "LEAVE",
- "LIST",
- "LOGISTIC REGRESSION",
- "LOGLINEAR",
- "LOOP",
- "MANOVA",
- "MAPS",
- "MATCH FILES",
- "MATRIX DATA",
- "MATRIX",
- "MCONVERT",
- "MEANS",
- "MISSING VALUES",
- "MIXED",
- "MODEL CLOSE",
- "MODEL HANDLE",
- "MODEL LIST",
- "MODEL NAME",
- "MRSETS",
- "MULT RESPONSE",
- "MULTIPLE CORRESPONDENCE",
- "MVA",
- "N OF CASES",
- "N",
- "NAIVEBAYES",
- "NEW FILE",
- "NLR",
- "NOMREG",
- "NONPAR CORR",
- "NPAR TESTS",
- "NUMBERED",
- "NUMERIC",
- "OLAP CUBES",
- "OMS",
- "ONEWAY",
- "ORTHOPLAN",
- "OUTPUT MODIFY",
- "OVERALS",
- "PACF",
- "PARTIAL CORR",
- "PEARSON CORRELATIONS",
- "PERMISSIONS",
- "PLANCARDS",
- "PLUM",
- "POINT",
- "PPLOT",
- "PREDICT",
- "PREFSCAL",
- "PRESERVE",
- "PRINCALS",
- "PRINT EJECT",
- "PRINT FORMATS",
- "PRINT SPACE",
- "PRINT",
- "PROBIT",
- "PROCEDURE OUTPUT",
- "PROXIMITIES",
- "PROXSCAL",
- "Q",
- "QUICK CLUSTER",
- "QUIT",
- "RANK",
- "RATIO STATISTICS",
- "READ MODEL",
- "RECODE",
- "RECORD TYPE",
- "REFORMAT",
- "REGRESSION",
- "RELIABILITY",
- "RENAME VARIABLES",
- "REPEATING DATA",
- "REPORT",
- "REREAD",
- "RESTORE",
- "RMV",
- "ROC",
- "SAMPLE",
- "SAVE DATA COLLECTION",
- "SAVE TRANSLATE",
- "SAVE",
- "SCRIPT",
- "SEASON",
- "SELECT IF",
- "SELECTPRED",
- "SET",
- "SHOW",
- "SORT CASES",
- "SORT VARIABLES",
- "SPCHART",
- "SPECTRA",
- "SPLIT FILE",
- "STEMLEAF",
- "STRING",
- "SUBTITLE",
- "SUMMARIZE",
- "SURVIVAL",
- "SYSFILE INFO",
- "T-TEST",
- "TDISPLAY",
- "TEMPORARY",
- "TITLE",
- "TREE",
- "TSAPPLY",
- "TSET",
- "TSHOW",
- "TSMODEL",
- "TSPLOT",
- "TWOSTEP CLUSTER",
- "UNIANOVA",
- "UNNUMBERED",
- "UPDATE",
- "USE",
- "VALIDATEDATA",
- "VALUE LABELS",
- "VARCOMP",
- "VARIABLE ALIGNMENT",
- "VARIABLE ATTRIBUTE",
- "VARIABLE LABELS",
- "VARIABLE LEVEL",
- "VARIABLE ROLE",
- "VARIABLE WIDTH",
- "VARSTOCASES",
- "VECTOR",
- "VERIFY",
- "WEIGHT",
- "WLS",
- "WRITE FORMATS",
- "WRITE",
- "XEXPORT",
- "XGRAPH",
- "XSAVE",
-];
+++ /dev/null
-use std::{
- borrow::{Borrow, Cow},
- collections::{HashMap, VecDeque},
- fmt::Write,
- fs,
- io::Result as IoResult,
- mem,
- ops::{Range, RangeInclusive},
- path::Path,
- sync::Arc,
-};
-
-use chardetng::EncodingDetector;
-use encoding_rs::{Encoding, UTF_8};
-use thiserror::Error as ThisError;
-use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
-
-use crate::{
- macros::{macro_tokens_to_syntax, MacroSet, ParseStatus, Parser},
- message::{Category, Diagnostic, Location, Point, Severity},
- prompt::PromptStyle,
- settings::Settings,
-};
-
-use super::{
- scan::{MergeResult, ScanError, ScanToken},
- segment::{Mode, Segment, Segmenter},
- token::Token,
-};
-
-/// Error handling for a [`Reader`].
-#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
-pub enum ErrorHandling {
- /// Discard input line and continue reading.
- Terminal,
-
- /// Continue to next command, except for cascading failures.
- #[default]
- Continue,
-
- /// Continue, even for cascading failures.
- Ignore,
-
- /// Stop processing,
- Stop,
-}
-
-/// # Token pipeline
-///
-/// Tokens pass through a pipeline with the following stages. Each token
-/// eventually made available to the parser passes through of these stages.
-/// The stages are named after the processing that happens in each one.
-///
-/// Initially, tokens come from the segmenter and scanner to `pp`:
-///
-/// - `pp`: Tokens that need to pass through the macro preprocessor to end up
-/// in `merge`.
-///
-/// - `merge`: Tokens that need to pass through
-/// [`super::scan::ScanToken::merge`] to end up in `parse`.
-///
-/// - `parse`: Tokens available to the client for parsing.
-///
-/// `pp` and `merge` store tokens only temporarily until they pass into `parse`.
-/// Tokens then live in `parse` until the command is fully consumed, at which
-/// time they are freed together.
-pub struct Source {
- /// Error-handling mode.
- error_handling: ErrorHandling,
-
- /// Encoding.
- encoding: &'static Encoding,
-
- /// `None` if this reader is not associated with a file.
- file_name: Option<Arc<String>>,
-
- /// True if we've reached EOF already.
- eof: bool,
-
- /// Read some input from the source. If successful, returns the input that
- /// was read. At end of file or on error, returns an empty string.
- ///
- /// `prompt` provides a hint to interactive readers as to what kind of
- /// syntax is being read right now.
- read: Box<dyn Fn(PromptStyle) -> String>,
-
- /// Source file contents.
- buffer: String,
-
- /// 0-based line number of the first line not yet written to the journal.
- journal_line: usize,
-
- /// Byte offset of first character not yet scanned as token.
- seg_pos: usize,
-
- /// Byte offsets into `buffer` of starts of lines. The first element is 0.
- lines: Vec<usize>,
-
- /// Tokens that need to pass through the macro preprocessor to end up in
- /// `merge`.
- pp: VecDeque<LexToken>,
-
- /// Tokens that need to pass through [`super::scan::ScanToken::merge`] to
- /// end up in `parse`.
- merge: VecDeque<LexToken>,
-
- /// Tokens available to the client for parsing.
- parse: Vec<LexToken>,
-
- /// Offset in `parse` of the current token.
- parse_ofs: usize,
-
- segmenter: Segmenter,
-
- suppress_next_newline: bool,
-}
-
-impl Default for Source {
- fn default() -> Self {
- Self {
- error_handling: ErrorHandling::default(),
- encoding: UTF_8,
- file_name: None,
- eof: false,
- read: Box::new(|_| String::new()),
- buffer: String::new(),
- journal_line: 0,
- seg_pos: 0,
- lines: vec![0],
- pp: VecDeque::new(),
- merge: VecDeque::new(),
- parse: Vec::new(),
- parse_ofs: 0,
- segmenter: Segmenter::new(Mode::default(), false),
- suppress_next_newline: false,
- }
- }
-}
-
-impl Source {
- pub fn for_file<P>(
- path: P,
- encoding: Option<&'static Encoding>,
- syntax: Mode,
- error_handling: ErrorHandling,
- ) -> IoResult<Self>
- where
- P: AsRef<Path>,
- {
- let bytes = fs::read(path.as_ref())?;
- let encoding = encoding.unwrap_or_else(|| {
- let mut encoding_detector = EncodingDetector::new();
- encoding_detector.feed(&bytes, true);
- encoding_detector.guess(None, true)
- });
- let (contents, _malformed) = encoding.decode_with_bom_removal(&bytes);
- Ok(Self::for_file_contents(
- contents.to_string(),
- Some(path.as_ref().to_string_lossy().to_string()),
- encoding,
- syntax,
- error_handling,
- ))
- }
-
- pub fn for_file_contents(
- contents: String,
- file_name: Option<String>,
- encoding: &'static Encoding,
- syntax: Mode,
- error_handling: ErrorHandling,
- ) -> Self {
- Self {
- buffer: contents,
- file_name: file_name.map(Arc::new),
- encoding,
- error_handling,
- segmenter: Segmenter::new(syntax, false),
- ..Self::default()
- }
- }
-
- pub fn for_string(contents: String, encoding: &'static Encoding) -> Self {
- Self {
- buffer: contents,
- encoding,
- ..Self::default()
- }
- }
-
- pub fn for_function(
- read: Box<dyn Fn(PromptStyle) -> String>,
- file_name: Option<String>,
- encoding: &'static Encoding,
- syntax: Mode,
- error_handling: ErrorHandling,
- ) -> Self {
- Self {
- read,
- file_name: file_name.map(Arc::new),
- encoding,
- segmenter: Segmenter::new(syntax, false),
- error_handling,
- ..Self::default()
- }
- }
-
- fn read(&mut self) {
- loop {
- let prompt = self.segmenter.prompt();
- let s = (self.read)(prompt);
- if s.is_empty() {
- self.eof = true;
- return;
- }
- self.buffer.push_str(&s);
- if self.buffer[self.seg_pos..].contains('\n') {
- return;
- }
- }
- }
- fn try_get_pp(&mut self, context: &Context) -> bool {
- let (seg_len, seg_type) = loop {
- if let Ok(result) = self.segmenter.push(&self.buffer[self.seg_pos..], self.eof) {
- break result;
- }
-
- debug_assert!(!self.eof);
- self.read();
- };
-
- let pos = self.seg_pos..self.seg_pos + seg_len;
- self.seg_pos += seg_len;
- if seg_type == Segment::Newline {
- self.lines.push(self.seg_pos);
- }
-
- let scan_token = ScanToken::from_segment(&self.buffer[pos.clone()], seg_type);
-
- let n_lines = match (seg_type, self.suppress_next_newline) {
- (Segment::EndCommand, false) => {
- self.suppress_next_newline = true;
- 1
- }
- (Segment::Newline, true) => {
- self.suppress_next_newline = false;
- 0
- }
- (Segment::Newline, false) => 1,
- _ => 0,
- };
- for line_num in self.journal_line..self.journal_line + n_lines {
- let start_ofs = self.lines[line_num];
- let end_ofs = self
- .lines
- .get(line_num + 1)
- .copied()
- .unwrap_or(self.buffer.len());
- let line = &self.buffer[start_ofs..end_ofs];
- let _line = line
- .strip_suffix("\r\n")
- .unwrap_or(line.strip_suffix('\n').unwrap_or(line));
- // XXX submit the line as syntax
- }
- self.journal_line += n_lines;
-
- let pos = pos.start..pos.end;
- match scan_token {
- None => false,
- Some(ScanToken::Token(Token::End)) => {
- self.pp.push_back(LexToken {
- token: Token::EndCommand,
- pos,
- macro_rep: None,
- });
- self.eof = true;
- true
- }
- Some(ScanToken::Token(token)) => {
- self.pp.push_back(LexToken {
- token,
- pos,
- macro_rep: None,
- });
- true
- }
- Some(ScanToken::Error(error)) => {
- (context.error)(
- Location {
- file_name: self.file_name.clone(),
- span: Some(self.offset_to_point(pos.start)..self.offset_to_point(pos.end)),
- omit_underlines: false,
- },
- error.into(),
- );
- false
- }
- }
- }
-
- fn get_pp(&mut self, context: &Context) -> bool {
- while !self.eof {
- if self.try_get_pp(context) {
- return true;
- }
- }
- false
- }
-
- fn try_get_merge(&mut self, context: &Context) -> bool {
- if self.pp.is_empty() && !self.get_pp(context) {
- return false;
- }
-
- if !Settings::global().macros.expand {
- self.merge.append(&mut self.pp);
- return true;
- }
-
- // Now pass tokens one-by-one to the macro expander.
- let Some(mut parser) = Parser::new(context.macros, &self.pp[0].token) else {
- // Common case where there is no macro to expand.
- self.merge.push_back(self.pp.pop_front().unwrap());
- return true;
- };
- for ofs in 1.. {
- if self.pp.len() <= ofs && !self.get_pp(context) {
- // This should not be reachable because we always get a
- // `Token::EndCommand` at the end of an input file, which should
- // always terminate macro expansion.
- unreachable!();
- }
- let token = &self.pp[ofs];
- if parser.push(&token.token, &self.buffer[token.pos.clone()], &|e| {
- println!("{e:?}")
- }) == ParseStatus::Complete
- {
- break;
- }
- }
- let call = parser.finish();
- if call.len() == 0 {
- // False alarm: no macro to expand after all.
- self.merge.push_back(self.pp.pop_front().unwrap());
- return true;
- }
-
- // Expand the tokens.
- let c0 = &self.pp[0];
- let c1 = &self.pp[call.len() - 1];
- let mut expansion = Vec::new();
- call.expand(
- self.segmenter.mode(),
- self.token_location(c0..=c1),
- &mut expansion,
- |e| println!("{e:?}"),
- );
- let retval = !expansion.is_empty();
-
- if Settings::global().macros.print_expansions {
- // XXX
- }
-
- // Append the macro expansion tokens to the lookahead.
- let mut macro_rep = String::new();
- let mut pos = Vec::with_capacity(expansion.len());
- for [prefix, token] in macro_tokens_to_syntax(expansion.as_slice()) {
- macro_rep.push_str(prefix);
- let len = macro_rep.len();
- pos.push(len..=len + token.len() - 1);
- }
- let macro_rep = Arc::new(macro_rep);
- for (index, token) in expansion.into_iter().enumerate() {
- let lt = LexToken {
- token: token.token,
- pos: c0.pos.start..c1.pos.end,
- macro_rep: Some(MacroRepresentation {
- expansion: Arc::clone(¯o_rep),
- pos: pos[index].clone(),
- }),
- };
- self.merge.push_back(lt);
- }
- self.pp.drain(..call.len());
- retval
- }
-
- /// Attempts to obtain at least one new token into `self.merge`.
- ///
- /// Returns true if successful, false on failure. In the latter case, this source
- /// exhausted and 'self.eof' is now true.
- fn get_merge(&mut self, context: &Context) -> bool {
- while !self.eof {
- if self.try_get_merge(context) {
- return true;
- }
- }
- false
- }
-
- fn get_parse__(&mut self, context: &Context) -> bool {
- for i in 0.. {
- if self.merge.len() <= i && !self.get_merge(context) {
- // We always get a `Token::EndCommand` at the end of an input
- // file and the merger should return `Some(...)` for that token.
- debug_assert_eq!(self.merge.len(), 0);
- return false;
- }
-
- match ScanToken::merge(&self.merge) {
- None => (),
- Some(MergeResult::Copy) => {
- self.parse.push(self.merge.pop_front().unwrap());
- return true;
- }
- Some(MergeResult::Expand { n, token }) => {
- let first = &self.merge[0];
- let last = &self.merge[n - 1];
- self.parse.push(LexToken {
- token,
- pos: first.pos.start..last.pos.end,
- macro_rep: match (&first.macro_rep, &last.macro_rep) {
- (Some(a), Some(b)) if Arc::ptr_eq(&a.expansion, &b.expansion) => {
- Some(MacroRepresentation {
- expansion: a.expansion.clone(),
- pos: *a.pos.start()..=*b.pos.end(),
- })
- }
- _ => None,
- },
- });
- self.merge.drain(..n);
- return true;
- }
- }
- }
- unreachable!();
- }
-
- fn get_parse(&mut self, context: &Context) -> bool {
- // XXX deal with accumulated messages
- self.get_parse__(context)
- }
-
- fn offset_to_point(&self, offset: usize) -> Point {
- let line = self
- .lines
- .partition_point(|&line_start| line_start <= offset);
- Point {
- line: line as i32,
- column: Some(
- self.buffer
- .get(self.lines[line - 1]..offset)
- .unwrap_or_default()
- .width() as i32
- + 1,
- ),
- }
- }
-
- /// Returns the syntax for 1-based line-number `line_number`.
- fn get_line(&self, line_number: i32) -> &str {
- if (1..=self.lines.len() as i32).contains(&line_number) {
- let line_number = line_number as usize;
- let start = self.lines[line_number - 1];
- let end = self.lines.get(line_number).copied().unwrap_or(
- self.buffer[start..]
- .find('\n')
- .map(|ofs| ofs + start)
- .unwrap_or(self.buffer.len()),
- );
- let line = &self.buffer[start..end];
- line.strip_suffix("\r\n")
- .unwrap_or(line.strip_suffix('\n').unwrap_or(line))
- } else {
- ""
- }
- }
-
- fn token_location(&self, range: RangeInclusive<&LexToken>) -> Location {
- Location {
- file_name: self.file_name.clone(),
- span: Some(
- self.offset_to_point(range.start().pos.start)
- ..self.offset_to_point(range.end().pos.end),
- ),
- omit_underlines: false,
- }
- }
-
- fn ofs_location(&self, range: RangeInclusive<usize>) -> Location {
- if *range.start() <= *range.end() && *range.end() < self.parse.len() {
- self.token_location(&self.parse[*range.start()]..=&self.parse[*range.end()])
- } else {
- Location {
- file_name: self.file_name.clone(),
- span: None,
- omit_underlines: false,
- }
- }
- }
-
- fn token(&self) -> &Token {
- &self.parse[self.parse_ofs].token
- }
-
- fn next(&mut self, offset: isize, context: &Context) -> &Token {
- let Some(index) = offset.checked_add(self.parse_ofs as isize) else {
- return &Token::EndCommand;
- };
- let Ok(index) = usize::try_from(index) else {
- return &Token::EndCommand;
- };
-
- while index >= self.parse.len() {
- if let Some(token) = self.parse.last() {
- match token.token {
- Token::End => return &Token::End,
- Token::EndCommand => return &Token::EndCommand,
- _ => (),
- }
- }
- self.get_parse(context);
- }
- &self.parse[index].token
- }
-
- /// If the tokens in `ofs` contains a macro call, this returns the raw
- /// syntax for the macro call (not for the expansion) and for any other
- /// tokens included in that range. The syntax is encoded in UTF-8 and in
- /// the original form supplied to the lexer so that, for example, it may
- /// include comments, spaces, and new-lines if it spans multiple tokens.
- ///
- /// Returns `None` if the token range doesn't include a macro call.
- fn get_macro_call(&self, ofs: RangeInclusive<usize>) -> Option<&str> {
- if self
- .parse
- .get(ofs.clone())
- .unwrap_or_default()
- .iter()
- .all(|token| token.macro_rep.is_none())
- {
- return None;
- }
-
- let token0 = &self.parse[*ofs.start()];
- let token1 = &self.parse[*ofs.end()];
- Some(&self.buffer[token0.pos.start..token1.pos.end])
- }
-
- fn is_empty(&self) -> bool {
- self.buffer.is_empty() && self.eof
- }
-
- fn diagnostic(
- &self,
- severity: Severity,
- ofs: RangeInclusive<usize>,
- text: String,
- ) -> Diagnostic {
- let mut s = String::with_capacity(text.len() + 16);
- if self.is_empty() {
- s.push_str("At end of input: ");
- } else if let Some(call) = self.get_macro_call(ofs.clone()) {
- write!(&mut s, "In syntax expanded from `{}`: ", ellipsize(call)).unwrap();
- }
-
- if !text.is_empty() {
- s.push_str(&text);
- } else {
- s.push_str("Syntax error.");
- }
-
- if !s.ends_with('.') {
- s.push('.');
- }
-
- let location = self.ofs_location(ofs);
- let mut source = Vec::new();
- if let Some(Range {
- start: Point { line: l0, .. },
- end: Point { line: l1, .. },
- }) = location.span
- {
- let lines = if l1 - l0 > 3 {
- vec![l0, l0 + 1, l1]
- } else {
- (l0..=l1).collect()
- };
- for line_number in lines {
- source.push((line_number, self.get_line(line_number).to_string()));
- }
- }
-
- Diagnostic {
- category: Category::Syntax,
- severity,
- location,
- source,
- stack: Vec::new(),
- command_name: None, // XXX
- text: s,
- }
- }
-
- fn interactive_reset(&mut self) {
- if self.error_handling == ErrorHandling::Terminal {
- let Source {
- error_handling,
- encoding,
- read,
- ..
- } = mem::take(self);
- *self = Self {
- error_handling,
- encoding,
- read,
- ..Source::default()
- };
- }
- }
-}
-
-fn ellipsize(s: &str) -> Cow<str> {
- if s.width() > 64 {
- let mut out = String::new();
- let mut width = 0;
- for c in s.chars() {
- out.push(c);
- width += c.width().unwrap_or(0);
- if width > 64 {
- break;
- }
- }
- out.push_str("...");
- Cow::from(out)
- } else {
- Cow::from(s)
- }
-}
-
-/// A token in a [`Source`].
-struct LexToken {
- /// The regular token.
- token: Token,
-
- /// For a token obtained through the lexer in an ordinary way, this is the
- /// location of the token in the [`Source`]'s buffer.
- ///
- /// For a token produced through macro expansion, this is the entire macro
- /// call.
- pos: Range<usize>,
-
- /// For a token obtained through macro expansion, the part of the macro
- /// expansion that represents this token.
- ///
- /// For a token obtained through the lexer in an ordinary way, this is
- /// `None`.
- macro_rep: Option<MacroRepresentation>,
-}
-
-impl Borrow<Token> for LexToken {
- fn borrow(&self) -> &Token {
- &self.token
- }
-}
-
-struct MacroRepresentation {
- /// An entire macro expansion.
- expansion: Arc<String>,
-
- /// The substring of `expansion` that represents a single token.
- pos: RangeInclusive<usize>,
-}
-
-pub struct Lexer {
- source: Source,
- stack: Vec<Source>,
- macros: MacroSet,
- error: Box<dyn Fn(Location, Error)>,
-}
-
-struct Context<'a> {
- macros: &'a MacroSet,
- error: &'a Box<dyn Fn(Location, Error)>,
-}
-
-impl Lexer {
- pub fn new(error: Box<dyn Fn(Location, Error)>) -> Self {
- Self {
- source: Source::default(),
- stack: Vec::new(),
- macros: HashMap::new(),
- error,
- }
- }
-
- pub fn get(&mut self) -> &Token {
- if self.source.parse_ofs < self.source.parse.len() {
- if let Token::EndCommand = self.source.token() {
- self.source.parse.clear();
- self.source.parse_ofs = 0;
- } else {
- self.source.parse_ofs += 1;
- }
- }
-
- while self.source.parse_ofs == self.source.parse.len() {
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- if !self.source.get_parse(&context) && !self.pop_stack() {
- return &Token::End;
- }
- }
- self.source.token()
- }
-
- fn pop_stack(&mut self) -> bool {
- if let Some(new_source) = self.stack.pop() {
- self.source = new_source;
- true
- } else {
- self.source = Source::default();
- self.source.parse.push(LexToken {
- token: Token::End,
- pos: 0..0,
- macro_rep: None,
- });
- false
- }
- }
-
- /// Inserts `source` so that the next token comes from it. This is only
- /// permitted when the lexer is either empty or at `Token::EndCommand`.
- pub fn include(&mut self, mut source: Source) {
- // XXX what's the right assertion?
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- source.get_parse(&context);
- let old_source = mem::replace(&mut self.source, source);
- self.stack.push(old_source);
- }
-
- /// Inserts `source` so that it will be read after all the other sources.
- pub fn append(&mut self, mut source: Source) {
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- source.get_parse(&context);
- self.stack.insert(0, source);
- }
-
- pub fn token(&self) -> &Token {
- self.source.token()
- }
-
- pub fn next(&mut self, offset: isize) -> &Token {
- let context = Context {
- macros: &self.macros,
- error: &self.error,
- };
- self.source.next(offset, &context)
- }
-
- pub fn error<S>(&self, text: S) -> Diagnostic
- where
- S: ToString,
- {
- self.diagnostic(
- Severity::Error,
- self.source.parse_ofs..=self.source.parse_ofs,
- text,
- )
- }
-
- pub fn diagnostic<S>(
- &self,
- severity: Severity,
- ofs: RangeInclusive<usize>,
- text: S,
- ) -> Diagnostic
- where
- S: ToString,
- {
- self.source.diagnostic(severity, ofs, text.to_string())
- }
-
- pub fn error_handling(&self) -> ErrorHandling {
- self.source.error_handling
- }
-
- /// Discards all lookahead tokens, then discards all input sources
- /// until it encounters one with error mode [ErrorHandling::Terminal] or until it
- /// runs out of input sources.
- pub fn discard_noninteractive(&mut self) {
- while self.source.error_handling != ErrorHandling::Ignore {
- self.source.pp.clear();
- self.source.merge.clear();
- self.source.parse.clear();
- self.source.parse_ofs = 0;
-
- if self.source.error_handling == ErrorHandling::Terminal || !self.pop_stack() {
- return;
- }
- }
- }
-
- /// If the source that the lexer is currently reading has error mode
- /// [ErrorHandling::Terminal], discards all buffered input and tokens, so
- /// that the next token to be read comes directly from whatever is next read
- /// from the stream.
- ///
- /// It makes sense to call this function after encountering an error in a
- /// command entered on the console, because usually the user would prefer
- /// not to have cascading errors.
- pub fn interactive_reset(&mut self) {
- self.source.interactive_reset()
- }
-
- /// Advances past any tokens up to [Token::EndCommand] or [Token::End].
- pub fn discard_rest_of_command(&mut self) {
- while !matches!(self.token(), Token::EndCommand | Token::End) {
- self.get();
- }
- }
-}
-
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum Error {
- /// Error forming tokens from the input.
- #[error("{0}")]
- TokenError(#[from] ScanError),
-}
-
-#[cfg(test)]
-mod tests {
- use encoding_rs::UTF_8;
-
- use crate::lex::{segment::Mode, token::Token};
-
- use super::{ErrorHandling, Lexer, Source};
-
- #[test]
- fn test() {
- let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
- lexer.include(Source::for_string(
- String::from(
- r#"#! /usr/local/bin/pspp
-DATA LIST LIST NOTABLE /a.
-BEGIN DATA.
-1
-2
-END DATA.
-LIST.
-"#,
- ),
- UTF_8,
- ));
- loop {
- lexer.get();
- let token = lexer.token();
- println!("{token:?}");
- if let Token::End = token {
- break;
- }
- }
- }
-
- #[test]
- fn test_scan_errors() {
- let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
- lexer.include(Source::for_file_contents(
- String::from(
- r#"x'123'
-x'1x'
-u''
-u'012345678'
-u'd800'
-u'110000'
-'foo
-'very long unterminated string that be ellipsized in its error message
-1e .x
-^
-�
-"#,
- ),
- Some(String::from("syntax.sps")),
- UTF_8,
- Mode::default(),
- ErrorHandling::default(),
- ));
- loop {
- lexer.get();
- let token = lexer.token();
- println!("{token:?}");
- if let Token::End = token {
- break;
- }
- }
- }
-
- #[test]
- fn test_null_byte() {
- let mut lexer = Lexer::new(Box::new(|location, error| println!("{location}: {error}")));
- lexer.include(Source::for_file_contents(
- String::from(
- "datA dist list notable file='input.txt'/a b c.
-lis|.\0",
- ),
- Some(String::from("syntax.sps")),
- UTF_8,
- Mode::default(),
- ErrorHandling::default(),
- ));
- loop {
- lexer.get();
- let token = lexer.token();
- println!("{token:?}");
- if let Token::End = token {
- break;
- }
- }
- }
-}
+++ /dev/null
-//! PSPP syntax scanning.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". [super::segment] implements the segmentation phase and
-//! this module the scanning phase.
-//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
-//! labeled with a segment type. It outputs a stream of "scan tokens", which
-//! are the same as the tokens used by the PSPP parser with a few additional
-//! types.
-
-pub mod segment;
-pub mod scan;
-pub mod command_name;
-pub mod token;
-pub mod lexer;
+++ /dev/null
-//! PSPP lexical analysis.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". [segment] implements the segmentation phase and [scan]
-//! the scanning phase.
-//!
-//! Scanning accepts as input a stream of segments, which are UTF-8 strings each
-//! labeled with a segment type. It outputs a stream of "scan tokens", which
-//! are the same as the tokens used by the PSPP parser with a few additional
-//! types.
-
-use crate::identifier::{Identifier, ReservedWord};
-
-use super::{
- segment::{Mode, Segment, Segmenter},
- token::{Punct, Token},
-};
-use std::{borrow::Borrow, collections::VecDeque};
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Clone, Debug, PartialEq, Eq)]
-pub enum ScanError {
- /// Unterminated string constant.
- #[error("Unterminated string constant.")]
- ExpectedQuote,
-
- /// Missing exponent.
- #[error("Missing exponent following `{0}`")]
- ExpectedExponent(String),
-
- /// Odd length hex string.
- #[error("String of hex digits has {0} characters, which is not a multiple of 2.")]
- OddLengthHexString(usize),
-
- /// Invalid hex digit.
- #[error("Invalid hex digit {0:?}.")]
- BadHexDigit(char),
-
- /// Incomplete UTF-8 sequence.
- #[error("Incomplete UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- IncompleteUtf8 { substring: String, offset: usize },
-
- /// Bad UTF-8 sequence.
- #[error("Invalid UTF-8 sequence `{substring}` starting {offset} digits into hex string.")]
- BadUtf8 { substring: String, offset: usize },
-
- /// Invalid length Unicode string.
- #[error("Unicode string contains {0} bytes, which is not in the valid range of 1 to 8 bytes.")]
- BadLengthUnicodeString(usize),
-
- /// Invalid code point.
- #[error("U+{0:04X} is not a valid Unicode code point.")]
- BadCodePoint(u32),
-
- /// Expected hexadecimal Unicode code point
- #[error("Expected hexadecimal Unicode code point.")]
- ExpectedCodePoint,
-
- /// `DO REPEAT` nested too deeply.
- #[error("`DO REPEAT` nested too deeply.")]
- DoRepeatOverflow,
-
- /// Unexpected character.
- #[error("Unexpected character {0:?} in input.")]
- UnexpectedChar(char),
-}
-
-/// The input or output to token merging.
-#[derive(Clone, Debug, PartialEq)]
-pub enum ScanToken {
- Token(Token),
- Error(ScanError),
-}
-
-/// The result of merging tokens.
-#[derive(Clone, Debug)]
-pub enum MergeResult {
- /// Copy one token literally from input to output.
- Copy,
-
- /// Expand `n` tokens from the input into `token` in the output.
- Expand {
- /// Number of tokens to expand.
- n: usize,
-
- /// Replacement token.
- token: Token,
- },
-}
-
-impl ScanToken {
- pub fn from_segment(s: &str, segment: Segment) -> Option<Self> {
- match segment {
- Segment::Number => Some(Self::Token(Token::Number(s.parse().unwrap()))),
- Segment::QuotedString => {
- // Trim quote mark from front and back.
- let mut chars = s.chars();
- let quote = chars.next().unwrap();
- let s = chars.as_str().strip_suffix(quote).unwrap();
-
- // Replace doubled quotes by single ones.
- let (single_quote, double_quote) = match quote {
- '\'' => ("'", "''"),
- '"' => ("\"", "\"\""),
- _ => unreachable!(),
- };
- Some(Self::Token(Token::String(
- s.replace(double_quote, single_quote),
- )))
- }
- Segment::HexString => {
- // Strip `X"` prefix and `"` suffix (or variations).
- let s = &s[2..s.len() - 1];
- for c in s.chars() {
- if !c.is_ascii_hexdigit() {
- return Some(Self::Error(ScanError::BadHexDigit(c)));
- }
- }
- if s.len() % 2 != 0 {
- return Some(Self::Error(ScanError::OddLengthHexString(s.len())));
- }
- let bytes = s
- .as_bytes()
- .chunks_exact(2)
- .map(|pair| {
- let hi = char::from(pair[0]).to_digit(16).unwrap() as u8;
- let lo = char::from(pair[1]).to_digit(16).unwrap() as u8;
- hi * 16 + lo
- })
- .collect::<Vec<_>>();
- match String::from_utf8(bytes) {
- Ok(string) => Some(Self::Token(Token::String(string))),
- Err(error) => {
- let details = error.utf8_error();
- let offset = details.valid_up_to() * 2;
- let end = details
- .error_len()
- .map(|len| offset + len * 2)
- .unwrap_or(s.len());
- let substring = String::from(&s[offset..end]);
- Some(Self::Error(if details.error_len().is_some() {
- ScanError::BadUtf8 { substring, offset }
- } else {
- ScanError::IncompleteUtf8 { substring, offset }
- }))
- }
- }
- }
- Segment::UnicodeString => {
- // Strip `U"` prefix and `"` suffix (or variations).
- let s = &s[2..s.len() - 1];
- if !(1..=8).contains(&s.len()) {
- return Some(Self::Error(ScanError::BadLengthUnicodeString(s.len())));
- }
- let Ok(code_point) = u32::from_str_radix(s, 16) else {
- return Some(Self::Error(ScanError::ExpectedCodePoint));
- };
- let Some(c) = char::from_u32(code_point) else {
- return Some(Self::Error(ScanError::BadCodePoint(code_point)));
- };
- Some(Self::Token(Token::String(String::from(c))))
- }
-
- Segment::UnquotedString
- | Segment::DoRepeatCommand
- | Segment::InlineData
- | Segment::Document
- | Segment::MacroBody
- | Segment::MacroName => Some(Self::Token(Token::String(String::from(s)))),
-
- Segment::Identifier => {
- if let Ok(reserved_word) = ReservedWord::try_from(s) {
- match reserved_word {
- ReservedWord::And => Some(Self::Token(Token::Punct(Punct::And))),
- ReservedWord::Or => Some(Self::Token(Token::Punct(Punct::Or))),
- ReservedWord::Not => Some(Self::Token(Token::Punct(Punct::Not))),
- ReservedWord::Eq => Some(Self::Token(Token::Punct(Punct::Eq))),
- ReservedWord::Ge => Some(Self::Token(Token::Punct(Punct::Ge))),
- ReservedWord::Gt => Some(Self::Token(Token::Punct(Punct::Gt))),
- ReservedWord::Le => Some(Self::Token(Token::Punct(Punct::Le))),
- ReservedWord::Lt => Some(Self::Token(Token::Punct(Punct::Lt))),
- ReservedWord::Ne => Some(Self::Token(Token::Punct(Punct::Ne))),
- ReservedWord::All => Some(Self::Token(Token::Punct(Punct::All))),
- ReservedWord::By => Some(Self::Token(Token::Punct(Punct::By))),
- ReservedWord::To => Some(Self::Token(Token::Punct(Punct::To))),
- ReservedWord::With => Some(Self::Token(Token::Punct(Punct::With))),
- }
- } else {
- Some(Self::Token(Token::Id(Identifier::new(s).unwrap())))
- }
- }
- Segment::Punct => match s {
- "(" => Some(Self::Token(Token::Punct(Punct::LParen))),
- ")" => Some(Self::Token(Token::Punct(Punct::RParen))),
- "[" => Some(Self::Token(Token::Punct(Punct::LSquare))),
- "]" => Some(Self::Token(Token::Punct(Punct::RSquare))),
- "{" => Some(Self::Token(Token::Punct(Punct::LCurly))),
- "}" => Some(Self::Token(Token::Punct(Punct::RCurly))),
- "," => Some(Self::Token(Token::Punct(Punct::Comma))),
- "=" => Some(Self::Token(Token::Punct(Punct::Equals))),
- "-" => Some(Self::Token(Token::Punct(Punct::Dash))),
- "&" => Some(Self::Token(Token::Punct(Punct::And))),
- "|" => Some(Self::Token(Token::Punct(Punct::Or))),
- "+" => Some(Self::Token(Token::Punct(Punct::Plus))),
- "/" => Some(Self::Token(Token::Punct(Punct::Slash))),
- "*" => Some(Self::Token(Token::Punct(Punct::Asterisk))),
- "<" => Some(Self::Token(Token::Punct(Punct::Lt))),
- ">" => Some(Self::Token(Token::Punct(Punct::Gt))),
- "~" => Some(Self::Token(Token::Punct(Punct::Not))),
- ":" => Some(Self::Token(Token::Punct(Punct::Colon))),
- ";" => Some(Self::Token(Token::Punct(Punct::Semicolon))),
- "**" => Some(Self::Token(Token::Punct(Punct::Exp))),
- "<=" => Some(Self::Token(Token::Punct(Punct::Le))),
- "<>" => Some(Self::Token(Token::Punct(Punct::Ne))),
- "~=" => Some(Self::Token(Token::Punct(Punct::Ne))),
- ">=" => Some(Self::Token(Token::Punct(Punct::Ge))),
- "!" => Some(Self::Token(Token::Punct(Punct::Bang))),
- "%" => Some(Self::Token(Token::Punct(Punct::Percent))),
- "?" => Some(Self::Token(Token::Punct(Punct::Question))),
- "`" => Some(Self::Token(Token::Punct(Punct::Backtick))),
- "_" => Some(Self::Token(Token::Punct(Punct::Underscore))),
- "." => Some(Self::Token(Token::Punct(Punct::Dot))),
- "!*" => Some(Self::Token(Token::Punct(Punct::BangAsterisk))),
- _ => unreachable!("bad punctuator {s:?}"),
- },
- Segment::Shbang
- | Segment::Spaces
- | Segment::Comment
- | Segment::Newline
- | Segment::CommentCommand => None,
- Segment::DoRepeatOverflow => Some(Self::Error(ScanError::DoRepeatOverflow)),
- Segment::StartDocument => {
- Some(Self::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())))
- }
- Segment::StartCommand | Segment::SeparateCommands | Segment::EndCommand => {
- Some(Self::Token(Token::EndCommand))
- }
- Segment::End => Some(Self::Token(Token::End)),
- Segment::ExpectedQuote => Some(Self::Error(ScanError::ExpectedQuote)),
- Segment::ExpectedExponent => {
- Some(Self::Error(ScanError::ExpectedExponent(String::from(s))))
- }
- Segment::UnexpectedChar => Some(Self::Error(ScanError::UnexpectedChar(
- s.chars().next().unwrap(),
- ))),
- }
- }
-
- /// Attempts to merge a sequence of tokens together into a single token. The
- /// tokens are taken from the beginning of `input`. If successful, removes one
- /// or more token from the beginning of `input` and returnss the merged
- /// token. More input tokens might be needed; if so, leaves `input` alone and
- /// returns `None`. In the latter case, the caller should add more tokens to the
- /// input ([Token::End] or [Token::Punct(Punct::EndCmd)] is always sufficient).
- ///
- /// This performs two different kinds of token merging:
- ///
- /// - String concatenation, where syntax like `"a" + "b"` is converted into a
- /// single string token. This is definitely needed because the parser relies
- /// on it.
- ///
- /// - Negative number merging, where syntax like `-5` is converted from a pair
- /// of tokens (a dash and a positive number) into a single token (a negative
- /// number). This might not be needed anymore because the segmenter
- /// directly treats a dash followed by a number, with optional intervening
- /// white space, as a negative number. It's only needed if we want
- /// intervening comments to be allowed or for part of the negative number
- /// token to be produced by macro expansion.
- pub fn merge<T>(tokens: &T) -> Option<MergeResult>
- where
- T: Tokens,
- {
- match tokens.get(0)? {
- Token::Punct(Punct::Dash) => match tokens.get(1)? {
- Token::Number(number) if number.is_sign_positive() => {
- let number = *number;
- return Some(MergeResult::Expand {
- n: 2,
- token: Token::Number(-number),
- });
- }
- _ => Some(MergeResult::Copy),
- },
- Token::String(_) => {
- let mut i = 0;
- while matches!(tokens.get(i * 2 + 1)?, Token::Punct(Punct::Plus))
- && matches!(tokens.get(i * 2 + 2)?, Token::String(_))
- {
- i += 1;
- }
- if i == 0 {
- Some(MergeResult::Copy)
- } else {
- let mut output = String::new();
- for i in 0..=i {
- let Token::String(s) = tokens.get(i * 2).unwrap() else {
- unreachable!()
- };
- output.push_str(&s);
- }
- Some(MergeResult::Expand {
- n: i * 2 + 1,
- token: Token::String(output),
- })
- }
- }
- _ => Some(MergeResult::Copy),
- }
- }
-}
-
-pub trait Tokens {
- fn get(&self, index: usize) -> Option<&Token>;
-}
-
-impl<T> Tokens for VecDeque<T>
-where
- T: Borrow<Token>,
-{
- fn get(&self, index: usize) -> Option<&Token> {
- self.get(index).map(|token| token.borrow())
- }
-}
-
-pub struct StringSegmenter<'a> {
- input: &'a str,
- segmenter: Segmenter,
-}
-
-impl<'a> StringSegmenter<'a> {
- pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
- Self {
- input,
- segmenter: Segmenter::new(mode, is_snippet),
- }
- }
-}
-
-impl<'a> Iterator for StringSegmenter<'a> {
- type Item = (&'a str, ScanToken);
-
- fn next(&mut self) -> Option<Self::Item> {
- loop {
- let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
- if seg_type == Segment::End {
- return None;
- }
- let (s, rest) = self.input.split_at(seg_len);
- self.input = rest;
-
- if let Some(token) = ScanToken::from_segment(s, seg_type) {
- return Some((s, token));
- }
- }
- }
-}
-
-pub struct StringScanner<'a> {
- input: &'a str,
- segmenter: Segmenter,
- tokens: VecDeque<Token>,
-}
-
-impl<'a> StringScanner<'a> {
- pub fn new(input: &'a str, mode: Mode, is_snippet: bool) -> Self {
- Self {
- input,
- segmenter: Segmenter::new(mode, is_snippet),
- tokens: VecDeque::with_capacity(1),
- }
- }
-
- fn merge(&mut self) -> Option<ScanToken> {
- let result = ScanToken::merge(&self.tokens)?;
- match result {
- MergeResult::Copy => Some(ScanToken::Token(self.tokens.pop_front().unwrap())),
- MergeResult::Expand { n, token } => {
- self.tokens.drain(..n);
- Some(ScanToken::Token(token))
- }
- }
- }
-}
-
-impl<'a> Iterator for StringScanner<'a> {
- type Item = ScanToken;
-
- fn next(&mut self) -> Option<Self::Item> {
- if let Some(token) = self.merge() {
- return Some(token);
- }
- loop {
- let (seg_len, seg_type) = self.segmenter.push(self.input, true).unwrap();
- if seg_type == Segment::End && self.tokens.is_empty() {
- return None;
- }
- let (s, rest) = self.input.split_at(seg_len);
- self.input = rest;
-
- match ScanToken::from_segment(s, seg_type) {
- Some(ScanToken::Error(error)) => return Some(ScanToken::Error(error)),
- Some(ScanToken::Token(token)) => {
- self.tokens.push_back(token);
- if let Some(token) = self.merge() {
- return Some(token);
- }
- }
- None => (),
- }
- }
- }
-}
-
-#[cfg(test)]
-mod test;
+++ /dev/null
-use crate::{identifier::Identifier, lex::{
- segment::Mode,
- token::{Punct, Token},
-}};
-
-use super::{ScanError, ScanToken, StringScanner};
-
-fn print_token(token: &Token) {
- match token {
- Token::End => print!("Token::End"),
- Token::Id(s) => print!("Token::Id(String::from({s:?}))"),
- Token::Number(number) => print!("Token::Number({number:?})"),
- Token::String(s) => print!("Token::String(String::from({s:?}))"),
- Token::EndCommand => print!("Token::EndCommand"),
- Token::Punct(punct) => print!("Token::Punct(Punct::{punct:?})"),
- }
-}
-
-fn check_scan(input: &str, mode: Mode, expected: &[ScanToken]) {
- let tokens = StringScanner::new(input, mode, false).collect::<Vec<_>>();
-
- if &tokens != expected {
- for token in &tokens {
- match token {
- ScanToken::Token(token) => {
- print!("ScanToken::Token(");
- print_token(token);
- print!(")");
- }
- ScanToken::Error(error) => print!("ScanToken::Error(ScanError::{error:?})"),
- }
- println!(",");
- }
-
- eprintln!("tokens differ from expected:");
- let difference = diff::slice(expected, &tokens);
- for result in difference {
- match result {
- diff::Result::Left(left) => eprintln!("-{left:?}"),
- diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
- diff::Result::Right(right) => eprintln!("+{right:?}"),
- }
- }
- panic!();
- }
-}
-
-#[test]
-fn test_identifiers() {
- check_scan(
- r#"a aB i5 $x @efg @@. !abcd !* !*a #.# .x _z.
-abcd. abcd.
-QRSTUV./* end of line comment */
-QrStUv./* end of line comment */
-WXYZ. /* unterminated end of line comment
-�. /* U+FFFD is not valid in an identifier
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("aB").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("i5").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("$x").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("@efg").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("@@.").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("!abcd").unwrap())),
- ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
- ScanToken::Token(Token::Punct(Punct::BangAsterisk)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("#.#").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Dot)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Underscore)),
- ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("abcd.").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("abcd").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("QRSTUV").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("QrStUv").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("WXYZ").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Error(ScanError::UnexpectedChar('�')),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_reserved_words() {
- check_scan(
- r#"and or not eq ge gt le lt ne all by to with
-AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
-andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
-and. with.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::Eq)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::All)),
- ScanToken::Token(Token::Punct(Punct::By)),
- ScanToken::Token(Token::Punct(Punct::To)),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::Eq)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::All)),
- ScanToken::Token(Token::Punct(Punct::By)),
- ScanToken::Token(Token::Punct(Punct::To)),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Id(Identifier::new("andx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("orx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("notx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("eqx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("gex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("gtx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("ltx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("nex").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("allx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("byx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("tox").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("withx").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("and.").unwrap())),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_punctuation() {
- check_scan(
- r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
-~&|=>=><=<~=<>(),-+*/[]**
-% : ; ? _ ` { } ~
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::Punct(Punct::Asterisk)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Punct(Punct::LSquare)),
- ScanToken::Token(Token::Punct(Punct::RSquare)),
- ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ScanToken::Token(Token::Punct(Punct::And)),
- ScanToken::Token(Token::Punct(Punct::Or)),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Punct(Punct::Ge)),
- ScanToken::Token(Token::Punct(Punct::Gt)),
- ScanToken::Token(Token::Punct(Punct::Le)),
- ScanToken::Token(Token::Punct(Punct::Lt)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::Ne)),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::Punct(Punct::Asterisk)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Punct(Punct::LSquare)),
- ScanToken::Token(Token::Punct(Punct::RSquare)),
- ScanToken::Token(Token::Punct(Punct::Exp)),
- ScanToken::Token(Token::Punct(Punct::Percent)),
- ScanToken::Token(Token::Punct(Punct::Colon)),
- ScanToken::Token(Token::Punct(Punct::Semicolon)),
- ScanToken::Token(Token::Punct(Punct::Question)),
- ScanToken::Token(Token::Punct(Punct::Underscore)),
- ScanToken::Token(Token::Punct(Punct::Backtick)),
- ScanToken::Token(Token::Punct(Punct::LCurly)),
- ScanToken::Token(Token::Punct(Punct::RCurly)),
- ScanToken::Token(Token::Punct(Punct::Not)),
- ],
- );
-}
-
-#[test]
-fn test_positive_numbers() {
- check_scan(
- r#"0 1 01 001. 1.
-123. /* comment 1 */ /* comment 2 */
-.1 0.1 00.1 00.10
-5e1 6E-1 7e+1 6E+01 6e-03
-.3E1 .4e-1 .5E+1 .6e+01 .7E-03
-1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
-. 1e e1 1e+ 1e-
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Number(0.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(123.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(0.1)),
- ScanToken::Token(Token::Number(50.0)),
- ScanToken::Token(Token::Number(0.6)),
- ScanToken::Token(Token::Number(70.0)),
- ScanToken::Token(Token::Number(60.0)),
- ScanToken::Token(Token::Number(0.006)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(30.0)),
- ScanToken::Token(Token::Number(0.04)),
- ScanToken::Token(Token::Number(5.0)),
- ScanToken::Token(Token::Number(6.0)),
- ScanToken::Token(Token::Number(0.0007)),
- ScanToken::Token(Token::Number(12.3)),
- ScanToken::Token(Token::Number(4.56)),
- ScanToken::Token(Token::Number(789.0)),
- ScanToken::Token(Token::Number(999.0)),
- ScanToken::Token(Token::Number(0.0112)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e"))),
- ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e+"))),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("1e-"))),
- ],
- );
-}
-
-#[test]
-fn test_negative_numbers() {
- check_scan(
- r#" -0 -1 -01 -001. -1.
- -123. /* comment 1 */ /* comment 2 */
- -.1 -0.1 -00.1 -00.10
- -5e1 -6E-1 -7e+1 -6E+01 -6e-03
- -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
- -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
- -/**/1
- -. -1e -e1 -1e+ -1e- -1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Number(-0.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(-123.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-0.1)),
- ScanToken::Token(Token::Number(-50.0)),
- ScanToken::Token(Token::Number(-0.6)),
- ScanToken::Token(Token::Number(-70.0)),
- ScanToken::Token(Token::Number(-60.0)),
- ScanToken::Token(Token::Number(-0.006)),
- ScanToken::Token(Token::Number(-3.0)),
- ScanToken::Token(Token::Number(-0.04)),
- ScanToken::Token(Token::Number(-5.0)),
- ScanToken::Token(Token::Number(-6.0)),
- ScanToken::Token(Token::Number(-0.0007)),
- ScanToken::Token(Token::Number(-12.3)),
- ScanToken::Token(Token::Number(-4.56)),
- ScanToken::Token(Token::Number(-789.0)),
- ScanToken::Token(Token::Number(-999.0)),
- ScanToken::Token(Token::Number(-0.0112)),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Punct(Punct::Dot)),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e"))),
- ScanToken::Token(Token::Punct(Punct::Dash)),
- ScanToken::Token(Token::Id(Identifier::new("e1").unwrap())),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e+"))),
- ScanToken::Error(ScanError::ExpectedExponent(String::from("-1e-"))),
- ScanToken::Token(Token::Number(-1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_strings() {
- check_scan(
- r#"'x' "y" 'abc'
-'Don''t' "Can't" 'Won''t'
-"""quoted""" '"quoted"'
-'' "" '''' """"
-'missing end quote
-"missing double quote
-'x' + "y"
-+ 'z' +
-'a' /* abc */ + "b" /*
-+ 'c' +/* */"d"/* */+'e'
-'foo'
-+ /* special case: + in column 0 would ordinarily start a new command
-'bar'
-'foo'
- +
-'bar'
-'foo'
-+
-
-'bar'
-
-+
-x"4142"+'5152'
-"4142"+
-x'5152'
-x"4142"
-+u'304a'
-"�あいうえお"
-"abc"+U"FFFD"+u'3048'+"xyz"
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::String(String::from("x"))),
- ScanToken::Token(Token::String(String::from("y"))),
- ScanToken::Token(Token::String(String::from("abc"))),
- ScanToken::Token(Token::String(String::from("Don't"))),
- ScanToken::Token(Token::String(String::from("Can't"))),
- ScanToken::Token(Token::String(String::from("Won't"))),
- ScanToken::Token(Token::String(String::from("\"quoted\""))),
- ScanToken::Token(Token::String(String::from("\"quoted\""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("'"))),
- ScanToken::Token(Token::String(String::from("\""))),
- ScanToken::Error(ScanError::ExpectedQuote),
- ScanToken::Error(ScanError::ExpectedQuote),
- ScanToken::Token(Token::String(String::from("xyzabcde"))),
- ScanToken::Token(Token::String(String::from("foobar"))),
- ScanToken::Token(Token::String(String::from("foobar"))),
- ScanToken::Token(Token::String(String::from("foo"))),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from("bar"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Punct(Punct::Plus)),
- ScanToken::Token(Token::String(String::from("AB5152"))),
- ScanToken::Token(Token::String(String::from("4142QR"))),
- ScanToken::Token(Token::String(String::from("ABお"))),
- ScanToken::Token(Token::String(String::from("�あいうえお"))),
- ScanToken::Token(Token::String(String::from("abc�えxyz"))),
- ScanToken::Token(Token::End),
- ],
- );
-}
-
-#[test]
-fn test_shbang() {
- check_scan(
- r#"#! /usr/bin/pspp
-#! /usr/bin/pspp
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("#").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Bang)),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("usr").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("bin").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("pspp").unwrap())),
- ],
- );
-}
-
-#[test]
-fn test_comments() {
- check_scan(
- r#"* Comment commands "don't
-have to contain valid tokens.
-
-** Check ambiguity with ** token.
-****************.
-
-comment keyword works too.
-COMM also.
-com is ambiguous with COMPUTE.
-
- * Comment need not start at left margin.
-
-* Comment ends with blank line
-
-next command.
-
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("com").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("is").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("ambiguous").unwrap())),
- ScanToken::Token(Token::Punct(Punct::With)),
- ScanToken::Token(Token::Id(Identifier::new("COMPUTE").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("next").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_document() {
- check_scan(
- r#"DOCUMENT one line.
-DOC more
- than
- one
- line.
-docu
-first.paragraph
-isn't parsed as tokens
-
-second paragraph.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("DOCUMENT one line."))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("DOC more"))),
- ScanToken::Token(Token::String(String::from(" than"))),
- ScanToken::Token(Token::String(String::from(" one"))),
- ScanToken::Token(Token::String(String::from(" line."))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("DOCUMENT").unwrap())),
- ScanToken::Token(Token::String(String::from("docu"))),
- ScanToken::Token(Token::String(String::from("first.paragraph"))),
- ScanToken::Token(Token::String(String::from("isn't parsed as tokens"))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("second paragraph."))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_file_label() {
- check_scan(
- r#"FIL label isn't quoted.
-FILE
- lab 'is quoted'.
-FILE /*
-/**/ lab not quoted here either
-
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("FIL").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("label").unwrap())),
- ScanToken::Token(Token::String(String::from("isn't quoted"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
- ScanToken::Token(Token::String(String::from("is quoted"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("FILE").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("lab").unwrap())),
- ScanToken::Token(Token::String(String::from("not quoted here either"))),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_begin_data() {
- check_scan(
- r#"begin data.
-123
-xxx
-end data.
-
-BEG /**/ DAT /*
-5 6 7 /* x
-
-end data
-end data
-.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("begin").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from("123"))),
- ScanToken::Token(Token::String(String::from("xxx"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("BEG").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("DAT").unwrap())),
- ScanToken::Token(Token::String(String::from("5 6 7 /* x"))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from("end data"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_do_repeat() {
- check_scan(
- r#"do repeat x=a b c
- y=d e f.
- do repeat a=1 thru 5.
-another command.
-second command
-+ third command.
-end /* x */ /* y */ repeat print.
-end
- repeat.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from(" do repeat a=1 thru 5."))),
- ScanToken::Token(Token::String(String::from("another command."))),
- ScanToken::Token(Token::String(String::from("second command"))),
- ScanToken::Token(Token::String(String::from("+ third command."))),
- ScanToken::Token(Token::String(String::from(
- "end /* x */ /* y */ repeat print.",
- ))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-#[test]
-fn test_do_repeat_batch() {
- check_scan(
- r#"do repeat x=a b c
- y=d e f
-do repeat a=1 thru 5
-another command
-second command
-+ third command
-end /* x */ /* y */ repeat print
-end
- repeat
-do
- repeat #a=1
-
- inner command
-end repeat
-"#,
- Mode::Batch,
- &[
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Id(Identifier::new("d").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("e").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("f").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from("do repeat a=1 thru 5"))),
- ScanToken::Token(Token::String(String::from("another command"))),
- ScanToken::Token(Token::String(String::from("second command"))),
- ScanToken::Token(Token::String(String::from("+ third command"))),
- ScanToken::Token(Token::String(String::from(
- "end /* x */ /* y */ repeat print",
- ))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("do").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("#a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Equals)),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::String(String::from(" inner command"))),
- ScanToken::Token(Token::Id(Identifier::new("end").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("repeat").unwrap())),
- ],
- );
-}
-
-#[test]
-fn test_batch_mode() {
- check_scan(
- r#"first command
- another line of first command
-+ second command
-third command
-
-fourth command.
- fifth command.
-"#,
- Mode::Batch,
- &[
- ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("another").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("line").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("of").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("first").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("second").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("third").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("fourth").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("fifth").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("command").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
-}
-
-mod define {
- use crate::{identifier::Identifier, lex::{
- scan::ScanToken,
- segment::Mode,
- token::{Punct, Token},
- }};
-
- use super::check_scan;
-
- #[test]
- fn test_simple() {
- check_scan(
- r#"define !macro1()
-var1 var2 var3
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_no_newline_after_parentheses() {
- check_scan(
- r#"define !macro1() var1 var2 var3
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from(" var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_no_newline_before_enddefine() {
- check_scan(
- r#"define !macro1()
-var1 var2 var3!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_all_on_one_line() {
- check_scan(
- r#"define !macro1()var1 var2 var3!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("var1 var2 var3"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_empty() {
- check_scan(
- r#"define !macro1()
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_blank_lines() {
- check_scan(
- r#"define !macro1()
-
-
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::String(String::from(""))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_arguments() {
- check_scan(
- r#"define !macro1(a(), b(), c())
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_multiline_arguments() {
- check_scan(
- r#"define !macro1(
- a(), b(
- ),
- c()
-)
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("a").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("b").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("c").unwrap())),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_arguments_start_on_second_line() {
- check_scan(
- r#"define !macro1
-(x,y,z
-)
-content 1
-content 2
-!enddefine.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("y").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Comma)),
- ScanToken::Token(Token::Id(Identifier::new("z").unwrap())),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("content 1"))),
- ScanToken::Token(Token::String(String::from("content 2"))),
- ScanToken::Token(Token::Id(Identifier::new("!enddefine").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_1() {
- check_scan(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_2() {
- check_scan(
- r#"define !macro1
-x.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_3() {
- check_scan(
- r#"define !macro1(.
-x.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_4() {
- // Notice the command terminator at the end of the DEFINE command,
- // which should not be there and ends it early.
- check_scan(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::EndCommand),
- ScanToken::Token(Token::Id(Identifier::new("data").unwrap())),
- ScanToken::Token(Token::Id(Identifier::new("list").unwrap())),
- ScanToken::Token(Token::Punct(Punct::Slash)),
- ScanToken::Token(Token::Id(Identifier::new("x").unwrap())),
- ScanToken::Token(Token::Number(1.0)),
- ScanToken::Token(Token::EndCommand),
- ],
- );
- }
-
- #[test]
- fn test_missing_enddefine() {
- check_scan(
- r#"define !macro1()
-content line 1
-content line 2
-"#,
- Mode::Auto,
- &[
- ScanToken::Token(Token::Id(Identifier::new("define").unwrap())),
- ScanToken::Token(Token::String(String::from("!macro1"))),
- ScanToken::Token(Token::Punct(Punct::LParen)),
- ScanToken::Token(Token::Punct(Punct::RParen)),
- ScanToken::Token(Token::String(String::from("content line 1"))),
- ScanToken::Token(Token::String(String::from("content line 2"))),
- ScanToken::Token(Token::End),
- ],
- );
- }
-}
+++ /dev/null
-//! Syntax segmentation.
-//!
-//! PSPP divides traditional "lexical analysis" or "tokenization" into two
-//! phases: a lower-level phase called "segmentation" and a higher-level phase
-//! called "scanning". This module implements the segmentation phase.
-//! [`super::scan`] contains declarations for the scanning phase.
-//!
-//! Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
-//! (a segment type) for each byte or contiguous sequence of bytes in the input.
-//! It also, in a few corner cases, outputs zero-width segments that label the
-//! boundary between a pair of bytes in the input.
-//!
-//! Some segment types correspond directly to tokens; for example, an
-//! "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID)
-//! later in lexical analysis. Other segments contribute to tokens but do not
-//! correspond directly; for example, multiple quoted string segments
-//! (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators
-//! (SEG_PUNCT) may be combined to form a single string token (T_STRING). Still
-//! other segments are ignored (e.g. SEG_SPACES) or trigger special behavior
-//! such as error messages later in tokenization (e.g. SEG_EXPECTED_QUOTE).
-
-use crate::{
- identifier::{id_match, id_match_n, IdentifierChar},
- prompt::PromptStyle,
-};
-use bitflags::bitflags;
-
-use super::command_name::{command_match, COMMAND_NAMES};
-
-/// Segmentation mode.
-///
-/// PSPP syntax is written in one of two modes which are broadly defined as
-/// follows:
-///
-/// - In interactive mode, commands end with a period at the end of the line
-/// or with a blank line.
-///
-/// - In batch mode, the second and subsequent lines of a command are indented
-/// from the left margin.
-///
-/// The segmenter can also try to automatically detect the mode in use, using a
-/// heuristic that is usually correct.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
-pub enum Mode {
- /// Try to interpret input correctly regardless of whether it is written
- /// for interactive or batch mode.
- #[default]
- Auto,
-
- /// Interactive syntax mode.
- Interactive,
-
- /// Batch syntax mode.
- Batch,
-}
-
-/// The type of a segment.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Segment {
- Number,
- QuotedString,
- HexString,
- UnicodeString,
- UnquotedString,
- Identifier,
- Punct,
- Shbang,
- Spaces,
- Comment,
- Newline,
- CommentCommand,
- DoRepeatCommand,
- DoRepeatOverflow,
- InlineData,
- MacroName,
- MacroBody,
- StartDocument,
- Document,
- StartCommand,
- SeparateCommands,
- EndCommand,
- End,
- ExpectedQuote,
- ExpectedExponent,
- UnexpectedChar,
-}
-
-bitflags! {
- #[derive(Copy, Clone, Debug)]
- pub struct Substate: u8 {
- const START_OF_LINE = 1;
- const START_OF_COMMAND = 2;
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct Segmenter {
- state: (State, Substate),
- nest: u8,
- mode: Mode,
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct Incomplete;
-
-impl Segmenter {
- /// Returns a segmenter with the given syntax `mode`.
- ///
- /// If `is_snippet` is false, then the segmenter will parse as if it's being
- /// given a whole file. This means, for example, that it will interpret `-`
- /// or `+` at the beginning of the syntax as a separator between commands
- /// (since `-` or `+` at the beginning of a line has this meaning).
- ///
- /// If `is_snippet` is true, then the segmenter will parse as if it's being
- /// given an isolated piece of syntax. This means that, for example, that
- /// it will interpret `-` or `+` at the beginning of the syntax as an
- /// operator token or (if followed by a digit) as part of a number.
- pub fn new(mode: Mode, is_snippet: bool) -> Self {
- Self {
- state: if is_snippet {
- (State::General, Substate::empty())
- } else {
- (State::Shbang, Substate::empty())
- },
- mode,
- nest: 0,
- }
- }
-
- pub fn mode(&self) -> Mode {
- self.mode
- }
-
- fn start_of_line(&self) -> bool {
- self.state.1.contains(Substate::START_OF_LINE)
- }
-
- fn start_of_command(&self) -> bool {
- self.state.1.contains(Substate::START_OF_COMMAND)
- }
-
- /// Returns the style of command prompt to display to an interactive user
- /// for input in the current state.. The return value is most accurate in
- /// mode `Mode::Interactive` and at the beginning of a line (that is, if
- /// [`Segmenter::push`] consumed as much as possible of the input up to a
- /// new-line).
- pub fn prompt(&self) -> PromptStyle {
- match self.state.0 {
- State::Shbang => PromptStyle::First,
- State::General => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::Comment1 | State::Comment2 => PromptStyle::Comment,
- State::Document1 | State::Document2 => PromptStyle::Document,
- State::Document3 => PromptStyle::First,
- State::FileLabel1 => PromptStyle::Later,
- State::FileLabel2 | State::FileLabel3 => PromptStyle::First,
- State::DoRepeat1 | State::DoRepeat2 => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::DoRepeat3 => PromptStyle::DoRepeat,
- State::DoRepeat4 => PromptStyle::DoRepeat,
- State::Define1 | State::Define2 | State::Define3 => {
- if self.start_of_command() {
- PromptStyle::First
- } else {
- PromptStyle::Later
- }
- }
- State::Define4 | State::Define5 | State::Define6 => PromptStyle::Define,
- State::BeginData1 => PromptStyle::First,
- State::BeginData2 => PromptStyle::Later,
- State::BeginData3 | State::BeginData4 => PromptStyle::Data,
- }
- }
-
- /// Attempts to label a prefix of the remaining input with a segment type.
- /// The caller supplies a prefix of the remaining input as `input`. If
- /// `eof` is true, then `input` is the entire (remainder) of the input; if
- /// `eof` is false, then further input is potentially available.
- ///
- /// The input may contain '\n' or '\r\n' line ends in any combination.
- ///
- /// If successful, returns `Ok((n, type))`, where `n` is the number of bytes
- /// in the segment at the beginning of `input` (a number in
- /// `0..=input.len()`) and the type of that segment. The next call should
- /// not include those bytes in `input`, because they have (figuratively)
- /// been consumed by the segmenter.
- ///
- /// Segments can have zero length, including segment types `Type::End`,
- /// `Type::SeparateCommands`, `Type::StartDocument`, `Type::InlineData`, and
- /// `Type::Spaces`.
- ///
- /// Failure occurs only if the segment type of the bytes in `input` cannot
- /// yet be determined. In this case, this function returns `Err(Incomplete)`. If
- /// more input is available, the caller should obtain some more, then call
- /// again with a longer `input`. If this is not enough, the process might
- /// need to repeat again and again. If input is exhausted, then the caller
- /// may call again setting `eof` to true. This function will never return
- /// `Err(Incomplete)` when `eof` is true.
- ///
- /// The caller must not, in a sequence of calls, supply contradictory input.
- /// That is, bytes provided as part of `input` in one call, but not
- /// consumed, must not be provided with *different* values on subsequent
- /// calls. This is because the function must often make decisions based on
- /// looking ahead beyond the bytes that it consumes.
- fn push_rest<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- if input.is_empty() {
- if eof {
- return Ok((input, Segment::End));
- } else {
- return Err(Incomplete);
- };
- }
-
- match self.state.0 {
- State::Shbang => return self.parse_shbang(input, eof),
- State::General => {
- if self.start_of_line() {
- self.parse_start_of_line(input, eof)
- } else {
- self.parse_mid_line(input, eof)
- }
- }
- State::Comment1 => self.parse_comment_1(input, eof),
- State::Comment2 => self.parse_comment_2(input, eof),
- State::Document1 => self.parse_document_1(input, eof),
- State::Document2 => self.parse_document_2(input, eof),
- State::Document3 => self.parse_document_3(input, eof),
- State::FileLabel1 => self.parse_file_label_1(input, eof),
- State::FileLabel2 => self.parse_file_label_2(input, eof),
- State::FileLabel3 => self.parse_file_label_3(input, eof),
- State::DoRepeat1 => self.parse_do_repeat_1(input, eof),
- State::DoRepeat2 => self.parse_do_repeat_2(input, eof),
- State::DoRepeat3 => self.parse_do_repeat_3(input, eof),
- State::DoRepeat4 => self.parse_do_repeat_4(input),
- State::Define1 => self.parse_define_1_2(input, eof),
- State::Define2 => self.parse_define_1_2(input, eof),
- State::Define3 => self.parse_define_3(input, eof),
- State::Define4 => self.parse_define_4_5(input, eof),
- State::Define5 => self.parse_define_4_5(input, eof),
- State::Define6 => self.parse_define_6(input, eof),
- State::BeginData1 => self.parse_begin_data_1(input, eof),
- State::BeginData2 => self.parse_begin_data_2(input, eof),
- State::BeginData3 => self.parse_begin_data_3(input, eof),
- State::BeginData4 => self.parse_begin_data_4(input, eof),
- }
- }
-
- pub fn push(&mut self, input: &str, eof: bool) -> Result<(usize, Segment), Incomplete> {
- let (rest, seg_type) = self.push_rest(input, eof)?;
- Ok((input.len() - rest.len(), seg_type))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum State {
- Shbang,
- General,
- Comment1,
- Comment2,
- Document1,
- Document2,
- Document3,
- FileLabel1,
- FileLabel2,
- FileLabel3,
- DoRepeat1,
- DoRepeat2,
- DoRepeat3,
- DoRepeat4,
- Define1,
- Define2,
- Define3,
- Define4,
- Define5,
- Define6,
- BeginData1,
- BeginData2,
- BeginData3,
- BeginData4,
-}
-
-fn take(input: &str, eof: bool) -> Result<(Option<char>, &str), Incomplete> {
- let mut iter = input.chars();
- match iter.next() {
- None if !eof => Err(Incomplete),
- c => Ok((c, iter.as_str())),
- }
-}
-
-fn skip_comment(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '\n' | '\r' if is_end_of_line(input, eof)? => return Ok(input),
- '*' => {
- if let (Some('/'), rest) = take(rest, eof)? {
- return Ok(rest);
- }
- }
- _ => (),
- };
- input = rest;
- }
-}
-
-fn skip_matching<F>(f: F, input: &str, eof: bool) -> Result<&str, Incomplete>
-where
- F: Fn(char) -> bool,
-{
- let input = input.trim_start_matches(f);
- if input.is_empty() && !eof {
- Err(Incomplete)
- } else {
- Ok(input)
- }
-}
-
-fn match_char<F>(f: F, input: &str, eof: bool) -> Result<Option<&str>, Incomplete>
-where
- F: Fn(char) -> bool,
-{
- if let (Some(c), rest) = take(input, eof)? {
- if f(c) {
- return Ok(Some(rest));
- }
- }
- Ok(None)
-}
-
-fn skip_spaces(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
- c if c.is_whitespace() => (),
- _ => return Ok(input),
- }
- input = rest;
- }
-}
-
-fn skip_digits(input: &str, eof: bool) -> Result<&str, Incomplete> {
- skip_matching(|c| c.is_ascii_digit(), input, eof)
-}
-
-fn skip_spaces_and_comments(mut input: &str, eof: bool) -> Result<&str, Incomplete> {
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(input);
- };
- match c {
- '/' => {
- let (c, rest2) = take(rest, eof)?;
- match c {
- Some('*') => input = skip_comment(rest2, eof)?,
- Some(_) | None => return Ok(rest),
- }
- }
- '\r' | '\n' if is_end_of_line(input, eof)? => return Ok(input),
- c if c.is_whitespace() => input = rest,
- _ => return Ok(input),
- };
- }
-}
-
-fn is_start_of_string(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(false);
- };
- match c {
- 'x' | 'X' | 'u' | 'U' => {
- let (c, _rest) = take(rest, eof)?;
- Ok(c == Some('\'') || c == Some('"'))
- }
- '\'' | '"' => Ok(true),
- '\n' | '\r' if is_end_of_line(input, eof)? => Ok(true),
- _ => Ok(false),
- }
-}
-
-fn is_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(true);
- };
- Ok(match c {
- '\n' => true,
- '\r' => take(rest, eof)?.0 == Some('\n'),
- _ => false,
- })
-}
-
-fn at_end_of_line(input: &str, eof: bool) -> Result<bool, Incomplete> {
- is_end_of_line(skip_spaces_and_comments(input, eof)?, eof)
-}
-
-fn first(s: &str) -> char {
- s.chars().next().unwrap()
-}
-fn get_command_name_candidates(target: &str) -> &[&'static str] {
- if target.is_empty() {
- return &[];
- }
- let target_first = first(target).to_ascii_uppercase();
- let low = COMMAND_NAMES.partition_point(|s| first(s) < target_first);
- let high = COMMAND_NAMES.partition_point(|s| first(s) <= target_first);
- &COMMAND_NAMES[low..high]
-}
-
-fn detect_command_name(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let command_name = input
- .split(|c: char| {
- !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-')
- })
- .next()
- .unwrap();
- if !eof && command_name.len() == input.len() {
- return Err(Incomplete);
- }
- let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.');
- for command in get_command_name_candidates(command_name) {
- if let Some(m) = command_match(command, command_name) {
- if m.missing_words <= 0 {
- return Ok(true);
- }
- }
- }
- Ok(false)
-}
-
-impl Segmenter {
- fn parse_shbang<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- if let (Some('#'), rest) = take(input, eof)? {
- if let (Some('!'), rest) = take(rest, eof)? {
- let rest = self.parse_full_line(rest, eof)?;
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((rest, Segment::Shbang));
- }
- }
-
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- self.push_rest(input, eof)
- }
- fn at_command_start(&self, input: &str, eof: bool) -> Result<bool, Incomplete> {
- match self.mode {
- Mode::Auto => detect_command_name(input, eof),
- Mode::Interactive => Ok(false),
- Mode::Batch => Ok(true),
- }
- }
- fn parse_start_of_line<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- debug_assert_eq!(self.state.0, State::General);
- debug_assert!(self.start_of_line());
- debug_assert!(!input.is_empty());
-
- let (Some(c), rest) = take(input, eof).unwrap() else {
- unreachable!()
- };
- match c {
- '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => {
- // This `+` is punctuation that may separate pieces of a string.
- self.state = (State::General, Substate::empty());
- return Ok((rest, Segment::Punct));
- }
- '+' | '-' | '.' => {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((rest, Segment::StartCommand));
- }
- _ if c.is_whitespace() => {
- if at_end_of_line(input, eof)? {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::SeparateCommands));
- }
- }
- _ => {
- if self.at_command_start(input, eof)?
- && !self.state.1.contains(Substate::START_OF_COMMAND)
- {
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::StartCommand));
- }
- }
- }
- self.state.1 = Substate::START_OF_COMMAND;
- self.parse_mid_line(input, eof)
- }
- fn parse_mid_line<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- debug_assert!(self.state.0 == State::General);
- debug_assert!(!self.state.1.contains(Substate::START_OF_LINE));
- let (Some(c), rest) = take(input, eof)? else {
- unreachable!()
- };
- match c {
- '\r' | '\n' if is_end_of_line(input, eof)? => {
- self.state.1 |= Substate::START_OF_LINE;
- Ok((
- self.parse_newline(input, eof).unwrap().unwrap(),
- Segment::Newline,
- ))
- }
- '/' => {
- if let (Some('*'), rest) = take(rest, eof)? {
- let rest = skip_comment(rest, eof)?;
- return Ok((rest, Segment::Comment));
- } else {
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::Punct));
- }
- }
- '-' => {
- let (c, rest2) = take(skip_spaces(rest, eof)?, eof)?;
- match c {
- Some(c) if c.is_ascii_digit() => {
- return self.parse_number(rest, eof);
- }
- Some('.') => {
- if let (Some(c), _rest) = take(rest2, eof)? {
- if c.is_ascii_digit() {
- return self.parse_number(rest, eof);
- }
- }
- }
- None | Some(_) => (),
- }
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::Punct));
- }
- '(' | ')' | '[' | ']' | '{' | '}' | ',' | '=' | ';' | ':' | '&' | '|' | '+' => {
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::Punct));
- }
- '*' => {
- if self.state.1.contains(Substate::START_OF_COMMAND) {
- self.state = (State::Comment1, Substate::empty());
- self.parse_comment_1(input, eof)
- } else {
- self.parse_digraph(&['*'], rest, eof)
- }
- }
- '<' => self.parse_digraph(&['=', '>'], rest, eof),
- '>' => self.parse_digraph(&['='], rest, eof),
- '~' => self.parse_digraph(&['='], rest, eof),
- '.' if at_end_of_line(rest, eof)? => {
- self.state.1 = Substate::START_OF_COMMAND;
- Ok((rest, Segment::EndCommand))
- }
- '.' => match take(rest, eof)? {
- (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof),
- _ => Ok((rest, Segment::Punct)),
- },
- '0'..='9' => self.parse_number(input, eof),
- 'u' | 'U' => self.maybe_parse_string(Segment::UnicodeString, (input, rest), eof),
- 'x' | 'X' => self.maybe_parse_string(Segment::HexString, (input, rest), eof),
- '\'' | '"' => self.parse_string(Segment::QuotedString, c, rest, eof),
- '!' => {
- let (c, rest2) = take(rest, eof)?;
- match c {
- Some('*') => Ok((rest2, Segment::Punct)),
- Some(_) => self.parse_id(input, eof),
- None => Ok((rest, Segment::Punct)),
- }
- }
- c if c.is_whitespace() => Ok((skip_spaces(rest, eof)?, Segment::Spaces)),
- c if c.may_start_id() => self.parse_id(input, eof),
- '#'..='~' if c != '\\' && c != '^' => {
- self.state.1 = Substate::empty();
- Ok((rest, Segment::Punct))
- }
- _ => {
- self.state.1 = Substate::empty();
- Ok((rest, Segment::UnexpectedChar))
- }
- }
- }
- fn parse_string<'a>(
- &mut self,
- segment: Segment,
- quote: char,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- while let (Some(c), rest) = take(input, eof)? {
- match c {
- _ if c == quote => {
- let (c, rest2) = take(rest, eof)?;
- if c != Some(quote) {
- self.state.1 = Substate::empty();
- return Ok((rest, segment));
- }
- input = rest2;
- }
- '\r' | '\n' if is_end_of_line(input, eof)? => break,
- _ => input = rest,
- }
- }
- self.state.1 = Substate::empty();
- Ok((input, Segment::ExpectedQuote))
- }
- fn maybe_parse_string<'a>(
- &mut self,
- segment: Segment,
- input: (&'a str, &'a str),
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- match take(input.1, eof)? {
- (Some(c), rest) if c == '\'' || c == '"' => self.parse_string(segment, c, rest, eof),
- _ => self.parse_id(input.0, eof),
- }
- }
- fn next_id_in_command<'a>(
- &self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, &'a str), Incomplete> {
- let mut sub = Segmenter::new(self.mode, true);
- loop {
- let (seg_len, seg_type) = sub.push(input, eof)?;
- let (segment, rest) = input.split_at(seg_len);
- match seg_type {
- Segment::Shbang | Segment::Spaces | Segment::Comment | Segment::Newline => (),
-
- Segment::Identifier => return Ok((segment, rest)),
-
- Segment::Number
- | Segment::QuotedString
- | Segment::HexString
- | Segment::UnicodeString
- | Segment::UnquotedString
- | Segment::Punct
- | Segment::CommentCommand
- | Segment::DoRepeatCommand
- | Segment::DoRepeatOverflow
- | Segment::InlineData
- | Segment::MacroName
- | Segment::MacroBody
- | Segment::StartDocument
- | Segment::Document
- | Segment::StartCommand
- | Segment::SeparateCommands
- | Segment::EndCommand
- | Segment::End
- | Segment::ExpectedQuote
- | Segment::ExpectedExponent
- | Segment::UnexpectedChar => return Ok(("", rest)),
- }
- input = rest;
- }
- }
- fn parse_id<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (Some(_), mut end) = take(input, eof).unwrap() else {
- unreachable!()
- };
- while let (Some(c), rest) = take(end, eof)? {
- if !c.may_continue_id() {
- break;
- };
- end = rest;
- }
- let identifier = &input[..input.len() - end.len()];
- let identifier = match identifier.strip_suffix('.') {
- Some(without_dot) if at_end_of_line(end, eof)? => without_dot,
- _ => identifier,
- };
- let rest = &input[identifier.len()..];
-
- if self.state.1.contains(Substate::START_OF_COMMAND) {
- if id_match_n("COMMENT", identifier, 4) {
- self.state = (State::Comment1, Substate::empty());
- return self.parse_comment_1(input, eof);
- } else if id_match("DOCUMENT", identifier) {
- self.state = (State::Document1, Substate::empty());
- return Ok((input, Segment::StartDocument));
- } else if id_match_n("DEFINE", identifier, 6) {
- self.state = (State::Define1, Substate::empty());
- } else if id_match("FILE", identifier) {
- if id_match("LABEL", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::FileLabel1, Substate::empty());
- return Ok((rest, Segment::Identifier));
- }
- } else if id_match("DO", identifier) {
- if id_match("REPEAT", self.next_id_in_command(rest, eof)?.0) {
- self.state = (State::DoRepeat1, Substate::empty());
- return Ok((rest, Segment::Identifier));
- }
- } else if id_match("BEGIN", identifier) {
- let (next_id, rest2) = self.next_id_in_command(rest, eof)?;
- if id_match("DATA", next_id) {
- let rest2 = skip_spaces_and_comments(rest2, eof)?;
- let rest2 = if let Some(s) = rest2.strip_prefix('.') {
- skip_spaces_and_comments(s, eof)?
- } else {
- rest2
- };
- if is_end_of_line(rest2, eof)? {
- let s = &input[..input.len() - rest2.len()];
- self.state = (
- if s.contains('\n') {
- State::BeginData1
- } else {
- State::BeginData2
- },
- Substate::empty(),
- );
- return Ok((rest, Segment::Identifier));
- }
- }
- }
- }
-
- self.state.1 = Substate::empty();
- Ok((
- rest,
- if identifier != "!" {
- Segment::Identifier
- } else {
- Segment::Punct
- },
- ))
- }
- fn parse_digraph<'a>(
- &mut self,
- seconds: &[char],
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (c, rest) = take(input, eof)?;
- self.state.1 = Substate::empty();
- Ok((
- match c {
- Some(c) if seconds.contains(&c) => rest,
- _ => input,
- },
- Segment::Punct,
- ))
- }
- fn parse_number<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut input = skip_digits(input, eof)?;
- if let Some(rest) = match_char(|c| c == '.', input, eof)? {
- let rest2 = skip_digits(rest, eof)?;
- if rest2.len() < rest.len() || !at_end_of_line(rest2, eof)? {
- input = rest2;
- }
- };
- if let Some(rest) = match_char(|c| c == 'e' || c == 'E', input, eof)? {
- let rest = match_char(|c| c == '+' || c == '-', rest, eof)?.unwrap_or(rest);
- let rest2 = skip_digits(rest, eof)?;
- if rest2.len() == rest.len() {
- self.state.1 = Substate::empty();
- return Ok((rest, Segment::ExpectedExponent));
- }
- input = rest2;
- }
- self.state.1 = Substate::empty();
- Ok((input, Segment::Number))
- }
- fn parse_comment_1<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- enum CommentState<'a> {
- Blank,
- NotBlank,
- Period(&'a str),
- }
- let mut state = CommentState::Blank;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- // End of file.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::SeparateCommands));
- };
- match c {
- '.' => state = CommentState::Period(input),
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- match state {
- CommentState::Blank => {
- // Blank line ends comment command.
- self.state = (State::General, Substate::START_OF_COMMAND);
- return Ok((input, Segment::SeparateCommands));
- }
- CommentState::Period(period) => {
- // '.' at end of line ends comment command.
- self.state = (State::General, Substate::empty());
- return Ok((period, Segment::CommentCommand));
- }
- CommentState::NotBlank => {
- // Comment continues onto next line.
- self.state = (State::Comment2, Substate::empty());
- return Ok((input, Segment::CommentCommand));
- }
- }
- }
- c if c.is_whitespace() => (),
- _ => state = CommentState::NotBlank,
- }
- input = rest;
- }
- }
- fn parse_comment_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
-
- let new_command = match take(rest, eof)?.0 {
- Some('+') | Some('-') | Some('.') => true,
- Some(c) if !c.is_whitespace() => self.at_command_start(rest, eof)?,
- None | Some(_) => false,
- };
- if new_command {
- self.state = (
- State::General,
- Substate::START_OF_LINE | Substate::START_OF_COMMAND,
- );
- } else {
- self.state = (State::Comment1, Substate::empty());
- }
- Ok((rest, Segment::Newline))
- }
- fn parse_document_1<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut end_cmd = false;
- loop {
- let (Some(c), rest) = take(input, eof)? else {
- self.state = (State::Document3, Substate::empty());
- return Ok((input, Segment::Document));
- };
- match c {
- '.' => end_cmd = true,
- '\n' | '\r' if is_end_of_line(input, eof)? => {
- self.state.0 = if end_cmd {
- State::Document3
- } else {
- State::Document2
- };
- return Ok((input, Segment::Document));
- }
- c if !c.is_whitespace() => end_cmd = false,
- _ => (),
- }
- input = rest;
- }
- }
- fn parse_document_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state = (State::Document1, Substate::empty());
- Ok((rest, Segment::Newline))
- }
- fn parse_document_3<'a>(
- &mut self,
- input: &'a str,
- _eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- Ok((input, Segment::EndCommand))
- }
- fn quoted_file_label(input: &str, eof: bool) -> Result<bool, Incomplete> {
- let input = skip_spaces_and_comments(input, eof)?;
- match take(input, eof)?.0 {
- Some('\'') | Some('"') | Some('\n') => Ok(true),
- _ => Ok(false),
- }
- }
- fn parse_file_label_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut sub = Segmenter {
- state: (State::General, self.state.1),
- ..*self
- };
- let (rest, segment) = sub.push_rest(input, eof)?;
- if segment == Segment::Identifier {
- let id = &input[..input.len() - rest.len()];
- debug_assert!(id_match("LABEL", id), "{id} should be LABEL");
- if Self::quoted_file_label(rest, eof)? {
- *self = sub;
- } else {
- self.state.0 = State::FileLabel2;
- }
- } else {
- self.state.1 = sub.state.1;
- }
- Ok((rest, segment))
- }
- fn parse_file_label_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let input = skip_spaces(input, eof)?;
- self.state = (State::FileLabel3, Substate::empty());
- Ok((input, Segment::Spaces))
- }
- fn parse_file_label_3<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut end_cmd = None;
- loop {
- let (c, rest) = take(input, eof)?;
- match c {
- None | Some('\n') | Some('\r') if is_end_of_line(input, eof)? => {
- self.state = (State::General, Substate::empty());
- return Ok((end_cmd.unwrap_or(input), Segment::UnquotedString));
- }
- None => unreachable!(),
- Some('.') => end_cmd = Some(input),
- Some(c) if !c.is_whitespace() => end_cmd = None,
- Some(_) => (),
- }
- input = rest;
- }
- }
- fn subparse<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let mut sub = Segmenter {
- mode: self.mode,
- state: (State::General, self.state.1),
- nest: 0,
- };
- let result = sub.push_rest(input, eof)?;
- self.state.1 = sub.state.1;
- Ok(result)
- }
- /// We are segmenting a `DO REPEAT` command, currently reading the syntax
- /// that defines the stand-in variables (the head) before the lines of
- /// syntax to be repeated (the body).
- fn parse_do_repeat_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::SeparateCommands {
- // We reached a blank line that separates the head from the body.
- self.state.0 = State::DoRepeat2;
- } else if segment == Segment::EndCommand || segment == Segment::StartCommand {
- // We reached the body.
- self.state.0 = State::DoRepeat3;
- self.nest = 1;
- }
- Ok((rest, segment))
- }
- /// We are segmenting a `DO REPEAT` command, currently reading a blank line
- /// that separates the head from the body.
- fn parse_do_repeat_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::Newline {
- // We reached the body.
- self.state.0 = State::DoRepeat3;
- self.nest = 1;
- }
- Ok((rest, segment))
- }
- fn parse_newline<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<Option<&'a str>, Incomplete> {
- let (Some(c), rest) = take(input, eof)? else {
- return Ok(None);
- };
- match c {
- '\n' => Ok(Some(rest)),
- '\r' => {
- if let (Some('\n'), rest) = take(rest, eof)? {
- Ok(Some(rest))
- } else {
- Ok(None)
- }
- }
- _ => Ok(None),
- }
- }
-
- fn parse_full_line<'a>(
- &mut self,
- mut input: &'a str,
- eof: bool,
- ) -> Result<&'a str, Incomplete> {
- loop {
- if is_end_of_line(input, eof)? {
- return Ok(input);
- }
- input = take(input, eof).unwrap().1;
- }
- }
- fn check_repeat_command<'a>(&mut self, input: &'a str, eof: bool) -> Result<isize, Incomplete> {
- let input = input.strip_prefix(&['-', '+']).unwrap_or(input);
- let (id1, input) = self.next_id_in_command(input, eof)?;
- if id_match("DO", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0) {
- Ok(1)
- } else if id_match("END", id1) && id_match("REPEAT", self.next_id_in_command(input, eof)?.0)
- {
- Ok(-1)
- } else {
- Ok(0)
- }
- }
- /// We are in the body of `DO REPEAT`, segmenting the lines of syntax that
- /// are to be repeated. Report each line of syntax as a single
- /// [`Type::DoRepeatCommand`].
- ///
- /// `DO REPEAT` can be nested, so we look for `DO REPEAT...END REPEAT`
- /// blocks inside the lines we're segmenting. `self.nest` counts the
- /// nesting level, starting at 1.
- fn parse_do_repeat_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- if let Some(rest) = self.parse_newline(input, eof)? {
- return Ok((rest, Segment::Newline));
- }
- let rest = self.parse_full_line(input, eof)?;
- let direction = self.check_repeat_command(input, eof)?;
- if direction > 0 {
- if let Some(nest) = self.nest.checked_add(1) {
- self.nest = nest;
- } else {
- self.state.0 = State::DoRepeat4;
- }
- } else if direction < 0 {
- self.nest -= 1;
- if self.nest == 0 {
- // Nesting level dropped to 0, so we've finished reading the `DO
- // REPEAT` body.
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- return self.push_rest(input, eof);
- }
- }
- return Ok((rest, Segment::DoRepeatCommand));
- }
- fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Segment), Incomplete> {
- self.state.0 = State::DoRepeat3;
- Ok((input, Segment::DoRepeatOverflow))
- }
- /// We are segmenting a `DEFINE` command, which consists of:
- ///
- /// - The `DEFINE` keyword.
- ///
- /// - An identifier. We transform this into `Type::MacroName` instead of
- /// `Type::Identifier` because this identifier must never be macro-expanded.
- ///
- /// - Anything but `(`.
- ///
- /// - `(` followed by a sequence of tokens possibly including balanced
- /// parentheses up to a final `)`.
- ///
- /// - A sequence of any number of lines, one string per line, ending with
- /// `!ENDDEFINE`. The first line is usually blank (that is, a newline
- /// follows the `(`). The last line usually just has `!ENDDEFINE.` on
- /// it, but it can start with other tokens. The whole
- /// DEFINE...!ENDDEFINE can be on a single line, even.
- fn parse_define_1_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- match segment {
- Segment::Identifier if self.state.0 == State::Define1 => {
- self.state.0 = State::Define2;
- return Ok((rest, Segment::MacroName));
- }
- Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
- // The DEFINE command is malformed because we reached its end
- // without ever hitting a `(` token. Transition back to general
- // parsing.
- self.state.0 = State::General;
- }
- Segment::Punct if input.starts_with('(') => {
- self.state.0 = State::Define3;
- self.nest = 1;
- }
- _ => (),
- }
- Ok((rest, segment))
- }
- fn parse_define_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- match segment {
- Segment::SeparateCommands | Segment::EndCommand | Segment::StartCommand => {
- // The DEFINE command is malformed because we reached its end
- // without ever hitting a `(` token. Transition back to general
- // parsing.
- self.state.0 = State::General;
- }
- Segment::Punct if input.starts_with('(') => {
- self.nest += 1;
- }
- Segment::Punct if input.starts_with(')') => {
- self.nest -= 1;
- if self.nest == 0 {
- self.state = (State::Define4, Substate::empty());
- }
- }
- _ => (),
- }
- Ok((rest, segment))
- }
- fn find_enddefine<'a>(mut input: &'a str) -> Option<&'a str> {
- loop {
- input = skip_spaces_and_comments(input, true).unwrap();
- let (Some(c), rest) = take(input, true).unwrap() else {
- return None;
- };
- match c {
- '!' if strip_prefix_ignore_ascii_case(input, "!ENDDEFINE").is_some() => {
- return Some(input)
- }
- '\'' | '"' => {
- let index = rest.find(c)?;
- input = &rest[index + 1..];
- }
- _ => input = rest,
- }
- }
- }
-
- /// We are in the body of a macro definition, looking for additional lines
- /// of the body or `!ENDDEFINE`.
- ///
- /// In `State::Define4`, we're parsing the first line of the macro body (the
- /// same line as the closing parenthesis in the argument definition). In
- /// `State::Define5`, we're on a later line.
- fn parse_define_4_5<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_full_line(input, eof)?;
- let line = &input[..input.len() - rest.len()];
- if let Some(end) = Self::find_enddefine(line) {
- // Macro ends at the !ENDDEFINE on this line.
- self.state = (State::General, Substate::empty());
- let (prefix, rest) = input.split_at(line.len() - end.len());
- if prefix.is_empty() {
- // Line starts with `!ENDDEFINE`.
- self.push_rest(input, eof)
- } else if prefix.trim_start().is_empty() {
- // Line starts with spaces followed by `!ENDDEFINE`.
- Ok((rest, Segment::Spaces))
- } else {
- // Line starts with some content followed by `!ENDDEFINE`.
- Ok((rest, Segment::MacroBody))
- }
- } else {
- // No `!ENDDEFINE`. We have a full line of macro body.
- //
- // If the first line of the macro body is blank, we just report it
- // as spaces, or not at all if there are no spaces, because it's not
- // significant.
- //
- // However, if it's a later line, we need to report it because blank
- // lines can have significance.
- let segment = if self.state.0 == State::Define4 && line.trim_start().is_empty() {
- if line.is_empty() {
- return self.parse_define_6(input, eof);
- }
- Segment::Spaces
- } else {
- Segment::MacroBody
- };
- self.state.0 = State::Define6;
- Ok((rest, segment))
- }
- }
- fn parse_define_6<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state.0 = State::Define5;
- Ok((rest, Segment::Newline))
- }
- fn parse_begin_data_1<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::Newline {
- self.state.0 = State::BeginData2;
- }
- Ok((rest, segment))
- }
- fn parse_begin_data_2<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let (rest, segment) = self.subparse(input, eof)?;
- if segment == Segment::Newline {
- self.state.0 = State::BeginData3;
- }
- Ok((rest, segment))
- }
- fn is_end_data(line: &str) -> bool {
- let Some(rest) = strip_prefix_ignore_ascii_case(line, "END") else {
- return false;
- };
- let (Some(c), rest) = take(rest, true).unwrap() else {
- return false;
- };
- if !c.is_whitespace() {
- return false;
- };
- let Some(rest) = strip_prefix_ignore_ascii_case(rest, "DATA") else {
- return false;
- };
-
- let mut endcmd = false;
- for c in rest.chars() {
- match c {
- '.' if endcmd => return false,
- '.' => endcmd = true,
- c if c.is_whitespace() => (),
- _ => return false,
- }
- }
- true
- }
- fn parse_begin_data_3<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_full_line(input, eof)?;
- let line = &input[..input.len() - rest.len()];
- if Self::is_end_data(line) {
- self.state = (
- State::General,
- Substate::START_OF_COMMAND | Substate::START_OF_LINE,
- );
- self.push_rest(input, eof)
- } else {
- self.state.0 = State::BeginData4;
- Ok((rest, Segment::InlineData))
- }
- }
- fn parse_begin_data_4<'a>(
- &mut self,
- input: &'a str,
- eof: bool,
- ) -> Result<(&'a str, Segment), Incomplete> {
- let rest = self.parse_newline(input, eof)?.unwrap();
- self.state.0 = State::BeginData3;
- Ok((rest, Segment::Newline))
- }
-}
-
-fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&'a str> {
- line.get(..pattern.len())
- .map(|prefix| {
- prefix
- .eq_ignore_ascii_case(pattern)
- .then(|| &line[pattern.len()..])
- })
- .flatten()
-}
-
-#[cfg(test)]
-mod test;
+++ /dev/null
-use crate::prompt::PromptStyle;
-
-use super::{Mode, Segment, Segmenter};
-
-fn push_segment<'a>(
- segmenter: &mut Segmenter,
- input: &'a str,
- one_byte: bool,
-) -> (usize, Segment) {
- if one_byte {
- for len in input.char_indices().map(|(pos, _c)| pos) {
- if let Ok(result) = segmenter.push(&input[..len], false) {
- return result;
- }
- }
- }
- segmenter.push(input, true).unwrap()
-}
-
-fn _check_segmentation(
- mut input: &str,
- mode: Mode,
- expect_segments: &[(Segment, &str)],
- expect_prompts: &[PromptStyle],
- one_byte: bool,
-) {
- let mut segments = Vec::with_capacity(expect_segments.len());
- let mut prompts = Vec::new();
- let mut segmenter = Segmenter::new(mode, false);
- loop {
- let (seg_len, seg_type) = push_segment(&mut segmenter, input, one_byte);
- let (token, rest) = input.split_at(seg_len);
- segments.push((seg_type, token));
- match seg_type {
- Segment::End => break,
- Segment::Newline => prompts.push(segmenter.prompt()),
- _ => (),
- }
- input = rest;
- }
-
- if &segments != expect_segments {
- eprintln!("segments differ from expected:");
- let difference = diff::slice(expect_segments, &segments);
- for result in difference {
- match result {
- diff::Result::Left(left) => eprintln!("-{left:?}"),
- diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
- diff::Result::Right(right) => eprintln!("+{right:?}"),
- }
- }
- panic!();
- }
-
- if &prompts != expect_prompts {
- eprintln!("prompts differ from expected:");
- let difference = diff::slice(expect_prompts, &prompts);
- for result in difference {
- match result {
- diff::Result::Left(left) => eprintln!("-{left:?}"),
- diff::Result::Both(left, _right) => eprintln!(" {left:?}"),
- diff::Result::Right(right) => eprintln!("+{right:?}"),
- }
- }
- panic!();
- }
-}
-
-fn check_segmentation(
- input: &str,
- mode: Mode,
- expect_segments: &[(Segment, &str)],
- expect_prompts: &[PromptStyle],
-) {
- for (one_byte, one_byte_name) in [(false, "full-string"), (true, "byte-by-byte")] {
- println!("running {one_byte_name} segmentation test with LF newlines...");
- _check_segmentation(input, mode, expect_segments, expect_prompts, one_byte);
-
- println!("running {one_byte_name} segmentation test with CRLF newlines...");
- _check_segmentation(
- &input.replace('\n', "\r\n"),
- mode,
- &expect_segments
- .iter()
- .map(|(segment, s)| match *segment {
- Segment::Newline => (Segment::Newline, "\r\n"),
- _ => (*segment, *s),
- })
- .collect::<Vec<_>>(),
- expect_prompts,
- one_byte,
- );
-
- if let Some(input) = input.strip_suffix('\n') {
- println!("running {one_byte_name} segmentation test without final newline...");
- let mut expect_segments: Vec<_> = expect_segments.iter().copied().collect();
- assert_eq!(expect_segments.pop(), Some((Segment::End, "")));
- assert_eq!(expect_segments.pop(), Some((Segment::Newline, "\n")));
- while let Some((Segment::SeparateCommands | Segment::EndCommand, "")) =
- expect_segments.last()
- {
- expect_segments.pop();
- }
- expect_segments.push((Segment::End, ""));
- _check_segmentation(
- input,
- mode,
- &expect_segments,
- &expect_prompts[..expect_prompts.len() - 1],
- one_byte,
- );
- }
- }
-}
-
-#[allow(dead_code)]
-fn print_segmentation(mut input: &str) {
- let mut segmenter = Segmenter::new(Mode::Interactive, false);
- loop {
- let (seg_len, seg_type) = segmenter.push(input, true).unwrap();
- let (token, rest) = input.split_at(seg_len);
- print!("{seg_type:?} {token:?}");
- match seg_type {
- Segment::Newline => print!(" ({:?})", segmenter.prompt()),
- Segment::End => break,
- _ => (),
- }
- println!();
- input = rest;
- }
-}
-
-#[test]
-fn test_identifiers() {
- check_segmentation(
- r#"a ab abc abcd !abcd
-A AB ABC ABCD !ABCD
-aB aBC aBcD !aBcD
-$x $y $z !$z
-grève Ângstrom poté
-#a #b #c ## #d !#d
-@efg @ @@. @#@ !@
-## # #12345 #.#
-f@#_.#6
-GhIjK
-.x 1y _z
-!abc abc!
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ab"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abc"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abcd"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!abcd"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "A"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "AB"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ABC"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ABCD"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!ABCD"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "aB"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "aBC"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "aBcD"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!aBcD"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "$x"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "$y"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "$z"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!$z"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "grève"),
- (Segment::Spaces, "\u{00a0}"),
- (Segment::Identifier, "Ângstrom"),
- (Segment::Spaces, "\u{00a0}"),
- (Segment::Identifier, "poté"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#b"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#c"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "##"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#d"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!#d"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "@efg"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@@."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@#@"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "!@"),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "##"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#12345"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#.#"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "f@#_.#6"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "GhIjK"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::Identifier, "y"),
- (Segment::Spaces, " "),
- (Segment::Punct, "_"),
- (Segment::Identifier, "z"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!abc"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abc"),
- (Segment::Punct, "!"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- ],
- );
-}
-
-#[test]
-fn test_identifiers_ending_in_dot() {
- check_segmentation(
- r#"abcd. abcd.
-ABCD. ABCD.
-aBcD. aBcD.
-$y. $z. あいうえお.
-#c. #d..
-@@. @@....
-#.#.
-#abcd.
-.
-.
-LMNOP.
-QRSTUV./* end of line comment */
-qrstuv. /* end of line comment */
-QrStUv./* end of line comment */
-wxyz./* unterminated end of line comment
-WXYZ. /* unterminated end of line comment
-WxYz./* unterminated end of line comment
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "abcd."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "abcd"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "ABCD."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ABCD"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "aBcD."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "aBcD"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "$y."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "$z."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "あいうえお"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#c."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#d."),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "@@."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "@@..."),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#.#"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#abcd"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "LMNOP"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "QRSTUV"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* end of line comment */"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "qrstuv"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* end of line comment */"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "QrStUv"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* end of line comment */"),
- (Segment::Spaces, " "),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "wxyz"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* unterminated end of line comment"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "WXYZ"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* unterminated end of line comment"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "WxYz"),
- (Segment::EndCommand, "."),
- (Segment::Comment, "/* unterminated end of line comment "),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_reserved_words() {
- check_segmentation(
- r#"and or not eq ge gt le lt ne all by to with
-AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
-andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
-and. with.
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "and"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "or"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "not"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "eq"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ge"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "gt"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "le"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lt"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ne"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "all"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "by"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "to"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "with"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "AND"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "OR"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "NOT"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "EQ"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "GE"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "GT"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "LE"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "LT"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "NE"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ALL"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "BY"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "TO"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "WITH"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "andx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "orx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "notx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "eqx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "gex"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "gtx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lex"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ltx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "nex"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "allx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "byx"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "tox"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "withx"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "and."),
- (Segment::Spaces, " "),
- (Segment::Identifier, "with"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_punctuation() {
- check_segmentation(
- r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] **
-~&|=>=><=<~=<>(),-+*/[]**!*
-% : ; ? _ ` { } ~ !*
-"#,
- Mode::Auto,
- &[
- (Segment::Punct, "~"),
- (Segment::Spaces, " "),
- (Segment::Punct, "&"),
- (Segment::Spaces, " "),
- (Segment::Punct, "|"),
- (Segment::Spaces, " "),
- (Segment::Punct, "="),
- (Segment::Spaces, " "),
- (Segment::Punct, ">="),
- (Segment::Spaces, " "),
- (Segment::Punct, ">"),
- (Segment::Spaces, " "),
- (Segment::Punct, "<="),
- (Segment::Spaces, " "),
- (Segment::Punct, "<"),
- (Segment::Spaces, " "),
- (Segment::Punct, "~="),
- (Segment::Spaces, " "),
- (Segment::Punct, "<>"),
- (Segment::Spaces, " "),
- (Segment::Punct, "("),
- (Segment::Spaces, " "),
- (Segment::Punct, ")"),
- (Segment::Spaces, " "),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Spaces, " "),
- (Segment::Punct, "+"),
- (Segment::Spaces, " "),
- (Segment::Punct, "*"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Spaces, " "),
- (Segment::Punct, "["),
- (Segment::Spaces, " "),
- (Segment::Punct, "]"),
- (Segment::Spaces, " "),
- (Segment::Punct, "**"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "~"),
- (Segment::Punct, "&"),
- (Segment::Punct, "|"),
- (Segment::Punct, "="),
- (Segment::Punct, ">="),
- (Segment::Punct, ">"),
- (Segment::Punct, "<="),
- (Segment::Punct, "<"),
- (Segment::Punct, "~="),
- (Segment::Punct, "<>"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Punct, "-"),
- (Segment::Punct, "+"),
- (Segment::Punct, "*"),
- (Segment::Punct, "/"),
- (Segment::Punct, "["),
- (Segment::Punct, "]"),
- (Segment::Punct, "**"),
- (Segment::Punct, "!*"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "%"),
- (Segment::Spaces, " "),
- (Segment::Punct, ":"),
- (Segment::Spaces, " "),
- (Segment::Punct, ";"),
- (Segment::Spaces, " "),
- (Segment::Punct, "?"),
- (Segment::Spaces, " "),
- (Segment::Punct, "_"),
- (Segment::Spaces, " "),
- (Segment::Punct, "`"),
- (Segment::Spaces, " "),
- (Segment::Punct, "{"),
- (Segment::Spaces, " "),
- (Segment::Punct, "}"),
- (Segment::Spaces, " "),
- (Segment::Punct, "~"),
- (Segment::Spaces, " "),
- (Segment::Punct, "!*"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later],
- );
-}
-
-#[test]
-fn test_positive_numbers() {
- check_segmentation(
- r#"0 1 01 001. 1.
-123. /* comment 1 */ /* comment 2 */
-.1 0.1 00.1 00.10
-5e1 6E-1 7e+1 6E+01 6e-03
-.3E1 .4e-1 .5E+1 .6e+01 .7E-03
-1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
-. 1e e1 1e+ 1e- 1.
-"#,
- Mode::Auto,
- &[
- (Segment::Number, "0"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::Spaces, " "),
- (Segment::Number, "01"),
- (Segment::Spaces, " "),
- (Segment::Number, "001."),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Number, "123"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 1 */"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 2 */"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Number, "1"),
- (Segment::Spaces, " "),
- (Segment::Number, "0.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "00.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "00.10"),
- (Segment::Newline, "\n"),
- (Segment::Number, "5e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "7e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "6E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "6e-03"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Number, "3E1"),
- (Segment::Spaces, " "),
- (Segment::Number, ".4e-1"),
- (Segment::Spaces, " "),
- (Segment::Number, ".5E+1"),
- (Segment::Spaces, " "),
- (Segment::Number, ".6e+01"),
- (Segment::Spaces, " "),
- (Segment::Number, ".7E-03"),
- (Segment::Newline, "\n"),
- (Segment::Number, "1.23e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "45.6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "78.9e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "99.9E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "11.2e-03"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "1e"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "e1"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "1e+"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "1e-"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_negative_numbers() {
- check_segmentation(
- r#" -0 -1 -01 -001. -1.
- -123. /* comment 1 */ /* comment 2 */
- -.1 -0.1 -00.1 -00.10
- -5e1 -6E-1 -7e+1 -6E+01 -6e-03
- -.3E1 -.4e-1 -.5E+1 -.6e+01 -.7E-03
- -1.23e1 -45.6E-1 -78.9e+1 -99.9E+01 -11.2e-03
- -/**/1
- -. -1e -e1 -1e+ -1e- -1.
-"#,
- Mode::Auto,
- &[
- (Segment::Spaces, " "),
- (Segment::Number, "-0"),
- (Segment::Spaces, " "),
- (Segment::Number, "-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-001."),
- (Segment::Spaces, " "),
- (Segment::Number, "-1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-123"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 1 */"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment 2 */"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-0.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-00.1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-00.10"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-5e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-7e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-6E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-6e-03"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.3E1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.4e-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.5E+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.6e+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-.7E-03"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Number, "-1.23e1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-45.6E-1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-78.9e+1"),
- (Segment::Spaces, " "),
- (Segment::Number, "-99.9E+01"),
- (Segment::Spaces, " "),
- (Segment::Number, "-11.2e-03"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Comment, "/**/"),
- (Segment::Number, "1"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Punct, "."),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "-1e"),
- (Segment::Spaces, " "),
- (Segment::Punct, "-"),
- (Segment::Identifier, "e1"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "-1e+"),
- (Segment::Spaces, " "),
- (Segment::ExpectedExponent, "-1e-"),
- (Segment::Spaces, " "),
- (Segment::Number, "-1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_strings() {
- check_segmentation(
- r#"'x' "y" 'abc'
-'Don''t' "Can't" 'Won''t'
-"""quoted""" '"quoted"'
-'' ""
-'missing end quote
-"missing double quote
-x"4142" X'5152'
-u'fffd' U"041"
-+ new command
-+ /* comment */ 'string continuation'
-+ /* also a punctuator on blank line
-- 'new command'
-"#,
- Mode::Auto,
- &[
- (Segment::QuotedString, "'x'"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"y\""),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'abc'"),
- (Segment::Newline, "\n"),
- (Segment::QuotedString, "'Don''t'"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"Can't\""),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'Won''t'"),
- (Segment::Newline, "\n"),
- (Segment::QuotedString, "\"\"\"quoted\"\"\""),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'\"quoted\"'"),
- (Segment::Newline, "\n"),
- (Segment::QuotedString, "''"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"\""),
- (Segment::Newline, "\n"),
- (Segment::ExpectedQuote, "'missing end quote"),
- (Segment::Newline, "\n"),
- (Segment::ExpectedQuote, "\"missing double quote"),
- (Segment::Newline, "\n"),
- (Segment::HexString, "x\"4142\""),
- (Segment::Spaces, " "),
- (Segment::HexString, "X'5152'"),
- (Segment::Newline, "\n"),
- (Segment::UnicodeString, "u'fffd'"),
- (Segment::Spaces, " "),
- (Segment::UnicodeString, "U\"041\""),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "+"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "new"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "+"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* comment */"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'string continuation'"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "+"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/* also a punctuator on blank line"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "-"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'new command'"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- ],
- );
-}
-
-#[test]
-fn test_shbang() {
- check_segmentation(
- r#"#! /usr/bin/pspp
-title my title.
-#! /usr/bin/pspp
-"#,
- Mode::Interactive,
- &[
- (Segment::Shbang, "#! /usr/bin/pspp"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "title"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "my"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "title"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "#"),
- (Segment::Punct, "!"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "usr"),
- (Segment::Punct, "/"),
- (Segment::Identifier, "bin"),
- (Segment::Punct, "/"),
- (Segment::Identifier, "pspp"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First, PromptStyle::Later],
- );
-}
-
-#[test]
-fn test_comment_command() {
- check_segmentation(
- r#"* Comment commands "don't
-have to contain valid tokens.
-
-** Check ambiguity with ** token.
-****************.
-
-comment keyword works too.
-COMM also.
-com is ambiguous with COMPUTE.
-
- * Comment need not start at left margin.
-
-* Comment ends with blank line
-
-next command.
-
-"#,
- Mode::Interactive,
- &[
- (Segment::CommentCommand, "* Comment commands \"don't"),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "have to contain valid tokens"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "** Check ambiguity with ** token"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "****************"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "comment keyword works too"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "COMM also"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "com"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "is"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "ambiguous"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "with"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "COMPUTE"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (
- Segment::CommentCommand,
- "* Comment need not start at left margin",
- ),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::CommentCommand, "* Comment ends with blank line"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "next"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Comment,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Comment,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_document_command() {
- check_segmentation(
- r#"DOCUMENT one line.
-DOC more
- than
- one
- line.
-docu
-first.paragraph
-isn't parsed as tokens
-
-second paragraph.
-"#,
- Mode::Interactive,
- &[
- (Segment::StartDocument, ""),
- (Segment::Document, "DOCUMENT one line."),
- (Segment::EndCommand, ""),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::StartDocument, ""),
- (Segment::Document, "DOC more"),
- (Segment::Newline, "\n"),
- (Segment::Document, " than"),
- (Segment::Newline, "\n"),
- (Segment::Document, " one"),
- (Segment::Newline, "\n"),
- (Segment::Document, " line."),
- (Segment::EndCommand, ""),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::StartDocument, ""),
- (Segment::Document, "docu"),
- (Segment::Newline, "\n"),
- (Segment::Document, "first.paragraph"),
- (Segment::Newline, "\n"),
- (Segment::Document, "isn't parsed as tokens"),
- (Segment::Newline, "\n"),
- (Segment::Document, ""),
- (Segment::Newline, "\n"),
- (Segment::Document, "second paragraph."),
- (Segment::EndCommand, ""),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::First,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::Document,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_file_label_command() {
- check_segmentation(
- r#"FIL label isn't quoted.
-FILE
- lab 'is quoted'.
-FILE /*
-/**/ lab not quoted here either
-
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "FIL"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "label"),
- (Segment::Spaces, " "),
- (Segment::UnquotedString, "isn't quoted"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "FILE"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lab"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "'is quoted'"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "FILE"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/*"),
- (Segment::Newline, "\n"),
- (Segment::Comment, "/**/"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "lab"),
- (Segment::Spaces, " "),
- (Segment::UnquotedString, "not quoted here either"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_begin_data() {
- check_segmentation(
- r#"begin data.
-end data.
-
-begin data. /*
-123
-xxx
-end data.
-
-BEG /**/ DAT /*
-5 6 7 /* x
-
-end data
-end data
-.
-
-begin
- data.
-data
-end data.
-
-begin data "xxx".
-begin data 123.
-not data
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, "/*"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "123"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "xxx"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "BEG"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/**/"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "DAT"),
- (Segment::Spaces, " "),
- (Segment::Comment, "/*"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "5 6 7 /* x"),
- (Segment::Newline, "\n"),
- (Segment::InlineData, ""),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "end data"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::InlineData, "data"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::QuotedString, "\"xxx\""),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "begin"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Number, "123"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "not"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "data"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Data,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::Data,
- PromptStyle::Data,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- ],
- );
-}
-
-#[test]
-fn test_do_repeat() {
- check_segmentation(
- r#"do repeat x=a b c
- y=d e f.
- do repeat a=1 thru 5.
-another command.
-second command
-+ third command.
-end /* x */ /* y */ repeat print.
-end
- repeat.
-do
- repeat #a=1.
- inner command.
-end repeat.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "do"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "x"),
- (Segment::Punct, "="),
- (Segment::Identifier, "a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "y"),
- (Segment::Punct, "="),
- (Segment::Identifier, "d"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "e"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "f"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, " do repeat a=1 thru 5."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "another command."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "second command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "+ third command."),
- (Segment::Newline, "\n"),
- (
- Segment::DoRepeatCommand,
- "end /* x */ /* y */ repeat print.",
- ),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "do"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#a"),
- (Segment::Punct, "="),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, " inner command."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_do_repeat_overflow() {
- const N: usize = 257;
- let do_repeat: Vec<String> = (0..N)
- .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5))
- .collect();
- let end_repeat: Vec<String> = (0..N)
- .rev()
- .map(|i| format!("end repeat. /* {i}\n"))
- .collect();
-
- let s: String = do_repeat
- .iter()
- .chain(end_repeat.iter())
- .map(|s| s.as_str())
- .collect();
- let mut expect_output = vec![
- (Segment::Identifier, "do"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "v0"),
- (Segment::Punct, "="),
- (Segment::Number, "0"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "thru"),
- (Segment::Spaces, " "),
- (Segment::Number, "5"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- ];
- for i in 1..N {
- expect_output.push((Segment::DoRepeatCommand, &do_repeat[i].trim_end()));
- if i >= 255 {
- expect_output.push((Segment::DoRepeatOverflow, ""));
- }
- expect_output.push((Segment::Newline, "\n"));
- }
- for i in 0..254 {
- expect_output.push((Segment::DoRepeatCommand, &end_repeat[i].trim_end()));
- expect_output.push((Segment::Newline, "\n"));
- }
- let comments: Vec<String> = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect();
- for comment in &comments {
- expect_output.extend([
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::EndCommand, "."),
- (Segment::Spaces, " "),
- (Segment::Comment, comment),
- (Segment::Newline, "\n"),
- ]);
- }
- expect_output.push((Segment::End, ""));
-
- let expect_prompts: Vec<_> = (0..N * 2 - 3)
- .map(|_| PromptStyle::DoRepeat)
- .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First])
- .collect();
- check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts);
-}
-
-#[test]
-fn test_do_repeat_batch() {
- check_segmentation(
- r#"do repeat x=a b c
- y=d e f
-do repeat a=1 thru 5
-another command
-second command
-+ third command
-end /* x */ /* y */ repeat print
-end
- repeat
-do
- repeat #a=1
-
- inner command
-end repeat
-"#,
- Mode::Batch,
- &[
- (Segment::Identifier, "do"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "x"),
- (Segment::Punct, "="),
- (Segment::Identifier, "a"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "y"),
- (Segment::Punct, "="),
- (Segment::Identifier, "d"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "e"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "f"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::DoRepeatCommand, "do repeat a=1 thru 5"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "another command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "second command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "+ third command"),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, "end /* x */ /* y */ repeat print"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "do"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "#a"),
- (Segment::Punct, "="),
- (Segment::Number, "1"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::DoRepeatCommand, " inner command"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "end"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "repeat"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::DoRepeat,
- PromptStyle::DoRepeat,
- PromptStyle::Later,
- ],
- );
-}
-
-mod define {
- use crate::{
- lex::segment::{Mode, Segment},
- prompt::PromptStyle,
- };
-
- use super::check_segmentation;
-
- #[test]
- fn test_simple() {
- check_segmentation(
- r#"define !macro1()
-var1 var2 var3 "!enddefine"
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "var1 var2 var3 \"!enddefine\""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_no_newline_after_parentheses() {
- check_segmentation(
- r#"define !macro1() var1 var2 var3 /* !enddefine
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::MacroBody, " var1 var2 var3 /* !enddefine"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_no_newline_before_enddefine() {
- check_segmentation(
- r#"define !macro1()
-var1 var2 var3!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "var1 var2 var3"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_all_on_one_line() {
- check_segmentation(
- r#"define !macro1()var1 var2 var3!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::MacroBody, "var1 var2 var3"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First],
- );
- }
-
- #[test]
- fn test_empty() {
- check_segmentation(
- r#"define !macro1()
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_blank_lines() {
- check_segmentation(
- r#"define !macro1()
-
-
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, ""),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_arguments() {
- check_segmentation(
- r#"define !macro1(a(), b(), c())
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Identifier, "a"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_multiline_arguments() {
- check_segmentation(
- r#"define !macro1(
- a(), b(
- ),
- c()
-)
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "a"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Spaces, " "),
- (Segment::Identifier, "b"),
- (Segment::Punct, "("),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Punct, ")"),
- (Segment::Punct, ","),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "c"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_arguments_start_on_second_line() {
- check_segmentation(
- r#"define !macro1
-(x,y,z
-)
-content 1
-content 2
-!enddefine.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Newline, "\n"),
- (Segment::Punct, "("),
- (Segment::Identifier, "x"),
- (Segment::Punct, ","),
- (Segment::Identifier, "y"),
- (Segment::Punct, ","),
- (Segment::Identifier, "z"),
- (Segment::Newline, "\n"),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content 1"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content 2"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "!enddefine"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::First,
- ],
- );
- }
-
- #[test]
- fn test_early_end_of_command_1() {
- check_segmentation(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_2() {
- check_segmentation(
- r#"define !macro1
-x.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "x"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Later, PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_3() {
- check_segmentation(
- r#"define !macro1(.
-x.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "x"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_early_end_of_command_4() {
- // Notice the command terminator at the end of the `DEFINE` command,
- // which should not be there and ends it early.
- check_segmentation(
- r#"define !macro1.
-data list /x 1.
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::First, PromptStyle::First],
- );
- }
-
- #[test]
- fn test_missing_enddefine() {
- check_segmentation(
- r#"define !macro1()
-content line 1
-content line 2
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content line 1"),
- (Segment::Newline, "\n"),
- (Segment::MacroBody, "content line 2"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Define,
- PromptStyle::Define,
- PromptStyle::Define,
- ],
- );
- }
-
- #[test]
- fn test_missing_enddefine_2() {
- check_segmentation(
- r#"define !macro1()
-"#,
- Mode::Interactive,
- &[
- (Segment::Identifier, "define"),
- (Segment::Spaces, " "),
- (Segment::MacroName, "!macro1"),
- (Segment::Punct, "("),
- (Segment::Punct, ")"),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[PromptStyle::Define],
- );
- }
-}
-
-#[test]
-fn test_batch_mode() {
- check_segmentation(
- r#"first command
- another line of first command
-+ second command
-third command
-
-fourth command.
- fifth command.
-"#,
- Mode::Batch,
- &[
- (Segment::Identifier, "first"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "line"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "of"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "first"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "+"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "second"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "third"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "fourth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "fifth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
-
-#[test]
-fn test_auto_mode() {
- check_segmentation(
- r#"command
- another line of command
-2sls
-+ another command
-another line of second command
-data list /x 1
-aggregate.
-print eject.
-twostep cluster
-
-
-fourth command.
- fifth command.
-"#,
- Mode::Auto,
- &[
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "line"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "of"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Number, "2"),
- (Segment::Identifier, "sls"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, "+"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "another"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "line"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "of"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "second"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "data"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "list"),
- (Segment::Spaces, " "),
- (Segment::Punct, "/"),
- (Segment::Identifier, "x"),
- (Segment::Spaces, " "),
- (Segment::Number, "1"),
- (Segment::Newline, "\n"),
- (Segment::StartCommand, ""),
- (Segment::Identifier, "aggregate"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "print"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "eject"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "twostep"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "cluster"),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::SeparateCommands, ""),
- (Segment::Newline, "\n"),
- (Segment::Identifier, "fourth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "fifth"),
- (Segment::Spaces, " "),
- (Segment::Identifier, "command"),
- (Segment::EndCommand, "."),
- (Segment::Newline, "\n"),
- (Segment::End, ""),
- ],
- &[
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::Later,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- PromptStyle::First,
- ],
- );
-}
+++ /dev/null
-use std::fmt::{Display, Formatter, Result as FmtResult};
-
-use crate::identifier::Identifier;
-
-#[derive(Clone, Debug, PartialEq)]
-pub enum Token {
- /// End of input.
- End,
-
- /// Identifier.
- Id(Identifier),
-
- /// Number.
- Number(f64),
-
- /// Quoted string.
- String(String),
-
- /// Command terminator or separator.
- ///
- /// Usually this is `.`, but a blank line also separates commands, and in
- /// batch mode any line that begins with a non-blank starts a new command.
- EndCommand,
-
- /// Operators, punctuators, and reserved words.
- Punct(Punct),
-}
-
-impl Token {
- pub fn id(&self) -> Option<&Identifier> {
- match self {
- Self::Id(identifier) => Some(identifier),
- _ => None,
- }
- }
-}
-
-fn is_printable(c: char) -> bool {
- !c.is_control() || ['\t', '\r', '\n'].contains(&c)
-}
-
-fn string_representation(s: &str, quote: char, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{quote}")?;
- for section in s.split_inclusive(quote) {
- if let Some(rest) = section.strip_suffix(quote) {
- write!(f, "{rest}{quote}{quote}")?;
- } else {
- write!(f, "{section}")?;
- }
- }
- write!(f, "{quote}")
-}
-
-impl Display for Token {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- match self {
- Token::End => Ok(()),
- Token::Id(s) => write!(f, "{s}"),
- Token::Number(number) => {
- if number.is_sign_negative() {
- write!(f, "-{}", number.abs())
- } else {
- write!(f, "{number}")
- }
- }
- Token::String(s) => {
- if s.chars().all(|c| is_printable(c)) {
- if s.contains('"') {
- string_representation(s, '\'', f)
- } else {
- string_representation(s, '"', f)
- }
- } else {
- write!(f, "X\"")?;
- for byte in s.bytes() {
- let c1 = char::from_digit((byte >> 4) as u32, 16)
- .unwrap()
- .to_ascii_uppercase();
- let c2 = char::from_digit((byte & 0xf) as u32, 16)
- .unwrap()
- .to_ascii_uppercase()
- .to_ascii_lowercase();
- write!(f, "{c1}{c2}")?;
- }
- write!(f, "\"")
- }
- }
- Token::EndCommand => write!(f, "."),
- Token::Punct(punct) => punct.fmt(f),
- }
- }
-}
-
-/// Check that all negative numbers, even -0, get formatted with a leading `-`.
-#[cfg(test)]
-mod test {
- use crate::lex::token::Token;
-
- #[test]
- fn test_string() {
- assert_eq!(Token::String(String::from("abc")).to_string(), "\"abc\"");
- assert_eq!(
- Token::String(String::from("\u{0080}")).to_string(),
- "X\"C280\""
- );
- }
-
- #[test]
- fn test_neg0() {
- assert_eq!(Token::Number(-0.0).to_string(), "-0");
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Punct {
- /// `+`.
- Plus,
-
- /// `-`.
- Dash,
-
- /// `*`.
- Asterisk,
-
- /// `/`.
- Slash,
-
- /// `=`.
- Equals,
-
- /// `(`.
- LParen,
-
- /// `)`.
- RParen,
-
- /// `[`.
- LSquare,
-
- /// `]`.
- RSquare,
-
- /// `{`.
- LCurly,
-
- /// `}`.
- RCurly,
-
- /// `,`.
- Comma,
-
- /// `;`.
- Semicolon,
-
- /// `:`.
- Colon,
-
- /// `AND` or `&`.
- And,
-
- /// `OR` or `|`.
- Or,
-
- /// `NOT` or `~`.
- Not,
-
- /// `EQ` or `=`.
- Eq,
-
- /// `GE` or '>=`
- Ge,
-
- /// `GT` or `>`.
- Gt,
-
- /// `LE` or `<=`.
- Le,
-
- /// `LT` or `<`.
- Lt,
-
- /// `NE` or `~=` or `<>`.
- Ne,
-
- /// `ALL`.
- All,
-
- /// `BY`.
- By,
-
- /// `TO`.
- To,
-
- /// `WITH`.
- With,
-
- /// `**`.
- Exp,
-
- /// `!` (only appears in macros).
- Bang,
-
- /// `%` (only appears in macros).
- Percent,
-
- /// `?` (only appears in macros).
- Question,
-
- /// ```` (only appears in macros).
- Backtick,
-
- /// `.`.
- ///
- /// This represents a dot in the middle of a line by itself, where it does not end a command.
- Dot,
-
- /// `_` (only appears in macros).
- ///
- /// Although underscores may appear within identifiers, they can't be the
- /// first character, so this represents an underscore found on its own.
- Underscore,
-
- /// `!*` (only appears in macros).
- BangAsterisk,
-}
-
-impl Punct {
- pub fn as_str(&self) -> &'static str {
- match self {
- Self::Plus => "+",
- Self::Dash => "-",
- Self::Asterisk => "*",
- Self::Slash => "/",
- Self::Equals => "=",
- Self::LParen => "(",
- Self::RParen => ")",
- Self::LSquare => "[",
- Self::RSquare => "]",
- Self::LCurly => "{",
- Self::RCurly => "}",
- Self::Comma => ",",
- Self::Semicolon => ";",
- Self::Colon => ":",
- Self::And => "AND",
- Self::Or => "OR",
- Self::Not => "NOT",
- Self::Eq => "EQ",
- Self::Ge => ">=",
- Self::Gt => ">",
- Self::Le => "<=",
- Self::Lt => "<",
- Self::Ne => "~=",
- Self::All => "ALL",
- Self::By => "BY",
- Self::To => "TO",
- Self::With => "WITH",
- Self::Exp => "**",
- Self::Bang => "!",
- Self::Percent => "%",
- Self::Question => "?",
- Self::Backtick => "`",
- Self::Dot => ".",
- Self::Underscore => "_",
- Self::BangAsterisk => "!*",
- }
- }
-}
-impl Display for Punct {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.as_str())
- }
-}
+++ /dev/null
-#[allow(unused_variables, unused_mut, dead_code)]
-pub mod cooked;
-pub mod dictionary;
-pub mod encoding;
-pub mod endian;
-pub mod format;
-pub mod identifier;
-pub mod locale_charset;
-pub mod output;
-#[allow(unused_variables, unused_mut, dead_code)]
-pub mod raw;
-pub mod sack;
-pub mod lex;
-pub mod prompt;
-pub mod message;
-pub mod macros;
-pub mod settings;
-pub mod command;
-pub mod integer;
-pub mod engine;
+++ /dev/null
-// Determine a canonical name for the current locale's character encoding.
-//
-// Copyright (C) 2000-2006, 2008-2023 Free Software Foundation, Inc.
-//
-// This file is free software: you can redistribute it and/or modify it under
-// the terms of the GNU Lesser General Public License as published by the Free
-// Software Foundation; either version 2.1 of the License, or (at your option)
-// any later version.
-//
-// This file is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-// A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-// details.
-//
-// You should have received a copy of the GNU Lesser General Public License
-// along with this program. If not, see <https://www.gnu.org/licenses/>.
-//
-// Written by Bruno Haible <bruno@clisp.org>. Translated to Rust by Ben Pfaff
-// <blp@cs.stanford.edu>.
-
-use lazy_static::lazy_static;
-
-fn map_aliases(s: &str) -> &'static str {
- #[cfg(target_os = "freebsd")]
- match s {
- "ARMSCII-8" => return "ARMSCII-8",
- "Big5" => return "BIG5",
- "C" => return "ASCII",
- "CP1131" => return "CP1131",
- "CP1251" => return "CP1251",
- "CP866" => return "CP866",
- "GB18030" => return "GB18030",
- "GB2312" => return "GB2312",
- "GBK" => return "GBK",
- "ISCII-DEV" => return "?",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "ISO8859-9" => return "ISO-8859-9",
- "KOI8-R" => return "KOI8-R",
- "KOI8-U" => return "KOI8-U",
- "SJIS" => return "SHIFT_JIS",
- "US-ASCII" => return "ASCII",
- "eucCN" => return "GB2312",
- "eucJP" => return "EUC-JP",
- "eucKR" => return "EUC-KR",
- _ => (),
- };
-
- #[cfg(target_os = "netbsd")]
- match s {
- "646" => return "ASCII",
- "ARMSCII-8" => return "ARMSCII-8",
- "BIG5" => return "BIG5",
- "Big5-HKSCS" => return "BIG5-HKSCS",
- "CP1251" => return "CP1251",
- "CP866" => return "CP866",
- "GB18030" => return "GB18030",
- "GB2312" => return "GB2312",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-4" => return "ISO-8859-4",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "KOI8-R" => return "KOI8-R",
- "KOI8-U" => return "KOI8-U",
- "PT154" => return "PT154",
- "SJIS" => return "SHIFT_JIS",
- "eucCN" => return "GB2312",
- "eucJP" => return "EUC-JP",
- "eucKR" => return "EUC-KR",
- "eucTW" => return "EUC-TW",
- _ => (),
- };
-
- #[cfg(target_os = "openbsd")]
- match s {
- "646" => return "ASCII",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-4" => return "ISO-8859-4",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "US-ASCII" => return "ASCII",
- _ => (),
- };
-
- /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
- useless:
- - It returns the empty string when LANG is set to a locale of the
- form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
- LC_CTYPE file.
- - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
- the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
- - The documentation says:
- "... all code that calls BSD system routines should ensure
- that the const *char parameters of these routines are in UTF-8
- encoding. All BSD system functions expect their string
- parameters to be in UTF-8 encoding and nothing else."
- It also says
- "An additional caveat is that string parameters for files,
- paths, and other file-system entities must be in canonical
- UTF-8. In a canonical UTF-8 Unicode string, all decomposable
- characters are decomposed ..."
- but this is not true: You can pass non-decomposed UTF-8 strings
- to file system functions, and it is the OS which will convert
- them to decomposed UTF-8 before accessing the file system.
- - The Apple Terminal application displays UTF-8 by default.
- - However, other applications are free to use different encodings:
- - xterm uses ISO-8859-1 by default.
- - TextEdit uses MacRoman by default.
- We prefer UTF-8 over decomposed UTF-8-MAC because one should
- minimize the use of decomposed Unicode. Unfortunately, through the
- Darwin file system, decomposed UTF-8 strings are leaked into user
- space nevertheless.
- Then there are also the locales with encodings other than US-ASCII
- and UTF-8. These locales can be occasionally useful to users (e.g.
- when grepping through ISO-8859-1 encoded text files), when all their
- file names are in US-ASCII.
- */
-
- #[cfg(target_os = "macos")]
- match s {
- "ARMSCII-8" => return "ARMSCII-8",
- "Big5" => return "BIG5",
- "Big5HKSCS" => return "BIG5-HKSCS",
- "CP1131" => return "CP1131",
- "CP1251" => return "CP1251",
- "CP866" => return "CP866",
- "CP949" => return "CP949",
- "GB18030" => return "GB18030",
- "GB2312" => return "GB2312",
- "GBK" => return "GBK",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-13" => return "ISO-8859-13",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-4" => return "ISO-8859-4",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-7" => return "ISO-8859-7",
- "ISO8859-9" => return "ISO-8859-9",
- "KOI8-R" => return "KOI8-R",
- "KOI8-U" => return "KOI8-U",
- "PT154" => return "PT154",
- "SJIS" => return "SHIFT_JIS",
- "eucCN" => return "GB2312",
- "eucJP" => return "EUC-JP",
- "eucKR" => return "EUC-KR",
- _ => (),
- };
-
- #[cfg(target_os = "aix")]
- match s {
- "GBK" => return "GBK",
- "IBM-1046" => return "CP1046",
- "IBM-1124" => return "CP1124",
- "IBM-1129" => return "CP1129",
- "IBM-1252" => return "CP1252",
- "IBM-850" => return "CP850",
- "IBM-856" => return "CP856",
- "IBM-921" => return "ISO-8859-13",
- "IBM-922" => return "CP922",
- "IBM-932" => return "CP932",
- "IBM-943" => return "CP943",
- "IBM-eucCN" => return "GB2312",
- "IBM-eucJP" => return "EUC-JP",
- "IBM-eucKR" => return "EUC-KR",
- "IBM-eucTW" => return "EUC-TW",
- "ISO8859-1" => return "ISO-8859-1",
- "ISO8859-15" => return "ISO-8859-15",
- "ISO8859-2" => return "ISO-8859-2",
- "ISO8859-5" => return "ISO-8859-5",
- "ISO8859-6" => return "ISO-8859-6",
- "ISO8859-7" => return "ISO-8859-7",
- "ISO8859-8" => return "ISO-8859-8",
- "ISO8859-9" => return "ISO-8859-9",
- "TIS-620" => return "TIS-620",
- "UTF-8" => return "UTF-8",
- "big5" => return "BIG5",
- _ => (),
- };
-
- #[cfg(windows)]
- match s {
- "CP1361" => return "JOHAB",
- "CP20127" => return "ASCII",
- "CP20866" => return "KOI8-R",
- "CP20936" => return "GB2312",
- "CP21866" => return "KOI8-RU",
- "CP28591" => return "ISO-8859-1",
- "CP28592" => return "ISO-8859-2",
- "CP28593" => return "ISO-8859-3",
- "CP28594" => return "ISO-8859-4",
- "CP28595" => return "ISO-8859-5",
- "CP28596" => return "ISO-8859-6",
- "CP28597" => return "ISO-8859-7",
- "CP28598" => return "ISO-8859-8",
- "CP28599" => return "ISO-8859-9",
- "CP28605" => return "ISO-8859-15",
- "CP38598" => return "ISO-8859-8",
- "CP51932" => return "EUC-JP",
- "CP51936" => return "GB2312",
- "CP51949" => return "EUC-KR",
- "CP51950" => return "EUC-TW",
- "CP54936" => return "GB18030",
- "CP65001" => return "UTF-8",
- "CP936" => return "GBK",
- _ => (),
- };
-
- String::from(s).leak()
-}
-
-#[cfg(unix)]
-mod inner {
- use std::{
- ffi::{c_int, CStr, CString},
- ptr::null,
- };
-
- use libc::{self, nl_langinfo, setlocale, CODESET, LC_CTYPE};
-
- unsafe fn string_from_pointer(s: *const i8) -> Option<String> {
- if s.is_null() {
- None
- } else {
- Some(CStr::from_ptr(s).to_string_lossy().into())
- }
- }
-
- fn set_locale(category: c_int, locale: Option<&str>) -> Option<String> {
- unsafe {
- let locale = locale.map(|s| CString::new(s).unwrap());
- let locale_ptr = locale.as_ref().map_or(null(), |s| s.as_ptr());
- string_from_pointer(setlocale(category, locale_ptr))
- }
- }
-
- pub fn locale_charset() -> Option<String> {
- unsafe {
- let saved_locale = set_locale(LC_CTYPE, None);
- set_locale(LC_CTYPE, Some(""));
- let codeset = string_from_pointer(nl_langinfo(CODESET));
- set_locale(LC_CTYPE, saved_locale.as_deref());
- codeset
- }
- }
-}
-
-#[cfg(windows)]
-mod inner {
- use libc::{setlocale, LC_CTYPE};
- use std::ffi::{CStr, CString};
- use windows_sys::Win32::Globalization::GetACP;
-
- fn current_locale() -> Option<String> {
- unsafe {
- let empty_cstr = CString::new("").unwrap();
- let locale = setlocale(LC_CTYPE, empty_cstr.as_ptr());
- if locale.is_null() {
- None
- } else {
- Some(CStr::from_ptr(locale).to_string_lossy().into())
- }
- }
- }
-
- pub fn locale_charset() -> Option<String> {
- let Some(current_locale) = current_locale() else {
- return None;
- };
- let codepage = if let Some((_, pdot)) = current_locale.rsplit_once('.') {
- format!("CP{pdot}")
- } else {
- format!("CP{}", unsafe { GetACP() })
- };
- Some(match codepage.as_str() {
- "CP65001" | "CPutf8" => String::from("UTF-8"),
- _ => codepage,
- })
- }
-}
-
-#[cfg(not(any(unix, windows)))]
-mod inner {
- pub fn locale_charse() -> String {
- String::from("UTF-8")
- }
-}
-
-/// Returns the character set used by the locale configured in the operating
-/// system.
-pub fn locale_charset() -> &'static str {
- lazy_static! {
- static ref LOCALE_CHARSET: &'static str =
- map_aliases(&inner::locale_charset().unwrap_or(String::from("UTF-8")));
- }
- &LOCALE_CHARSET
-}
+++ /dev/null
-use lazy_static::lazy_static;
-use num::Integer;
-use std::{
- cell::RefCell,
- cmp::Ordering,
- collections::{BTreeMap, HashMap, HashSet},
- mem::take,
- num::NonZeroUsize,
- ops::RangeInclusive,
-};
-use thiserror::Error as ThisError;
-use unicase::UniCase;
-
-use crate::{
- identifier::Identifier,
- lex::{
- scan::{ScanError, ScanToken, StringScanner, StringSegmenter},
- segment::Mode,
- token::{Punct, Token},
- },
- message::Location,
- settings::Settings,
-};
-
-#[derive(Clone, Debug, ThisError)]
-pub enum MacroError {
- /// Expected more tokens.
- #[error(
- "Reached end of command expecting {n} more tokens in argument {arg} to macro {macro_}."
- )]
- ExpectedMoreTokens {
- n: usize,
- arg: Identifier,
- macro_: Identifier,
- },
-
- /// Expected a particular token at end of command.
- #[error("Reached end of command expecting {token:?} in argument {arg} to macro {macro_}.")]
- ExpectedToken {
- token: String,
- arg: Identifier,
- macro_: Identifier,
- },
-
- /// Expected a particular token, got a different one.
- #[error(
- "Found `{actual}` while expecting `{expected}` reading argument {arg} to macro {macro_}."
- )]
- UnexpectedToken {
- actual: String,
- expected: String,
- arg: Identifier,
- macro_: Identifier,
- },
-
- /// Argument specified multiple times,
- #[error("Argument {arg} specified multiple times in call to macro {macro_}.")]
- DuplicateArg { arg: Identifier, macro_: Identifier },
-
- /// Maximum nesting limit exceeded.
- #[error("Maximum nesting level {limit} exceeded. (Use `SET MNEST` to change the limit.)")]
- TooDeep { limit: usize },
-
- /// Invalid `!*`.
- #[error("`!*` may only be used within the expansion of a macro.")]
- InvalidBangAsterisk,
-
- /// Error tokenizing during expansion.
- #[error(transparent)]
- ScanError(ScanError),
-
- /// Expecting `)` in macro expression.
- #[error("Expecting `)` in macro expression.")]
- ExpectingRParen,
-
- /// Expecting literal.
- #[error("Expecting literal or function invocation in macro expression.")]
- ExpectingLiteral,
-
- /// Expecting `!THEN`.
- #[error("`!THEN` expected in macro `!IF` construct.")]
- ExpectingThen,
-
- /// Expecting `!ELSE` or `!THEN`.
- #[error("`!ELSE` or `!THEN` expected in macro `!IF` construct.")]
- ExpectingElseOrIfEnd,
-
- /// Expecting `!IFEND`.
- #[error("`!IFEND` expected in macro `!IF` construct.")]
- ExpectingIfEnd,
-
- /// Expecting macro variable name.
- #[error("Expecting macro variable name following `{0}`.")]
- ExpectingMacroVarName(&'static str),
-
- /// Invalid macro variable name.
- #[error("Cannot use argument name or macro keyword {name} as `{construct}` variable name.")]
- BadMacroVarName {
- name: Identifier,
- construct: &'static str,
- },
-
- /// Expecting `=` following `!LET`.
- #[error("Expecting `=` following `!LET`.")]
- ExpectingEquals,
-
- /// Expecting `=` or `!IN` in `!DO` loop.
- #[error("Expecting `=` or `!IN` in `!DO` loop.")]
- ExpectingEqualsOrIn,
-
- /// Missing `!DOEND`.
- #[error("Missing `!DOEND`.")]
- MissingDoEnd,
-
- /// Bad numberic macro expression.
- #[error("Macro expression must evaluate to a number (not {0:?})")]
- BadNumericMacroExpression(String),
-
- /// Too many iteration for list-based loop.
- #[error("`!DO` loop over list exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
- MiterateList(usize),
-
- /// Too many iteration for numerical loop.
- #[error("Numerical `!DO` loop exceeded maximum number of iterations {0}. (Use `SET MITERATE` to change the limit.)")]
- MiterateNumeric(usize),
-
- /// Expecting `!TO` in numerical `!DO` loop.
- #[error("Expecting `!TO` in numerical `!DO` loop.")]
- ExpectingTo,
-
- /// `!BY` value cannot be zero.
- #[error("`!BY` value cannot be zero.")]
- ZeroBy,
-
- /// `!BREAK` outside `!DO`.
- #[error("`!BREAK` outside `!DO`.")]
- BreakOutsideDo,
-
- /// `,` or `)` expected in call to macro function.
- #[error("`,` or `)` expected in call to macro function `{0}`.")]
- ExpectingCommaOrRParen(Identifier),
-
- /// Macro function takes one argument.
- #[error("Macro function `{name}` takes one argument (not {n_args}).")]
- ExpectingOneArg { name: Identifier, n_args: usize },
-
- /// Macro function takes two arguments.
- #[error("Macro function `{name}` takes two arguments (not {n_args}).")]
- ExpectingTwoArgs { name: Identifier, n_args: usize },
-
- /// Macro function takes two or three arguments.
- #[error("Macro function `{name}` takes two or three arguments (not {n_args}).")]
- ExpectingTwoOrThreeArgs { name: Identifier, n_args: usize },
-
- /// Macro function needs at least one argument).
- #[error("Macro function `{name}` needs at least one argument).")]
- ExpectingOneOrMoreArgs { name: Identifier },
-
- /// Argument to `!BLANKS` must be non-negative integer (not `{0}`).
- #[error("Argument to `!BLANKS` must be non-negative integer (not `{0}`).")]
- InvalidBlanks(String),
-
- /// Second argument of `!SUBSTR` must be positive integer (not `{0}`).
- #[error("Second argument of `!SUBSTR` must be positive integer (not `{0}`).")]
- InvalidSubstr2(String),
-
- /// Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).
- #[error("Third argument of `!SUBSTR` must be non-negative integer (not `{0}`).")]
- InvalidSubstr3(String),
-}
-
-/// A PSPP macro as defined with `!DEFINE`.
-pub struct Macro {
- /// The macro's name. This is an ordinary identifier except that it is
- /// allowed (but not required) to begin with `!`.
- pub name: Identifier,
-
- /// Source code location of macro definition, for error reporting.
- pub location: Location,
-
- /// Parameters.
- parameters: Vec<Parameter>,
-
- /// Body.
- body: Vec<MacroToken>,
-}
-
-impl Macro {
- fn initial_state(&self) -> ParserState {
- if self.parameters.is_empty() {
- ParserState::Finished
- } else if self.parameters[0].is_positional() {
- ParserState::Keyword
- } else if let ValueType::Enclose(_, _) = self.parameters[0].arg {
- ParserState::Enclose
- } else {
- ParserState::Arg
- }
- }
-
- fn find_parameter(&self, name: &Identifier) -> Option<usize> {
- self.parameters.iter().position(|param| ¶m.name == name)
- }
-}
-
-struct Parameter {
- /// `!name` or `!1`.
- name: Identifier,
-
- /// Default value.
- ///
- /// The tokens don't include white space, etc. between them.
- default: Vec<MacroToken>,
-
- /// Macro-expand the value?
- expand_value: bool,
-
- /// How the argument is specified.
- arg: ValueType,
-}
-
-impl Parameter {
- /// Returns true if this is a positional parameter. Positional parameters
- /// are expanded by index (position) rather than by name.
- fn is_positional(&self) -> bool {
- self.name.0.as_bytes()[1].is_ascii_digit()
- }
-}
-
-enum ValueType {
- /// Argument consists of `.0` tokens.
- NTokens(usize),
-
- /// Argument runs until token `.0`.
- CharEnd(Token),
-
- /// Argument starts with token `.0` and ends with token `.1`.
- Enclose(Token, Token),
-
- /// Argument runs until the end of the command.
- CmdEnd,
-}
-
-/// A token and the syntax that was tokenized to produce it. The syntax allows
-/// the token to be turned back into syntax accurately.
-#[derive(Clone)]
-pub struct MacroToken {
- /// The token.
- pub token: Token,
-
- /// The syntax that produces `token`.
- pub syntax: String,
-}
-
-fn tokenize_string_into(
- s: &str,
- mode: Mode,
- error: &impl Fn(MacroError),
- output: &mut Vec<MacroToken>,
-) {
- for (syntax, token) in StringSegmenter::new(s, mode, true) {
- match token {
- ScanToken::Token(token) => output.push(MacroToken {
- token,
- syntax: String::from(syntax),
- }),
- ScanToken::Error(scan_error) => error(MacroError::ScanError(scan_error)),
- }
- }
-}
-
-fn tokenize_string(s: &str, mode: Mode, error: &impl Fn(MacroError)) -> Vec<MacroToken> {
- let mut tokens = Vec::new();
- tokenize_string_into(s, mode, error, &mut tokens);
- tokens
-}
-
-fn try_unquote_string(input: &String, mode: Mode) -> Option<String> {
- let mut scanner = StringScanner::new(input, mode, true);
- let Some(ScanToken::Token(Token::String(unquoted))) = scanner.next() else {
- return None;
- };
- let None = scanner.next() else { return None };
- return Some(unquoted);
-}
-
-fn unquote_string(input: String, mode: Mode) -> String {
- try_unquote_string(&input, mode).unwrap_or(input)
-}
-
-#[derive(Clone)]
-struct MacroTokens<'a>(&'a [MacroToken]);
-
-impl<'a> MacroTokens<'a> {
- fn is_empty(&self) -> bool {
- self.0.is_empty()
- }
- fn match_(&mut self, s: &str) -> bool {
- if let Some((first, rest)) = self.0.split_first() {
- if first.syntax.eq_ignore_ascii_case(s) {
- self.0 = rest;
- return true;
- }
- }
- false
- }
- fn take_relop(&mut self) -> Option<RelOp> {
- if let Some((first, rest)) = self.0.split_first() {
- if let Ok(relop) = first.syntax.as_str().try_into() {
- self.0 = rest;
- return Some(relop);
- }
- }
- None
- }
- fn macro_id(&self) -> Option<&Identifier> {
- self.0.get(0).map(|mt| mt.token.macro_id()).flatten()
- }
- fn take_macro_id(&mut self) -> Option<&Identifier> {
- let result = self.0.get(0).map(|mt| mt.token.macro_id()).flatten();
- if result.is_some() {
- self.advance();
- }
- result
- }
- fn take(&mut self) -> Option<&MacroToken> {
- match self.0.split_first() {
- Some((first, rest)) => {
- self.0 = rest;
- Some(first)
- }
- None => None,
- }
- }
- fn advance(&mut self) -> &MacroToken {
- let (first, rest) = self.0.split_first().unwrap();
- self.0 = rest;
- first
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum TokenClass {
- /// No space before or after (new-line after).
- EndCommand,
-
- /// Space on both sides.
- BinaryOperator,
-
- /// Space afterward.
- Comma,
-
- /// Don't need spaces except sequentially.
- Id,
-
- /// Don't need spaces except sequentially.
- Punct,
-}
-
-impl TokenClass {
- fn separator(prev: Self, next: Self) -> &'static str {
- match (prev, next) {
- // Don't need a separator before the end of a command, but we
- // need a new-line afterward.
- (_, Self::EndCommand) => "",
- (Self::EndCommand, _) => "\n",
-
- // Binary operators always have a space on both sides, and a comma always has a space afterward.
- (Self::BinaryOperator, _) | (_, Self::BinaryOperator) | (Self::Comma, _) => " ",
-
- // Otherwise, `prev` is `Self::Punct`, which only need a space if
- // there are two or them in a row.
- (Self::Punct, Self::Punct) => " ",
- _ => "",
- }
- }
-}
-
-impl From<&Token> for TokenClass {
- fn from(source: &Token) -> Self {
- match source {
- Token::End => Self::Punct,
- Token::Id(_) | Token::Number(_) | Token::String(_) => Self::Id,
- Token::EndCommand => Self::EndCommand,
- Token::Punct(punct) => match punct {
- Punct::LParen
- | Punct::RParen
- | Punct::LSquare
- | Punct::RSquare
- | Punct::LCurly
- | Punct::RCurly => Self::Punct,
-
- Punct::Plus
- | Punct::Dash
- | Punct::Asterisk
- | Punct::Slash
- | Punct::Equals
- | Punct::Colon
- | Punct::And
- | Punct::Or
- | Punct::Not
- | Punct::Eq
- | Punct::Ge
- | Punct::Gt
- | Punct::Le
- | Punct::Lt
- | Punct::Ne
- | Punct::All
- | Punct::By
- | Punct::To
- | Punct::With
- | Punct::Exp
- | Punct::Bang
- | Punct::Percent
- | Punct::Question
- | Punct::Backtick
- | Punct::Dot
- | Punct::Underscore
- | Punct::BangAsterisk => Self::BinaryOperator,
-
- Punct::Comma | Punct::Semicolon => Self::Comma,
- },
- }
- }
-}
-
-pub fn macro_tokens_to_syntax(input: &[MacroToken]) -> impl Iterator<Item = [&str; 2]> {
- input
- .iter()
- .take(1)
- .map(|token| ["", token.syntax.as_str()])
- .chain(input.windows(2).map(|w| {
- let c0 = (&w[0].token).into();
- let c1 = (&w[1].token).into();
- [TokenClass::separator(c0, c1), w[1].syntax.as_str()]
- }))
-}
-
-trait MacroId {
- fn macro_id(&self) -> Option<&Identifier>;
-}
-
-impl MacroId for Token {
- fn macro_id(&self) -> Option<&Identifier> {
- let id = self.id()?;
- id.0.starts_with('!').then_some(id)
- }
-}
-
-enum RelOp {
- Eq,
- Ne,
- Lt,
- Gt,
- Le,
- Ge,
-}
-
-impl TryFrom<&str> for RelOp {
- type Error = ();
-
- fn try_from(source: &str) -> Result<Self, Self::Error> {
- match source {
- "=" => Ok(Self::Eq),
- "~=" | "<>" => Ok(Self::Ne),
- "<" => Ok(Self::Lt),
- ">" => Ok(Self::Gt),
- "<=" => Ok(Self::Le),
- ">=" => Ok(Self::Ge),
- _ if source.len() == 3 && source.as_bytes()[0] == b'!' => match (
- source.as_bytes()[0].to_ascii_uppercase(),
- source.as_bytes()[1].to_ascii_uppercase(),
- ) {
- (b'E', b'Q') => Ok(Self::Eq),
- (b'N', b'E') => Ok(Self::Ne),
- (b'L', b'T') => Ok(Self::Lt),
- (b'L', b'E') => Ok(Self::Le),
- (b'G', b'T') => Ok(Self::Gt),
- (b'G', b'E') => Ok(Self::Ge),
- _ => Err(()),
- },
- _ => Err(()),
- }
- }
-}
-
-impl RelOp {
- fn evaluate(&self, cmp: Ordering) -> bool {
- match self {
- RelOp::Eq => cmp == Ordering::Equal,
- RelOp::Ne => cmp != Ordering::Equal,
- RelOp::Lt => cmp == Ordering::Less,
- RelOp::Gt => cmp == Ordering::Greater,
- RelOp::Le => cmp != Ordering::Greater,
- RelOp::Ge => cmp != Ordering::Less,
- }
- }
-}
-
-pub type MacroSet = HashMap<UniCase<String>, Macro>;
-
-enum ParserState {
- /// Accumulating tokens toward the end of any type of argument.
- Arg,
-
- /// Expecting the opening delimiter of an ARG_ENCLOSE argument.
- Enclose,
-
- /// Expecting a keyword for a keyword argument.
- Keyword,
-
- /// Expecting an equal sign for a keyword argument.
- Equals,
-
- /// Macro fully parsed and ready for expansion.
- Finished,
-}
-
-/// Macro call parser FSM.
-pub struct Parser<'a> {
- macros: &'a MacroSet,
- macro_: &'a Macro,
- state: ParserState,
- args: Box<[Option<Vec<MacroToken>>]>,
- arg_index: usize,
-
- /// Length of macro call so far.
- n_tokens: usize,
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum ParseStatus {
- Complete,
- Incomplete,
-}
-
-impl<'a> Parser<'a> {
- pub fn new(macros: &'a MacroSet, token: &Token) -> Option<Self> {
- let macro_ = macros.get(&token.id()?.0)?;
- Some(Self {
- macros,
- macro_,
- state: macro_.initial_state(),
- args: (0..macro_.parameters.len()).map(|_| None).collect(),
- arg_index: 0,
- n_tokens: 1,
- })
- }
-
- fn finished(&mut self) {
- self.state = ParserState::Finished;
- for (i, arg) in self.args.iter_mut().enumerate() {
- if arg.is_none() {
- *arg = Some(self.macro_.parameters[i].default.clone());
- }
- }
- self.state = ParserState::Finished;
- }
-
- fn next_arg(&mut self) {
- if self.macro_.parameters.is_empty() {
- self.finished()
- } else {
- let param = &self.macro_.parameters[self.arg_index];
- if param.is_positional() {
- self.arg_index += 1;
- if self.arg_index >= self.args.len() {
- self.finished()
- } else {
- let param = &self.macro_.parameters[self.arg_index];
- self.state = if !param.is_positional() {
- ParserState::Keyword
- } else if let ValueType::Enclose(_, _) = param.arg {
- ParserState::Enclose
- } else {
- ParserState::Arg
- };
- }
- } else {
- if self.args.iter().any(|arg| arg.is_none()) {
- self.state = ParserState::Keyword;
- } else {
- self.finished();
- }
- }
- }
- }
-
- fn push_arg(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
- let param = &self.macro_.parameters[self.args.len() - 1];
- if let Token::EndCommand | Token::End = token {
- if let Some(arg) = &self.args[self.arg_index] {
- let param = &self.macro_.parameters[self.args.len() - 1];
-
- match ¶m.arg {
- ValueType::NTokens(n) => error(MacroError::ExpectedMoreTokens {
- n: n - arg.len(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- }),
- ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
- error(MacroError::ExpectedToken {
- token: end.to_string(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- })
- }
- ValueType::CmdEnd => {
- // This is OK, it's the expected way to end the argument.
- }
- }
- }
- self.finished();
- }
-
- self.n_tokens += 1;
- let arg = self.args[self.arg_index].get_or_insert(Vec::new());
- let (
- add_token, // Should we add `mt` to the current arg?
- next_arg, // Should we advance to the next arg?
- ) = match ¶m.arg {
- ValueType::NTokens(n) => (arg.len() + 1 >= *n, true),
- ValueType::CharEnd(end) | ValueType::Enclose(_, end) => {
- let at_end = token == end;
- (at_end, !at_end)
- }
- ValueType::CmdEnd => (false, true),
- };
- if add_token {
- if true
- // !macro_expand_arg (&mt->token, mc->me, *argp)
- {
- arg.push(MacroToken {
- token: token.clone(),
- syntax: String::from(syntax),
- });
- }
- }
- if next_arg {
- self.next_arg()
- }
- }
-
- fn push_enclose(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
- let param = &self.macro_.parameters[self.arg_index];
- let ValueType::Enclose(start, _) = ¶m.arg else {
- unreachable!()
- };
- if token == start {
- self.n_tokens += 1;
- self.args[self.arg_index].get_or_insert(Vec::new());
- self.state = ParserState::Arg;
- } else if param.is_positional() && matches!(token, Token::End | Token::EndCommand) {
- self.finished();
- } else {
- error(MacroError::UnexpectedToken {
- actual: String::from(syntax),
- expected: start.to_string(),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- });
- self.finished();
- }
- }
-
- fn push_keyword(&mut self, token: &Token, _syntax: &str, error: &impl Fn(MacroError)) {
- let Some(id) = token.id() else {
- return self.finished();
- };
- let Some(arg_index) = self.macro_.find_parameter(id) else {
- return self.finished();
- };
- self.arg_index = arg_index;
- if self.args[arg_index].is_some() {
- error(MacroError::DuplicateArg {
- arg: id.clone(),
- macro_: self.macro_.name.clone(),
- });
- }
- self.args[arg_index] = Some(Vec::new());
- }
-
- fn push_equals(&mut self, token: &Token, syntax: &str, error: &impl Fn(MacroError)) {
- let param = &self.macro_.parameters[self.arg_index];
- if let Token::Punct(Punct::Eq) = token {
- self.n_tokens += 1;
- self.state = if let ValueType::Enclose(_, _) = param.arg {
- ParserState::Enclose
- } else {
- ParserState::Arg
- };
- } else {
- error(MacroError::UnexpectedToken {
- actual: syntax.into(),
- expected: String::from("="),
- arg: param.name.clone(),
- macro_: self.macro_.name.clone(),
- });
- self.finished()
- }
- }
-
- /// Adds `token`, which has the given `syntax`, to the collection of tokens
- /// in `self` that potentially need to be macro expanded.
- ///
- /// Returns [ParseStatus::Incomplete] if the macro expander needs more
- /// tokens, for macro arguments or to decide whether this is actually a
- /// macro invocation. The caller should call `push` again with the next
- /// token.
- ///
- /// Returns [ParseStatus::Complete] if the macro invocation is now complete.
- /// The caller should call [`Self::finish()`] to obtain the expansion.
- pub fn push(
- &mut self,
- token: &Token,
- syntax: &str,
- error: &impl Fn(MacroError),
- ) -> ParseStatus {
- match self.state {
- ParserState::Arg => self.push_arg(token, syntax, error),
- ParserState::Enclose => self.push_enclose(token, syntax, error),
- ParserState::Keyword => self.push_keyword(token, syntax, error),
- ParserState::Equals => self.push_equals(token, syntax, error),
- ParserState::Finished => (),
- }
- if let ParserState::Finished = self.state {
- ParseStatus::Complete
- } else {
- ParseStatus::Incomplete
- }
- }
-
- pub fn finish(self) -> Call<'a> {
- let ParserState::Finished = self.state else {
- panic!()
- };
- Call(self)
- }
-}
-
-/// Expansion stack entry.
-struct Frame {
- /// A macro name or `!IF`, `!DO`, etc.
- name: Option<Identifier>,
-
- /// Source location, if available.
- location: Option<Location>,
-}
-
-struct Expander<'a> {
- /// Macros to expand recursively.
- macros: &'a MacroSet,
-
- /// Error reporting callback.
- error: &'a Box<dyn Fn(MacroError) + 'a>,
-
- /// Tokenization mode.
- mode: Mode,
-
- /// Remaining nesting levels.
- nesting_countdown: usize,
-
- /// Stack for error reporting.
- stack: Vec<Frame>,
-
- // May macro calls be expanded?
- expand: &'a RefCell<bool>,
-
- /// Variables from `!DO` and `!LET`.
- vars: &'a RefCell<BTreeMap<Identifier, String>>,
-
- // Only set if inside a `!DO` loop. If true, break out of the loop.
- break_: Option<&'a mut bool>,
-
- /// Only set if expanding a macro (and not, say, a macro argument).
- macro_: Option<&'a Macro>,
-
- /// Only set if expanding a macro (and not, say, a macro argument).
- args: Option<&'a [Option<Vec<MacroToken>>]>,
-}
-
-fn bool_to_string(b: bool) -> String {
- if b {
- String::from("1")
- } else {
- String::from("0")
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-enum IfEndClause {
- Else,
- IfEnd,
-}
-
-fn macro_keywords() -> HashSet<Identifier> {
- let mut keywords = HashSet::new();
- for kw in [
- "!BREAK",
- "!CHAREND",
- "!CMDEND",
- "!DEFAULT",
- "!DO",
- "!DOEND",
- "!ELSE",
- "!ENCLOSE",
- "!ENDDEFINE",
- "!IF",
- "!IFEND",
- "!IN",
- "!LET",
- "!NOEXPAND",
- "!OFFEXPAND",
- "!ONEXPAND",
- "!POSITIONAL",
- "!THEN",
- "!TOKENS",
- ] {
- keywords.insert(Identifier::new(kw).unwrap());
- }
- keywords
-}
-
-fn is_macro_keyword(s: &Identifier) -> bool {
- lazy_static! {
- static ref KEYWORDS: HashSet<Identifier> = macro_keywords();
- }
- KEYWORDS.contains(s)
-}
-
-enum DoInput {
- List(Vec<String>),
- Up { first: f64, last: f64, by: f64 },
- Down { first: f64, last: f64, by: f64 },
- Empty,
-}
-
-impl DoInput {
- fn from_list(items: Vec<MacroToken>) -> Self {
- Self::List(
- items
- .into_iter()
- .rev()
- .take(Settings::global().macros.max_iterations + 1)
- .map(|mt| mt.syntax)
- .collect(),
- )
- }
-
- fn from_by(first: f64, last: f64, by: f64) -> Self {
- if by > 0.0 && first <= last {
- Self::Up { first, last, by }
- } else if by > 0.0 && first <= last {
- Self::Down { first, last, by }
- } else {
- Self::Empty
- }
- }
-}
-
-impl Iterator for DoInput {
- type Item = String;
-
- fn next(&mut self) -> Option<Self::Item> {
- match self {
- DoInput::List(vec) => vec.pop(),
- DoInput::Up { first, last, by } => {
- if first <= last {
- let value = *first;
- *first += *by;
- Some(format!("{value}"))
- } else {
- None
- }
- }
- DoInput::Down { first, last, by } => {
- if first >= last {
- let value = *first;
- *first += *by;
- Some(format!("{value}"))
- } else {
- None
- }
- }
- DoInput::Empty => None,
- }
- }
-}
-
-impl<'a> Expander<'a> {
- fn may_expand(&self) -> bool {
- *self.expand.borrow()
- }
-
- fn should_break(&self) -> bool {
- self.break_.as_ref().map(|b| **b).unwrap_or(false)
- }
-
- fn expand(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
- if self.nesting_countdown == 0 {
- (self.error)(MacroError::TooDeep {
- limit: Settings::global().macros.max_nest,
- });
- output.extend(take(&mut input.0).iter().cloned());
- } else {
- while !input.is_empty() && !self.should_break() {
- self.expand__(input, output);
- }
- }
- }
-
- fn expand_arg(&mut self, param_idx: usize, output: &mut Vec<MacroToken>) {
- let param = &self.macro_.unwrap().parameters[param_idx];
- let arg = &self.args.unwrap()[param_idx].as_ref().unwrap();
- if self.may_expand() && param.expand_value {
- let vars = RefCell::new(BTreeMap::new());
- let mut stack = take(&mut self.stack);
- stack.push(Frame {
- name: Some(param.name.clone()),
- location: None,
- });
- let mut subexpander = Expander {
- stack,
- vars: &vars,
- break_: None,
- macro_: None,
- args: None,
- ..*self
- };
- let mut arg_tokens = MacroTokens(&arg);
- subexpander.expand(&mut arg_tokens, output);
- self.stack = subexpander.stack;
- self.stack.pop();
- } else {
- output.extend(arg.iter().cloned());
- }
- }
- fn parse_function_args(
- &mut self,
- function: &Identifier,
- input: &mut MacroTokens,
- ) -> Option<Vec<String>> {
- input.advance();
- input.advance();
- let mut args = Vec::new();
- if input.match_(")") {
- return Some(args);
- }
- loop {
- args.push(self.parse_function_arg(input)?);
- match input.take() {
- Some(MacroToken {
- token: Token::Punct(Punct::Comma),
- ..
- }) => (),
- Some(MacroToken {
- token: Token::Punct(Punct::RParen),
- ..
- }) => return Some(args),
- _ => {
- (self.error)(MacroError::ExpectingCommaOrRParen(function.clone()));
- return None;
- }
- }
- }
- }
-
- fn expand_blanks(e: &mut Expander, args: Vec<String>) -> Option<String> {
- let Ok(n) = args[0].trim().parse::<usize>() else {
- (e.error)(MacroError::InvalidBlanks(args[0].clone()));
- return None;
- };
- Some(std::iter::repeat(' ').take(n).collect())
- }
-
- fn expand_concat(e: &mut Expander, args: Vec<String>) -> Option<String> {
- Some(
- args.into_iter()
- .map(|arg| unquote_string(arg, e.mode))
- .collect(),
- )
- }
-
- fn expand_eval(e: &mut Expander, args: Vec<String>) -> Option<String> {
- let tokens = tokenize_string(&args[0], e.mode, e.error);
- let mut stack = take(&mut e.stack);
- stack.push(Frame {
- name: Some(Identifier::new("!EVAL").unwrap()),
- location: None,
- });
- let mut break_ = false;
- let mut subexpander = Expander {
- break_: Some(&mut break_),
- stack,
- vars: e.vars,
- ..*e
- };
- let mut output = Vec::new();
- subexpander.expand(&mut MacroTokens(tokens.as_slice()), &mut output);
- subexpander.stack.pop();
- e.stack = subexpander.stack;
- Some(macro_tokens_to_syntax(&output).flatten().collect())
- }
-
- fn expand_head(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- let arg = unquote_string(args.remove(0), e.mode);
- let mut output = tokenize_string(&arg, e.mode, e.error);
- if output.is_empty() {
- Some(String::new())
- } else {
- Some(output.swap_remove(0).syntax)
- }
- }
-
- fn expand_index(_e: &mut Expander, args: Vec<String>) -> Option<String> {
- let haystack = &args[0];
- let needle = &args[1];
- let position = haystack.find(needle);
- Some(format!(
- "{}",
- position.map_or(0, |position| &haystack[0..position].chars().count() + 1)
- ))
- }
-
- fn expand_length(_e: &mut Expander, args: Vec<String>) -> Option<String> {
- Some(format!("{}", args[0].chars().count()))
- }
-
- fn expand_quote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- let arg = args.remove(0);
- if try_unquote_string(&arg, e.mode).is_some() {
- Some(arg)
- } else {
- let mut output = String::with_capacity(arg.len() + 2);
- output.push('\'');
- for c in arg.chars() {
- if c == '"' {
- output.push('\'');
- }
- output.push(c);
- }
- output.push('\'');
- Some(output)
- }
- }
-
- fn expand_substr(e: &mut Expander, args: Vec<String>) -> Option<String> {
- let Ok(start) = args[1].trim().parse::<NonZeroUsize>() else {
- (e.error)(MacroError::InvalidSubstr3(args[0].clone()));
- return None;
- };
- let start = start.get();
- let Ok(count) = args[2].trim().parse::<usize>() else {
- (e.error)(MacroError::InvalidSubstr2(args[0].clone()));
- return None;
- };
-
- Some(args[0].chars().skip(start - 1).take(count).collect())
- }
-
- fn expand_tail(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- let arg = unquote_string(args.remove(0), e.mode);
- let mut output = tokenize_string(&arg, e.mode, e.error);
- Some(
- output
- .pop()
- .map_or_else(|| String::new(), |tail| tail.syntax),
- )
- }
-
- fn expand_unquote(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- Some(unquote_string(args.remove(0), e.mode))
- }
-
- fn expand_upcase(e: &mut Expander, mut args: Vec<String>) -> Option<String> {
- Some(unquote_string(args.remove(0), e.mode).to_uppercase())
- }
-
- fn expand_macro_function(&mut self, orig_input: &mut MacroTokens) -> Option<String> {
- let mut input = orig_input.clone();
- let name = input.macro_id()?;
- if name == "!NULL" {
- return Some(String::new());
- }
- if input.0.len() < 2 || !matches!(input.0[1].token, Token::Punct(Punct::LParen)) {
- return None;
- }
-
- struct MacroFunction {
- name: Identifier,
- args: RangeInclusive<usize>,
- parser: fn(&mut Expander, Vec<String>) -> Option<String>,
- }
- impl MacroFunction {
- fn new(
- name: &str,
- args: RangeInclusive<usize>,
- parser: fn(&mut Expander, Vec<String>) -> Option<String>,
- ) -> Self {
- Self {
- name: Identifier::new(name).unwrap(),
- args,
- parser,
- }
- }
- }
- lazy_static! {
- static ref MACRO_FUNCTIONS: [MacroFunction; 11] = [
- MacroFunction::new("!BLANKS", 1..=1, Expander::expand_blanks),
- MacroFunction::new("!CONCAT", 1..=usize::MAX, Expander::expand_concat),
- MacroFunction::new("!HEAD", 1..=1, Expander::expand_head),
- MacroFunction::new("!INDEX", 2..=2, Expander::expand_index),
- MacroFunction::new("!LENGTH", 1..=1, Expander::expand_length),
- MacroFunction::new("!QUOTE", 1..=1, Expander::expand_quote),
- MacroFunction::new("!SUBSTR", 2..=3, Expander::expand_substr),
- MacroFunction::new("!TAIL", 1..=1, Expander::expand_tail),
- MacroFunction::new("!UNQUOTE", 1..=1, Expander::expand_unquote),
- MacroFunction::new("!UPCASE", 1..=1, Expander::expand_upcase),
- MacroFunction::new("!EVAL", 1..=1, Expander::expand_eval),
- ];
- }
-
- let function = MACRO_FUNCTIONS.iter().find(|mf| &mf.name == name)?;
-
- let args = self.parse_function_args(&function.name, &mut input)?;
-
- let n_args = args.len();
- if !function.args.contains(&n_args) {
- let name = function.name.clone();
- let error = match &function.args {
- x if x == &(1..=1) => MacroError::ExpectingOneArg { name, n_args },
- x if x == &(2..=2) => MacroError::ExpectingTwoArgs { name, n_args },
- x if x == &(2..=3) => MacroError::ExpectingTwoOrThreeArgs { name, n_args },
- x if x == &(1..=usize::MAX) => MacroError::ExpectingOneOrMoreArgs { name },
- _ => unreachable!(),
- };
- (self.error)(error);
- return None;
- }
-
- *orig_input = input;
- (function.parser)(self, args)
- }
-
- /// Parses one function argument from `input`. Each argument to a macro
- /// function is one of:
- ///
- /// - A quoted string or other single literal token.
- ///
- /// - An argument to the macro being expanded, e.g. `!1` or a named
- /// argument.
- ///
- /// - `!*`.
- ///
- /// - A function invocation.
- ///
- /// Each function invocation yields a character sequence to be turned into a
- /// sequence of tokens. The case where that character sequence is a single
- /// quoted string is an important special case.
- fn parse_function_arg(&mut self, input: &mut MacroTokens) -> Option<String> {
- if let Some(macro_) = self.macro_ {
- match &input.0.get(0)?.token {
- Token::Id(id) if id.0.starts_with('!') => {
- if let Some(param_idx) = macro_.find_parameter(id) {
- input.advance();
- return Some(
- macro_tokens_to_syntax(self.args.unwrap()[param_idx].as_ref().unwrap())
- .flatten()
- .collect(),
- );
- }
- if let Some(value) = self.vars.borrow().get(id) {
- return Some(value.clone());
- }
-
- if let Some(output) = self.expand_macro_function(input) {
- return Some(output);
- }
- }
- Token::Punct(Punct::BangAsterisk) => {
- let mut arg = String::new();
- for i in 0..macro_.parameters.len() {
- if !macro_.parameters[i].is_positional() {
- break;
- }
- if i > 0 {
- arg.push(' ')
- }
- arg.extend(
- macro_tokens_to_syntax(self.args.unwrap()[i].as_ref().unwrap())
- .flatten(),
- );
- }
- input.advance();
- return Some(arg);
- }
- _ => (),
- }
- }
- Some(input.advance().syntax.clone())
- }
-
- fn evaluate_literal(&mut self, input: &mut MacroTokens) -> Option<String> {
- if input.match_("(") {
- let value = self.evaluate_or(input)?;
- if input.match_(")") {
- Some(value)
- } else {
- (self.error)(MacroError::ExpectingRParen);
- None
- }
- } else if input.match_(")") {
- (self.error)(MacroError::ExpectingLiteral);
- None
- } else {
- Some(unquote_string(self.parse_function_arg(input)?, self.mode))
- }
- }
-
- fn evaluate_relational(&mut self, input: &mut MacroTokens) -> Option<String> {
- let lhs = self.evaluate_literal(input)?;
- let Some(relop) = input.take_relop() else {
- return Some(lhs);
- };
- let rhs = self.evaluate_literal(input)?;
- let cmp = unquote_string(lhs, self.mode).cmp(&unquote_string(rhs, self.mode));
- Some(bool_to_string(relop.evaluate(cmp)))
- }
-
- fn evaluate_not(&mut self, input: &mut MacroTokens) -> Option<String> {
- let mut negations = 0;
- while input.match_("!AND") || input.match_("&") {
- negations += 1;
- }
-
- let operand = self.evaluate_relational(input)?;
- if negations == 0 {
- return Some(operand);
- }
-
- let mut b = operand != "0";
- if negations.is_odd() {
- b = !b;
- }
- Some(bool_to_string(b))
- }
-
- fn evaluate_and(&mut self, input: &mut MacroTokens) -> Option<String> {
- let mut lhs = self.evaluate_not(input)?;
- while input.match_("!AND") || input.match_("&") {
- let rhs = self.evaluate_not(input)?;
- lhs = bool_to_string(lhs != "0" && rhs != "0");
- }
- Some(lhs)
- }
- fn evaluate_or(&mut self, input: &mut MacroTokens) -> Option<String> {
- let mut lhs = self.evaluate_and(input)?;
- while input.match_("!OR") || input.match_("|") {
- let rhs = self.evaluate_and(input)?;
- lhs = bool_to_string(lhs != "0" || rhs != "0");
- }
- Some(lhs)
- }
-
- fn evaluate_expression(&mut self, input: &mut MacroTokens) -> Option<String> {
- self.evaluate_or(input)
- }
-
- fn evaluate_number(&mut self, input: &mut MacroTokens) -> Option<f64> {
- let s = self.evaluate_expression(input)?;
- let tokens = tokenize_string(&s, self.mode, self.error);
- let (
- Some(MacroToken {
- token: Token::Number(number),
- ..
- }),
- 1,
- ) = (tokens.get(0), tokens.len())
- else {
- (self.error)(MacroError::BadNumericMacroExpression(s));
- return None;
- };
-
- Some(*number)
- }
-
- fn find_ifend_clause<'b>(
- input: &mut MacroTokens<'b>,
- ) -> Option<(MacroTokens<'b>, IfEndClause)> {
- let input_copy = input.clone();
- let mut nesting = 0;
- while !input.is_empty() {
- if input.match_("!IF") {
- nesting += 1;
- } else if input.match_("!IFEND") {
- if nesting == 0 {
- return Some((
- MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
- IfEndClause::IfEnd,
- ));
- }
- nesting -= 1;
- } else if input.match_("!ELSE") && nesting == 0 {
- return Some((
- MacroTokens(&input_copy.0[..input_copy.0.len() - input.0.len() - 1]),
- IfEndClause::Else,
- ));
- } else {
- input.advance();
- }
- }
- return None;
- }
- fn expand_if(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
- let mut input = orig_input.clone();
- if !input.match_("!IF") {
- return false;
- }
- let Some(result) = self.evaluate_expression(&mut input) else {
- return false;
- };
- if !input.match_("!THEN") {
- (self.error)(MacroError::ExpectingThen);
- return false;
- }
-
- let Some((if_tokens, clause)) = Self::find_ifend_clause(&mut input) else {
- (self.error)(MacroError::ExpectingElseOrIfEnd);
- return false;
- };
-
- let else_tokens = match clause {
- IfEndClause::Else => {
- let Some((else_tokens, IfEndClause::IfEnd)) = Self::find_ifend_clause(&mut input)
- else {
- (self.error)(MacroError::ExpectingIfEnd);
- return false;
- };
- Some(else_tokens)
- }
- IfEndClause::IfEnd => None,
- };
-
- let subinput = match result.as_str() {
- "0" => else_tokens,
- _ => Some(if_tokens),
- };
- if let Some(mut subinput) = subinput {
- self.stack.push(Frame {
- name: Some(Identifier::new("!IF").unwrap()),
- location: None,
- });
- self.expand(&mut subinput, output);
- self.stack.pop();
- }
- *orig_input = input;
- true
- }
-
- fn take_macro_var_name(
- &mut self,
- input: &mut MacroTokens,
- construct: &'static str,
- ) -> Option<Identifier> {
- let Some(var_name) = input.take_macro_id() else {
- (self.error)(MacroError::ExpectingMacroVarName(construct));
- return None;
- };
- if is_macro_keyword(var_name)
- || self
- .macro_
- .map(|m| m.find_parameter(var_name))
- .flatten()
- .is_some()
- {
- (self.error)(MacroError::BadMacroVarName {
- name: var_name.clone(),
- construct,
- });
- None
- } else {
- Some(var_name.clone())
- }
- }
-
- fn expand_let(&mut self, orig_input: &mut MacroTokens) -> bool {
- let mut input = orig_input.clone();
- if !input.match_("!LET") {
- return false;
- }
-
- let Some(var_name) = self.take_macro_var_name(&mut input, "!LET") else {
- return false;
- };
- input.advance();
-
- if !input.match_("=") {
- (self.error)(MacroError::ExpectingEquals);
- return false;
- }
-
- let Some(value) = self.evaluate_expression(&mut input) else {
- return false;
- };
- self.vars.borrow_mut().insert(var_name.clone(), value);
- *orig_input = input;
- true
- }
-
- fn find_doend<'b>(&mut self, input: &mut MacroTokens<'b>) -> Option<MacroTokens<'b>> {
- let input_copy = input.clone();
- let mut nesting = 0;
- while !input.is_empty() {
- if input.match_("!DO") {
- nesting += 1;
- } else if input.match_("!DOEND") {
- if nesting == 0 {
- return Some(MacroTokens(
- &input_copy.0[..input_copy.0.len() - input.0.len() - 1],
- ));
- }
- nesting -= 1;
- } else {
- input.advance();
- }
- }
- (self.error)(MacroError::MissingDoEnd);
- return None;
- }
-
- fn expand_do(&mut self, orig_input: &mut MacroTokens, output: &mut Vec<MacroToken>) -> bool {
- let mut input = orig_input.clone();
- if !input.match_("!DO") {
- return false;
- }
-
- let Some(var_name) = self.take_macro_var_name(&mut input, "!DO") else {
- return false;
- };
-
- let (items, miterate_error) = if input.match_("!IN") {
- let Some(list) = self.evaluate_expression(&mut input) else {
- return false;
- };
- let items = tokenize_string(list.as_str(), self.mode, &self.error);
- (
- DoInput::from_list(items),
- MacroError::MiterateList(Settings::global().macros.max_iterations),
- )
- } else if input.match_("=") {
- let Some(first) = self.evaluate_number(&mut input) else {
- return false;
- };
- if !input.match_("!TO") {
- (self.error)(MacroError::ExpectingTo);
- return false;
- }
- let Some(last) = self.evaluate_number(&mut input) else {
- return false;
- };
- let by = if input.match_("!BY") {
- let Some(by) = self.evaluate_number(&mut input) else {
- return false;
- };
- if by == 0.0 {
- (self.error)(MacroError::ZeroBy);
- return false;
- }
- by
- } else {
- 1.0
- };
- (
- DoInput::from_by(first, last, by),
- MacroError::MiterateNumeric(Settings::global().macros.max_iterations),
- )
- } else {
- (self.error)(MacroError::ExpectingEqualsOrIn);
- return false;
- };
-
- let Some(body) = self.find_doend(&mut input) else {
- return false;
- };
-
- let mut stack = take(&mut self.stack);
- stack.push(Frame {
- name: Some(Identifier::new("!DO").unwrap()),
- location: None,
- });
- let mut break_ = false;
- let mut subexpander = Expander {
- break_: Some(&mut break_),
- stack,
- vars: self.vars,
- ..*self
- };
-
- for (i, item) in items.enumerate() {
- if subexpander.should_break() {
- break;
- }
- if i >= Settings::global().macros.max_iterations {
- (self.error)(miterate_error);
- break;
- }
- let mut vars = self.vars.borrow_mut();
- if let Some(value) = vars.get_mut(&var_name) {
- *value = item;
- } else {
- vars.insert(var_name.clone(), item);
- }
- subexpander.expand(&mut body.clone(), output);
- }
- *orig_input = input;
- true
- }
-
- fn expand__(&mut self, input: &mut MacroTokens, output: &mut Vec<MacroToken>) {
- // Recursive macro calls.
- if self.may_expand() {
- if let Some(call) = Call::for_tokens(self.macros, &input.0, &self.error) {
- let vars = RefCell::new(BTreeMap::new());
- let mut stack = take(&mut self.stack);
- stack.push(Frame {
- name: Some(call.0.macro_.name.clone()),
- location: Some(call.0.macro_.location.clone()),
- });
- let mut subexpander = Expander {
- break_: None,
- vars: &vars,
- nesting_countdown: self.nesting_countdown.saturating_sub(1),
- stack,
- ..*self
- };
- let mut body = MacroTokens(call.0.macro_.body.as_slice());
- subexpander.expand(&mut body, output);
- self.stack = subexpander.stack;
- self.stack.pop();
- input.0 = &input.0[call.len()..];
- return;
- }
- }
-
- // Only identifiers beginning with `!` receive further processing.
- let id = match &input.0[0].token {
- Token::Id(id) if id.0.starts_with('!') => id,
- Token::Punct(Punct::BangAsterisk) => {
- if let Some(macro_) = self.macro_ {
- for i in 0..macro_.parameters.len() {
- self.expand_arg(i, output);
- }
- } else {
- (self.error)(MacroError::InvalidBangAsterisk);
- }
- input.advance();
- return;
- }
- _ => {
- output.push(input.advance().clone());
- return;
- }
- };
-
- // Macro arguments.
- if let Some(macro_) = self.macro_ {
- if let Some(param_idx) = macro_.find_parameter(id) {
- self.expand_arg(param_idx, output);
- input.advance();
- return;
- }
- }
-
- // Variables set by `!DO` or `!LET`.
- if let Some(value) = self.vars.borrow().get(id) {
- tokenize_string_into(value.as_str(), self.mode, &self.error, output);
- input.advance();
- return;
- }
-
- // Macro functions.
- if self.expand_if(input, output) {
- return;
- }
- if self.expand_let(input) {
- return;
- }
- if self.expand_do(input, output) {
- return;
- }
-
- if input.match_("!BREAK") {
- if let Some(ref mut break_) = self.break_ {
- **break_ = true;
- } else {
- (self.error)(MacroError::BreakOutsideDo);
- }
- return;
- }
-
- if input.match_("!ONEXPAND") {
- *self.expand.borrow_mut() = true;
- } else if input.match_("!OFFEXPAND") {
- *self.expand.borrow_mut() = false;
- } else {
- output.push(input.advance().clone());
- }
- }
-}
-
-pub struct Call<'a>(Parser<'a>);
-
-impl<'a> Call<'a> {
- pub fn for_tokens<F>(macros: &'a MacroSet, tokens: &[MacroToken], error: &F) -> Option<Self>
- where
- F: Fn(MacroError),
- {
- let mut parser = Parser::new(macros, &tokens.get(0)?.token)?;
- for token in tokens[1..].iter().chain(&[MacroToken {
- token: Token::EndCommand,
- syntax: String::from(""),
- }]) {
- if parser.push(&token.token, &token.syntax, error) == ParseStatus::Complete {
- return Some(parser.finish());
- }
- }
- return None;
- }
-
- pub fn expand<F>(&self, mode: Mode, call_loc: Location, output: &mut Vec<MacroToken>, error: F)
- where
- F: Fn(MacroError) + 'a,
- {
- let error: Box<dyn Fn(MacroError) + 'a> = Box::new(error);
- let vars = RefCell::new(BTreeMap::new());
- let expand = RefCell::new(true);
- let mut me = Expander {
- macros: self.0.macros,
- error: &error,
- macro_: Some(self.0.macro_),
- args: Some(&self.0.args),
- mode,
- nesting_countdown: Settings::global().macros.max_nest,
- stack: vec![
- Frame {
- name: None,
- location: Some(call_loc),
- },
- Frame {
- name: Some(self.0.macro_.name.clone()),
- location: Some(self.0.macro_.location.clone()),
- },
- ],
- vars: &vars,
- break_: None,
- expand: &expand,
- };
- let mut body = MacroTokens(&self.0.macro_.body);
- me.expand(&mut body, output);
- }
-
- /// Returns the number of tokens consumed from the input for the macro
- /// invocation. If the result is 0, then there was no macro invocation and
- /// the expansion will be empty.
- pub fn len(&self) -> usize {
- self.0.n_tokens
- }
-}
+++ /dev/null
-/* PSPP - a program for statistical analysis.
- * Copyright (C) 2023 Free Software Foundation, Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>. */
-
-use anyhow::Result;
-use clap::{Parser, ValueEnum};
-use encoding_rs::Encoding;
-use pspp::raw::{encoding_from_headers, Decoder, Magic, Reader, Record};
-use std::fs::File;
-use std::io::BufReader;
-use std::path::{Path, PathBuf};
-use std::str;
-use thiserror::Error as ThisError;
-
-/// A utility to dissect SPSS system files.
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
- /// Maximum number of cases to print.
- #[arg(long = "data", default_value_t = 0)]
- max_cases: u64,
-
- /// Files to dissect.
- #[arg(required = true)]
- files: Vec<PathBuf>,
-
- /// How to dissect the file.
- #[arg(short, long, value_enum, default_value_t)]
- mode: Mode,
-
- /// The encoding to use.
- #[arg(long, value_parser = parse_encoding)]
- encoding: Option<&'static Encoding>,
-}
-
-#[derive(ThisError, Debug)]
-#[error("{0}: unknown encoding")]
-struct UnknownEncodingError(String);
-
-fn parse_encoding(arg: &str) -> Result<&'static Encoding, UnknownEncodingError> {
- match Encoding::for_label_no_replacement(arg.as_bytes()) {
- Some(encoding) => Ok(encoding),
- None => Err(UnknownEncodingError(arg.to_string())),
- }
-}
-
-#[derive(Clone, Copy, Debug, Default, ValueEnum)]
-enum Mode {
- Identify,
- Raw,
- Decoded,
- #[default]
- Cooked,
-}
-
-fn main() -> Result<()> {
- let Args {
- max_cases,
- files,
- mode,
- encoding,
- } = Args::parse();
-
- for file in files {
- dissect(&file, max_cases, mode, encoding)?;
- }
- Ok(())
-}
-
-fn dissect(
- file_name: &Path,
- max_cases: u64,
- mode: Mode,
- encoding: Option<&'static Encoding>,
-) -> Result<()> {
- let reader = File::open(file_name)?;
- let reader = BufReader::new(reader);
- let mut reader = Reader::new(reader, |warning| println!("{warning}"))?;
-
- match mode {
- Mode::Identify => {
- let Record::Header(header) = reader.next().unwrap()? else {
- unreachable!()
- };
- match header.magic {
- Magic::Sav => println!("SPSS System File"),
- Magic::Zsav => println!("SPSS System File with Zlib compression"),
- Magic::Ebcdic => println!("EBCDIC-encoded SPSS System File"),
- }
- return Ok(());
- }
- Mode::Raw => {
- for header in reader {
- let header = header?;
- println!("{:?}", header);
- if let Record::Cases(cases) = header {
- let mut cases = cases.borrow_mut();
- for _ in 0..max_cases {
- let Some(Ok(record)) = cases.next() else {
- break;
- };
- println!("{:?}", record);
- }
- }
- }
- }
- Mode::Decoded => {
- let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
- let encoding = match encoding {
- Some(encoding) => encoding,
- None => encoding_from_headers(&headers, &|e| eprintln!("{e}"))?,
- };
- let decoder = Decoder::new(encoding, |e| eprintln!("{e}"));
- for header in headers {
- let header = header.decode(&decoder);
- println!("{:?}", header);
- /*
- if let Record::Cases(cases) = header {
- let mut cases = cases.borrow_mut();
- for _ in 0..max_cases {
- let Some(Ok(record)) = cases.next() else {
- break;
- };
- println!("{:?}", record);
- }
- }
- */
- }
- }
- Mode::Cooked => {
- /*
- let headers: Vec<Record> = reader.collect::<Result<Vec<_>, _>>()?;
- let encoding = encoding_from_headers(&headers, &|e| eprintln!("{e}"))?;
- let (headers, _) = decode(headers, encoding, &|e| eprintln!("{e}"))?;
- for header in headers {
- println!("{header:?}");
- }
- */
- }
- }
-
- Ok(())
-}
+++ /dev/null
-use std::{
- cmp::{max, min},
- fmt::{Display, Formatter, Result as FmtResult},
- ops::Range,
- sync::Arc,
-};
-
-use enum_map::Enum;
-use unicode_width::UnicodeWidthStr;
-
-/// A line number and optional column number within a source file.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub struct Point {
- /// 1-based line number.
- pub line: i32,
-
- /// 1-based column number.
- ///
- /// Column numbers are measured according to the width of characters as
- /// shown in a typical fixed-width font, in which CJK characters have width
- /// 2 and combining characters have width 0, as measured by the
- /// `unicode_width` crate.
- pub column: Option<i32>,
-}
-
-impl Point {
- /// Takes `point`, adds to it the syntax in `syntax`, incrementing the line
- /// number for each new-line in `syntax` and the column number for each
- /// column, and returns the result.
- pub fn advance(&self, syntax: &str) -> Self {
- let mut result = *self;
- for line in syntax.split_inclusive('\n') {
- if line.ends_with('\n') {
- result.line += 1;
- result.column = Some(1);
- } else {
- result.column = result.column.map(|column| column + line.width() as i32);
- }
- }
- result
- }
-
- pub fn without_column(&self) -> Self {
- Self {
- line: self.line,
- column: None,
- }
- }
-}
-
-/// Location relevant to an diagnostic message.
-#[derive(Clone, Debug)]
-pub struct Location {
- /// File name, if any.
- pub file_name: Option<Arc<String>>,
-
- /// Starting and ending point, if any.
- pub span: Option<Range<Point>>,
-
- /// Normally, if `span` contains column information, then displaying the
- /// message will underline the location. Setting this to true disables
- /// displaying underlines.
- pub omit_underlines: bool,
-}
-
-impl Display for Location {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- if let Some(file_name) = &self.file_name {
- write!(f, "{}", file_name)?;
- }
-
- if let Some(span) = &self.span {
- if self.file_name.is_some() {
- write!(f, ":")?;
- }
- let l1 = span.start.line;
- let l2 = span.end.line;
- if let (Some(c1), Some(c2)) = (span.start.column, span.end.column) {
- if l2 > l1 {
- write!(f, "{l1}.{c1}-{l2}.{}", c2 - 1)?;
- } else {
- write!(f, "{l1}.{c1}-{}", c2 - 1)?;
- }
- } else {
- if l2 > l1 {
- write!(f, "{l1}-{l2}")?;
- } else {
- write!(f, "{l1}")?;
- }
- }
- }
- Ok(())
- }
-}
-
-impl Location {
- pub fn without_columns(&self) -> Self {
- Self {
- file_name: self.file_name.clone(),
- span: self
- .span
- .as_ref()
- .map(|span| span.start.without_column()..span.end.without_column()),
- omit_underlines: self.omit_underlines,
- }
- }
- pub fn merge(a: Option<Self>, b: &Option<Self>) -> Option<Self> {
- let Some(a) = a else { return b.clone() };
- let Some(b) = b else { return Some(a) };
- if a.file_name != b.file_name {
- // Failure.
- return Some(a);
- }
- let span = match (&a.span, &b.span) {
- (None, None) => None,
- (Some(r), None) | (None, Some(r)) => Some(r.clone()),
- (Some(ar), Some(br)) => {
- Some(min(ar.start, br.start).clone()..max(ar.end, br.end).clone())
- }
- };
- Some(Self {
- file_name: a.file_name,
- span,
- omit_underlines: a.omit_underlines || b.omit_underlines,
- })
- }
- pub fn is_empty(&self) -> bool {
- self.file_name.is_none() && self.span.is_none()
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, Enum)]
-pub enum Severity {
- Error,
- Warning,
- Note,
-}
-
-impl Severity {
- fn as_str(&self) -> &'static str {
- match self {
- Severity::Error => "error",
- Severity::Warning => "warning",
- Severity::Note => "note",
- }
- }
-}
-
-impl Display for Severity {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- write!(f, "{}", self.as_str())
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Category {
- General,
- Syntax,
- Data,
-}
-
-pub struct Stack {
- location: Location,
- description: String,
-}
-
-pub struct Diagnostic {
- pub severity: Severity,
- pub category: Category,
- pub location: Location,
- pub source: Vec<(i32, String)>,
- pub stack: Vec<Stack>,
- pub command_name: Option<&'static str>,
- pub text: String,
-}
-
-impl Display for Diagnostic {
- fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
- for Stack {
- location,
- description,
- } in &self.stack
- {
- if !!location.is_empty() {
- write!(f, "{location}: ")?;
- }
- writeln!(f, "{description}")?;
- }
- if self.category != Category::General && !self.location.is_empty() {
- write!(f, "{}: ", self.location)?;
- }
-
- write!(f, "{}: ", self.severity)?;
-
- match self.command_name {
- Some(command_name) if self.category == Category::Syntax => {
- write!(f, "{command_name}: ")?
- }
- _ => (),
- }
-
- write!(f, "{}", self.text)?;
-
- if let Some(Range {
- start: Point {
- line: l0,
- column: Some(c0),
- },
- end: Point {
- line: l1,
- column: Some(c1),
- },
- }) = self.location.span
- {
- let mut prev_line_number = None;
- for (line_number, line) in &self.source {
- if let Some(prev_line_number) = prev_line_number {
- if *line_number != prev_line_number + 1 {
- write!(f, "\n ... |")?;
- }
- }
- prev_line_number = Some(line_number);
-
- write!(f, "\n{line_number:5} | {line}")?;
-
- if !self.location.omit_underlines {
- let c0 = if *line_number == l0 { c0 } else { 1 };
- let c1 = if *line_number == l1 {
- c1
- } else {
- line.width() as i32
- };
- write!(f, "\n |")?;
- for _ in 0..c0 {
- f.write_str(" ")?;
- }
- if *line_number == l0 {
- f.write_str("^")?;
- for _ in c0..c1 {
- f.write_str("~")?;
- }
- } else {
- for _ in c0..=c1 {
- f.write_str("~")?;
- }
- }
- }
- }
- }
- Ok(())
- }
-}
+++ /dev/null
-use std::sync::Arc;
-
-use self::pivot::Value;
-
-pub mod pivot;
-
-/// A single output item.
-pub struct Item {
- /// The localized label for the item that appears in the outline pane in the
- /// output viewer and in PDF outlines. This is `None` if no label has been
- /// explicitly set.
- label: Option<String>,
-
- /// A locale-invariant identifier for the command that produced the output,
- /// which may be `None` if unknown or if a command did not produce this
- /// output.
- command_name: Option<String>,
-
- /// For a group item, this is true if the group's subtree should
- /// be expanded in an outline view, false otherwise.
- ///
- /// For other kinds of output items, this is true to show the item's
- /// content, false to hide it. The item's label is always shown in an
- /// outline view.
- show: bool,
-
- /// Item details.
- details: Details,
-}
-
-pub enum Details {
- Chart,
- Image,
- Group(Vec<Arc<Item>>),
- Message,
- Table,
- Text(Text),
-}
-
-pub struct Text {
- type_: TextType,
-
- content: Value,
-}
-
-pub enum TextType {
- /// `TITLE` and `SUBTITLE` commands.
- PageTitle,
-
- /// Title,
- Title,
-
- /// Syntax printback logging.
- Syntax,
-
- /// Other logging.
- Log,
-}
+++ /dev/null
-//! Pivot tables.
-//!
-//! Pivot tables are PSPP's primary form of output. They are analogous to the
-//! pivot tables you might be familiar with from spreadsheets and databases.
-//! See <https://en.wikipedia.org/wiki/Pivot_table> for a brief introduction to
-//! the overall concept of a pivot table.
-//!
-//! In PSPP, the most important internal pieces of a pivot table are:
-//!
-//! - Title. Every pivot table has a title that is displayed above it. It also
-//! has an optional caption (displayed below it) and corner text (displayed in
-//! the upper left corner).
-//!
-//! - Dimensions. A dimension consists of zero or more categories. A category
-//! has a label, such as "df" or "Asymp. Sig." or 123 or a variable name. The
-//! categories are the leaves of a tree whose non-leaf nodes form groups of
-//! categories. The tree always has a root group whose label is the name of
-//! the dimension.
-//!
-//! - Axes. A table has three axes: column, row, and layer. Each dimension is
-//! assigned to an axis, and each axis has zero or more dimensions. When an
-//! axis has more than one dimension, they are ordered from innermost to
-//! outermost.
-//!
-//! - Data. A table's data consists of zero or more cells. Each cell maps from
-//! a category for each dimension to a value, which is commonly a number but
-//! could also be a variable name or an arbitrary text string.
-//!
-//! Creating a pivot table usually consists of the following steps:
-//!
-//! 1. Create the table with pivot_table_create(), passing in the title.
-//!
-//! 2. Optionally, set the format to use for "count" values with
-//! pivot_table_set_weight_var() or pivot_table_set_weight_format().
-//!
-//! 3. Create each dimension with pivot_dimension_create() and populate it with
-//! categories and, possibly, with groups that contain the categories. This
-//! call also assigns the dimension to an axis.
-//!
-//! In simple cases, only a call to pivot_dimension_create() is needed.
-//! Other functions such as pivot_category_create_group() can be used for
-//! hierarchies of categories.
-//!
-//! Sometimes it's easier to create categories in tandem with inserting data,
-//! for example by adding a category for a variable just before inserting the
-//! first cell for that variable. In that case, creating categories and
-//! inserting data can be interleaved.
-//!
-//! 4. Insert data. For each cell, supply the category indexes, which are
-//! assigned starting from 0 in the order in which the categories were
-//! created in step 2, and the value to go in the cell. If the table has a
-//! small, fixed number of dimensions, functions like, e.g.
-//! pivot_table_put3() for 3 dimensions, can be used. The general function
-//! pivot_table_put() works for other cases.
-//!
-//! 5. Output the table for user consumption. Use pivot_table_submit().
-
-use std::{
- collections::HashMap,
- ops::Range,
- sync::{Arc, OnceLock},
-};
-
-use chrono::NaiveDateTime;
-use enum_map::{enum_map, Enum, EnumMap};
-
-use crate::format::{Format, Settings as FormatSettings};
-
-/// Areas of a pivot table for styling purposes.
-#[derive(Copy, Clone, Debug, Enum, PartialEq, Eq)]
-pub enum Area {
- Title,
- Caption,
-
- /// Footnotes,
- Footer,
-
- // Top-left corner.
- Corner,
-
- ColumnLabels,
- RowLabels,
- Data,
-
- /// Layer indication.
- Layers,
-}
-
-/// Table borders for styling purposes.
-#[derive(Debug, Enum)]
-pub enum Border {
- Title,
- OuterFrame(BoxBorder),
- InnerFrame(BoxBorder),
- Dimensions(RowColBorder),
- Categories(RowColBorder),
- DataLeft,
- DataTop,
-}
-
-/// The borders on a box.
-#[derive(Debug, Enum)]
-pub enum BoxBorder {
- Left,
- Top,
- Right,
- Bottom,
-}
-
-/// Borders between rows and columns.
-#[derive(Debug, Enum, PartialEq, Eq)]
-pub enum RowColBorder {
- RowHorz,
- RowVert,
- ColHorz,
- ColVert,
-}
-
-/// Sizing for rows or columns of a rendered table.
-///
-/// The comments below talk about columns and their widths but they apply
-/// equally to rows and their heights.
-#[derive(Default)]
-pub struct Sizing {
- /// Specific column widths, in 1/96" units.
- widths: Vec<i32>,
-
- /// Specific page breaks: 0-based columns after which a page break must
- /// occur, e.g. a value of 1 requests a break after the second column.
- breaks: Vec<usize>,
-
- /// Keeps: columns to keep together on a page if possible.
- keeps: Vec<Range<usize>>,
-}
-
-#[derive(Enum)]
-pub enum Axis3 {
- X,
- Y,
- Z,
-}
-
-/// An axis within a pivot table.
-#[derive(Default)]
-pub struct TableAxis {
- /// `dimensions[0]` is the innermost dimension.
- dimensions: Vec<Dimension>,
-
- /// The number of rows or columns along the axis, that is, the product of
- /// `dimensions[*].n_leaves`. It is 0 if any dimension has 0 leaves.
- extent: usize,
-
- /// Sum of `dimensions[*].label_depth`.
- label_depth: usize,
-}
-
-/// Dimensions.
-///
-/// A [Dimension] identifies the categories associated with a single dimension
-/// within a multidimensional pivot table.
-///
-/// A dimension contains a collection of categories, which are the leaves in a
-/// tree of groups.
-///
-/// (A dimension or a group can contain zero categories, but this is unusual.
-/// If a dimension contains no categories, then its table cannot contain any
-/// data.)
-pub struct Dimension {
- axis_type: Axis3,
- level: usize,
-
- top_index: usize,
-
- /// Hierarchy of categories within the dimension. The groups and categories
- /// are sorted in the order that should be used for display. This might be
- /// different from the original order produced for output if the user
- /// adjusted it.
- ///
- /// The root must always be a group, although it is allowed to have no
- /// subcategories.
- root: Group,
-
- /// All of the leaves reachable via the root.
- ///
- /// The indexing for presentation_leaves is presentation order, thus
- /// `presentation_leaves[i]->presentation_index == i`. This order is the
- /// same as would be produced by an in-order traversal of the groups. It
- /// is the order into which the user reordered or sorted the categories.
- ///
- /// The indexing for `data_leaves` is that used for `idx` in [Cell], thus
- /// `data_leaves[i]->data_index == i`. This might differ from what an
- /// in-order traversal of `root` would yield, if the user reordered
- /// categories.
- data_leaves: Vec<Arc<Leaf>>,
- presentation_leaves: Vec<Arc<Leaf>>,
-
- /// Display.
- hide_all_labels: bool,
-
- /// Number of rows or columns needed to express the labels.
- label_depth: usize,
-}
-
-pub struct Group {
- name: Value,
- label_depth: usize,
- extra_depth: usize,
-
- /// The child categories.
- ///
- /// A group usually has multiple children, but it is allowed to have
- /// only one or even (pathologically) none.
- children: Vec<Category>,
-
- /// Display a label for the group itself?
- show_label: bool,
-
- show_label_in_corner: bool,
-}
-
-pub struct Leaf {
- name: Value,
- label_depth: usize,
- extra_depth: usize,
-
- group_index: usize,
- data_index: usize,
- presentation_index: usize,
-
- /// Default format for values in this category.
- format: Format,
-
- /// Honor [Table]'s `small` setting?
- honor_small: bool,
-}
-
-/// A pivot_category is a leaf (a category) or a group.
-pub enum Category {
- Group(Arc<Group>),
- Leaf(Arc<Leaf>),
-}
-
-trait CategoryTrait {
- fn name(&self) -> &Value;
- fn label_depth(&self) -> usize;
- fn extra_depth(&self) -> usize;
-}
-
-impl CategoryTrait for Group {
- fn name(&self) -> &Value {
- &self.name
- }
-
- fn label_depth(&self) -> usize {
- self.label_depth
- }
-
- fn extra_depth(&self) -> usize {
- self.extra_depth
- }
-}
-
-impl CategoryTrait for Leaf {
- fn name(&self) -> &Value {
- &self.name
- }
-
- fn label_depth(&self) -> usize {
- self.label_depth
- }
-
- fn extra_depth(&self) -> usize {
- self.extra_depth
- }
-}
-
-impl CategoryTrait for Category {
- fn name(&self) -> &Value {
- match self {
- Category::Group(group) => group.name(),
- Category::Leaf(leaf) => leaf.name(),
- }
- }
-
- fn label_depth(&self) -> usize {
- match self {
- Category::Group(group) => group.label_depth(),
- Category::Leaf(leaf) => leaf.label_depth(),
- }
- }
-
- fn extra_depth(&self) -> usize {
- match self {
- Category::Group(group) => group.extra_depth(),
- Category::Leaf(leaf) => leaf.extra_depth(),
- }
- }
-}
-
-/// Styling for a pivot table.
-///
-/// The division between this and the style information in [Table] seems fairly
-/// arbitrary. The ultimate reason for the division is simply because that's
-/// how SPSS documentation and file formats do it.
-struct Look {
- name: Option<String>,
-
- omit_empty: bool,
- row_labels_in_corner: bool,
-
- /// Range of column widths for columns in the row headings and corner , in 1/96"
- /// units.
- row_heading_widths: Range<usize>,
-
- /// Range of column widths for columns in the column headings , in 1/96"
- /// units.
- col_heading_widths: Range<usize>,
-
- /// Kind of markers to use for footnotes.
- footnote_marker_type: FootnoteMarkerType,
-
- /// Where to put the footnote markers.
- footnote_marker_position: FootnoteMarkerPosition,
-
- /// Styles for areas of the pivot table.
- areas: EnumMap<Area, AreaStyle>,
-
- /// Styles for borders in the pivot table.
- borders: EnumMap<Border, BorderStyle>,
-
- print_all_layers: bool,
-
- paginate_layers: bool,
-
- shrink_to_fit: EnumMap<Axis2, bool>,
-
- top_continuation: bool,
-
- bottom_continuation: bool,
-
- continuation: Option<String>,
-
- n_orphan_lines: usize,
-}
-
-impl Default for Look {
- fn default() -> Self {
- Self {
- name: None,
- omit_empty: true,
- row_labels_in_corner: true,
- row_heading_widths: 36..72,
- col_heading_widths: 36..120,
- footnote_marker_type: FootnoteMarkerType::Alphabetic,
- footnote_marker_position: FootnoteMarkerPosition::Subscript,
- areas: EnumMap::from_fn(|area| {
- use HorzAlign::*;
- use VertAlign::*;
- let (halign, valign, hmargins, vmargins) = match area {
- Area::Title => (Center, Middle, [8, 11], [1, 8]),
- Area::Caption => (Left, Top, [8, 11], [1, 1]),
- Area::Footer => (Left, Top, [11, 8], [2, 3]),
- Area::Corner => (Left, Bottom, [8, 11], [1, 1]),
- Area::ColumnLabels => (Left, Top, [8, 11], [1, 3]),
- Area::RowLabels => (Left, Top, [8, 11], [1, 3]),
- Area::Data => (Mixed, Top, [8, 11], [1, 1]),
- Area::Layers => (Left, Bottom, [8, 11], [1, 3]),
- };
- AreaStyle {
- cell_style: CellStyle {
- horz_align: halign,
- vert_align: valign,
- margins: enum_map! { Axis2::X => hmargins, Axis2::Y => vmargins },
- },
- font_style: FontStyle {
- bold: area == Area::Title,
- italic: false,
- underline: false,
- markup: false,
- font: String::from("Sans Serif"),
- fg: [Color::BLACK; 2],
- bg: [Color::WHITE; 2],
- size: 9,
- },
- }
- }),
- borders: EnumMap::from_fn(|border| {
- let stroke = match border {
- Border::InnerFrame(_) | Border::DataLeft | Border::DataTop => Stroke::Thick,
- Border::Dimensions(side) if side != RowColBorder::RowVert => Stroke::Solid,
- Border::Categories(RowColBorder::ColHorz | RowColBorder::ColVert) => {
- Stroke::Solid
- }
- _ => Stroke::None,
- };
- BorderStyle {
- stroke,
- color: Color::BLACK,
- }
- }),
- print_all_layers: false,
- paginate_layers: false,
- shrink_to_fit: EnumMap::from_fn(|_| false),
- top_continuation: false,
- bottom_continuation: false,
- continuation: None,
- n_orphan_lines: 0,
- }
- }
-}
-
-impl Look {
- fn shared_default() -> Arc<Look> {
- static LOOK: OnceLock<Arc<Look>> = OnceLock::new();
- LOOK.get_or_init(|| Arc::new(Look::default())).clone()
- }
-}
-
-pub struct AreaStyle {
- cell_style: CellStyle,
- font_style: FontStyle,
-}
-
-pub struct CellStyle {
- horz_align: HorzAlign,
- vert_align: VertAlign,
-
- /// Margins in 1/96" units.
- ///
- /// `margins[Axis2::X][0]` is the left margin.
- /// `margins[Axis2::X][1]` is the right margin.
- /// `margins[Axis2::Y][0]` is the top margin.
- /// `margins[Axis2::Y][1]` is the bottom margin.
- margins: EnumMap<Axis2, [i32; 2]>,
-}
-
-pub enum HorzAlign {
- /// Right aligned.
- Right,
-
- /// Left aligned.
- Left,
-
- /// Centered.
- Center,
-
- /// Align strings to the left, other formats to the right.
- Mixed,
-
- /// Align the decimal point at the specified position.
- Decimal {
- /// Decimal offset from the right side of the cell, in 1/96" units.
- offset: f64,
-
- /// Decimal character: either `b'.'` or `b','`.
- c: char,
- },
-}
-
-pub enum VertAlign {
- /// Top alignment.
- Top,
-
- /// Centered,
- Middle,
-
- /// Bottom alignment.
- Bottom,
-}
-
-pub struct FontStyle {
- bold: bool,
- italic: bool,
- underline: bool,
- markup: bool,
- font: String,
- fg: [Color; 2],
- bg: [Color; 2],
-
- /// In 1/72" units.
- size: i32,
-}
-
-pub struct Color {
- alpha: u8,
- r: u8,
- g: u8,
- b: u8,
-}
-
-impl Color {
- const BLACK: Color = Color::new(0, 0, 0);
- const WHITE: Color = Color::new(255, 255, 255);
-
- const fn new(r: u8, g: u8, b: u8) -> Self {
- Self {
- alpha: 255,
- r,
- g,
- b,
- }
- }
-}
-
-pub struct BorderStyle {
- stroke: Stroke,
- color: Color,
-}
-
-pub enum Stroke {
- None,
- Solid,
- Dashed,
- Thick,
- Thin,
- Double,
-}
-
-/// An axis of a flat table.
-#[derive(Debug, Enum)]
-pub enum Axis2 {
- X,
- Y,
-}
-
-pub enum FootnoteMarkerType {
- /// a, b, c, ...
- Alphabetic,
-
- /// 1, 2, 3, ...
- Numeric,
-}
-
-pub enum FootnoteMarkerPosition {
- /// Subscripts.
- Subscript,
-
- /// Superscripts.
- Superscript,
-}
-
-pub struct Table {
- look: Arc<Look>,
-
- rotate_inner_column_labels: bool,
-
- rotate_outer_row_labels: bool,
-
- show_grid_lines: bool,
-
- show_title: bool,
-
- show_caption: bool,
-
- show_value: Option<ValueShow>,
-
- show_variables: Option<ValueShow>,
-
- weight_format: Format,
-
- /// Current layer indexes, with axes[PIVOT_AXIS_LAYER].n_dimensions
- /// elements. current_layer[i] is an offset into
- /// axes[PIVOT_AXIS_LAYER].dimensions[i]->data_leaves[], EXCEPT that a
- /// dimension can have zero leaves, in which case current_layer[i] is zero
- /// and there's no corresponding leaf.
- current_layer: Vec<usize>,
-
- /// Column and row sizing and page breaks.
- sizing: EnumMap<Axis2, Sizing>,
-
- /// Format settings.
- settings: FormatSettings,
-
- /// Numeric grouping character (usually `.` or `,`).
- grouping: Option<char>,
-
- small: f64,
-
- command_local: Option<String>,
- command_c: Option<String>,
- language: Option<String>,
- locale: Option<String>,
- dataset: Option<String>,
- datafile: Option<String>,
- date: Option<NaiveDateTime>,
- footnotes: Vec<Footnote>,
- title: Option<Value>,
- subtype: Option<Value>,
- corner_text: Option<Value>,
- caption: Option<Value>,
- notes: Option<String>,
- dimensions: Vec<Dimension>,
- axes: EnumMap<Axis3, TableAxis>,
- cells: HashMap<u64, Value>,
-}
-
-impl Table {
- fn new() -> Self {
- Self {
- look: Look::shared_default(),
- rotate_inner_column_labels: false,
- rotate_outer_row_labels: false,
- show_grid_lines: false,
- show_title: true,
- show_caption: true,
- show_value: None,
- show_variables: None,
- weight_format: Format::F40,
- current_layer: Vec::new(),
- sizing: EnumMap::default(),
- settings: FormatSettings::default(), // XXX from settings
- grouping: None,
- small: 0.0001, // XXX from settings.
- command_local: None,
- command_c: None, // XXX from current command name.
- language: None,
- locale: None,
- dataset: None,
- datafile: None,
- date: None,
- footnotes: Vec::new(),
- subtype: None,
- title: None,
- corner_text: None,
- caption: None,
- notes: None,
- dimensions: Vec::new(),
- axes: EnumMap::default(),
- cells: HashMap::new(),
- }
- }
-}
-
-/// Whether to show variable or value labels or the underlying value or variable name.
-pub enum ValueShow {
- /// Value or variable name only.
- Value,
-
- /// Label only.
- Label,
-
- /// Value and label.
- Both,
-}
-
-pub struct Footnote {
- content: Value,
- marker: Value,
- show: bool,
-}
-
-/// The content of a single pivot table cell.
-///
-/// A [Value] is also a pivot table's title, caption, footnote marker and
-/// contents, and so on.
-///
-/// A given [Value] is one of:
-///
-/// 1. A number resulting from a calculation.
-///
-/// A number has an associated display format (usually [F] or [Pct]). This
-/// format can be set directly, but that is not usually the easiest way.
-/// Instead, it is usually true that all of the values in a single category
-/// should have the same format (e.g. all "Significance" values might use
-/// format `F40.3`), so PSPP makes it easy to set the default format for a
-/// category while creating the category. See pivot_dimension_create() for
-/// more details.
-///
-/// [F]: crate::format::Format::F
-/// [Pct]: crate::format::Format::Pct
-///
-/// 2. A numeric or string value obtained from data (PIVOT_VALUE_NUMERIC or
-/// PIVOT_VALUE_STRING). If such a value corresponds to a variable, then the
-/// variable's name can be attached to the pivot_value. If the value has a
-/// value label, then that can also be attached. When a label is present,
-/// the user can control whether to show the value or the label or both.
-///
-/// 3. A variable name (PIVOT_VALUE_VARIABLE). The variable label, if any, can
-/// be attached too, and again the user can control whether to show the value
-/// or the label or both.
-///
-/// 4. A text string (PIVOT_VALUE_TEXT). The value stores the string in English
-/// and translated into the output language (localized). Use
-/// pivot_value_new_text() or pivot_value_new_text_format() for those cases.
-/// In some cases, only an English or a localized version is available for
-/// one reason or another, although this is regrettable; in those cases, use
-/// pivot_value_new_user_text() or pivot_value_new_user_text_nocopy().
-///
-/// 5. A template. PSPP doesn't create these itself yet, but it can read and
-/// interpret those created by SPSS.
-pub struct Value {
- styling: Option<Box<ValueStyle>>,
- inner: ValueInner,
-}
-
-pub enum ValueInner {
- Number {
- show: ValueShow,
- format: Format,
- honor_small: bool,
- value: f64,
- var_name: Option<String>,
- value_label: Option<String>,
- },
- String {
- show: ValueShow,
- hex: bool,
- s: Option<String>,
- var_name: Option<String>,
- value_label: Option<String>,
- },
- Variable {
- show: ValueShow,
- var_name: Option<String>,
- value_label: Option<String>,
- },
- Text {
- user_provided: bool,
- /// Localized.
- local: String,
- /// English.
- c: String,
- /// Identifier.
- id: String,
- },
- Template {
- args: Vec<Vec<Value>>,
- local: String,
- id: String,
- },
-}
-
-pub struct ValueStyle {
- font_style: FontStyle,
- cell_style: CellStyle,
- subscripts: Vec<String>,
- footnote_indexes: Vec<usize>,
-}
+++ /dev/null
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
-pub enum PromptStyle {
- /// First line of command.
- First,
-
- /// Second or later line of command.
- Later,
-
- /// Between `BEGIN DATA` and `END DATA`.
- Data,
-
- /// `COMMENT` or `*` command.
- Comment,
-
- /// DOCUMENT command.
- Document,
-
- /// `DO REPEAT` command.
- DoRepeat,
-
- /// `DEFINE` command.
- Define,
-}
-
-impl PromptStyle {
- pub fn to_string(&self) -> &'static str {
- match self {
- PromptStyle::First => "first",
- PromptStyle::Later => "later",
- PromptStyle::Data => "data",
- PromptStyle::Comment => "COMMENT",
- PromptStyle::Document => "DOCUMENT",
- PromptStyle::DoRepeat => "DO REPEAT",
- PromptStyle::Define => "DEFINE",
- }
- }
-}
+++ /dev/null
-use crate::{
- dictionary::VarWidth,
- encoding::{default_encoding, get_encoding, Error as EncodingError},
- endian::{Endian, Parse, ToBytes},
- identifier::{Error as IdError, Identifier},
-};
-
-use encoding_rs::{mem::decode_latin1, DecoderResult, Encoding};
-use flate2::read::ZlibDecoder;
-use num::Integer;
-use std::{
- borrow::Cow,
- cell::RefCell,
- cmp::Ordering,
- collections::{HashMap, VecDeque},
- fmt::{Debug, Display, Formatter, Result as FmtResult},
- io::{Error as IoError, Read, Seek, SeekFrom},
- iter::repeat,
- mem::take,
- ops::Range,
- rc::Rc,
- str::from_utf8,
-};
-use thiserror::Error as ThisError;
-
-#[derive(ThisError, Debug)]
-pub enum Error {
- #[error("Not an SPSS system file")]
- NotASystemFile,
-
- #[error("Invalid magic number {0:?}")]
- BadMagic([u8; 4]),
-
- #[error("I/O error ({0})")]
- Io(#[from] IoError),
-
- #[error("Invalid SAV compression code {0}")]
- InvalidSavCompression(u32),
-
- #[error("Invalid ZSAV compression code {0}")]
- InvalidZsavCompression(u32),
-
- #[error("Document record at offset {offset:#x} has document line count ({n}) greater than the maximum number {max}.")]
- BadDocumentLength { offset: u64, n: usize, max: usize },
-
- #[error("At offset {offset:#x}, unrecognized record type {rec_type}.")]
- BadRecordType { offset: u64, rec_type: u32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable width is not in the valid range -1 to 255.")]
- BadVariableWidth { start_offset: u64, width: i32 },
-
- #[error("In variable record starting at offset {start_offset:#x}, variable label code {code} at offset {code_offset:#x} is not 0 or 1.")]
- BadVariableLabelCode {
- start_offset: u64,
- code_offset: u64,
- code: u32,
- },
-
- #[error(
- "At offset {offset:#x}, numeric missing value code ({code}) is not -3, -2, 0, 1, 2, or 3."
- )]
- BadNumericMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, string missing value code ({code}) is not 0, 1, 2, or 3.")]
- BadStringMissingValueCode { offset: u64, code: i32 },
-
- #[error("At offset {offset:#x}, number of value labels ({n}) is greater than the maximum number {max}.")]
- BadNumberOfValueLabels { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, following value label record, found record type {rec_type} instead of expected type 4 for variable index record")]
- ExpectedVarIndexRecord { offset: u64, rec_type: u32 },
-
- #[error("At offset {offset:#x}, number of variables indexes for value labels ({n}) is greater than the maximum number ({max}).")]
- TooManyVarIndexes { offset: u64, n: u32, max: u32 },
-
- #[error("At offset {offset:#x}, record type 7 subtype {subtype} is too large with element size {size} and {count} elements.")]
- ExtensionRecordTooLarge {
- offset: u64,
- subtype: u32,
- size: u32,
- count: u32,
- },
-
- #[error("Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a {case_len}-byte case.")]
- EofInCase {
- offset: u64,
- case_ofs: u64,
- case_len: usize,
- },
-
- #[error(
- "Unexpected end of file at offset {offset:#x}, {case_ofs} bytes into a compressed case."
- )]
- EofInCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("Data ends at offset {offset:#x}, {case_ofs} bytes into a compressed case.")]
- PartialCompressedCase { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a string was found where a number was expected.")]
- CompressedNumberExpected { offset: u64, case_ofs: u64 },
-
- #[error("At {case_ofs} bytes into compressed case starting at offset {offset:#x}, a number was found where a string was expected.")]
- CompressedStringExpected { offset: u64, case_ofs: u64 },
-
- #[error("Block count {n_blocks} in ZLIB trailer at offset {offset:#x} differs from expected block count {expected_n_blocks} calculated from trailer length {ztrailer_len}.")]
- BadZlibTrailerNBlocks {
- offset: u64,
- n_blocks: u32,
- expected_n_blocks: u64,
- ztrailer_len: u64,
- },
-
- #[error("{0}")]
- EncodingError(EncodingError),
-}
-
-#[derive(ThisError, Debug)]
-pub enum Warning {
- #[error("Unexpected end of data inside extension record.")]
- UnexpectedEndOfData,
-
- #[error("At offset {offset:#x}, at least one valid variable index for value labels is required but none were specified.")]
- NoVarIndexes { offset: u64 },
-
- #[error("At offset {offset:#x}, the first variable index is for a {var_type} variable but the following variable indexes are for {} variables: {wrong_types:?}", var_type.opposite())]
- MixedVarTypes {
- offset: u64,
- var_type: VarType,
- wrong_types: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, one or more variable indexes for value labels were not in the valid range [1,{max}]: {invalid:?}")]
- InvalidVarIndexes {
- offset: u64,
- max: usize,
- invalid: Vec<u32>,
- },
-
- #[error("At offset {offset:#x}, {record} has bad size {size} bytes instead of the expected {expected_size}.")]
- BadRecordSize {
- offset: u64,
- record: String,
- size: u32,
- expected_size: u32,
- },
-
- #[error("At offset {offset:#x}, {record} has bad count {count} instead of the expected {expected_count}.")]
- BadRecordCount {
- offset: u64,
- record: String,
- count: u32,
- expected_count: u32,
- },
-
- #[error("In long string missing values record starting at offset {record_offset:#x}, value length at offset {offset:#x} is {value_len} instead of the expected 8.")]
- BadLongMissingValueLength {
- record_offset: u64,
- offset: u64,
- value_len: u32,
- },
-
- #[error("The encoding record at offset {offset:#x} contains an encoding name that is not valid UTF-8.")]
- BadEncodingName { offset: u64 },
-
- // XXX This is risky because `text` might be arbitarily long.
- #[error("Text string contains invalid bytes for {encoding} encoding: {text}")]
- MalformedString { encoding: String, text: String },
-
- #[error("Invalid variable measurement level value {0}")]
- InvalidMeasurement(u32),
-
- #[error("Invalid variable display alignment value {0}")]
- InvalidAlignment(u32),
-
- #[error("Invalid attribute name. {0}")]
- InvalidAttributeName(IdError),
-
- #[error("Invalid variable name in attribute record. {0}")]
- InvalidAttributeVariableName(IdError),
-
- #[error("Invalid short name in long variable name record. {0}")]
- InvalidShortName(IdError),
-
- #[error("Invalid name in long variable name record. {0}")]
- InvalidLongName(IdError),
-
- #[error("Invalid variable name in very long string record. {0}")]
- InvalidLongStringName(IdError),
-
- #[error("Invalid variable name in variable set record. {0}")]
- InvalidVariableSetName(IdError),
-
- #[error("Invalid multiple response set name. {0}")]
- InvalidMrSetName(IdError),
-
- #[error("Invalid multiple response set variable name. {0}")]
- InvalidMrSetVariableName(IdError),
-
- #[error("Invalid variable name in long string missing values record. {0}")]
- InvalidLongStringMissingValueVariableName(IdError),
-
- #[error("Invalid variable name in long string value label record. {0}")]
- InvalidLongStringValueLabelName(IdError),
-
- #[error("{0}")]
- EncodingError(EncodingError),
-
- #[error("Details TBD")]
- TBD,
-}
-
-impl From<IoError> for Warning {
- fn from(_source: IoError) -> Self {
- Self::UnexpectedEndOfData
- }
-}
-
-#[derive(Clone, Debug)]
-pub enum Record {
- Header(HeaderRecord<RawString>),
- Variable(VariableRecord<RawString, RawStr<8>>),
- ValueLabel(ValueLabelRecord<RawStr<8>, RawString>),
- Document(DocumentRecord<RawDocumentLine>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<RawString, RawString>),
- LongStringValueLabels(LongStringValueLabelRecord<RawString, RawString>),
- LongStringMissingValues(LongStringMissingValueRecord<RawString, RawStr<8>>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- Text(TextRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Cases(Rc<RefCell<Cases>>),
-}
-
-#[derive(Clone, Debug)]
-pub enum DecodedRecord {
- Header(HeaderRecord<String>),
- Variable(VariableRecord<String, String>),
- ValueLabel(ValueLabelRecord<RawStr<8>, String>),
- Document(DocumentRecord<String>),
- IntegerInfo(IntegerInfoRecord),
- FloatInfo(FloatInfoRecord),
- VarDisplay(VarDisplayRecord),
- MultipleResponse(MultipleResponseRecord<Identifier, String>),
- LongStringValueLabels(LongStringValueLabelRecord<Identifier, String>),
- LongStringMissingValues(LongStringMissingValueRecord<Identifier, String>),
- Encoding(EncodingRecord),
- NumberOfCases(NumberOfCasesRecord),
- VariableSets(VariableSetRecord),
- ProductInfo(ProductInfoRecord),
- LongNames(LongNamesRecord),
- VeryLongStrings(VeryLongStringsRecord),
- FileAttributes(FileAttributeRecord),
- VariableAttributes(VariableAttributeRecord),
- OtherExtension(Extension),
- EndOfHeaders(u32),
- ZHeader(ZHeader),
- ZTrailer(ZTrailer),
- Cases(Rc<RefCell<Cases>>),
-}
-
-impl Record {
- fn read<R>(
- reader: &mut R,
- endian: Endian,
- var_types: &[VarType],
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error>
- where
- R: Read + Seek,
- {
- let rec_type: u32 = endian.parse(read_bytes(reader)?);
- match rec_type {
- 2 => Ok(Some(VariableRecord::read(reader, endian)?)),
- 3 => Ok(ValueLabelRecord::read(reader, endian, var_types, warn)?),
- 6 => Ok(Some(DocumentRecord::read(reader, endian)?)),
- 7 => Extension::read(reader, endian, var_types.len(), warn),
- 999 => Ok(Some(Record::EndOfHeaders(
- endian.parse(read_bytes(reader)?),
- ))),
- _ => Err(Error::BadRecordType {
- offset: reader.stream_position()?,
- rec_type,
- }),
- }
- }
-
- pub fn decode(self, decoder: &Decoder) -> Result<DecodedRecord, Error> {
- Ok(match self {
- Record::Header(record) => record.decode(decoder),
- Record::Variable(record) => record.decode(decoder),
- Record::ValueLabel(record) => DecodedRecord::ValueLabel(record.decode(decoder)),
- Record::Document(record) => record.decode(decoder),
- Record::IntegerInfo(record) => DecodedRecord::IntegerInfo(record.clone()),
- Record::FloatInfo(record) => DecodedRecord::FloatInfo(record.clone()),
- Record::VarDisplay(record) => DecodedRecord::VarDisplay(record.clone()),
- Record::MultipleResponse(record) => record.decode(decoder),
- Record::LongStringValueLabels(record) => {
- DecodedRecord::LongStringValueLabels(record.decode(decoder))
- }
- Record::LongStringMissingValues(record) => {
- DecodedRecord::LongStringMissingValues(record.decode(decoder))
- }
- Record::Encoding(record) => DecodedRecord::Encoding(record.clone()),
- Record::NumberOfCases(record) => DecodedRecord::NumberOfCases(record.clone()),
- Record::Text(record) => record.decode(decoder),
- Record::OtherExtension(record) => DecodedRecord::OtherExtension(record.clone()),
- Record::EndOfHeaders(record) => DecodedRecord::EndOfHeaders(record),
- Record::ZHeader(record) => DecodedRecord::ZHeader(record.clone()),
- Record::ZTrailer(record) => DecodedRecord::ZTrailer(record.clone()),
- Record::Cases(record) => DecodedRecord::Cases(record.clone()),
- })
- }
-}
-
-pub fn encoding_from_headers(
- headers: &Vec<Record>,
- warn: &impl Fn(Warning),
-) -> Result<&'static Encoding, Error> {
- let mut encoding_record = None;
- let mut integer_info_record = None;
- for record in headers {
- match record {
- Record::Encoding(record) => encoding_record = Some(record),
- Record::IntegerInfo(record) => integer_info_record = Some(record),
- _ => (),
- }
- }
- let encoding = encoding_record.map(|record| record.0.as_str());
- let character_code = integer_info_record.map(|record| record.character_code);
- match get_encoding(encoding, character_code) {
- Ok(encoding) => Ok(encoding),
- Err(err @ EncodingError::Ebcdic) => Err(Error::EncodingError(err)),
- Err(err) => {
- warn(Warning::EncodingError(err));
- // Warn that we're using the default encoding.
- Ok(default_encoding())
- }
- }
-}
-
-// If `s` is valid UTF-8, returns it decoded as UTF-8, otherwise returns it
-// decoded as Latin-1 (actually bytes interpreted as Unicode code points).
-fn default_decode(s: &[u8]) -> Cow<str> {
- from_utf8(s).map_or_else(|_| decode_latin1(s), Cow::from)
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum Compression {
- Simple,
- ZLib,
-}
-
-trait Header {
- fn offsets(&self) -> Range<u64>;
-}
-
-#[derive(Clone)]
-pub struct HeaderRecord<S>
-where
- S: Debug,
-{
- /// Offset in file.
- pub offsets: Range<u64>,
-
- /// Magic number.
- pub magic: Magic,
-
- /// Eye-catcher string, product name, in the file's encoding. Padded
- /// on the right with spaces.
- pub eye_catcher: S,
-
- /// Layout code, normally either 2 or 3.
- pub layout_code: u32,
-
- /// Number of variable positions, or `None` if the value in the file is
- /// questionably trustworthy.
- pub nominal_case_size: Option<u32>,
-
- /// Compression type, if any,
- pub compression: Option<Compression>,
-
- /// 1-based variable index of the weight variable, or `None` if the file is
- /// unweighted.
- pub weight_index: Option<u32>,
-
- /// Claimed number of cases, if known.
- pub n_cases: Option<u32>,
-
- /// Compression bias, usually 100.0.
- pub bias: f64,
-
- /// `dd mmm yy` in the file's encoding.
- pub creation_date: S,
-
- /// `HH:MM:SS` in the file's encoding.
- pub creation_time: S,
-
- /// File label, in the file's encoding. Padded on the right with spaces.
- pub file_label: S,
-
- /// Endianness of the data in the file header.
- pub endian: Endian,
-}
-
-impl<S> HeaderRecord<S>
-where
- S: Debug,
-{
- fn debug_field<T>(&self, f: &mut Formatter, name: &str, value: T) -> FmtResult
- where
- T: Debug,
- {
- writeln!(f, "{name:>17}: {:?}", value)
- }
-}
-
-impl<S> Debug for HeaderRecord<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "File header record:")?;
- self.debug_field(f, "Magic", self.magic)?;
- self.debug_field(f, "Product name", &self.eye_catcher)?;
- self.debug_field(f, "Layout code", self.layout_code)?;
- self.debug_field(f, "Nominal case size", self.nominal_case_size)?;
- self.debug_field(f, "Compression", self.compression)?;
- self.debug_field(f, "Weight index", self.weight_index)?;
- self.debug_field(f, "Number of cases", self.n_cases)?;
- self.debug_field(f, "Compression bias", self.bias)?;
- self.debug_field(f, "Creation date", &self.creation_date)?;
- self.debug_field(f, "Creation time", &self.creation_time)?;
- self.debug_field(f, "File label", &self.file_label)?;
- self.debug_field(f, "Endianness", self.endian)
- }
-}
-
-impl HeaderRecord<RawString> {
- fn read<R: Read + Seek>(r: &mut R) -> Result<Self, Error> {
- let start = r.stream_position()?;
-
- let magic: [u8; 4] = read_bytes(r)?;
- let magic: Magic = magic.try_into().map_err(|_| Error::NotASystemFile)?;
-
- let eye_catcher = RawString(read_vec(r, 60)?);
- let layout_code: [u8; 4] = read_bytes(r)?;
- let endian = Endian::identify_u32(2, layout_code)
- .or_else(|| Endian::identify_u32(2, layout_code))
- .ok_or_else(|| Error::NotASystemFile)?;
- let layout_code = endian.parse(layout_code);
-
- let nominal_case_size: u32 = endian.parse(read_bytes(r)?);
- let nominal_case_size =
- (nominal_case_size <= i32::MAX as u32 / 16).then_some(nominal_case_size);
-
- let compression_code: u32 = endian.parse(read_bytes(r)?);
- let compression = match (magic, compression_code) {
- (Magic::Zsav, 2) => Some(Compression::ZLib),
- (Magic::Zsav, code) => return Err(Error::InvalidZsavCompression(code)),
- (_, 0) => None,
- (_, 1) => Some(Compression::Simple),
- (_, code) => return Err(Error::InvalidSavCompression(code)),
- };
-
- let weight_index: u32 = endian.parse(read_bytes(r)?);
- let weight_index = (weight_index > 0).then_some(weight_index);
-
- let n_cases: u32 = endian.parse(read_bytes(r)?);
- let n_cases = (n_cases < i32::MAX as u32 / 2).then_some(n_cases);
-
- let bias: f64 = endian.parse(read_bytes(r)?);
-
- let creation_date = RawString(read_vec(r, 9)?);
- let creation_time = RawString(read_vec(r, 8)?);
- let file_label = RawString(read_vec(r, 64)?);
- let _: [u8; 3] = read_bytes(r)?;
-
- Ok(HeaderRecord {
- offsets: start..r.stream_position()?,
- magic,
- layout_code,
- nominal_case_size,
- compression,
- weight_index,
- n_cases,
- bias,
- creation_date,
- creation_time,
- eye_catcher,
- file_label,
- endian,
- })
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- let eye_catcher = decoder.decode(&self.eye_catcher).to_string();
- let file_label = decoder.decode(&self.file_label).to_string();
- let creation_date = decoder.decode(&self.creation_date).to_string();
- let creation_time = decoder.decode(&self.creation_time).to_string();
- DecodedRecord::Header(HeaderRecord {
- eye_catcher,
- weight_index: self.weight_index,
- n_cases: self.n_cases,
- file_label,
- offsets: self.offsets.clone(),
- magic: self.magic,
- layout_code: self.layout_code,
- nominal_case_size: self.nominal_case_size,
- compression: self.compression,
- bias: self.bias,
- creation_date,
- creation_time,
- endian: self.endian,
- })
- }
-}
-
-pub struct Decoder {
- pub encoding: &'static Encoding,
- pub warn: Box<dyn Fn(Warning)>,
-}
-
-impl Decoder {
- pub fn new<F>(encoding: &'static Encoding, warn: F) -> Self
- where
- F: Fn(Warning) + 'static,
- {
- Self {
- encoding,
- warn: Box::new(warn),
- }
- }
- fn warn(&self, warning: Warning) {
- (self.warn)(warning)
- }
- fn decode_slice<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- let (output, malformed) = self.encoding.decode_without_bom_handling(input);
- if malformed {
- self.warn(Warning::MalformedString {
- encoding: self.encoding.name().into(),
- text: output.clone().into(),
- });
- }
- output
- }
-
- fn decode<'a>(&self, input: &'a RawString) -> Cow<'a, str> {
- self.decode_slice(input.0.as_slice())
- }
-
- /// Returns `input` decoded from `self.encoding` into UTF-8 such that
- /// re-encoding the result back into `self.encoding` will have exactly the
- /// same length in bytes.
- ///
- /// XXX warn about errors?
- pub fn decode_exact_length<'a>(&self, input: &'a [u8]) -> Cow<'a, str> {
- if let (s, false) = self.encoding.decode_without_bom_handling(input) {
- // This is the common case. Usually there will be no errors.
- s
- } else {
- // Unusual case. Don't bother to optimize it much.
- let mut decoder = self.encoding.new_decoder_without_bom_handling();
- let mut output = String::with_capacity(
- decoder
- .max_utf8_buffer_length_without_replacement(input.len())
- .unwrap(),
- );
- let mut rest = input;
- while !rest.is_empty() {
- match decoder.decode_to_string_without_replacement(rest, &mut output, true) {
- (DecoderResult::InputEmpty, _) => break,
- (DecoderResult::OutputFull, _) => unreachable!(),
- (DecoderResult::Malformed(a, b), consumed) => {
- let skipped = a as usize + b as usize;
- output.extend(repeat('?').take(skipped));
- rest = &rest[consumed..];
- }
- }
- }
- assert_eq!(self.encoding.encode(&output).0.len(), input.len());
- output.into()
- }
- }
-
- pub fn decode_identifier(&self, input: &RawString) -> Result<Identifier, IdError> {
- self.new_identifier(&self.decode(input))
- }
-
- pub fn new_identifier(&self, name: &str) -> Result<Identifier, IdError> {
- Identifier::from_encoding(name, self.encoding)
- }
-}
-
-impl<S> Header for HeaderRecord<S>
-where
- S: Debug,
-{
- fn offsets(&self) -> Range<u64> {
- self.offsets.clone()
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub enum Magic {
- /// Regular system file.
- Sav,
-
- /// System file with Zlib-compressed data.
- Zsav,
-
- /// EBCDIC-encoded system file.
- Ebcdic,
-}
-
-impl Magic {
- /// Magic number for a regular system file.
- pub const SAV: [u8; 4] = *b"$FL2";
-
- /// Magic number for a system file that contains zlib-compressed data.
- pub const ZSAV: [u8; 4] = *b"$FL3";
-
- /// Magic number for an EBCDIC-encoded system file. This is `$FL2` encoded
- /// in EBCDIC.
- pub const EBCDIC: [u8; 4] = [0x5b, 0xc6, 0xd3, 0xf2];
-}
-
-impl Debug for Magic {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let s = match *self {
- Magic::Sav => "$FL2",
- Magic::Zsav => "$FL3",
- Magic::Ebcdic => "($FL2 in EBCDIC)",
- };
- write!(f, "{s}")
- }
-}
-
-impl TryFrom<[u8; 4]> for Magic {
- type Error = Error;
-
- fn try_from(value: [u8; 4]) -> Result<Self, Self::Error> {
- match value {
- Magic::SAV => Ok(Magic::Sav),
- Magic::ZSAV => Ok(Magic::Zsav),
- Magic::EBCDIC => Ok(Magic::Ebcdic),
- _ => Err(Error::BadMagic(value)),
- }
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum VarType {
- Numeric,
- String,
-}
-
-impl VarType {
- pub fn from_width(width: VarWidth) -> VarType {
- match width {
- VarWidth::Numeric => Self::Numeric,
- VarWidth::String(_) => Self::String,
- }
- }
-
- pub fn opposite(self) -> VarType {
- match self {
- Self::Numeric => Self::String,
- Self::String => Self::Numeric,
- }
- }
-}
-
-impl Display for VarType {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- VarType::Numeric => write!(f, "numeric"),
- VarType::String => write!(f, "string"),
- }
- }
-}
-
-#[derive(Copy, Clone)]
-pub enum Value<S>
-where
- S: Debug,
-{
- Number(Option<f64>),
- String(S),
-}
-
-type RawValue = Value<RawStr<8>>;
-
-impl<S> Debug for Value<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match self {
- Value::Number(Some(number)) => write!(f, "{number:?}"),
- Value::Number(None) => write!(f, "SYSMIS"),
- Value::String(s) => write!(f, "{:?}", s),
- }
- }
-}
-
-impl RawValue {
- fn read<R: Read>(r: &mut R, var_type: VarType, endian: Endian) -> Result<Self, IoError> {
- Ok(Self::from_raw(
- &UntypedValue(read_bytes(r)?),
- var_type,
- endian,
- ))
- }
-
- pub fn from_raw(raw: &UntypedValue, var_type: VarType, endian: Endian) -> Self {
- match var_type {
- VarType::String => Value::String(RawStr(raw.0)),
- VarType::Numeric => {
- let number: f64 = endian.parse(raw.0);
- Value::Number((number != -f64::MAX).then_some(number))
- }
- }
- }
-
- fn read_case<R: Read + Seek>(
- reader: &mut R,
- var_types: &[VarType],
- endian: Endian,
- ) -> Result<Option<Vec<Self>>, Error> {
- let case_start = reader.stream_position()?;
- let mut values = Vec::with_capacity(var_types.len());
- for (i, &var_type) in var_types.iter().enumerate() {
- let Some(raw) = try_read_bytes(reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::EofInCase {
- offset,
- case_ofs: offset - case_start,
- case_len: var_types.len() * 8,
- });
- }
- };
- values.push(Value::from_raw(&UntypedValue(raw), var_type, endian));
- }
- Ok(Some(values))
- }
-
- fn read_compressed_case<R: Read + Seek>(
- reader: &mut R,
- var_types: &[VarType],
- codes: &mut VecDeque<u8>,
- endian: Endian,
- bias: f64,
- ) -> Result<Option<Vec<Self>>, Error> {
- let case_start = reader.stream_position()?;
- let mut values = Vec::with_capacity(var_types.len());
- for (i, &var_type) in var_types.iter().enumerate() {
- let value = loop {
- let Some(code) = codes.pop_front() else {
- let Some(new_codes): Option<[u8; 8]> = try_read_bytes(reader)? else {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::EofInCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- };
- codes.extend(new_codes.into_iter());
- continue;
- };
- match code {
- 0 => (),
- 1..=251 => match var_type {
- VarType::Numeric => break Self::Number(Some(code as f64 - bias)),
- VarType::String => {
- break Self::String(RawStr(endian.to_bytes(code as f64 - bias)))
- }
- },
- 252 => {
- if i == 0 {
- return Ok(None);
- } else {
- let offset = reader.stream_position()?;
- return Err(Error::PartialCompressedCase {
- offset,
- case_ofs: offset - case_start,
- });
- }
- }
- 253 => {
- break Self::from_raw(&UntypedValue(read_bytes(reader)?), var_type, endian)
- }
- 254 => match var_type {
- VarType::String => break Self::String(RawStr(*b" ")), // XXX EBCDIC
- VarType::Numeric => {
- return Err(Error::CompressedStringExpected {
- offset: case_start,
- case_ofs: reader.stream_position()? - case_start,
- })
- }
- },
- 255 => match var_type {
- VarType::Numeric => break Self::Number(None),
- VarType::String => {
- return Err(Error::CompressedNumberExpected {
- offset: case_start,
- case_ofs: reader.stream_position()? - case_start,
- })
- }
- },
- }
- };
- values.push(value);
- }
- Ok(Some(values))
- }
-
- pub fn decode(self, decoder: &Decoder) -> Value<String> {
- match self {
- Self::Number(x) => Value::Number(x),
- Self::String(s) => Value::String(decoder.decode_exact_length(&s.0).into()),
- }
- }
-}
-
-struct ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- reader: Option<ZlibDecoder<R>>,
-}
-
-impl<R> ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn new(reader: R) -> ZlibDecodeMultiple<R> {
- ZlibDecodeMultiple {
- reader: Some(ZlibDecoder::new(reader)),
- }
- }
-}
-
-impl<R> Read for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
- loop {
- match self.reader.as_mut().unwrap().read(buf)? {
- 0 => {
- let inner = self.reader.take().unwrap().into_inner();
- self.reader = Some(ZlibDecoder::new(inner));
- }
- n => return Ok(n),
- };
- }
- }
-}
-
-impl<R> Seek for ZlibDecodeMultiple<R>
-where
- R: Read + Seek,
-{
- fn seek(&mut self, pos: SeekFrom) -> Result<u64, IoError> {
- self.reader.as_mut().unwrap().get_mut().seek(pos)
- }
-}
-
-enum ReaderState {
- Start,
- Headers,
- ZlibHeader,
- ZlibTrailer {
- ztrailer_offset: u64,
- ztrailer_len: u64,
- },
- Cases,
- End,
-}
-
-pub struct Reader<R>
-where
- R: Read + Seek + 'static,
-{
- reader: Option<R>,
- warn: Box<dyn Fn(Warning)>,
-
- header: HeaderRecord<RawString>,
- var_types: Vec<VarType>,
-
- state: ReaderState,
-}
-
-impl<R> Reader<R>
-where
- R: Read + Seek + 'static,
-{
- pub fn new<F>(mut reader: R, warn: F) -> Result<Self, Error>
- where
- F: Fn(Warning) + 'static,
- {
- let header = HeaderRecord::read(&mut reader)?;
- Ok(Self {
- reader: Some(reader),
- warn: Box::new(warn),
- header,
- var_types: Vec::new(),
- state: ReaderState::Start,
- })
- }
- fn cases(&mut self) -> Cases {
- self.state = ReaderState::End;
- Cases::new(
- self.reader.take().unwrap(),
- take(&mut self.var_types),
- &self.header,
- )
- }
- fn _next(&mut self) -> Option<<Self as Iterator>::Item> {
- match self.state {
- ReaderState::Start => {
- self.state = ReaderState::Headers;
- Some(Ok(Record::Header(self.header.clone())))
- }
- ReaderState::Headers => {
- let record = loop {
- match Record::read(
- self.reader.as_mut().unwrap(),
- self.header.endian,
- self.var_types.as_slice(),
- &self.warn,
- ) {
- Ok(Some(record)) => break record,
- Ok(None) => (),
- Err(error) => return Some(Err(error)),
- }
- };
- match record {
- Record::Variable(VariableRecord { width, .. }) => {
- self.var_types.push(if width == 0 {
- VarType::Numeric
- } else {
- VarType::String
- });
- }
- Record::EndOfHeaders(_) => {
- self.state = if let Some(Compression::ZLib) = self.header.compression {
- ReaderState::ZlibHeader
- } else {
- ReaderState::Cases
- };
- }
- _ => (),
- };
- Some(Ok(record))
- }
- ReaderState::ZlibHeader => {
- let zheader = match ZHeader::read(self.reader.as_mut().unwrap(), self.header.endian)
- {
- Ok(zheader) => zheader,
- Err(error) => return Some(Err(error)),
- };
- self.state = ReaderState::ZlibTrailer {
- ztrailer_offset: zheader.ztrailer_offset,
- ztrailer_len: zheader.ztrailer_len,
- };
- Some(Ok(Record::ZHeader(zheader)))
- }
- ReaderState::ZlibTrailer {
- ztrailer_offset,
- ztrailer_len,
- } => {
- match ZTrailer::read(
- self.reader.as_mut().unwrap(),
- self.header.endian,
- ztrailer_offset,
- ztrailer_len,
- ) {
- Ok(None) => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
- Ok(Some(ztrailer)) => Some(Ok(Record::ZTrailer(ztrailer))),
- Err(error) => Some(Err(error)),
- }
- }
- ReaderState::Cases => Some(Ok(Record::Cases(Rc::new(RefCell::new(self.cases()))))),
- ReaderState::End => None,
- }
- }
-}
-
-impl<R> Iterator for Reader<R>
-where
- R: Read + Seek + 'static,
-{
- type Item = Result<Record, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let retval = self._next();
- if matches!(retval, Some(Err(_))) {
- self.state = ReaderState::End;
- }
- retval
- }
-}
-
-trait ReadSeek: Read + Seek {}
-impl<T> ReadSeek for T where T: Read + Seek {}
-
-pub struct Cases {
- reader: Box<dyn ReadSeek>,
- var_types: Vec<VarType>,
- compression: Option<Compression>,
- bias: f64,
- endian: Endian,
- codes: VecDeque<u8>,
- eof: bool,
-}
-
-impl Debug for Cases {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "Cases")
- }
-}
-
-impl Cases {
- fn new<R>(reader: R, var_types: Vec<VarType>, header: &HeaderRecord<RawString>) -> Self
- where
- R: Read + Seek + 'static,
- {
- Self {
- reader: if header.compression == Some(Compression::ZLib) {
- Box::new(ZlibDecodeMultiple::new(reader))
- } else {
- Box::new(reader)
- },
- var_types,
- compression: header.compression,
- bias: header.bias,
- endian: header.endian,
- codes: VecDeque::with_capacity(8),
- eof: false,
- }
- }
-}
-
-impl Iterator for Cases {
- type Item = Result<Vec<RawValue>, Error>;
-
- fn next(&mut self) -> Option<Self::Item> {
- if self.eof {
- return None;
- }
-
- let retval = if self.compression.is_some() {
- Value::read_compressed_case(
- &mut self.reader,
- &self.var_types,
- &mut self.codes,
- self.endian,
- self.bias,
- )
- .transpose()
- } else {
- Value::read_case(&mut self.reader, &self.var_types, self.endian).transpose()
- };
- self.eof = matches!(retval, None | Some(Err(_)));
- retval
- }
-}
-
-#[derive(Copy, Clone, PartialEq, Eq, Hash)]
-pub struct Spec(pub u32);
-
-impl Debug for Spec {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let type_ = format_name(self.0 >> 16);
- let w = (self.0 >> 8) & 0xff;
- let d = self.0 & 0xff;
- write!(f, "{:06x} ({type_}{w}.{d})", self.0)
- }
-}
-
-fn format_name(type_: u32) -> Cow<'static, str> {
- match type_ {
- 1 => "A",
- 2 => "AHEX",
- 3 => "COMMA",
- 4 => "DOLLAR",
- 5 => "F",
- 6 => "IB",
- 7 => "PIBHEX",
- 8 => "P",
- 9 => "PIB",
- 10 => "PK",
- 11 => "RB",
- 12 => "RBHEX",
- 15 => "Z",
- 16 => "N",
- 17 => "E",
- 20 => "DATE",
- 21 => "TIME",
- 22 => "DATETIME",
- 23 => "ADATE",
- 24 => "JDATE",
- 25 => "DTIME",
- 26 => "WKDAY",
- 27 => "MONTH",
- 28 => "MOYR",
- 29 => "QYR",
- 30 => "WKYR",
- 31 => "PCT",
- 32 => "DOT",
- 33 => "CCA",
- 34 => "CCB",
- 35 => "CCC",
- 36 => "CCD",
- 37 => "CCE",
- 38 => "EDATE",
- 39 => "SDATE",
- 40 => "MTIME",
- 41 => "YMDHMS",
- _ => return format!("<unknown format {type_}>").into(),
- }
- .into()
-}
-
-#[derive(Clone)]
-pub struct MissingValues<S = String>
-where
- S: Debug,
-{
- /// Individual missing values, up to 3 of them.
- pub values: Vec<Value<S>>,
-
- /// Optional range of missing values.
- pub range: Option<(Value<S>, Value<S>)>,
-}
-
-impl<S> Debug for MissingValues<S>
-where
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- for (i, value) in self.values.iter().enumerate() {
- if i > 0 {
- write!(f, ", ")?;
- }
- write!(f, "{value:?}")?;
- }
-
- if let Some((low, high)) = &self.range {
- if !self.values.is_empty() {
- write!(f, ", ")?;
- }
- write!(f, "{low:?} THRU {high:?}")?;
- }
-
- if self.is_empty() {
- write!(f, "none")?;
- }
-
- Ok(())
- }
-}
-
-impl<S> MissingValues<S>
-where
- S: Debug,
-{
- fn is_empty(&self) -> bool {
- self.values.is_empty() && self.range.is_none()
- }
-}
-
-impl<S> Default for MissingValues<S>
-where
- S: Debug,
-{
- fn default() -> Self {
- Self {
- values: Vec::new(),
- range: None,
- }
- }
-}
-
-impl MissingValues<RawStr<8>> {
- fn read<R: Read + Seek>(
- r: &mut R,
- offset: u64,
- width: i32,
- code: i32,
- endian: Endian,
- ) -> Result<Self, Error> {
- let (n_values, has_range) = match (width, code) {
- (_, 0..=3) => (code, false),
- (0, -2) => (0, true),
- (0, -3) => (1, true),
- (0, _) => return Err(Error::BadNumericMissingValueCode { offset, code }),
- (_, _) => return Err(Error::BadStringMissingValueCode { offset, code }),
- };
-
- let var_type = if width == 0 {
- VarType::Numeric
- } else {
- VarType::String
- };
-
- let mut values = Vec::new();
- for _ in 0..n_values {
- values.push(RawValue::read(r, var_type, endian)?);
- }
- let range = if has_range {
- let low = RawValue::read(r, var_type, endian)?;
- let high = RawValue::read(r, var_type, endian)?;
- Some((low, high))
- } else {
- None
- };
- Ok(Self { values, range })
- }
- fn decode(&self, decoder: &Decoder) -> MissingValues<String> {
- MissingValues {
- values: self
- .values
- .iter()
- .map(|value| value.decode(decoder))
- .collect(),
- range: self
- .range
- .as_ref()
- .map(|(low, high)| (low.decode(decoder), high.decode(decoder))),
- }
- }
-}
-
-#[derive(Clone)]
-pub struct VariableRecord<S, V>
-where
- S: Debug,
- V: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// Variable width, in the range -1..=255.
- pub width: i32,
-
- /// Variable name, padded on the right with spaces.
- pub name: S,
-
- /// Print format.
- pub print_format: Spec,
-
- /// Write format.
- pub write_format: Spec,
-
- /// Missing values.
- pub missing_values: MissingValues<V>,
-
- /// Optional variable label.
- pub label: Option<S>,
-}
-
-impl<S, V> Debug for VariableRecord<S, V>
-where
- S: Debug,
- V: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(
- f,
- "Width: {} ({})",
- self.width,
- match self.width.cmp(&0) {
- Ordering::Greater => "string",
- Ordering::Equal => "numeric",
- Ordering::Less => "long string continuation record",
- }
- )?;
- writeln!(f, "Print format: {:?}", self.print_format)?;
- writeln!(f, "Write format: {:?}", self.write_format)?;
- writeln!(f, "Name: {:?}", &self.name)?;
- writeln!(f, "Variable label: {:?}", self.label)?;
- writeln!(f, "Missing values: {:?}", self.missing_values)
- }
-}
-
-impl VariableRecord<RawString, RawStr<8>> {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let width: i32 = endian.parse(read_bytes(r)?);
- if !(-1..=255).contains(&width) {
- return Err(Error::BadVariableWidth {
- start_offset,
- width,
- });
- }
- let code_offset = r.stream_position()?;
- let has_variable_label: u32 = endian.parse(read_bytes(r)?);
- let missing_value_code: i32 = endian.parse(read_bytes(r)?);
- let print_format = Spec(endian.parse(read_bytes(r)?));
- let write_format = Spec(endian.parse(read_bytes(r)?));
- let name = RawString(read_vec(r, 8)?);
-
- let label = match has_variable_label {
- 0 => None,
- 1 => {
- let len: u32 = endian.parse(read_bytes(r)?);
- let read_len = len.min(65535) as usize;
- let label = RawString(read_vec(r, read_len)?);
-
- let padding_bytes = Integer::next_multiple_of(&len, &4) - len;
- let _ = read_vec(r, padding_bytes as usize)?;
-
- Some(label)
- }
- _ => {
- return Err(Error::BadVariableLabelCode {
- start_offset,
- code_offset,
- code: has_variable_label,
- })
- }
- };
-
- let missing_values =
- MissingValues::read(r, start_offset, width, missing_value_code, endian)?;
-
- let end_offset = r.stream_position()?;
-
- Ok(Record::Variable(VariableRecord {
- offsets: start_offset..end_offset,
- width,
- name,
- print_format,
- write_format,
- missing_values,
- label,
- }))
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- DecodedRecord::Variable(VariableRecord {
- offsets: self.offsets.clone(),
- width: self.width,
- name: decoder.decode(&self.name).to_string(),
- print_format: self.print_format,
- write_format: self.write_format,
- missing_values: self.missing_values.decode(decoder),
- label: self
- .label
- .as_ref()
- .map(|label| decoder.decode(label).to_string()),
- })
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct UntypedValue(pub [u8; 8]);
-
-impl Debug for UntypedValue {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- let little: f64 = Endian::Little.parse(self.0);
- let little = format!("{:?}", little);
- let big: f64 = Endian::Big.parse(self.0);
- let big = format!("{:?}", big);
- let number = if little.len() <= big.len() {
- little
- } else {
- big
- };
- write!(f, "{number}")?;
-
- let string = default_decode(&self.0);
- let string = string
- .split(|c: char| c == '\0' || c.is_control())
- .next()
- .unwrap();
- write!(f, "{string:?}")?;
- Ok(())
- }
-}
-
-#[derive(Clone)]
-pub struct RawString(pub Vec<u8>);
-
-impl From<Vec<u8>> for RawString {
- fn from(source: Vec<u8>) -> Self {
- Self(source)
- }
-}
-
-impl From<&[u8]> for RawString {
- fn from(source: &[u8]) -> Self {
- Self(source.into())
- }
-}
-
-impl Debug for RawString {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(self.0.as_slice()))
- }
-}
-
-#[derive(Copy, Clone)]
-pub struct RawStr<const N: usize>(pub [u8; N]);
-
-impl<const N: usize> From<[u8; N]> for RawStr<N> {
- fn from(source: [u8; N]) -> Self {
- Self(source)
- }
-}
-
-impl<const N: usize> Debug for RawStr<N> {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- write!(f, "{:?}", default_decode(&self.0))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ValueLabel<V, S>
-where
- V: Debug,
- S: Debug,
-{
- pub value: Value<V>,
- pub label: S,
-}
-
-#[derive(Clone)]
-pub struct ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Range of offsets in file.
- pub offsets: Range<u64>,
-
- /// The labels.
- pub labels: Vec<ValueLabel<V, S>>,
-
- /// The 1-based indexes of the variable indexes.
- pub dict_indexes: Vec<u32>,
-
- /// The types of the variables.
- pub var_type: VarType,
-}
-
-impl<V, S> Debug for ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- writeln!(f, "labels: ")?;
- for label in self.labels.iter() {
- writeln!(f, "{label:?}")?;
- }
- write!(f, "apply to {} variables", self.var_type)?;
- for dict_index in self.dict_indexes.iter() {
- write!(f, " #{dict_index}")?;
- }
- Ok(())
- }
-}
-
-impl<V, S> Header for ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- fn offsets(&self) -> Range<u64> {
- self.offsets.clone()
- }
-}
-
-impl<V, S> ValueLabelRecord<V, S>
-where
- V: Debug,
- S: Debug,
-{
- /// Maximum number of value labels in a record.
- pub const MAX_LABELS: u32 = u32::MAX / 8;
-
- /// Maximum number of variable indexes in a record.
- pub const MAX_INDEXES: u32 = u32::MAX / 8;
-}
-
-impl ValueLabelRecord<RawStr<8>, RawString> {
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- var_types: &[VarType],
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error> {
- let label_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_LABELS {
- return Err(Error::BadNumberOfValueLabels {
- offset: label_offset,
- n,
- max: Self::MAX_LABELS,
- });
- }
-
- let mut labels = Vec::new();
- for _ in 0..n {
- let value = UntypedValue(read_bytes(r)?);
- let label_len: u8 = endian.parse(read_bytes(r)?);
- let label_len = label_len as usize;
- let padded_len = Integer::next_multiple_of(&(label_len + 1), &8);
-
- let mut label = read_vec(r, padded_len - 1)?;
- label.truncate(label_len);
- labels.push((value, RawString(label)));
- }
-
- let index_offset = r.stream_position()?;
- let rec_type: u32 = endian.parse(read_bytes(r)?);
- if rec_type != 4 {
- return Err(Error::ExpectedVarIndexRecord {
- offset: index_offset,
- rec_type,
- });
- }
-
- let n: u32 = endian.parse(read_bytes(r)?);
- if n > Self::MAX_INDEXES {
- return Err(Error::TooManyVarIndexes {
- offset: index_offset,
- n,
- max: Self::MAX_INDEXES,
- });
- } else if n == 0 {
- warn(Warning::NoVarIndexes {
- offset: index_offset,
- });
- return Ok(None);
- }
-
- let index_offset = r.stream_position()?;
- let mut dict_indexes = Vec::with_capacity(n as usize);
- let mut invalid_indexes = Vec::new();
- for _ in 0..n {
- let index: u32 = endian.parse(read_bytes(r)?);
- if index == 0 || index as usize > var_types.len() {
- dict_indexes.push(index);
- } else {
- invalid_indexes.push(index);
- }
- }
- if !invalid_indexes.is_empty() {
- warn(Warning::InvalidVarIndexes {
- offset: index_offset,
- max: var_types.len(),
- invalid: invalid_indexes,
- });
- }
-
- let Some(&first_index) = dict_indexes.first() else {
- return Ok(None);
- };
- let var_type = var_types[first_index as usize - 1];
- let mut wrong_type_indexes = Vec::new();
- dict_indexes.retain(|&index| {
- if var_types[index as usize - 1] != var_type {
- wrong_type_indexes.push(index);
- false
- } else {
- true
- }
- });
- if !wrong_type_indexes.is_empty() {
- warn(Warning::MixedVarTypes {
- offset: index_offset,
- var_type,
- wrong_types: wrong_type_indexes,
- });
- }
-
- let labels = labels
- .into_iter()
- .map(|(value, label)| ValueLabel {
- value: Value::from_raw(&value, var_type, endian),
- label,
- })
- .collect();
-
- let end_offset = r.stream_position()?;
- Ok(Some(Record::ValueLabel(ValueLabelRecord {
- offsets: label_offset..end_offset,
- labels,
- dict_indexes,
- var_type,
- })))
- }
-
- fn decode(self, decoder: &Decoder) -> ValueLabelRecord<RawStr<8>, String> {
- let labels = self
- .labels
- .iter()
- .map(|ValueLabel { value, label }| ValueLabel {
- value: *value,
- label: decoder.decode(label).to_string(),
- })
- .collect();
- ValueLabelRecord {
- offsets: self.offsets.clone(),
- labels,
- dict_indexes: self.dict_indexes.clone(),
- var_type: self.var_type,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct DocumentRecord<S>
-where
- S: Debug,
-{
- pub offsets: Range<u64>,
-
- /// The document, as an array of lines. Raw lines are exactly 80 bytes long
- /// and are right-padded with spaces without any new-line termination.
- pub lines: Vec<S>,
-}
-
-pub type RawDocumentLine = RawStr<DOC_LINE_LEN>;
-
-/// Length of a line in a document. Document lines are fixed-length and
-/// padded on the right with spaces.
-pub const DOC_LINE_LEN: usize = 80;
-
-impl DocumentRecord<RawDocumentLine> {
- /// Maximum number of lines we will accept in a document. This is simply
- /// the maximum number that will fit in a 32-bit space.
- pub const MAX_LINES: usize = i32::MAX as usize / DOC_LINE_LEN;
-
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<Record, Error> {
- let start_offset = r.stream_position()?;
- let n: u32 = endian.parse(read_bytes(r)?);
- let n = n as usize;
- if n > Self::MAX_LINES {
- Err(Error::BadDocumentLength {
- offset: start_offset,
- n,
- max: Self::MAX_LINES,
- })
- } else {
- let mut lines = Vec::with_capacity(n);
- for _ in 0..n {
- lines.push(RawStr(read_bytes(r)?));
- }
- let end_offset = r.stream_position()?;
- Ok(Record::Document(DocumentRecord {
- offsets: start_offset..end_offset,
- lines,
- }))
- }
- }
-
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- DecodedRecord::Document(DocumentRecord {
- offsets: self.offsets.clone(),
- lines: self
- .lines
- .iter()
- .map(|s| decoder.decode_slice(&s.0).to_string())
- .collect(),
- })
- }
-}
-
-impl<S> Header for DocumentRecord<S>
-where
- S: Debug,
-{
- fn offsets(&self) -> Range<u64> {
- self.offsets.clone()
- }
-}
-
-trait ExtensionRecord {
- const SUBTYPE: u32;
- const SIZE: Option<u32>;
- const COUNT: Option<u32>;
- const NAME: &'static str;
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning>;
-}
-
-#[derive(Clone, Debug)]
-pub struct IntegerInfoRecord {
- pub offsets: Range<u64>,
- pub version: (i32, i32, i32),
- pub machine_code: i32,
- pub floating_point_rep: i32,
- pub compression_code: i32,
- pub endianness: i32,
- pub character_code: i32,
-}
-
-impl ExtensionRecord for IntegerInfoRecord {
- const SUBTYPE: u32 = 3;
- const SIZE: Option<u32> = Some(4);
- const COUNT: Option<u32> = Some(8);
- const NAME: &'static str = "integer record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let data: Vec<i32> = (0..8)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::IntegerInfo(IntegerInfoRecord {
- offsets: ext.offsets.clone(),
- version: (data[0], data[1], data[2]),
- machine_code: data[3],
- floating_point_rep: data[4],
- compression_code: data[5],
- endianness: data[6],
- character_code: data[7],
- }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct FloatInfoRecord {
- pub sysmis: f64,
- pub highest: f64,
- pub lowest: f64,
-}
-
-impl ExtensionRecord for FloatInfoRecord {
- const SUBTYPE: u32 = 4;
- const SIZE: Option<u32> = Some(8);
- const COUNT: Option<u32> = Some(3);
- const NAME: &'static str = "floating point record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let data: Vec<f64> = (0..3)
- .map(|_| endian.parse(read_bytes(&mut input).unwrap()))
- .collect();
- Ok(Record::FloatInfo(FloatInfoRecord {
- sysmis: data[0],
- highest: data[1],
- lowest: data[2],
- }))
- }
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum CategoryLabels {
- VarLabels,
- CountedValues,
-}
-
-#[derive(Clone, Debug)]
-pub enum MultipleResponseType {
- MultipleDichotomy {
- value: RawString,
- labels: CategoryLabels,
- },
- MultipleCategory,
-}
-
-impl MultipleResponseType {
- fn parse(input: &[u8]) -> Result<(MultipleResponseType, &[u8]), Warning> {
- let (mr_type, input) = match input.split_first() {
- Some((b'C', input)) => (MultipleResponseType::MultipleCategory, input),
- Some((b'D', input)) => {
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy {
- value,
- labels: CategoryLabels::VarLabels,
- },
- input,
- )
- }
- Some((b'E', input)) => {
- let (labels, input) = if let Some(rest) = input.strip_prefix(b" 1 ") {
- (CategoryLabels::CountedValues, rest)
- } else if let Some(rest) = input.strip_prefix(b" 11 ") {
- (CategoryLabels::VarLabels, rest)
- } else {
- return Err(Warning::TBD);
- };
- let (value, input) = parse_counted_string(input)?;
- (
- MultipleResponseType::MultipleDichotomy { value, labels },
- input,
- )
- }
- _ => return Err(Warning::TBD),
- };
- Ok((mr_type, input))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseSet<I, S>
-where
- I: Debug,
- S: Debug,
-{
- pub name: I,
- pub label: S,
- pub mr_type: MultipleResponseType,
- pub short_names: Vec<I>,
-}
-
-impl MultipleResponseSet<RawString, RawString> {
- fn parse(input: &[u8]) -> Result<(Self, &[u8]), Warning> {
- let Some(equals) = input.iter().position(|&b| b == b'=') else {
- return Err(Warning::TBD);
- };
- let (name, input) = input.split_at(equals);
- let (mr_type, input) = MultipleResponseType::parse(input)?;
- let Some(input) = input.strip_prefix(b" ") else {
- return Err(Warning::TBD);
- };
- let (label, mut input) = parse_counted_string(input)?;
- let mut vars = Vec::new();
- while input.first() != Some(&b'\n') {
- match input.split_first() {
- Some((b' ', rest)) => {
- let Some(length) = rest.iter().position(|b| b" \n".contains(b)) else {
- return Err(Warning::TBD);
- };
- let (var, rest) = rest.split_at(length);
- if !var.is_empty() {
- vars.push(var.into());
- }
- input = rest;
- }
- _ => return Err(Warning::TBD),
- }
- }
- while input.first() == Some(&b'\n') {
- input = &input[1..];
- }
- Ok((
- MultipleResponseSet {
- name: name.into(),
- label,
- mr_type,
- short_names: vars,
- },
- input,
- ))
- }
-
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<MultipleResponseSet<Identifier, String>, Warning> {
- let mut short_names = Vec::with_capacity(self.short_names.len());
- for short_name in self.short_names.iter() {
- if let Some(short_name) = decoder
- .decode_identifier(short_name)
- .map_err(Warning::InvalidMrSetName)
- .issue_warning(&decoder.warn)
- {
- short_names.push(short_name);
- }
- }
- Ok(MultipleResponseSet {
- name: decoder
- .decode_identifier(&self.name)
- .map_err(Warning::InvalidMrSetVariableName)?,
- label: decoder.decode(&self.label).to_string(),
- mr_type: self.mr_type.clone(),
- short_names,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct MultipleResponseRecord<I, S>(pub Vec<MultipleResponseSet<I, S>>)
-where
- I: Debug,
- S: Debug;
-
-impl ExtensionRecord for MultipleResponseRecord<RawString, RawString> {
- const SUBTYPE: u32 = 7;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "multiple response set record";
-
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut sets = Vec::new();
- while !input.is_empty() {
- let (set, rest) = MultipleResponseSet::parse(input)?;
- sets.push(set);
- input = rest;
- }
- Ok(Record::MultipleResponse(MultipleResponseRecord(sets)))
- }
-}
-
-impl MultipleResponseRecord<RawString, RawString> {
- fn decode(self, decoder: &Decoder) -> DecodedRecord {
- let mut sets = Vec::new();
- for set in self.0.iter() {
- if let Some(set) = set.decode(decoder).issue_warning(&decoder.warn) {
- sets.push(set);
- }
- }
- DecodedRecord::MultipleResponse(MultipleResponseRecord(sets))
- }
-}
-
-fn parse_counted_string(input: &[u8]) -> Result<(RawString, &[u8]), Warning> {
- let Some(space) = input.iter().position(|&b| b == b' ') else {
- return Err(Warning::TBD);
- };
- let Ok(length) = from_utf8(&input[..space]) else {
- return Err(Warning::TBD);
- };
- let Ok(length): Result<usize, _> = length.parse() else {
- return Err(Warning::TBD);
- };
-
- let input = &input[space + 1..];
- if input.len() < length {
- return Err(Warning::TBD);
- };
-
- let (string, rest) = input.split_at(length);
- Ok((string.into(), rest))
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Measure {
- Nominal,
- Ordinal,
- Scale,
-}
-
-impl Measure {
- pub fn default_for_type(var_type: VarType) -> Option<Measure> {
- match var_type {
- VarType::Numeric => None,
- VarType::String => Some(Self::Nominal),
- }
- }
-
- fn try_decode(source: u32) -> Result<Option<Measure>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Measure::Nominal)),
- 2 => Ok(Some(Measure::Ordinal)),
- 3 => Ok(Some(Measure::Scale)),
- _ => Err(Warning::InvalidMeasurement(source)),
- }
- }
-}
-
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Alignment {
- Left,
- Right,
- Center,
-}
-
-impl Alignment {
- fn try_decode(source: u32) -> Result<Option<Alignment>, Warning> {
- match source {
- 0 => Ok(None),
- 1 => Ok(Some(Alignment::Left)),
- 2 => Ok(Some(Alignment::Right)),
- 3 => Ok(Some(Alignment::Center)),
- _ => Err(Warning::InvalidAlignment(source)),
- }
- }
-
- pub fn default_for_type(var_type: VarType) -> Self {
- match var_type {
- VarType::Numeric => Self::Right,
- VarType::String => Self::Left,
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplay {
- pub measure: Option<Measure>,
- pub width: Option<u32>,
- pub alignment: Option<Alignment>,
-}
-
-#[derive(Clone, Debug)]
-pub struct VarDisplayRecord(pub Vec<VarDisplay>);
-
-impl VarDisplayRecord {
- const SUBTYPE: u32 = 11;
-
- fn parse(
- ext: &Extension,
- n_vars: usize,
- endian: Endian,
- warn: &dyn Fn(Warning),
- ) -> Result<Record, Warning> {
- if ext.size != 4 {
- return Err(Warning::BadRecordSize {
- offset: ext.offsets.start,
- record: String::from("variable display record"),
- size: ext.size,
- expected_size: 4,
- });
- }
-
- let has_width = if ext.count as usize == 3 * n_vars {
- true
- } else if ext.count as usize == 2 * n_vars {
- false
- } else {
- return Err(Warning::TBD);
- };
-
- let mut var_displays = Vec::new();
- let mut input = &ext.data[..];
- for _ in 0..n_vars {
- let measure = Measure::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(&warn)
- .flatten();
- let width = has_width.then(|| endian.parse(read_bytes(&mut input).unwrap()));
- let alignment = Alignment::try_decode(endian.parse(read_bytes(&mut input).unwrap()))
- .issue_warning(&warn)
- .flatten();
- var_displays.push(VarDisplay {
- measure,
- width,
- alignment,
- });
- }
- Ok(Record::VarDisplay(VarDisplayRecord(var_displays)))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValues<N, V>
-where
- N: Debug,
- V: Debug,
-{
- /// Variable name.
- pub var_name: N,
-
- /// Missing values.
- pub missing_values: MissingValues<V>,
-}
-
-impl LongStringMissingValues<RawString, RawStr<8>> {
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<LongStringMissingValues<Identifier, String>, IdError> {
- Ok(LongStringMissingValues {
- var_name: decoder.decode_identifier(&self.var_name)?,
- missing_values: self.missing_values.decode(decoder),
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringMissingValueRecord<N, V>(pub Vec<LongStringMissingValues<N, V>>)
-where
- N: Debug,
- V: Debug;
-
-impl ExtensionRecord for LongStringMissingValueRecord<RawString, RawStr<8>> {
- const SUBTYPE: u32 = 22;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "long string missing values record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut missing_value_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let n_missing_values: u8 = endian.parse(read_bytes(&mut input)?);
- let value_len: u32 = endian.parse(read_bytes(&mut input)?);
- if value_len != 8 {
- let offset = (ext.data.len() - input.len() - 8) as u64 + ext.offsets.start;
- return Err(Warning::BadLongMissingValueLength {
- record_offset: ext.offsets.start,
- offset,
- value_len,
- });
- }
- let mut values = Vec::new();
- for i in 0..n_missing_values {
- let value: [u8; 8] = read_bytes(&mut input)?;
- let numeric_value: u64 = endian.parse(value);
- let value = if i > 0 && numeric_value == 8 {
- // Tolerate files written by old, buggy versions of PSPP
- // where we believed that the value_length was repeated
- // before each missing value.
- read_bytes(&mut input)?
- } else {
- value
- };
- values.push(Value::String(RawStr(value)));
- }
- let missing_values = MissingValues {
- values,
- range: None,
- };
- missing_value_set.push(LongStringMissingValues {
- var_name,
- missing_values,
- });
- }
- Ok(Record::LongStringMissingValues(
- LongStringMissingValueRecord(missing_value_set),
- ))
- }
-}
-
-impl LongStringMissingValueRecord<RawString, RawStr<8>> {
- pub fn decode(self, decoder: &Decoder) -> LongStringMissingValueRecord<Identifier, String> {
- let mut mvs = Vec::with_capacity(self.0.len());
- for mv in self.0.iter() {
- if let Some(mv) = mv
- .decode(decoder)
- .map_err(Warning::InvalidLongStringMissingValueVariableName)
- .issue_warning(&decoder.warn)
- {
- mvs.push(mv);
- }
- }
- LongStringMissingValueRecord(mvs)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct EncodingRecord(pub String);
-
-impl ExtensionRecord for EncodingRecord {
- const SUBTYPE: u32 = 20;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "encoding record";
-
- fn parse(ext: &Extension, _endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- Ok(Record::Encoding(EncodingRecord(
- String::from_utf8(ext.data.clone()).map_err(|_| Warning::BadEncodingName {
- offset: ext.offsets.start,
- })?,
- )))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct NumberOfCasesRecord {
- /// Always observed as 1.
- pub one: u64,
-
- /// Number of cases.
- pub n_cases: u64,
-}
-
-impl ExtensionRecord for NumberOfCasesRecord {
- const SUBTYPE: u32 = 16;
- const SIZE: Option<u32> = Some(8);
- const COUNT: Option<u32> = Some(2);
- const NAME: &'static str = "extended number of cases record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let one = endian.parse(read_bytes(&mut input)?);
- let n_cases = endian.parse(read_bytes(&mut input)?);
-
- Ok(Record::NumberOfCases(NumberOfCasesRecord { one, n_cases }))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct TextRecord {
- pub offsets: Range<u64>,
-
- /// Type of record.
- pub rec_type: TextRecordType,
-
- /// The text content of the record.
- pub text: RawString,
-}
-
-#[derive(Clone, Copy, Debug)]
-pub enum TextRecordType {
- VariableSets,
- ProductInfo,
- LongNames,
- VeryLongStrings,
- FileAttributes,
- VariableAttributes,
-}
-
-impl TextRecord {
- fn new(extension: Extension, rec_type: TextRecordType) -> Self {
- Self {
- offsets: extension.offsets,
- rec_type,
- text: extension.data.into(),
- }
- }
- pub fn decode(self, decoder: &Decoder) -> DecodedRecord {
- match self.rec_type {
- TextRecordType::VariableSets => {
- DecodedRecord::VariableSets(VariableSetRecord::decode(&self, decoder))
- }
- TextRecordType::ProductInfo => {
- DecodedRecord::ProductInfo(ProductInfoRecord::decode(&self, decoder))
- }
- TextRecordType::LongNames => {
- DecodedRecord::LongNames(LongNamesRecord::decode(&self, decoder))
- }
- TextRecordType::VeryLongStrings => {
- DecodedRecord::VeryLongStrings(VeryLongStringsRecord::decode(&self, decoder))
- }
- TextRecordType::FileAttributes => {
- DecodedRecord::FileAttributes(FileAttributeRecord::decode(&self, decoder))
- }
- TextRecordType::VariableAttributes => {
- DecodedRecord::VariableAttributes(VariableAttributeRecord::decode(&self, decoder))
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongString {
- pub short_name: Identifier,
- pub length: u16,
-}
-
-impl VeryLongString {
- fn parse(decoder: &Decoder, input: &str) -> Result<VeryLongString, Warning> {
- let Some((short_name, length)) = input.split_once('=') else {
- return Err(Warning::TBD);
- };
- let short_name = decoder
- .new_identifier(short_name)
- .map_err(Warning::InvalidLongStringName)?;
- let length = length.parse().map_err(|_| Warning::TBD)?;
- Ok(VeryLongString { short_name, length })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VeryLongStringsRecord(Vec<VeryLongString>);
-
-impl VeryLongStringsRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- let mut very_long_strings = Vec::new();
- for tuple in input
- .split('\0')
- .map(|s| s.trim_end_matches('\t'))
- .filter(|s| !s.is_empty())
- {
- if let Some(vls) = VeryLongString::parse(decoder, tuple).issue_warning(&decoder.warn) {
- very_long_strings.push(vls)
- }
- }
- VeryLongStringsRecord(very_long_strings)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Attribute {
- pub name: Identifier,
- pub values: Vec<String>,
-}
-
-impl Attribute {
- fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(Attribute, &'a str), Warning> {
- let Some((name, mut input)) = input.split_once('(') else {
- return Err(Warning::TBD);
- };
- let name = decoder
- .new_identifier(name)
- .map_err(Warning::InvalidAttributeName)?;
- let mut values = Vec::new();
- loop {
- let Some((value, rest)) = input.split_once('\n') else {
- return Err(Warning::TBD);
- };
- if let Some(stripped) = value
- .strip_prefix('\'')
- .and_then(|value| value.strip_suffix('\''))
- {
- values.push(stripped.into());
- } else {
- decoder.warn(Warning::TBD);
- values.push(value.into());
- }
- if let Some(rest) = rest.strip_prefix(')') {
- let attribute = Attribute { name, values };
- return Ok((attribute, rest));
- };
- input = rest;
- }
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct AttributeSet(pub HashMap<Identifier, Vec<String>>);
-
-impl AttributeSet {
- fn parse<'a>(
- decoder: &Decoder,
- mut input: &'a str,
- sentinel: Option<char>,
- ) -> Result<(AttributeSet, &'a str), Warning> {
- let mut attributes = HashMap::new();
- let rest = loop {
- match input.chars().next() {
- None => break input,
- c if c == sentinel => break &input[1..],
- _ => {
- let (attribute, rest) = Attribute::parse(decoder, input)?;
- // XXX report duplicate name
- attributes.insert(attribute.name, attribute.values);
- input = rest;
- }
- }
- };
- Ok((AttributeSet(attributes), rest))
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct FileAttributeRecord(pub AttributeSet);
-
-impl FileAttributeRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- match AttributeSet::parse(decoder, &input, None).issue_warning(&decoder.warn) {
- Some((set, rest)) => {
- if !rest.is_empty() {
- decoder.warn(Warning::TBD);
- }
- FileAttributeRecord(set)
- }
- None => FileAttributeRecord::default(),
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VarAttributeSet {
- pub long_var_name: Identifier,
- pub attributes: AttributeSet,
-}
-
-impl VarAttributeSet {
- fn parse<'a>(decoder: &Decoder, input: &'a str) -> Result<(VarAttributeSet, &'a str), Warning> {
- let Some((long_var_name, rest)) = input.split_once(':') else {
- return Err(Warning::TBD);
- };
- let long_var_name = decoder
- .new_identifier(long_var_name)
- .map_err(Warning::InvalidAttributeVariableName)?;
- let (attributes, rest) = AttributeSet::parse(decoder, rest, Some('/'))?;
- let var_attribute = VarAttributeSet {
- long_var_name,
- attributes,
- };
- Ok((var_attribute, rest))
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableAttributeRecord(Vec<VarAttributeSet>);
-
-impl VariableAttributeRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let decoded = decoder.decode(&source.text);
- let mut input = decoded.as_ref();
- let mut var_attribute_sets = Vec::new();
- while !input.is_empty() {
- let Some((var_attribute, rest)) =
- VarAttributeSet::parse(decoder, input).issue_warning(&decoder.warn)
- else {
- break;
- };
- var_attribute_sets.push(var_attribute);
- input = rest;
- }
- VariableAttributeRecord(var_attribute_sets)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongName {
- pub short_name: Identifier,
- pub long_name: Identifier,
-}
-
-impl LongName {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let Some((short_name, long_name)) = input.split_once('=') else {
- return Err(Warning::TBD);
- };
- let short_name = decoder
- .new_identifier(short_name)
- .map_err(Warning::InvalidShortName)?;
- let long_name = decoder
- .new_identifier(long_name)
- .map_err(Warning::InvalidLongName)?;
- Ok(LongName {
- short_name,
- long_name,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongNamesRecord(Vec<LongName>);
-
-impl LongNamesRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- let input = decoder.decode(&source.text);
- let mut names = Vec::new();
- for pair in input.split('\t').filter(|s| !s.is_empty()) {
- if let Some(long_name) = LongName::parse(pair, decoder).issue_warning(&decoder.warn) {
- names.push(long_name);
- }
- }
- LongNamesRecord(names)
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ProductInfoRecord(pub String);
-
-impl ProductInfoRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> Self {
- Self(decoder.decode(&source.text).into())
- }
-}
-#[derive(Clone, Debug)]
-pub struct VariableSet {
- pub name: String,
- pub vars: Vec<Identifier>,
-}
-
-impl VariableSet {
- fn parse(input: &str, decoder: &Decoder) -> Result<Self, Warning> {
- let (name, input) = input.split_once('=').ok_or(Warning::TBD)?;
- let mut vars = Vec::new();
- for var in input.split_ascii_whitespace() {
- if let Some(identifier) = decoder
- .new_identifier(var)
- .map_err(Warning::InvalidVariableSetName)
- .issue_warning(&decoder.warn)
- {
- vars.push(identifier);
- }
- }
- Ok(VariableSet {
- name: name.into(),
- vars,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct VariableSetRecord {
- pub offsets: Range<u64>,
- pub sets: Vec<VariableSet>,
-}
-
-impl VariableSetRecord {
- fn decode(source: &TextRecord, decoder: &Decoder) -> VariableSetRecord {
- let mut sets = Vec::new();
- let input = decoder.decode(&source.text);
- for line in input.lines() {
- if let Some(set) = VariableSet::parse(line, decoder).issue_warning(&decoder.warn) {
- sets.push(set)
- }
- }
- VariableSetRecord {
- offsets: source.offsets.clone(),
- sets,
- }
- }
-}
-
-trait IssueWarning<T> {
- fn issue_warning<F>(self, warn: &F) -> Option<T>
- where
- F: Fn(Warning);
-}
-impl<T> IssueWarning<T> for Result<T, Warning> {
- fn issue_warning<F>(self, warn: &F) -> Option<T>
- where
- F: Fn(Warning),
- {
- match self {
- Ok(result) => Some(result),
- Err(error) => {
- warn(error);
- None
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct Extension {
- pub offsets: Range<u64>,
-
- /// Record subtype.
- pub subtype: u32,
-
- /// Size of each data element.
- pub size: u32,
-
- /// Number of data elements.
- pub count: u32,
-
- /// `size * count` bytes of data.
- pub data: Vec<u8>,
-}
-
-impl Extension {
- fn check_size<E: ExtensionRecord>(&self) -> Result<(), Warning> {
- if let Some(expected_size) = E::SIZE {
- if self.size != expected_size {
- return Err(Warning::BadRecordSize {
- offset: self.offsets.start,
- record: E::NAME.into(),
- size: self.size,
- expected_size,
- });
- }
- }
- if let Some(expected_count) = E::COUNT {
- if self.count != expected_count {
- return Err(Warning::BadRecordCount {
- offset: self.offsets.start,
- record: E::NAME.into(),
- count: self.count,
- expected_count,
- });
- }
- }
- Ok(())
- }
-
- fn read<R: Read + Seek>(
- r: &mut R,
- endian: Endian,
- n_vars: usize,
- warn: &dyn Fn(Warning),
- ) -> Result<Option<Record>, Error> {
- let subtype = endian.parse(read_bytes(r)?);
- let header_offset = r.stream_position()?;
- let size: u32 = endian.parse(read_bytes(r)?);
- let count = endian.parse(read_bytes(r)?);
- let Some(product) = size.checked_mul(count) else {
- return Err(Error::ExtensionRecordTooLarge {
- offset: header_offset,
- subtype,
- size,
- count,
- });
- };
- let start_offset = r.stream_position()?;
- let data = read_vec(r, product as usize)?;
- let end_offset = start_offset + product as u64;
- let extension = Extension {
- offsets: start_offset..end_offset,
- subtype,
- size,
- count,
- data,
- };
- let result = match subtype {
- IntegerInfoRecord::SUBTYPE => IntegerInfoRecord::parse(&extension, endian),
- FloatInfoRecord::SUBTYPE => FloatInfoRecord::parse(&extension, endian),
- VarDisplayRecord::SUBTYPE => VarDisplayRecord::parse(&extension, n_vars, endian, warn),
- MultipleResponseRecord::SUBTYPE | 19 => {
- MultipleResponseRecord::parse(&extension, endian)
- }
- LongStringValueLabelRecord::SUBTYPE => {
- LongStringValueLabelRecord::parse(&extension, endian)
- }
- EncodingRecord::SUBTYPE => EncodingRecord::parse(&extension, endian),
- NumberOfCasesRecord::SUBTYPE => NumberOfCasesRecord::parse(&extension, endian),
- 5 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VariableSets,
- ))),
- 10 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::ProductInfo,
- ))),
- 13 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::LongNames,
- ))),
- 14 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VeryLongStrings,
- ))),
- 17 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::FileAttributes,
- ))),
- 18 => Ok(Record::Text(TextRecord::new(
- extension,
- TextRecordType::VariableAttributes,
- ))),
- _ => Ok(Record::OtherExtension(extension)),
- };
- match result {
- Ok(result) => Ok(Some(result)),
- Err(error) => {
- warn(error);
- Ok(None)
- }
- }
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZHeader {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// File offset to the ZLIB data header.
- pub zheader_offset: u64,
-
- /// File offset to the ZLIB trailer.
- pub ztrailer_offset: u64,
-
- /// Length of the ZLIB trailer in bytes.
- pub ztrailer_len: u64,
-}
-
-impl ZHeader {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZHeader, Error> {
- let offset = r.stream_position()?;
- let zheader_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_offset: u64 = endian.parse(read_bytes(r)?);
- let ztrailer_len: u64 = endian.parse(read_bytes(r)?);
-
- Ok(ZHeader {
- offset,
- zheader_offset,
- ztrailer_offset,
- ztrailer_len,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct ZTrailer {
- /// File offset to the start of the record.
- pub offset: u64,
-
- /// Compression bias as a negative integer, e.g. -100.
- pub int_bias: i64,
-
- /// Always observed as zero.
- pub zero: u64,
-
- /// Uncompressed size of each block, except possibly the last. Only
- /// `0x3ff000` has been observed so far.
- pub block_size: u32,
-
- /// Block descriptors, always `(ztrailer_len - 24) / 24)` of them.
- pub blocks: Vec<ZBlock>,
-}
-
-#[derive(Clone, Debug)]
-pub struct ZBlock {
- /// Offset of block of data if simple compression were used.
- pub uncompressed_ofs: u64,
-
- /// Actual offset within the file of the compressed data block.
- pub compressed_ofs: u64,
-
- /// The number of bytes in this data block after decompression. This is
- /// `block_size` in every data block but the last, which may be smaller.
- pub uncompressed_size: u32,
-
- /// The number of bytes in this data block, as stored compressed in this
- /// file.
- pub compressed_size: u32,
-}
-
-impl ZBlock {
- fn read<R: Read + Seek>(r: &mut R, endian: Endian) -> Result<ZBlock, Error> {
- Ok(ZBlock {
- uncompressed_ofs: endian.parse(read_bytes(r)?),
- compressed_ofs: endian.parse(read_bytes(r)?),
- uncompressed_size: endian.parse(read_bytes(r)?),
- compressed_size: endian.parse(read_bytes(r)?),
- })
- }
-}
-
-impl ZTrailer {
- fn read<R: Read + Seek>(
- reader: &mut R,
- endian: Endian,
- ztrailer_ofs: u64,
- ztrailer_len: u64,
- ) -> Result<Option<ZTrailer>, Error> {
- let start_offset = reader.stream_position()?;
- if reader.seek(SeekFrom::Start(ztrailer_ofs)).is_err() {
- return Ok(None);
- }
- let int_bias = endian.parse(read_bytes(reader)?);
- let zero = endian.parse(read_bytes(reader)?);
- let block_size = endian.parse(read_bytes(reader)?);
- let n_blocks: u32 = endian.parse(read_bytes(reader)?);
- let expected_n_blocks = (ztrailer_len - 24) / 24;
- if n_blocks as u64 != expected_n_blocks {
- return Err(Error::BadZlibTrailerNBlocks {
- offset: ztrailer_ofs,
- n_blocks,
- expected_n_blocks,
- ztrailer_len,
- });
- }
- let blocks = (0..n_blocks)
- .map(|_| ZBlock::read(reader, endian))
- .collect::<Result<Vec<_>, _>>()?;
- reader.seek(SeekFrom::Start(start_offset))?;
- Ok(Some(ZTrailer {
- offset: ztrailer_ofs,
- int_bias,
- zero,
- block_size,
- blocks,
- }))
- }
-}
-
-fn try_read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<Option<[u8; N]>, IoError> {
- let mut buf = [0; N];
- let n = r.read(&mut buf)?;
- if n > 0 {
- if n < N {
- r.read_exact(&mut buf[n..])?;
- }
- Ok(Some(buf))
- } else {
- Ok(None)
- }
-}
-
-fn read_bytes<const N: usize, R: Read>(r: &mut R) -> Result<[u8; N], IoError> {
- let mut buf = [0; N];
- r.read_exact(&mut buf)?;
- Ok(buf)
-}
-
-fn read_vec<R: Read>(r: &mut R, n: usize) -> Result<Vec<u8>, IoError> {
- let mut vec = vec![0; n];
- r.read_exact(&mut vec)?;
- Ok(vec)
-}
-
-fn read_string<R: Read>(r: &mut R, endian: Endian) -> Result<RawString, IoError> {
- let length: u32 = endian.parse(read_bytes(r)?);
- Ok(read_vec(r, length as usize)?.into())
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabels<N, S>
-where
- S: Debug,
-{
- pub var_name: N,
- pub width: u32,
-
- /// `(value, label)` pairs, where each value is `width` bytes.
- pub labels: Vec<(S, S)>,
-}
-
-impl LongStringValueLabels<RawString, RawString> {
- fn decode(
- &self,
- decoder: &Decoder,
- ) -> Result<LongStringValueLabels<Identifier, String>, Warning> {
- let var_name = decoder.decode(&self.var_name);
- let var_name = Identifier::from_encoding(var_name.trim_end(), decoder.encoding)
- .map_err(Warning::InvalidLongStringValueLabelName)?;
-
- let mut labels = Vec::with_capacity(self.labels.len());
- for (value, label) in self.labels.iter() {
- let value = decoder.decode_exact_length(&value.0).to_string();
- let label = decoder.decode(label).to_string();
- labels.push((value, label));
- }
-
- Ok(LongStringValueLabels {
- var_name,
- width: self.width,
- labels,
- })
- }
-}
-
-#[derive(Clone, Debug)]
-pub struct LongStringValueLabelRecord<N, S>(pub Vec<LongStringValueLabels<N, S>>)
-where
- N: Debug,
- S: Debug;
-
-impl ExtensionRecord for LongStringValueLabelRecord<RawString, RawString> {
- const SUBTYPE: u32 = 21;
- const SIZE: Option<u32> = Some(1);
- const COUNT: Option<u32> = None;
- const NAME: &'static str = "long string value labels record";
-
- fn parse(ext: &Extension, endian: Endian) -> Result<Record, Warning> {
- ext.check_size::<Self>()?;
-
- let mut input = &ext.data[..];
- let mut label_set = Vec::new();
- while !input.is_empty() {
- let var_name = read_string(&mut input, endian)?;
- let width: u32 = endian.parse(read_bytes(&mut input)?);
- let n_labels: u32 = endian.parse(read_bytes(&mut input)?);
- let mut labels = Vec::new();
- for _ in 0..n_labels {
- let value = read_string(&mut input, endian)?;
- let label = read_string(&mut input, endian)?;
- labels.push((value, label));
- }
- label_set.push(LongStringValueLabels {
- var_name,
- width,
- labels,
- })
- }
- Ok(Record::LongStringValueLabels(LongStringValueLabelRecord(
- label_set,
- )))
- }
-}
-
-impl LongStringValueLabelRecord<RawString, RawString> {
- fn decode(self, decoder: &Decoder) -> LongStringValueLabelRecord<Identifier, String> {
- let mut labels = Vec::with_capacity(self.0.len());
- for label in &self.0 {
- match label.decode(decoder) {
- Ok(set) => labels.push(set),
- Err(error) => decoder.warn(error),
- }
- }
- LongStringValueLabelRecord(labels)
- }
-}
+++ /dev/null
-use float_next_after::NextAfter;
-use num::{Bounded, Zero};
-use ordered_float::OrderedFloat;
-use std::{
- collections::{hash_map::Entry, HashMap},
- error::Error as StdError,
- fmt::{Display, Formatter, Result as FmtResult},
- iter::repeat,
-};
-
-use crate::endian::{Endian, ToBytes};
-
-pub type Result<T, F = Error> = std::result::Result<T, F>;
-
-#[derive(Debug)]
-pub struct Error {
- pub file_name: Option<String>,
- pub line_number: Option<usize>,
- pub token: Option<String>,
- pub message: String,
-}
-
-impl Error {
- fn new(
- file_name: Option<&str>,
- line_number: Option<usize>,
- token: Option<&str>,
- message: String,
- ) -> Error {
- Error {
- file_name: file_name.map(String::from),
- line_number,
- token: token.map(String::from),
- message,
- }
- }
-}
-
-impl StdError for Error {}
-
-impl Display for Error {
- fn fmt(&self, f: &mut Formatter) -> FmtResult {
- match (self.file_name.as_ref(), self.line_number) {
- (Some(ref file_name), Some(line_number)) => write!(f, "{file_name}:{line_number}: ")?,
- (Some(ref file_name), None) => write!(f, "{file_name}: ")?,
- (None, Some(line_number)) => write!(f, "line {line_number}: ")?,
- (None, None) => (),
- }
- if let Some(ref token) = self.token {
- write!(f, "at '{token}': ")?;
- }
- write!(f, "{}", self.message)
- }
-}
-
-pub fn sack(input: &str, input_file_name: Option<&str>, endian: Endian) -> Result<Vec<u8>> {
- let mut symbol_table = HashMap::new();
- let output = _sack(input, input_file_name, endian, &mut symbol_table)?;
- let output = if !symbol_table.is_empty() {
- for (k, v) in symbol_table.iter() {
- println!("{k} => {v:?}");
- }
- for (k, v) in symbol_table.iter() {
- if v.is_none() {
- Err(Error::new(
- input_file_name,
- None,
- None,
- format!("label {k} used but never defined"),
- ))?
- }
- }
- _sack(input, input_file_name, endian, &mut symbol_table)?
- } else {
- output
- };
- Ok(output)
-}
-
-fn _sack(
- input: &str,
- input_file_name: Option<&str>,
- endian: Endian,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<Vec<u8>> {
- let mut lexer = Lexer::new(input, input_file_name, endian)?;
- let mut output = Vec::new();
- while parse_data_item(&mut lexer, &mut output, symbol_table)? {}
- Ok(output)
-}
-
-fn parse_data_item(
- lexer: &mut Lexer,
- output: &mut Vec<u8>,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<bool> {
- if lexer.token.is_none() {
- return Ok(false);
- };
-
- let initial_len = output.len();
- match lexer.take()? {
- Token::Integer(integer) => {
- if let Ok(integer) = TryInto::<i32>::try_into(integer) {
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- } else if let Ok(integer) = TryInto::<u32>::try_into(integer) {
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- } else {
- Err(lexer.error(format!(
- "{integer} is not in the valid range [{},{}]",
- i32::min_value(),
- u32::max_value()
- )))?;
- };
- }
- Token::Float(float) => output.extend_from_slice(&lexer.endian.to_bytes(float.0)),
- Token::PcSysmis => {
- output.extend_from_slice(&[0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff])
- }
- Token::I8 => put_integers::<u8, 1>(lexer, "i8", output)?,
- Token::I16 => put_integers::<u16, 2>(lexer, "i16", output)?,
- Token::I64 => put_integers::<i64, 8>(lexer, "i64", output)?,
- Token::String(string) => output.extend_from_slice(string.as_bytes()),
- Token::S(size) => {
- let Some((Token::String(ref string), _)) = lexer.token else {
- Err(lexer.error(format!("string expected after 's{size}'")))?
- };
- let len = string.len();
- if len > size {
- Err(lexer.error(format!(
- "{len}-byte string is longer than pad length {size}"
- )))?
- }
- output.extend_from_slice(string.as_bytes());
- output.extend(repeat(b' ').take(size - len));
- lexer.get()?;
- }
- Token::LParen => {
- while !matches!(lexer.token, Some((Token::RParen, _))) {
- parse_data_item(lexer, output, symbol_table)?;
- }
- lexer.get()?;
- }
- Token::Count => put_counted_items::<u32, 4>(lexer, "COUNT", output, symbol_table)?,
- Token::Count8 => put_counted_items::<u8, 1>(lexer, "COUNT8", output, symbol_table)?,
- Token::Hex => {
- let Some((Token::String(ref string), _)) = lexer.token else {
- Err(lexer.error(String::from("string expected after 'hex'")))?
- };
- let mut string = &string[..];
- loop {
- string = string.trim_start();
- if string.is_empty() {
- break;
- };
-
- let mut i = string.chars();
- let Some(c0) = i.next() else { return Ok(true) };
- let Some(c1) = i.next() else {
- Err(lexer.error(String::from("hex string has odd number of characters")))?
- };
-
- let (Some(digit0), Some(digit1)) = (c0.to_digit(16), c1.to_digit(16)) else {
- Err(lexer.error(String::from("invalid digit in hex string")))?
- };
- let byte = digit0 * 16 + digit1;
- output.push(byte as u8);
-
- string = i.as_str();
- }
- lexer.get()?;
- }
- Token::Label(name) => {
- println!("define {name}");
- let value = output.len() as u32;
- match symbol_table.entry(name.clone()) {
- Entry::Vacant(v) => {
- v.insert(Some(value));
- }
- Entry::Occupied(mut o) => {
- match o.get() {
- Some(v) => {
- if *v != value {
- Err(lexer.error(format!("{name}: can't redefine label for offset {:#x} with offset {:#x}", *v, value)))?
- }
- }
- None => drop(o.insert(Some(value))),
- }
- }
- };
- return Ok(true);
- }
- Token::At(name) => {
- let mut value = *symbol_table.entry(name.clone()).or_insert(None);
- loop {
- let plus = match lexer.token {
- Some((Token::Plus, _)) => true,
- Some((Token::Minus, _)) => false,
- _ => break,
- };
- lexer.get()?;
-
- let operand = match lexer.token {
- Some((Token::At(ref name), _)) => {
- *symbol_table.entry(name.clone()).or_insert(None)
- }
- Some((Token::Integer(integer), _)) => Some(
- integer
- .try_into()
- .map_err(|msg| lexer.error(format!("bad offset literal ({msg})")))?,
- ),
- _ => Err(lexer.error(String::from("expecting @label or integer literal")))?,
- };
- lexer.get()?;
-
- value = match (value, operand) {
- (Some(a), Some(b)) => Some(
- if plus {
- a.checked_add(b)
- } else {
- a.checked_sub(b)
- }
- .ok_or_else(|| {
- lexer.error(String::from("overflow in offset arithmetic"))
- })?,
- ),
- _ => None,
- };
- }
- let value = value.unwrap_or(0);
- output.extend_from_slice(&lexer.endian.to_bytes(value));
- }
- _ => (),
- };
- if let Some((Token::Asterisk, _)) = lexer.token {
- lexer.get()?;
- let Token::Integer(count) = lexer.take()? else {
- Err(lexer.error(String::from("positive integer expected after '*'")))?
- };
- if count < 1 {
- Err(lexer.error(String::from("positive integer expected after '*'")))?
- };
- let final_len = output.len();
- for _ in 1..count {
- output.extend_from_within(initial_len..final_len);
- }
- }
- match lexer.token {
- Some((Token::Semicolon, _)) => {
- lexer.get()?;
- }
- Some((Token::RParen, _)) => (),
- _ => Err(lexer.error(String::from("';' expected")))?,
- }
- Ok(true)
-}
-
-fn put_counted_items<T, const N: usize>(
- lexer: &mut Lexer,
- name: &str,
- output: &mut Vec<u8>,
- symbol_table: &mut HashMap<String, Option<u32>>,
-) -> Result<()>
-where
- T: Zero + TryFrom<usize>,
- Endian: ToBytes<T, N>,
-{
- let old_size = output.len();
- output.extend_from_slice(&lexer.endian.to_bytes(T::zero()));
- let start = output.len();
- if !matches!(lexer.token, Some((Token::LParen, _))) {
- Err(lexer.error(format!("'(' expected after '{name}'")))?
- }
- lexer.get()?;
- while !matches!(lexer.token, Some((Token::RParen, _))) {
- parse_data_item(lexer, output, symbol_table)?;
- }
- lexer.get()?;
- let delta = output.len() - start;
- let Ok(delta): Result<T, _> = delta.try_into() else {
- Err(lexer.error(format!("{delta} bytes is too much for '{name}'")))?
- };
- let dest = &mut output[old_size..old_size + N];
- dest.copy_from_slice(&lexer.endian.to_bytes(delta));
- Ok(())
-}
-
-fn put_integers<T, const N: usize>(
- lexer: &mut Lexer,
- name: &str,
- output: &mut Vec<u8>,
-) -> Result<()>
-where
- T: Bounded + Display + TryFrom<i64> + Copy,
- Endian: ToBytes<T, N>,
-{
- println!("put_integers {:?}", lexer.token);
- let mut n = 0;
- while let Some(integer) = lexer.take_if(|t| match t {
- Token::Integer(integer) => Some(*integer),
- _ => None,
- })? {
- println!("got integer {integer}");
- let Ok(integer) = integer.try_into() else {
- Err(lexer.error(format!(
- "{integer} is not in the valid range [{},{}]",
- T::min_value(),
- T::max_value()
- )))?
- };
- output.extend_from_slice(&lexer.endian.to_bytes(integer));
- n += 1;
- }
- println!("put_integers {:?} {n}", lexer.token);
- if n == 0 {
- Err(lexer.error(format!("integer expected after '{name}'")))?
- }
- Ok(())
-}
-
-#[derive(PartialEq, Eq, Clone, Debug)]
-enum Token {
- Integer(i64),
- Float(OrderedFloat<f64>),
- PcSysmis,
- String(String),
- Semicolon,
- Asterisk,
- LParen,
- RParen,
- I8,
- I16,
- I64,
- S(usize),
- Count,
- Count8,
- Hex,
- Label(String),
- At(String),
- Minus,
- Plus,
-}
-
-struct Lexer<'a> {
- input: &'a str,
- token: Option<(Token, &'a str)>,
- input_file_name: Option<&'a str>,
- line_number: usize,
- endian: Endian,
-}
-
-fn skip_comments(mut s: &str) -> (&str, usize) {
- let mut n_newlines = 0;
- let s = loop {
- s = s.trim_start_matches([' ', '\t', '\r', '<', '>']);
- if let Some(remainder) = s.strip_prefix('#') {
- let Some((_, remainder)) = remainder.split_once('\n') else {
- break "";
- };
- s = remainder;
- n_newlines += 1;
- } else if let Some(remainder) = s.strip_prefix('\n') {
- s = remainder;
- n_newlines += 1;
- } else {
- break s;
- }
- };
- (s, n_newlines)
-}
-
-impl<'a> Lexer<'a> {
- fn new(input: &'a str, input_file_name: Option<&'a str>, endian: Endian) -> Result<Lexer<'a>> {
- let mut lexer = Lexer {
- input,
- token: None,
- input_file_name,
- line_number: 1,
- endian,
- };
- lexer.token = lexer.next()?;
- Ok(lexer)
- }
- fn error(&self, message: String) -> Error {
- let repr = self.token.as_ref().map(|(_, repr)| *repr);
- Error::new(self.input_file_name, Some(self.line_number), repr, message)
- }
- fn take(&mut self) -> Result<Token> {
- let Some(token) = self.token.take() else {
- Err(self.error(String::from("unexpected end of input")))?
- };
- self.token = self.next()?;
- Ok(token.0)
- }
- fn take_if<F, T>(&mut self, condition: F) -> Result<Option<T>>
- where
- F: FnOnce(&Token) -> Option<T>,
- {
- let Some(ref token) = self.token else {
- return Ok(None);
- };
- match condition(&token.0) {
- Some(value) => {
- self.token = self.next()?;
- Ok(Some(value))
- }
- None => Ok(None),
- }
- }
- fn get(&mut self) -> Result<Option<&Token>> {
- if self.token.is_none() {
- Err(self.error(String::from("unexpected end of input")))?
- } else {
- self.token = self.next()?;
- match self.token {
- Some((ref token, _)) => Ok(Some(token)),
- None => Ok(None),
- }
- }
- }
-
- fn next(&mut self) -> Result<Option<(Token, &'a str)>> {
- // Get the first character of the token, skipping past white space and
- // comments.
- let (s, n_newlines) = skip_comments(self.input);
- self.line_number += n_newlines;
- self.input = s;
-
- let start = s;
- let mut iter = s.chars();
- let Some(c) = iter.next() else {
- return Ok(None);
- };
- let (token, rest) = match c {
- c if c.is_ascii_digit() || c == '-' => {
- let len = s
- .find(|c: char| {
- !(c.is_ascii_digit() || c.is_alphabetic() || c == '.' || c == '-')
- })
- .unwrap_or(s.len());
- let (number, rest) = s.split_at(len);
- let token = if number == "-" {
- Token::Minus
- } else if let Some(digits) = number.strip_prefix("0x") {
- Token::Integer(i64::from_str_radix(digits, 16).map_err(|msg| {
- self.error(format!("bad integer literal '{number}' ({msg})"))
- })?)
- } else if !number.contains('.') {
- Token::Integer(number.parse().map_err(|msg| {
- self.error(format!("bad integer literal '{number}' ({msg})"))
- })?)
- } else {
- Token::Float(number.parse().map_err(|msg| {
- self.error(format!("bad float literal '{number}' ({msg})"))
- })?)
- };
- (token, rest)
- }
- '"' => {
- let s = iter.as_str();
- let Some(len) = s.find(['\n', '"']) else {
- Err(self.error(String::from("end-of-file inside string")))?
- };
- let (string, rest) = s.split_at(len);
- let Some(rest) = rest.strip_prefix('"') else {
- Err(self.error(format!("new-line inside string ({string}...{rest})")))?
- };
- (Token::String(string.into()), rest)
- }
- ';' => (Token::Semicolon, iter.as_str()),
- '*' => (Token::Asterisk, iter.as_str()),
- '+' => (Token::Plus, iter.as_str()),
- '(' => (Token::LParen, iter.as_str()),
- ')' => (Token::RParen, iter.as_str()),
- c if c.is_alphabetic() || c == '@' || c == '_' => {
- let len = s
- .find(|c: char| {
- !(c.is_ascii_digit()
- || c.is_alphabetic()
- || c == '@'
- || c == '.'
- || c == '_')
- })
- .unwrap_or(s.len());
- let (s, rest) = s.split_at(len);
- if let Some(rest) = rest.strip_prefix(':') {
- (Token::Label(s.into()), rest)
- } else if let Some(name) = s.strip_prefix('@') {
- (Token::At(name.into()), rest)
- } else if let Some(count) = s.strip_prefix('s') {
- let token =
- Token::S(count.parse().map_err(|msg| {
- self.error(format!("bad counted string '{s}' ({msg})"))
- })?);
- (token, rest)
- } else {
- let token = match s {
- "i8" => Token::I8,
- "i16" => Token::I16,
- "i64" => Token::I64,
- "SYSMIS" => Token::Float(OrderedFloat(-f64::MAX)),
- "PCSYSMIS" => Token::PcSysmis,
- "LOWEST" => Token::Float((-f64::MAX).next_after(0.0).into()),
- "HIGHEST" => Token::Float(f64::MAX.into()),
- "ENDIAN" => Token::Integer(if self.endian == Endian::Big { 1 } else { 2 }),
- "COUNT" => Token::Count,
- "COUNT8" => Token::Count8,
- "hex" => Token::Hex,
- _ => Err(self.error(format!("invalid token '{s}'")))?,
- };
- (token, rest)
- }
- }
- _ => Err(self.error(format!("invalid input byte '{c}'")))?,
- };
- self.input = rest;
- let repr = &start[..start.len() - rest.len()];
- println!("{token:?} {repr}");
- Ok(Some((token, repr)))
- }
-}
-
-#[cfg(test)]
-mod test {
- use crate::endian::Endian;
- use crate::sack::sack;
- use anyhow::Result;
- use hexplay::HexView;
-
- #[test]
- fn basic_sack() -> Result<()> {
- let input = r#"
-"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
-2; # Layout code
-28; # Nominal case size
-0; # Not compressed
-0; # Not weighted
-1; # 1 case.
-100.0; # Bias.
-"01 Jan 11"; "20:53:52";
-"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 "";
-i8 0 *3;
-"#;
- let output = sack(input, None, Endian::Big)?;
- HexView::new(&output).print()?;
- Ok(())
- }
-
- #[test]
- fn pcp_sack() -> Result<()> {
- let input = r#"
-# File header.
-2; 0;
-@MAIN; @MAIN_END - @MAIN;
-@VARS; @VARS_END - @VARS;
-@LABELS; @LABELS_END - @LABELS;
-@DATA; @DATA_END - @DATA;
-(0; 0) * 11;
-i8 0 * 128;
-
-MAIN:
- i16 1; # Fixed.
- s62 "PCSPSS PSPP synthetic test product";
- PCSYSMIS;
- 0; 0; i16 1; # Fixed.
- i16 0;
- i16 15;
- 1;
- i16 0; # Fixed.
- 1;
- s8 "11/28/14";
- s8 "15:11:00";
- s64 "PSPP synthetic test file";
-MAIN_END:
-
-VARS:
- 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS;
- 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS;
- 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS;
-
- # Numeric variable, no label or missing values.
- 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS;
-
- # Numeric variable, variable label.
- 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS;
-
- # Numeric variable with missing value.
- 0; 0; 0; 0x050800; s8 "NUM3"; 1.0;
-
- # Numeric variable, variable label and missing value.
- 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0;
-
- # String variable, no label or missing values.
- 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS;
-
- # String variable, variable label.
- 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS;
-
- # String variable with missing value.
- 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS";
-
- # String variable, variable label and missing value.
- 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR";
-
- # Long string variable
- 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS;
- 0 * 8;
-
- # Long string variable with variable label
- 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS;
- 0 * 8;
-VARS_END:
-
-LABELS:
- 3; i8 0 0 0; LABELS_OFS: i8 0;
- NUM2_LABEL: COUNT8("Numeric variable 2's label");
- NUM4_LABEL: COUNT8("Another numeric variable label");
- STR2_LABEL: COUNT8("STR2's variable label");
- STR4_LABEL: COUNT8("STR4's variable label");
- STR6_LABEL: COUNT8("Another string variable's label");
-LABELS_END:
-
-DATA:
- 0.0; "11/28/14"; 1.0;
- 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r";
- s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM";
-DATA_END:
-"#;
- let output = sack(input, None, Endian::Big)?;
- HexView::new(&output).print()?;
- Ok(())
- }
-}
+++ /dev/null
-use std::sync::OnceLock;
-
-use enum_map::EnumMap;
-
-use crate::{
- endian::Endian,
- format::{Format, Settings as FormatSettings},
- message::Severity,
-};
-
-pub struct Settings {
- pub input_integer_format: Endian,
- pub input_float_format: Endian,
- pub output_integer_format: Endian,
- pub output_float_format: Endian,
-
- /// `MDISPLAY`: how to display matrices in `MATRIX`...`END MATRIX`.
- pub matrix_display: MatrixDisplay,
-
- pub view_length: usize,
- pub view_width: usize,
- pub safer: bool,
- pub include: bool,
- pub route_errors_to_terminal: bool,
- pub route_errors_to_listing: bool,
- pub scompress: bool,
- pub undefined: bool,
- pub blanks: Option<f64>,
- pub max_messages: EnumMap<Severity, usize>,
- pub printback: bool,
- pub macros: MacroSettings,
- pub max_loops: usize,
- pub workspace: usize,
- pub default_format: Format,
- pub testing: bool,
- pub fuzz_bits: usize,
- pub scale_min: usize,
- pub commands: Compatibility,
- pub global: Compatibility,
- pub syntax: Compatibility,
- pub formats: FormatSettings,
- pub small: f64,
-}
-
-impl Default for Settings {
- fn default() -> Self {
- Self {
- input_integer_format: Endian::NATIVE,
- input_float_format: Endian::NATIVE,
- output_integer_format: Endian::NATIVE,
- output_float_format: Endian::NATIVE,
- matrix_display: MatrixDisplay::default(),
- view_length: 24,
- view_width: 79,
- safer: false,
- include: true,
- route_errors_to_terminal: true,
- route_errors_to_listing: true,
- scompress: true,
- undefined: true,
- blanks: None,
- max_messages: EnumMap::from_fn(|_| 100),
- printback: true,
- macros: MacroSettings::default(),
- max_loops: 40,
- workspace: 64 * 1024 * 1024,
- default_format: Format::F8_2,
- testing: false,
- fuzz_bits: 6,
- scale_min: 24,
- commands: Compatibility::Enhanced,
- global: Compatibility::Enhanced,
- syntax: Compatibility::Enhanced,
- formats: FormatSettings::default(),
- small: 0.0001,
- }
- }
-}
-
-impl Settings {
- pub fn global() -> &'static Settings {
- static GLOBAL: OnceLock<Settings> = OnceLock::new();
- &GLOBAL.get_or_init(|| Settings::default())
- }
-}
-
-pub enum Compatibility {
- Compatible,
- Enhanced,
-}
-
-pub struct MacroSettings {
- /// Expand macros?
- pub expand: bool,
-
- /// Print macro expansions?
- pub print_expansions: bool,
-
- /// Maximum iterations of `!FOR`.
- pub max_iterations: usize,
-
- /// Maximum nested macro expansion levels.
- pub max_nest: usize,
-}
-
-impl Default for MacroSettings {
- fn default() -> Self {
- Self {
- expand: true,
- print_expansions: false,
- max_iterations: 1000,
- max_nest: 50,
- }
- }
-}
-
-/// How to display matrices in `MATRIX`...`END MATRIX`.
-#[derive(Default)]
-pub enum MatrixDisplay {
- /// Output matrices as text.
- #[default]
- Text,
-
- /// Output matrices as pivot tables.
- Tables,
-}
-
-pub enum OutputType {
- /// Errors and warnings.
- Error,
-
- /// Notes.
- Notes,
-
- /// Syntax printback.
- Syntax,
-
- /// Everything else.
- Other,
-}
+++ /dev/null
-use std::fs::read_to_string;
-use std::path::PathBuf;
-
-use anyhow::{anyhow, Result};
-use clap::Parser;
-use pspp::endian::Endian;
-use pspp::sack::sack;
-
-/// SAv Construction Kit
-///
-/// The input is a sequence of data items, each followed by a semicolon. Each
-/// data item is converted to the output format and written on stdout. A data
-/// item is one of the following:
-///
-/// - An integer in decimal, in hexadecimal prefixed by `0x`, or in octal
-/// prefixed by `0`. Output as a 32-bit binary integer.
-///
-/// - A floating-point number. Output in 64-bit IEEE 754 format.
-///
-/// - A string enclosed in double quotes. Output literally. There is no
-/// syntax for "escapes". Strings may not contain new-lines.
-///
-/// - A literal of the form `s<number>` followed by a quoted string as above.
-/// Output as the string's contents followed by enough spaces to fill up
-/// `<number>` bytes. For example, `s8 "foo"` is output as `foo` followed
-/// by 5 spaces.
-///
-/// - The literal `i8`, `i16`, or `i64` followed by an integer. Output
-/// as a binary integer with the specified number of bits.
-///
-/// - One of the literals `SYSMIS`, `LOWEST`, or `HIGHEST`. Output as a
-/// 64-bit IEEE 754 float of the appropriate PSPP value.
-///
-/// - `PCSYSMIS`. Output as SPSS/PC+ system-missing value.
-///
-/// - The literal `ENDIAN`. Output as a 32-bit binary integer, either with
-/// value 1 if `--be` is in effect or 2 if `--le` is in effect.
-///
-/// - A pair of parentheses enclosing a sequence of data items, each followed
-/// by a semicolon (the last semicolon is optional). Output as the enclosed
-/// data items in sequence.
-///
-/// - The literal `COUNT` or `COUNT8` followed by a sequence of parenthesized
-/// data items, as above. Output as a 32-bit or 8-bit binary integer whose
-/// value is the number of bytes enclosed within the parentheses, followed
-/// by the enclosed data items themselves.
-///
-/// optionally followed by an asterisk and a positive integer, which specifies a
-/// repeat count for the data item.
-#[derive(Parser, Debug)]
-struct Args {
- /// Big-endian output format (default)
- #[arg(long = "be")]
- be: bool,
-
- /// Little-endian output format
- #[arg(long = "le")]
- le: bool,
-
- /// Input file.
- #[arg(required = true, name = "input")]
- input_file_name: PathBuf,
-
- /// Output file.
- #[arg(required = true, name = "output")]
- output_file_name: PathBuf,
-}
-
-fn main() -> Result<()> {
- let Args {
- be,
- le,
- input_file_name,
- output_file_name,
- } = Args::parse();
- let endian = match (be, le) {
- (false, false) | (true, false) => Endian::Big,
- (false, true) => Endian::Little,
- (true, true) => return Err(anyhow!("can't use both `--be` and `--le`")),
- };
-
- let input_file_str = input_file_name.to_string_lossy();
- let input = read_to_string(&input_file_name)
- .map_err(|err| anyhow!("{input_file_str}: read failed ({err})"))?;
-
- let output = sack(&input, Some(&input_file_str), endian)?;
-
- let output_file_str = output_file_name.to_string_lossy();
- std::fs::write(&output_file_name, output)
- .map_err(|err| anyhow!("{output_file_str}: write failed ({err})"))?;
-
- Ok(())
-}