From 779b018c3ab27ed5cac5625360a0873537b9fc65 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 11 Jul 2024 07:59:00 -0700 Subject: [PATCH] more tests --- rust/Cargo.lock | 7 + rust/Cargo.toml | 3 + rust/fuzz/.gitignore | 4 + rust/fuzz/Cargo.lock | 872 ++++++++++ rust/fuzz/Cargo.toml | 28 + rust/fuzz/fuzz_targets/fuzz_target_1.rs | 7 + rust/fuzz/fuzz_targets/segment.rs | 18 + rust/src/identifier.rs | 2 +- rust/src/lex/segment.rs | 2012 +++++++++++++++++++++-- src/language/lexer/segment.c | 41 + src/language/lexer/segment.h | 1 + src/libpspp/prompt.c | 24 + src/libpspp/prompt.h | 1 + tests/language/lexer/segment-test.c | 126 +- tests/language/lexer/segment.at | 1 + 15 files changed, 2953 insertions(+), 194 deletions(-) create mode 100644 rust/fuzz/.gitignore create mode 100644 rust/fuzz/Cargo.lock create mode 100644 rust/fuzz/Cargo.toml create mode 100644 rust/fuzz/fuzz_targets/fuzz_target_1.rs create mode 100644 rust/fuzz/fuzz_targets/segment.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 4569faca60..c8fc850cf8 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -144,6 +144,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + [[package]] name = "encoding_rs" version = "0.8.32" @@ -502,6 +508,7 @@ dependencies = [ "bitflags 2.5.0", "chrono", "clap", + "diff", "encoding_rs", "finl_unicode", "flate2", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 371ac6dff2..f735638094 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -42,3 +42,6 @@ path = "src/lib.rs" name = "sack" path = "tests/sack.rs" harness = false + +[dev-dependencies] +diff = "0.1.13" diff --git a/rust/fuzz/.gitignore b/rust/fuzz/.gitignore new file mode 100644 index 0000000000..1a45eee776 --- /dev/null +++ b/rust/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git 
a/rust/fuzz/Cargo.lock b/rust/fuzz/Cargo.lock new file mode 100644 index 0000000000..c840c28160 --- /dev/null +++ b/rust/fuzz/Cargo.lock @@ -0,0 +1,872 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" + +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cc" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2" +dependencies = [ + "jobserver", + "libc", + "once_cell", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.38" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets 0.52.6", +] + +[[package]] +name = "clap" +version = "4.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", + "terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" + +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "flate2" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hexplay" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0962bea6731e28b5a443ba4aa00fe3e4fe7555dadf12012435efb738eeac5898" +dependencies = [ + "atty", + "termcolor", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + +[[package]] +name = "jobserver" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.155" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies = [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "ordered-float" +version = "3.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +dependencies = [ + "num-traits", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pspp" +version = "1.0.0" +dependencies = [ + "anyhow", + "bitflags", + "chrono", + "clap", + "encoding_rs", + "finl_unicode", + "flate2", + 
"float_next_after", + "hexplay", + "indexmap", + "lazy_static", + "libc", + "num", + "num-derive", + "num-traits", + "ordered-float", + "thiserror", + "unicase", + "utf8-decode", + "windows-sys 0.48.0", +] + +[[package]] +name = "pspp-fuzz" +version = "0.0.0" +dependencies = [ + "libfuzzer-sys", + "pspp", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "201fcda3845c23e8212cd466bfebf0bd20694490fc0356ae8e428e0824a915a6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "termcolor" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc4587ead41bf016f11af03e55a624c06568b5a19db4e90fde573d805074f83" +dependencies = [ + "wincolor", +] + +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + +[[package]] +name = "thiserror" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" 
+dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "utf8-decode" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca61eb27fa339aa08826a29f03e87b99b4d8f0fc2255306fd266bb1b6a9de498" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = 
"0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "wincolor" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767" +dependencies = [ + "winapi", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version 
= "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" 
+version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/rust/fuzz/Cargo.toml b/rust/fuzz/Cargo.toml new file mode 100644 index 0000000000..8b44789bad --- /dev/null +++ b/rust/fuzz/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "pspp-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.pspp] +path = ".." + +[[bin]] +name = "fuzz_target_1" +path = "fuzz_targets/fuzz_target_1.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "segment" +path = "fuzz_targets/segment.rs" +test = false +doc = false +bench = false diff --git a/rust/fuzz/fuzz_targets/fuzz_target_1.rs b/rust/fuzz/fuzz_targets/fuzz_target_1.rs new file mode 100644 index 0000000000..43a88c14f3 --- /dev/null +++ b/rust/fuzz/fuzz_targets/fuzz_target_1.rs @@ -0,0 +1,7 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + // fuzzed code goes here +}); diff --git a/rust/fuzz/fuzz_targets/segment.rs b/rust/fuzz/fuzz_targets/segment.rs new file mode 100644 index 0000000000..1e5a109449 --- /dev/null +++ b/rust/fuzz/fuzz_targets/segment.rs @@ -0,0 +1,18 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pspp::lex::segment::{Segmenter, Mode, Type}; + +fuzz_target!(|data: &[u8]| { + if let Ok(mut input) = std::str::from_utf8(data) { + let mut segmenter = Segmenter::new(Mode::Auto, false); + loop { + let (rest, type_) = 
segmenter.push(input, true).unwrap(); + match type_ { + Type::End => break, + _ => (), + } + input = rest; + } + } +}); diff --git a/rust/src/identifier.rs b/rust/src/identifier.rs index 8e37e64ecb..3d00520535 100644 --- a/rust/src/identifier.rs +++ b/rust/src/identifier.rs @@ -96,7 +96,7 @@ pub enum Error { pub fn is_reserved_word(s: &str) -> bool { for word in [ - "and", "or", "not", "eq", "ge", "gt", "le", "ne", "all", "by", "to", "with", + "and", "or", "not", "eq", "ge", "gt", "le", "lt", "ne", "all", "by", "to", "with", ] { if s.eq_ignore_ascii_case(word) { return true; diff --git a/rust/src/lex/segment.rs b/rust/src/lex/segment.rs index 94ead036b7..53bc26d5ea 100644 --- a/rust/src/lex/segment.rs +++ b/rust/src/lex/segment.rs @@ -88,7 +88,7 @@ pub enum Type { } bitflags! { - #[derive(Copy, Clone)] + #[derive(Copy, Clone, Debug)] pub struct Substate: u8 { const START_OF_LINE = 1; const START_OF_COMMAND = 2; @@ -400,8 +400,7 @@ fn is_end_of_line(input: &str, eof: bool) -> Result { } fn at_end_of_line(input: &str, eof: bool) -> Result { - let input = skip_spaces_and_comments(input, eof)?; - is_end_of_line(input, eof) + is_end_of_line(skip_spaces_and_comments(input, eof)?, eof) } fn first(s: &str) -> char { @@ -419,17 +418,18 @@ fn get_command_name_candidates(target: &str) -> &[&'static str] { fn detect_command_name(input: &str, eof: bool) -> Result { let command_name = input - .split(|c: char| !(c.is_whitespace() || c.may_continue_id() || c == '-')) + .split(|c: char| { + !((c.is_whitespace() && c != '\n') || (c.may_continue_id() && c != '.') || c == '-') + }) .next() .unwrap(); if !eof && command_name.len() == input.len() { return Err(Incomplete); } - let string = command_name.strip_suffix('.').unwrap_or(command_name); + let command_name = command_name.trim_end_matches(|c: char| c.is_whitespace() || c == '.'); for command in get_command_name_candidates(command_name) { - if let Some(m) = command_match(command, string) { + if let Some(m) = command_match(command, 
command_name) { if m.missing_words <= 0 { - println!("{command}"); return Ok(true); } } @@ -443,11 +443,11 @@ impl Segmenter { input: &'a str, eof: bool, ) -> Result<(&'a str, Type), Incomplete> { - let (c, rest) = take(input, eof)?; - if c == Some('#') { + if let (Some('#'), rest) = take(input, eof)? { if let (Some('!'), rest) = take(rest, eof)? { + let rest = self.parse_full_line(rest, eof)?; self.state = (State::General, Substate::START_OF_COMMAND); - return Ok((self.parse_full_line(rest, eof)?, Type::Shbang)); + return Ok((rest, Type::Shbang)); } } @@ -477,7 +477,7 @@ impl Segmenter { unreachable!() }; match c { - '+' if is_start_of_string(skip_spaces_and_comments(input, eof)?, eof)? => { + '+' if is_start_of_string(skip_spaces_and_comments(rest, eof)?, eof)? => { // This `+` is punctuation that may separate pieces of a string. self.state = (State::General, Substate::empty()); return Ok((rest, Type::Punct)); @@ -486,14 +486,16 @@ impl Segmenter { self.state = (State::General, Substate::START_OF_COMMAND); return Ok((rest, Type::StartCommand)); } - c if c.is_whitespace() => { - if at_end_of_line(rest, eof)? { + _ if c.is_whitespace() => { + if at_end_of_line(input, eof)? { self.state = (State::General, Substate::START_OF_COMMAND); return Ok((input, Type::SeparateCommands)); } } _ => { - if self.at_command_start(input, eof)? { + if self.at_command_start(input, eof)? + && !self.state.1.contains(Substate::START_OF_COMMAND) + { self.state = (State::General, Substate::START_OF_COMMAND); return Ok((input, Type::StartCommand)); } @@ -556,23 +558,21 @@ impl Segmenter { self.state.0 = State::Comment1; self.parse_comment_1(input, eof) } else { - self.parse_digraph(&['*'], input, eof) + self.parse_digraph(&['*'], rest, eof) } } '<' => self.parse_digraph(&['=', '>'], rest, eof), '>' => self.parse_digraph(&['='], rest, eof), '~' => self.parse_digraph(&['='], rest, eof), + '.' if at_end_of_line(rest, eof)? 
=> { + self.state.1 = Substate::START_OF_COMMAND; + Ok((rest, Type::EndCommand)) + } '.' => match take(rest, eof)? { (Some(c), _) if c.is_ascii_digit() => self.parse_number(input, eof), - (Some('\r' | '\n'), _) if is_end_of_line(rest, eof)? => { - self.state.1 = Substate::START_OF_COMMAND; - Ok((rest, Type::EndCommand)) - } _ => Ok((rest, Type::Punct)), }, - '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => { - self.parse_number(input, eof) - } + '0'..='9' => self.parse_number(input, eof), 'u' | 'U' => self.maybe_parse_string(Type::UnicodeString, (input, rest), eof), 'x' | 'X' => self.maybe_parse_string(Type::HexString, (input, rest), eof), '\'' | '"' => self.parse_string(Type::QuotedString, c, rest, eof), @@ -603,21 +603,19 @@ impl Segmenter { mut input: &'a str, eof: bool, ) -> Result<(&'a str, Type), Incomplete> { - loop { - let (Some(c), rest) = take(input, eof)? else { - break; - }; - if c == quote { - if take(rest, eof)?.0 == Some(quote) { - input = rest; - continue; - } else { - return Ok((rest, type_)); + while let (Some(c), rest) = take(input, eof)? { + match c { + _ if c == quote => { + let (c, rest2) = take(rest, eof)?; + if c != Some(quote) { + self.state.1 = Substate::empty(); + return Ok((rest, type_)); + } + input = rest2; } - } else if is_end_of_line(input, eof)? { - break; + '\r' | '\n' if is_end_of_line(input, eof)? 
=> break, + _ => input = rest, } - input = rest; } self.state.1 = Substate::empty(); Ok((input, Type::ExpectedQuote)) @@ -751,6 +749,7 @@ impl Segmenter { eof: bool, ) -> Result<(&'a str, Type), Incomplete> { let (c, rest) = take(input, eof)?; + self.state.1 = Substate::empty(); Ok(( match c { Some(c) if seconds.contains(&c) => rest, @@ -780,6 +779,7 @@ impl Segmenter { } input = rest2; } + self.state.1 = Substate::empty(); Ok((input, Type::Number)) } fn parse_comment_1<'a>( @@ -1076,15 +1076,12 @@ impl Segmenter { State::General, Substate::START_OF_COMMAND | Substate::START_OF_LINE, ); - return self.push(input, eof) + return self.push(input, eof); } } - return Ok((rest, Type::DoRepeatCommand)) + return Ok((rest, Type::DoRepeatCommand)); } - fn parse_do_repeat_4<'a>( - &mut self, - input: &'a str, - ) -> Result<(&'a str, Type), Incomplete> { + fn parse_do_repeat_4<'a>(&mut self, input: &'a str) -> Result<(&'a str, Type), Incomplete> { self.state.0 = State::DoRepeat3; Ok((input, Type::DoRepeatOverflow)) } @@ -1203,19 +1200,19 @@ impl Segmenter { // Line starts with some content followed by `!ENDDEFINE`. Ok((rest, Type::MacroBody)) } - } else if line.is_empty() { - // Entirely blank line. - self.parse_define_6(input, eof) } else { // No `!ENDDEFINE`. We have a full line of macro body. // - // The line might be blank, whether completely empty or just spaces - // and comments. That's OK: we need to report blank lines because - // they can have significance. + // If the first line of the macro body is blank, we just report it + // as spaces, or not at all if there are no spaces, because it's not + // significant. // - // However, if the first line of the macro body is blank, we just - // report it as spaces because it's not significant. + // However, if it's a later line, we need to report it because blank + // lines can have significance. 
let type_ = if self.state.0 == State::Define4 && line.trim_start().is_empty() { + if line.is_empty() { + return self.parse_define_6(input, eof); + } Type::Spaces } else { Type::MacroBody @@ -1278,7 +1275,7 @@ impl Segmenter { _ => return false, } } - endcmd + true } fn parse_begin_data_3<'a>( &mut self, @@ -1321,16 +1318,58 @@ fn strip_prefix_ignore_ascii_case<'a>(line: &'a str, pattern: &str) -> Option<&' #[cfg(test)] mod test { + use crate::prompt::PromptStyle; + use super::{Mode, Segmenter, Type}; - /* - fn check_segmentation(mut input: &str, output: &[(Type, &str)]) { - let mut segmenter = Segmenter::new(Mode::Auto, false); - for (&exp_type, &exp_s) in output { - let (rest, type_) = segmenter.push(input, true).unwrap(); + fn check_segmentation( + mut input: &str, + mode: Mode, + expect_segments: &[(Type, &str)], + expect_prompts: &[PromptStyle], + ) { + let mut segments = Vec::with_capacity(expect_segments.len()); + let mut prompts = Vec::new(); + let mut segmenter = Segmenter::new(mode, false); + loop { + let (rest, type_) = segmenter.push(input, true).unwrap(); + let len = input.len() - rest.len(); + let token = &input[..len]; + segments.push((type_, token)); + match type_ { + Type::End => break, + Type::Newline => prompts.push(segmenter.prompt()), + _ => (), + } + input = rest; + } + + if &segments != expect_segments { + eprintln!("segments differ from expected:"); + let difference = diff::slice(expect_segments, &segments); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + diff::Result::Right(right) => eprintln!("+{right:?}"), + } + } + panic!(); + } + if &prompts != expect_prompts { + eprintln!("prompts differ from expected:"); + let difference = diff::slice(expect_prompts, &prompts); + for result in difference { + match result { + diff::Result::Left(left) => eprintln!("-{left:?}"), + diff::Result::Both(left, _right) => eprintln!(" {left:?}"), + 
diff::Result::Right(right) => eprintln!("+{right:?}"), + } } - }*/ + panic!(); + } + } fn print_segmentation(mut input: &str) { let mut segmenter = Segmenter::new(Mode::Auto, false); @@ -1351,7 +1390,7 @@ mod test { #[test] fn test_identifiers() { - print_segmentation( + check_segmentation( r#"a ab abc abcd !abcd A AB ABC ABCD !ABCD aB aBC aBcD !aBcD @@ -1364,12 +1403,115 @@ f@#_.#6 GhIjK .x 1y _z "#, + Mode::Auto, + &[ + (Type::Identifier, "a"), + (Type::Spaces, " "), + (Type::Identifier, "ab"), + (Type::Spaces, " "), + (Type::Identifier, "abc"), + (Type::Spaces, " "), + (Type::Identifier, "abcd"), + (Type::Spaces, " "), + (Type::MacroId, "!abcd"), + (Type::Newline, "\n"), + (Type::Identifier, "A"), + (Type::Spaces, " "), + (Type::Identifier, "AB"), + (Type::Spaces, " "), + (Type::Identifier, "ABC"), + (Type::Spaces, " "), + (Type::Identifier, "ABCD"), + (Type::Spaces, " "), + (Type::MacroId, "!ABCD"), + (Type::Newline, "\n"), + (Type::Identifier, "aB"), + (Type::Spaces, " "), + (Type::Identifier, "aBC"), + (Type::Spaces, " "), + (Type::Identifier, "aBcD"), + (Type::Spaces, " "), + (Type::MacroId, "!aBcD"), + (Type::Newline, "\n"), + (Type::Identifier, "$x"), + (Type::Spaces, " "), + (Type::Identifier, "$y"), + (Type::Spaces, " "), + (Type::Identifier, "$z"), + (Type::Spaces, " "), + (Type::MacroId, "!$z"), + (Type::Newline, "\n"), + (Type::Identifier, "grève"), + (Type::Spaces, "\u{00a0}"), + (Type::Identifier, "Ângstrom"), + (Type::Spaces, "\u{00a0}"), + (Type::Identifier, "poté"), + (Type::Newline, "\n"), + (Type::Identifier, "#a"), + (Type::Spaces, " "), + (Type::Identifier, "#b"), + (Type::Spaces, " "), + (Type::Identifier, "#c"), + (Type::Spaces, " "), + (Type::Identifier, "##"), + (Type::Spaces, " "), + (Type::Identifier, "#d"), + (Type::Spaces, " "), + (Type::MacroId, "!#d"), + (Type::Newline, "\n"), + (Type::Identifier, "@efg"), + (Type::Spaces, " "), + (Type::Identifier, "@"), + (Type::Spaces, " "), + (Type::Identifier, "@@."), + (Type::Spaces, " "), + 
(Type::Identifier, "@#@"), + (Type::Spaces, " "), + (Type::MacroId, "!@"), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "##"), + (Type::Spaces, " "), + (Type::Identifier, "#"), + (Type::Spaces, " "), + (Type::Identifier, "#12345"), + (Type::Spaces, " "), + (Type::Identifier, "#.#"), + (Type::Newline, "\n"), + (Type::Identifier, "f@#_.#6"), + (Type::Newline, "\n"), + (Type::Identifier, "GhIjK"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::Identifier, "y"), + (Type::Spaces, " "), + (Type::Punct, "_"), + (Type::Identifier, "z"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + ], ); } #[test] fn test_identifiers_ending_in_dot() { - print_segmentation( + check_segmentation( r#"abcd. abcd. ABCD. ABCD. aBcD. aBcD. @@ -1388,33 +1530,311 @@ wxyz./* unterminated end of line comment WXYZ. 
/* unterminated end of line comment WxYz./* unterminated end of line comment "#, + Mode::Auto, + &[ + (Type::Identifier, "abcd."), + (Type::Spaces, " "), + (Type::Identifier, "abcd"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "ABCD."), + (Type::Spaces, " "), + (Type::Identifier, "ABCD"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "aBcD."), + (Type::Spaces, " "), + (Type::Identifier, "aBcD"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "$y."), + (Type::Spaces, " "), + (Type::Identifier, "$z."), + (Type::Spaces, " "), + (Type::Identifier, "あいうえお"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#c."), + (Type::Spaces, " "), + (Type::Identifier, "#d."), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "@@."), + (Type::Spaces, " "), + (Type::Identifier, "@@..."), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#.#"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#abcd"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "LMNOP"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "QRSTUV"), + (Type::EndCommand, "."), + (Type::Comment, "/* end of line comment */"), + (Type::Newline, "\n"), + (Type::Identifier, "qrstuv"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* end of line comment */"), + (Type::Newline, "\n"), + (Type::Identifier, "QrStUv"), + (Type::EndCommand, "."), + (Type::Comment, "/* end of line comment */"), + (Type::Spaces, " "), + (Type::Newline, "\n"), + (Type::Identifier, "wxyz"), + (Type::EndCommand, "."), + (Type::Comment, "/* unterminated end of line comment"), + (Type::Newline, "\n"), + (Type::Identifier, "WXYZ"), 
+ (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* unterminated end of line comment"), + (Type::Newline, "\n"), + (Type::Identifier, "WxYz"), + (Type::EndCommand, "."), + (Type::Comment, "/* unterminated end of line comment "), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], ); } #[test] fn test_reserved_words() { - print_segmentation( + check_segmentation( r#"and or not eq ge gt le lt ne all by to with AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH andx orx notx eqx gex gtx lex ltx nex allx byx tox withx and. with. "#, + Mode::Auto, + &[ + (Type::ReservedWord, "and"), + (Type::Spaces, " "), + (Type::ReservedWord, "or"), + (Type::Spaces, " "), + (Type::ReservedWord, "not"), + (Type::Spaces, " "), + (Type::ReservedWord, "eq"), + (Type::Spaces, " "), + (Type::ReservedWord, "ge"), + (Type::Spaces, " "), + (Type::ReservedWord, "gt"), + (Type::Spaces, " "), + (Type::ReservedWord, "le"), + (Type::Spaces, " "), + (Type::ReservedWord, "lt"), + (Type::Spaces, " "), + (Type::ReservedWord, "ne"), + (Type::Spaces, " "), + (Type::ReservedWord, "all"), + (Type::Spaces, " "), + (Type::ReservedWord, "by"), + (Type::Spaces, " "), + (Type::ReservedWord, "to"), + (Type::Spaces, " "), + (Type::ReservedWord, "with"), + (Type::Newline, "\n"), + (Type::ReservedWord, "AND"), + (Type::Spaces, " "), + (Type::ReservedWord, "OR"), + (Type::Spaces, " "), + (Type::ReservedWord, "NOT"), + (Type::Spaces, " "), + (Type::ReservedWord, "EQ"), + (Type::Spaces, " "), + (Type::ReservedWord, "GE"), + (Type::Spaces, " "), + (Type::ReservedWord, "GT"), + (Type::Spaces, " "), + (Type::ReservedWord, "LE"), + 
(Type::Spaces, " "), + (Type::ReservedWord, "LT"), + (Type::Spaces, " "), + (Type::ReservedWord, "NE"), + (Type::Spaces, " "), + (Type::ReservedWord, "ALL"), + (Type::Spaces, " "), + (Type::ReservedWord, "BY"), + (Type::Spaces, " "), + (Type::ReservedWord, "TO"), + (Type::Spaces, " "), + (Type::ReservedWord, "WITH"), + (Type::Newline, "\n"), + (Type::Identifier, "andx"), + (Type::Spaces, " "), + (Type::Identifier, "orx"), + (Type::Spaces, " "), + (Type::Identifier, "notx"), + (Type::Spaces, " "), + (Type::Identifier, "eqx"), + (Type::Spaces, " "), + (Type::Identifier, "gex"), + (Type::Spaces, " "), + (Type::Identifier, "gtx"), + (Type::Spaces, " "), + (Type::Identifier, "lex"), + (Type::Spaces, " "), + (Type::Identifier, "ltx"), + (Type::Spaces, " "), + (Type::Identifier, "nex"), + (Type::Spaces, " "), + (Type::Identifier, "allx"), + (Type::Spaces, " "), + (Type::Identifier, "byx"), + (Type::Spaces, " "), + (Type::Identifier, "tox"), + (Type::Spaces, " "), + (Type::Identifier, "withx"), + (Type::Newline, "\n"), + (Type::Identifier, "and."), + (Type::Spaces, " "), + (Type::ReservedWord, "with"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], ); } #[test] fn test_punctuation() { - print_segmentation( - r#"~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] ** -~&|=>=><=<~=<>(),-+*/[[]]**!* + check_segmentation( + r#"~ & | = >= > <= < ~= <> ( ) , - + * / [ ] ** +~&|=>=><=<~=<>(),-+*/[]**!* % : ; ? 
_ ` { } ~ !* "#, + Mode::Auto, + &[ + (Type::Punct, "~"), + (Type::Spaces, " "), + (Type::Punct, "&"), + (Type::Spaces, " "), + (Type::Punct, "|"), + (Type::Spaces, " "), + (Type::Punct, "="), + (Type::Spaces, " "), + (Type::Punct, ">="), + (Type::Spaces, " "), + (Type::Punct, ">"), + (Type::Spaces, " "), + (Type::Punct, "<="), + (Type::Spaces, " "), + (Type::Punct, "<"), + (Type::Spaces, " "), + (Type::Punct, "~="), + (Type::Spaces, " "), + (Type::Punct, "<>"), + (Type::Spaces, " "), + (Type::Punct, "("), + (Type::Spaces, " "), + (Type::Punct, ")"), + (Type::Spaces, " "), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Spaces, " "), + (Type::Punct, "+"), + (Type::Spaces, " "), + (Type::Punct, "*"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Spaces, " "), + (Type::Punct, "["), + (Type::Spaces, " "), + (Type::Punct, "]"), + (Type::Spaces, " "), + (Type::Punct, "**"), + (Type::Newline, "\n"), + (Type::Punct, "~"), + (Type::Punct, "&"), + (Type::Punct, "|"), + (Type::Punct, "="), + (Type::Punct, ">="), + (Type::Punct, ">"), + (Type::Punct, "<="), + (Type::Punct, "<"), + (Type::Punct, "~="), + (Type::Punct, "<>"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Punct, "-"), + (Type::Punct, "+"), + (Type::Punct, "*"), + (Type::Punct, "/"), + (Type::Punct, "["), + (Type::Punct, "]"), + (Type::Punct, "**"), + (Type::MacroId, "!*"), + (Type::Newline, "\n"), + (Type::Punct, "%"), + (Type::Spaces, " "), + (Type::Punct, ":"), + (Type::Spaces, " "), + (Type::Punct, ";"), + (Type::Spaces, " "), + (Type::Punct, "?"), + (Type::Spaces, " "), + (Type::Punct, "_"), + (Type::Spaces, " "), + (Type::Punct, "`"), + (Type::Spaces, " "), + (Type::Punct, "{"), + (Type::Spaces, " "), + (Type::Punct, "}"), + (Type::Spaces, " "), + (Type::Punct, "~"), + (Type::Spaces, " "), + (Type::MacroId, "!*"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Later, PromptStyle::Later, PromptStyle::Later], ); } #[test] fn 
test_positive_numbers() { - print_segmentation( + check_segmentation( r#"0 1 01 001. 1. 123. /* comment 1 */ /* comment 2 */ .1 0.1 00.1 00.10 @@ -1423,12 +1843,96 @@ and. with. 1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03 . 1e e1 1e+ 1e- 1. "#, + Mode::Auto, + &[ + (Type::Number, "0"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::Spaces, " "), + (Type::Number, "01"), + (Type::Spaces, " "), + (Type::Number, "001."), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Number, "123"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* comment 1 */"), + (Type::Spaces, " "), + (Type::Comment, "/* comment 2 */"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Number, "1"), + (Type::Spaces, " "), + (Type::Number, "0.1"), + (Type::Spaces, " "), + (Type::Number, "00.1"), + (Type::Spaces, " "), + (Type::Number, "00.10"), + (Type::Newline, "\n"), + (Type::Number, "5e1"), + (Type::Spaces, " "), + (Type::Number, "6E-1"), + (Type::Spaces, " "), + (Type::Number, "7e+1"), + (Type::Spaces, " "), + (Type::Number, "6E+01"), + (Type::Spaces, " "), + (Type::Number, "6e-03"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Number, "3E1"), + (Type::Spaces, " "), + (Type::Number, ".4e-1"), + (Type::Spaces, " "), + (Type::Number, ".5E+1"), + (Type::Spaces, " "), + (Type::Number, ".6e+01"), + (Type::Spaces, " "), + (Type::Number, ".7E-03"), + (Type::Newline, "\n"), + (Type::Number, "1.23e1"), + (Type::Spaces, " "), + (Type::Number, "45.6E-1"), + (Type::Spaces, " "), + (Type::Number, "78.9e+1"), + (Type::Spaces, " "), + (Type::Number, "99.9E+01"), + (Type::Spaces, " "), + (Type::Number, "11.2e-03"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Spaces, " "), + (Type::ExpectedExponent, "1e"), + (Type::Spaces, " "), + (Type::Identifier, "e1"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "1e+"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "1e-"), + 
(Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], ); } #[test] fn test_negative_numbers() { - print_segmentation( + check_segmentation( r#" -0 -1 -01 -001. -1. -123. /* comment 1 */ /* comment 2 */ -.1 -0.1 -00.1 -00.10 @@ -1438,12 +1942,109 @@ and. with. -/**/1 -. -1e -e1 -1e+ -1e- -1. "#, + Mode::Auto, + &[ + (Type::Spaces, " "), + (Type::Number, "-0"), + (Type::Spaces, " "), + (Type::Number, "-1"), + (Type::Spaces, " "), + (Type::Number, "-01"), + (Type::Spaces, " "), + (Type::Number, "-001."), + (Type::Spaces, " "), + (Type::Number, "-1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-123"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/* comment 1 */"), + (Type::Spaces, " "), + (Type::Comment, "/* comment 2 */"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-.1"), + (Type::Spaces, " "), + (Type::Number, "-0.1"), + (Type::Spaces, " "), + (Type::Number, "-00.1"), + (Type::Spaces, " "), + (Type::Number, "-00.10"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-5e1"), + (Type::Spaces, " "), + (Type::Number, "-6E-1"), + (Type::Spaces, " "), + (Type::Number, "-7e+1"), + (Type::Spaces, " "), + (Type::Number, "-6E+01"), + (Type::Spaces, " "), + (Type::Number, "-6e-03"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-.3E1"), + (Type::Spaces, " "), + (Type::Number, "-.4e-1"), + (Type::Spaces, " "), + (Type::Number, "-.5E+1"), + (Type::Spaces, " "), + (Type::Number, "-.6e+01"), + (Type::Spaces, " "), + (Type::Number, "-.7E-03"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Number, "-1.23e1"), + (Type::Spaces, " "), + (Type::Number, "-45.6E-1"), + (Type::Spaces, " "), + (Type::Number, "-78.9e+1"), + 
(Type::Spaces, " "), + (Type::Number, "-99.9E+01"), + (Type::Spaces, " "), + (Type::Number, "-11.2e-03"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Comment, "/**/"), + (Type::Number, "1"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Punct, "."), + (Type::Spaces, " "), + (Type::ExpectedExponent, "-1e"), + (Type::Spaces, " "), + (Type::Punct, "-"), + (Type::Identifier, "e1"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "-1e+"), + (Type::Spaces, " "), + (Type::ExpectedExponent, "-1e-"), + (Type::Spaces, " "), + (Type::Number, "-1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], ); } #[test] fn test_strings() { - print_segmentation( + check_segmentation( r#"'x' "y" 'abc' 'Don''t' "Can't" 'Won''t' """quoted""" '"quoted"' @@ -1457,22 +2058,116 @@ u'fffd' U"041" + /* also a punctuator on blank line - 'new command' "#, + Mode::Auto, + &[ + (Type::QuotedString, "'x'"), + (Type::Spaces, " "), + (Type::QuotedString, "\"y\""), + (Type::Spaces, " "), + (Type::QuotedString, "'abc'"), + (Type::Newline, "\n"), + (Type::QuotedString, "'Don''t'"), + (Type::Spaces, " "), + (Type::QuotedString, "\"Can't\""), + (Type::Spaces, " "), + (Type::QuotedString, "'Won''t'"), + (Type::Newline, "\n"), + (Type::QuotedString, "\"\"\"quoted\"\"\""), + (Type::Spaces, " "), + (Type::QuotedString, "'\"quoted\"'"), + (Type::Newline, "\n"), + (Type::QuotedString, "''"), + (Type::Spaces, " "), + (Type::QuotedString, "\"\""), + (Type::Newline, "\n"), + (Type::ExpectedQuote, "'missing end quote"), + (Type::Newline, "\n"), + (Type::ExpectedQuote, "\"missing double quote"), + (Type::Newline, "\n"), + (Type::HexString, "x\"4142\""), + (Type::Spaces, " "), + (Type::HexString, "X'5152'"), + (Type::Newline, "\n"), + 
(Type::UnicodeString, "u'fffd'"), + (Type::Spaces, " "), + (Type::UnicodeString, "U\"041\""), + (Type::Newline, "\n"), + (Type::StartCommand, "+"), + (Type::Spaces, " "), + (Type::Identifier, "new"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Punct, "+"), + (Type::Spaces, " "), + (Type::Comment, "/* comment */"), + (Type::Spaces, " "), + (Type::QuotedString, "'string continuation'"), + (Type::Newline, "\n"), + (Type::Punct, "+"), + (Type::Spaces, " "), + (Type::Comment, "/* also a punctuator on blank line"), + (Type::Newline, "\n"), + (Type::StartCommand, "-"), + (Type::Spaces, " "), + (Type::QuotedString, "'new command'"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + ], ); } #[test] fn test_shbang() { - print_segmentation( + check_segmentation( r#"#! /usr/bin/pspp title my title. #! /usr/bin/pspp "#, + Mode::Interactive, + &[ + (Type::Shbang, "#! /usr/bin/pspp"), + (Type::Newline, "\n"), + (Type::Identifier, "title"), + (Type::Spaces, " "), + (Type::Identifier, "my"), + (Type::Spaces, " "), + (Type::Identifier, "title"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "#"), + (Type::MacroId, "!"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "usr"), + (Type::Punct, "/"), + (Type::Identifier, "bin"), + (Type::Punct, "/"), + (Type::Identifier, "pspp"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First, PromptStyle::Later], ); } #[test] fn test_comment_command() { - print_segmentation( + check_segmentation( r#"* Comment commands "don't have to contain valid tokens. @@ -1490,12 +2185,88 @@ com is ambiguous with COMPUTE. next command. 
"#, + Mode::Interactive, + &[ + (Type::CommentCommand, "* Comment commands \"don't"), + (Type::Newline, "\n"), + (Type::CommentCommand, "have to contain valid tokens"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::CommentCommand, "** Check ambiguity with ** token"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::CommentCommand, "****************"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::CommentCommand, "comment keyword works too"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::CommentCommand, "COMM also"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "com"), + (Type::Spaces, " "), + (Type::Identifier, "is"), + (Type::Spaces, " "), + (Type::Identifier, "ambiguous"), + (Type::Spaces, " "), + (Type::ReservedWord, "with"), + (Type::Spaces, " "), + (Type::Identifier, "COMPUTE"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Spaces, " "), + ( + Type::CommentCommand, + "* Comment need not start at left margin", + ), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::CommentCommand, "* Comment ends with blank line"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "next"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Comment, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Comment, + PromptStyle::First, + 
PromptStyle::First, + PromptStyle::First, + ], ); } #[test] fn test_document_command() { - print_segmentation( + check_segmentation( r#"DOCUMENT one line. DOC more than @@ -1506,14 +2277,58 @@ first.paragraph isn't parsed as tokens second paragraph. - "#, + Mode::Interactive, + &[ + (Type::StartDocument, ""), + (Type::Document, "DOCUMENT one line."), + (Type::EndCommand, ""), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::StartDocument, ""), + (Type::Document, "DOC more"), + (Type::Newline, "\n"), + (Type::Document, " than"), + (Type::Newline, "\n"), + (Type::Document, " one"), + (Type::Newline, "\n"), + (Type::Document, " line."), + (Type::EndCommand, ""), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::StartDocument, ""), + (Type::Document, "docu"), + (Type::Newline, "\n"), + (Type::Document, "first.paragraph"), + (Type::Newline, "\n"), + (Type::Document, "isn't parsed as tokens"), + (Type::Newline, "\n"), + (Type::Document, ""), + (Type::Newline, "\n"), + (Type::Document, "second paragraph."), + (Type::EndCommand, ""), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::First, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::Document, + PromptStyle::First, + ], ); } #[test] fn test_file_label_command() { - print_segmentation( + check_segmentation( r#"FIL label isn't quoted. FILE lab 'is quoted'. 
@@ -1521,12 +2336,51 @@ FILE /* /**/ lab not quoted here either "#, + Mode::Interactive, + &[ + (Type::Identifier, "FIL"), + (Type::Spaces, " "), + (Type::Identifier, "label"), + (Type::Spaces, " "), + (Type::UnquotedString, "isn't quoted"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "FILE"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "lab"), + (Type::Spaces, " "), + (Type::QuotedString, "'is quoted'"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "FILE"), + (Type::Spaces, " "), + (Type::Comment, "/*"), + (Type::Newline, "\n"), + (Type::Comment, "/**/"), + (Type::Spaces, " "), + (Type::Identifier, "lab"), + (Type::Spaces, " "), + (Type::UnquotedString, "not quoted here either"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::First, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + ], ); } #[test] fn test_begin_data() { - print_segmentation( + check_segmentation( r#"begin data. end data. @@ -1550,14 +2404,127 @@ end data. begin data "xxx". begin data 123. 
not data - "#, + Mode::Interactive, + &[ + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, "/*"), + (Type::Newline, "\n"), + (Type::InlineData, "123"), + (Type::Newline, "\n"), + (Type::InlineData, "xxx"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "BEG"), + (Type::Spaces, " "), + (Type::Comment, "/**/"), + (Type::Spaces, " "), + (Type::Identifier, "DAT"), + (Type::Spaces, " "), + (Type::Comment, "/*"), + (Type::Newline, "\n"), + (Type::InlineData, "5 6 7 /* x"), + (Type::Newline, "\n"), + (Type::InlineData, ""), + (Type::Newline, "\n"), + (Type::InlineData, "end data"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Newline, "\n"), + (Type::StartCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::InlineData, "data"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::QuotedString, "\"xxx\""), + 
(Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "begin"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Number, "123"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::ReservedWord, "not"), + (Type::Spaces, " "), + (Type::Identifier, "data"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::Data, + PromptStyle::Data, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + ], ); } #[test] fn test_do_repeat() { - print_segmentation( + check_segmentation( r#"do repeat x=a b c y=d e f. do repeat a=1 thru 5. @@ -1572,168 +2539,883 @@ do inner command. end repeat. 
"#, + Mode::Interactive, + &[ + (Type::Identifier, "do"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "x"), + (Type::Punct, "="), + (Type::Identifier, "a"), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "y"), + (Type::Punct, "="), + (Type::Identifier, "d"), + (Type::Spaces, " "), + (Type::Identifier, "e"), + (Type::Spaces, " "), + (Type::Identifier, "f"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, " do repeat a=1 thru 5."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "another command."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "second command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "+ third command."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print."), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "do"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "#a"), + (Type::Punct, "="), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, " inner command."), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::First, + ], ); } #[test] fn test_do_repeat_overflow() { - let mut s = 
String::new(); const N: usize = 257; - for i in 0..N { - s.push_str(&format!("do repeat v{i}={i} thru {}\n", i + 5)); + let do_repeat: Vec = (0..N) + .map(|i| format!("do repeat v{i}={i} thru {}.\n", i + 5)) + .collect(); + let end_repeat: Vec = (0..N) + .rev() + .map(|i| format!("end repeat. /* {i}\n")) + .collect(); + + let s: String = do_repeat + .iter() + .chain(end_repeat.iter()) + .map(|s| s.as_str()) + .collect(); + let mut expect_output = vec![ + (Type::Identifier, "do"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "v0"), + (Type::Punct, "="), + (Type::Number, "0"), + (Type::Spaces, " "), + (Type::Identifier, "thru"), + (Type::Spaces, " "), + (Type::Number, "5"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + ]; + for i in 1..N { + expect_output.push((Type::DoRepeatCommand, &do_repeat[i].trim_end())); + if i >= 255 { + expect_output.push((Type::DoRepeatOverflow, "")); + } + expect_output.push((Type::Newline, "\n")); } - for i in (0..N).rev() { - s.push_str(&format!("end repeat. /* {i}\n")); + for i in 0..254 { + expect_output.push((Type::DoRepeatCommand, &end_repeat[i].trim_end())); + expect_output.push((Type::Newline, "\n")); } - print_segmentation(&s); + let comments: Vec = (0..(N - 254)).rev().map(|i| format!("/* {i}")).collect(); + for comment in &comments { + expect_output.extend([ + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::EndCommand, "."), + (Type::Spaces, " "), + (Type::Comment, comment), + (Type::Newline, "\n"), + ]); + } + expect_output.push((Type::End, "")); + + let expect_prompts: Vec<_> = (0..N * 2 - 3) + .map(|_| PromptStyle::DoRepeat) + .chain([PromptStyle::First, PromptStyle::First, PromptStyle::First]) + .collect(); + check_segmentation(&s, Mode::Interactive, &expect_output, &expect_prompts); } #[test] - fn test_define_simple() { - print_segmentation( - r#"define !macro1() -var1 var2 var3 "!enddefine" -!enddefine. 
+ fn test_do_repeat_batch() { + check_segmentation( + r#"do repeat x=a b c + y=d e f +do repeat a=1 thru 5 +another command +second command ++ third command +end /* x */ /* y */ repeat print +end + repeat +do + repeat #a=1 + + inner command +end repeat "#, + Mode::Batch, + &[ + (Type::Identifier, "do"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "x"), + (Type::Punct, "="), + (Type::Identifier, "a"), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "y"), + (Type::Punct, "="), + (Type::Identifier, "d"), + (Type::Spaces, " "), + (Type::Identifier, "e"), + (Type::Spaces, " "), + (Type::Identifier, "f"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::DoRepeatCommand, "do repeat a=1 thru 5"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "another command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "second command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "+ third command"), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, "end /* x */ /* y */ repeat print"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "do"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Spaces, " "), + (Type::Identifier, "#a"), + (Type::Punct, "="), + (Type::Number, "1"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::DoRepeatCommand, " inner command"), + (Type::Newline, "\n"), + (Type::Identifier, "end"), + (Type::Spaces, " "), + (Type::Identifier, "repeat"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + 
PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::DoRepeat, + PromptStyle::DoRepeat, + PromptStyle::Later, + ], ); } - #[test] - fn test_define_no_newline_after_parentheses() { - print_segmentation( - r#"define !macro1() var1 var2 var3 /* !enddefine + mod define { + use crate::{ + lex::segment::{Mode, Type}, + prompt::PromptStyle, + }; + + use super::check_segmentation; + + #[test] + fn test_simple() { + check_segmentation( + r#"define !macro1() +var1 var2 var3 "!enddefine" !enddefine. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "var1 var2 var3 \"!enddefine\""), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::Define, PromptStyle::First], + ); + } - #[test] - fn test_define_no_newline_before_enddefine() { - print_segmentation( - r#"define !macro1() + #[test] + fn test_no_newline_after_parentheses() { + check_segmentation( + r#"define !macro1() var1 var2 var3 /* !enddefine +!enddefine. +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::MacroBody, " var1 var2 var3 /* !enddefine"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } + + #[test] + fn test_no_newline_before_enddefine() { + check_segmentation( + r#"define !macro1() var1 var2 var3!enddefine. 
"#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "var1 var2 var3"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } - #[test] - fn test_define_all_on_one_line() { - print_segmentation( - r#"define !macro1()var1 var2 var3!enddefine. + #[test] + fn test_all_on_one_line() { + check_segmentation( + r#"define !macro1()var1 var2 var3!enddefine. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::MacroBody, "var1 var2 var3"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First], + ); + } - #[test] - fn test_define_empty() { - print_segmentation( - r#"define !macro1() + #[test] + fn test_empty() { + check_segmentation( + r#"define !macro1() !enddefine. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } - #[test] - fn test_define_blank_lines() { - print_segmentation( - r#"define !macro1() + #[test] + fn test_blank_lines() { + check_segmentation( + r#"define !macro1() !enddefine. 
"#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, ""), + (Type::Newline, "\n"), + (Type::MacroBody, ""), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } - #[test] - fn test_define_arguments() { - print_segmentation( - r#"define !macro1(a(), b(), c()) + #[test] + fn test_arguments() { + check_segmentation( + r#"define !macro1(a(), b(), c()) !enddefine. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Identifier, "a"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define, PromptStyle::First], + ); + } - #[test] - fn test_define_multiline_arguments() { - print_segmentation( - r#"define !macro1( + #[test] + fn test_multiline_arguments() { + check_segmentation( + r#"define !macro1( a(), b( ), c() ) !enddefine. 
"#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "a"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Spaces, " "), + (Type::Identifier, "b"), + (Type::Punct, "("), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Punct, ")"), + (Type::Punct, ","), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "c"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } - #[test] - fn test_define_arguments_start_on_second_line() { - print_segmentation( - r#"define !macro1 + #[test] + fn test_arguments_start_on_second_line() { + check_segmentation( + r#"define !macro1 (x,y,z ) content 1 content 2 !enddefine. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Newline, "\n"), + (Type::Punct, "("), + (Type::Identifier, "x"), + (Type::Punct, ","), + (Type::Identifier, "y"), + (Type::Punct, ","), + (Type::Identifier, "z"), + (Type::Newline, "\n"), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "content 1"), + (Type::Newline, "\n"), + (Type::MacroBody, "content 2"), + (Type::Newline, "\n"), + (Type::MacroId, "!enddefine"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::First, + ], + ); + } - #[test] - fn test_early_end_of_command_1() { - print_segmentation( - r#"define !macro1. 
+ #[test] + fn test_early_end_of_command_1() { + check_segmentation( + r#"define !macro1. data list /x 1. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First], + ); + } - #[test] - fn test_early_end_of_command_2() { - print_segmentation( - r#"define !macro1 + #[test] + fn test_early_end_of_command_2() { + check_segmentation( + r#"define !macro1 x. data list /x 1. "#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Newline, "\n"), + (Type::Identifier, "x"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Later, PromptStyle::First, PromptStyle::First], + ); + } - #[test] - fn test_early_end_of_command_3() { - print_segmentation( - r#"define !macro1(. + #[test] + fn test_early_end_of_command_3() { + check_segmentation( + r#"define !macro1(. x. data list /x 1. 
"#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "x"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First, PromptStyle::First], + ); + } - #[test] - fn test_early_end_of_command_4() { - // Notice the command terminator at the end of the `DEFINE` command, - // which should not be there and ends it early. - print_segmentation( - r#"define !macro1. + #[test] + fn test_early_end_of_command_4() { + // Notice the command terminator at the end of the `DEFINE` command, + // which should not be there and ends it early. + check_segmentation( + r#"define !macro1. data list /x 1. 
"#, - ); - } + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::First, PromptStyle::First], + ); + } - #[test] - fn test_define_missing_enddefine() { - print_segmentation( - r#"define !macro1() + #[test] + fn test_missing_enddefine() { + check_segmentation( + r#"define !macro1() content line 1 content line 2 "#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::MacroBody, "content line 1"), + (Type::Newline, "\n"), + (Type::MacroBody, "content line 2"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Define, + PromptStyle::Define, + PromptStyle::Define, + ], + ); + } + + #[test] + fn test_missing_enddefine_2() { + check_segmentation( + r#"define !macro1() +"#, + Mode::Interactive, + &[ + (Type::Identifier, "define"), + (Type::Spaces, " "), + (Type::MacroName, "!macro1"), + (Type::Punct, "("), + (Type::Punct, ")"), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[PromptStyle::Define], + ); + } + } + + #[test] + fn test_batch_mode() { + check_segmentation( + r#"first command + another line of first command ++ second command +third command + +fourth command. + fifth command. 
+"#, + Mode::Batch, + &[ + (Type::Identifier, "first"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "line"), + (Type::Spaces, " "), + (Type::Identifier, "of"), + (Type::Spaces, " "), + (Type::Identifier, "first"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, "+"), + (Type::Spaces, " "), + (Type::Identifier, "second"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "third"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "fourth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "fifth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], ); } #[test] - fn test_define_missing_enddefine_2() { - print_segmentation( - r#"define !macro1() + fn test_auto_mode() { + check_segmentation( + r#"command + another line of command +2sls ++ another command +another line of second command +data list /x 1 +aggregate. +print eject. +twostep cluster + + +fourth command. + fifth command. 
"#, + Mode::Auto, + &[ + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "line"), + (Type::Spaces, " "), + (Type::Identifier, "of"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Number, "2"), + (Type::Identifier, "sls"), + (Type::Newline, "\n"), + (Type::StartCommand, "+"), + (Type::Spaces, " "), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::Identifier, "another"), + (Type::Spaces, " "), + (Type::Identifier, "line"), + (Type::Spaces, " "), + (Type::Identifier, "of"), + (Type::Spaces, " "), + (Type::Identifier, "second"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "data"), + (Type::Spaces, " "), + (Type::Identifier, "list"), + (Type::Spaces, " "), + (Type::Punct, "/"), + (Type::Identifier, "x"), + (Type::Spaces, " "), + (Type::Number, "1"), + (Type::Newline, "\n"), + (Type::StartCommand, ""), + (Type::Identifier, "aggregate"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "print"), + (Type::Spaces, " "), + (Type::Identifier, "eject"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Identifier, "twostep"), + (Type::Spaces, " "), + (Type::Identifier, "cluster"), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::SeparateCommands, ""), + (Type::Newline, "\n"), + (Type::Identifier, "fourth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::Spaces, " "), + (Type::Identifier, "fifth"), + (Type::Spaces, " "), + (Type::Identifier, "command"), + (Type::EndCommand, "."), + (Type::Newline, "\n"), + (Type::End, ""), + ], + &[ + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + 
PromptStyle::Later, + PromptStyle::Later, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::Later, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + PromptStyle::First, + ], ); } } diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c index 096383f5df..16e7c0fdd3 100644 --- a/src/language/lexer/segment.c +++ b/src/language/lexer/segment.c @@ -1848,6 +1848,47 @@ segment_type_to_string (enum segment_type type) } } +/* Returns the name of segment TYPE as a string. The caller must not modify + or free the returned string. + + This is useful only for debugging and testing. */ +const char * +segment_type_to_rust_string (enum segment_type type) +{ + switch (type) + { + case SEG_NUMBER: return "Number"; + case SEG_QUOTED_STRING: return "QuotedString"; + case SEG_HEX_STRING: return "HexString"; + case SEG_UNICODE_STRING: return "UnicodeString"; + case SEG_UNQUOTED_STRING: return "UnquotedString"; + case SEG_RESERVED_WORD: return "ReservedWord"; + case SEG_IDENTIFIER: return "Identifier"; + case SEG_PUNCT: return "Punct"; + case SEG_SHBANG: return "Shbang"; + case SEG_SPACES: return "Spaces"; + case SEG_COMMENT: return "Comment"; + case SEG_NEWLINE: return "Newline"; + case SEG_COMMENT_COMMAND: return "CommentCommand"; + case SEG_DO_REPEAT_COMMAND: return "DoRepeatCommand"; + case SEG_INLINE_DATA: return "InlineData"; + case SEG_MACRO_ID: return "MacroId"; + case SEG_MACRO_NAME: return "MacroName"; + case SEG_MACRO_BODY: return "MacroBody"; + case SEG_START_DOCUMENT: return "StartDocument"; + case SEG_DOCUMENT: return "Document"; + case SEG_START_COMMAND: return "StartCommand"; + case SEG_SEPARATE_COMMANDS: return "SeparateCommands"; + case SEG_END_COMMAND: return "EndCommand"; + case SEG_END: return "End"; + case SEG_EXPECTED_QUOTE: return "ExpectedQuote"; + case SEG_EXPECTED_EXPONENT: return "ExpectedExponent"; + case SEG_UNEXPECTED_CHAR: return "UnexpectedChar"; + default: + return "unknown segment type"; 
+ } +} + /* Returns a segmenter with the given syntax MODE. If IS_SNIPPET is false, then the segmenter will parse as if it's being given diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h index d5f846a900..6c2f0bd6b3 100644 --- a/src/language/lexer/segment.h +++ b/src/language/lexer/segment.h @@ -108,6 +108,7 @@ enum { SEG_N_TYPES = SEG_TYPES }; #undef SEG_TYPE const char *segment_type_to_string (enum segment_type); +const char *segment_type_to_rust_string (enum segment_type); /* A segmenter. Opaque. */ struct segmenter diff --git a/src/libpspp/prompt.c b/src/libpspp/prompt.c index f96ca8c100..6f70d33826 100644 --- a/src/libpspp/prompt.c +++ b/src/libpspp/prompt.c @@ -42,3 +42,27 @@ prompt_style_to_string (enum prompt_style style) } } +const char * +prompt_style_to_rust_string (enum prompt_style style) +{ + switch (style) + { + case PROMPT_FIRST: + return "First"; + case PROMPT_LATER: + return "Later"; + case PROMPT_DATA: + return "Data"; + case PROMPT_COMMENT: + return "Comment"; + case PROMPT_DOCUMENT: + return "Document"; + case PROMPT_DO_REPEAT: + return "DoRepeat"; + case PROMPT_DEFINE: + return "Define"; + default: + return "unknown prompt"; + } +} + diff --git a/src/libpspp/prompt.h b/src/libpspp/prompt.h index 8022e73732..24f4890035 100644 --- a/src/libpspp/prompt.h +++ b/src/libpspp/prompt.h @@ -29,5 +29,6 @@ enum prompt_style }; const char *prompt_style_to_string (enum prompt_style); +const char *prompt_style_to_rust_string (enum prompt_style); #endif /* prompt.h */ diff --git a/tests/language/lexer/segment-test.c b/tests/language/lexer/segment-test.c index 5977e8fce6..74e99e507e 100644 --- a/tests/language/lexer/segment-test.c +++ b/tests/language/lexer/segment-test.c @@ -54,6 +54,8 @@ static bool check_truncations; input. 
*/ static bool strip_trailing_newline; +static bool rust; + static const char *parse_options (int argc, char **argv); static void usage (void) NO_RETURN; @@ -115,9 +117,16 @@ check_segmentation (const char *input, size_t length, bool print_segments) int prev_type = -1; size_t offset = 0; enum segment_type type; + + enum prompt_style *prompts = NULL; + size_t n_prompts = 0; + size_t allocated_prompts = 0; + + if (rust) + printf ("&[\n"); do { - const char *type_name, *p; + const char *p; int n; if (one_byte) @@ -175,33 +184,51 @@ check_segmentation (const char *input, size_t length, bool print_segments) continue; } - if (!verbose) + if (!rust) { - if (prev_type != SEG_SPACES && prev_type != -1 - && type == SEG_SPACES && n == 1 && input[offset] == ' ') + if (!verbose) { - printf (" space\n"); - offset++; - prev_type = -1; - continue; + if (prev_type != SEG_SPACES && prev_type != -1 + && type == SEG_SPACES && n == 1 && input[offset] == ' ') + { + printf (" space\n"); + offset++; + prev_type = -1; + continue; + } } + if (prev_type != -1) + putchar ('\n'); + prev_type = type; } - if (prev_type != -1) - putchar ('\n'); - prev_type = type; if (verbose) printf ("%2zu:%2zu: ", line_number, offset - line_offset); - type_name = segment_type_to_string (type); - for (p = type_name; *p != '\0'; p++) - putchar (tolower ((unsigned char) *p)); - if (n > 0) + if (rust) + { + printf (" (Type::%s, ", segment_type_to_rust_string (type)); + } + else + { + const char *type_name = segment_type_to_string (type); + for (p = type_name; *p != '\0'; p++) + putchar (tolower ((unsigned char) *p)); + } + + if (n > 0 || rust) { int i; - for (i = MIN (15, strlen (type_name)); i < 16; i++) - putchar (' '); + if (rust) + printf ("\""); + else + { + const char *type_name = segment_type_to_string (type); + for (i = MIN (15, strlen (type_name)); i < 16; i++) + putchar (' '); + } + for (i = 0; i < n;) { const uint8_t *u_input = CHAR_CAST (const uint8_t *, input); @@ -228,11 +255,17 @@ check_segmentation 
(const char *input, size_t length, bool print_segments) switch (uc) { case ' ': - printf ("_"); + if (rust) + putchar (' '); + else + putchar ('_');; break; case '_': - printf ("\\_"); + if (rust) + putchar ('_'); + else + printf ("\\_"); break; case '\\': @@ -255,9 +288,21 @@ check_segmentation (const char *input, size_t length, bool print_segments) printf ("\\v"); break; + case '"': + if (rust) + printf ("\\\""); + else + putchar ('"'); + break; + default: if (uc < 0x20 || uc == 0x00a0) - printf ("<U+%04X>", uc); + { + if (rust) + printf ("\\u{%04x}", uc); + else + printf("<U+%04X>", uc); + } else fwrite (input + offset + i, 1, mblen, stdout); break; @@ -269,22 +314,42 @@ check_segmentation (const char *input, size_t length, bool print_segments) } offset += n; - if (type == SEG_NEWLINE) + if (rust) + { + printf ("\"),\n"); + if (type == SEG_NEWLINE) + { + if (n_prompts >= allocated_prompts) + prompts = x2nrealloc (prompts, &allocated_prompts, sizeof *prompts); + prompts[n_prompts++] = segmenter_get_prompt (&s); + } + } + else { - enum prompt_style prompt; + if (type == SEG_NEWLINE) { + enum prompt_style prompt; - line_number++; - line_offset = offset; + line_number++; + line_offset = offset; - prompt = segmenter_get_prompt (&s); - printf (" (%s)\n", prompt_style_to_string (prompt)); + prompt = segmenter_get_prompt(&s); + printf(" (%s)\n", prompt_style_to_string(prompt)); + } } - fflush (stdout); + fflush(stdout); } while (type != SEG_END); - if (print_segments) + if (print_segments && !rust) putchar ('\n'); + + if (rust) + { + printf ("], &[\n"); + for (size_t i = 0; i < n_prompts; i++) + printf (" PromptStyle::%s,\n", prompt_style_to_rust_string(prompts[i])); + printf ("]\n"); + } } static const char * @@ -301,6 +366,7 @@ parse_options (int argc, char **argv) {"batch", no_argument, NULL, 'b'}, {"interactive", no_argument, NULL, 'i'}, {"verbose", no_argument, NULL, 'v'}, + {"rust", no_argument, NULL, 'r'}, {"help", no_argument, NULL, 'h'}, {NULL, 0, NULL, 0}, }; @@ -339,6
+405,10 @@ parse_options (int argc, char **argv) verbose = true; break; + case 'r': + rust = true; + break; + case 'h': usage (); diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at index 80c09779a8..2d5184b6cc 100644 --- a/tests/language/lexer/segment.at +++ b/tests/language/lexer/segment.at @@ -17,6 +17,7 @@ dnl AT_BANNER([syntax segmentation]) m4_define([PSPP_CHECK_SEGMENT], [AT_CAPTURE_FILE([input]) + segment-test --rust $1 input > output.rs for strip in "" "-s"; do case $strip in # ( '') sed 's/^-//' < expout-base > expout ;; # ( -- 2.30.2