+#include <assert.h>
#include <float.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
+#include "u8-mbtouc.h"
static uint8_t *data;
static size_t n;
}
+/* Scans the null-terminated string P_ one character at a time with
+   u8_mbtouc().  Returns false as soon as a decoded character is below 32
+   (other than new-line), equals 127, or equals 0xfffd; returns true if the
+   end of the string is reached without finding one.
+
+   NOTE(review): 0xfffd is presumably what u8_mbtouc() reports for invalid
+   input -- confirm against the libunistring documentation. */
+static bool __attribute__((unused))
+all_utf8(const char *p_)
+{
+  const uint8_t *bytes = (const uint8_t *) p_;
+  size_t n_bytes = strlen (p_);
+  size_t ofs = 0;
+
+  while (ofs < n_bytes)
+    {
+      ucs4_t uc;
+      size_t mblen = u8_mbtouc (&uc, bytes + ofs, n_bytes - ofs);
+
+      bool low_control = uc < 32 && uc != '\n';
+      if (low_control || uc == 127 || uc == 0xfffd)
+        return false;
+
+      ofs += mblen;
+    }
+  return true;
+}
+
static char *
get_string(const char *where)
{
}
static void
-dump_value_31(FILE *stream)
+dump_optional_value(FILE *stream)
{
if (match_byte (0x31))
{
{
/* We only have one SPV file for this version (with many
tables). */
- match_u32_assert(0x200);
- match_u32_assert(0x1000000);
- match_u32_assert(0);
- match_byte_assert(0);
+ match_byte(0);
+ if (!match_u32(1))
+ match_u32_assert(2);
+ match_byte(0);
+ match_byte(0);
+ if (!match_u32(0) && !match_u32(1) && !match_u32(2) && !match_u32(3) && !match_u32(4) && !match_u32(5) && !match_u32(6) && !match_u32(7) && !match_u32(8) && !match_u32(9))
+ match_u32_assert(10);
+ match_byte(0);
+ match_byte(0);
return;
}
else if (match_u32 (2))
{
fprintf(stream, "(special 2)");
+ if (!match_byte(0))
+ match_byte_assert(2);
match_byte_assert(0);
- match_byte_assert(0);
- if (!match_u32 (2))
- match_u32_assert(1);
+ if (!match_u32 (2) && !match_u32(1))
+ match_u32_assert(3);
dump_nested_string(); /* Our corpus doesn't contain any examples with strings though. */
}
else
}
static void
-dump_value(FILE *stream, int level, bool match1)
+dump_value(FILE *stream, int level)
{
match_byte(0);
match_byte(0);
if (match_byte (3))
{
char *text = get_string();
- dump_value_31(stream);
+ dump_optional_value(stream);
char *identifier = get_string();
char *text_eng = get_string();
fprintf (stream, "<string c=\"%s\"", text_eng);
fprintf (stream, "/>\n");
if (!match_byte (0))
match_byte_assert(1);
- if (match1)
- match_byte (1);
}
else if (match_byte (5))
{
- dump_value_31(stream);
+ dump_optional_value(stream);
char *name = get_string ();
char *label = get_string ();
fprintf (stream, "<variable name=\"%s\"", name);
char *var, *vallab;
double value;
- dump_value_31 (stream);
+ dump_optional_value (stream);
format = get_u32 ();
value = get_double ();
var = get_string ();
unsigned int format;
double value;
- dump_value_31(stream);
+ dump_optional_value(stream);
format = get_u32 ();
value = get_double ();
fprintf (stream, "<number value=\"%.*g\" format=\"%s%d.%d\"/>\n",
DBL_DIG, value, format_to_string(format >> 16), (format >> 8) & 0xff, format & 0xff);
- if (match1)
- match_byte (1);
}
else
{
- dump_value_31(stream);
+ dump_optional_value(stream);
char *base = get_string();
int x = get_u32();
fprintf (stream, " ");
fprintf (stream, "<substitution index=\"%d\">\n", i + 1);
for (int j = 0; j < y; j++)
- dump_value (stream, level + 2, false);
+ dump_value (stream, level + 2);
for (int j = 0; j <= level + 1; j++)
fprintf (stream, " ");
fprintf (stream, "</substitution>\n");
}
+/* Dumps one category to STREAM as a <category> element, then recurses into
+   its subcategories at LEVEL + 1.  When the category's index is not -1 it
+   is appended to INDEXES and *N_INDEXES is incremented.  Exits with status
+   1 if the merge/unindexed flag bytes are inconsistent with the index.
+
+   NOTE(review): MAX_INDEXES is presumably the capacity of INDEXES; no
+   check against it is visible in this hunk -- confirm one exists. */
static void
-dump_category(int level, int *indexes, int *n_indexes, int max_indexes)
+dump_category(FILE *stream, int level, int *indexes, int *n_indexes, int max_indexes)
{
for (int i = 0; i <= level; i++)
- fprintf (stdout, " ");
+ fprintf (stream, " ");
- printf ("<category>\n");
+ fprintf (stream, "<category>\n"); /* Same stream as the indentation, so the element is not split when STREAM is not stdout. */
- dump_value (stdout, level + 1, true);
- match_byte(0);
- match_byte(0);
- match_byte(0);
+ dump_value (stream, level + 1);
- if (match_u32 (1))
- match_byte (0);
- else if (match_byte (1))
- {
- match_byte (0);
- if (!match_u32 (2))
- match_u32_assert (1);
- match_byte (0);
- }
- else if (!match_u32(2))
- match_u32_assert (0);
+ int merge = data[pos]; /* Peek at the flag byte that the match below consumes. */
+ if (!match_byte(0))
+ match_byte_assert (1);
+
+ match_byte_assert (0);
+
+ int unindexed = data[pos]; /* Likewise; must equal (indx == -1), checked below. */
+ if (!match_byte(0))
+ match_byte_assert (1);
+
+ int x = get_u32 ();
+ pos -= 4; /* Step back so the match below re-reads the word just fetched into x (assumes get_u32() advances pos by 4 -- confirm). */
+ if (!match_u32 (0))
+ match_u32_assert (2);
int indx = get_u32();
int n_categories = get_u32();
- if (indx != -1)
+ if (indx == -1)
{
+ if (merge)
+ {
+ for (int i = 0; i <= level + 1; i++)
+ fprintf (stream, " ");
+ fprintf (stream, "<merge/>\n");
+ }
+ }
+ else
+ {
+ if (merge)
+ {
+ fprintf(stderr, "index not -1 but merged\n");
+ exit(1);
+ }
+ if (x != 2)
+ {
+ fprintf(stderr, "index not -1 but x != 2\n");
+ exit(1);
+ }
if (n_categories != 0)
{
fprintf(stderr, "index not -1 but subcategories\n");
}
indexes[(*n_indexes)++] = indx;
}
+
+ int expected_unindexed = indx == -1;
+ if (unindexed != expected_unindexed)
+ {
+ fprintf(stderr, "unindexed (%d) mismatch with indx (%d)\n",
+ unindexed, indx);
+ exit(1);
+ }
+
if (n_categories == 0)
{
for (int i = 0; i <= level + 1; i++)
- fprintf (stdout, " ");
- fprintf (stdout, "<category-index>%d</category-index>\n", indx);
+ fprintf (stream, " ");
+ fprintf (stream, "<category-index>%d</category-index>\n", indx);
}
for (int i = 0; i < n_categories; i++)
- dump_category (level + 1, indexes, n_indexes, max_indexes);
+ dump_category (stream, level + 1, indexes, n_indexes, max_indexes);
for (int i = 0; i <= level; i++)
- fprintf (stdout, " ");
+ fprintf (stream, " ");
- printf ("</category>\n");
+ fprintf (stream, "</category>\n");
}
-static void
+static int
dump_dim(int indx)
{
int n_categories;
printf ("<dimension index=\"%d\">\n", indx);
- dump_value (stdout, 0, false);
+ dump_value (stdout, 0);
- /* This byte is usually 0x02 but many other values have been spotted. */
+ /* This byte is usually 0 but many other values have been spotted. */
pos++;
if (!match_byte(0) && !match_byte(1))
int indexes[2048];
int n_indexes = 0;
for (int i = 0; i < n_categories; i++)
- dump_category (0, indexes, &n_indexes, sizeof indexes / sizeof *indexes);
+ dump_category (stdout, 0, indexes, &n_indexes, sizeof indexes / sizeof *indexes);
check_permutation(indexes, n_indexes, "categories");
fprintf (stdout, "</dimension>\n");
+ return n_indexes;
}
int n_dims;
+static int dim_n_cats[64];
+#define MAX_DIMS (sizeof dim_n_cats / sizeof *dim_n_cats)
+
+/* Reads the dimension count into n_dims, then dumps each dimension with
+   dump_dim() and records its return value in dim_n_cats[].  Exits with
+   status 1 if the count would not fit in dim_n_cats[]. */
static void
dump_dims(void)
{
n_dims = get_u32();
+ /* The count comes from the input file, so check it at run time: assert()
+    is compiled out under NDEBUG and would leave dim_n_cats[] unprotected. */
+ if (n_dims < 0 || (size_t) n_dims > MAX_DIMS)
+ {
+ fprintf (stderr, "%d dimensions exceeds maximum of %zu\n",
+ n_dims, (size_t) MAX_DIMS);
+ exit (1);
+ }
for (int i = 0; i < n_dims; i++)
- dump_dim (i);
+ dim_n_cats[i] = dump_dim (i);
}
+/* Dumps the data section: how the dimensions split into layers, rows, and
+   columns, the dimension permutation, and then each datum with its index,
+   its per-dimension coordinates, and its value.  Exits with status 1 if a
+   datum is present while some dimension has no recorded categories. */
static void
dump_data(void)
{
/* The first three numbers add to the number of dimensions. */
- int t = get_u32();
- t += get_u32();
- match_u32_assert(n_dims - t);
+ int l = get_u32();
+ int r = get_u32();
+ int c = n_dims - l - r;
+ match_u32_assert(c);
/* The next n_dims numbers are a permutation of the dimension numbers. */
int a[n_dims];
for (int i = 0; i < n_dims; i++)
- a[i] = get_u32();
+ {
+ int dim = get_u32();
+ a[i] = dim;
+
+ /* The first l entries are layers, the next r are rows, the rest columns. */
+ const char *name = i < l ? "layer" : i < l + r ? "row" : "column";
+ printf ("<%s dimension=\"%d\"/>\n", name, dim);
+ }
check_permutation(a, n_dims, "dimensions");
int x = get_u32();
printf ("<data>\n");
for (int i = 0; i < x; i++)
{
- printf (" <datum index=\"%d\">\n", get_u32());
+ unsigned int indx = get_u32();
+ /* Emit the opening quote here so the attribute is well formed even when
+    there are no dimensions. */
+ printf (" <datum index=\"%u\" coords=\"", indx);
+
+ /* Split indx into one coordinate per dimension, last dimension varying
+    fastest, using the category counts recorded in dim_n_cats[]. */
+ int coords[MAX_DIMS];
+ unsigned int rest = indx;
+ for (int d = n_dims; d-- > 0; )
+ {
+ if (dim_n_cats[d] <= 0)
+ {
+ /* Avoid dividing by zero on a dimension with no categories. */
+ fprintf (stderr, "datum index %u but dimension %d has no categories\n",
+ indx, d);
+ exit (1);
+ }
+ coords[d] = rest % dim_n_cats[d];
+ rest /= dim_n_cats[d];
+ }
+ for (int d = 0; d < n_dims; d++)
+ printf ("%s%d", d ? "," : "", coords[d]);
+
+ printf ("\">\n");
match_u32_assert(0);
- dump_value(stdout, 1, false);
+ if (version == 1)
+ match_byte(0);
+ dump_value(stdout, 1);
fprintf (stdout, " </datum>\n");
}
printf ("</data>\n");
static void
dump_title(void)
{
- pos = 0x27;
printf ("<title-local>\n");
- dump_value(stdout, 0, true);
+ dump_value(stdout, 0); /* dump_title() prints the local title, subtype, C-locale title, optional caption, and footnotes, starting from the current pos. */
+ match_byte(1); /* Optional trailing 1 byte; dump_value() used to consume it itself when called with match1 == true. */
printf ("</title-local>\n");
printf ("<subtype>\n");
- dump_value(stdout, 0, true);
+ dump_value(stdout, 0);
+ match_byte(1); /* Optional trailing 1 byte, as for the local title. */
printf ("</subtype>\n");
match_byte_assert(0x31);
printf ("<title-c>\n");
- dump_value(stdout, 0, true);
+ dump_value(stdout, 0);
+ match_byte(1); /* Optional trailing 1 byte, as for the local title. */
printf ("</title-c>\n");
match_byte(0);
if (match_byte(0x31))
{
printf ("<caption>\n");
- dump_value(stdout, 0, false);
+ dump_value(stdout, 0); /* No trailing-byte match here: the old call passed match1 == false. */
printf ("</caption>\n");
}
else
match_byte_assert(0x58);
-
int n_footnotes = get_u32();
for (int i = 0; i < n_footnotes; i++)
{
printf ("<footnote index=\"%d\">\n", i);
- dump_value(stdout, 0, false);
+ dump_value(stdout, 0);
+ /* Custom footnote marker string. */
if (match_byte (0x31))
- {
- /* Custom footnote marker string. */
- match_byte_assert(3);
- get_string();
- match_byte_assert(0x58);
- match_u32_assert(0);
- get_string();
- }
+ dump_value(stdout, 0); /* The marker is parsed as a general value rather than the fixed byte-3 string layout used before. */
else
match_byte_assert (0x58);
- printf("(%d)\n", get_u32());
+ get_u32 (); /* The trailing 32-bit word is still read but no longer printed. */
printf ("</footnote>\n");
}
}
match_byte_assert(0);
if (!match_byte(0x40) && !match_byte(0x20) && !match_byte(0x80) && !match_byte(0x10) && !match_byte(0x70))
match_byte_assert(0x50);
- if (!match_byte(0x41))
- match_byte_assert(0x51);
+ match_byte_assert(0x41);
if (!match_u32(0) && !match_u32(1))
match_u32_assert(2);
match_byte_assert(0);
/* OK, this seems really unlikely to be totally correct, but it matches my corpus... */
if (!match_u32(0) && !match_u32(2))
- match_u32_assert(0xfaad);
+ {
+ if (i == 7)
+ match_u32_assert(0xfaad);
+ else
+ match_u32_assert(0);
+ }
if (!match_u32(0) && !match_u32(1) && !match_u32(2))
match_u32_assert(3);
if (version > 1)
{
- /* These seem unlikely to be correct too. */
if (i != 3)
{
if (!match_u32(8))
int count = get_u32();
pos += 4 * count;
- printf ("<encoding>%s</encoding>\n", get_string ());
+ const char *encoding = get_string();
+ printf ("<encoding>%s</encoding>\n", encoding);
if (!match_u32(0))
match_u32_assert(UINT32_MAX);
}
else
match_u32_assert(UINT32_MAX);
+
+ int decimal = data[pos];
+ int grouping = data[pos + 1];
if (match_byte('.'))
{
- if (!match_byte(','))
+ if (!match_byte(',') && !match_byte('\''))
match_byte_assert(' ');
}
else
if (!match_byte('.') && !match_byte(' '))
match_byte_assert(0);
}
+ printf("<format decimal=\"%c\" grouping=\"", decimal);
+ if (grouping)
+ putchar(grouping);
+ printf("\"/>\n");
if (match_u32(5))
{
for (int i = 0; i < 5; i++)
- get_string();
+ printf("<CC%c>%s</CC%c>\n", 'A' + i, get_string(), 'A' + i);
}
else
match_u32_assert(0);
exit(1);
}
- if (argc > 1)
+ if (argc != 2)
{
- if (!strcmp(argv[1], "title0"))
- {
- pos = 0x27;
- if (match_byte (0x03)
- || (match_byte (0x05) && match_byte (0x58)))
- printf ("%s\n", get_string());
- else
- printf ("<unknown>\n");
- return 0;
- }
- else if (!strcmp(argv[1], "title"))
- {
- dump_title();
- exit(0);
- }
- else if (!strcmp(argv[1], "titleraw"))
- {
- const char fonts[] = "\x01\x31\x09\0\0\0SansSerif";
- start = 0x27;
- n = find(fonts, sizeof fonts - 1);
- }
- else if (!strcmp(argv[1], "fonts"))
- {
- const char fonts[] = "\x01\x31\x09\0\0\0SansSerif";
- const char styles[] = "\xf0\0\0\0";
- start = find(fonts, sizeof fonts - 1);
- n = find(styles, sizeof styles - 1);
- }
- else if (!strcmp(argv[1], "styles"))
- {
- const char styles[] = "\xf0\0\0\0";
- const char dimensions[] = "-,,,.\0";
- start = find(styles, sizeof styles - 1);
- n = find(dimensions, sizeof dimensions - 1) + sizeof dimensions - 1;
- }
- else if (!strcmp(argv[1], "dimensions") || !strcmp(argv[1], "all"))
- {
- pos = 0;
- match_byte_assert(1);
- match_byte_assert(0);
-
- /* This might be a version number of some kind, because value 1 seems
- to only appear in an SPV file that also required its own weird
- special cases in dump_value_31(). */
- version = get_u32();
- pos -= 4;
- if (!match_u32(1))
- match_u32_assert(3);
+ fprintf (stderr, "usage: %s TYPE < .bin", argv[0]);
+ exit (1);
+ }
- match_byte_assert(1);
- if (!match_byte(0))
- match_byte_assert(1);
- match_byte_assert(0);
- match_byte_assert(0);
- if (!match_byte(0))
- match_byte_assert(1);
- pos++;
- match_byte_assert(0);
- match_byte_assert(0);
- match_byte_assert(0);
- dump_title ();
- dump_fonts();
- dump_dims ();
- dump_data ();
- match_byte (1);
- if (pos != n)
- {
- fprintf (stderr, "%x / %x\n", pos, n);
- exit(1);
- }
- exit(0);
- }
+ if (!strcmp(argv[1], "title0"))
+ {
+ pos = 0x27;
+ if (match_byte (0x03)
+ || (match_byte (0x05) && match_byte (0x58)))
+ printf ("%s\n", get_string());
else
+ printf ("<unknown>\n");
+ return 0;
+ }
+ else if (!strcmp(argv[1], "title"))
+ {
+ pos = 0x27;
+ dump_title();
+ exit(0);
+ }
+ else if (!strcmp(argv[1], "titleraw"))
+ {
+ const char fonts[] = "\x01\x31\x09\0\0\0SansSerif";
+ start = 0x27;
+ n = find(fonts, sizeof fonts - 1);
+ }
+ else if (!strcmp(argv[1], "fonts"))
+ {
+ const char fonts[] = "\x01\x31\x09\0\0\0SansSerif";
+ const char styles[] = "\xf0\0\0\0";
+ start = find(fonts, sizeof fonts - 1);
+ n = find(styles, sizeof styles - 1);
+ }
+ else if (!strcmp(argv[1], "styles"))
+ {
+ const char styles[] = "\xf0\0\0\0";
+ const char dimensions[] = "-,,,.\0";
+ start = find(styles, sizeof styles - 1);
+ n = find(dimensions, sizeof dimensions - 1) + sizeof dimensions - 1;
+ }
+ else if (!strcmp(argv[1], "dimensions") || !strcmp(argv[1], "all"))
+ {
+ pos = 0;
+ match_byte_assert(1);
+ match_byte_assert(0);
+
+ /* This might be a version number of some kind, because value 1 seems
+ to only appear in an SPV file that also required its own weird
+ special cases in dump_optional_value(). */
+ version = get_u32();
+ pos -= 4;
+ if (!match_u32(1))
+ match_u32_assert(3);
+
+ match_byte_assert(1);
+ if (!match_byte(0))
+ match_byte_assert(1);
+
+ /* Offset 8. */
+ match_byte_assert(0);
+ match_byte_assert(0);
+ if (!match_byte(0))
+ match_byte_assert(1);
+
+ /* Offset 11. */
+ pos++;
+ match_byte_assert(0);
+ match_byte_assert(0);
+ match_byte_assert(0);
+
+ /* Offset 15. */
+ pos++;
+ if (!match_byte(0))
+ match_byte_assert(1);
+ match_byte_assert(0);
+ match_byte_assert(0);
+
+ /* Offset 19. */
+ pos++;
+ if (!match_byte(0))
+ match_byte_assert(1);
+ match_byte_assert(0);
+ match_byte_assert(0);
+
+ /* Offset 23. */
+ pos++;
+ if (!match_byte(0))
+ match_byte_assert(1);
+ match_byte_assert(0);
+ match_byte_assert(0);
+
+ /* Offset 27. */
+ pos++;
+ pos++;
+ match_byte_assert(0);
+ match_byte_assert(0);
+
+ /* Offset 31.
+
+ This is the low 32 bits of the tableId, e.g. a tableId of
+ -4154297861994971133 would appear here as 0xdca00003.  We don't
+ have enough context to validate it. */
+ pos += 4;
+
+ /* Offset 35. */
+ pos += 4;
+
+ dump_title ();
+ dump_fonts();
+ dump_dims ();
+ dump_data ();
+ match_byte (1);
+ if (pos != n)
{
- fprintf (stderr, "unknown section %s\n", argv[1]);
+ fprintf (stderr, "%x / %x\n", pos, n);
exit(1);
}
+ exit(0);
}
- else
- start = 0x27;
+ else if (!strcmp(argv[1], "raw"))
+ {
+ start = 0x27;
- dump_raw(stdout, start, n);
+ dump_raw(stdout, start, n);
+ }
+ else
+ {
+ fprintf (stderr, "unknown section %s\n", argv[1]);
+ exit(1);
+ }
return 0;
}