#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
+#include "u8-mbtouc.h"
static uint8_t *data;
static size_t n;
}
+static bool __attribute__((unused)) /* Returns false if P_ decodes to any character below 32 other than '\n', to 127, or to 0xfffd; true otherwise. */
+all_utf8(const char *p_)
+{
+ const uint8_t *p = (const uint8_t *) p_;
+ size_t len = strlen ((char *) p); /* Byte length of P_, excluding the terminating null. */
+ for (size_t ofs = 0, mblen; ofs < len; ofs += mblen)
+ {
+ ucs4_t uc;
+ /* Decode the character at OFS; MBLEN takes u8_mbtouc's return value and is used to advance OFS. */
+ mblen = u8_mbtouc (&uc, p + ofs, len - ofs);
+ if ((uc < 32 && uc != '\n') || uc == 127 || uc == 0xfffd) /* NOTE(review): 0xfffd is presumably what u8_mbtouc stores for invalid UTF-8 -- confirm. */
+ return false;
+ }
+ return true;
+}
+
static char *
get_string(const char *where)
{
match_byte_assert(0);
if (!match_byte(0x40) && !match_byte(0x20) && !match_byte(0x80) && !match_byte(0x10) && !match_byte(0x70))
match_byte_assert(0x50);
- if (!match_byte(0x41))
- match_byte_assert(0x51);
+ match_byte_assert(0x41);
if (!match_u32(0) && !match_u32(1))
match_u32_assert(2);
match_byte_assert(0);
/* OK, this seems really unlikely to be totally correct, but it matches my corpus... */
if (!match_u32(0) && !match_u32(2))
- match_u32_assert(0xfaad);
+ {
+ if (i == 7)
+ match_u32_assert(0xfaad);
+ else
+ match_u32_assert(0);
+ }
if (!match_u32(0) && !match_u32(1) && !match_u32(2))
match_u32_assert(3);
if (version > 1)
{
- /* These seem unlikely to be correct too. */
if (i != 3)
{
if (!match_u32(8))
concatenated together, terminated by a byte 01:
@example
-light-member := header title fonts dims data 01
+light-member := header title styles dimensions data 01
@end example
The first section is a 0x27-byte header:
@end example
@example
-fonts := 00 font*8
- int[x1] byte*[x1]
- int[x2] byte*[x2]
- int[x3] byte*[x3]
- int[x4] int*[x4]
- string /* @r{encoding} */
- (i0 | i-1) (00 | 01) 00 (00 | 01)
- int
- byte[decimal] byte[grouping]
- int[x5] string*[x5] /* @r{custom currency} */
- int[x6] byte*[x6]
+styles := 00 font*8
+ int[x1] byte*[x1]
+ int[x2] byte*[x2]
+ int[x3] byte*[x3]
+ int[x4] int*[x4]
+ string[encoding]
+ (i0 | i-1) (00 | 01) 00 (00 | 01)
+ int
+ byte[decimal] byte[grouping]
+ int[x5] string*[x5] /* @r{custom currency} */
+ int[x6] byte*[x6]
@end example
In every example in the corpus, @code{x1} is 240. The meaning of the
Observed values of @code{x4} vary from 0 to 17. Out of 7060 examples
in the corpus, it is nonzero only 36 times.
+@code{encoding} is a character encoding, usually a Windows code page
+such as @code{en_US.windows-1252} or @code{it_IT.windows-1252}. The
+encoding string is itself encoded in US-ASCII. The rest of the
+character strings in the file use this encoding.
+
@code{decimal} is the decimal point character. The observed values
are @samp{.} and @samp{,}.
@code{grouping} is the grouping character. The observed values are
@samp{,}, @samp{.}, @samp{'}, @samp{ }, and zero (presumably
indicating that digits should not be grouped).
+
+@code{x5} is observed as either 0 or 5. When it is 5, the following
+strings are CCA through CCE format strings. Most commonly these are
+all @code{-,,,} but other strings occur.
+
+@example
+font := byte[index] 31 string[typeface]
+ 00 00
+ (10 | 20 | 40 | 50 | 70 | 80)[f1]
+ 41
+ (i0 | i1 | i2)[f2]
+ 00
+ (i0 | i2 | i64173)[f3]
+ (i0 | i1 | i2 | i3)[f4]
+ string[fgcolor] string[bgcolor]
+ i0 i0 00
+ (v3: int[f5] int[f6] int[f7] int[f8])
+@end example
+
+Each @code{font}, in order, represents the font style for a different
+element: title, caption, footnote, row labels, column labels, corner
+labels, data, and layers.
+
+@code{index} is the 1-based index of the @code{font}, i.e. 1 for the
+first @code{font}, through 8 for the final @code{font}.
+
+@code{typeface} is the string name of the font. In the corpus, this
+is @code{SansSerif} in over 99% of instances and @code{Times New
+Roman} in the rest.
+
+@code{fgcolor} and @code{bgcolor} are the foreground color and
+background color, respectively. In the corpus, these are always
+@code{#000000} and @code{#ffffff}, respectively.
+
+The meaning of the remaining data is unknown. It seems likely to
+include font sizes, horizontal and vertical alignment, attributes such
+as bold or italic, and margins. @code{f1} is @code{40} most of the
+time. @code{f2} is @code{i1} most of the time for the title and
+@code{i0} most of the time for other fonts.
+
+The table below lists the values observed in the corpus. When a cell
+contains a single value, then 99+% of the corpus contains that value.
+When a cell contains a pair of values, then the first value is seen in
+about two-thirds of the corpus and the second value in about the
+remaining one-third. In fonts that include multiple pairs, values are
+correlated: for font 3, the combination f5 = 24, f6 = 24, f7 = 2
+appears about two-thirds of the time, as does the combination f4 = 0,
+f6 = 10 for font 7.
+
+@example
+font f1 f2 f3 f4 f5 f6 f7 f8
+
+ 1 40 1 0 0 8 10/11 1 8
+ 2 40 0 2 1 8 10/11 1 1
+ 3 40 0 2 1 24/11 24/ 8 2/3 4
+ 4 40 0 2 3 8 10/11 1 1
+ 5 40 0 0 1 8 10/11 1 4
+ 6 40 0 2 1 8 10/11 1 4
+ 7 40 0 64173 0/1 8 10/11 1 1
+ 8 40 0 2 3 8 10/11 1 4
+@end example