/* eBookReaderSwitch/source/pdf/pdf-lex.c (693 lines, 13 KiB, C) */

#include "mupdf/fitz.h"
#include "mupdf/pdf.h"
#include <string.h>
/*
 * Character-class helpers for switch statements. Each macro expands to a
 * run of character constants joined by ":case", so that writing
 * `case IS_WHITE:` yields one case label per listed character.
 */
/* Characters that may start a number: '+', '-', '.' or a decimal digit. */
#define IS_NUMBER \
'+':case'-':case'.':case'0':case'1':case'2':case'3':\
case'4':case'5':case'6':case'7':case'8':case'9'
/* NUL, tab, line feed, form feed, carriage return and space. */
#define IS_WHITE \
'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
/* Hexadecimal digits, letters in either case. */
#define IS_HEX \
'0':case'1':case'2':case'3':case'4':case'5':case'6':\
case'7':case'8':case'9':case'A':case'B':case'C':\
case'D':case'E':case'F':case'a':case'b':case'c':\
case'd':case'e':case'f'
/* Delimiter characters: ( ) < > [ ] { } / % */
#define IS_DELIM \
'(':case')':case'<':case'>':case'[':case']':case'{':\
case'}':case'/':case'%'
/* Decimal digits. */
#define RANGE_0_9 \
'0':case'1':case'2':case'3':case'4':case'5':\
case'6':case'7':case'8':case'9'
/* Lower-case hexadecimal letters. */
#define RANGE_a_f \
'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hexadecimal letters. */
#define RANGE_A_F \
'A':case'B':case'C':case'D':case'E':case'F'
/* Octal digits. */
#define RANGE_0_7 \
'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
/*
 * Debugging aid: enable the define below to make lex_byte echo every byte
 * it reads.
 */
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
/*
 * Read one byte from stm and echo it to the stream returned by
 * fz_stdout(ctx): "<EOF>" at end of input, the character itself for
 * values 32..127, and "<xx>" (two hex digits) for anything else.
 * Returns the value read, unchanged.
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
int c = fz_read_byte(ctx, stm);
if (c == EOF)
fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
else if (c >= 32 && c < 128)
fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
else
fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
return c;
}
#else
/* Without DUMP_LEXER_STREAM, lex_byte is simply fz_read_byte. */
#define lex_byte(C,S) fz_read_byte(C,S)
#endif
/*
 * Return 1 when ch is one of the six bytes this lexer treats as
 * whitespace (NUL, tab, line feed, form feed, carriage return, space)
 * and 0 for every other value.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case 0x00:	/* NUL */
	case 0x09:	/* horizontal tab */
	case 0x0a:	/* line feed */
	case 0x0c:	/* form feed */
	case 0x0d:	/* carriage return */
	case 0x20:	/* space */
		return 1;
	default:
		return 0;
	}
}
/*
 * Return 1 when ch lies in the range ' ' .. '~' inclusive, 0 otherwise.
 */
static inline int fz_isprint(int ch)
{
	if (ch < ' ')
		return 0;
	return ch <= '~';
}
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Any character that is not a hex digit yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if (ch >= 'a' && ch <= 'f')
		value = 10 + (ch - 'a');
	else if (ch >= 'A' && ch <= 'F')
		value = 10 + (ch - 'A');
	else if (ch >= '0' && ch <= '9')
		value = ch - '0';

	return value;
}
/*
 * Consume a run of whitespace bytes from f. The first non-whitespace byte
 * is pushed back onto the stream, unless the run ended at EOF.
 */
static void
lex_white(fz_context *ctx, fz_stream *f)
{
	int ch;

	for (;;)
	{
		ch = lex_byte(ctx, f);
		/* Every whitespace byte is <= 32, so that comparison is a
		 * cheap filter in front of the full iswhite() test. */
		if (ch > 32 || !iswhite(ch))
			break;
	}

	if (ch == EOF)
		return;
	fz_unread_byte(ctx, f);
}
/*
 * Discard the rest of a comment: read bytes from f up to and including the
 * first line feed or carriage return, or until EOF. The terminating byte
 * is consumed, not pushed back.
 */
static void
lex_comment(fz_context *ctx, fz_stream *f)
{
	for (;;)
	{
		int ch = lex_byte(ctx, f);

		if (ch == '\012' || ch == '\015' || ch == EOF)
			return;
	}
}
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepted syntax: any number of '-' characters (one or more makes the
 * result negative), then any number of '+' characters, then decimal
 * digits, optionally followed by '.' and more decimal digits. Parsing
 * stops at the first character that does not fit; exponents are not
 * recognised.
 *
 * s: NUL-terminated string to parse.
 * Returns the parsed value, or 0 if the string holds no digits.
 */
static float acrobat_compatible_atof(char *s)
{
	int neg = 0;
	/* The integer part is accumulated in an unsigned type: overflow is
	 * expected (see below) and must wrap rather than invoke the
	 * undefined behaviour of signed overflow. */
	unsigned int acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here.
		 * Tests show that Acrobat handles overflows in exactly the same way we do:
		 * 123450000000000000000678 is read as 678.
		 */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		/* Reinterpret the wrapped integer part as a signed int, as
		 * the historical code did, then add the fraction. */
		float v = (float)(int)acc;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}
	else
	{
		/* Negate in unsigned arithmetic so that the most negative
		 * int cannot overflow, then convert back to signed. */
		return (float)(int)(neg ? 0u - acc : acc);
	}
}
/*
 * Fast but inaccurate atoi.
 *
 * Accepted syntax: any number of '-' characters (one or more makes the
 * result negative), then any number of '+' characters, then decimal
 * digits. Parsing stops at the first non-digit.
 *
 * s: NUL-terminated string to parse.
 * Returns the parsed value, or 0 if the string holds no digits. Values
 * too large for an int wrap around instead of being reported.
 */
static int fast_atoi(char *s)
{
	int neg = 0;
	/* Accumulate in an unsigned type so the deliberately ignored
	 * overflow wraps instead of being signed-overflow undefined
	 * behaviour. */
	unsigned int acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Negate while still unsigned so that the most negative int cannot
	 * overflow, then convert back to signed. */
	return (int)(neg ? 0u - acc : acc);
}
/*
 * Lex a number whose first character, c, has already been read by the
 * caller. Characters are copied into buf->scratch until whitespace, a
 * delimiter, EOF, or a full buffer ends the token.
 *
 * Returns PDF_TOK_INT with the value in buf->i, PDF_TOK_REAL with the
 * value in buf->f, or PDF_TOK_ERROR if the token contained a second '.'
 * or any character that is not a digit or '.'.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
char *s = buf->scratch;
char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
/* isreal records where in scratch a '.' was stored; NULL means none seen. */
char *isreal = (c == '.' ? s : NULL);
int neg = (c == '-');
int isbad = 0;
*s++ = c;
c = lex_byte(ctx, f);
/* skip extra '-' signs at start of number */
if (neg)
{
/* The extra signs are read and dropped; only the first '-' is stored. */
while (c == '-')
c = lex_byte(ctx, f);
}
/* NOTE(review): if this loop ends because s reached e, the byte most
 * recently read into c is neither stored nor pushed back. */
while (s < e)
{
switch (c)
{
case IS_WHITE:
case IS_DELIM:
/* The terminator belongs to the next token, so push it back. */
fz_unread_byte(ctx, f);
goto end;
case EOF:
goto end;
case '.':
/* A second decimal point makes the number malformed. */
if (isreal)
isbad = 1;
isreal = s;
*s++ = c;
break;
case RANGE_0_9:
*s++ = c;
break;
default:
/* Keep consuming so the whole bad token is swallowed. */
isbad = 1;
*s++ = c;
break;
}
c = lex_byte(ctx, f);
}
end:
*s = '\0';
if (isbad)
return PDF_TOK_ERROR;
if (isreal)
{
/* We'd like to use the fastest possible atof
 * routine, but we'd rather match acrobats
 * handling of broken numbers. As such, we
 * spot common broken cases and call an
 * acrobat compatible routine where required. */
/* NOTE(review): neg is only ever 0 or 1 in this function, so
 * `neg > 1` cannot be true; in practice only the position of the
 * '.' (10 or more characters in) selects the Acrobat routine. */
if (neg > 1 || isreal - buf->scratch >= 10)
buf->f = acrobat_compatible_atof(buf->scratch);
else
buf->f = fz_atof(buf->scratch);
return PDF_TOK_REAL;
}
else
{
buf->i = fast_atoi(buf->scratch);
return PDF_TOK_INT;
}
}
/*
 * Read a name (or any run of regular characters) from f into lb->scratch.
 * Reading stops at whitespace or a delimiter (which is pushed back) or at
 * EOF. The result is NUL-terminated and its length stored in lb->len.
 *
 * "#xx" sequences with two hex digits are decoded to a single byte, except
 * "#00"; a '#' that does not start a valid escape is kept literally.
 * At most 127 bytes are stored: longer names are truncated with a warning,
 * and the remaining characters are consumed and discarded.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
char *s = lb->scratch;
/* e is the write limit: 127 bytes or the buffer size, whichever is smaller. */
char *e = s + fz_mini(127, lb->size);
int c;
while (1)
{
if (s == e)
{
if (e - lb->scratch < 127)
{
/* The limit came from the buffer size, not the 127-byte cap:
 * enlarge the buffer and rebase s by the returned offset. */
s += pdf_lexbuf_grow(ctx, lb);
e = lb->scratch + fz_mini(127, lb->size);
}
else
{
/* truncate names that are too long */
fz_warn(ctx, "name is too long");
*s = 0;
lb->len = s - lb->scratch;
/* s == NULL from here on means "keep reading, store nothing". */
s = NULL;
}
}
c = lex_byte(ctx, f);
switch (c)
{
case IS_WHITE:
case IS_DELIM:
fz_unread_byte(ctx, f);
goto end;
case EOF:
goto end;
case '#':
{
int hex[2];
int i;
for (i = 0; i < 2; i++)
{
/* Peek first so that a non-hex character is left in the stream. */
c = fz_peek_byte(ctx, f);
switch (c)
{
case RANGE_0_9:
/* "#00" would decode to a NUL byte; treat it as illegal. */
if (i == 1 && c == '0' && hex[0] == 0)
goto illegal;
hex[i] = lex_byte(ctx, f) - '0';
break;
case RANGE_a_f:
hex[i] = lex_byte(ctx, f) - 'a' + 10;
break;
case RANGE_A_F:
hex[i] = lex_byte(ctx, f) - 'A' + 10;
break;
default:
case EOF:
goto illegal;
}
}
if (s) *s++ = (hex[0] << 4) + hex[1];
break;
illegal:
/* If the first hex digit was already consumed, push it back so it
 * is read again as an ordinary character after the literal '#'. */
if (i == 1)
fz_unread_byte(ctx, f);
if (s) *s++ = '#';
continue;
}
default:
if (s) *s++ = c;
break;
}
}
end:
/* After truncation (s == NULL) the terminator and length are already set. */
if (s)
{
*s = '\0';
lb->len = s - lb->scratch;
}
}
/*
 * Read the body of a parenthesised literal string from f into lb->scratch,
 * growing the buffer as needed. Nested unescaped parentheses are kept and
 * balanced; the string ends at the ')' that brings the balance to zero.
 *
 * Returns PDF_TOK_STRING with the byte count in lb->len, or PDF_TOK_ERROR
 * if EOF is reached first. This function does not write a NUL terminator.
 */
static int
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
char *s = lb->scratch;
char *e = s + lb->size;
/* bal starts at 1, counting the opening parenthesis. */
int bal = 1;
int oct;
int c;
while (1)
{
/* Each iteration stores at most one byte, so one check per pass suffices. */
if (s == e)
{
s += pdf_lexbuf_grow(ctx, lb);
e = lb->scratch + lb->size;
}
c = lex_byte(ctx, f);
switch (c)
{
case EOF:
return PDF_TOK_ERROR;
case '(':
bal++;
*s++ = c;
break;
case ')':
bal --;
if (bal == 0)
goto end;
*s++ = c;
break;
case '\\':
/* Backslash escape: the next byte selects the replacement. */
c = lex_byte(ctx, f);
switch (c)
{
case EOF:
return PDF_TOK_ERROR;
case 'n':
*s++ = '\n';
break;
case 'r':
*s++ = '\r';
break;
case 't':
*s++ = '\t';
break;
case 'b':
*s++ = '\b';
break;
case 'f':
*s++ = '\f';
break;
case '(':
*s++ = '(';
break;
case ')':
*s++ = ')';
break;
case '\\':
*s++ = '\\';
break;
case RANGE_0_7:
/* Octal escape of one to three digits; a byte that is not an
 * octal digit ends the escape and is pushed back. */
oct = c - '0';
c = lex_byte(ctx, f);
if (c >= '0' && c <= '7')
{
oct = oct * 8 + (c - '0');
c = lex_byte(ctx, f);
if (c >= '0' && c <= '7')
oct = oct * 8 + (c - '0');
else if (c != EOF)
fz_unread_byte(ctx, f);
}
else if (c != EOF)
fz_unread_byte(ctx, f);
*s++ = oct;
break;
case '\n':
/* Backslash followed by a line feed: nothing is stored. */
break;
case '\r':
/* Backslash followed by CR or CR LF: nothing is stored. */
c = lex_byte(ctx, f);
if ((c != '\n') && (c != EOF))
fz_unread_byte(ctx, f);
break;
default:
/* Unknown escape: drop the backslash, keep the character. */
*s++ = c;
}
break;
default:
*s++ = c;
break;
}
}
end:
lb->len = s - lb->scratch;
return PDF_TOK_STRING;
}
/*
 * Read the body of a hexadecimal string from f, up to the closing '>',
 * packing each pair of hex digits into one byte of lb->scratch (grown as
 * needed). Whitespace between digits is skipped. A character that is
 * neither whitespace nor a hex digit triggers a warning and is then
 * treated as a digit of value 0. A trailing unpaired digit is padded with
 * a zero low nibble.
 *
 * Returns PDF_TOK_STRING with the byte count in lb->len, or PDF_TOK_ERROR
 * if EOF is reached before '>'.
 */
static int
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *out = lb->scratch;
	char *limit = out + lb->size;
	int high = 0;		/* value of the pending first digit of a pair */
	int have_high = 0;	/* non-zero while a first digit awaits its partner */
	int ch;

	for (;;)
	{
		if (out == limit)
		{
			out += pdf_lexbuf_grow(ctx, lb);
			limit = lb->scratch + lb->size;
		}

		ch = lex_byte(ctx, f);

		if (ch == EOF)
			return PDF_TOK_ERROR;

		if (ch == '>')
		{
			if (have_high)
				*out++ = high * 16; /* pad truncated string with '0' */
			break;
		}

		if (iswhite(ch))
			continue;

		if (!((ch >= '0' && ch <= '9') ||
			(ch >= 'A' && ch <= 'F') ||
			(ch >= 'a' && ch <= 'f')))
			fz_warn(ctx, "invalid character in hex string");

		if (have_high)
		{
			*out++ = high * 16 + unhex(ch);
			have_high = 0;
		}
		else
		{
			high = unhex(ch);
			have_high = 1;
		}
	}

	lb->len = out - lb->scratch;
	return PDF_TOK_STRING;
}
/*
 * Classify a lexed keyword.
 *
 * Returns the dedicated token for one of the reserved words listed in the
 * table below. Any other string yields PDF_TOK_KEYWORD if every character
 * satisfies fz_isprint(), and PDF_TOK_ERROR otherwise.
 */
static pdf_token
pdf_token_from_keyword(char *key)
{
	static const struct
	{
		const char *word;
		pdf_token tok;
	} reserved[] = {
		{ "R", PDF_TOK_R },
		{ "true", PDF_TOK_TRUE },
		{ "trailer", PDF_TOK_TRAILER },
		{ "false", PDF_TOK_FALSE },
		{ "null", PDF_TOK_NULL },
		{ "obj", PDF_TOK_OBJ },
		{ "endobj", PDF_TOK_ENDOBJ },
		{ "endstream", PDF_TOK_ENDSTREAM },
		{ "stream", PDF_TOK_STREAM },
		{ "startxref", PDF_TOK_STARTXREF },
		{ "xref", PDF_TOK_XREF },
	};
	size_t k;
	char *p;

	for (k = 0; k < sizeof reserved / sizeof reserved[0]; k++)
	{
		/* Compare the first character before paying for strcmp. */
		if (reserved[k].word[0] == key[0] && strcmp(key, reserved[k].word) == 0)
			return reserved[k].tok;
	}

	for (p = key; *p; ++p)
	{
		if (!fz_isprint(*p))
			return PDF_TOK_ERROR;
	}
	return PDF_TOK_KEYWORD;
}
/*
 * Prepare lb for use: the scratch pointer is aimed at the buffer embedded
 * in the structure, the length is cleared, and size is recorded as both
 * the current and the base size. ctx is not used.
 */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
	lb->scratch = lb->buffer;
	lb->len = 0;
	lb->base_size = size;
	lb->size = size;
}
/*
 * Release any storage lb acquired by growing. A size different from the
 * base size is what marks scratch as a separately allocated block; an
 * ungrown buffer, or a NULL lb, needs no action.
 */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
	if (!lb)
		return;
	if (lb->size == lb->base_size)
		return;
	fz_free(ctx, lb->scratch);
}
/*
 * Double the capacity of lb's scratch storage.
 *
 * On the first growth (size still equal to the base size) a new block is
 * allocated and the embedded buffer's contents copied into it; later
 * growths reallocate the existing block.
 *
 * Returns the offset from the old scratch address to the new one, so a
 * caller holding a pointer into the old storage can rebase it by adding
 * the result.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
	char *before = lb->scratch;
	int doubled = lb->size * 2;
	char *grown;

	if (lb->size != lb->base_size)
	{
		grown = fz_realloc(ctx, lb->scratch, doubled);
	}
	else
	{
		grown = fz_malloc(ctx, doubled);
		memcpy(grown, lb->buffer, lb->size);
	}

	lb->scratch = grown;
	lb->size = doubled;
	return lb->scratch - before;
}
/*
 * Read the next token from f.
 *
 * Whitespace and comments are skipped. Names, strings, hex strings,
 * numbers and keywords leave their payload in buf. Returns PDF_TOK_EOF at
 * end of input and PDF_TOK_ERROR for a stray ')' or a '>' that is not part
 * of ">>".
 */
pdf_token
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	int ch;
	int next;

	for (;;)
	{
		ch = lex_byte(ctx, f);

		if (ch == EOF)
			return PDF_TOK_EOF;

		/* Whitespace and comments yield no token: skip and rescan. */
		if (iswhite(ch))
		{
			lex_white(ctx, f);
			continue;
		}
		if (ch == '%')
		{
			lex_comment(ctx, f);
			continue;
		}

		switch (ch)
		{
		case '/':
			lex_name(ctx, f, buf);
			return PDF_TOK_NAME;

		case '(':
			return lex_string(ctx, f, buf);
		case ')':
			return PDF_TOK_ERROR;

		case '<':
			/* "<<" opens a dictionary; a single '<' starts a hex string. */
			next = lex_byte(ctx, f);
			if (next == '<')
				return PDF_TOK_OPEN_DICT;
			if (next != EOF)
				fz_unread_byte(ctx, f);
			return lex_hex_string(ctx, f, buf);

		case '>':
			/* Only ">>" is valid here; a lone '>' is an error. */
			next = lex_byte(ctx, f);
			if (next == '>')
				return PDF_TOK_CLOSE_DICT;
			if (next != EOF)
				fz_unread_byte(ctx, f);
			return PDF_TOK_ERROR;

		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;

		case IS_NUMBER:
			return lex_number(ctx, f, buf, ch);

		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* Push the character back so lex_name sees the whole word. */
			fz_unread_byte(ctx, f);
			lex_name(ctx, f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
/*
 * Read the next token from f, rejecting string syntax.
 *
 * Behaves like the regular lexer for whitespace, comments, names, numbers,
 * keywords and bracket tokens, but returns PDF_TOK_ERROR for '(' and ')'
 * and for a '<' that is not part of "<<", instead of lexing a string.
 */
pdf_token
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	int ch;
	int next;

	for (;;)
	{
		ch = lex_byte(ctx, f);

		if (ch == EOF)
			return PDF_TOK_EOF;

		/* Whitespace and comments yield no token: skip and rescan. */
		if (iswhite(ch))
		{
			lex_white(ctx, f);
			continue;
		}
		if (ch == '%')
		{
			lex_comment(ctx, f);
			continue;
		}

		switch (ch)
		{
		case '/':
			lex_name(ctx, f, buf);
			return PDF_TOK_NAME;

		case '(':
		case ')':
			return PDF_TOK_ERROR; /* no strings allowed */

		case '<':
			next = lex_byte(ctx, f);
			if (next == '<')
				return PDF_TOK_OPEN_DICT;
			if (next != EOF)
				fz_unread_byte(ctx, f);
			return PDF_TOK_ERROR; /* no strings allowed */

		case '>':
			next = lex_byte(ctx, f);
			if (next == '>')
				return PDF_TOK_CLOSE_DICT;
			if (next != EOF)
				fz_unread_byte(ctx, f);
			return PDF_TOK_ERROR;

		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;

		case IS_NUMBER:
			return lex_number(ctx, f, buf, ch);

		default: /* isregular: !isdelim && !iswhite && c != EOF */
			/* Push the character back so lex_name sees the whole word. */
			fz_unread_byte(ctx, f);
			lex_name(ctx, f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
/*
 * Append the textual form of a lexed token to fzbuf.
 *
 * tok selects the form; buf supplies the payload for names, strings,
 * integers and reals. Any token without a dedicated form is written as
 * the first buf->len bytes of buf->scratch.
 */
void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
	if (tok == PDF_TOK_NAME)
	{
		fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
	}
	else if (tok == PDF_TOK_STRING)
	{
		/* Make room for, then write, the NUL terminator that the
		 * string appender is given. */
		if (buf->len >= buf->size)
			pdf_lexbuf_grow(ctx, buf);
		buf->scratch[buf->len] = 0;
		fz_append_pdf_string(ctx, fzbuf, buf->scratch);
	}
	else if (tok == PDF_TOK_OPEN_DICT)
	{
		fz_append_string(ctx, fzbuf, "<<");
	}
	else if (tok == PDF_TOK_CLOSE_DICT)
	{
		fz_append_string(ctx, fzbuf, ">>");
	}
	else if (tok == PDF_TOK_OPEN_ARRAY)
	{
		fz_append_byte(ctx, fzbuf, '[');
	}
	else if (tok == PDF_TOK_CLOSE_ARRAY)
	{
		fz_append_byte(ctx, fzbuf, ']');
	}
	else if (tok == PDF_TOK_OPEN_BRACE)
	{
		fz_append_byte(ctx, fzbuf, '{');
	}
	else if (tok == PDF_TOK_CLOSE_BRACE)
	{
		fz_append_byte(ctx, fzbuf, '}');
	}
	else if (tok == PDF_TOK_INT)
	{
		fz_append_printf(ctx, fzbuf, "%ld", buf->i);
	}
	else if (tok == PDF_TOK_REAL)
	{
		fz_append_printf(ctx, fzbuf, "%g", buf->f);
	}
	else
	{
		fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
	}
}