693 lines
13 KiB
C
693 lines
13 KiB
C
#include "mupdf/fitz.h"
#include "mupdf/pdf.h"

#include <limits.h>
#include <string.h>
|
|
|
|
/*
 * Case-label lists. Each macro expands to a run of character constants
 * joined by ":case", so it is written as `case NAME:` inside a switch;
 * the leading `case` and the trailing `:` come from the point of use.
 */
#define RANGE_0_7 \
	'0':case '1':case '2':case '3':case '4':case '5':case '6':case '7'
#define RANGE_0_9 \
	RANGE_0_7:case '8':case '9'
#define RANGE_a_f \
	'a':case 'b':case 'c':case 'd':case 'e':case 'f'
#define RANGE_A_F \
	'A':case 'B':case 'C':case 'D':case 'E':case 'F'

/* Bytes that may start a number: a sign, a decimal point or a digit. */
#define IS_NUMBER \
	'+':case '-':case '.':case RANGE_0_9
/* NUL, tab, line feed, form feed, carriage return and space. */
#define IS_WHITE \
	'\0':case '\t':case '\n':case '\f':case '\r':case ' '
/* Hexadecimal digits in either letter case. */
#define IS_HEX \
	RANGE_0_9:case RANGE_A_F:case RANGE_a_f
/* Bytes that end a name, keyword or number. */
#define IS_DELIM \
	'(':case ')':case '<':case '>':case '[':case ']':case '{':case '}':case '/':case '%'
|
|
|
|
/*
 * lex_byte reads one byte from a stream on behalf of the lexer.
 * Normally it is a plain alias for fz_read_byte. Enabling the define
 * below swaps in a tracing version that also echoes each byte to
 * fz_stdout: bytes 32..127 verbatim, EOF as "<EOF>", the rest as <hex>.
 */
/* #define DUMP_LEXER_STREAM */
#ifndef DUMP_LEXER_STREAM
#define lex_byte(C,S) fz_read_byte(C,S)
#else
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int byte = fz_read_byte(ctx, stm);

	if (byte == EOF)
	{
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
		return byte;
	}

	if (byte < 32 || byte >= 128)
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", byte);
	else
		fz_write_printf(ctx, fz_stdout(ctx), "%c", byte);

	return byte;
}
#endif
|
|
|
|
/*
 * Return 1 when ch is one of the whitespace bytes this lexer skips
 * (NUL, tab, line feed, form feed, carriage return, space), else 0.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case 0x00:
	case 0x09:
	case 0x0a:
	case 0x0c:
	case 0x0d:
	case 0x20:
		return 1;
	default:
		return 0;
	}
}
|
|
|
|
/* Return 1 when ch lies in the range ' ' (space) through '~', else 0. */
static inline int fz_isprint(int ch)
{
	if (ch < ' ')
		return 0;
	return ch <= '~';
}
|
|
|
|
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Any character that is not a hex digit yields 0.
 */
static inline int unhex(int ch)
{
	if ('0' <= ch && ch <= '9')
		return ch - '0';
	if ('a' <= ch && ch <= 'f')
		return 10 + (ch - 'a');
	if ('A' <= ch && ch <= 'F')
		return 10 + (ch - 'A');
	return 0;
}
|
|
|
|
static void
|
|
lex_white(fz_context *ctx, fz_stream *f)
|
|
{
|
|
int c;
|
|
do {
|
|
c = lex_byte(ctx, f);
|
|
} while ((c <= 32) && (iswhite(c)));
|
|
if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
}
|
|
|
|
static void
|
|
lex_comment(fz_context *ctx, fz_stream *f)
|
|
{
|
|
int c;
|
|
do {
|
|
c = lex_byte(ctx, f);
|
|
} while ((c != '\012') && (c != '\015') && (c != EOF));
|
|
}
|
|
|
|
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * s is NUL-terminated text of the form [-...][+...]digits[.digits].
 * Any number of leading '-' signs makes the result negative; '+' signs
 * that follow are skipped. Parsing stops at the first character that
 * does not fit this shape.
 *
 * Returns the parsed value. An integer part too large for an int wraps
 * around instead of saturating.
 */
static float acrobat_compatible_atof(char *s)
{
	int neg = 0;
	unsigned int whole = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* Overflow of the integer part is deliberately ignored; the
		 * original authors report that Acrobat treats over-long
		 * numbers the same way. Accumulating in unsigned makes the
		 * wraparound well defined instead of signed-overflow UB. */
		whole = whole * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		float v = (float)(int)whole;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			/* Stop accumulating once the scale nears the float
			 * range limit. Later digits are far below float
			 * precision, and letting n and d both overflow to
			 * infinity would make n / d a NaN. */
			if (d < 1e37f)
			{
				n = 10 * n + (*s - '0');
				d = 10 * d;
			}
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}

	/* Negate in unsigned so the most negative int is not negated as a
	 * signed value. */
	if (neg)
		whole = 0u - whole;
	return (float)(int)whole;
}
|
|
|
|
/*
 * Fast but inaccurate atoi.
 *
 * s is NUL-terminated text of the form [-...][+...]digits. Any number
 * of leading '-' signs makes the result negative; '+' signs that follow
 * are skipped. Parsing stops at the first non-digit.
 *
 * Returns the parsed value. Values too large for an int wrap around
 * instead of saturating.
 */
static int fast_atoi(char *s)
{
	int neg = 0;
	unsigned int acc = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. Accumulating in
		 * unsigned makes the wraparound well defined instead of
		 * signed-overflow UB. */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Negate in unsigned too, so the most negative int round-trips. */
	if (neg)
		acc = 0u - acc;

	return (int)acc;
}
|
|
|
|
static int
|
|
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
|
|
{
|
|
char *s = buf->scratch;
|
|
char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
|
|
char *isreal = (c == '.' ? s : NULL);
|
|
int neg = (c == '-');
|
|
int isbad = 0;
|
|
|
|
*s++ = c;
|
|
|
|
c = lex_byte(ctx, f);
|
|
|
|
/* skip extra '-' signs at start of number */
|
|
if (neg)
|
|
{
|
|
while (c == '-')
|
|
c = lex_byte(ctx, f);
|
|
}
|
|
|
|
while (s < e)
|
|
{
|
|
switch (c)
|
|
{
|
|
case IS_WHITE:
|
|
case IS_DELIM:
|
|
fz_unread_byte(ctx, f);
|
|
goto end;
|
|
case EOF:
|
|
goto end;
|
|
case '.':
|
|
if (isreal)
|
|
isbad = 1;
|
|
isreal = s;
|
|
*s++ = c;
|
|
break;
|
|
case RANGE_0_9:
|
|
*s++ = c;
|
|
break;
|
|
default:
|
|
isbad = 1;
|
|
*s++ = c;
|
|
break;
|
|
}
|
|
c = lex_byte(ctx, f);
|
|
}
|
|
|
|
end:
|
|
*s = '\0';
|
|
if (isbad)
|
|
return PDF_TOK_ERROR;
|
|
if (isreal)
|
|
{
|
|
/* We'd like to use the fastest possible atof
|
|
* routine, but we'd rather match acrobats
|
|
* handling of broken numbers. As such, we
|
|
* spot common broken cases and call an
|
|
* acrobat compatible routine where required. */
|
|
if (neg > 1 || isreal - buf->scratch >= 10)
|
|
buf->f = acrobat_compatible_atof(buf->scratch);
|
|
else
|
|
buf->f = fz_atof(buf->scratch);
|
|
return PDF_TOK_REAL;
|
|
}
|
|
else
|
|
{
|
|
buf->i = fast_atoi(buf->scratch);
|
|
return PDF_TOK_INT;
|
|
}
|
|
}
|
|
|
|
static void
|
|
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
|
|
{
|
|
char *s = lb->scratch;
|
|
char *e = s + fz_mini(127, lb->size);
|
|
int c;
|
|
|
|
while (1)
|
|
{
|
|
if (s == e)
|
|
{
|
|
if (e - lb->scratch < 127)
|
|
{
|
|
s += pdf_lexbuf_grow(ctx, lb);
|
|
e = lb->scratch + fz_mini(127, lb->size);
|
|
}
|
|
else
|
|
{
|
|
/* truncate names that are too long */
|
|
fz_warn(ctx, "name is too long");
|
|
*s = 0;
|
|
lb->len = s - lb->scratch;
|
|
s = NULL;
|
|
}
|
|
}
|
|
c = lex_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case IS_WHITE:
|
|
case IS_DELIM:
|
|
fz_unread_byte(ctx, f);
|
|
goto end;
|
|
case EOF:
|
|
goto end;
|
|
case '#':
|
|
{
|
|
int hex[2];
|
|
int i;
|
|
for (i = 0; i < 2; i++)
|
|
{
|
|
c = fz_peek_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case RANGE_0_9:
|
|
if (i == 1 && c == '0' && hex[0] == 0)
|
|
goto illegal;
|
|
hex[i] = lex_byte(ctx, f) - '0';
|
|
break;
|
|
case RANGE_a_f:
|
|
hex[i] = lex_byte(ctx, f) - 'a' + 10;
|
|
break;
|
|
case RANGE_A_F:
|
|
hex[i] = lex_byte(ctx, f) - 'A' + 10;
|
|
break;
|
|
default:
|
|
case EOF:
|
|
goto illegal;
|
|
}
|
|
}
|
|
if (s) *s++ = (hex[0] << 4) + hex[1];
|
|
break;
|
|
illegal:
|
|
if (i == 1)
|
|
fz_unread_byte(ctx, f);
|
|
if (s) *s++ = '#';
|
|
continue;
|
|
}
|
|
default:
|
|
if (s) *s++ = c;
|
|
break;
|
|
}
|
|
}
|
|
end:
|
|
if (s)
|
|
{
|
|
*s = '\0';
|
|
lb->len = s - lb->scratch;
|
|
}
|
|
}
|
|
|
|
static int
|
|
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
|
|
{
|
|
char *s = lb->scratch;
|
|
char *e = s + lb->size;
|
|
int bal = 1;
|
|
int oct;
|
|
int c;
|
|
|
|
while (1)
|
|
{
|
|
if (s == e)
|
|
{
|
|
s += pdf_lexbuf_grow(ctx, lb);
|
|
e = lb->scratch + lb->size;
|
|
}
|
|
c = lex_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case EOF:
|
|
return PDF_TOK_ERROR;
|
|
case '(':
|
|
bal++;
|
|
*s++ = c;
|
|
break;
|
|
case ')':
|
|
bal --;
|
|
if (bal == 0)
|
|
goto end;
|
|
*s++ = c;
|
|
break;
|
|
case '\\':
|
|
c = lex_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case EOF:
|
|
return PDF_TOK_ERROR;
|
|
case 'n':
|
|
*s++ = '\n';
|
|
break;
|
|
case 'r':
|
|
*s++ = '\r';
|
|
break;
|
|
case 't':
|
|
*s++ = '\t';
|
|
break;
|
|
case 'b':
|
|
*s++ = '\b';
|
|
break;
|
|
case 'f':
|
|
*s++ = '\f';
|
|
break;
|
|
case '(':
|
|
*s++ = '(';
|
|
break;
|
|
case ')':
|
|
*s++ = ')';
|
|
break;
|
|
case '\\':
|
|
*s++ = '\\';
|
|
break;
|
|
case RANGE_0_7:
|
|
oct = c - '0';
|
|
c = lex_byte(ctx, f);
|
|
if (c >= '0' && c <= '7')
|
|
{
|
|
oct = oct * 8 + (c - '0');
|
|
c = lex_byte(ctx, f);
|
|
if (c >= '0' && c <= '7')
|
|
oct = oct * 8 + (c - '0');
|
|
else if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
}
|
|
else if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
*s++ = oct;
|
|
break;
|
|
case '\n':
|
|
break;
|
|
case '\r':
|
|
c = lex_byte(ctx, f);
|
|
if ((c != '\n') && (c != EOF))
|
|
fz_unread_byte(ctx, f);
|
|
break;
|
|
default:
|
|
*s++ = c;
|
|
}
|
|
break;
|
|
default:
|
|
*s++ = c;
|
|
break;
|
|
}
|
|
}
|
|
end:
|
|
lb->len = s - lb->scratch;
|
|
return PDF_TOK_STRING;
|
|
}
|
|
|
|
static int
|
|
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
|
|
{
|
|
char *s = lb->scratch;
|
|
char *e = s + lb->size;
|
|
int a = 0, x = 0;
|
|
int c;
|
|
|
|
while (1)
|
|
{
|
|
if (s == e)
|
|
{
|
|
s += pdf_lexbuf_grow(ctx, lb);
|
|
e = lb->scratch + lb->size;
|
|
}
|
|
c = lex_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case IS_WHITE:
|
|
break;
|
|
default:
|
|
fz_warn(ctx, "invalid character in hex string");
|
|
/* fall through */
|
|
case IS_HEX:
|
|
if (x)
|
|
{
|
|
*s++ = a * 16 + unhex(c);
|
|
x = !x;
|
|
}
|
|
else
|
|
{
|
|
a = unhex(c);
|
|
x = !x;
|
|
}
|
|
break;
|
|
case '>':
|
|
if (x)
|
|
{
|
|
*s++ = a * 16; /* pad truncated string with '0' */
|
|
}
|
|
goto end;
|
|
case EOF:
|
|
return PDF_TOK_ERROR;
|
|
}
|
|
}
|
|
end:
|
|
lb->len = s - lb->scratch;
|
|
return PDF_TOK_STRING;
|
|
}
|
|
|
|
static pdf_token
|
|
pdf_token_from_keyword(char *key)
|
|
{
|
|
switch (*key)
|
|
{
|
|
case 'R':
|
|
if (!strcmp(key, "R")) return PDF_TOK_R;
|
|
break;
|
|
case 't':
|
|
if (!strcmp(key, "true")) return PDF_TOK_TRUE;
|
|
if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
|
|
break;
|
|
case 'f':
|
|
if (!strcmp(key, "false")) return PDF_TOK_FALSE;
|
|
break;
|
|
case 'n':
|
|
if (!strcmp(key, "null")) return PDF_TOK_NULL;
|
|
break;
|
|
case 'o':
|
|
if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
|
|
break;
|
|
case 'e':
|
|
if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
|
|
if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
|
|
break;
|
|
case 's':
|
|
if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
|
|
if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
|
|
break;
|
|
case 'x':
|
|
if (!strcmp(key, "xref")) return PDF_TOK_XREF;
|
|
break;
|
|
}
|
|
|
|
while (*key)
|
|
{
|
|
if (!fz_isprint(*key))
|
|
return PDF_TOK_ERROR;
|
|
++key;
|
|
}
|
|
|
|
return PDF_TOK_KEYWORD;
|
|
}
|
|
|
|
/*
 * Prepare a lexer buffer for use: record size as both the current and
 * the base size, reset the length to zero, and point scratch at the
 * buffer embedded in the structure.
 */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
	lb->scratch = &lb->buffer[0];
	lb->len = 0;
	lb->base_size = size;
	lb->size = lb->base_size;
}
|
|
|
|
/*
 * Release a lexer buffer. scratch is freed only when the current size
 * differs from the base size, which is how pdf_lexbuf_grow leaves the
 * buffer once it has moved scratch to allocated memory. A NULL lb is
 * accepted and ignored.
 */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
	if (!lb)
		return;
	if (lb->size == lb->base_size)
		return;
	fz_free(ctx, lb->scratch);
}
|
|
|
|
/*
 * Double the capacity of a lexer buffer.
 *
 * The first growth (size still equal to base_size) allocates new
 * storage and copies the embedded buffer into it; later growths
 * reallocate that storage.
 *
 * Returns the distance from the old scratch address to the new one, so
 * callers can adjust pointers they hold into the buffer by adding it.
 * Throws if the doubled size would not fit in an int.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
	char *old = lb->scratch;
	int newsize;

	/* Buffer growth is driven by file contents, so refuse to double
	 * past INT_MAX rather than overflow a signed int. */
	if (lb->size > INT_MAX / 2)
		fz_throw(ctx, FZ_ERROR_GENERIC, "lexer buffer too large");
	newsize = lb->size * 2;

	if (lb->size == lb->base_size)
	{
		lb->scratch = fz_malloc(ctx, newsize);
		memcpy(lb->scratch, lb->buffer, lb->size);
	}
	else
	{
		lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
	}
	lb->size = newsize;
	return lb->scratch - old;
}
|
|
|
|
pdf_token
|
|
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
|
|
{
|
|
while (1)
|
|
{
|
|
int c = lex_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case EOF:
|
|
return PDF_TOK_EOF;
|
|
case IS_WHITE:
|
|
lex_white(ctx, f);
|
|
break;
|
|
case '%':
|
|
lex_comment(ctx, f);
|
|
break;
|
|
case '/':
|
|
lex_name(ctx, f, buf);
|
|
return PDF_TOK_NAME;
|
|
case '(':
|
|
return lex_string(ctx, f, buf);
|
|
case ')':
|
|
return PDF_TOK_ERROR;
|
|
case '<':
|
|
c = lex_byte(ctx, f);
|
|
if (c == '<')
|
|
return PDF_TOK_OPEN_DICT;
|
|
if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
return lex_hex_string(ctx, f, buf);
|
|
case '>':
|
|
c = lex_byte(ctx, f);
|
|
if (c == '>')
|
|
return PDF_TOK_CLOSE_DICT;
|
|
if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
return PDF_TOK_ERROR;
|
|
case '[':
|
|
return PDF_TOK_OPEN_ARRAY;
|
|
case ']':
|
|
return PDF_TOK_CLOSE_ARRAY;
|
|
case '{':
|
|
return PDF_TOK_OPEN_BRACE;
|
|
case '}':
|
|
return PDF_TOK_CLOSE_BRACE;
|
|
case IS_NUMBER:
|
|
return lex_number(ctx, f, buf, c);
|
|
default: /* isregular: !isdelim && !iswhite && c != EOF */
|
|
fz_unread_byte(ctx, f);
|
|
lex_name(ctx, f, buf);
|
|
return pdf_token_from_keyword(buf->scratch);
|
|
}
|
|
}
|
|
}
|
|
|
|
pdf_token
|
|
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
|
|
{
|
|
while (1)
|
|
{
|
|
int c = lex_byte(ctx, f);
|
|
switch (c)
|
|
{
|
|
case EOF:
|
|
return PDF_TOK_EOF;
|
|
case IS_WHITE:
|
|
lex_white(ctx, f);
|
|
break;
|
|
case '%':
|
|
lex_comment(ctx, f);
|
|
break;
|
|
case '/':
|
|
lex_name(ctx, f, buf);
|
|
return PDF_TOK_NAME;
|
|
case '(':
|
|
return PDF_TOK_ERROR; /* no strings allowed */
|
|
case ')':
|
|
return PDF_TOK_ERROR; /* no strings allowed */
|
|
case '<':
|
|
c = lex_byte(ctx, f);
|
|
if (c == '<')
|
|
return PDF_TOK_OPEN_DICT;
|
|
if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
return PDF_TOK_ERROR; /* no strings allowed */
|
|
case '>':
|
|
c = lex_byte(ctx, f);
|
|
if (c == '>')
|
|
return PDF_TOK_CLOSE_DICT;
|
|
if (c != EOF)
|
|
fz_unread_byte(ctx, f);
|
|
return PDF_TOK_ERROR;
|
|
case '[':
|
|
return PDF_TOK_OPEN_ARRAY;
|
|
case ']':
|
|
return PDF_TOK_CLOSE_ARRAY;
|
|
case '{':
|
|
return PDF_TOK_OPEN_BRACE;
|
|
case '}':
|
|
return PDF_TOK_CLOSE_BRACE;
|
|
case IS_NUMBER:
|
|
return lex_number(ctx, f, buf, c);
|
|
default: /* isregular: !isdelim && !iswhite && c != EOF */
|
|
fz_unread_byte(ctx, f);
|
|
lex_name(ctx, f, buf);
|
|
return pdf_token_from_keyword(buf->scratch);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
print a lexed token to a buffer, growing if necessary
|
|
*/
|
|
void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
|
|
{
|
|
switch (tok)
|
|
{
|
|
case PDF_TOK_NAME:
|
|
fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
|
|
break;
|
|
case PDF_TOK_STRING:
|
|
if (buf->len >= buf->size)
|
|
pdf_lexbuf_grow(ctx, buf);
|
|
buf->scratch[buf->len] = 0;
|
|
fz_append_pdf_string(ctx, fzbuf, buf->scratch);
|
|
break;
|
|
case PDF_TOK_OPEN_DICT:
|
|
fz_append_string(ctx, fzbuf, "<<");
|
|
break;
|
|
case PDF_TOK_CLOSE_DICT:
|
|
fz_append_string(ctx, fzbuf, ">>");
|
|
break;
|
|
case PDF_TOK_OPEN_ARRAY:
|
|
fz_append_byte(ctx, fzbuf, '[');
|
|
break;
|
|
case PDF_TOK_CLOSE_ARRAY:
|
|
fz_append_byte(ctx, fzbuf, ']');
|
|
break;
|
|
case PDF_TOK_OPEN_BRACE:
|
|
fz_append_byte(ctx, fzbuf, '{');
|
|
break;
|
|
case PDF_TOK_CLOSE_BRACE:
|
|
fz_append_byte(ctx, fzbuf, '}');
|
|
break;
|
|
case PDF_TOK_INT:
|
|
fz_append_printf(ctx, fzbuf, "%ld", buf->i);
|
|
break;
|
|
case PDF_TOK_REAL:
|
|
fz_append_printf(ctx, fzbuf, "%g", buf->f);
|
|
break;
|
|
default:
|
|
fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
|
|
break;
|
|
}
|
|
}
|