eBookReaderSwitch/source/pdf/pdf-repair.c

723 lines
16 KiB
C

#include "mupdf/fitz.h"
#include "mupdf/pdf.h"
#include <string.h>
/* Scan file for objects and reconstruct xref table */
struct entry
{
int num;
int gen;
int64_t ofs;
int64_t stm_ofs;
int stm_len;
};
static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots)
{
if (*num_roots == *max_roots)
{
int new_max_roots = *max_roots * 2;
if (new_max_roots == 0)
new_max_roots = 4;
*roots = fz_realloc_array(ctx, *roots, new_max_roots, pdf_obj*);
*max_roots = new_max_roots;
}
(*roots)[(*num_roots)++] = pdf_keep_obj(ctx, obj);
}
int
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
{
fz_stream *file = doc->file;
pdf_token tok;
int stm_len;
*stmofsp = 0;
if (stmlenp)
*stmlenp = -1;
stm_len = 0;
/* On entry to this function, we know that we've just seen
* '<int> <int> obj'. We expect the next thing we see to be a
* pdf object. Regardless of the type of thing we meet next
* we only need to fully parse it if it is a dictionary. */
tok = pdf_lex(ctx, file, buf);
if (tok == PDF_TOK_OPEN_DICT)
{
pdf_obj *obj, *dict = NULL;
fz_try(ctx)
{
dict = pdf_parse_dict(ctx, doc, file, buf);
}
fz_catch(ctx)
{
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
/* Don't let a broken object at EOF overwrite a good one */
if (file->eof)
fz_rethrow(ctx);
/* Silently swallow the error */
dict = pdf_new_dict(ctx, NULL, 2);
}
/* We must be careful not to try to resolve any indirections
* here. We have just read dict, so we know it to be a non
* indirected dictionary. Before we look at any values that
* we get back from looking up in it, we need to check they
* aren't indirected. */
if (encrypt || id || root)
{
obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
{
if (encrypt)
{
obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
if (obj)
{
pdf_drop_obj(ctx, *encrypt);
*encrypt = pdf_keep_obj(ctx, obj);
}
}
if (id)
{
obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
if (obj)
{
pdf_drop_obj(ctx, *id);
*id = pdf_keep_obj(ctx, obj);
}
}
if (root)
*root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
}
}
obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
stm_len = pdf_to_int(ctx, obj);
if (doc->file_reading_linearly && page)
{
obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
{
pdf_drop_obj(ctx, *page);
*page = pdf_keep_obj(ctx, dict);
}
}
pdf_drop_obj(ctx, dict);
}
while ( tok != PDF_TOK_STREAM &&
tok != PDF_TOK_ENDOBJ &&
tok != PDF_TOK_ERROR &&
tok != PDF_TOK_EOF &&
tok != PDF_TOK_INT )
{
*tmpofs = fz_tell(ctx, file);
if (*tmpofs < 0)
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
tok = pdf_lex(ctx, file, buf);
}
if (tok == PDF_TOK_STREAM)
{
int c = fz_read_byte(ctx, file);
if (c == '\r') {
c = fz_peek_byte(ctx, file);
if (c == '\n')
fz_read_byte(ctx, file);
}
*stmofsp = fz_tell(ctx, file);
if (*stmofsp < 0)
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot seek in file");
if (stm_len > 0)
{
fz_seek(ctx, file, *stmofsp + stm_len, 0);
fz_try(ctx)
{
tok = pdf_lex(ctx, file, buf);
}
fz_catch(ctx)
{
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
fz_warn(ctx, "cannot find endstream token, falling back to scanning");
}
if (tok == PDF_TOK_ENDSTREAM)
goto atobjend;
fz_seek(ctx, file, *stmofsp, 0);
}
(void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
while (memcmp(buf->scratch, "endstream", 9) != 0)
{
c = fz_read_byte(ctx, file);
if (c == EOF)
break;
memmove(&buf->scratch[0], &buf->scratch[1], 8);
buf->scratch[8] = c;
}
if (stmlenp)
*stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
atobjend:
*tmpofs = fz_tell(ctx, file);
if (*tmpofs < 0)
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
tok = pdf_lex(ctx, file, buf);
if (tok != PDF_TOK_ENDOBJ)
fz_warn(ctx, "object missing 'endobj' token");
else
{
/* Read another token as we always return the next one */
*tmpofs = fz_tell(ctx, file);
if (*tmpofs < 0)
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
tok = pdf_lex(ctx, file, buf);
}
}
return tok;
}
static void
pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num)
{
pdf_obj *obj;
fz_stream *stm = NULL;
pdf_token tok;
int i, n, count;
pdf_lexbuf buf;
fz_var(stm);
pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
fz_try(ctx)
{
obj = pdf_load_object(ctx, doc, stm_num);
count = pdf_dict_get_int(ctx, obj, PDF_NAME(N));
pdf_drop_obj(ctx, obj);
stm = pdf_open_stream_number(ctx, doc, stm_num);
for (i = 0; i < count; i++)
{
pdf_xref_entry *entry;
tok = pdf_lex(ctx, stm, &buf);
if (tok != PDF_TOK_INT)
fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", stm_num);
n = buf.i;
if (n < 0)
{
fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
continue;
}
else if (n >= pdf_xref_len(ctx, doc))
{
fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
continue;
}
entry = pdf_get_populating_xref_entry(ctx, doc, n);
entry->ofs = stm_num;
entry->gen = i;
entry->num = n;
entry->stm_ofs = 0;
pdf_drop_obj(ctx, entry->obj);
entry->obj = NULL;
entry->type = 'o';
tok = pdf_lex(ctx, stm, &buf);
if (tok != PDF_TOK_INT)
fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", stm_num);
}
}
fz_always(ctx)
{
fz_drop_stream(ctx, stm);
pdf_lexbuf_fin(ctx, &buf);
}
fz_catch(ctx)
{
fz_rethrow(ctx);
}
}
static void
orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
{
if (doc->orphans_count == doc->orphans_max)
{
int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32);
fz_try(ctx)
{
doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*);
doc->orphans_max = new_max;
}
fz_catch(ctx)
{
pdf_drop_obj(ctx, obj);
fz_rethrow(ctx);
}
}
doc->orphans[doc->orphans_count++] = obj;
}
static int is_white(int c)
{
return c == '\x00' || c == '\x09' || c == '\x0a' || c == '\x0c' || c == '\x0d' || c == '\x20';
}
void
pdf_repair_xref(fz_context *ctx, pdf_document *doc)
{
pdf_obj *dict, *obj = NULL;
pdf_obj *length;
pdf_obj *encrypt = NULL;
pdf_obj *id = NULL;
pdf_obj **roots = NULL;
pdf_obj *info = NULL;
struct entry *list = NULL;
int listlen;
int listcap;
int maxnum = 0;
int num = 0;
int gen = 0;
int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0;
int stm_len;
pdf_token tok;
int next;
int i;
size_t j, n;
int c;
pdf_lexbuf *buf = &doc->lexbuf.base;
int num_roots = 0;
int max_roots = 0;
fz_var(encrypt);
fz_var(id);
fz_var(roots);
fz_var(num_roots);
fz_var(max_roots);
fz_var(info);
fz_var(list);
fz_var(obj);
fz_warn(ctx, "repairing PDF document");
if (doc->repair_attempted)
fz_throw(ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
doc->repair_attempted = 1;
doc->dirty = 1;
pdf_forget_xref(ctx, doc);
fz_seek(ctx, doc->file, 0, 0);
fz_try(ctx)
{
pdf_xref_entry *entry;
listlen = 0;
listcap = 1024;
list = fz_malloc_array(ctx, listcap, struct entry);
/* look for '%PDF' version marker within first kilobyte of file */
n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_mini(buf->size, 1024));
fz_seek(ctx, doc->file, 0, 0);
if (n >= 4)
{
for (j = 0; j < n - 4; j++)
{
if (memcmp(&buf->scratch[j], "%PDF", 4) == 0)
{
fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */
break;
}
}
}
/* skip comment line after version marker since some generators
* forget to terminate the comment with a newline */
c = fz_read_byte(ctx, doc->file);
while (c >= 0 && (c == ' ' || c == '%'))
c = fz_read_byte(ctx, doc->file);
fz_unread_byte(ctx, doc->file);
while (1)
{
tmpofs = fz_tell(ctx, doc->file);
if (tmpofs < 0)
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
fz_try(ctx)
tok = pdf_lex_no_string(ctx, doc->file, buf);
fz_catch(ctx)
{
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
fz_warn(ctx, "skipping ahead to next token");
do
c = fz_read_byte(ctx, doc->file);
while (c != EOF && !is_white(c));
if (c == EOF)
tok = PDF_TOK_EOF;
else
continue;
}
/* If we have the next token already, then we'll jump
* back here, rather than going through the top of
* the loop. */
have_next_token:
if (tok == PDF_TOK_INT)
{
if (buf->i < 0)
{
num = 0;
gen = 0;
continue;
}
numofs = genofs;
num = gen;
genofs = tmpofs;
gen = buf->i;
}
else if (tok == PDF_TOK_OBJ)
{
pdf_obj *root = NULL;
fz_try(ctx)
{
stm_len = 0;
stm_ofs = 0;
tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
if (root)
add_root(ctx, root, &roots, &num_roots, &max_roots);
}
fz_always(ctx)
{
pdf_drop_obj(ctx, root);
}
fz_catch(ctx)
{
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
/* If we haven't seen a root yet, there is nothing
* we can do, but give up. Otherwise, we'll make
* do. */
if (!roots)
fz_rethrow(ctx);
fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
break;
}
if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER)
{
fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
goto have_next_token;
}
gen = fz_clampi(gen, 0, 65535);
if (listlen + 1 == listcap)
{
listcap = (listcap * 3) / 2;
list = fz_realloc_array(ctx, list, listcap, struct entry);
}
list[listlen].num = num;
list[listlen].gen = gen;
list[listlen].ofs = numofs;
list[listlen].stm_ofs = stm_ofs;
list[listlen].stm_len = stm_len;
listlen ++;
if (num > maxnum)
maxnum = num;
goto have_next_token;
}
/* If we find a dictionary it is probably the trailer,
* but could be a stream (or bogus) dictionary caused
* by a corrupt file. */
else if (tok == PDF_TOK_OPEN_DICT)
{
pdf_obj *dictobj;
fz_try(ctx)
{
dict = pdf_parse_dict(ctx, doc, doc->file, buf);
}
fz_catch(ctx)
{
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
/* If this was the real trailer dict
* it was broken, in which case we are
* in trouble. Keep going though in
* case this was just a bogus dict. */
continue;
}
fz_try(ctx)
{
dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
if (dictobj)
{
pdf_drop_obj(ctx, encrypt);
encrypt = pdf_keep_obj(ctx, dictobj);
}
dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt))))
{
pdf_drop_obj(ctx, id);
id = pdf_keep_obj(ctx, dictobj);
}
dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root));
if (dictobj)
add_root(ctx, dictobj, &roots, &num_roots, &max_roots);
dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info));
if (dictobj)
{
pdf_drop_obj(ctx, info);
info = pdf_keep_obj(ctx, dictobj);
}
}
fz_always(ctx)
pdf_drop_obj(ctx, dict);
fz_catch(ctx)
fz_rethrow(ctx);
}
else if (tok == PDF_TOK_EOF)
{
break;
}
else
{
num = 0;
gen = 0;
}
}
if (listlen == 0)
fz_throw(ctx, FZ_ERROR_GENERIC, "no objects found");
/* make xref reasonable */
/*
Dummy access to entry to assure sufficient space in the xref table
and avoid repeated reallocs in the loop
*/
/* Ensure that the first xref table is a 'solid' one from
* 0 to maxnum. */
pdf_ensure_solid_xref(ctx, doc, maxnum);
for (i = 1; i < maxnum; i++)
{
entry = pdf_get_populating_xref_entry(ctx, doc, i);
if (entry->obj != NULL)
continue;
entry->type = 'f';
entry->ofs = 0;
entry->gen = 0;
entry->num = 0;
entry->stm_ofs = 0;
}
for (i = 0; i < listlen; i++)
{
entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
entry->type = 'n';
entry->ofs = list[i].ofs;
entry->gen = list[i].gen;
entry->num = list[i].num;
entry->stm_ofs = list[i].stm_ofs;
/* correct stream length for unencrypted documents */
if (!encrypt && list[i].stm_len >= 0)
{
pdf_obj *old_obj = NULL;
dict = pdf_load_object(ctx, doc, list[i].num);
fz_try(ctx)
{
length = pdf_new_int(ctx, list[i].stm_len);
pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj);
if (old_obj)
orphan_object(ctx, doc, old_obj);
}
fz_always(ctx)
pdf_drop_obj(ctx, dict);
fz_catch(ctx)
fz_rethrow(ctx);
}
}
entry = pdf_get_populating_xref_entry(ctx, doc, 0);
entry->type = 'f';
entry->ofs = 0;
entry->gen = 65535;
entry->num = 0;
entry->stm_ofs = 0;
next = 0;
for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
{
entry = pdf_get_populating_xref_entry(ctx, doc, i);
if (entry->type == 'f')
{
entry->ofs = next;
if (entry->gen < 65535)
entry->gen ++;
next = i;
}
}
/* create a repaired trailer, Root will be added later */
obj = pdf_new_dict(ctx, doc, 5);
/* During repair there is only a single xref section */
pdf_set_populating_xref_trailer(ctx, doc, obj);
pdf_drop_obj(ctx, obj);
obj = NULL;
obj = pdf_new_int(ctx, maxnum + 1);
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), obj);
pdf_drop_obj(ctx, obj);
obj = NULL;
if (roots)
{
for (i = num_roots-1; i > 0; i--)
{
if (pdf_is_dict(ctx, roots[i]))
break;
}
if (i >= 0)
{
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots[i]);
}
}
if (info)
{
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
pdf_drop_obj(ctx, info);
info = NULL;
}
if (encrypt)
{
if (pdf_is_indirect(ctx, encrypt))
{
/* create new reference with non-NULL xref pointer */
obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
pdf_drop_obj(ctx, encrypt);
encrypt = obj;
obj = NULL;
}
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt);
pdf_drop_obj(ctx, encrypt);
encrypt = NULL;
}
if (id)
{
if (pdf_is_indirect(ctx, id))
{
/* create new reference with non-NULL xref pointer */
obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
pdf_drop_obj(ctx, id);
id = obj;
obj = NULL;
}
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id);
pdf_drop_obj(ctx, id);
id = NULL;
}
fz_free(ctx, list);
}
fz_always(ctx)
{
for (i = 0; i < num_roots; i++)
pdf_drop_obj(ctx, roots[i]);
fz_free(ctx, roots);
}
fz_catch(ctx)
{
pdf_drop_obj(ctx, encrypt);
pdf_drop_obj(ctx, id);
pdf_drop_obj(ctx, obj);
pdf_drop_obj(ctx, info);
fz_free(ctx, list);
fz_rethrow(ctx);
}
}
void
pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
{
pdf_obj *dict;
int i;
int xref_len = pdf_xref_len(ctx, doc);
for (i = 0; i < xref_len; i++)
{
pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
if (entry->stm_ofs)
{
dict = pdf_load_object(ctx, doc, i);
fz_try(ctx)
{
if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm)))
pdf_repair_obj_stm(ctx, doc, i);
}
fz_catch(ctx)
{
fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
}
pdf_drop_obj(ctx, dict);
}
}
/* Ensure that streamed objects reside inside a known non-streamed object */
for (i = 0; i < xref_len; i++)
{
pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to non-object-stream: %d (%d 0 R)", (int)entry->ofs, i);
}
}