550 lines
14 KiB
C
550 lines
14 KiB
C
#include "mupdf/fitz.h"
|
|
#include "mupdf/pdf.h"
|
|
|
|
static void
|
|
pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res,
|
|
pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg,
|
|
int sanitize, int ascii)
|
|
{
|
|
pdf_processor *proc_buffer = NULL;
|
|
pdf_processor *proc_filter = NULL;
|
|
pdf_obj *res = NULL;
|
|
pdf_obj *ref;
|
|
fz_buffer *buffer;
|
|
|
|
if (!obj)
|
|
return;
|
|
|
|
fz_var(res);
|
|
fz_var(proc_buffer);
|
|
fz_var(proc_filter);
|
|
|
|
buffer = fz_new_buffer(ctx, 1024);
|
|
|
|
fz_try(ctx)
|
|
{
|
|
pdf_obj *sp = pdf_dict_get(ctx, obj, PDF_NAME(StructParents));
|
|
int structparents = -1;
|
|
if (pdf_is_number(ctx, sp))
|
|
structparents = pdf_to_int(ctx, sp);
|
|
|
|
if (own_res)
|
|
{
|
|
pdf_obj *r = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
|
|
if (r)
|
|
orig_res = r;
|
|
}
|
|
|
|
res = pdf_new_dict(ctx, doc, 1);
|
|
|
|
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
|
|
proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, orig_res, res, text_filter, after_text, arg);
|
|
|
|
pdf_process_contents(ctx, proc_filter, doc, orig_res, obj, cookie);
|
|
pdf_close_processor(ctx, proc_filter);
|
|
pdf_close_processor(ctx, proc_buffer);
|
|
|
|
pdf_update_stream(ctx, doc, obj, buffer, 0);
|
|
|
|
if (own_res)
|
|
{
|
|
ref = pdf_add_object(ctx, doc, res);
|
|
pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref);
|
|
}
|
|
}
|
|
fz_always(ctx)
|
|
{
|
|
pdf_drop_processor(ctx, proc_filter);
|
|
pdf_drop_processor(ctx, proc_buffer);
|
|
fz_drop_buffer(ctx, buffer);
|
|
pdf_drop_obj(ctx, res);
|
|
}
|
|
fz_catch(ctx)
|
|
{
|
|
fz_rethrow(ctx);
|
|
}
|
|
}
|
|
|
|
static void
|
|
pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int sanitize, int ascii)
|
|
{
|
|
pdf_processor *proc_buffer = NULL;
|
|
pdf_processor *proc_filter = NULL;
|
|
pdf_obj *res = NULL;
|
|
pdf_obj *ref;
|
|
pdf_obj *charprocs;
|
|
int i, l;
|
|
|
|
fz_var(res);
|
|
fz_var(proc_buffer);
|
|
fz_var(proc_filter);
|
|
|
|
fz_try(ctx)
|
|
{
|
|
res = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
|
|
if (res)
|
|
orig_res = res;
|
|
res = NULL;
|
|
|
|
res = pdf_new_dict(ctx, doc, 1);
|
|
|
|
charprocs = pdf_dict_get(ctx, obj, PDF_NAME(CharProcs));
|
|
l = pdf_dict_len(ctx, charprocs);
|
|
|
|
for (i = 0; i < l; i++)
|
|
{
|
|
pdf_obj *val = pdf_dict_get_val(ctx, charprocs, i);
|
|
fz_buffer *buffer = fz_new_buffer(ctx, 1024);
|
|
fz_try(ctx)
|
|
{
|
|
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
|
|
if (sanitize)
|
|
{
|
|
proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res);
|
|
pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
|
|
pdf_close_processor(ctx, proc_filter);
|
|
}
|
|
else
|
|
{
|
|
pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
|
|
}
|
|
pdf_close_processor(ctx, proc_buffer);
|
|
|
|
pdf_update_stream(ctx, doc, val, buffer, 0);
|
|
}
|
|
fz_always(ctx)
|
|
{
|
|
pdf_drop_processor(ctx, proc_filter);
|
|
pdf_drop_processor(ctx, proc_buffer);
|
|
fz_drop_buffer(ctx, buffer);
|
|
}
|
|
fz_catch(ctx)
|
|
{
|
|
fz_rethrow(ctx);
|
|
}
|
|
}
|
|
|
|
/* ProcSet - no cleaning possible. Inherit this from the old dict. */
|
|
pdf_dict_put(ctx, res, PDF_NAME(ProcSet), pdf_dict_get(ctx, orig_res, PDF_NAME(ProcSet)));
|
|
|
|
ref = pdf_add_object(ctx, doc, res);
|
|
pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref);
|
|
}
|
|
fz_always(ctx)
|
|
{
|
|
pdf_drop_obj(ctx, res);
|
|
}
|
|
fz_catch(ctx)
|
|
{
|
|
fz_rethrow(ctx);
|
|
}
|
|
}
|
|
|
|
/*
|
|
Clean a loaded pages rendering operations,
|
|
with an optional post processing step.
|
|
|
|
Firstly, this filters the PDF operators used to avoid (some cases
|
|
of) repetition, and leaves the page in a balanced state with an
|
|
unchanged top level matrix etc. At the same time, the resources
|
|
used by the page contents are collected.
|
|
|
|
Next, the resources themselves are cleaned (as appropriate) in the
|
|
same way.
|
|
|
|
Next, an optional post processing stage is called.
|
|
|
|
Finally, the page contents and resources in the documents page tree
|
|
are replaced by these processed versions.
|
|
|
|
Annotations remain unaffected.
|
|
|
|
page: A page loaded by pdf_load_page.
|
|
|
|
cookie: A pointer to an optional fz_cookie structure that can be used
|
|
to track progress, collect errors etc.
|
|
*/
|
|
void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int sanitize, int ascii)
|
|
{
|
|
pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, sanitize, ascii);
|
|
}
|
|
|
|
/*
|
|
Performs the same task as
|
|
pdf_clean_page_contents, but with an optional text filter
|
|
function.
|
|
|
|
text_filter: Function to assess whether a given character
|
|
should be kept (return 0) or removed (return 1).
|
|
|
|
after_text: Function called after each text object is closed
|
|
to allow other output to be sent.
|
|
|
|
arg: Opaque value to be passed to callback functions.
|
|
*/
|
|
void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie,
|
|
pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg,
|
|
int sanitize, int ascii)
|
|
{
|
|
pdf_processor *proc_buffer = NULL;
|
|
pdf_processor *proc_filter = NULL;
|
|
pdf_obj *new_obj = NULL;
|
|
pdf_obj *new_ref = NULL;
|
|
pdf_obj *res = NULL;
|
|
pdf_obj *obj;
|
|
pdf_obj *contents;
|
|
pdf_obj *resources;
|
|
fz_buffer *buffer;
|
|
|
|
fz_var(new_obj);
|
|
fz_var(new_ref);
|
|
fz_var(res);
|
|
fz_var(proc_buffer);
|
|
fz_var(proc_filter);
|
|
|
|
buffer = fz_new_buffer(ctx, 1024);
|
|
|
|
fz_try(ctx)
|
|
{
|
|
pdf_obj *sp = pdf_dict_get(ctx, page->obj, PDF_NAME(StructParents));
|
|
int structparents = -1;
|
|
if (pdf_is_number(ctx, sp))
|
|
structparents = pdf_to_int(ctx, sp);
|
|
contents = pdf_page_contents(ctx, page);
|
|
resources = pdf_page_resources(ctx, page);
|
|
|
|
proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
|
|
if (sanitize)
|
|
{
|
|
res = pdf_new_dict(ctx, doc, 1);
|
|
proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, resources, res, text_filter, after_text, proc_arg);
|
|
pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie);
|
|
pdf_close_processor(ctx, proc_filter);
|
|
}
|
|
else
|
|
{
|
|
res = pdf_keep_obj(ctx, resources);
|
|
pdf_process_contents(ctx, proc_buffer, doc, resources, contents, cookie);
|
|
}
|
|
pdf_close_processor(ctx, proc_buffer);
|
|
|
|
/* Deal with page content stream. */
|
|
|
|
if (pdf_is_array(ctx, contents))
|
|
{
|
|
/* create a new object to replace the array */
|
|
new_obj = pdf_new_dict(ctx, doc, 1);
|
|
new_ref = pdf_add_object(ctx, doc, new_obj);
|
|
contents = new_ref;
|
|
pdf_dict_put(ctx, page->obj, PDF_NAME(Contents), contents);
|
|
}
|
|
else
|
|
{
|
|
pdf_dict_del(ctx, contents, PDF_NAME(Filter));
|
|
pdf_dict_del(ctx, contents, PDF_NAME(DecodeParms));
|
|
}
|
|
|
|
pdf_update_stream(ctx, doc, contents, buffer, 0);
|
|
|
|
/* Now deal with resources. The spec allows for Type3 fonts and form
|
|
* XObjects to omit a resource dictionary and look in the parent.
|
|
* Avoid that by flattening here as part of the cleaning. This could
|
|
* conceivably cause changes in rendering, but we don't care. */
|
|
|
|
/* ExtGState */
|
|
obj = pdf_dict_get(ctx, res, PDF_NAME(ExtGState));
|
|
if (obj)
|
|
{
|
|
int i, l;
|
|
|
|
l = pdf_dict_len(ctx, obj);
|
|
for (i = 0; i < l; i++)
|
|
{
|
|
pdf_obj *o = pdf_dict_get(ctx, pdf_dict_get_val(ctx, obj, i), PDF_NAME(SMask));
|
|
if (!o)
|
|
continue;
|
|
o = pdf_dict_get(ctx, o, PDF_NAME(G));
|
|
if (!o)
|
|
continue;
|
|
/* Transparency group XObject */
|
|
pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii);
|
|
}
|
|
}
|
|
|
|
/* Pattern */
|
|
obj = pdf_dict_get(ctx, res, PDF_NAME(Pattern));
|
|
if (obj)
|
|
{
|
|
int i, l;
|
|
l = pdf_dict_len(ctx, obj);
|
|
for (i = 0; i < l; i++)
|
|
{
|
|
pdf_obj *pat_res;
|
|
pdf_obj *pat = pdf_dict_get_val(ctx, obj, i);
|
|
if (!pat)
|
|
continue;
|
|
pat_res = pdf_dict_get(ctx, pat, PDF_NAME(Resources));
|
|
if (pat_res == NULL)
|
|
pat_res = resources;
|
|
if (pdf_dict_get_int(ctx, pat, PDF_NAME(PatternType)) == 1)
|
|
pdf_clean_stream_object(ctx, doc, pat, pat_res, cookie, 0, text_filter, after_text, proc_arg, sanitize, ascii);
|
|
}
|
|
}
|
|
|
|
/* XObject */
|
|
obj = pdf_dict_get(ctx, res, PDF_NAME(XObject));
|
|
if (obj)
|
|
{
|
|
int i, l;
|
|
l = pdf_dict_len(ctx, obj);
|
|
for (i = 0; i < l; i++)
|
|
{
|
|
pdf_obj *xobj_res;
|
|
pdf_obj *xobj = pdf_dict_get_val(ctx, obj, i);
|
|
if (!xobj)
|
|
continue;
|
|
xobj_res = pdf_dict_get(ctx, xobj, PDF_NAME(Resources));
|
|
if (xobj_res == NULL)
|
|
xobj_res = resources;
|
|
if (pdf_name_eq(ctx, PDF_NAME(Form), pdf_dict_get(ctx, xobj, PDF_NAME(Subtype))))
|
|
pdf_clean_stream_object(ctx, doc, xobj, xobj_res, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii);
|
|
}
|
|
}
|
|
|
|
/* Font */
|
|
obj = pdf_dict_get(ctx, res, PDF_NAME(Font));
|
|
if (obj)
|
|
{
|
|
int i, l;
|
|
l = pdf_dict_len(ctx, obj);
|
|
for (i = 0; i < l; i++)
|
|
{
|
|
pdf_obj *o = pdf_dict_get_val(ctx, obj, i);
|
|
if (!o)
|
|
continue;
|
|
if (pdf_name_eq(ctx, PDF_NAME(Type3), pdf_dict_get(ctx, o, PDF_NAME(Subtype))))
|
|
pdf_clean_type3(ctx, doc, o, resources, cookie, sanitize, ascii);
|
|
}
|
|
}
|
|
|
|
/* ProcSet - no cleaning possible. Inherit this from the old dict. */
|
|
obj = pdf_dict_get(ctx, resources, PDF_NAME(ProcSet));
|
|
if (obj)
|
|
pdf_dict_put(ctx, res, PDF_NAME(ProcSet), obj);
|
|
|
|
/* ColorSpace - no cleaning possible. */
|
|
/* Properties - no cleaning possible. */
|
|
|
|
if (proc_fn)
|
|
(*proc_fn)(ctx, buffer, res, proc_arg);
|
|
|
|
/* Update resource dictionary */
|
|
if (sanitize)
|
|
{
|
|
pdf_dict_put(ctx, page->obj, PDF_NAME(Resources), res);
|
|
}
|
|
}
|
|
fz_always(ctx)
|
|
{
|
|
pdf_drop_processor(ctx, proc_filter);
|
|
pdf_drop_processor(ctx, proc_buffer);
|
|
fz_drop_buffer(ctx, buffer);
|
|
pdf_drop_obj(ctx, new_obj);
|
|
pdf_drop_obj(ctx, new_ref);
|
|
pdf_drop_obj(ctx, res);
|
|
}
|
|
fz_catch(ctx)
|
|
{
|
|
fz_rethrow(ctx);
|
|
}
|
|
}
|
|
|
|
/*
|
|
Clean a loaded annotations rendering operations,
|
|
with an optional post processing step.
|
|
|
|
Each appearance stream in the annotation is processed.
|
|
|
|
Firstly, this filters the PDF operators used to avoid (some cases
|
|
of) repetition, and leaves the page in a balanced state with an
|
|
unchanged top level matrix etc. At the same time, the resources
|
|
used by the page contents are collected.
|
|
|
|
Next, the resources themselves are cleaned (as appropriate) in the
|
|
same way.
|
|
|
|
Next, an optional post processing stage is called.
|
|
|
|
Finally, the updated stream of operations is reinserted into the
|
|
appearance stream.
|
|
|
|
annot: An annotation loaded by pdf_load_annot.
|
|
|
|
cookie: A pointer to an optional fz_cookie structure that can be used
|
|
to track progress, collect errors etc.
|
|
*/
|
|
void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int sanitize, int ascii)
|
|
{
|
|
pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, sanitize, ascii);
|
|
}
|
|
|
|
/*
|
|
Performs the same task as
|
|
pdf_clean_annot_contents, but with an optional text filter
|
|
function.
|
|
|
|
text_filter: Function to assess whether a given character
|
|
should be kept (return 0) or removed (return 1).
|
|
|
|
after_text: Function called after each text object is closed
|
|
to allow other output to be sent.
|
|
|
|
arg: Opaque value to be passed to callback functions.
|
|
*/
|
|
void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie,
|
|
pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int sanitize, int ascii)
|
|
{
|
|
pdf_obj *ap;
|
|
int i, n;
|
|
|
|
ap = pdf_dict_get(ctx, annot->obj, PDF_NAME(AP));
|
|
if (ap == NULL)
|
|
return;
|
|
|
|
n = pdf_dict_len(ctx, ap);
|
|
for (i = 0; i < n; i++)
|
|
{
|
|
pdf_obj *v = pdf_dict_get_val(ctx, ap, i);
|
|
|
|
if (v == NULL)
|
|
continue;
|
|
|
|
pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, text_filter, after_text, arg, sanitize, ascii);
|
|
}
|
|
}
|
|
|
|
static void
|
|
pdf_redact_end_page(fz_context *ctx, fz_buffer *buf, pdf_obj *res, void *opaque)
|
|
{
|
|
pdf_page *page = opaque;
|
|
pdf_annot *annot;
|
|
pdf_obj *qp;
|
|
int i, n;
|
|
|
|
fz_append_string(ctx, buf, "0 g\n");
|
|
|
|
for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
|
|
{
|
|
if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
|
|
{
|
|
qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
|
|
n = pdf_array_len(ctx, qp);
|
|
if (n > 0)
|
|
{
|
|
for (i = 0; i < n; i += 8)
|
|
{
|
|
fz_quad q = pdf_to_quad(ctx, qp, i);
|
|
fz_append_printf(ctx, buf, "%g %g m\n", q.ll.x, q.ll.y);
|
|
fz_append_printf(ctx, buf, "%g %g l\n", q.lr.x, q.lr.y);
|
|
fz_append_printf(ctx, buf, "%g %g l\n", q.ur.x, q.ur.y);
|
|
fz_append_printf(ctx, buf, "%g %g l\n", q.ul.x, q.ul.y);
|
|
fz_append_string(ctx, buf, "f\n");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
fz_rect r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
|
|
fz_append_printf(ctx, buf, "%g %g m\n", r.x0, r.y0);
|
|
fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y0);
|
|
fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y1);
|
|
fz_append_printf(ctx, buf, "%g %g l\n", r.x0, r.y1);
|
|
fz_append_string(ctx, buf, "f\n");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static int
|
|
pdf_redact_text_filter(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox)
|
|
{
|
|
pdf_page *page = opaque;
|
|
pdf_annot *annot;
|
|
pdf_obj *qp;
|
|
fz_rect r;
|
|
fz_quad q;
|
|
int i, n;
|
|
|
|
trm = fz_concat(trm, ctm);
|
|
|
|
for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
|
|
{
|
|
if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
|
|
{
|
|
qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
|
|
n = pdf_array_len(ctx, qp);
|
|
if (n > 0)
|
|
{
|
|
for (i = 0; i < n; i += 8)
|
|
{
|
|
q = pdf_to_quad(ctx, qp, i);
|
|
if (fz_is_point_inside_quad(fz_make_point(trm.e, trm.f), q))
|
|
return 1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
|
|
if (fz_is_point_inside_rect(fz_make_point(trm.e, trm.f), r))
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
pdf_redact_page(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_redact_options *opts)
|
|
{
|
|
pdf_annot *annot;
|
|
int has_redactions = 0;
|
|
int no_black_boxes = 0;
|
|
|
|
if (opts)
|
|
{
|
|
no_black_boxes = opts->no_black_boxes;
|
|
}
|
|
|
|
for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
|
|
if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
|
|
has_redactions = 1;
|
|
|
|
if (has_redactions)
|
|
{
|
|
pdf_filter_page_contents(ctx, doc, page, NULL,
|
|
no_black_boxes ? NULL : pdf_redact_end_page,
|
|
pdf_redact_text_filter,
|
|
NULL,
|
|
page,
|
|
1, 1);
|
|
}
|
|
|
|
annot = pdf_first_annot(ctx, page);
|
|
while (annot)
|
|
{
|
|
if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
|
|
{
|
|
pdf_delete_annot(ctx, page, annot);
|
|
annot = pdf_first_annot(ctx, page);
|
|
}
|
|
else
|
|
{
|
|
annot = pdf_next_annot(ctx, annot);
|
|
}
|
|
}
|
|
|
|
doc->redacted = has_redactions;
|
|
|
|
return has_redactions;
|
|
}
|