831 lines
22 KiB
C
831 lines
22 KiB
C
|
#include "mupdf/fitz.h"
|
||
|
#include "mupdf/ucdn.h"
|
||
|
|
||
|
#include <math.h>
|
||
|
#include <float.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
/* Simple layout structure */
|
||
|
|
||
|
fz_layout_block *fz_new_layout(fz_context *ctx)
|
||
|
{
|
||
|
fz_pool *pool = fz_new_pool(ctx);
|
||
|
fz_layout_block *block;
|
||
|
fz_try(ctx)
|
||
|
{
|
||
|
block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block));
|
||
|
block->pool = pool;
|
||
|
block->head = NULL;
|
||
|
block->tailp = &block->head;
|
||
|
}
|
||
|
fz_catch(ctx)
|
||
|
{
|
||
|
fz_drop_pool(ctx, pool);
|
||
|
fz_rethrow(ctx);
|
||
|
}
|
||
|
return block;
|
||
|
}
|
||
|
|
||
|
void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
|
||
|
{
|
||
|
if (block)
|
||
|
fz_drop_pool(ctx, block->pool);
|
||
|
}
|
||
|
|
||
|
void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p)
|
||
|
{
|
||
|
fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line));
|
||
|
line->x = x;
|
||
|
line->y = y;
|
||
|
line->h = h;
|
||
|
line->p = p;
|
||
|
line->text = NULL;
|
||
|
line->next = NULL;
|
||
|
*block->tailp = line;
|
||
|
block->tailp = &line->next;
|
||
|
block->text_tailp = &line->text;
|
||
|
}
|
||
|
|
||
|
void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p)
|
||
|
{
|
||
|
fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char));
|
||
|
ch->x = x;
|
||
|
ch->w = w;
|
||
|
ch->p = p;
|
||
|
ch->next = NULL;
|
||
|
*block->text_tailp = ch;
|
||
|
block->text_tailp = &ch->next;
|
||
|
}
|
||
|
|
||
|
/* Extract text into blocks and lines. */
|
||
|
|
||
|
#define PARAGRAPH_DIST 1.5f
|
||
|
#define SPACE_DIST 0.15f
|
||
|
#define SPACE_MAX_DIST 0.8f
|
||
|
|
||
|
typedef struct fz_stext_device_s fz_stext_device;
|
||
|
|
||
|
struct fz_stext_device_s
|
||
|
{
|
||
|
fz_device super;
|
||
|
fz_stext_page *page;
|
||
|
fz_point pen, start;
|
||
|
fz_matrix trm;
|
||
|
int new_obj;
|
||
|
int curdir;
|
||
|
int lastchar;
|
||
|
int flags;
|
||
|
int color;
|
||
|
const fz_text *lasttext;
|
||
|
};
|
||
|
|
||
|
const char *fz_stext_options_usage =
|
||
|
"Text output options:\n"
|
||
|
"\tinhibit-spaces: don't add spaces between gaps in the text\n"
|
||
|
"\tpreserve-images: keep images in output\n"
|
||
|
"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
|
||
|
"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
|
||
|
"\n";
|
||
|
|
||
|
/*
|
||
|
Create an empty text page.
|
||
|
|
||
|
The text page is filled out by the text device to contain the blocks
|
||
|
and lines of text on the page.
|
||
|
|
||
|
mediabox: optional mediabox information.
|
||
|
*/
|
||
|
fz_stext_page *
|
||
|
fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
|
||
|
{
|
||
|
fz_pool *pool = fz_new_pool(ctx);
|
||
|
fz_stext_page *page = NULL;
|
||
|
fz_try(ctx)
|
||
|
{
|
||
|
page = fz_pool_alloc(ctx, pool, sizeof(*page));
|
||
|
page->pool = pool;
|
||
|
page->mediabox = mediabox;
|
||
|
page->first_block = NULL;
|
||
|
page->last_block = NULL;
|
||
|
}
|
||
|
fz_catch(ctx)
|
||
|
{
|
||
|
fz_drop_pool(ctx, pool);
|
||
|
fz_rethrow(ctx);
|
||
|
}
|
||
|
return page;
|
||
|
}
|
||
|
|
||
|
void
|
||
|
fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
|
||
|
{
|
||
|
if (page)
|
||
|
{
|
||
|
fz_stext_block *block;
|
||
|
for (block = page->first_block; block; block = block->next)
|
||
|
if (block->type == FZ_STEXT_BLOCK_IMAGE)
|
||
|
fz_drop_image(ctx, block->u.i.image);
|
||
|
fz_drop_pool(ctx, page->pool);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static fz_stext_block *
|
||
|
add_block_to_page(fz_context *ctx, fz_stext_page *page)
|
||
|
{
|
||
|
fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
|
||
|
block->prev = page->last_block;
|
||
|
if (!page->first_block)
|
||
|
page->first_block = page->last_block = block;
|
||
|
else
|
||
|
{
|
||
|
page->last_block->next = block;
|
||
|
page->last_block = block;
|
||
|
}
|
||
|
return block;
|
||
|
}
|
||
|
|
||
|
static fz_stext_block *
|
||
|
add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
|
||
|
{
|
||
|
fz_stext_block *block = add_block_to_page(ctx, page);
|
||
|
block->type = FZ_STEXT_BLOCK_TEXT;
|
||
|
return block;
|
||
|
}
|
||
|
|
||
|
static fz_stext_block *
|
||
|
add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
|
||
|
{
|
||
|
fz_stext_block *block = add_block_to_page(ctx, page);
|
||
|
block->type = FZ_STEXT_BLOCK_IMAGE;
|
||
|
block->u.i.transform = ctm;
|
||
|
block->u.i.image = fz_keep_image(ctx, image);
|
||
|
block->bbox = fz_transform_rect(fz_unit_rect, ctm);
|
||
|
return block;
|
||
|
}
|
||
|
|
||
|
static fz_stext_line *
|
||
|
add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode)
|
||
|
{
|
||
|
fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
|
||
|
line->prev = block->u.t.last_line;
|
||
|
if (!block->u.t.first_line)
|
||
|
block->u.t.first_line = block->u.t.last_line = line;
|
||
|
else
|
||
|
{
|
||
|
block->u.t.last_line->next = line;
|
||
|
block->u.t.last_line = line;
|
||
|
}
|
||
|
|
||
|
line->dir = *dir;
|
||
|
line->wmode = wmode;
|
||
|
|
||
|
return line;
|
||
|
}
|
||
|
|
||
|
static fz_stext_char *
|
||
|
add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int color)
|
||
|
{
|
||
|
fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
|
||
|
fz_point a, d;
|
||
|
|
||
|
if (!line->first_char)
|
||
|
line->first_char = line->last_char = ch;
|
||
|
else
|
||
|
{
|
||
|
line->last_char->next = ch;
|
||
|
line->last_char = ch;
|
||
|
}
|
||
|
|
||
|
ch->c = c;
|
||
|
ch->color = color;
|
||
|
ch->origin = *p;
|
||
|
ch->size = size;
|
||
|
ch->font = font; /* TODO: keep and drop */
|
||
|
|
||
|
if (line->wmode == 0)
|
||
|
{
|
||
|
a.x = 0;
|
||
|
d.x = 0;
|
||
|
a.y = fz_font_ascender(ctx, font);
|
||
|
d.y = fz_font_descender(ctx, font);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
fz_rect bbox = fz_font_bbox(ctx, font);
|
||
|
a.x = bbox.x1;
|
||
|
d.x = bbox.x0;
|
||
|
a.y = 0;
|
||
|
d.y = 0;
|
||
|
}
|
||
|
a = fz_transform_vector(a, trm);
|
||
|
d = fz_transform_vector(d, trm);
|
||
|
|
||
|
ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
|
||
|
ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
|
||
|
ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
|
||
|
ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
|
||
|
|
||
|
return ch;
|
||
|
}
|
||
|
|
||
|
static int
|
||
|
direction_from_bidi_class(int bidiclass, int curdir)
|
||
|
{
|
||
|
switch (bidiclass)
|
||
|
{
|
||
|
/* strong */
|
||
|
case UCDN_BIDI_CLASS_L: return 1;
|
||
|
case UCDN_BIDI_CLASS_R: return -1;
|
||
|
case UCDN_BIDI_CLASS_AL: return -1;
|
||
|
|
||
|
/* weak */
|
||
|
case UCDN_BIDI_CLASS_EN:
|
||
|
case UCDN_BIDI_CLASS_ES:
|
||
|
case UCDN_BIDI_CLASS_ET:
|
||
|
case UCDN_BIDI_CLASS_AN:
|
||
|
case UCDN_BIDI_CLASS_CS:
|
||
|
case UCDN_BIDI_CLASS_NSM:
|
||
|
case UCDN_BIDI_CLASS_BN:
|
||
|
return curdir;
|
||
|
|
||
|
/* neutral */
|
||
|
case UCDN_BIDI_CLASS_B:
|
||
|
case UCDN_BIDI_CLASS_S:
|
||
|
case UCDN_BIDI_CLASS_WS:
|
||
|
case UCDN_BIDI_CLASS_ON:
|
||
|
return curdir;
|
||
|
|
||
|
/* embedding, override, pop ... we don't support them */
|
||
|
default:
|
||
|
return 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static float
|
||
|
vec_dot(const fz_point *a, const fz_point *b)
|
||
|
{
|
||
|
return a->x * b->x + a->y * b->y;
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode)
|
||
|
{
|
||
|
fz_stext_page *page = dev->page;
|
||
|
fz_stext_block *cur_block;
|
||
|
fz_stext_line *cur_line;
|
||
|
|
||
|
int new_para = 0;
|
||
|
int new_line = 1;
|
||
|
int add_space = 0;
|
||
|
fz_point dir, ndir, p, q;
|
||
|
float size;
|
||
|
fz_point delta;
|
||
|
float spacing = 0;
|
||
|
float base_offset = 0;
|
||
|
int rtl = 0;
|
||
|
|
||
|
dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir);
|
||
|
|
||
|
/* dir = direction vector for motion. ndir = normalised(dir) */
|
||
|
if (wmode == 0)
|
||
|
{
|
||
|
dir.x = 1;
|
||
|
dir.y = 0;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
dir.x = 0;
|
||
|
dir.y = -1;
|
||
|
}
|
||
|
dir = fz_transform_vector(dir, trm);
|
||
|
ndir = fz_normalize_vector(dir);
|
||
|
|
||
|
size = fz_matrix_expansion(trm);
|
||
|
|
||
|
/* We need to identify where glyphs 'start' (p) and 'stop' (q).
|
||
|
* Each glyph holds its 'start' position, and the next glyph in the
|
||
|
* span (or span->max if there is no next glyph) holds its 'end'
|
||
|
* position.
|
||
|
*
|
||
|
* For both horizontal and vertical motion, trm->{e,f} gives the
|
||
|
* origin (usually the bottom left) of the glyph.
|
||
|
*
|
||
|
* In horizontal mode:
|
||
|
* + p is bottom left.
|
||
|
* + q is the bottom right
|
||
|
* In vertical mode:
|
||
|
* + p is top left (where it advanced from)
|
||
|
* + q is bottom left
|
||
|
*/
|
||
|
if (wmode == 0)
|
||
|
{
|
||
|
p.x = trm.e;
|
||
|
p.y = trm.f;
|
||
|
q.x = trm.e + adv * dir.x;
|
||
|
q.y = trm.f + adv * dir.y;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
p.x = trm.e - adv * dir.x;
|
||
|
p.y = trm.f - adv * dir.y;
|
||
|
q.x = trm.e;
|
||
|
q.y = trm.f;
|
||
|
}
|
||
|
|
||
|
/* Find current position to enter new text. */
|
||
|
cur_block = page->last_block;
|
||
|
if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
|
||
|
cur_block = NULL;
|
||
|
cur_line = cur_block ? cur_block->u.t.last_line : NULL;
|
||
|
|
||
|
if (cur_line && glyph < 0)
|
||
|
{
|
||
|
/* Don't advance pen or break lines for no-glyph characters in a cluster */
|
||
|
add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, dev->color);
|
||
|
dev->lastchar = c;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
|
||
|
{
|
||
|
/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
|
||
|
* then we can't append to the current block/line. */
|
||
|
new_para = 1;
|
||
|
new_line = 1;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* Detect fake bold where text is printed twice in the same place. */
|
||
|
delta.x = fabsf(q.x - dev->pen.x);
|
||
|
delta.y = fabsf(q.y - dev->pen.y);
|
||
|
if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar)
|
||
|
return;
|
||
|
|
||
|
/* Calculate how far we've moved since the last character. */
|
||
|
delta.x = p.x - dev->pen.x;
|
||
|
delta.y = p.y - dev->pen.y;
|
||
|
|
||
|
/* The transform has not changed, so we know we're in the same
|
||
|
* direction. Calculate 2 distances; how far off the previous
|
||
|
* baseline we are, together with how far along the baseline
|
||
|
* we are from the expected position. */
|
||
|
spacing = ndir.x * delta.x + ndir.y * delta.y;
|
||
|
base_offset = -ndir.y * delta.x + ndir.x * delta.y;
|
||
|
|
||
|
/* Only a small amount off the baseline - we'll take this */
|
||
|
if (fabsf(base_offset) < size * 0.8f)
|
||
|
{
|
||
|
/* LTR or neutral character */
|
||
|
if (dev->curdir >= 0)
|
||
|
{
|
||
|
if (fabsf(spacing) < size * SPACE_DIST)
|
||
|
{
|
||
|
/* Motion is in line and small enough to ignore. */
|
||
|
new_line = 0;
|
||
|
}
|
||
|
else if (fabsf(spacing) > size * SPACE_MAX_DIST)
|
||
|
{
|
||
|
/* Motion is in line and large enough to warrant splitting to a new line */
|
||
|
new_line = 1;
|
||
|
}
|
||
|
else if (spacing < 0)
|
||
|
{
|
||
|
/* Motion is backward in line! Ignore this odd spacing. */
|
||
|
new_line = 0;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* Motion is forward in line and large enough to warrant us adding a space. */
|
||
|
if (dev->lastchar != ' ' && wmode == 0)
|
||
|
add_space = 1;
|
||
|
new_line = 0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* RTL character -- disable space character and column detection heuristics */
|
||
|
else
|
||
|
{
|
||
|
new_line = 0;
|
||
|
if (spacing > size * SPACE_DIST || spacing < 0)
|
||
|
rtl = 0; /* backward (or big jump to 'right' side) means logical order */
|
||
|
else
|
||
|
rtl = 1; /* visual order, we need to reverse in a post process pass */
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Enough for a new line, but not enough for a new paragraph */
|
||
|
else if (fabsf(base_offset) <= size * PARAGRAPH_DIST)
|
||
|
{
|
||
|
/* Check indent to spot text-indent style paragraphs */
|
||
|
if (wmode == 0 && cur_line && dev->new_obj)
|
||
|
if (fabsf(p.x - dev->start.x) > size * 0.5f)
|
||
|
new_para = 1;
|
||
|
new_line = 1;
|
||
|
}
|
||
|
|
||
|
/* Way off the baseline - open a new paragraph */
|
||
|
else
|
||
|
{
|
||
|
new_para = 1;
|
||
|
new_line = 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Start a new block (but only at the beginning of a text object) */
|
||
|
if (new_para || !cur_block)
|
||
|
{
|
||
|
cur_block = add_text_block_to_page(ctx, page);
|
||
|
cur_line = cur_block->u.t.last_line;
|
||
|
}
|
||
|
|
||
|
/* Start a new line */
|
||
|
if (new_line || !cur_line)
|
||
|
{
|
||
|
cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode);
|
||
|
dev->start = p;
|
||
|
}
|
||
|
|
||
|
/* Add synthetic space */
|
||
|
if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
|
||
|
add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, dev->color);
|
||
|
|
||
|
add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, dev->color);
|
||
|
dev->lastchar = c;
|
||
|
dev->pen = q;
|
||
|
|
||
|
dev->new_obj = 0;
|
||
|
dev->trm = trm;
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode)
|
||
|
{
|
||
|
/* ignore when one unicode character maps to multiple glyphs */
|
||
|
if (c == -1)
|
||
|
return;
|
||
|
|
||
|
if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
|
||
|
{
|
||
|
switch (c)
|
||
|
{
|
||
|
case 0xFB00: /* ff */
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
|
||
|
return;
|
||
|
case 0xFB01: /* fi */
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode);
|
||
|
return;
|
||
|
case 0xFB02: /* fl */
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode);
|
||
|
return;
|
||
|
case 0xFB03: /* ffi */
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode);
|
||
|
return;
|
||
|
case 0xFB04: /* ffl */
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode);
|
||
|
return;
|
||
|
case 0xFB05: /* long st */
|
||
|
case 0xFB06: /* st */
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode);
|
||
|
fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode);
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
|
||
|
{
|
||
|
switch (c)
|
||
|
{
|
||
|
case 0x0009: /* tab */
|
||
|
case 0x0020: /* space */
|
||
|
case 0x00A0: /* no-break space */
|
||
|
case 0x1680: /* ogham space mark */
|
||
|
case 0x180E: /* mongolian vowel separator */
|
||
|
case 0x2000: /* en quad */
|
||
|
case 0x2001: /* em quad */
|
||
|
case 0x2002: /* en space */
|
||
|
case 0x2003: /* em space */
|
||
|
case 0x2004: /* three-per-em space */
|
||
|
case 0x2005: /* four-per-em space */
|
||
|
case 0x2006: /* six-per-em space */
|
||
|
case 0x2007: /* figure space */
|
||
|
case 0x2008: /* punctuation space */
|
||
|
case 0x2009: /* thin space */
|
||
|
case 0x200A: /* hair space */
|
||
|
case 0x202F: /* narrow no-break space */
|
||
|
case 0x205F: /* medium mathematical space */
|
||
|
case 0x3000: /* ideographic space */
|
||
|
c = ' ';
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm)
|
||
|
{
|
||
|
fz_font *font = span->font;
|
||
|
fz_matrix tm = span->trm;
|
||
|
fz_matrix trm;
|
||
|
float adv;
|
||
|
int i;
|
||
|
|
||
|
if (span->len == 0)
|
||
|
return;
|
||
|
|
||
|
tm.e = 0;
|
||
|
tm.f = 0;
|
||
|
trm = fz_concat(tm, ctm);
|
||
|
|
||
|
for (i = 0; i < span->len; i++)
|
||
|
{
|
||
|
/* Calculate new pen location and delta */
|
||
|
tm.e = span->items[i].x;
|
||
|
tm.f = span->items[i].y;
|
||
|
trm = fz_concat(tm, ctm);
|
||
|
|
||
|
/* Calculate bounding box and new pen position based on font metrics */
|
||
|
if (span->items[i].gid >= 0)
|
||
|
adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode);
|
||
|
else
|
||
|
adv = 0;
|
||
|
|
||
|
fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, trm, adv, span->wmode);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static int hexrgb_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color)
|
||
|
{
|
||
|
float rgb[3];
|
||
|
fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);
|
||
|
return
|
||
|
(fz_clampi(rgb[0] * 255, 0, 255) << 16) |
|
||
|
(fz_clampi(rgb[1] * 255, 0, 255) << 8) |
|
||
|
(fz_clampi(rgb[2] * 255, 0, 255));
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
|
||
|
fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_text_span *span;
|
||
|
if (text == tdev->lasttext)
|
||
|
return;
|
||
|
tdev->color = hexrgb_from_color(ctx, colorspace, color);
|
||
|
tdev->new_obj = 1;
|
||
|
for (span = text->head; span; span = span->next)
|
||
|
fz_stext_extract(ctx, tdev, span, ctm);
|
||
|
fz_drop_text(ctx, tdev->lasttext);
|
||
|
tdev->lasttext = fz_keep_text(ctx, text);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
|
||
|
fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_text_span *span;
|
||
|
if (text == tdev->lasttext)
|
||
|
return;
|
||
|
tdev->color = hexrgb_from_color(ctx, colorspace, color);
|
||
|
tdev->new_obj = 1;
|
||
|
for (span = text->head; span; span = span->next)
|
||
|
fz_stext_extract(ctx, tdev, span, ctm);
|
||
|
fz_drop_text(ctx, tdev->lasttext);
|
||
|
tdev->lasttext = fz_keep_text(ctx, text);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_text_span *span;
|
||
|
if (text == tdev->lasttext)
|
||
|
return;
|
||
|
tdev->color = 0;
|
||
|
tdev->new_obj = 1;
|
||
|
for (span = text->head; span; span = span->next)
|
||
|
fz_stext_extract(ctx, tdev, span, ctm);
|
||
|
fz_drop_text(ctx, tdev->lasttext);
|
||
|
tdev->lasttext = fz_keep_text(ctx, text);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_text_span *span;
|
||
|
if (text == tdev->lasttext)
|
||
|
return;
|
||
|
tdev->color = 0;
|
||
|
tdev->new_obj = 1;
|
||
|
for (span = text->head; span; span = span->next)
|
||
|
fz_stext_extract(ctx, tdev, span, ctm);
|
||
|
fz_drop_text(ctx, tdev->lasttext);
|
||
|
tdev->lasttext = fz_keep_text(ctx, text);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_text_span *span;
|
||
|
if (text == tdev->lasttext)
|
||
|
return;
|
||
|
tdev->color = 0;
|
||
|
tdev->new_obj = 1;
|
||
|
for (span = text->head; span; span = span->next)
|
||
|
fz_stext_extract(ctx, tdev, span, ctm);
|
||
|
fz_drop_text(ctx, tdev->lasttext);
|
||
|
tdev->lasttext = fz_keep_text(ctx, text);
|
||
|
}
|
||
|
|
||
|
/* Images and shadings */
|
||
|
|
||
|
static void
|
||
|
fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
|
||
|
/* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
|
||
|
if (alpha < 0.5f)
|
||
|
return;
|
||
|
|
||
|
add_image_block_to_page(ctx, tdev->page, ctm, img);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
|
||
|
fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
|
||
|
{
|
||
|
fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
|
||
|
}
|
||
|
|
||
|
static fz_image *
|
||
|
fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
|
||
|
{
|
||
|
fz_matrix ctm = *in_out_ctm;
|
||
|
fz_pixmap *pix;
|
||
|
fz_image *img = NULL;
|
||
|
fz_rect bounds;
|
||
|
fz_irect bbox;
|
||
|
|
||
|
bounds = fz_bound_shade(ctx, shade, ctm);
|
||
|
bounds = fz_intersect_rect(bounds, scissor);
|
||
|
bbox = fz_irect_from_rect(bounds);
|
||
|
|
||
|
pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
|
||
|
fz_try(ctx)
|
||
|
{
|
||
|
if (shade->use_background)
|
||
|
fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
|
||
|
else
|
||
|
fz_clear_pixmap(ctx, pix);
|
||
|
fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL);
|
||
|
img = fz_new_image_from_pixmap(ctx, pix, NULL);
|
||
|
}
|
||
|
fz_always(ctx)
|
||
|
fz_drop_pixmap(ctx, pix);
|
||
|
fz_catch(ctx)
|
||
|
fz_rethrow(ctx);
|
||
|
|
||
|
in_out_ctm->a = pix->w;
|
||
|
in_out_ctm->b = 0;
|
||
|
in_out_ctm->c = 0;
|
||
|
in_out_ctm->d = pix->h;
|
||
|
in_out_ctm->e = pix->x;
|
||
|
in_out_ctm->f = pix->y;
|
||
|
return img;
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
|
||
|
{
|
||
|
fz_matrix local_ctm = ctm;
|
||
|
fz_rect scissor = fz_device_current_scissor(ctx, dev);
|
||
|
fz_image *image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
|
||
|
fz_try(ctx)
|
||
|
fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
|
||
|
fz_always(ctx)
|
||
|
fz_drop_image(ctx, image);
|
||
|
fz_catch(ctx)
|
||
|
fz_rethrow(ctx);
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_close_device(fz_context *ctx, fz_device *dev)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_stext_page *page = tdev->page;
|
||
|
fz_stext_block *block;
|
||
|
fz_stext_line *line;
|
||
|
fz_stext_char *ch;
|
||
|
|
||
|
for (block = page->first_block; block; block = block->next)
|
||
|
{
|
||
|
if (block->type != FZ_STEXT_BLOCK_TEXT)
|
||
|
continue;
|
||
|
for (line = block->u.t.first_line; line; line = line->next)
|
||
|
{
|
||
|
for (ch = line->first_char; ch; ch = ch->next)
|
||
|
{
|
||
|
fz_rect ch_box = fz_rect_from_quad(ch->quad);
|
||
|
if (ch == line->first_char)
|
||
|
line->bbox = ch_box;
|
||
|
else
|
||
|
line->bbox = fz_union_rect(line->bbox, ch_box);
|
||
|
}
|
||
|
block->bbox = fz_union_rect(block->bbox, line->bbox);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* TODO: smart sorting of blocks and lines in reading order */
|
||
|
/* TODO: unicode NFC normalization */
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fz_stext_drop_device(fz_context *ctx, fz_device *dev)
|
||
|
{
|
||
|
fz_stext_device *tdev = (fz_stext_device*)dev;
|
||
|
fz_drop_text(ctx, tdev->lasttext);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
Parse stext device options from a comma separated key-value string.
|
||
|
*/
|
||
|
fz_stext_options *
|
||
|
fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
|
||
|
{
|
||
|
const char *val;
|
||
|
|
||
|
memset(opts, 0, sizeof *opts);
|
||
|
|
||
|
if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
|
||
|
opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
|
||
|
if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
|
||
|
opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
|
||
|
if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes"))
|
||
|
opts->flags |= FZ_STEXT_PRESERVE_IMAGES;
|
||
|
if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes"))
|
||
|
opts->flags |= FZ_STEXT_INHIBIT_SPACES;
|
||
|
|
||
|
return opts;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
Create a device to extract the text on a page.
|
||
|
|
||
|
Gather the text on a page into blocks and lines.
|
||
|
|
||
|
The reading order is taken from the order the text is drawn in the
|
||
|
source file, so may not be accurate.
|
||
|
|
||
|
page: The text page to which content should be added. This will
|
||
|
usually be a newly created (empty) text page, but it can be one
|
||
|
containing data already (for example when merging multiple pages,
|
||
|
or watermarking).
|
||
|
|
||
|
options: Options to configure the stext device.
|
||
|
*/
|
||
|
fz_device *
|
||
|
fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
|
||
|
{
|
||
|
fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
|
||
|
|
||
|
dev->super.close_device = fz_stext_close_device;
|
||
|
dev->super.drop_device = fz_stext_drop_device;
|
||
|
|
||
|
dev->super.fill_text = fz_stext_fill_text;
|
||
|
dev->super.stroke_text = fz_stext_stroke_text;
|
||
|
dev->super.clip_text = fz_stext_clip_text;
|
||
|
dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
|
||
|
dev->super.ignore_text = fz_stext_ignore_text;
|
||
|
|
||
|
if (opts && (opts->flags & FZ_STEXT_PRESERVE_IMAGES))
|
||
|
{
|
||
|
dev->super.fill_shade = fz_stext_fill_shade;
|
||
|
dev->super.fill_image = fz_stext_fill_image;
|
||
|
dev->super.fill_image_mask = fz_stext_fill_image_mask;
|
||
|
}
|
||
|
|
||
|
if (opts)
|
||
|
dev->flags = opts->flags;
|
||
|
dev->page = page;
|
||
|
dev->pen.x = 0;
|
||
|
dev->pen.y = 0;
|
||
|
dev->trm = fz_identity;
|
||
|
dev->lastchar = ' ';
|
||
|
dev->curdir = 1;
|
||
|
dev->lasttext = NULL;
|
||
|
|
||
|
return (fz_device*)dev;
|
||
|
}
|