192 lines
5.3 KiB
C
192 lines
5.3 KiB
C
#ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
|
|
#define MUPDF_FITZ_STRUCTURED_TEXT_H
|
|
|
|
#include "mupdf/fitz/system.h"
|
|
#include "mupdf/fitz/context.h"
|
|
#include "mupdf/fitz/geometry.h"
|
|
#include "mupdf/fitz/font.h"
|
|
#include "mupdf/fitz/image.h"
|
|
#include "mupdf/fitz/output.h"
|
|
#include "mupdf/fitz/device.h"
|
|
|
|
/*
|
|
Simple text layout (for use with annotation editing primarily).
|
|
*/
|
|
typedef struct fz_layout_char_s fz_layout_char;
|
|
typedef struct fz_layout_line_s fz_layout_line;
|
|
typedef struct fz_layout_block_s fz_layout_block;
|
|
|
|
struct fz_layout_char_s
|
|
{
|
|
float x, w;
|
|
const char *p; /* location in source text of character */
|
|
fz_layout_char *next;
|
|
};
|
|
|
|
struct fz_layout_line_s
|
|
{
|
|
float x, y, h;
|
|
const char *p; /* location in source text of start of line */
|
|
fz_layout_char *text;
|
|
fz_layout_line *next;
|
|
};
|
|
|
|
struct fz_layout_block_s
|
|
{
|
|
fz_pool *pool;
|
|
fz_matrix matrix;
|
|
fz_matrix inv_matrix;
|
|
fz_layout_line *head, **tailp;
|
|
fz_layout_char **text_tailp;
|
|
};
|
|
|
|
fz_layout_block *fz_new_layout(fz_context *ctx);
|
|
void fz_drop_layout(fz_context *ctx, fz_layout_block *block);
|
|
void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p);
|
|
void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p);
|
|
|
|
/*
|
|
Text extraction device: Used for searching, format conversion etc.
|
|
|
|
(In development - Subject to change in future versions)
|
|
*/
|
|
|
|
typedef struct fz_stext_char_s fz_stext_char;
|
|
typedef struct fz_stext_line_s fz_stext_line;
|
|
typedef struct fz_stext_block_s fz_stext_block;
|
|
typedef struct fz_stext_page_s fz_stext_page;
|
|
|
|
/*
|
|
FZ_STEXT_PRESERVE_LIGATURES: If this option is activated ligatures
|
|
are passed through to the application in their original form. If
|
|
this option is deactivated ligatures are expanded into their
|
|
constituent parts, e.g. the ligature ffi is expanded into three
|
|
separate characters f, f and i.
|
|
|
|
FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated whitespace
|
|
is passed through to the application in its original form. If this
|
|
option is deactivated any type of horizontal whitespace (including
|
|
horizontal tabs) will be replaced with space characters of variable
|
|
width.
|
|
|
|
FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images will
|
|
be stored in the structured text structure. The default is to ignore
|
|
all images.
|
|
|
|
FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try to
|
|
add missing space characters where there are large gaps between
|
|
characters.
|
|
*/
|
|
enum
|
|
{
|
|
FZ_STEXT_PRESERVE_LIGATURES = 1,
|
|
FZ_STEXT_PRESERVE_WHITESPACE = 2,
|
|
FZ_STEXT_PRESERVE_IMAGES = 4,
|
|
FZ_STEXT_INHIBIT_SPACES = 8,
|
|
};
|
|
|
|
/*
|
|
A text page is a list of blocks, together with an overall bounding box.
|
|
*/
|
|
struct fz_stext_page_s
|
|
{
|
|
fz_pool *pool;
|
|
fz_rect mediabox;
|
|
fz_stext_block *first_block, *last_block;
|
|
};
|
|
|
|
enum
|
|
{
|
|
FZ_STEXT_BLOCK_TEXT = 0,
|
|
FZ_STEXT_BLOCK_IMAGE = 1
|
|
};
|
|
|
|
/*
|
|
A text block is a list of lines of text (typically a paragraph), or an image.
|
|
*/
|
|
struct fz_stext_block_s
|
|
{
|
|
int type;
|
|
fz_rect bbox;
|
|
union {
|
|
struct { fz_stext_line *first_line, *last_line; } t;
|
|
struct { fz_matrix transform; fz_image *image; } i;
|
|
} u;
|
|
fz_stext_block *prev, *next;
|
|
};
|
|
|
|
/*
|
|
A text line is a list of characters that share a common baseline.
|
|
*/
|
|
struct fz_stext_line_s
|
|
{
|
|
int wmode; /* 0 for horizontal, 1 for vertical */
|
|
fz_point dir; /* normalized direction of baseline */
|
|
fz_rect bbox;
|
|
fz_stext_char *first_char, *last_char;
|
|
fz_stext_line *prev, *next;
|
|
};
|
|
|
|
/*
|
|
A text char is a unicode character, the style in which is appears, and
|
|
the point at which it is positioned.
|
|
*/
|
|
struct fz_stext_char_s
|
|
{
|
|
int c;
|
|
int color; /* sRGB hex color */
|
|
fz_point origin;
|
|
fz_quad quad;
|
|
float size;
|
|
fz_font *font;
|
|
fz_stext_char *next;
|
|
};
|
|
|
|
extern const char *fz_stext_options_usage;
|
|
|
|
fz_stext_page *fz_new_stext_page(fz_context *ctx, fz_rect mediabox);
|
|
void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
|
|
|
|
void fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
|
|
void fz_print_stext_header_as_html(fz_context *ctx, fz_output *out);
|
|
void fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out);
|
|
|
|
void fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
|
|
void fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out);
|
|
void fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out);
|
|
|
|
void fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id);
|
|
|
|
void fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page);
|
|
|
|
int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_quad *quads, int max_quads);
|
|
|
|
int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads);
|
|
|
|
enum
|
|
{
|
|
FZ_SELECT_CHARS,
|
|
FZ_SELECT_WORDS,
|
|
FZ_SELECT_LINES,
|
|
};
|
|
|
|
fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode);
|
|
|
|
char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf);
|
|
|
|
/*
|
|
Options for creating a pixmap and draw device.
|
|
*/
|
|
typedef struct fz_stext_options_s fz_stext_options;
|
|
|
|
struct fz_stext_options_s
|
|
{
|
|
int flags;
|
|
};
|
|
|
|
fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
|
|
|
|
fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *options);
|
|
|
|
#endif
|