eBookReaderSwitch/source/html/html-parse.c

1640 lines
41 KiB
C
Raw Normal View History

#include "../fitz/fitz-imp.h"
#include "mupdf/ucdn.h"
#include "html-imp.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>
/* Index constants 0..3; presumably top, right, bottom, left for four-sided
 * values -- TODO confirm against the code that uses them. */
enum { T, R, B, L };
/* Markup direction given to the root box, and used when a "dir" attribute
 * holds a value other than "auto", "rtl" or "ltr". */
#define DEFAULT_DIR FZ_BIDI_LTR
/* Built-in stylesheet for HTML input. fz_parse_html parses it under the name
 * "<default:html>" before any document or user stylesheet is parsed. */
static const char *html_default_css =
"@page{margin:3em 2em}"
"a{color:#06C;text-decoration:underline}"
"address{display:block;font-style:italic}"
"b{font-weight:bold}"
"bdo{direction:rtl;unicode-bidi:bidi-override}"
"blockquote{display:block;margin:1em 40px}"
"body{display:block;margin:1em}"
"cite{font-style:italic}"
"code{font-family:monospace}"
"dd{display:block;margin:0 0 0 40px}"
"del{text-decoration:line-through}"
"div{display:block}"
"dl{display:block;margin:1em 0}"
"dt{display:block}"
"em{font-style:italic}"
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
"head{display:none}"
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
"html{display:block}"
"i{font-style:italic}"
"ins{text-decoration:underline}"
"kbd{font-family:monospace}"
"li{display:list-item}"
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
"p{display:block;margin:1em 0}"
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
"samp{font-family:monospace}"
"script{display:none}"
"small{font-size:0.83em}"
"strong{font-weight:bold}"
"style{display:none}"
"sub{font-size:0.83em;vertical-align:sub}"
"sup{font-size:0.83em;vertical-align:super}"
"table{display:table}"
"tbody{display:table-row-group}"
"td{display:table-cell;padding:1px}"
"tfoot{display:table-footer-group}"
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center}"
"thead{display:table-header-group}"
"tr{display:table-row}"
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ul ul{list-style-type:circle}"
"ul ul ul{list-style-type:square}"
"var{font-style:italic}"
"svg{display:none}"
;
/* Built-in stylesheet for FictionBook (fb2) input. fz_parse_html parses it
 * under the name "<default:fb2>" before any document or user stylesheet. */
static const char *fb2_default_css =
"@page{margin:3em 2em}"
"FictionBook{display:block;margin:1em}"
"stylesheet,binary{display:none}"
"description>*{display:none}"
"description>title-info{display:block}"
"description>title-info>*{display:none}"
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
"image{display:block}"
"p>image{display:inline}"
"table{display:table}"
"tr{display:table-row}"
"th,td{display:table-cell}"
"a{color:#06C;text-decoration:underline}"
"a[type=note]{font-size:small;vertical-align:super}"
"code{white-space:pre;font-family:monospace}"
"emphasis{font-style:italic}"
"strikethrough{text-decoration:line-through}"
"strong{font-weight:bold}"
"sub{font-size:small;vertical-align:sub}"
"sup{font-size:small;vertical-align:super}"
"image{margin:1em 0;text-align:center}"
"cite,poem{margin:1em 2em}"
"subtitle,epigraph,stanza{margin:1em 0}"
"title>p{text-align:center;font-size:x-large}"
"subtitle{text-align:center;font-size:large}"
"p{margin-top:1em;text-align:justify}"
"empty-line{padding-top:1em}"
"p+p{margin-top:0;text-indent:1.5em}"
"empty-line+p{margin-top:0}"
"section>title{page-break-before:always}"
;
/* State shared by the box and flow generation functions while one document
 * is being parsed (set up in fz_parse_html). */
struct genstate
{
/* Pool that boxes, flow nodes, enlisted styles and copied strings are allocated from. */
fz_pool *pool;
/* Font set passed to fz_apply_css_style and fz_add_css_font_faces. */
fz_html_font_set *set;
/* Archive that linked images and stylesheets are read from. */
fz_archive *zip;
/* Tree of fb2 images keyed by id; NULL for HTML input. */
fz_tree *images;
/* Non-zero when the document was detected as FictionBook. */
int is_fb2;
/* Base path prepended to relative image and stylesheet references. */
const char *base_uri;
/* Parsed stylesheet rules matched against each element. */
fz_css *css;
/* Set after a forced break or when a new flow box is created; cleared after
 * a word or image is emitted. While set, pending whitespace is dropped. */
int at_bol;
/* Set when collapsible whitespace has been seen but not yet emitted. */
int emit_white;
/* Line-break class of the previously processed character; row index into pairbrk. */
int last_brk_cls;
/* Style store handed to fz_css_enlist. */
fz_css_style_splay *styles;
};
/* Return 1 if c is a space, tab, carriage return or line feed; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
/* Return 1 when every character of the NUL-terminated string s satisfies
 * iswhite() (an empty string qualifies), 0 as soon as one does not. */
static int is_all_white(const char *s)
{
	const char *p;

	for (p = s; *p != 0; ++p)
	{
		if (!iswhite(*p))
			return 0;
	}
	return 1;
}
/* TODO: pool allocator for flow nodes */
/* TODO: store text by pointing to a giant buffer */
/* Drop the image reference held by every FLOW_IMAGE node in the list that
 * starts at flow. The nodes themselves are not freed here. */
static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
{
	fz_html_flow *node;

	for (node = flow; node != NULL; node = node->next)
	{
		if (node->type == FLOW_IMAGE)
			fz_drop_image(ctx, node->content.image);
	}
}
/* Allocate a flow node of the given type from pool and append it to the
 * flow list of top.
 *
 * inline_box: box recorded as the owner of the node.
 * extras: number of payload bytes to reserve after the node header; ignored
 *   for FLOW_IMAGE, which always gets a full fz_html_flow.
 *
 * Returns the new node. */
static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
{
	fz_html_flow *node;
	size_t bytes;

	if (type == FLOW_IMAGE)
		bytes = sizeof(fz_html_flow);
	else
		bytes = offsetof(fz_html_flow, content) + extras;

	node = fz_pool_alloc(ctx, pool, bytes);
	node->type = type;
	node->box = inline_box;
	node->expand = 0;
	node->bidi_level = 0;
	node->markup_lang = 0;
	node->breaks_line = 0;

	/* Link the node at the tail and advance the tail pointer. */
	*top->flow_tail = node;
	top->flow_tail = &node->next;
	return node;
}
/* Append a FLOW_SPACE node to top and flag it as expandable. */
static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0)->expand = 1;
}
/* Append a FLOW_BREAK node to the flow list of top; the node is not returned. */
static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);
}
/* Append a FLOW_SBREAK node to the flow list of top; the node is not returned. */
static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);
}
/* Append a FLOW_SHYPHEN node to the flow list of top; the node is not returned. */
static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);
}
/* Append a FLOW_WORD node holding a NUL-terminated copy of the bytes in the
 * half-open range [a, b), and record lang as its markup language. */
static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
{
	size_t nbytes = (size_t)(b - a);
	fz_html_flow *word;

	/* One extra byte for the terminator. */
	word = add_flow(ctx, pool, top, inline_box, FLOW_WORD, (int)(nbytes + 1));
	memcpy(word->content.text, a, nbytes);
	word->content.text[nbytes] = 0;
	word->markup_lang = lang;
}
/* Append a FLOW_IMAGE node to top that holds its own reference to img. */
static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
{
	fz_html_flow *node;

	/* Allocate first so that no reference is taken if allocation fails. */
	node = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);
	node->content.image = fz_keep_image(ctx, img);
}
/* Append a FLOW_ANCHOR node to the flow list of top; the node is not returned. */
static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);
}
/* Split a FLOW_WORD node after 'offset' characters (runes, not bytes).
 *
 * The text following the split point moves into a newly allocated node
 * that is linked directly after flow and inherits the header fields of
 * flow; flow keeps the leading text.
 *
 * Returns the new trailing node, or flow itself when offset is 0. */
static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
{
	const size_t header = offsetof(fz_html_flow, content);
	fz_html_flow *tail;
	char *cut;
	size_t remaining, tail_len;
	int rune;

	assert(flow->type == FLOW_WORD);

	if (offset == 0)
		return flow;

	/* Advance rune by rune, stopping early at the end of the text. */
	cut = flow->content.text;
	for (remaining = offset; *cut && remaining; --remaining)
		cut += fz_chartorune(&rune, cut);

	tail_len = strlen(cut);
	tail = fz_pool_alloc(ctx, pool, header + tail_len + 1);
	memcpy(tail, flow, header);
	tail->next = flow->next;
	flow->next = tail;

	/* Copy the remainder including its terminator, then truncate the head. */
	memcpy(tail->content.text, cut, tail_len + 1);
	*cut = 0;

	return tail;
}
/* Emit the pending collapsed whitespace recorded in g->emit_white, if any.
 *
 * Nothing is added when the state is at the beginning of a line. Otherwise
 * a breakable space node is added when the style of inline_box allows
 * breaking at spaces, and a one-character word " " when it does not.
 * The pending flag is cleared afterwards. */
static void flush_space(fz_context *ctx, fz_html_box *flow, fz_html_box *inline_box, int lang, struct genstate *g)
{
	static const char *blank = " ";
	int breakable = inline_box->style->white_space & WS_ALLOW_BREAK_SPACE;

	if (!g->emit_white)
		return;

	if (!g->at_bol)
	{
		if (breakable)
			add_flow_space(ctx, g->pool, flow, inline_box);
		else
			add_flow_word(ctx, g->pool, flow, inline_box, blank, blank + 1, lang);
	}
	g->emit_white = 0;
}
/* Pair-wise lookup table for UAX#14 line breaks.
 *
 * generate_text indexes it as pairbrk[previous class][current class].
 * There, only '_' causes a soft break to be inserted between the two
 * characters; '^' and '%' (and '@' / '#', should they occur) are all treated
 * as "no break", because spaces are handled separately. */
static const char *pairbrk[29] =
{
/* Column headings, read vertically as two-letter class names: */
/* -OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
/* -PLPULSXYSROULLDNYAB2WMJ23LVTI- */
"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
"_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
"_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
"_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
"_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
"_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
"_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
"%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
"%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
"%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
"%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
"%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
"_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
"_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
"_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
"_^^%_%^^^_______%%__^^^______", /* BA break after */
"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
"_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
"____________________^________", /* ZW zero width space */
"%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
"_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
"_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */
"_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
"_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
"_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
"_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
};
/* Convert the text of one text node into flow nodes (words, spaces, forced
 * breaks, soft breaks and soft hyphens) appended to the BOX_FLOW ancestor
 * of box.
 *
 * box: inline box that owns the text; its white_space style selects the
 *   whitespace-collapsing and line-breaking rules.
 * text: NUL-terminated UTF-8 text.
 * lang: language value stored on each generated word.
 * g: generation state; at_bol, emit_white and last_brk_cls are read and
 *   updated, so whitespace and break state carry over between calls. */
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
{
fz_html_box *flow;
fz_pool *pool = g->pool;
int collapse = box->style->white_space & WS_COLLAPSE;
int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;
static const char *space = " ";
/* Find the flow box that collects the nodes. */
flow = box;
while (flow->type != BOX_FLOW)
flow = flow->up;
while (*text)
{
/* Newlines become forced breaks when the style asks for it; "\r\n" counts as one. */
if (bnl && (*text == '\n' || *text == '\r'))
{
if (text[0] == '\r' && text[1] == '\n')
text += 2;
else
text += 1;
add_flow_break(ctx, pool, flow, box);
g->at_bol = 1;
}
else if (iswhite(*text))
{
if (collapse)
{
/* Skip the whole run and only remember that whitespace is pending;
 * when newlines force breaks they are left for the branch above. */
if (bnl)
while (*text == ' ' || *text == '\t')
++text;
else
while (iswhite(*text))
++text;
g->emit_white = 1;
}
else
{
/* Preserved whitespace: one node per character. */
// TODO: tabs
if (bsp)
add_flow_space(ctx, pool, flow, box);
else
add_flow_word(ctx, pool, flow, box, space, space+1, lang);
++text;
}
g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
}
else
{
/* A run of non-whitespace. 'mark' is the start of the word not yet
 * emitted; 'prev' is the start of the character just decoded. */
const char *prev, *mark = text;
int c;
flush_space(ctx, flow, box, lang, g);
if (g->at_bol)
g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;
while (*text && !iswhite(*text))
{
prev = text;
text += fz_chartorune(&c, text);
if (c == 0xAD) /* soft hyphen */
{
/* Emit the text before the hyphen, then the hyphen marker itself. */
if (mark != prev)
add_flow_word(ctx, pool, flow, box, mark, prev, lang);
add_flow_shyphen(ctx, pool, flow, box);
mark = text;
g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
}
else if (bsp) /* allow soft breaks */
{
int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
/* Classes at or beyond RI are skipped and leave last_brk_cls unchanged. */
if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI)
{
int brk = pairbrk[g->last_brk_cls][this_brk_cls];
/* we handle spaces elsewhere, so ignore these classes */
if (brk == '@') brk = '^';
if (brk == '#') brk = '^';
if (brk == '%') brk = '^';
if (brk == '_')
{
/* Break allowed before this character: close the current word
 * and start the next one at this character. */
if (mark != prev)
add_flow_word(ctx, pool, flow, box, mark, prev, lang);
add_flow_sbreak(ctx, pool, flow, box);
mark = prev;
}
g->last_brk_cls = this_brk_cls;
}
}
}
/* Emit whatever is left of the run. */
if (mark != text)
add_flow_word(ctx, pool, flow, box, mark, text, lang);
g->at_bol = 0;
}
}
}
/* Load the image referenced by an HTML src attribute.
 *
 * src is either a base64 "data:" URI for JPEG or PNG, or a path that is
 * appended to base_uri, URL-decoded, cleaned and read from zip.
 *
 * Returns a new image reference, or NULL after a warning if anything in
 * the loading process throws. */
static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
{
	static const char jpeg_uri[] = "data:image/jpeg;base64,";
	static const char png_uri[] = "data:image/png;base64,";
	char path[2048];
	fz_buffer *data = NULL;
	fz_image *result = NULL;

	fz_var(result);
	fz_var(data);

	fz_try(ctx)
	{
		if (strncmp(src, jpeg_uri, sizeof jpeg_uri - 1) == 0)
		{
			data = fz_new_buffer_from_base64(ctx, src + (sizeof jpeg_uri - 1), 0);
		}
		else if (strncmp(src, png_uri, sizeof png_uri - 1) == 0)
		{
			data = fz_new_buffer_from_base64(ctx, src + (sizeof png_uri - 1), 0);
		}
		else
		{
			/* Resolve the reference against the base location. */
			fz_strlcpy(path, base_uri, sizeof path);
			fz_strlcat(path, "/", sizeof path);
			fz_strlcat(path, src, sizeof path);
			fz_urldecode(path);
			fz_cleanname(path);
			data = fz_read_archive_entry(ctx, zip, path);
		}
#if FZ_ENABLE_SVG
		if (strstr(src, ".svg") != NULL)
			result = fz_new_image_from_svg(ctx, data, base_uri, zip);
		else
#endif
			result = fz_new_image_from_buffer(ctx, data);
	}
	fz_always(ctx)
	{
		fz_drop_buffer(ctx, data);
	}
	fz_catch(ctx)
	{
		fz_warn(ctx, "html: cannot load image src='%s'", src);
	}

	return result;
}
/* Build an image from an inline <svg> element.
 * Returns a new image reference, or NULL after a warning on failure. */
static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml *xml)
{
	fz_image *result = NULL;

	fz_try(ctx)
	{
		result = fz_new_image_from_svg_xml(ctx, xml, base_uri, zip);
	}
	fz_catch(ctx)
	{
		fz_warn(ctx, "html: cannot load embedded svg document");
	}

	return result;
}
/* Add an anchor node for box to the flow list of its BOX_FLOW ancestor. */
static void generate_anchor(fz_context *ctx, fz_html_box *box, struct genstate *g)
{
	fz_html_box *owner;

	for (owner = box; owner->type != BOX_FLOW; owner = owner->up)
		;
	add_flow_anchor(ctx, g->pool, owner, box);
}
/* Add an image to the flow list of the BOX_FLOW ancestor of box.
 *
 * Pending whitespace is flushed first. A NULL img is replaced by the word
 * "[image]". Otherwise the image node is surrounded by soft breaks, and the
 * caller's reference to img is dropped whether or not adding it succeeded.
 * Clears g->at_bol on completion. */
static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
{
	fz_pool *pool = g->pool;
	fz_html_box *owner;

	for (owner = box; owner->type != BOX_FLOW; owner = owner->up)
		;

	flush_space(ctx, owner, box, 0, g);

	if (img)
	{
		fz_try(ctx)
		{
			add_flow_sbreak(ctx, pool, owner, box);
			add_flow_image(ctx, pool, owner, box, img);
			add_flow_sbreak(ctx, pool, owner, box);
		}
		fz_always(ctx)
		{
			fz_drop_image(ctx, img);
		}
		fz_catch(ctx)
		{
			fz_rethrow(ctx);
		}
	}
	else
	{
		static const char placeholder[] = "[image]";
		add_flow_word(ctx, pool, owner, box, placeholder, placeholder + (sizeof placeholder - 1), 0);
	}

	g->at_bol = 0;
}
/* Reset the common fields of box: block type, zero geometry, no tree
 * links, an empty flow list, no style, and the given markup direction. */
static void init_box(fz_context *ctx, fz_html_box *box, fz_bidi_direction markup_dir)
{
	box->type = BOX_BLOCK;
	box->markup_dir = markup_dir;
	box->style = NULL;

	/* Geometry. */
	box->y = 0;
	box->x = 0;
	box->b = 0;
	box->w = 0;

	/* Tree links. */
	box->up = NULL;
	box->down = NULL;
	box->next = NULL;

	/* Empty flow list: the tail points back at the head slot. */
	box->flow_head = NULL;
	box->flow_tail = &box->flow_head;
}
/* Release the flow-list resources of box, of its following siblings and,
 * recursively, of all their descendants. The boxes are not freed here. */
static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
{
	fz_html_box *cur;

	for (cur = box; cur != NULL; cur = cur->next)
	{
		fz_drop_html_flow(ctx, cur->flow_head);
		fz_drop_html_box(ctx, cur->down);
	}
}
/* Storable destructor for fz_html: releases what the box tree holds,
 * then drops the pool recorded in the document. */
static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
{
	fz_html *doc = (fz_html *)stor;

	fz_drop_html_box(ctx, doc->root);
	fz_drop_pool(ctx, doc->pool);
}
/* Drop one reference to html, with the drop bracketed by
 * fz_defer_reap_start / fz_defer_reap_end. */
void fz_drop_html(fz_context *ctx, fz_html *html)
{
	fz_storable *stor = &html->storable;

	fz_defer_reap_start(ctx);
	fz_drop_storable(ctx, stor);
	fz_defer_reap_end(ctx);
}
/* Take an additional reference to html and return the pointer handed back
 * by fz_keep_storable. */
fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
{
	fz_html *kept = fz_keep_storable(ctx, &html->storable);

	return kept;
}
/* Allocate a full-size box from pool and initialise it with init_box. */
static fz_html_box *new_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir)
{
	fz_html_box *fresh;

	fresh = fz_pool_alloc(ctx, pool, sizeof(fz_html_box));
	init_box(ctx, fresh, markup_dir);
	return fresh;
}
/* Allocate a truncated box from pool and initialise it with init_box.
 * Only the fields located before 'padding' are allocated, so the padding
 * member and anything after it must not be accessed on the result. */
static fz_html_box *new_short_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir)
{
	const size_t short_size = offsetof(fz_html_box, padding);
	fz_html_box *fresh;

	fresh = fz_pool_alloc(ctx, pool, short_size);
	init_box(ctx, fresh, markup_dir);
	return fresh;
}
/* Give box the specified type and attach it as the last child of top
 * (no attachment when top is NULL).
 *
 * While the tree is being built, top->next is used to remember the last
 * child of top rather than its sibling; a pass at the end of parsing
 * repairs those pointers. */
static void insert_box(fz_context *ctx, fz_html_box *box, int type, fz_html_box *top)
{
	box->type = type;
	box->up = top;

	if (top == NULL)
		return;

	if (top->next == NULL)
	{
		/* First child. */
		top->down = box;
	}
	else
	{
		/* Chain after the previous last child; here next really means next. */
		top->next->next = box;
	}
	/* Remember the new last child. */
	top->next = box;
}
/* Insert box as a BOX_BLOCK child of the nearest BOX_BLOCK at or above top.
 *
 * Only acts when top is a block, flow or inline box; for any other type of
 * top the box is left unattached and top is returned unchanged.
 *
 * Returns the box that became the parent (or the original top). */
static fz_html_box *insert_block_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	switch (top->type)
	{
	case BOX_BLOCK:
	case BOX_FLOW:
	case BOX_INLINE:
		/* Climb to the enclosing block; a no-op when top is one already. */
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BLOCK, top);
		break;
	default:
		break;
	}
	return top;
}
/* Insert box the way a block box is inserted, then retype it as BOX_TABLE.
 * Returns the parent reported by insert_block_box. */
static fz_html_box *insert_table_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *parent = insert_block_box(ctx, box, top);

	box->type = BOX_TABLE;
	return parent;
}
/* Insert box as a BOX_TABLE_ROW child of the nearest BOX_TABLE at or above
 * top and return that table. When no table is found, warn, fall back to
 * block insertion and return top. */
static fz_html_box *insert_table_row_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *table;

	for (table = top; table != NULL; table = table->up)
	{
		if (table->type == BOX_TABLE)
		{
			insert_box(ctx, box, BOX_TABLE_ROW, table);
			return table;
		}
	}

	fz_warn(ctx, "table-row not inside table element");
	insert_block_box(ctx, box, top);
	return top;
}
/* Insert box as a BOX_TABLE_CELL child of the nearest BOX_TABLE_ROW at or
 * above top and return that row. When no row is found, warn, fall back to
 * block insertion and return top. */
static fz_html_box *insert_table_cell_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *row;

	for (row = top; row != NULL; row = row->up)
	{
		if (row->type == BOX_TABLE_ROW)
		{
			insert_box(ctx, box, BOX_TABLE_CELL, row);
			return row;
		}
	}

	fz_warn(ctx, "table-cell not inside table-row element");
	insert_block_box(ctx, box, top);
	return top;
}
/* Insert box as a BOX_BREAK child of the nearest BOX_BLOCK at or above top.
 *
 * Only acts when top is a block, flow or inline box; for any other type of
 * top the box is left unattached and top is returned unchanged.
 *
 * Returns the box that became the parent (or the original top). */
static fz_html_box *insert_break_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	switch (top->type)
	{
	case BOX_BLOCK:
	case BOX_FLOW:
	case BOX_INLINE:
		/* Climb to the enclosing block; a no-op when top is one already. */
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BREAK, top);
		break;
	default:
		break;
	}
	return top;
}
/* Insert box as a BOX_INLINE.
 *
 * If top is a flow or inline box, box becomes its child. Otherwise the
 * nearest block or table cell at or above top is located; box is added to
 * that container's last child when this is a flow box, and else a new
 * anonymous flow box with the default style is created to hold it, which
 * also sets g->at_bol. */
static void insert_inline_box(fz_context *ctx, fz_html_box *box, fz_html_box *top, int markup_dir, struct genstate *g)
{
	fz_html_box *container = top;
	fz_html_box *flow;
	fz_css_style style;

	if (container->type == BOX_FLOW || container->type == BOX_INLINE)
	{
		insert_box(ctx, box, BOX_INLINE, container);
		return;
	}

	while (container->type != BOX_BLOCK && container->type != BOX_TABLE_CELL)
		container = container->up;

	/* During construction 'next' holds the last child of the container. */
	if (container->next != NULL && container->next->type == BOX_FLOW)
	{
		insert_box(ctx, box, BOX_INLINE, container->next);
		return;
	}

	flow = new_short_box(ctx, g->pool, markup_dir);
	flow->is_first_flow = !container->next;
	fz_default_css_style(ctx, &style);
	flow->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
	insert_box(ctx, flow, BOX_FLOW, container);
	insert_box(ctx, box, BOX_INLINE, flow);
	g->at_bol = 1;
}
/* Walk the sibling list that starts at node and build boxes and flow
 * content for each sibling, recursing into children.
 *
 * node: first XML node to process; fz_xml_next() siblings follow.
 * top: box that new boxes are inserted relative to.
 * up_match: CSS match state of the parent element, chained into each match.
 * list_counter: running list-item number; incremented per list-item box.
 * section_depth: nesting depth of "section" elements; used for fb2 headings.
 * markup_dir, markup_lang: direction and language inherited from the parent.
 * g: shared generation state.
 *
 * Returns the value of top after the last sibling; the insert helpers may
 * have replaced it with a different box along the way. */
static fz_html_box *
generate_boxes(fz_context *ctx,
fz_xml *node,
fz_html_box *top,
fz_css_match *up_match,
int list_counter,
int section_depth,
int markup_dir,
int markup_lang,
struct genstate *g)
{
fz_css_match match;
fz_html_box *box, *last_top;
const char *tag;
int display;
fz_css_style style;
while (node)
{
match.up = up_match;
match.count = 0;
tag = fz_xml_tag(node);
if (tag)
{
fz_match_css(ctx, &match, g->css, node);
display = fz_get_css_match_display(&match);
/* <br>: a flow break inside inline content, a break box otherwise. */
if (tag[0]=='b' && tag[1]=='r' && tag[2]==0)
{
if (top->type == BOX_INLINE)
{
fz_html_box *flow = top;
while (flow->type != BOX_FLOW)
flow = flow->up;
add_flow_break(ctx, g->pool, flow, top);
}
else
{
box = new_short_box(ctx, g->pool, markup_dir);
fz_apply_css_style(ctx, g->set, &style, &match);
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
top = insert_break_box(ctx, box, top);
}
g->at_bol = 1;
}
/* <img>: only handled when a src attribute is present. */
else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0)
{
const char *src = fz_xml_att(node, "src");
if (src)
{
int w, h;
const char *w_att = fz_xml_att(node, "width");
const char *h_att = fz_xml_att(node, "height");
box = new_short_box(ctx, g->pool, markup_dir);
fz_apply_css_style(ctx, g->set, &style, &match);
/* Positive width/height attributes override the CSS values; a '%'
 * anywhere in the attribute selects a percentage unit. */
if (w_att && (w = fz_atoi(w_att)) > 0)
{
style.width.value = w;
style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
}
if (h_att && (h = fz_atoi(h_att)) > 0)
{
style.height.value = h;
style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
}
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
insert_inline_box(ctx, box, top, markup_dir, g);
generate_image(ctx, box, load_html_image(ctx, g->zip, g->base_uri, src), g);
}
}
/* Inline <svg>: rendered to an image; its children are not visited. */
else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0)
{
box = new_short_box(ctx, g->pool, markup_dir);
fz_apply_css_style(ctx, g->set, &style, &match);
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
insert_inline_box(ctx, box, top, markup_dir, g);
generate_image(ctx, box, load_svg_image(ctx, g->zip, g->base_uri, node), g);
}
/* fb2 <image>: refers to a preloaded binary by "#id". */
else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0)
{
const char *src = fz_xml_att(node, "l:href");
if (!src)
src = fz_xml_att(node, "xlink:href");
if (src && src[0] == '#')
{
fz_image *img = fz_tree_lookup(ctx, g->images, src+1);
if (display == DIS_BLOCK)
{
/* Block image: a block box wrapping an inline image box. */
fz_html_box *imgbox;
box = new_box(ctx, g->pool, markup_dir);
fz_default_css_style(ctx, &style);
fz_apply_css_style(ctx, g->set, &style, &match);
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
top = insert_block_box(ctx, box, top);
imgbox = new_short_box(ctx, g->pool, markup_dir);
fz_apply_css_style(ctx, g->set, &style, &match);
imgbox->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
insert_inline_box(ctx, imgbox, box, markup_dir, g);
/* generate_image drops the reference it is given, so pass a new one. */
generate_image(ctx, imgbox, fz_keep_image(ctx, img), g);
}
else if (display == DIS_INLINE)
{
box = new_short_box(ctx, g->pool, markup_dir);
fz_apply_css_style(ctx, g->set, &style, &match);
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
insert_inline_box(ctx, box, top, markup_dir, g);
generate_image(ctx, box, fz_keep_image(ctx, img), g);
}
}
}
/* Any other element that is not display:none. */
else if (display != DIS_NONE)
{
const char *dir, *lang, *id, *href;
int child_dir = markup_dir;
int child_lang = markup_lang;
dir = fz_xml_att(node, "dir");
if (dir)
{
if (!strcmp(dir, "auto"))
child_dir = FZ_BIDI_NEUTRAL;
else if (!strcmp(dir, "rtl"))
child_dir = FZ_BIDI_RTL;
else if (!strcmp(dir, "ltr"))
child_dir = FZ_BIDI_LTR;
else
child_dir = DEFAULT_DIR;
}
lang = fz_xml_att(node, "lang");
if (lang)
child_lang = fz_text_language_from_string(lang);
/* Inline elements get the truncated box; everything else a full one. */
if (display == DIS_INLINE)
box = new_short_box(ctx, g->pool, child_dir);
else
box = new_box(ctx, g->pool, child_dir);
fz_default_css_style(ctx, &style);
fz_apply_css_style(ctx, g->set, &style, &match);
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
id = fz_xml_att(node, "id");
if (id)
box->id = fz_pool_strdup(ctx, g->pool, id);
if (display == DIS_BLOCK || display == DIS_INLINE_BLOCK)
{
top = insert_block_box(ctx, box, top);
/* Heading level: section depth (at most 6) for fb2 titles, the digit
 * of h1..h6 for HTML. */
if (g->is_fb2)
{
if (!strcmp(tag, "title") || !strcmp(tag, "subtitle"))
box->heading = fz_mini(section_depth, 6);
}
else
{
if (tag[0]=='h' && tag[1]>='1' && tag[1]<='6' && tag[2]==0)
box->heading = tag[1] - '0';
}
}
else if (display == DIS_LIST_ITEM)
{
top = insert_block_box(ctx, box, top);
box->list_item = ++list_counter;
}
else if (display == DIS_INLINE)
{
insert_inline_box(ctx, box, top, child_dir, g);
if (id)
generate_anchor(ctx, box, g);
if (tag[0]=='a' && tag[1]==0)
{
if (g->is_fb2)
{
href = fz_xml_att(node, "l:href");
if (!href)
href = fz_xml_att(node, "xlink:href");
}
else
/* is_fb2 is zero on this path, so the attribute looked up is "href". */
href = fz_xml_att(node, g->is_fb2 ? "l:href" : "href");
if (href)
box->href = fz_pool_strdup(ctx, g->pool, href);
}
}
else if (display == DIS_TABLE)
{
top = insert_table_box(ctx, box, top);
}
else if (display == DIS_TABLE_ROW)
{
top = insert_table_row_box(ctx, box, top);
}
else if (display == DIS_TABLE_CELL)
{
top = insert_table_cell_box(ctx, box, top);
}
else
{
fz_warn(ctx, "unknown box display type");
insert_box(ctx, box, BOX_BLOCK, top);
}
if (fz_xml_down(node))
{
/* Children of ul/ol restart list numbering; children of a section
 * are one level deeper. */
int child_counter = list_counter;
int child_section = section_depth;
if (!strcmp(tag, "ul") || !strcmp(tag, "ol"))
child_counter = 0;
if (!strcmp(tag, "section"))
++child_section;
last_top = generate_boxes(ctx,
fz_xml_down(node),
box,
&match,
child_counter,
child_section,
child_dir,
child_lang,
g);
/* NOTE(review): when the recursive call ends with a parent other than
 * box, that parent is adopted for the remaining siblings -- confirm
 * the intent. */
if (last_top != box)
top = last_top;
}
}
}
else
{
/* Text node. */
const char *text = fz_xml_text(node);
int collapse = top->style->white_space & WS_COLLAPSE;
if (collapse && is_all_white(text))
{
/* Whitespace-only text just records that whitespace is pending. */
g->emit_white = 1;
}
else
{
if (top->type != BOX_INLINE)
{
/* Create anonymous inline box, with the same style as the top block box. */
/* Note: this declaration shadows the function-level 'style'. */
fz_css_style style;
box = new_short_box(ctx, g->pool, markup_dir);
fz_default_css_style(ctx, &style);
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
insert_inline_box(ctx, box, top, markup_dir, g);
style = *top->style;
/* Make sure not to recursively multiply font sizes. */
style.font_size.value = 1;
style.font_size.unit = N_SCALE;
box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
generate_text(ctx, box, text, markup_lang, g);
}
else
{
generate_text(ctx, top, text, markup_lang, g);
}
}
}
node = fz_xml_next(node);
}
return top;
}
/* Concatenate the text of all direct children of root into one newly
 * allocated NUL-terminated string. Children without text contribute
 * nothing. The caller frees the result with fz_free(). */
static char *concat_text(fz_context *ctx, fz_xml *root)
{
	fz_xml *child;
	size_t total = 1; /* room for the terminator */
	size_t used = 0;
	char *out;

	/* First pass: measure. */
	for (child = fz_xml_down(root); child; child = fz_xml_next(child))
	{
		const char *piece = fz_xml_text(child);
		if (piece)
			total += strlen(piece);
	}

	out = fz_malloc(ctx, total);

	/* Second pass: copy. */
	for (child = fz_xml_down(root); child; child = fz_xml_next(child))
	{
		const char *piece = fz_xml_text(child);
		size_t piece_len;

		if (!piece)
			continue;
		piece_len = strlen(piece);
		memcpy(out + used, piece, piece_len);
		used += piece_len;
	}
	out[used] = 0;

	return out;
}
/* Load one linked stylesheet.
 *
 * href is appended to base_uri, URL-decoded and cleaned; the resulting
 * archive entry is parsed into css, and font faces declared in css are
 * registered relative to the directory of the stylesheet. Any failure is
 * reported as a warning and the stylesheet is ignored. The root parameter
 * is not used. */
static void
html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
{
	char sheet_path[2048];
	char sheet_dir[2048];
	fz_buffer *sheet = NULL;

	fz_var(sheet);

	fz_strlcpy(sheet_path, base_uri, sizeof sheet_path);
	fz_strlcat(sheet_path, "/", sizeof sheet_path);
	fz_strlcat(sheet_path, href, sizeof sheet_path);
	fz_urldecode(sheet_path);
	fz_cleanname(sheet_path);

	fz_dirname(sheet_dir, sheet_path, sizeof sheet_dir);

	fz_try(ctx)
	{
		sheet = fz_read_archive_entry(ctx, zip, sheet_path);
		fz_parse_css(ctx, css, fz_string_from_buffer(ctx, sheet), sheet_path);
		fz_add_css_font_faces(ctx, set, zip, sheet_dir, css);
	}
	fz_always(ctx)
	{
		fz_drop_buffer(ctx, sheet);
	}
	fz_catch(ctx)
	{
		fz_warn(ctx, "ignoring stylesheet %s", sheet_path);
	}
}
/* Load the stylesheets named in the <head> of an HTML document.
 *
 * <link rel="stylesheet"> elements (rel compared case-insensitively, type
 * either absent or exactly "text/css") are loaded through
 * html_load_css_link; the text of <style> elements is parsed directly.
 * A <style> that fails to parse is skipped with a warning. */
static void
html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
{
	fz_xml *html = fz_xml_find(root, "html");
	fz_xml *head = fz_xml_find_down(html, "head");
	fz_xml *node;

	for (node = fz_xml_down(head); node; node = fz_xml_next(node))
	{
		if (fz_xml_is_tag(node, "link"))
		{
			char *rel, *type, *href;

			rel = fz_xml_att(node, "rel");
			if (!rel || fz_strcasecmp(rel, "stylesheet") != 0)
				continue;

			type = fz_xml_att(node, "type");
			if (type && strcmp(type, "text/css") != 0)
				continue;

			href = fz_xml_att(node, "href");
			if (href)
				html_load_css_link(ctx, set, zip, base_uri, css, root, href);
		}
		else if (fz_xml_is_tag(node, "style"))
		{
			char *inline_css = concat_text(ctx, node);

			fz_try(ctx)
			{
				fz_parse_css(ctx, css, inline_css, "<style>");
			}
			fz_catch(ctx)
			{
				fz_warn(ctx, "ignoring inline stylesheet");
			}
			fz_free(ctx, inline_css);
		}
	}
}
/* Parse the inline <stylesheet> of a FictionBook document into css and
 * register the font faces it declares, resolved against base_uri.
 *
 * The root element is looked up under both "FictionBook" and
 * "fictionbook": the rest of this file detects fb2 input via the
 * lower-case name, so the stylesheet has to be found whichever spelling
 * the XML parser produced. Does nothing when there is no stylesheet;
 * parse errors are reported as a warning and otherwise ignored. */
static void
fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
{
	fz_xml *fictionbook, *stylesheet;
	char *s;

	fictionbook = fz_xml_find(root, "FictionBook");
	if (!fictionbook)
		fictionbook = fz_xml_find(root, "fictionbook");

	stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
	if (!stylesheet)
		return;

	s = concat_text(ctx, stylesheet);
	fz_try(ctx)
	{
		fz_parse_css(ctx, css, s, "<stylesheet>");
		fz_add_css_font_faces(ctx, set, zip, base_uri, css);
	}
	fz_catch(ctx)
		fz_warn(ctx, "ignoring inline stylesheet");
	fz_free(ctx, s);
}
static fz_tree *
load_fb2_images(fz_context *ctx, fz_xml *root)
{
fz_xml *fictionbook, *binary;
fz_tree *images = NULL;
fictionbook = fz_xml_find(root, "fictionbook");
for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
{
const char *id = fz_xml_att(binary, "id");
char *b64 = NULL;
fz_buffer *buf = NULL;
fz_image *img = NULL;
fz_var(b64);
fz_var(buf);
fz_try(ctx)
{
b64 = concat_text(ctx, binary);
buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
img = fz_new_image_from_buffer(ctx, buf);
}
fz_always(ctx)
{
fz_drop_buffer(ctx, buf);
fz_free(ctx, b64);
}
fz_catch(ctx)
fz_rethrow(ctx);
images = fz_tree_insert(ctx, images, id, img);
}
return images;
}
/* Growable array of Unicode code points, used as scratch space while
 * detecting text direction. */
typedef struct
{
uint32_t *data; /* code points; (re)allocated with fz_realloc_array */
size_t cap; /* allocated capacity, in elements */
size_t len; /* number of elements currently in use */
} uni_buf;
/* Context handed to fragment_cb through its opaque argument. */
typedef struct
{
fz_context *ctx;
fz_pool *pool; /* pool used when a word node has to be split */
fz_html_flow *flow; /* next flow node to be labelled by fragment_cb */
uni_buf *buffer; /* buffer the reported fragments point into */
} bidi_data;
/* Callback given to fz_bidi_fragment_text: label the flow nodes covered by
 * one fragment with its bidi level and script.
 *
 * fragment, fragment_len: the fragment, as a pointer into the shared buffer
 *   and a length in code points.
 * bidi_level, script: values stored on every flow node in the fragment.
 * arg: a bidi_data; its flow cursor is advanced past the labelled nodes.
 *
 * A word that extends beyond the end of the fragment is split so that the
 * remainder can be labelled by a later call. */
static void fragment_cb(const uint32_t *fragment,
size_t fragment_len,
int bidi_level,
int script,
void *arg)
{
bidi_data *data = (bidi_data *)arg;
/* Tracked below but not otherwise read in this function. */
size_t fragment_offset = fragment - data->buffer->data;
/* We are guaranteed that fragmentOffset will be at the beginning
 * of flow. */
while (fragment_len > 0)
{
/* Number of code points the current node contributed to the buffer. */
size_t len;
if (data->flow->type == FLOW_SPACE)
{
len = 1;
}
else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
{
len = 0;
}
else
{
/* Must be text */
len = fz_utflen(data->flow->content.text);
if (len > fragment_len)
{
/* We need to split this flow box */
(void)split_flow(data->ctx, data->pool, data->flow, fragment_len);
/* Re-measure the head, which now ends where the fragment ends. */
len = fz_utflen(data->flow->content.text);
}
}
/* This flow box is entirely contained within this fragment. */
data->flow->bidi_level = bidi_level;
data->flow->script = script;
data->flow = data->flow->next;
fragment_offset += len;
fragment_len -= len;
}
}
/* Run bidi analysis over one flow list.
 *
 * The list is processed in runs: a run ends where the low bit of the
 * nodes' bidi_level changes, or just after a forced break or an image.
 * The text of each run is expanded into buffer as code points and passed
 * to fz_bidi_fragment_text, which reports fragments to fragment_cb.
 *
 * buffer: scratch buffer, grown as needed and reused between runs.
 * bidi_dir: starting direction; passed by address to the bidi code for
 *   every run.
 *
 * Returns bidi_dir as left by the last run. */
static fz_bidi_direction
detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
{
fz_html_flow *end = flow;
bidi_data data;
while (end)
{
int level = end->bidi_level;
/* Gather the text from the flow up into a single buffer (at
 * least, as much of it as has the same direction markup). */
buffer->len = 0;
while (end && (level & 1) == (end->bidi_level & 1))
{
size_t len = 0;
const char *text = "";
int broken = 0;
/* Node types not listed here contribute no text. */
switch (end->type)
{
case FLOW_WORD:
len = fz_utflen(end->content.text);
text = end->content.text;
break;
case FLOW_SPACE:
len = 1;
text = " ";
break;
case FLOW_SHYPHEN:
case FLOW_SBREAK:
break;
case FLOW_BREAK:
case FLOW_IMAGE:
broken = 1;
break;
}
/* Step past the node first, so a break or image ends the run while
 * still being counted as part of it. */
end = end->next;
if (broken)
break;
/* Make sure the buffer is large enough */
if (buffer->len + len > buffer->cap)
{
size_t newcap = buffer->cap;
if (newcap < 128)
newcap = 128; /* Sensible small default */
while (newcap < buffer->len + len)
newcap = (newcap * 3) / 2;
buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
buffer->cap = newcap;
}
/* Expand the utf8 text into Unicode and store it in the buffer */
while (*text)
{
int rune;
text += fz_chartorune(&rune, text);
buffer->data[buffer->len++] = rune;
}
}
/* Detect directionality for the buffer */
data.ctx = ctx;
data.pool = pool;
data.flow = flow;
data.buffer = buffer;
fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
/* The next run starts where this one stopped. */
flow = end;
}
return bidi_dir;
}
/* Apply detect_flow_directionality to every box that has flow content,
 * covering box, its following siblings and all of their descendants.
 * The direction returned for a flow is stored as the box's markup_dir. */
static void
detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
{
	fz_html_box *cur;

	for (cur = box; cur != NULL; cur = cur->next)
	{
		if (cur->flow_head != NULL)
			cur->markup_dir = detect_flow_directionality(ctx, pool, buffer, cur->markup_dir, cur->flow_head);
		detect_box_directionality(ctx, pool, buffer, cur->down);
	}
}
/* Detect text direction for the whole box tree rooted at box, using one
 * scratch buffer that is freed on both the success and the error path.
 * Errors are rethrown. */
static void
detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
{
	uni_buf scratch = { NULL, 0, 0 };

	fz_try(ctx)
	{
		detect_box_directionality(ctx, pool, &scratch, box);
	}
	fz_always(ctx)
	{
		fz_free(ctx, scratch.data);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}
/* During construction a box's 'next' pointer may refer to the last of its
 * own children instead of its next sibling. Such a pointer is recognisable
 * because the box it refers to has this box as its parent. This pass walks
 * the tree, resets every such pointer to NULL and stops following the
 * sibling chain at that point. */
static void
fix_nexts(fz_html_box *box)
{
	for (; box != NULL; box = box->next)
	{
		if (box->down != NULL)
			fix_nexts(box->down);

		if (box->next != NULL && box->next->up == box)
		{
			box->next = NULL;
			break;
		}
	}
}
/* Parse an HTML or FictionBook document into a box tree.
 *
 * set: font set used when applying styles and registering font faces.
 * zip: archive that linked stylesheets and images are read from.
 * base_uri: base path for relative references.
 * buf: the document source.
 * user_css: optional extra stylesheet, parsed after the built-in and
 *   document stylesheets; may be NULL.
 *
 * Stylesheet errors are reported as a warning and do not abort parsing,
 * except for FZ_ERROR_TRYLATER, which is rethrown.
 *
 * Returns a new fz_html with a reference count of 1; release it with
 * fz_drop_html(). Throws on failure, after releasing everything allocated
 * for the document in the final stage. */
fz_html *
fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
{
	fz_xml_doc *xml;
	fz_xml *root, *node;
	fz_html *html = NULL;
	char *title;
	fz_css_match match;
	struct genstate g;

	g.pool = NULL;
	g.set = set;
	g.zip = zip;
	g.images = NULL;
	g.base_uri = base_uri;
	g.css = NULL;
	g.at_bol = 0;
	g.emit_white = 0;
	g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
	g.styles = NULL;

	/* Both are assigned inside the final fz_try and read in its fz_catch. */
	fz_var(html);
	fz_var(g.pool);

	xml = fz_parse_xml(ctx, buf, 1, 1);
	root = fz_xml_root(xml);

	fz_try(ctx)
		g.css = fz_new_css(ctx);
	fz_catch(ctx)
	{
		fz_drop_xml(ctx, xml);
		fz_rethrow(ctx);
	}

#ifndef NDEBUG
	if (fz_atoi(getenv("FZ_DEBUG_XML")))
		fz_debug_xml(root, 0);
#endif

	/* Stage 1: collect stylesheets (built-in, document, user) and, for
	 * fb2, the embedded images. */
	fz_try(ctx)
	{
		if (fz_xml_find(root, "fictionbook"))
		{
			g.is_fb2 = 1;
			fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
			if (fz_use_document_css(ctx))
				fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
			g.images = load_fb2_images(ctx, root);
		}
		else
		{
			g.is_fb2 = 0;
			fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
			if (fz_use_document_css(ctx))
				html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
			g.images = NULL;
		}

		if (user_css)
		{
			fz_parse_css(ctx, g.css, user_css, "<user>");
			fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
		}
	}
	fz_catch(ctx)
	{
		/* NOTE(review): a rethrow here leaves xml and g.css unreleased --
		 * confirm whether FZ_ERROR_TRYLATER can occur on this path. */
		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
		fz_warn(ctx, "ignoring styles due to errors: %s", fz_caught_message(ctx));
	}

#ifndef NDEBUG
	if (fz_atoi(getenv("FZ_DEBUG_CSS")))
		fz_debug_css(ctx, g.css);
#endif

	/* Stage 2: build the box tree, then post-process it. */
	fz_try(ctx)
	{
		fz_css_style style;

		g.pool = fz_new_pool(ctx);
		html = fz_pool_alloc(ctx, g.pool, sizeof *html);
		FZ_INIT_STORABLE(html, 1, fz_drop_html_imp);
		/* From here on the pool is owned by html and freed with it. */
		html->pool = g.pool;
		html->root = new_box(ctx, g.pool, DEFAULT_DIR);
		html->layout_w = 0;
		html->layout_h = 0;
		html->layout_em = 0;

		fz_default_css_style(ctx, &style);
		match.up = NULL;
		match.count = 0;
		fz_match_css_at_page(ctx, &match, g.css);
		fz_apply_css_style(ctx, g.set, &style, &match);
		html->root->style = fz_css_enlist(ctx, &style, &g.styles, g.pool);
		// TODO: transfer page margins out of this hacky box

		generate_boxes(ctx, root, html->root, &match, 0, 0, DEFAULT_DIR, FZ_LANG_UNSET, &g);
		fix_nexts(html->root);
		detect_directionality(ctx, g.pool, html->root);

		/* Record the document title, when one is present. */
		if (g.is_fb2)
		{
			node = fz_xml_find(root, "fictionbook");
			node = fz_xml_find_down(node, "description");
			node = fz_xml_find_down(node, "title-info");
			node = fz_xml_find_down(node, "book-title");
			title = fz_xml_text(fz_xml_down(node));
			if (title)
				html->title = fz_pool_strdup(ctx, g.pool, title);
		}
		else
		{
			node = fz_xml_find(root, "html");
			node = fz_xml_find_down(node, "head");
			node = fz_xml_find_down(node, "title");
			title = fz_xml_text(fz_xml_down(node));
			if (title)
				html->title = fz_pool_strdup(ctx, g.pool, title);
		}
	}
	fz_always(ctx)
	{
		fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
		fz_drop_css(ctx, g.css);
		fz_drop_xml(ctx, xml);
	}
	fz_catch(ctx)
	{
		/* Dropping html also frees the pool. If the html struct was never
		 * allocated, the pool (if any) has no owner and is freed here. */
		if (html)
			fz_drop_html(ctx, html);
		else if (g.pool)
			fz_drop_pool(ctx, g.pool);
		fz_rethrow(ctx);
	}

	return html;
}
/* Write 'level' tab characters to stdout; nothing is written for level <= 0. */
static void indent(int level)
{
	for (; level > 0; --level)
		putchar('\t');
}
/*
 * Dump a flow list to stdout for debugging.
 *
 * Consecutive flow nodes that share the same box are grouped under one
 * "span ... { }" header describing that box's em size and font. Each node
 * is printed one indent deeper with its type and position; a "*" line
 * follows any node that has breaks_line set.
 */
static void
fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
{
	fz_html_box *current = NULL;
	fz_html_flow *node;

	for (node = flow; node != NULL; node = node->next)
	{
		const char *label = "";

		/* Start a new span group whenever the owning box changes. */
		if (node->box != current)
		{
			if (current != NULL)
			{
				indent(level);
				printf("}\n");
			}
			current = node->box;

			indent(level);
			printf("span em=%g font='%s'", current->em, fz_font_name(ctx, current->style->font));
			printf("%s", fz_font_is_serif(ctx, current->style->font) ? " serif" : " sans");
			if (fz_font_is_monospaced(ctx, current->style->font))
				printf(" monospaced");
			if (fz_font_is_bold(ctx, current->style->font))
				printf(" bold");
			if (fz_font_is_italic(ctx, current->style->font))
				printf(" italic");
			if (current->style->small_caps)
				printf(" small-caps");
			printf("\n");

			indent(level);
			printf("{\n");
		}

		/* Types not listed here print no label at all. */
		switch (node->type)
		{
		case FLOW_WORD: label = "word "; break;
		case FLOW_SPACE: label = "space"; break;
		case FLOW_SBREAK: label = "sbrk "; break;
		case FLOW_SHYPHEN: label = "shy "; break;
		case FLOW_BREAK: label = "break"; break;
		case FLOW_IMAGE: label = "image"; break;
		case FLOW_ANCHOR: label = "anchor"; break;
		default: break;
		}

		indent(level + 1);
		printf("%s", label);
		printf(" y=%g x=%g w=%g", node->y, node->x, node->w);
		if (node->type == FLOW_IMAGE)
			printf(" h=%g", node->h);
		else if (node->type == FLOW_WORD)
			printf(" text='%s'", node->content.text);
		printf("\n");

		if (node->breaks_line)
		{
			indent(level + 1);
			printf("*\n");
		}
	}

	/* Close the last span group. */
	indent(level);
	printf("}\n");
}
/*
 * Dump a chain of sibling boxes, and recursively their children and flow
 * lists, to stdout for debugging. Each box prints a header line with its
 * type and geometry followed by a brace-delimited body.
 */
static void
fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
{
	fz_html_box *cur;

	for (cur = box; cur != NULL; cur = cur->next)
	{
		const char *kind = "";

		/* Types not listed here print no name at all. */
		switch (cur->type)
		{
		case BOX_BLOCK: kind = "block"; break;
		case BOX_BREAK: kind = "break"; break;
		case BOX_FLOW: kind = "flow"; break;
		case BOX_INLINE: kind = "inline"; break;
		case BOX_TABLE: kind = "table"; break;
		case BOX_TABLE_ROW: kind = "table-row"; break;
		case BOX_TABLE_CELL: kind = "table-cell"; break;
		default: break;
		}

		indent(level);
		printf("%s", kind);
		printf(" em=%g x=%g y=%g w=%g b=%g\n", cur->em, cur->x, cur->y, cur->w, cur->b);

		indent(level);
		printf("{\n");

		if (cur->type == BOX_BLOCK)
		{
			indent(level + 1);
			printf("margin=%g %g %g %g\n", cur->margin[0], cur->margin[1], cur->margin[2], cur->margin[3]);
		}
		if (cur->is_first_flow)
		{
			indent(level + 1);
			printf("is-first-flow\n");
		}
		if (cur->list_item)
		{
			indent(level + 1);
			printf("list=%d\n", cur->list_item);
		}
		if (cur->id)
		{
			indent(level + 1);
			printf("id=%s\n", cur->id);
		}
		if (cur->href)
		{
			indent(level + 1);
			printf("href=%s\n", cur->href);
		}

		/* Children first, then any flow content attached to this box. */
		if (cur->down)
			fz_debug_html_box(ctx, cur->down, level + 1);
		if (cur->flow_head)
			fz_debug_html_flow(ctx, cur->flow_head, level + 1);

		indent(level);
		printf("}\n");
	}
}
/* Dump the box tree starting at 'box' to stdout, beginning at indent level 0. */
void
fz_debug_html(fz_context *ctx, fz_html_box *box)
{
	const int top_level = 0;

	fz_debug_html_box(ctx, box, top_level);
}
/* Size reported for an html object: the size of its pool, or 0 for NULL. */
static size_t
fz_html_size(fz_context *ctx, fz_html *html)
{
	if (html == NULL)
		return 0;
	return fz_pool_size(ctx, html->pool);
}
/* Magic to make html storable.
 *
 * Key under which a parsed fz_html is kept in the store: the pair
 * (document pointer, chapter number). */
typedef struct {
/* Reference count, managed through fz_keep_imp / fz_drop_imp. */
int refs;
/* Owning document; only compared by pointer identity, never dereferenced here. */
void *doc;
/* Chapter number within that document. */
int chapter_num;
} fz_html_key;
/*
 * Store callback: fill in the hash for an fz_html_key from its document
 * pointer and chapter number. Always returns 1.
 */
static int
fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
{
	const fz_html_key *k = key_;

	hash->u.pi.i = k->chapter_num;
	hash->u.pi.ptr = k->doc;
	return 1;
}
/* Store callback: take an extra reference on an fz_html_key and return it. */
static void *
fz_keep_html_key(fz_context *ctx, void *key_)
{
	fz_html_key *k = key_;

	return fz_keep_imp(ctx, k, &k->refs);
}
/*
 * Store callback: release one reference on an fz_html_key and free the key
 * when fz_drop_imp reports that it should be destroyed.
 *
 * A NULL key is accepted and ignored. fz_store_html calls this from its
 * fz_always clause, where the key is still NULL if allocating it failed,
 * and forming &key->refs from a null pointer would be undefined behaviour.
 */
static void
fz_drop_html_key(fz_context *ctx, void *key_)
{
	fz_html_key *key = (fz_html_key *)key_;

	if (key == NULL)
		return;

	if (fz_drop_imp(ctx, key, &key->refs))
	{
		fz_free(ctx, key);
	}
}
/*
 * Store callback: report whether two fz_html_keys are equal.
 * Returns 1 when both the document pointer and the chapter number match,
 * 0 otherwise.
 */
static int
fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
{
	const fz_html_key *lhs = k0_;
	const fz_html_key *rhs = k1_;

	if (lhs->doc != rhs->doc)
		return 0;
	return lhs->chapter_num == rhs->chapter_num;
}
/* Store callback: write a printable description of an fz_html_key into s (at most n bytes). */
static void
fz_format_html_key(fz_context *ctx, char *s, int n, void *key_)
{
	const fz_html_key *k = key_;

	fz_snprintf(s, n, "(html doc=%p, ch=%d)", k->doc, k->chapter_num);
}
/*
 * Callback table handed to the store functions (fz_store_item, fz_find_item,
 * fz_filter_store) for entries keyed by fz_html_key. The initializer is
 * positional, so the order must match the declaration of fz_store_type,
 * which lives outside this file.
 */
static const fz_store_type fz_html_store_type =
{
/* hash the key */
fz_make_hash_html_key,
/* take a reference on the key */
fz_keep_html_key,
/* release a reference on the key */
fz_drop_html_key,
/* compare two keys for equality */
fz_cmp_html_key,
/* describe the key as text */
fz_format_html_key,
/* final slot left unset -- NOTE(review): meaning depends on fz_store_type; confirm */
NULL
};
/*
 * Put a parsed html into the store under the key (doc, chapter).
 *
 * If fz_store_item hands back a non-NULL item (presumably an entry already
 * stored under the same key), the html passed in is dropped and that item
 * is returned instead. Otherwise the html passed in is returned.
 *
 * Storing is best effort: any error raised along the way is swallowed and
 * the current html is returned regardless.
 */
fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
{
	fz_html_key *key = NULL;
	fz_html *existing;

	/* key is assigned in the try block and read in fz_always. */
	fz_var(key);

	fz_try(ctx)
	{
		key = fz_malloc_struct(ctx, fz_html_key);
		key->refs = 1;
		key->doc = doc;
		key->chapter_num = chapter;

		existing = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
		if (existing != NULL)
		{
			fz_drop_html(ctx, html);
			html = existing;
		}
	}
	fz_always(ctx)
	{
		/* Release the local reference on the key. */
		fz_drop_html_key(ctx, key);
	}
	fz_catch(ctx)
	{
		/* Do nothing */
	}

	return html;
}
/*
 * Look up the html stored under the key (doc, chapter).
 * Returns whatever fz_find_item returns for that key; a temporary key on
 * the stack is used for the lookup.
 */
fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
{
	fz_html_key probe;

	probe.chapter_num = chapter;
	probe.doc = doc;
	probe.refs = 1;

	return fz_find_item(ctx, &fz_drop_html_imp, &probe, &fz_html_store_type);
}
/* Filter callback: non-zero when the given key belongs to document 'doc'. */
static int
html_filter_store(fz_context *ctx, void *doc, void *key_)
{
	const fz_html_key *k = key_;

	return k->doc == doc;
}
/*
 * Run fz_filter_store over html store entries with html_filter_store,
 * which selects the keys whose document pointer equals 'doc'.
 */
void fz_purge_stored_html(fz_context *ctx, void *doc)
{
	void *owner = doc;

	fz_filter_store(ctx, html_filter_store, owner, &fz_html_store_type);
}