eBookReaderSwitch/source/fitz/xml.c

1161 lines
27 KiB
C
Raw Normal View History

#include "mupdf/fitz.h"
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
/* #define FZ_XML_SEQ */
/* We bend the XML parser slightly when it's reading for HTML.
* To do this, we use extra knowledge about HTML tags, expressed
* in the table in a header (html-tags.h).
*
* Each tag carries one int of flags, laid out as follows (derived
* from the shift/mask values below):
*   bits 0-6   : boolean flags (START_OPTIONAL .. CONTAINER)
*   bits 7-9   : table level (TABLE_SHIFT / TABLE_MASK)
*   bits 10-16 : first implied parent tag (IMPLIES_SHIFT)
*   bits 17-23 : second implied parent tag (IMPLIES_SHIFT2)
*   bits 24-30 : third implied parent tag (IMPLIES_SHIFT3)
*/
/* NOTE(review): the first six names look like they mirror HTML DTD
* annotations; presumably they are set from html-tags.h - confirm there. */
#define START_OPTIONAL 1
#define END_OPTIONAL 2
#define END_FORBIDDEN 4
#define DEPRECATED 8
#define DTD_LOOSE 16
#define DTD_FRAMESET 32
/* The following values are added by us. */
/* If a tag can contain nested instances of itself, we
* mark it as being a container. */
#define CONTAINER 64
/* Table tags autoclose each other in complex ways. */
#define TABLE_SHIFT 7
#define TABLE_MASK 7
/* Which level of a table are we? */
/* 128 * 1 = TABLE
* 128 * 2 = COLGROUP
* 128 * 3 = COL
* 128 * 4 = THEAD/TBODY/TFOOT
* 128 * 5 = TR
* 128 * 6 = TD/TH
* Any open table tag is autoclosed by the opening of another table tag
* (within the same table) whose level is the same as or smaller than
* its own. */
#define IMPLIES_SHIFT 10
#define IMPLIES_SHIFT2 17
#define IMPLIES_SHIFT3 24
#define IMPLIES_MASK 127
/* If a tag should always be contained within another one, we
* indicate this with an 'implies'. Up to three alternative parents
* can be recorded; each is stored as a tag number masked with
* IMPLIES_MASK. TABLE tags never imply out of the current table. */
/* One entry of the HTML tag table: the tag name and its packed flags
* (see the flag/shift definitions above for the bit layout). */
typedef struct { char tag[16]; int flags; } fz_xml_html_tag_t;
/* First expansion of html-tags.h: each HTML_TAG(A,...) becomes an enum
* constant fz_xml_html_tag_A, so tag numbers index html_tags[] below. */
#define HTML_TAG(A,B,C,D,E) fz_xml_html_tag_ ## A
enum
{
fz_xml_html_tag__NONE,
#include "html-tags.h"
, fz_xml_html_tag__NUMTAGS
};
/* Second expansion: each HTML_TAG(A,B,C,D,E) becomes a table entry whose
* name is the stringified A and whose flags are E combined with the three
* implied-parent tag numbers B, C and D.
* NOTE(review): HTML_TAG is redefined here with no #undef visible in this
* file; presumably html-tags.h undefines it after use - confirm. */
#define HTML_TAG(A,B,C,D,E) { # A, E | (fz_xml_html_tag_ ## B << IMPLIES_SHIFT) | (fz_xml_html_tag_ ## C << IMPLIES_SHIFT2) | (fz_xml_html_tag_ ## D << IMPLIES_SHIFT3) }
/* Tag table, indexed by tag number. Entry 0 (the empty name) corresponds
* to fz_xml_html_tag__NONE. find_html_tag() binary-searches this table, so
* the entries from html-tags.h must be in sorted name order. */
fz_xml_html_tag_t html_tags[] =
{
{ "", 0 },
#include "html-tags.h"
};
/*
When parsing XML, we assume that all tags are properly terminated.
i.e. <foo> has </foo> or <foo />.
We currently don't check this at all. In fact, we generate an incorrect
tree if this isn't the case.
For example:
<a><b><c></b><d></d></a>
Will produce:
a
b
cd
Rather than:
a
bd
c
Over the course of a large HTML file, this can lead to HUGE "south easterly" skew
in the tree.
This happens because (when parsing pure xml) when we hit </d>, we don't check that the tag we are closing is actually a <d>.
So, some heuristics to use when parsing HTML:
* When we open a tag 'foo', if we are immediately in another 'foo', then close the first 'foo' first.
* When we close a tag 'foo', run up the stack looking for an enclosing 'foo'. If we find one, close
everything up to and including that. If we don't find one, don't close anything.
With these heuristics, we get the following for free:
* A TD closes any other TD.
* A /TR closes any open TD.
* A TR closes any open TR.
This leaves problems with:
* Nested tables.
* Nested divs.
* Nested spans.
Tables, divs and spans (let alone nested ones) are problematic anyway. Ignore
this for now.
We could special case TABLE, DIV and SPAN so appropriate tags don't pop up past them.
*/
/* Named character entities and the Unicode code point each one decodes to.
* xml_parse_entity() scans this table linearly, comparing the text between
* '&' and ';' against each name (case-sensitively). */
static const struct { const char *name; int c; } html_entities[] = {
{"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
{"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
{"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
{"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
{"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
{"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
{"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
{"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
{"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
{"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
{"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
{"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
{"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
{"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
{"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
{"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
{"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
{"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
{"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
{"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
{"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
{"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
{"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
{"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
{"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
{"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
{"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
{"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
{"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
{"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
{"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
{"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
{"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
{"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
{"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
{"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
{"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
{"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
{"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
{"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
{"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
{"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
{"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
{"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
{"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
{"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
{"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
{"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
{"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
{"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
{"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
{"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
{"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
{"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
{"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
{"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
{"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
{"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
{"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
{"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
};
/* State carried through one parse. */
struct parser
{
/* Pool that every node and attribute is allocated from. */
fz_pool *pool;
/* The innermost currently-open node; new nodes become its children. */
fz_xml *head;
/* If zero, all-whitespace text nodes are dropped. */
int preserve_white;
/* If non-zero, apply the HTML heuristics (lowercasing, autoclose, implied tags). */
int for_html;
/* Number of currently open nodes; incremented on open, decremented on close. */
int depth;
#ifdef FZ_XML_SEQ
/* Running counter used to stamp each node with a sequence number. */
int seq;
#endif
};
/* One attribute of a tag node, kept in a singly linked list. */
struct attribute
{
/* Decoded attribute value; NULL until xml_emit_att_value() fills it in. */
char *value;
/* Next attribute of the same node (new attributes are pushed at the front). */
struct attribute *next;
/* Attribute name, zero-terminated. Declared with one element but
* allocated with room for the whole name. */
char name[1];
};
/* A parsed document: the pool that owns all node memory, and the root node. */
struct fz_xml_doc_s
{
/* Pool holding every node, attribute and this struct itself. */
fz_pool *pool;
/* First top-level node of the tree. */
fz_xml *root;
};
/* A node of the tree: either a tag (with name and attributes) or a text
* node. Text nodes never use the down pointer. Therefore
* if the down pointer is the MAGIC_TEXT value, we
* know there is text. */
struct fz_xml_s
{
/* Parent, first child, previous sibling, next sibling.
* While a node is still open during parsing, next temporarily
* points at its last child instead; it is reset when the node closes. */
fz_xml *up, *down, *prev, *next;
#ifdef FZ_XML_SEQ
int seq;
#endif
/* Payload. Only one member is valid, selected by down == MAGIC_TEXT.
* Both trailing arrays are declared with one element but allocated
* with room for the whole zero-terminated string. */
union
{
/* Text node: the text content. */
char text[1];
/* Tag node: attribute list and tag name. */
struct
{
struct attribute *atts;
char name[1];
} d;
} u;
};
/* Sentinel stored in a node's down pointer to mark it as a text node. */
#define MAGIC_TEXT ((fz_xml *)1)
/* True if item is non-NULL and is a text node. The argument is
 * parenthesized so that any pointer expression (e.g. &node, p + 1)
 * expands correctly. Note that item is evaluated twice. */
#define FZ_TEXT_ITEM(item) ((item) && (item)->down == MAGIC_TEXT)
/* Write two spaces to stdout for each of the n indentation levels. */
static void xml_indent(int n)
{
	for (; n; --n)
	{
		putchar(' ');
		putchar(' ');
	}
}
/*
	Pretty-print an XML tree to stdout.

	item: node to print; NULL prints nothing.
	level: nesting depth, rendered as two spaces per level.

	A text node is printed as a double-quoted string with C-style
	escapes. A tag node is printed as "(name", then one "=name value"
	line per attribute, then its children one level deeper, then
	")name".
*/
void fz_debug_xml(fz_xml *item, int level)
{
	char *s;

	/* The other accessors in this file accept NULL; without this
	 * guard the tag branch below would dereference a NULL item. */
	if (!item)
		return;

	s = fz_xml_text(item);
	if (s)
	{
		int c;
		xml_indent(level);
		putchar('"');
		/* Read each byte as unsigned so that bytes >= 0x80 are seen
		 * as large positive values whatever the signedness of char. */
		while ((c = (unsigned char)*s++))
		{
			switch (c)
			{
			default:
				/* Hex-escape control characters, DEL and non-ASCII bytes. */
				if (c < 32 || c >= 127)
				{
					putchar('\\');
					putchar('x');
					putchar("0123456789ABCDEF"[(c>>4) & 15]);
					putchar("0123456789ABCDEF"[(c) & 15]);
				}
				else
				{
					putchar(c);
				}
				break;
			case '\\': putchar('\\'); putchar('\\'); break;
			case '\b': putchar('\\'); putchar('b'); break;
			case '\f': putchar('\\'); putchar('f'); break;
			case '\n': putchar('\\'); putchar('n'); break;
			case '\r': putchar('\\'); putchar('r'); break;
			case '\t': putchar('\\'); putchar('t'); break;
			}
		}
		putchar('"');
#ifdef FZ_XML_SEQ
		printf(" <%d>", item->seq);
#endif
		putchar('\n');
	}
	else
	{
		fz_xml *child;
		struct attribute *att;

		xml_indent(level);
#ifdef FZ_XML_SEQ
		printf("(%s <%d>\n", item->u.d.name, item->seq);
#else
		printf("(%s\n", item->u.d.name);
#endif
		for (att = item->u.d.atts; att; att = att->next)
		{
			xml_indent(level);
			printf("=%s %s\n", att->name, att->value);
		}
		for (child = fz_xml_down(item); child; child = child->next)
			fz_debug_xml(child, level + 1);
		xml_indent(level);
#ifdef FZ_XML_SEQ
		printf(")%s <%d>\n", item->u.d.name, item->seq);
#else
		printf(")%s\n", item->u.d.name);
#endif
	}
}
/*
	Return the previous sibling of an XML node, or NULL if item
	is NULL.
*/
fz_xml *fz_xml_prev(fz_xml *item)
{
	if (!item)
		return NULL;
	return item->prev;
}
/*
	Return the next sibling of an XML node, or NULL if item
	is NULL.
*/
fz_xml *fz_xml_next(fz_xml *item)
{
	if (!item)
		return NULL;
	return item->next;
}
/*
	Return the parent of an XML node, or NULL if item is NULL.
*/
fz_xml *fz_xml_up(fz_xml *item)
{
	if (!item)
		return NULL;
	return item->up;
}
/*
	Return the first child of an XML node.
	Return NULL if item is NULL or is a text node.
*/
fz_xml *fz_xml_down(fz_xml *item)
{
	if (!item || FZ_TEXT_ITEM(item))
		return NULL;
	return item->down;
}
/*
	Return the text content of an XML node.
	Return NULL if item is NULL or is a tag rather than text.
*/
char *fz_xml_text(fz_xml *item)
{
	if (!item)
		return NULL;
	if (!FZ_TEXT_ITEM(item))
		return NULL;
	return item->u.text;
}
/*
	Return the tag name of an XML node.
	Return NULL if item is NULL, is a text node, or has an empty name.
*/
char *fz_xml_tag(fz_xml *item)
{
	if (!item || FZ_TEXT_ITEM(item))
		return NULL;
	if (item->u.d.name[0] == 0)
		return NULL;
	return item->u.d.name;
}
/*
	Return true if item is a tag node whose name is exactly name.
	NULL items and text nodes never match.
*/
int fz_xml_is_tag(fz_xml *item, const char *name)
{
	if (item && !FZ_TEXT_ITEM(item))
		return strcmp(item->u.d.name, name) == 0;
	return 0;
}
/*
	Return the value of the named attribute of an XML node.
	Return NULL if item is NULL, is a text node, or has no
	attribute with that name.
*/
char *fz_xml_att(fz_xml *item, const char *name)
{
	struct attribute *att;

	if (item && !FZ_TEXT_ITEM(item))
	{
		att = item->u.d.atts;
		while (att)
		{
			if (strcmp(att->name, name) == 0)
				return att->value;
			att = att->next;
		}
	}
	return NULL;
}
/*
	Search item and its following siblings for the first tag node
	named tag. Return that node, or NULL if there is none (or if
	item is NULL).

	Text nodes are skipped: their payload is u.text, which overlays
	u.d, and they are allocated with room for the text only, so
	reading u.d.name on one would compare against the wrong bytes
	and could read past the end of the node.
*/
fz_xml *fz_xml_find(fz_xml *item, const char *tag)
{
	while (item)
	{
		if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.d.name, tag))
			return item;
		item = item->next;
	}
	return NULL;
}
/*
	Like fz_xml_find, but start the search at the sibling after item.
*/
fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
{
	fz_xml *start = item ? item->next : item;
	return fz_xml_find(start, tag);
}
/*
	Like fz_xml_find, but search the children of item.
*/
fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
{
	fz_xml *start = item ? fz_xml_down(item) : item;
	return fz_xml_find(start, tag);
}
/*
	Return the root node of a document, or NULL if xml is NULL.
*/
fz_xml *fz_xml_root(fz_xml_doc *xml)
{
	if (!xml)
		return NULL;
	return xml->root;
}
/*
	Release a parsed document by dropping the pool it owns.
	Does nothing if xml is NULL.
*/
void fz_drop_xml(fz_context *ctx, fz_xml_doc *xml)
{
	if (!xml)
		return;
	fz_drop_pool(ctx, xml->pool);
}
/*
	Detach a node from the tree and make it the document root.
	If the node has a parent, the parent's child pointer is cleared.
*/
void fz_detach_xml(fz_context *ctx, fz_xml_doc *xml, fz_xml *node)
{
	fz_xml *parent = node->up;

	if (parent)
		parent->down = NULL;
	xml->root = node;
}
/*
	Decode the character or entity reference that starts at a
	(a[0] is the '&').

	c: receives the decoded code point.
	Returns the number of input bytes consumed.

	Handles decimal (&#NN;) and hexadecimal (&#xNN;) references,
	the five XML entities, and the names in html_entities. If
	nothing matches, the '&' itself is returned as a literal
	character and one byte is consumed.
*/
static size_t xml_parse_entity(int *c, char *a)
{
	char *b;
	size_t i;

	if (a[1] == '#') {
		char *digits;
		if (a[2] == 'x') {
			digits = a + 3;
			*c = strtol(digits, &b, 16);
		} else {
			digits = a + 2;
			*c = strtol(digits, &b, 10);
		}
		/* Require at least one digit. strtol() leaves b == digits
		 * when it converts nothing, so "&#;" and "&#x;" would
		 * otherwise decode to U+0000 and truncate the text. */
		if (b != digits && *b == ';')
			return b - a + 1;
	}
	else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
		*c = '<';
		return 4;
	}
	else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
		*c = '>';
		return 4;
	}
	else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
		*c = '&';
		return 5;
	}
	else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
		*c = '\'';
		return 6;
	}
	else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
		*c = '"';
		return 6;
	}

	/* We should only be doing this for XHTML, but it shouldn't be a problem. */
	for (i = 0; i < nelem(html_entities); ++i) {
		size_t n = strlen(html_entities[i].name);
		if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
			*c = html_entities[i].c;
			return n + 2;
		}
	}

	/* Not a recognised reference: pass the '&' through unchanged. */
	*c = *a;
	return 1;
}
/* Return 1 if c may appear in a tag or attribute name (ASCII letters,
 * digits, '.', '-', '_' and ':'), 0 otherwise. */
static inline int isname(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	switch (c)
	{
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		return 0;
	}
}
/* Return 1 if c is a space, carriage return, newline or tab, 0 otherwise. */
static inline int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\r':
	case '\n':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
/*
	Look up an HTML tag name in html_tags by binary search.

	tag: start of the name; it need not be zero-terminated.
	len: number of characters in the name.
	Returns the tag number (index into html_tags), or
	fz_xml_html_tag__NONE if the name is not a known tag.

	The comparison ignores case. Callers pass tag names exactly as
	they are spelled in the input, and HTML tag names are
	case-insensitive, so "<TD>" must find the same entry as "<td>".
	The table names are generated from the lowercase enum
	identifiers, so a case-insensitive comparison agrees with the
	table's sort order.
*/
static int
find_html_tag(const char *tag, int len)
{
	int low = 0;
	int high = nelem(html_tags);

	while (low != high)
	{
		int mid = (low + high)>>1;
		int cmp = fz_strncasecmp(html_tags[mid].tag, tag, len);
		/* Equal over len characters: it is a match only if the
		 * table name also ends here; otherwise the table name is
		 * longer and so sorts after tag. */
		if (cmp == 0)
			cmp = html_tags[mid].tag[len];
		if (cmp == 0)
			return mid;
		if (cmp < 0)
			low = mid+1;
		else
			high = mid;
	}
	return fz_xml_html_tag__NONE;
}
/*
	Allocate a new node, append it as the last child of the current
	head, and make it the new head.

	a, b: start and end of the tag name (is_text == 0), or of the raw
		text span (is_text != 0). For text nodes only the space is
		reserved here; the caller fills in the text.
	is_text: non-zero to create a text node.

	Returns non-zero if the node is an HTML tag that must not have an
	end tag (END_FORBIDDEN), in which case the caller should close it
	as soon as its attributes have been read. Always 0 for text nodes
	and when not parsing HTML.
*/
static int xml_emit_open_tag(fz_context *ctx, struct parser *parser, char *a, char *b, int is_text)
{
	fz_xml *head, *tail;
	size_t len = b - a;
	size_t size;
	int autoclose = 0;

	/* Room for the fixed part of the node plus the string and its
	 * terminator. The length is computed first so that the sum is
	 * plain integer arithmetic. */
	if (is_text)
		size = offsetof(fz_xml, u.text) + len + 1;
	else
		size = offsetof(fz_xml, u.d.name) + len + 1;
	head = fz_pool_alloc(ctx, parser->pool, size);

	if (is_text)
		head->down = MAGIC_TEXT;
	else
	{
		if (parser->for_html)
		{
			/* Lowercase the tag */
			char *s = head->u.d.name;
			char *t = a;
			while (t != b)
			{
				char c = *t++;
				if (c >= 'A' && c <= 'Z')
					c += 'a' - 'A';
				*s++ = c;
			}
		}
		else
			memcpy(head->u.d.name, a, len);
		head->u.d.name[len] = 0;
		head->u.d.atts = NULL;
		head->down = NULL;

		if (parser->for_html)
		{
			/* Look up the lowercased name rather than the raw
			 * input, so that "<BR>" is recognised like "<br>". */
			int tag_num = find_html_tag(head->u.d.name, (int)len);
			if (tag_num != fz_xml_html_tag__NONE && (html_tags[tag_num].flags & END_FORBIDDEN))
				autoclose = 1;
		}
	}

	head->up = parser->head;
	head->next = NULL;
#ifdef FZ_XML_SEQ
	head->seq = parser->seq++;
#endif

	/* During construction, we use head->next to mean "the
	 * tail of the children". When we close the tag, we
	 * rewrite it to be NULL. */
	if (!parser->head->down) {
		/* First child of the current head. */
		parser->head->down = head;
		parser->head->next = head;
		head->prev = NULL;
	}
	else {
		/* Append after the current last child. */
		tail = parser->head->next;
		tail->next = head;
		head->prev = tail;
		parser->head->next = head;
	}

	parser->head = head;
	parser->depth++;

	return autoclose;
}
/*
	Create a new attribute named by the span [a, b) with no value yet,
	and push it onto the front of the current head's attribute list.
*/
static void xml_emit_att_name(fz_context *ctx, struct parser *parser, char *a, char *b)
{
	fz_xml *owner = parser->head;
	size_t len = b - a;
	struct attribute *att;

	att = fz_pool_alloc(ctx, parser->pool, offsetof(struct attribute, name) + len + 1);
	memcpy(att->name, a, len);
	att->name[len] = 0;
	att->value = NULL;

	att->next = owner->u.d.atts;
	owner->u.d.atts = att;
}
/*
	Set the value of the most recently added attribute of the current
	head to the span [a, b), decoding entity references to UTF-8.
*/
static void xml_emit_att_value(fz_context *ctx, struct parser *parser, char *a, char *b)
{
	struct attribute *att = parser->head->u.d.atts;
	char *out;
	int rune;

	/* The buffer is sized for the raw span; the original reasoning is
	 * that every entity is longer than its UTF-8 encoding, so the
	 * decoded text cannot outgrow it. */
	out = fz_pool_alloc(ctx, parser->pool, b - a + 1);
	att->value = out;

	while (a < b)
	{
		if (*a != '&')
		{
			*out++ = *a++;
			continue;
		}
		a += xml_parse_entity(&rune, a);
		out += fz_runetochar(out, rune);
	}
	*out = 0;
}
/*
	Close the current head: drop one nesting level, reset the node's
	next pointer (used as a child-tail pointer while it was open), and
	step the head back up to the parent if there is one.
*/
static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
{
	fz_xml *closing = parser->head;

	parser->depth--;
	closing->next = NULL;
	if (closing->up)
		parser->head = closing->up;
}
/*
	Add a text node for the span [a, b) under the current head,
	decoding entity references to UTF-8. Text outside any open tag is
	dropped, as is all-whitespace text unless preserve_white is set.
*/
static void xml_emit_text(fz_context *ctx, struct parser *parser, char *a, char *b)
{
	char *out;
	int rune;

	/* Skip text outside the root tag */
	if (parser->depth == 0)
		return;

	/* Skip all-whitespace text nodes */
	if (!parser->preserve_white)
	{
		char *scan = a;
		while (scan < b && iswhite(*scan))
			++scan;
		if (scan == b)
			return;
	}

	(void)xml_emit_open_tag(ctx, parser, a, b, 1);

	/* The node is sized for the raw span; the original reasoning is
	 * that every entity is longer than its UTF-8 encoding. */
	out = fz_xml_text(parser->head);
	while (a < b)
	{
		if (*a != '&')
		{
			*out++ = *a++;
			continue;
		}
		a += xml_parse_entity(&rune, a);
		out += fz_runetochar(out, rune);
	}
	*out = 0;

	xml_emit_close_tag(ctx, parser);
}
/*
	Add a text node holding the span [a, b) verbatim (no entity
	decoding) under the current head.
*/
static void xml_emit_cdata(fz_context *ctx, struct parser *parser, char *a, char *b)
{
	char *out;

	(void)xml_emit_open_tag(ctx, parser, a, b, 1);
	out = parser->head->u.text;
	for (; a < b; ++a)
		*out++ = *a;
	*out = 0;
	xml_emit_close_tag(ctx, parser);
}
/*
	Handle a closing tag whose name is the span [mark, p).

	Walk up from the current head looking for an open tag with that
	name (ignoring case when parsing HTML). If one is found, close
	every node below it, leaving the match itself as the current head
	for the caller to close, and return 1. If none is found, close
	nothing and return 0.
*/
static int
pop_to_tag(fz_context *ctx, struct parser *parser, char *mark, char *p)
{
	fz_xml *to, *head;

	/* Run up from the tag */
	for (to = parser->head; to; to = to->up)
	{
		char *tag = fz_xml_tag(to);
		int differs;

		if (!tag)
			continue;
		if (parser->for_html)
			differs = fz_strncasecmp(tag, mark, p-mark);
		else
			differs = strncmp(tag, mark, p-mark);
		if (differs == 0 && tag[p-mark] == 0)
			break; /* Found a matching tag */
	}

	/* We didn't find a matching enclosing tag. Don't close anything. */
	if (to == NULL)
		return 0;

	/* Pop everything below the matching tag. */
	head = parser->head;
	while (head != to)
	{
		xml_emit_close_tag(ctx, parser);
		head = head->up;
	}

	return 1;
}
/*
Before opening HTML tag number 'tag', make sure a parent that the tag
implies is open, opening one (recursively, with its own implied
parents) if none is. For table tags, first autoclose open table tags
of the same or a deeper level within the current table.
Does nothing for fz_xml_html_tag__NONE or for tags that imply nothing.
*/
static void
open_implied(fz_context *ctx, struct parser *parser, int tag)
{
fz_xml *head;
int implied, implied2, implied3, tag_num;
int table_level;
if (tag == fz_xml_html_tag__NONE)
return;
/* Unpack the (up to three) alternative implied parents. */
implied = (html_tags[tag].flags >> IMPLIES_SHIFT) & IMPLIES_MASK;
implied2 = (html_tags[tag].flags >> IMPLIES_SHIFT2) & IMPLIES_MASK;
implied3 = (html_tags[tag].flags >> IMPLIES_SHIFT3) & IMPLIES_MASK;
if (implied == fz_xml_html_tag__NONE)
return;
/* Unused alternatives fall back to the first one, so the
* comparisons below can always test all three. */
if (implied2 == fz_xml_html_tag__NONE)
implied2 = implied;
if (implied3 == fz_xml_html_tag__NONE)
implied3 = implied;
/* So, check to see whether implied{,2,3} is present. */
table_level = (html_tags[tag].flags>>TABLE_SHIFT) & TABLE_MASK;
if (table_level != 0)
{
/* Table tag. Autoclose anything within the current TABLE
* with >= table_level. */
fz_xml *close_to = NULL;
int implied_found = 0;
/* Walk up to (and including) the nearest enclosing table, recording
* the outermost node that must be closed and whether an implied
* parent is already open. */
for (head = parser->head; head; head = head->up)
{
/* Note: this local 'tag' shadows the int parameter of the same name. */
char *tag = fz_xml_tag(head);
int level;
if (tag == NULL)
continue;
tag_num = find_html_tag(tag, strlen(tag));
level = (html_tags[tag_num].flags>>TABLE_SHIFT) & TABLE_MASK;
if (level >= table_level)
close_to = head;
if (tag_num == implied || tag_num == implied2 || tag_num == implied3)
implied_found = 1;
if (tag_num == fz_xml_html_tag_table)
break;
}
if (close_to)
{
/* Close from the current head up to and including close_to. */
for (head = parser->head; head; head = head->up)
{
xml_emit_close_tag(ctx, parser);
if (head == close_to)
break;
}
}
if (!implied_found)
{
/* No implied parent is open: open the first alternative,
* after opening whatever it implies in turn. */
char *tag = html_tags[implied].tag;
open_implied(ctx, parser, implied);
xml_emit_open_tag(ctx, parser, tag, tag + strlen(tag), 0);
}
}
else
{
/* Non table tag. Open by implication. */
/* Search every ancestor for one of the implied parents. */
for (head = parser->head; head; head = head->up)
{
char *tag = fz_xml_tag(head);
if (tag == NULL)
continue;
tag_num = find_html_tag(tag, strlen(tag));
if (tag_num == implied || tag_num == implied2 || tag_num == implied3)
break;
}
if (head == NULL)
{
/* None found: open the first alternative, after opening
* whatever it implies in turn. */
char *s = html_tags[implied].tag;
open_implied(ctx, parser, implied);
(void)xml_emit_open_tag(ctx, parser, s, s+strlen(s), 0);
}
}
}
/* When we meet a new tag, before we open it, there may be
* things we should do first...
*
* mark, p: start and end of the name of the tag about to be opened.
*
* Only acts when parsing HTML and the name is a known HTML tag:
*  1. If the tag is not a CONTAINER, look up the open tags for one
*     with the same name and close everything from the current head
*     up to and including it. The search stops at the first CONTAINER
*     tag that is not itself a match.
*  2. Open any parent tags that the new tag implies. */
static void
pre_open_tag(fz_context *ctx, struct parser *parser, char *mark, char *p)
{
fz_xml *head = parser->head;
int tag_num;
if (!parser->for_html)
return;
tag_num = find_html_tag(mark, p-mark);
if (tag_num == fz_xml_html_tag__NONE)
return;
if ((html_tags[tag_num].flags & CONTAINER) == 0)
{
/* We aren't a container flag. This means that we should autoclose up to
* any matching tags in the same container. */
fz_xml *which;
for (which = head; which; which = which->up)
{
char *tag = fz_xml_tag(which);
/* Nodes with no tag name are treated as unknown tags. */
int tag_num2 = tag ? find_html_tag(tag, strlen(tag)) : fz_xml_html_tag__NONE;
if (tag_num == tag_num2)
{
/* Autoclose everything from head to which inclusive */
while (1)
{
/* Test before closing, since head is advanced afterwards. */
int done = (head == which);
xml_emit_close_tag(ctx, parser);
head = head->up;
if (done)
break;
}
break;
}
if (html_tags[tag_num2].flags & CONTAINER)
{
/* Stop searching */
break;
}
}
}
/* Now, autoopen any tags implied by this one. */
open_implied(ctx, parser, tag_num);
}
/*
	Return the start of the local part of the name [mark, p), i.e. the
	character after the last ':' in the span. A ':' that is the final
	character of the span is not treated as a separator. Returns mark
	unchanged if there is no prefix.
*/
static char *
skip_namespace_prefix(char *mark, char *p)
{
	char *local = mark;
	char *cur = mark;

	while (cur + 1 < p)
	{
		if (*cur == ':')
			local = cur + 1;
		++cur;
	}
	return local;
}
/*
Parse the zero-terminated document at p, building nodes through the
xml_emit_* functions. Written as a goto-driven state machine with one
label per syntactic state.

Returns NULL when the end of the data is reached while reading text
(the normal end of a document), or a static error message string on a
syntax error or premature end of data.
*/
static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, char *p)
{
char *mark;
int quote;
int autoclose;
char *q;
/* Character data up to the next '<' or the end of input. */
parse_text:
mark = p;
while (*p && *p != '<') ++p;
if (*p == '<') {
/* skip trailing newline before closing tag */
if (p[1] == '/' && mark < p - 1 && p[-1] == '\n')
xml_emit_text(ctx, parser, mark, p - 1);
else if (mark < p)
xml_emit_text(ctx, parser, mark, p);
++p;
goto parse_element;
} else if (mark < p)
xml_emit_text(ctx, parser, mark, p);
return NULL;
/* Just after '<': dispatch on the next character. */
parse_element:
if (*p == '/') { ++p; goto parse_closing_element; }
if (*p == '!') { ++p; goto parse_comment; }
if (*p == '?') { ++p; goto parse_processing_instruction; }
while (iswhite(*p)) ++p;
if (isname(*p))
goto parse_element_name;
return "syntax error in element";
/* Just after "<!": a DOCTYPE/ENTITY declaration, a CDATA section or a
* comment. Comments are skipped without producing a node. */
parse_comment:
if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
goto parse_declaration;
if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
goto parse_declaration;
if (*p == '[') goto parse_cdata;
if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
while (*p) {
if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
p += 3;
goto parse_text;
}
++p;
}
return "end of data in comment";
/* Declarations are skipped up to the first '>'. */
parse_declaration:
while (*p) if (*p++ == '>') goto parse_text;
return "end of data in declaration";
/* p is at the '[' of "<![CDATA[": emit everything up to "]]>" verbatim. */
parse_cdata:
if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
return "syntax error in CDATA section";
p += 7;
mark = p;
while (*p) {
if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
xml_emit_cdata(ctx, parser, mark, p);
p += 3;
goto parse_text;
}
++p;
}
return "end of data in CDATA section";
/* Processing instructions ("<? ... ?>") are skipped. */
parse_processing_instruction:
while (*p) {
if (p[0] == '?' && p[1] == '>') {
p += 2;
goto parse_text;
}
++p;
}
return "end of data in processing instruction";
/* Just after "</": read the name, then close the matching open tag
* (and everything inside it) if there is one; otherwise ignore it. */
parse_closing_element:
while (iswhite(*p)) ++p;
mark = p;
while (isname(*p)) ++p;
/* q marks the end of the name, before any trailing whitespace. */
q = p;
while (iswhite(*p)) ++p;
if (*p != '>')
return "syntax error in closing element";
mark = skip_namespace_prefix(mark, q);
if (pop_to_tag(ctx, parser, mark, q))
xml_emit_close_tag(ctx, parser);
++p;
goto parse_text;
/* Opening tag name. autoclose is set here and remains valid while the
* tag's attributes are parsed. */
parse_element_name:
mark = p;
while (isname(*p)) ++p;
mark = skip_namespace_prefix(mark, p);
pre_open_tag(ctx, parser, mark, p);
autoclose = xml_emit_open_tag(ctx, parser, mark, p, 0);
if (*p == '>') {
if (autoclose)
xml_emit_close_tag(ctx, parser);
++p;
if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */
goto parse_text;
}
if (p[0] == '/' && p[1] == '>') {
xml_emit_close_tag(ctx, parser);
p += 2;
goto parse_text;
}
if (iswhite(*p))
goto parse_attributes;
return "syntax error after element name";
/* Between attributes: another attribute, or the end of the tag. */
parse_attributes:
while (iswhite(*p)) ++p;
if (isname(*p))
goto parse_attribute_name;
if (*p == '>') {
if (autoclose)
xml_emit_close_tag(ctx, parser);
++p;
if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */
goto parse_text;
}
if (p[0] == '/' && p[1] == '>') {
xml_emit_close_tag(ctx, parser);
p += 2;
goto parse_text;
}
return "syntax error in attributes";
/* Attribute name; it must be followed by '='. */
parse_attribute_name:
mark = p;
while (isname(*p)) ++p;
xml_emit_att_name(ctx, parser, mark, p);
while (iswhite(*p)) ++p;
if (*p == '=') { ++p; goto parse_attribute_value; }
return "syntax error after attribute name";
/* Attribute value; it must be enclosed in single or double quotes. */
parse_attribute_value:
while (iswhite(*p)) ++p;
quote = *p++;
if (quote != '"' && quote != '\'')
return "missing quote character";
mark = p;
while (*p && *p != quote) ++p;
if (*p == quote) {
xml_emit_att_value(ctx, parser, mark, p++);
goto parse_attributes;
}
return "end of data in attribute value";
}
/* Return 1 if a begins with b, ignoring case; 0 otherwise. */
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);
	return fz_strncasecmp(a, b, prefix_len) == 0;
}
/*
	Look for an encoding declaration in the XML prolog and return the
	matching 8-bit-to-Unicode table, or NULL if there is no "<?xml"
	with an "encoding=" before the first '>', or the encoding named is
	not one of the supported ones.

	s is temporarily modified (the first '>' is replaced by a
	terminator to bound the search) and restored before returning.
*/
static const unsigned short *find_xml_encoding(char *s)
{
	const unsigned short *table = NULL;
	char *end, *xml, *enc;

	end = strchr(s, '>');
	if (end)
	{
		*end = 0;
		xml = strstr(s, "<?xml");
		if (xml)
		{
			enc = strstr(xml, "encoding=");
			if (enc)
			{
				/* Step over "encoding=" and then over the opening
				 * quote only if one is really there. Skipping a
				 * fixed extra character would step past the
				 * temporary terminator when the value is missing,
				 * and would eat the first letter of an unquoted
				 * value. */
				enc += 9;
				if (*enc == '"' || *enc == '\'')
					++enc;
				if (startswith(enc, "iso-8859-1") || startswith(enc, "latin1"))
					table = fz_unicode_from_iso8859_1;
				else if (startswith(enc, "iso-8859-7") || startswith(enc, "greek"))
					table = fz_unicode_from_iso8859_7;
				else if (startswith(enc, "koi8"))
					table = fz_unicode_from_koi8u;
				else if (startswith(enc, "windows-1250"))
					table = fz_unicode_from_windows_1250;
				else if (startswith(enc, "windows-1251"))
					table = fz_unicode_from_windows_1251;
				else if (startswith(enc, "windows-1252"))
					table = fz_unicode_from_windows_1252;
			}
		}
		*end = '>';
	}

	return table;
}
/*
	Return the document text as zero-terminated UTF-8.

	s, n: the input bytes and their count. The data at s must be
		zero-terminated (the 8-bit conversion loop relies on it).
	dofree: set to 1 if the result is a newly allocated buffer that
		the caller must release with fz_free, or to 0 if the
		result points into s.

	Input starting with a UTF-16 byte order mark (either byte order)
	is converted from UTF-16. Otherwise, if the XML prolog names a
	supported 8-bit encoding, the input is converted through that
	encoding's table. Otherwise the input is returned as is, minus a
	leading UTF-8 byte order mark if present.
*/
static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
{
	const unsigned short *table;
	const unsigned char *e = s + n;
	char *dst, *d;
	int c;

	if ((s[0] == 0xFE && s[1] == 0xFF) || (s[0] == 0xFF && s[1] == 0xFE)) {
		/* Offsets of the high and low byte within each 16-bit unit:
		 * FE FF is big-endian, FF FE is little-endian. */
		int hi = (s[0] == 0xFE) ? 0 : 1;
		int lo = 1 - hi;
		s += 2;
		dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
		while (s + 1 < e) {
			c = s[hi] << 8 | s[lo];
			s += 2;
			/* Combine a lead surrogate with the trail surrogate that
			 * follows it into one code point, so that characters
			 * outside the BMP are converted as a single character.
			 * An unpaired surrogate is passed through on its own. */
			if (c >= 0xD800 && c <= 0xDBFF && s + 1 < e) {
				int trail = s[hi] << 8 | s[lo];
				if (trail >= 0xDC00 && trail <= 0xDFFF) {
					c = 0x10000 + ((c - 0xD800) << 10) + (trail - 0xDC00);
					s += 2;
				}
			}
			d += fz_runetochar(d, c);
		}
		*d = 0;
		*dofree = 1;
		return dst;
	}

	table = find_xml_encoding((char*)s);
	if (table) {
		dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
		while (*s) {
			c = table[*s++];
			d += fz_runetochar(d, c);
		}
		*d = 0;
		*dofree = 1;
		return dst;
	}

	*dofree = 0;

	/* Skip a UTF-8 byte order mark. */
	if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
		return (char*)s+3;

	return (char*)s;
}
/*
Parse the contents of buffer into a tree of xml nodes.
buf: the document text; it is zero-terminated in place before parsing.
preserve_white: whether to keep or delete all-whitespace nodes.
for_html: whether to apply the HTML heuristics while parsing.
Returns the new document. Throws FZ_ERROR_GENERIC carrying the parser's
message if the text cannot be parsed; in that case the pool holding the
partial tree is dropped.
*/
fz_xml_doc *
fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white, int for_html)
{
struct parser parser;
fz_xml_doc *xml = NULL;
/* root is a temporary parent for the top-level nodes; it lives on the
* stack and is not part of the returned tree. */
fz_xml root, *node;
char *p = NULL;
char *error;
int dofree = 0;
unsigned char *s;
size_t n;
/* dofree and p are assigned inside fz_try and read in fz_always. */
fz_var(dofree);
fz_var(p);
/* ensure we are zero-terminated */
fz_terminate_buffer(ctx, buf);
n = fz_buffer_storage(ctx, buf, &s);
memset(&root, 0, sizeof(root));
parser.pool = fz_new_pool(ctx);
parser.head = &root;
parser.preserve_white = preserve_white;
parser.for_html = for_html;
parser.depth = 0;
#ifdef FZ_XML_SEQ
parser.seq = 0;
#endif
fz_try(ctx)
{
p = convert_to_utf8(ctx, s, n, &dofree);
error = xml_parse_document_imp(ctx, &parser, p);
if (error)
fz_throw(ctx, FZ_ERROR_GENERIC, "%s", error);
/* Any node still open at the end of input has its next pointer in
* use as a child-tail pointer; reset those. */
for (node = parser.head; node; node = node->up)
node->next = NULL;
/* Unhook the top-level nodes from the temporary stack root. */
for (node = root.down; node; node = node->next)
node->up = NULL;
/* The document struct lives in the same pool as the nodes. */
xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
xml->pool = parser.pool;
xml->root = root.down;
}
fz_always(ctx)
{
/* Release the converted copy of the text, if one was made. */
if (dofree)
fz_free(ctx, p);
}
fz_catch(ctx)
{
fz_drop_pool(ctx, parser.pool);
fz_rethrow(ctx);
}
return xml;
}