eBookReaderSwitch/mupdf/source/fitz/bidi.c

590 lines
14 KiB
C
Raw Normal View History

/*
* Bidirectional text processing.
*
* Processes unicode text by arranging the characters into an order suitable
* for display. E.g. Hebrew text will be arranged from right-to-left and
* any English within the text will remain in the left-to-right order.
* Characters such as parentheses will be replaced by their mirrored
* equivalents if they are part of text which must be reversed.
*
* This is an implementation of the unicode Bidirectional Algorithm which
* can be found here: http://www.unicode.org/reports/tr9/ and is based
* on the reference implementation of the algorithm found on that page.
*
* For a nice overview of how it works, read this...
* http://www.w3.org/TR/REC-html40/struct/dirlang.html
*
* Extracted from the SmartOffice code, where it was modified by Ian
* Beveridge.
*
* Copyright (C) Picsel, 2004. All Rights Reserved.
*/
/*
* Original copyright notice from unicode reference implementation.
* ----------------------------------------------------------------
* Written by: Asmus Freytag
* C++ and Windows dependencies removed, and
* command line interface added by: Rick McGowan
*
* Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
*/
/*
* Includes...
*/
#include "mupdf/fitz.h"
#include "mupdf/ucdn.h"
#include "bidi-imp.h" /* standard bidi code interface */
#include <assert.h>
/*
* Macros...
*/
/* Non-zero when x is odd. Used on embedding levels: an odd level is
* treated as right-to-left, an even level as left-to-right. */
#define ODD(x) ((x) & 1)
/* True when t is one of the types ES, ET, CS, NSM, PDF, BN, S, WS or N.
* Note that t is evaluated several times, so pass a side-effect free
* expression. */
#define REPLACEABLE_TYPE(t) ( \
((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )
/* Verbose debug output. params must be a complete parenthesised
* argument list, e.g. DBUGVF(("fmt %d", value)); it is handed to
* fz_warn when DEBUG_BIDI_VERBOSE is defined and discarded otherwise. */
#ifdef DEBUG_BIDI_VERBOSE
#define DBUGVF(params) do { fz_warn params; } while (0)
#else
#define DBUGVF(params) do {} while (0)
#endif
/* Outline-level debug output. Same calling convention as DBUGVF, but
* controlled by DEBUG_BIDI_OUTLINE. */
#ifdef DEBUG_BIDI_OUTLINE
#define DBUGH(params) do { fz_warn params; } while (0)
#else
#define DBUGH(params) do {} while (0)
#endif
/* Named Unicode code points. */
#define UNICODE_EOS 0
#define UNICODE_DIGIT_ZERO 0x0030
#define UNICODE_DIGIT_NINE 0x0039
#define UNICODE_SUPERSCRIPT_TWO 0x00B2
#define UNICODE_SUPERSCRIPT_THREE 0x00B3
#define UNICODE_SUPERSCRIPT_ONE 0x00B9
/* NOTE(review): presumably the first and last code points of the range
* this code treats as right-to-left scripts; confirm against the users
* of these two constants. */
#define UNICODE_RTL_START 0x0590
#define UNICODE_RTL_END 0x07BF
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660
#define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9
#define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C
#define UNICODE_SUPERSCRIPT_ZERO 0x2070
#define UNICODE_SUPERSCRIPT_FOUR 0x2074
#define UNICODE_SUPERSCRIPT_NINE 0x2079
#define UNICODE_SUBSCRIPT_ZERO 0x2080
#define UNICODE_SUBSCRIPT_NINE 0x2089
#define UNICODE_CIRCLED_DIGIT_ONE 0x2460
#define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B
#define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA
#define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10
#define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19
/* Boolean constants, defined only if the environment has not already
* provided them. */
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif
/*
* Enumerations...
*/
#ifdef DEBUG_BIDI_VERBOSE
/* Display support for verbose debugging: one printable character per
* bidi character type, indexed by the numeric type value (see the
* "Types:" trace printed by classify_characters). The trailing comment
* on each entry names the type that slot is intended for.
* NOTE(review): the entry order presumably mirrors the BDI_* enumeration
* declared in bidi-imp.h; keep the two in step. */
static const char char_from_types[] =
{
' ', /* ON */
'>', /* L */
'<', /* R */
'9', /* AN */
'1', /* EN */
'a', /* AL */
'@', /* NSM */
'.', /* CS */
',', /* ES */
'$', /* ET */
':', /* BN */
'X', /* S */
'_', /* WS */
'B', /* B */
'+', /* RLO */
'+', /* RLE */
'+', /* LRO */
'+', /* LRE */
'-', /* PDF */
'=' /* LS */
};
#endif
/*
* Functions and static functions...
*/
/* UCDN uses a different ordering than Bidi does. We cannot
* change to the UCDN ordering, as the bidi-std.c code relies
* on the exact ordering (at least that N = ON = 0). We
* therefore map between the two using this small table, which
* is indexed by the UCDN bidi class value (shown in the comment
* on each entry) and yields the corresponding BDI_* type.
*
* The table also takes care of fudging the isolate controls
* (LRI, RLI, FSI and PDI), which this code does not currently
* support: LRI and RLI are mapped to the embedding types LRE and
* RLE, while FSI and PDI are mapped to neutral (N). */
static const uint8_t ucdn_to_bidi[] =
{
BDI_L, /* UCDN_BIDI_CLASS_L = 0 */
BDI_LRE, /* UCDN_BIDI_CLASS_LRE = 1 */
BDI_LRO, /* UCDN_BIDI_CLASS_LRO = 2 */
BDI_R, /* UCDN_BIDI_CLASS_R = 3 */
BDI_AL, /* UCDN_BIDI_CLASS_AL = 4 */
BDI_RLE, /* UCDN_BIDI_CLASS_RLE = 5 */
BDI_RLO, /* UCDN_BIDI_CLASS_RLO = 6 */
BDI_PDF, /* UCDN_BIDI_CLASS_PDF = 7 */
BDI_EN, /* UCDN_BIDI_CLASS_EN = 8 */
BDI_ES, /* UCDN_BIDI_CLASS_ES = 9 */
BDI_ET, /* UCDN_BIDI_CLASS_ET = 10 */
BDI_AN, /* UCDN_BIDI_CLASS_AN = 11 */
BDI_CS, /* UCDN_BIDI_CLASS_CS = 12 */
BDI_NSM, /* UCDN_BIDI_CLASS_NSM = 13 */
BDI_BN, /* UCDN_BIDI_CLASS_BN = 14 */
BDI_B, /* UCDN_BIDI_CLASS_B = 15 */
BDI_S, /* UCDN_BIDI_CLASS_S = 16 */
BDI_WS, /* UCDN_BIDI_CLASS_WS = 17 */
BDI_ON, /* UCDN_BIDI_CLASS_ON = 18 */
BDI_LRE, /* UCDN_BIDI_CLASS_LRI = 19 */
BDI_RLE, /* UCDN_BIDI_CLASS_RLI = 20 */
BDI_N, /* UCDN_BIDI_CLASS_FSI = 21 */
BDI_N, /* UCDN_BIDI_CLASS_PDI = 22 */
};
/* Bidi type of code point ch, with white-space classes (S, WS) left
* intact. The lookup is not bounds checked.
* NOTE(review): assumes ucdn_get_bidi_class() never returns a value
* outside the 0..22 range covered by the table above; confirm. */
#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
/* Bidi type of code point ch with white space folded into the neutral
 * class: segment separators (S) and white space (WS) are reported as N,
 * every other character keeps the type given by class_from_ch_ws(). */
static fz_bidi_chartype class_from_ch_n(uint32_t ch)
{
	fz_bidi_chartype cls = class_from_ch_ws(ch);

	return (cls == BDI_S || cls == BDI_WS) ? BDI_N : cls;
}
/* Break a fragment into runs that each hold a single script (possibly
 * mixed with common/inherited characters such as punctuation) and hand
 * every run to the callback.
 *
 * fragment, fragment_len: the code points to split.
 * level: passed through unchanged to every callback invocation.
 * arg: opaque pointer passed through to the callback.
 * callback: invoked as callback(run, run_len, level, script, arg).
 *
 * Nothing is reported for an empty fragment. A fragment made up solely
 * of common/inherited characters is reported as one run with script
 * UCDN_SCRIPT_COMMON.
 */
static void
split_at_script(const uint32_t *fragment,
	size_t fragment_len,
	int level,
	void *arg,
	fz_bidi_fragment_fn *callback)
{
	int current = UCDN_SCRIPT_COMMON;
	size_t run_start = 0;
	size_t pos;

	for (pos = 0; pos < fragment_len; pos++)
	{
		int seen = ucdn_get_script(fragment[pos]);

		/* Punctuation and the like, or more of the script we are
		 * already in: the run simply continues. */
		if (seen == UCDN_SCRIPT_COMMON || seen == UCDN_SCRIPT_INHERITED || seen == current)
			continue;

		if (current != UCDN_SCRIPT_COMMON && current != UCDN_SCRIPT_INHERITED)
		{
			/* A genuine change of script: flush the run so far and
			 * start a new one at this character. */
			(*callback)(&fragment[run_start], pos - run_start, level, current, arg);
			run_start = pos;
		}

		/* Either the first real script of the run or the script of
		 * the run just started. */
		current = seen;
	}

	/* Report whatever is left over. */
	if (run_start != fragment_len)
		(*callback)(&fragment[run_start], fragment_len - run_start, level, current, arg);
}
/* Fill types[0..len) with the bidi character class of each code point in
 * text[0..len). The class describes the kind of bidi behaviour the
 * character exhibits and drives all later passes.
 *
 * When FZ_BIDI_CLASSIFY_WHITE_SPACE is set in flags, white-space classes
 * are kept as they are (class_from_ch_ws); otherwise they are folded into
 * the neutral class (class_from_ch_n). With DEBUG_BIDI_VERBOSE defined,
 * the latter path also traces the text and the resulting types to stderr.
 */
static void
classify_characters(const uint32_t *text,
	fz_bidi_chartype *types,
	size_t len,
	fz_bidi_flags flags)
{
	size_t pos;

	if (flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)
	{
		for (pos = 0; pos < len; pos++)
			types[pos] = class_from_ch_ws(text[pos]);
		return;
	}

#ifdef DEBUG_BIDI_VERBOSE
	fprintf(stderr, "Text: ");
	for (pos = 0; pos < len; pos++)
	{
		uint32_t c = text[pos];

		/* Keep the trace readable: printable ASCII is shown as is,
		 * anything else is shown as a digit derived from the code
		 * point modulo 9. */
		fprintf(stderr, "%c", (c >= 32 && c <= 127) ? c : c % 9 + '0');
	}
	fprintf(stderr, "\nTypes: ");
#endif

	for (pos = 0; pos < len; pos++)
	{
		types[pos] = class_from_ch_n(text[pos]);
#ifdef DEBUG_BIDI_VERBOSE
		fprintf(stderr, "%c", char_from_types[(int)types[pos]]);
#endif
	}

#ifdef DEBUG_BIDI_VERBOSE
	fprintf(stderr, "\n");
#endif
}
/* Work out the base level of the text from its first strong character
 * (rule P2 of the Unicode Bidi Algorithm). Explicit embeddings are
 * ignored.
 *
 * Returns FZ_BIDI_LTR if the first strong type is L, FZ_BIDI_RTL if it
 * is R or AL, and FZ_BIDI_LTR when there is no strong character at all.
 */
static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
{
	size_t pos;

	for (pos = 0; pos < len; pos++)
	{
		fz_bidi_chartype t = types[pos];

		if (t == BDI_L)
			return FZ_BIDI_LTR;
		if (t == BDI_R || t == BDI_AL)
			return FZ_BIDI_RTL;
	}

	return FZ_BIDI_LTR;
}
/* Map a bidi character type to the direction it implies: L and EN give
 * FZ_BIDI_LTR, R and AL give FZ_BIDI_RTL, and every other type gives
 * FZ_BIDI_NEUTRAL. */
static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
{
	if (type == BDI_L || type == BDI_EN)
		return FZ_BIDI_LTR;

	if (type == BDI_R || type == BDI_AL)
		return FZ_BIDI_RTL;

	return FZ_BIDI_NEUTRAL;
}
/* Turn double-quoted spans into explicit embeddings.
 *
 * Only acts on text that contains both left-to-right and right-to-left
 * characters. For each opening '"', the types that follow are scanned up
 * to the next '"' (or the end of the text):
 *   - an existing RLE or LRE stops the scan and leaves the quote alone;
 *   - L or EN reclassifies the opening quote as LRE;
 *   - R or AL reclassifies the opening quote as RLE.
 * When an opening quote was reclassified, the matching closing quote is
 * reclassified as PDF.
 *
 * text, len: the code points being processed.
 * types: per-character types, updated in place.
 */
static void
classify_quoted_blocks(const uint32_t *text,
	fz_bidi_chartype *types,
	size_t len)
{
	size_t pos;
	int in_quote = FALSE;
	int need_pdf = FALSE;
	int saw_ltr = FALSE;
	int saw_rtl = FALSE;

	/* Establish whether the text has mixed directionality. */
	for (pos = 0; pos < len; pos++)
	{
		fz_bidi_direction dir = direction_from_type(types[pos]);

		if (dir == FZ_BIDI_LTR)
			saw_ltr = TRUE;
		else if (dir == FZ_BIDI_RTL)
			saw_rtl = TRUE;
	}

	/* Single-direction text is left exactly as it is. */
	if (!(saw_ltr && saw_rtl))
		return;

	for (pos = 0; pos < len; pos++)
	{
		size_t scan;

		if (text[pos] != '"')
			continue;

		if (in_quote)
		{
			/* Closing quote: pop the embedding if one was opened. */
			in_quote = FALSE;
			if (need_pdf)
			{
				need_pdf = FALSE;
				types[pos] = BDI_PDF;
			}
			continue;
		}

		/* Opening quote: the first strong type before the next quote
		 * decides which embedding (if any) the quote becomes. */
		in_quote = TRUE;
		for (scan = pos + 1; scan < len && text[scan] != '"'; scan++)
		{
			fz_bidi_chartype t = types[scan];

			if (t == BDI_RLE || t == BDI_LRE)
				break;

			if (t == BDI_L || t == BDI_EN)
			{
				types[pos] = BDI_LRE;
				need_pdf = TRUE;
				break;
			}

			if (t == BDI_R || t == BDI_AL)
			{
				types[pos] = BDI_RLE;
				need_pdf = TRUE;
				break;
			}
		}
	}
}
/* Creates a buffer with an embedding level for every character in the
* given text. Also determines the base level and returns it in
* *baseDir if *baseDir does not initially contain a valid direction.
*
* ctx: context used for allocation and exception handling.
* text, len: the code points to process.
* baseDir: in/out. If it holds FZ_BIDI_LTR or FZ_BIDI_RTL on entry, that
* is used as the base level; otherwise the base level is derived from
* the text and written back.
* resolveWhiteSpace: when non-zero, fz_bidi_resolve_whitespace is run on
* each paragraph after the other passes.
* flags: forwarded to classify_characters for the initial classification.
*
* Returns an array of len levels allocated with fz_malloc; the caller
* releases it with fz_free. If an exception is raised after the levels
* buffer has been allocated, the buffer is freed and the exception is
* rethrown.
*/
static fz_bidi_level *
create_levels(fz_context *ctx,
const uint32_t *text,
size_t len,
fz_bidi_direction *baseDir,
int resolveWhiteSpace,
int flags)
{
fz_bidi_level *levels, *plevels;
fz_bidi_chartype *types = NULL;
fz_bidi_chartype *ptypes;
fz_bidi_level baseLevel;
const uint32_t *ptext;
size_t plen, remaining;
/* Allocated before the fz_try: if this throws there is nothing of
* ours to clean up yet. */
levels = fz_malloc(ctx, len * sizeof(*levels));
/* types is assigned inside the fz_try and read in the fz_always. */
fz_var(types);
fz_try(ctx)
{
types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
classify_characters(text, types, len, flags);
if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
{
/* Derive the base level from the text and
* update *baseDir in case the caller wants to know.
*/
baseLevel = base_level_from_text(types, len);
*baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
}
else
{
/* The caller supplied a direction; use it as the base level. */
baseLevel = (fz_bidi_level)*baseDir;
}
{
/* Replace tab with base direction, i.e. make tab appear as
* 'strong left' if the base direction is left-to-right and
* 'strong right' if base direction is right-to-left. This
* allows Layout to implicitly treat tabs as 'segment separators'.
*/
size_t i;
for (i = 0u; i < len; i++)
{
if (text[i]=='\t')
{
types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
}
}
}
/* Look for quotation marks. Classify them as RLE or LRE
* or leave them alone, depending on what follows them.
*/
classify_quoted_blocks(text, types, len);
/* Work one paragraph at a time. The three cursors below walk the
* levels, types and text buffers in step; plen is the length of
* the current paragraph as reported by fz_bidi_resolve_paragraphs.
*/
plevels = levels;
ptypes = types;
ptext = text;
remaining = len;
while (remaining)
{
plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
/* Work out the levels and character types... */
(void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
fz_bidi_resolve_implicit(ptypes, plevels, plen);
/* Reclassify this paragraph from the original text, this time
* keeping the white-space classes, ahead of the optional
* white-space resolution below. */
classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
if (resolveWhiteSpace)
{
/* resolve whitespace */
fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
}
/* Advance all cursors to the next paragraph. */
plevels += plen;
ptypes += plen;
ptext += plen;
remaining -= plen;
}
/* The levels buffer now has odd and even numbers indicating
* rtl or ltr characters, respectively.
*/
#ifdef DEBUG_BIDI_VERBOSE
fprintf(stderr, "Levels: ");
{
size_t i;
for (i = 0; i < len; i++)
{
/* One character per level; levels above 9 are shown as 0. */
fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
}
fprintf(stderr, "\n");
}
#endif
}
fz_always(ctx)
{
/* The types buffer is scratch space only; release it on every path. */
fz_free(ctx, types);
}
fz_catch(ctx)
{
/* On failure the caller never sees levels, so free it here. */
fz_free(ctx, levels);
fz_rethrow(ctx);
}
return levels;
}
/* Split a character sequence into unidirectional fragments and report
 * each one through the callback.
 *
 * The text is first assigned an embedding level per character; every
 * maximal run of equal levels is then passed to split_at_script, which
 * subdivides it by script and invokes the callback for each piece.
 *
 * ctx: context used for allocation and exception handling.
 * text, textlen: the code points to fragment.
 * baseDir: in/out base direction, passed on to create_levels.
 * callback, arg: fragment callback and its opaque argument.
 * flags: passed on to create_levels.
 *
 * Returns without doing anything if text or callback is NULL or textlen
 * is zero. The levels buffer is released on every path, and any
 * exception raised while fragmenting is rethrown to the caller.
 */
void fz_bidi_fragment_text(fz_context *ctx,
	const uint32_t *text,
	size_t textlen,
	fz_bidi_direction *baseDir,
	fz_bidi_fragment_fn *callback,
	void *arg,
	int flags)
{
	fz_bidi_level *levels;
	size_t run_start;
	size_t pos;

	if (text == NULL || callback == NULL || textlen == 0)
		return;

	DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));

	/* One embedding level per character of text. */
	levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
	assert(levels != NULL);

	fz_try(ctx)
	{
		run_start = 0;
		for (pos = 1; pos < textlen; pos++)
		{
			if (levels[pos] != levels[pos - 1])
			{
				/* The level changed, so the run that began at
				 * run_start ends just before pos. */
				split_at_script(&text[run_start],
					pos - run_start,
					levels[run_start],
					arg,
					callback);
				run_start = pos;
			}
		}

		/* The last run (or the only one) extends to the end of the text. */
		split_at_script(&text[run_start],
			textlen - run_start,
			levels[run_start],
			arg,
			callback);
	}
	fz_always(ctx)
	{
		fz_free(ctx, levels);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}