/*
 * Bidirectional text processing.
 *
 * Processes unicode text by arranging the characters into an order suitable
 * for display. E.g. Hebrew text will be arranged from right-to-left and
 * any English within the text will remain in the left-to-right order.
 * Characters such as parenthesis will be substituted for their mirrored
 * equivalents if they are part of text which must be reversed.
 *
 * This is an implementation of the unicode Bidirectional Algorithm which
 * can be found here: http://www.unicode.org/reports/tr9/ and is based
 * on the reference implementation of the algorithm found on that page.
 *
 * For a nice overview of how it works, read this...
 * http://www.w3.org/TR/REC-html40/struct/dirlang.html
 *
 * Extracted from the SmartOffice code, where it was modified by Ian
 * Beveridge.
 *
 * Copyright (C) Picsel, 2004. All Rights Reserved.
 */

/*
 * Original copyright notice from unicode reference implementation.
 * ----------------------------------------------------------------
 * Written by: Asmus Freytag
 *	C++ and Windows dependencies removed, and
 *	command line interface added by: Rick McGowan
 *
 *	Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
 */

/*
 * Includes...
 */

#include "mupdf/fitz.h"
#include "mupdf/ucdn.h"
#include "bidi-imp.h" /* standard bidi code interface */
#include <assert.h>

/*
 * Macros...
 */

/* Non-zero when x is odd. */
#define ODD(x) ((x) & 1)

/* Non-zero when t is one of the listed weak/neutral bidi classes. */
#define REPLACEABLE_TYPE(t) ( \
	((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
	((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
	((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )

/* Debug tracing; both macros take a parenthesised fz_warn argument list. */
#ifdef DEBUG_BIDI_VERBOSE
#define DBUGVF(params) do { fz_warn params; } while (0)
#else
#define DBUGVF(params) do {} while (0)
#endif

#ifdef DEBUG_BIDI_OUTLINE
#define DBUGH(params) do { fz_warn params; } while (0)
#else
#define DBUGH(params) do {} while (0)
#endif

#define UNICODE_EOS 0
#define UNICODE_DIGIT_ZERO 0x0030
#define UNICODE_DIGIT_NINE 0x0039
#define UNICODE_SUPERSCRIPT_TWO 0x00B2
#define UNICODE_SUPERSCRIPT_THREE 0x00B3
#define UNICODE_SUPERSCRIPT_ONE 0x00B9
#define UNICODE_RTL_START 0x0590
#define UNICODE_RTL_END 0x07BF
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO 0x0660
#define UNICODE_ARABIC_INDIC_DIGIT_NINE 0x0669
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9
#define UNICODE_ZERO_WIDTH_NON_JOINER 0x200C
#define UNICODE_SUPERSCRIPT_ZERO 0x2070
#define UNICODE_SUPERSCRIPT_FOUR 0x2074
#define UNICODE_SUPERSCRIPT_NINE 0x2079
#define UNICODE_SUBSCRIPT_ZERO 0x2080
#define UNICODE_SUBSCRIPT_NINE 0x2089
#define UNICODE_CIRCLED_DIGIT_ONE 0x2460
#define UNICODE_NUMBER_TWENTY_FULL_STOP 0x249B
#define UNICODE_CIRCLED_DIGIT_ZERO 0x24EA
#define UNICODE_FULLWIDTH_DIGIT_ZERO 0xFF10
#define UNICODE_FULLWIDTH_DIGIT_NINE 0xFF19

#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif

/*
 * Enumerations...
 */

#ifdef DEBUG_BIDI_VERBOSE
/* display support: one printable character per bidi class, indexed by
 * the fz_bidi_chartype value (used only by the debug dumps below). */
static const char char_from_types[] =
{
	' ',	/* ON */
	'>',	/* L */
	'<',	/* R */
	'9',	/* AN */
	'1',	/* EN */
	'a',	/* AL */
	'@',	/* NSM */
	'.',	/* CS */
	',',	/* ES */
	'$',	/* ET */
	':',	/* BN */
	'X',	/* S */
	'_',	/* WS */
	'B',	/* B */
	'+',	/* RLO */
	'+',	/* RLE */
	'+',	/* LRO */
	'+',	/* LRE */
	'-',	/* PDF */
	'='	/* LS */
};
#endif

/*
 * Functions and static functions...
 */

/* UCDN uses a different ordering than Bidi does. We cannot
 * change to the UCDN ordering, as the bidi-std.c code relies
 * on the exact ordering (at least that N = ON = 0). We
 * therefore map between the two using this small table. It
 * also takes care of fudging LRI, RLI, FSI and PDI, that this
 * code does not currently support. */
static const uint8_t ucdn_to_bidi[] =
{
	BDI_L,		/* UCDN_BIDI_CLASS_L = 0 */
	BDI_LRE,	/* UCDN_BIDI_CLASS_LRE = 1 */
	BDI_LRO,	/* UCDN_BIDI_CLASS_LRO = 2 */
	BDI_R,		/* UCDN_BIDI_CLASS_R = 3 */
	BDI_AL,		/* UCDN_BIDI_CLASS_AL = 4 */
	BDI_RLE,	/* UCDN_BIDI_CLASS_RLE = 5 */
	BDI_RLO,	/* UCDN_BIDI_CLASS_RLO = 6 */
	BDI_PDF,	/* UCDN_BIDI_CLASS_PDF = 7 */
	BDI_EN,		/* UCDN_BIDI_CLASS_EN = 8 */
	BDI_ES,		/* UCDN_BIDI_CLASS_ES = 9 */
	BDI_ET,		/* UCDN_BIDI_CLASS_ET = 10 */
	BDI_AN,		/* UCDN_BIDI_CLASS_AN = 11 */
	BDI_CS,		/* UCDN_BIDI_CLASS_CS = 12 */
	BDI_NSM,	/* UCDN_BIDI_CLASS_NSM = 13 */
	BDI_BN,		/* UCDN_BIDI_CLASS_BN = 14 */
	BDI_B,		/* UCDN_BIDI_CLASS_B = 15 */
	BDI_S,		/* UCDN_BIDI_CLASS_S = 16 */
	BDI_WS,		/* UCDN_BIDI_CLASS_WS = 17 */
	BDI_ON,		/* UCDN_BIDI_CLASS_ON = 18 */
	BDI_LRE,	/* UCDN_BIDI_CLASS_LRI = 19 */
	BDI_RLE,	/* UCDN_BIDI_CLASS_RLI = 20 */
	BDI_N,		/* UCDN_BIDI_CLASS_FSI = 21 */
	BDI_N,		/* UCDN_BIDI_CLASS_PDI = 22 */
};

/* Bidi class of a character, with white-space classes (S, WS) preserved. */
#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])

/* Return a direction for white-space on the second pass of the algorithm.
 *
 * Same as class_from_ch_ws(), except that the S and WS classes are
 * both reported as BDI_N. */
static fz_bidi_chartype
class_from_ch_n(uint32_t ch)
{
	fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);

	if (from_ch_ws == BDI_S || from_ch_ws == BDI_WS)
		return BDI_N;
	return from_ch_ws;
}

/* Split fragments into single scripts (or punctuation + single script)
 *
 * fragment, fragment_len: the run of characters to split.
 * level: embedding level, passed unchanged to every callback invocation.
 * arg: opaque pointer, passed unchanged to every callback invocation.
 * callback: invoked once per sub-run with (start, length, level, script, arg).
 *
 * Characters whose script is COMMON or INHERITED never cause a break;
 * they stay attached to the run in progress. A run made only of such
 * characters is reported with script UCDN_SCRIPT_COMMON. Nothing is
 * reported for an empty fragment. */
static void
split_at_script(const uint32_t *fragment,
		size_t fragment_len,
		int level,
		void *arg,
		fz_bidi_fragment_fn *callback)
{
	int script = UCDN_SCRIPT_COMMON;
	size_t script_start, i;

	script_start = 0;
	for (i = 0; i < fragment_len; i++)
	{
		int s = ucdn_get_script(fragment[i]);
		if (s == UCDN_SCRIPT_COMMON || s == UCDN_SCRIPT_INHERITED)
		{
			/* Punctuation etc. This is fine. */
		}
		else if (s == script)
		{
			/* Same script. Still fine. */
		}
		else if (script == UCDN_SCRIPT_COMMON || script == UCDN_SCRIPT_INHERITED)
		{
			/* First non punctuation thing. Set the script. */
			script = s;
		}
		else
		{
			/* Change of script. Break the fragment. */
			(*callback)(&fragment[script_start], i - script_start, level, script, arg);
			script_start = i;
			script = s;
		}
	}
	/* Flush whatever is left after the last break. */
	if (script_start != fragment_len)
	{
		(*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
	}
}

/* Determines the character classes for all following
 * passes of the algorithm. A character class is basically the type of Bidi
 * behaviour that the character exhibits.
 *
 * text: len input characters.
 * types: output array of len entries, one class per character.
 * flags: when FZ_BIDI_CLASSIFY_WHITE_SPACE is set, white-space classes
 *	are kept (class_from_ch_ws); otherwise they are folded into BDI_N
 *	(class_from_ch_n). */
static void
classify_characters(const uint32_t *text,
		fz_bidi_chartype *types,
		size_t len,
		fz_bidi_flags flags)
{
	size_t i;

	if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=0)
	{
		for (i = 0; i < len; i++)
		{
			types[i] = class_from_ch_ws(text[i]);
		}
	}
	else
	{
#ifdef DEBUG_BIDI_VERBOSE
		fprintf(stderr, "Text: ");
		for (i = 0; i < len; i++)
		{
			/* So that we can actually sort of read the debug string, any
			 * non-ascii characters are replaced with a 1-digit hash
			 * value from 0-9, making non-english characters appear
			 * as numbers
			 */
			fprintf(stderr, "%c", (int)((text[i] <= 127 && text[i] >= 32) ?
					text[i] : text[i] % 10 + '0'));
		}
		fprintf(stderr, "\nTypes: ");
#endif
		for (i = 0; i < len; i++)
		{
			types[i] = class_from_ch_n(text[i]);
#ifdef DEBUG_BIDI_VERBOSE
			fprintf(stderr, "%c", char_from_types[(int)types[i]]);
#endif
		}
#ifdef DEBUG_BIDI_VERBOSE
		fprintf(stderr, "\n");
#endif
	}
}

/* Determines the base level of the text.
 * Implements rule P2 of the Unicode Bidi Algorithm.
 * Note: Ignores explicit embeddings
 *
 * Returns the direction of the first strong character found (L gives
 * FZ_BIDI_LTR; R or AL gives FZ_BIDI_RTL), or FZ_BIDI_LTR when the
 * text contains no strong character at all. */
static fz_bidi_level
base_level_from_text(fz_bidi_chartype *types, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
	{
		switch (types[i])
		{
		/* strong left */
		case BDI_L:
			return FZ_BIDI_LTR;

		/* strong right */
		case BDI_R:
		case BDI_AL:
			return FZ_BIDI_RTL;

		/* anything else: keep looking */
		default:
			break;
		}
	}
	return FZ_BIDI_LTR;
}

/* Maps a bidi class onto a direction: L and EN are left-to-right,
 * R and AL are right-to-left, everything else is neutral. */
static fz_bidi_direction
direction_from_type(fz_bidi_chartype type)
{
	switch (type)
	{
	case BDI_L:
	case BDI_EN:
		return FZ_BIDI_LTR;

	case BDI_R:
	case BDI_AL:
		return FZ_BIDI_RTL;

	default:
		return FZ_BIDI_NEUTRAL;
	}
}

/* Turns pairs of ASCII double quotes into explicit embeddings.
 *
 * Only acts when the text contains both LTR and RTL characters. An
 * opening quote is re-typed as LRE or RLE according to the first
 * L/EN or R/AL character found before the next quote; the matching
 * closing quote is then re-typed as PDF. If an existing LRE/RLE, or
 * the next quote, is reached first, the opening quote (and its
 * closing partner) are left untouched.
 *
 * text: len characters, read only.
 * types: len classes for those characters, modified in place. */
static void
classify_quoted_blocks(const uint32_t *text,
		fz_bidi_chartype *types,
		size_t len)
{
	size_t i;
	int inQuote = FALSE;
	int pdfNeeded = FALSE;
	int ltrFound = FALSE;
	int rtlFound = FALSE;

	/* Only do anything special here if there is mixed content
	 * (LTR *and* RTL) in the text.
	 */
	for (i = 0; i < len; i++)
	{
		switch (direction_from_type(types[i]))
		{
		case FZ_BIDI_LTR:
			ltrFound = TRUE;
			break;

		case FZ_BIDI_RTL:
			rtlFound = TRUE;
			break;

		default:
			break;
		}
	}

	/* Only make any changes if *both* LTR and RTL characters exist
	 * in this text.
	 */
	if (!ltrFound || !rtlFound)
	{
		return;
	}

	for (i = 0; i < len; i++)
	{
		if (text[i]=='"')
		{
			/* If we're already in a quote then terminate it,
			 * else start a new block.
			 */
			if (inQuote)
			{
				inQuote = FALSE;
				if (pdfNeeded)
				{
					pdfNeeded = FALSE;
					types[i] = BDI_PDF;
				}
			}
			else
			{
				size_t j;
				int done = FALSE;

				inQuote = TRUE;

				/* Find the first strong right or left type and
				 * use that to determine whether we should classify
				 * the quote as LRE or RLE. Or neither, if we
				 * hit another quote before any strongly-directional
				 * character.
				 */
				for (j = i + 1; !done && (j < len) && text[j] != '"'; ++j)
				{
					switch(types[j])
					{
					case BDI_RLE:
					case BDI_LRE:
						done = TRUE;
						break;

					case BDI_L:
					case BDI_EN:
						types[i] = BDI_LRE;
						pdfNeeded = TRUE;
						done = TRUE;
						break;

					case BDI_R:
					case BDI_AL:
						types[i] = BDI_RLE;
						pdfNeeded = TRUE;
						done = TRUE;
						break;

					default:
						break;
					}
				}
			}
		}
	}
}

/* Creates a buffer with an embedding level for every character in the
 * given text. Also determines the base level and returns it in
 * *baseDir if *baseDir does not initially contain a valid direction.
 *
 * text, len: the characters to process.
 * baseDir: in/out; used as the base direction when it already holds
 *	FZ_BIDI_LTR or FZ_BIDI_RTL, otherwise overwritten with the
 *	direction derived from the text.
 * resolveWhiteSpace: when non-zero, fz_bidi_resolve_whitespace is run
 *	on each paragraph after the other resolution passes.
 * flags: passed to the initial classify_characters call.
 *
 * Returns a buffer of len levels allocated with fz_malloc; the caller
 * owns it and must release it with fz_free. If anything inside the
 * try block throws, the buffer is freed and the exception rethrown. */
static fz_bidi_level *
create_levels(fz_context *ctx,
		const uint32_t *text,
		size_t len,
		fz_bidi_direction *baseDir,
		int resolveWhiteSpace,
		int flags)
{
	fz_bidi_level *levels, *plevels;
	fz_bidi_chartype *types = NULL;
	fz_bidi_chartype *ptypes;
	fz_bidi_level baseLevel;
	const uint32_t *ptext;
	size_t plen, remaining;

	levels = fz_malloc(ctx, len * sizeof(*levels));

	/* types is assigned inside the try and read in the always block. */
	fz_var(types);

	fz_try(ctx)
	{
		types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));

		classify_characters(text, types, len, flags);

		if (*baseDir != FZ_BIDI_LTR && *baseDir != FZ_BIDI_RTL)
		{
			/* Derive the base level from the text and
			 * update *baseDir in case the caller wants to know.
			 */
			baseLevel = base_level_from_text(types, len);
			*baseDir = ODD(baseLevel)==1 ? FZ_BIDI_RTL : FZ_BIDI_LTR;
		}
		else
		{
			baseLevel = (fz_bidi_level)*baseDir;
		}

		{
			/* Replace tab with base direction, i.e. make tab appear as
			 * 'strong left' if the base direction is left-to-right and
			 * 'strong right' if base direction is right-to-left. This
			 * allows Layout to implicitly treat tabs as 'segment separators'.
			 */
			size_t i;

			for (i = 0u; i < len; i++)
			{
				if (text[i]=='\t')
				{
					types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
				}
			}
		}

		/* Look for quotation marks. Classify them as RLE or LRE
		 * or leave them alone, depending on what follows them.
		 */
		classify_quoted_blocks(text, types, len);

		/* Work one paragraph at a time. */
		plevels = levels;
		ptypes = types;
		ptext = text;
		remaining = len;
		while (remaining)
		{
			plen = fz_bidi_resolve_paragraphs(ptypes, remaining);

			/* Work out the levels and character types... */
			(void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, 0);
			fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
			fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
			fz_bidi_resolve_implicit(ptypes, plevels, plen);

			/* Re-derive the classes for this paragraph, this time
			 * keeping the white-space classes. */
			classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);

			if (resolveWhiteSpace)
			{
				/* resolve whitespace */
				fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
			}

			plevels += plen;
			ptypes += plen;
			ptext += plen;
			remaining -= plen;
		}

		/* The levels buffer now has odd and even numbers indicating
		 * rtl or ltr characters, respectively.
		 */
#ifdef DEBUG_BIDI_VERBOSE
		fprintf(stderr, "Levels: ");
		{
			size_t i;
			for (i = 0; i < len; i++)
			{
				fprintf(stderr, "%d", levels[i]>9?0:levels[i]);
			}
			fprintf(stderr, "\n");
		}
#endif
	}
	fz_always(ctx)
	{
		fz_free(ctx, types);
	}
	fz_catch(ctx)
	{
		fz_free(ctx, levels);
		fz_rethrow(ctx);
	}
	return levels;
}

/* Partitions the given character sequence into one or more unidirectional
 * fragments and invokes the given callback function for each fragment.
 *
 * text, textlen: the characters to partition. Nothing happens when text
 *	is NULL or textlen is 0.
 * baseDir: in/out base direction, see create_levels.
 * callback: called once per fragment; fragments are runs of equal
 *	embedding level, further split at script changes. Nothing
 *	happens when callback is NULL.
 * arg: opaque pointer handed to every callback invocation.
 * flags: forwarded to create_levels.
 *
 * The level buffer is freed on both the normal and the exceptional
 * path; exceptions raised while fragmenting are rethrown. */
void
fz_bidi_fragment_text(fz_context *ctx,
		const uint32_t *text,
		size_t textlen,
		fz_bidi_direction *baseDir,
		fz_bidi_fragment_fn *callback,
		void *arg,
		int flags)
{
	size_t startOfFragment;
	size_t i;
	fz_bidi_level *levels;

	if (text == NULL || callback == NULL || textlen == 0)
		return;

	/* textlen is a size_t; cast it so that it matches the %d conversion. */
	DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, (int)textlen));

	levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);

	/* We now have an array with an embedding level
	 * for each character in text.
	 */
	assert(levels != NULL);

	fz_try(ctx)
	{
		startOfFragment = 0;
		for (i = 1; i < textlen; i++)
		{
			if (levels[i] != levels[i-1])
			{
				/* We've gone past the end of the fragment.
				 * Create a text object for it, then start
				 * a new fragment.
				 */
				split_at_script(&text[startOfFragment],
						i - startOfFragment,
						levels[startOfFragment],
						arg,
						callback);
				startOfFragment = i;
			}
		}
		/* Now i == textlen. Deal with the final (or maybe only) fragment. */
		/* otherwise create 1 fragment */
		split_at_script(&text[startOfFragment],
				i - startOfFragment,
				levels[startOfFragment],
				arg,
				callback);
	}
	fz_always(ctx)
	{
		fz_free(ctx, levels);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}