eBookReaderSwitch/mupdf/source/fitz/string.c

794 lines
16 KiB
C

#include "mupdf/fitz.h"
#include <string.h>
#include <errno.h>
#include <math.h>
#include <float.h>
#include <stdlib.h>
/*
	ASCII-only lower-casing helper.

	c: character value to convert.

	Returns c shifted into 'a'..'z' when it lies in 'A'..'Z';
	any other value is returned unchanged.
*/
static inline int
fz_tolower(int c)
{
	return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
}
/*
	Bounded string length.

	s: Start of the bytes to examine.

	n: Maximum number of bytes to examine.

	Returns the index of the first null byte ('\0') among the first
	n bytes of s, or n if none of those bytes is a null byte.
*/
size_t
fz_strnlen(const char *s, size_t n)
{
	size_t len = 0;

	while (len < n && s[len] != '\0')
		len++;
	return len;
}
/*
	Case insensitive (ASCII only) comparison of at most n chars.

	a, b: The strings to compare.

	n: Maximum number of chars to compare. Zero compares nothing
	and yields 0.

	Returns zero if the compared prefixes match when folded with
	fz_tolower, otherwise the difference between the folded chars at
	the first position where they differ.
*/
int
fz_strncasecmp(const char *a, const char *b, int n)
{
	if (n == 0)
		return 0;

	/* The final char is compared by the return statement, so the
	 * loop may step over at most n-1 matching chars. */
	n--;
	while (n != 0 && *a != 0 && *b != 0 && fz_tolower(*a) == fz_tolower(*b))
	{
		a++;
		b++;
		n--;
	}
	return fz_tolower(*a) - fz_tolower(*b);
}
/*
	Case insensitive (ASCII only) string comparison.

	a, b: The NUL-terminated strings to compare.

	Returns zero if both strings are equal once folded with
	fz_tolower, otherwise the difference between the folded chars at
	the first position where they differ.
*/
int
fz_strcasecmp(const char *a, const char *b)
{
	for (;; a++, b++)
	{
		int la = fz_tolower(*a);
		int lb = fz_tolower(*b);

		if (la != lb)
			return la - lb;
		/* Folded chars are equal here, so one terminator implies both. */
		if (*a == 0)
			return 0;
	}
}
/*
	Split off the leading token of a string at the first delimiter.

	stringp: Address of a C string pointer; the pointer itself may be
	NULL. On return it points just past the delimiter that was found,
	or is NULL if the string held no delimiter. The string is
	modified: the delimiter found is overwritten with 0.

	delim: A C string holding the set of delimiter characters.

	Returns the original value of *stringp (now terminated at the
	first delimiter), or NULL if *stringp was NULL on entry.
*/
char *
fz_strsep(char **stringp, const char *delim)
{
	char *token = *stringp;
	char *hit;

	if (token == NULL)
		return NULL;

	hit = strpbrk(token, delim);
	if (hit != NULL)
	{
		*hit = '\0';
		*stringp = hit + 1;
	}
	else
	{
		*stringp = NULL;
	}
	return token;
}
/*
	Bounded string copy that always terminates the destination
	(when the destination has any room at all).

	dst: Destination buffer, at least siz bytes long.

	src: C string (non-NULL).

	siz: Size of the dst buffer in bytes. At most siz-1 chars are
	copied, followed by a terminator; with siz == 0 nothing is
	written.

	Returns the length of src (excluding terminator), so a return
	value >= siz means the copy was truncated.
*/
size_t
fz_strlcpy(char *dst, const char *src, size_t siz)
{
	size_t i = 0;

	if (siz != 0)
	{
		size_t limit = siz - 1;

		/* Copy what fits, leaving one byte for the terminator. */
		while (i < limit && src[i] != '\0')
		{
			dst[i] = src[i];
			i++;
		}
		dst[i] = '\0';
	}

	/* Walk whatever is left of src so the full length can be reported. */
	while (src[i] != '\0')
		i++;

	return i;
}
/*
	Append one string to another inside a buffer of limited size.

	dst: The string to append to, held in a buffer of siz bytes.

	src: The string to append.

	siz: Size (in bytes) of the buffer that dst is in.

	At most siz - strlen(dst) - 1 chars of src are appended and the
	result is terminated. If dst holds no terminator within its first
	siz bytes (which includes siz == 0), dst is left untouched.

	Returns the length the concatenated dst + src would have had with
	unlimited room (not including terminator); a value >= siz means
	the result was truncated.
*/
size_t
fz_strlcat(char *dst, const char *src, size_t siz)
{
	size_t dlen = 0;
	size_t slen = 0;
	size_t room;

	/* Find the end of dst. The bound is tested before each byte is
	 * read, so nothing at or beyond dst[siz] is ever inspected. */
	while (dlen < siz && dst[dlen] != '\0')
		dlen++;

	/* No terminator inside the buffer: there is nowhere to append. */
	if (dlen == siz)
		return dlen + strlen(src);

	/* Space left for chars of src once the terminator is accounted for. */
	room = siz - dlen - 1;

	/* Copy what fits while still walking all of src to learn its length. */
	while (src[slen] != '\0')
	{
		if (slen < room)
			dst[dlen + slen] = src[slen];
		slen++;
	}
	dst[dlen + (slen < room ? slen : room)] = '\0';

	return dlen + slen; /* count does not include NUL */
}
/*
	Extract the directory component from a path.

	dir: Destination buffer, at least n bytes long.

	path: The path to examine; may be NULL or empty.

	n: Size of the dir buffer in bytes. With n == 0 nothing is
	read from or written to dir.

	The path is first copied into dir with fz_strlcpy (so it is
	truncated to n-1 chars), then cut in place:
	a NULL or empty path, or one without any '/', gives ".";
	a path whose only slashes are leading ones gives "/";
	otherwise the last '/'-separated component and the slashes in
	front of it are removed. The scan starts at the terminator, so a
	path ending in '/' has an empty last component and only loses its
	trailing slashes ("a/b/" gives "a/b").
*/
void
fz_dirname(char *dir, const char *path, size_t n)
{
	size_t i;

	/* fz_strlcpy writes nothing into a zero-sized buffer, so dir
	 * would hold no valid string to measure or terminate below. */
	if (n == 0)
		return;

	if (!path || !path[0])
	{
		fz_strlcpy(dir, ".", n);
		return;
	}

	fz_strlcpy(dir, path, n);

	/* Start at the terminator and walk back over the last component... */
	i = strlen(dir);
	for (; dir[i] != '/'; --i)
	{
		if (!i)
		{
			fz_strlcpy(dir, ".", n);
			return;
		}
	}
	/* ...then over the run of slashes that precedes it. */
	for (; dir[i] == '/'; --i)
	{
		if (!i)
		{
			fz_strlcpy(dir, "/", n);
			return;
		}
	}
	dir[i+1] = 0;
}
/*
	Test whether a is an ASCII hexadecimal digit
	('0'..'9', 'a'..'f' or 'A'..'F').

	Returns 1 if so, 0 otherwise.
*/
static inline int ishex(int a)
{
	if (a >= '0' && a <= '9')
		return 1;
	if (a >= 'a' && a <= 'f')
		return 1;
	if (a >= 'A' && a <= 'F')
		return 1;
	return 0;
}
/*
	Convert an ASCII hexadecimal digit to its numeric value.

	Returns 0..15 for '0'..'9', 'a'..'f' and 'A'..'F';
	any other input yields 0.
*/
static inline int tohex(int c)
{
	return (c >= '0' && c <= '9') ? c - '0'
		: (c >= 'a' && c <= 'f') ? c - 'a' + 10
		: (c >= 'A' && c <= 'F') ? c - 'A' + 10
		: 0;
}
/*
	Decode url escapes in place.

	url: NUL-terminated string to decode; it is overwritten with the
	decoded text, which is never longer than the input.

	Every '%' that is followed by two hexadecimal digits is replaced
	by the byte those digits encode. A '%' not followed by two hex
	digits, and every other char, is copied through unchanged.

	Returns url.
*/
char *
fz_urldecode(char *url)
{
	char *in = url;
	char *out = url;

	while (*in != 0)
	{
		int ch = (unsigned char) *in++;
		int hi, lo;

		/* Short-circuit order matters: in[1] is only inspected once
		 * in[0] is known to be a hex digit (hence not the terminator). */
		if (ch != '%' || !ishex(in[0]) || !ishex(in[1]))
		{
			*out++ = ch;
			continue;
		}

		hi = tohex(in[0]);
		lo = tohex(in[1]);
		in += 2;
		*out++ = hi << 4 | lo;
	}
	*out = 0;

	return url;
}
/*
	Create an output file name from a template and a page number.

	path: Destination buffer for the generated name.

	size: Size of the path buffer in bytes.

	fmt: The template. Only the first '%' in it is examined: if it
	starts a %[0-9]*d pattern, that pattern is replaced by the page
	number, padded with leading zeros to the width given by the
	digits. Otherwise the page number is inserted in front of the
	last '.' in fmt, or appended if fmt contains no '.'.

	page: The page number to insert. Zero is written as "0".
	NOTE(review): a negative page makes page % 10 negative, so the
	generated chars are not digits - callers presumably never pass
	one; confirm.

	Throws FZ_ERROR_GENERIC if the result (with terminator) does not
	fit in size bytes.
*/
void
fz_format_output_path(fz_context *ctx, char *path, size_t size, const char *fmt, int page)
{
	const char *insert_at, *rest;
	char digits[40];
	int ndigits = 0;
	int width = 0;
	int prefix_len;

	/* Render the page number, least significant digit first. */
	while (page)
	{
		digits[ndigits++] = '0' + page % 10;
		page /= 10;
	}
	digits[ndigits] = 0;

	/* Look for %[0-9]*d; the digits (if any) give the minimum width. */
	insert_at = rest = strchr(fmt, '%');
	if (rest)
	{
		for (++rest; *rest >= '0' && *rest <= '9'; ++rest)
			width = width * 10 + (*rest - '0');
	}

	if (rest && *rest == 'd')
	{
		++rest;
	}
	else
	{
		/* No usable pattern: insert before the extension, or at the end. */
		insert_at = rest = strrchr(fmt, '.');
		if (!rest)
			insert_at = rest = fmt + strlen(fmt);
	}

	/* Pad with zeros; these end up in front since digits is reversed. */
	if (width < 1)
		width = 1;
	while (ndigits < width && ndigits < (int)sizeof digits)
		digits[ndigits++] = '0';

	prefix_len = insert_at - fmt;
	if (prefix_len + ndigits + strlen(rest) >= size)
		fz_throw(ctx, FZ_ERROR_GENERIC, "path name buffer overflow");

	/* Assemble: template prefix, digits in reading order, template tail. */
	memcpy(path, fmt, prefix_len);
	while (ndigits > 0)
		path[prefix_len++] = digits[--ndigits];
	fz_strlcpy(path + prefix_len, rest, size - prefix_len);
}
/* True if x ends a path element: a slash or the string terminator. */
#define SEP(x) ((x)=='/' || (x) == 0)

/*
	Rewrite a path to the shortest string that names the same path.

	Eliminates repeated and trailing slashes and interprets "." and
	".." elements. A ".." that would climb above the root of a rooted
	path is dropped; in a relative path leading ".." elements are
	kept. An empty result becomes ".".

	name: The path; it is overwritten in place. The result is never
	longer than the input, except that an empty input becomes ".".

	Returns name.
*/
char *
fz_cleanname(char *name)
{
	char *src, *dst, *floor;
	int rooted = (name[0] == '/');

	/*
	 * invariants:
	 *	src points at the start of the path element being examined.
	 *	dst points just past the last path element written (no slash).
	 *	floor points just past the point that ".." cannot backtrack
	 *		beyond (no slash).
	 */
	src = dst = floor = name + rooted;

	while (*src != 0)
	{
		if (src[0] == '/')
		{
			/* null element */
			src++;
			continue;
		}

		if (src[0] == '.' && SEP(src[1]))
		{
			/* "." element; leave the separator in case it is nul */
			src++;
			continue;
		}

		if (src[0] == '.' && src[1] == '.' && SEP(src[2]))
		{
			src += 2;
			if (dst > floor)
			{
				/* can backtrack: drop the last element written */
				do
					dst--;
				while (dst > floor && *dst != '/');
			}
			else if (!rooted)
			{
				/* /.. is / but ./../ is .. */
				if (dst != name)
					*dst++ = '/';
				*dst++ = '.';
				*dst++ = '.';
				floor = dst;
			}
			continue;
		}

		/* real path element */
		if (dst != name + rooted)
			*dst++ = '/';
		while ((*dst = *src) != '/' && *dst != 0)
		{
			src++;
			dst++;
		}
	}

	if (dst == name)
	{
		/* empty string is really "." */
		*dst++ = '.';
	}
	*dst = '\0';

	return name;
}
/* Limits and sentinel values of the UTF-8 codec. */
enum
{
	UTFmax = 4,		/* maximum bytes per rune */
	Runesync = 0x80,	/* bytes below this cannot be part of a UTF sequence */
	Runeself = 0x80,	/* below this, rune and UTF sequence are the same */
	Runeerror = 0xFFFD,	/* rune produced on a decoding error */
	Runemax = 0x10FFFF,	/* maximum rune value */
};

/* Bit layout of UTF-8 sequences. */
enum
{
	/* Payload bits carried by the lead byte of an N byte sequence
	 * (BitN), and by every continuation byte (Bitx). */
	Bit1 = 7,
	Bitx = 6,
	Bit2 = 5,
	Bit3 = 4,
	Bit4 = 3,
	Bit5 = 2,

	/* Tag bits: 0xFF with the low (Bit+1) bits cleared. */
	T1 = 0x00,	/* 0000 0000 */
	Tx = 0x80,	/* 1000 0000 */
	T2 = 0xC0,	/* 1100 0000 */
	T3 = 0xE0,	/* 1110 0000 */
	T4 = 0xF0,	/* 1111 0000 */
	T5 = 0xF8,	/* 1111 1000 */

	/* Largest rune an N byte sequence can carry:
	 * (1 << (BitN + (N-1)*Bitx)) - 1. */
	Rune1 = 0x7F,		/* 0000 0000 0111 1111 */
	Rune2 = 0x7FF,		/* 0000 0111 1111 1111 */
	Rune3 = 0xFFFF,		/* 1111 1111 1111 1111 */
	Rune4 = 0x1FFFFF,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx = 0x3F,	/* 0011 1111: payload bits of a continuation byte */
	Testx = 0xC0,	/* 1100 0000: tag bits of a continuation byte */

	Bad = Runeerror,
};
/*
UTF8 decode a single rune from a sequence of chars.
rune: Pointer to an int to assign the decoded 'rune' to.
str: Pointer to a UTF8 encoded string.
Returns the number of bytes consumed.
*/
int
fz_chartorune(int *rune, const char *str)
{
int c, c1, c2, c3;
int l;
/*
* one character sequence
* 00000-0007F => T1
*/
c = *(const unsigned char*)str;
if(c < Tx) {
*rune = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
c1 = *(const unsigned char*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(const unsigned char*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(const unsigned char*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
}
/*
	UTF8 encode a rune to a sequence of chars.

	str: Pointer to a place to put the UTF8 encoded character;
	up to 4 bytes are written. No terminator is added.

	rune: The rune to encode. Values outside 0..Runemax (negative
	ones included) are encoded as Runeerror.

	Returns the number of bytes the rune took to output.
*/
int
fz_runetochar(char *str, int rune)
{
	/* Runes are signed; as unsigned, negative ones land above Runemax. */
	unsigned int cp = (unsigned int)rune;
	int len;

	if (cp <= Rune1)
		len = 1;
	else if (cp <= Rune2)
		len = 2;
	else
	{
		/*
		 * Out of range runes become the error rune. The check sits
		 * here because such a rune could never have fitted in one
		 * or two bytes, and the error rune itself needs three.
		 */
		if (cp > Runemax)
			cp = Runeerror;
		len = (cp <= Rune3) ? 3 : 4;
	}

	switch (len)
	{
	case 1:
		/* 00000-0007F => 00-7F */
		str[0] = cp;
		break;
	case 2:
		/* 0080-07FF => T2 Tx */
		str[0] = T2 | (cp >> 1*Bitx);
		str[1] = Tx | (cp & Maskx);
		break;
	case 3:
		/* 0800-FFFF => T3 Tx Tx */
		str[0] = T3 | (cp >> 2*Bitx);
		str[1] = Tx | ((cp >> 1*Bitx) & Maskx);
		str[2] = Tx | (cp & Maskx);
		break;
	default:
		/* 10000-10FFFF => T4 Tx Tx Tx */
		str[0] = T4 | (cp >> 3*Bitx);
		str[1] = Tx | ((cp >> 2*Bitx) & Maskx);
		str[2] = Tx | ((cp >> 1*Bitx) & Maskx);
		str[3] = Tx | (cp & Maskx);
		break;
	}

	return len;
}
/*
	Count how many chars are required to represent a rune.

	c: The rune to encode.

	Returns the number of bytes fz_runetochar produces for this rune.
*/
int
fz_runelen(int c)
{
	/* Scratch space only; the encoded bytes are discarded. */
	char scratch[10];

	return fz_runetochar(scratch, c);
}
/*
	Count how many runes the UTF-8 encoded string consists of.

	s: The UTF-8 encoded, NUL-terminated text string.

	Returns the number of runes in the string. Each byte that
	fz_chartorune cannot decode counts as one rune.
*/
int
fz_utflen(const char *s)
{
	int count = 0;
	int rune;

	while (*s != 0)
	{
		/* Bytes below Runeself are complete runes on their own. */
		if (*(const unsigned char *)s < Runeself)
			s++;
		else
			s += fz_chartorune(&rune, s);
		count++;
	}

	return count;
}
/*
	Range checking atof.

	s: The string to parse; may be NULL.

	Returns 0 for a NULL string. Returns 1 if the parse yields NaN, or
	yields zero with errno set to ERANGE (underflow). Otherwise
	returns the value parsed by fz_strtof, clamped to
	-FLT_MAX..FLT_MAX.
*/
float fz_atof(const char *s)
{
	float value;

	if (s == NULL)
		return 0;

	errno = 0;
	value = fz_strtof(s, NULL);

	/* Return 1.0 on underflow (and NaN), as it's a small known value
	 * that won't cause a divide by 0. */
	if (isnan(value))
		return 1;
	if (errno == ERANGE && value == 0)
		return 1;

	return fz_clamp(value, -FLT_MAX, FLT_MAX);
}
/*
	atoi that copes with NULL.

	s: The string to parse; may be NULL.

	Returns 0 for a NULL string, otherwise the result of atoi(s).
*/
int fz_atoi(const char *s)
{
	return s ? atoi(s) : 0;
}
/*
	64-bit atoi that copes with NULL.

	s: The string to parse; may be NULL.

	Returns 0 for a NULL string, otherwise the result of atoll(s).
*/
int64_t fz_atoi64(const char *s)
{
	return s ? atoll(s) : 0;
}
/*
	Check whether a string could be a list of page ranges:
		( ','? ([0-9]+|'N') ( '-' ([0-9]+|N) )? )+

	Only the character set is verified: the string passes if every
	char is a digit, 'N', '-' or ','. The syntax itself is not
	checked, and an empty string passes.

	ctx: Not used by this function.

	s: The NUL-terminated string to check.

	Returns 1 if the string passes, 0 otherwise.
*/
int fz_is_page_range(fz_context *ctx, const char *s)
{
	/* TODO: check the actual syntax... */
	for (; *s; s++)
	{
		int allowed = (*s >= '0' && *s <= '9') || *s == 'N' || *s == '-' || *s == ',';

		if (!allowed)
			return 0;
	}
	return 1;
}
/*
	Parse one page range from the front of a page range list.

	ctx: Not used by this function.

	s: The remaining list text; may be NULL. One leading ',' is
	skipped. 'N' stands for the last page, n.

	a, b: Receive the first and last page of the range, each clamped
	to 1..n. A single page without '-' sets both to the same value.

	n: The number of pages.

	Returns a pointer to the text following the parsed range, to be
	passed to the next call, or NULL if s is NULL or empty.
*/
const char *fz_parse_page_range(fz_context *ctx, const char *s, int *a, int *b, int n)
{
	char *end;

	if (!s || !s[0])
		return NULL;

	if (s[0] == ',')
		s++;

	/* Start of the range. */
	if (s[0] == 'N')
	{
		*a = n;
		s++;
	}
	else
	{
		*a = strtol(s, &end, 10);
		s = end;
	}

	/* Optional end of the range. */
	if (s[0] != '-')
	{
		*b = *a;
	}
	else if (s[1] == 'N')
	{
		*b = n;
		s += 2;
	}
	else
	{
		*b = strtol(s + 1, &end, 10);
		s = end;
	}

	*a = fz_clampi(*a, 1, n);
	*b = fz_clampi(*b, 1, n);

	return s;
}
/* memmem from musl */
/* MAX: the larger of a and b. Each argument may be evaluated twice. */
#define MAX(a,b) ((a)>(b)?(a):(b))
/*
BITOP: apply op (e.g. |= or &) between one word of the bit array a and
the mask selecting bit number b. The word index is b divided by the
number of bits in an element of a; the bit within that word is the
remainder. The mask is built as a size_t.
*/
#define BITOP(a,b,op) \
((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a))))
/*
	Find a 2 byte needle by sliding a 16-bit window over the haystack.

	h: Start of the haystack.

	k: Length of the haystack in bytes; must be at least 2.

	n: The needle (2 bytes).

	Returns a pointer to the first match, or 0 if there is none.
	No byte outside h[0..k-1] is read.
*/
static char *twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
{
	uint16_t nw = n[0]<<8 | n[1];
	uint16_t hw = h[0]<<8 | h[1];

	/* h stays one past the window held in hw and k counts the bytes
	 * not yet read, so a byte is only fetched while one remains. */
	for (h += 2, k -= 2; k; k--, hw = hw<<8 | *h++)
	{
		if (hw == nw)
			return (char *)h - 2;
	}

	/* The last window has been loaded but not yet compared. */
	return hw == nw ? (char *)h - 2 : 0;
}
/*
	Find a 3 byte needle by sliding a window over the haystack. The
	window lives in the top 3 bytes of a 32-bit word; the low byte is
	always zero.

	h: Start of the haystack.

	k: Length of the haystack in bytes; must be at least 3.

	n: The needle (3 bytes).

	Returns a pointer to the first match, or 0 if there is none.
	No byte outside h[0..k-1] is read.
*/
static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
{
	/* Widen before shifting by 24: the bytes promote to int, and
	 * shifting a value >= 0x80 into the sign bit is undefined. */
	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8;
	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8;

	/* h stays one past the window held in hw and k counts the bytes
	 * not yet read, so a byte is only fetched while one remains. */
	for (h += 3, k -= 3; k; k--, hw = (hw | *h++)<<8)
	{
		if (hw == nw)
			return (char *)h - 3;
	}

	/* The last window has been loaded but not yet compared. */
	return hw == nw ? (char *)h - 3 : 0;
}
/*
	Find a 4 byte needle by sliding a 32-bit window over the haystack.

	h: Start of the haystack.

	k: Length of the haystack in bytes; must be at least 4.

	n: The needle (4 bytes).

	Returns a pointer to the first match, or 0 if there is none.
	No byte outside h[0..k-1] is read.
*/
static char *fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
{
	/* Widen before shifting by 24: the bytes promote to int, and
	 * shifting a value >= 0x80 into the sign bit is undefined. */
	uint32_t nw = (uint32_t)n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
	uint32_t hw = (uint32_t)h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];

	/* h stays one past the window held in hw and k counts the bytes
	 * not yet read, so a byte is only fetched while one remains. */
	for (h += 4, k -= 4; k; k--, hw = hw<<8 | *h++)
	{
		if (hw == nw)
			return (char *)h - 4;
	}

	/* The last window has been loaded but not yet compared. */
	return hw == nw ? (char *)h - 4 : 0;
}
/*
Search the haystack [h, z) for the l byte needle n. Judging by its name
and the "memmem from musl" note, this is presumably musl's Two-Way
string matching implementation.

h: Start of the haystack.
z: One past the end of the haystack.
n: The needle.
l: Length of the needle in bytes.

Returns a pointer to the match found, or 0 once fewer than l bytes of
haystack remain.
*/
static char *twoway_memmem(const unsigned char *h, const unsigned char *z, const unsigned char *n, size_t l)
{
size_t i, ip, jp, k, p, ms, p0, mem, mem0;
/* byteset: 256-bit set with one bit per byte value occurring in the needle */
size_t byteset[32 / sizeof(size_t)] = { 0 };
/* shift[c]: 1 + index of the last occurrence of byte c in the needle.
Only entries for bytes present in the needle are written, so byteset
must be consulted before reading an entry. */
size_t shift[256];
/* Record which byte values occur in the needle and fill the shift table */
for (i=0; i<l; i++)
BITOP(byteset, n[i], |=), shift[n[i]] = i+1;
/* Compute maximal suffix.
ip starts at (size_t)-1, so ip+k wraps round to k-1. */
ip = -1; jp = 0; k = p = 1;
while (jp+k<l) {
if (n[ip+k] == n[jp+k]) {
if (k == p) {
jp += p;
k = 1;
} else k++;
} else if (n[ip+k] > n[jp+k]) {
jp += k;
k = 1;
p = jp - ip;
} else {
ip = jp++;
k = p = 1;
}
}
/* Remember the result of the first pass: ms is the index just before
the suffix found, p0 its period. */
ms = ip;
p0 = p;
/* And with the opposite comparison */
ip = -1; jp = 0; k = p = 1;
while (jp+k<l) {
if (n[ip+k] == n[jp+k]) {
if (k == p) {
jp += p;
k = 1;
} else k++;
} else if (n[ip+k] < n[jp+k]) {
jp += k;
k = 1;
p = jp - ip;
} else {
ip = jp++;
k = p = 1;
}
}
/* Keep whichever pass gave the larger split point; comparing ip+1 with
ms+1 makes the (size_t)-1 "none" value compare as 0. */
if (ip+1 > ms+1) ms = ip;
else p = p0;
/* Periodic needle? If the first ms+1 bytes do not repeat at distance p,
no prefix can be remembered between attempts (mem0 = 0) and a larger
shift is used; otherwise l-p bytes are known to match after a shift
by p. */
if (memcmp(n, n+p, ms+1)) {
mem0 = 0;
p = MAX(ms, l-ms-1) + 1;
} else mem0 = l-p;
/* mem: number of leading needle bytes already known to match at h */
mem = 0;
/* Search loop */
for (;;) {
/* If remainder of haystack is shorter than needle, done */
if ((size_t)(z-h) < l) return 0;
/* Check last byte first; advance by shift on mismatch */
if (BITOP(byteset, h[l-1], &)) {
/* The byte occurs in the needle: k is the distance needed to line
up its last occurrence; 0 means it already matches the needle's
final byte. */
k = l-shift[h[l-1]];
if (k) {
if (mem0 && mem && k < p) k = l-p;
h += k;
mem = 0;
continue;
}
} else {
/* The byte is not in the needle at all: skip the whole window */
h += l;
mem = 0;
continue;
}
/* Compare right half */
for (k=MAX(ms+1,mem); k<l && n[k] == h[k]; k++);
if (k < l) {
h += k-ms;
mem = 0;
continue;
}
/* Compare left half */
for (k=ms+1; k>mem && n[k-1] == h[k-1]; k--);
/* Left half matched down to the remembered prefix: full match at h */
if (k <= mem) return (char *)h;
h += p;
mem = mem0;
}
}
/*
	Find the start of the first occurrence of the substring needle
	in haystack.

	h0: Start of the haystack.

	k: Length of the haystack in bytes.

	n0: Start of the needle.

	l: Length of the needle in bytes.

	Returns a pointer into the haystack at the match, the start of
	the haystack if the needle is empty, or 0 if there is no match.
*/
void *fz_memmem(const void *h0, size_t k, const void *n0, size_t l)
{
	const unsigned char *hay = h0;
	const unsigned char *needle = n0;
	const unsigned char *first;

	/* Return immediately on empty needle */
	if (l == 0)
		return (void *)hay;

	/* Return immediately when needle is longer than haystack */
	if (k < l)
		return 0;

	/* Skip ahead to the first byte that could start a match */
	first = memchr(h0, *needle, k);
	if (!first || l == 1)
		return (void *)first;

	k -= first - hay;
	if (k < l)
		return 0;

	/* Use faster algorithms for short needles */
	switch (l)
	{
	case 2:
		return twobyte_memmem(first, k, needle);
	case 3:
		return threebyte_memmem(first, k, needle);
	case 4:
		return fourbyte_memmem(first, k, needle);
	default:
		return twoway_memmem(first, first + k, needle, l);
	}
}