[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: UTF-8 testing
- From: Jo-Philipp Wich <xm@...>
- Date: Thu, 06 Jan 2011 20:52:06 +0100
Another one attached, in C.
I use it mainly to validate UTF-8 data; the buf_putchar() calls could
be replaced with a simple length++.
With the mentioned length++ changes I counted 42133 chars.
~ Jow
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Growable byte buffer whose contents are kept NUL-terminated by the
 * buf_* functions. */
struct buffer {
    unsigned char *data;    /* start of the allocated storage */
    unsigned char *dptr;    /* write position: next byte to be stored */
    unsigned int size;      /* number of bytes allocated at data */
    unsigned int fill;      /* number of payload bytes stored so far */
};
/* Allocate a new, empty buffer with 1024 bytes of initial storage.
 *
 * The storage starts out holding an empty NUL-terminated string and the
 * write position points at its first byte.
 *
 * Returns the new buffer, or NULL if either allocation fails (nothing is
 * leaked in that case). The caller owns the buffer and its data. */
struct buffer * buf_init(void)
{
    struct buffer *buf = malloc(sizeof(*buf));

    if (buf == NULL)
        return NULL;

    buf->fill = 0;
    buf->size = 1024;
    buf->data = malloc(buf->size);

    if (buf->data == NULL)
    {
        /* do not leak the header when the storage cannot be allocated */
        free(buf);
        return NULL;
    }

    buf->dptr = buf->data;
    buf->data[0] = 0;

    return buf;
}
/* Enlarge the storage of buf by 1024 bytes.
 *
 * Returns the new size on success, or 0 if realloc() fails; on failure
 * the buffer is left untouched. */
int buf_grow(struct buffer *buf)
{
    /* remember the write position as an offset, realloc() may move the
     * storage to a different address */
    unsigned int cursor = (buf->dptr - buf->data);
    unsigned char *grown =
        (unsigned char *)realloc(buf->data, buf->size + 1024);

    if (grown == NULL)
        return 0;

    buf->size += 1024;
    buf->data = grown;
    buf->dptr = grown + cursor;

    return buf->size;
}
/* Append the single byte c to buf and re-terminate the contents.
 *
 * Grows the storage first when there is no room left for the byte plus
 * the terminating NUL. Returns 1 on success, 0 if growing failed. */
int buf_putchar(struct buffer *buf, unsigned char c)
{
    if ((buf->fill + 1) >= buf->size)
    {
        if (!buf_grow(buf))
            return 0;
    }

    /* store the byte, put the terminator right behind it, then advance */
    buf->dptr[0] = c;
    buf->dptr[1] = 0;
    buf->dptr++;
    buf->fill++;

    return 1;
}
/* Append len bytes starting at s to buf and re-terminate the contents.
 *
 * The storage is grown repeatedly until the bytes plus the terminating
 * NUL fit. Returns len on success, 0 if growing failed (note that a
 * successful append of zero bytes also returns 0). */
int buf_append(struct buffer *buf, unsigned char *s, unsigned int len)
{
    for (;;)
    {
        if ((buf->fill + len + 1) < buf->size)
            break;

        if (!buf_grow(buf))
            return 0;
    }

    memcpy(buf->dptr, s, len);

    buf->dptr += len;
    buf->fill += len;
    buf->dptr[0] = 0;

    return len;
}
/* calculate the total length in bytes (lead byte included) of the
 * sequence introduced by the given lead byte; every byte that is not a
 * 2..6 byte lead (ascii, continuation bytes, 0xFE, 0xFF) yields 1 */
static inline int mb_num_chars(unsigned char c)
{
    int ones = 0;

    /* the number of leading one bits of a lead byte is the length of
     * the sequence: 110xxxxx -> 2, 1110xxxx -> 3, ... 1111110x -> 6 */
    while ((ones < 8) && (c & (0x80 >> ones)))
        ones++;

    if ((ones >= 2) && (ones <= 6))
        return ones;

    return 1;
}
/* test whether the given byte has the continuation byte form 10xxxxxx,
 * i.e. lies in the range 0x80..0xBF */
static inline int mb_is_cont(unsigned char c)
{
    return ((c & 0xC0) == 0x80);
}
/* test whether the byte sequence at the given pointer with the given
 * length is the shortest possible representation of the code point
 *
 * s must point to at least n readable bytes. Returns 0 when the bytes
 * match the overlong pattern for an n-byte sequence, 1 otherwise
 * (including any n outside 2..6). */
static inline int mb_is_shortest(unsigned char *s, int n)
{
    switch (n)
    {
        case 2:
            /* 1100000x (10xxxxxx) */
            return !(((s[0] >> 1) == 0x60) &&
                     ((s[1] >> 6) == 0x02));

        case 3:
            /* 11100000 100xxxxx (10xxxxxx) */
            return !((s[0] == 0xE0) &&
                     ((s[1] >> 5) == 0x04) &&
                     ((s[2] >> 6) == 0x02));

        case 4:
            /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
            return !((s[0] == 0xF0) &&
                     ((s[1] >> 4) == 0x08) &&
                     ((s[2] >> 6) == 0x02) &&
                     ((s[3] >> 6) == 0x02));

        case 5:
            /* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
            return !((s[0] == 0xF8) &&
                     ((s[1] >> 3) == 0x10) &&
                     ((s[2] >> 6) == 0x02) &&
                     ((s[3] >> 6) == 0x02) &&
                     ((s[4] >> 6) == 0x02));

        case 6:
            /* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
             * the lead byte of an overlong 6-byte form is 0xFC; 0xF8
             * would be a 5-byte lead and could never match here */
            return !((s[0] == 0xFC) &&
                     ((s[1] >> 2) == 0x20) &&
                     ((s[2] >> 6) == 0x02) &&
                     ((s[3] >> 6) == 0x02) &&
                     ((s[4] >> 6) == 0x02) &&
                     ((s[5] >> 6) == 0x02));

        default:
            break;
    }

    return 1;
}
/* test whether the byte sequence at the given pointer with the given
 * length encodes an UTF-16 surrogate, i.e. is a 3-byte sequence that
 * starts with 0xED followed by a byte in the range 0xA0..0xBF */
static inline int mb_is_surrogate(unsigned char *s, int n)
{
    if (n != 3)
        return 0;

    if (s[0] != 0xED)
        return 0;

    /* 0xA0..0xBF are exactly the bytes of the form 101xxxxx */
    return ((s[1] & 0xE0) == 0xA0);
}
/* test whether the byte sequence at the given pointer with the given
 * length is one of the two illegal 3-byte sequences EF BF BE or
 * EF BF BF (the encodings of U+FFFE and U+FFFF) */
static inline int mb_is_illegal(unsigned char *s, int n)
{
    if (n != 3)
        return 0;

    if ((s[0] != 0xEF) || (s[1] != 0xBF))
        return 0;

    return ((s[2] == 0xBE) || (s[2] == 0xBF));
}
/* test whether the byte sequence at the given pointer with the given
 * length is a 4-byte sequence encoding a code point above U+10FFFF,
 * which RFC 3629 excludes from UTF-8 */
static inline int mb_is_out_of_range(unsigned char *s, int n)
{
    return ((n == 4) &&
            ((*s > 0xF4) || ((*s == 0xF4) && (*(s+1) > 0x8F))));
}

/* Build a sanitized, NUL-terminated copy of the len bytes at s.
 *
 * Non-zero ascii bytes and well-formed multi byte sequences are copied
 * unchanged. Everything else is replaced by a single '?' and reported
 * on stderr: zero bytes, stray continuation bytes, 0xFE/0xFF, five and
 * six byte sequences, truncated or overlong sequences, surrogates,
 * U+FFFE/U+FFFF and code points above U+10FFFF.
 *
 * Returns a malloc()ed string the caller must free(), or NULL if the
 * output buffer could not be created. If growing the output fails half
 * way through, the part sanitized so far is returned. A len of zero or
 * less yields an empty string. */
unsigned char * sanitize_utf8(const unsigned char *s, int len)
{
    struct buffer *buf = buf_init();
    /* the buf_* and mb_* helpers take non-const pointers; the input is
     * only ever read through this alias */
    unsigned char *ptr = (unsigned char *)s;
    unsigned char *out;
    /* signed like len, so that the comparisons below do not convert a
     * negative len into a huge unsigned value and %i matches the type */
    int o, v, n;

    if (!buf)
        return NULL;

    for (o = 0; o < len; o++)
    {
        /* ascii byte without null */
        if ((*ptr >= 0x01) && (*ptr <= 0x7F))
        {
            if (!buf_putchar(buf, *ptr++))
                goto out;

            continue;
        }

        n = mb_num_chars(*ptr);

        /* not a lead byte: 0x00, stray continuation byte, 0xFE or 0xFF */
        if (n <= 1)
        {
            if (*ptr == 0x00)
                fprintf(stderr, "Illegal zero-byte at offset %i\n", o);
            else
                fprintf(stderr, "Illegal byte 0x%02X at offset %i\n",
                        (unsigned int)*ptr, o);

            if (!buf_putchar(buf, '?')) /* or 0xEF, 0xBF, 0xBD */
                goto out;

            ptr++;
            continue;
        }

        /* multi byte sequence: count the lead byte plus the continuation
         * bytes following it, but never more than the n bytes the lead
         * byte announces and never beyond the end of the input */
        for (v = 1; (v < n) && (v < (len - o)) && mb_is_cont(*(ptr+v)); v++)
            ;

        if (n > 4)
        {
            /* five and six byte sequences are always invalid */
            fprintf(stderr, "Invalid %i-byte sequence at offset %i\n", n, o);

            if (!buf_putchar(buf, '?'))
                goto out;
        }
        /* if the number of valid bytes matches the expected number and
         * the sequence is legal, copy it to the destination buffer; the
         * checks after (v == n) only run when all n bytes are readable */
        else if ((v == n) && mb_is_shortest(ptr, n) &&
                 !mb_is_surrogate(ptr, n) && !mb_is_illegal(ptr, n) &&
                 !mb_is_out_of_range(ptr, n))
        {
            if (!buf_append(buf, ptr, n))
                goto out;
        }
        /* the found sequence is illegal, replace it */
        else
        {
            const char *reason = "Illegal";

            if (v != n)
                reason = "Truncated";
            else if (!mb_is_shortest(ptr, n))
                reason = "Overlong";
            else if (mb_is_surrogate(ptr, n))
                reason = "Surrogate";
            else if (mb_is_out_of_range(ptr, n))
                reason = "Out-of-range";

            fprintf(stderr, "%s %i-byte sequence at offset %i\n", reason, n, o);

            if (!buf_putchar(buf, '?'))
                goto out;
        }

        /* advance beyond the last byte belonging to this sequence; the
         * loop increment accounts for one of the v bytes */
        o += (v - 1);
        ptr += v;
    }

out:
    /* hand the data over to the caller, only the header is released */
    out = buf->data;
    free(buf);
    return out;
}
/* Read the file named by the first argument ("-" selects standard
 * input), sanitize its contents with sanitize_utf8() and print the
 * result followed by a newline to standard output.
 *
 * Returns 0 on success and 1 on usage, I/O or allocation errors. */
int main(int argc, char **argv)
{
    struct buffer *data;
    unsigned char chunk[1024];
    unsigned char *s;
    ssize_t rlen;
    int fd;
    int rc = 1;

    if( argc < 2 )
    {
        printf("Usage: %s <file>\n", argv[0]);
        return 1;
    }

    if (!strcmp(argv[1], "-"))
        fd = 0;
    else
        fd = open(argv[1], O_RDONLY);

    if (fd < 0)
    {
        perror(argv[1]);
        return 1;
    }

    data = buf_init();

    if (!data)
    {
        fprintf(stderr, "Out of memory\n");
        close(fd);
        return 1;
    }

    /* slurp the whole input into the buffer */
    while ((rlen = read(fd, chunk, sizeof(chunk))) > 0)
    {
        if (!buf_append(data, chunk, (unsigned int)rlen))
        {
            fprintf(stderr, "Out of memory\n");
            goto out;
        }
    }

    if (rlen < 0)
    {
        perror("read");
        goto out;
    }

    s = sanitize_utf8(data->data, data->fill);

    if (!s)
    {
        fprintf(stderr, "Out of memory\n");
        goto out;
    }

    printf("%s\n", (const char *)s);
    free(s);
    rc = 0;

out:
    /* the input buffer is owned here: release storage and header */
    free(data->data);
    free(data);
    close(fd);

    return rc;
}