[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: UTF-8 testing
- From: Peter Cawley <lua@...>
- Date: Thu, 6 Jan 2011 19:41:14 +0000
On Thu, Jan 6, 2011 at 7:34 PM, Eero Pajarre <epajarre@gmail.com> wrote:
> My own utf-8 character extraction code follows (this is not a char
> counter, it is a character extractor, but you should be able to
> modify it if you want):
>
> inline int in_range(int x,int a,int b)
> {
> return x>=a && x<=b;
> }
>
> inline int mb(int x)
> {
> return in_range(x,0x80,0xbf);
> }
>
>
> inline unsigned decode(unsigned char *ptr,unsigned mask,int count)
> {
> unsigned res=0;
> for(int i=0;i<count;i++){
> res <<= 6;
> res |= ptr[i] & mask;
> mask=0x3f;
> }
> return res;
> }
>
> static unsigned utf_letter(const char **ptr)
> {
> int skip=1;
> int res=0;
> unsigned char *c=(unsigned char *)(*ptr);
> if (c[0]<=127){
> skip=1;
> res=c[0];
> }else if (in_range(c[0],0xC2,0xDF) && mb(c[1])){
> res=decode(c,0x1f,2);
> skip=2;
> }else if (in_range(c[0],0xE0,0xEF) && mb(c[1]) && mb(c[2])){
> res=decode(c,0xf,3);
> skip=3;
> }else if (in_range(c[0],0xF0,0xF4) && mb(c[1]) && mb(c[2]) && mb(c[3])){
> res=decode(c,0x7,4);
> skip=4;
> }else if (c[0]==0xe4 || /* Caution this part is not UTF-8, you
> should assert here if you just want to be compatible*/
> c[0]==0xe5 ||
> c[0]==0xf6 ||
> c[0]==0xc4 ||
> c[0]==0xc5 ||
> c[0]==0xd6){
> assert(0);
> res=c[0];
> skip=1;
> }else{
> assert(0);
> res='*';
> skip=1;
> }
> *ptr += skip;
> return res;
> }
If we're posting decoders, here is one which I wrote just the other
day (in C++, but not too hard to make it into C code):
// Decode one UTF-8 encoded code point starting at sString and advance
// sString past the bytes which were consumed.
//
// sString: reference to a pointer into a NUL-terminated byte string; it is
//          updated in place. At least one byte is always consumed.
// Returns: the decoded code point, or U+FFFD (the Unicode replacement
//          character) if the bytes at sString are not well-formed UTF-8.
//
// Error handling:
//  * A stray continuation byte (10xxxxxx) or a lead byte announcing a
//    sequence longer than four bytes (11111xxx) consumes just that byte.
//  * A sequence cut short by a non-continuation byte (including the NUL
//    terminator) leaves sString pointing at the offending byte, so that the
//    next call resynchronises on it and never reads past the terminator.
//  * A complete sequence which decodes to an overlong form, to a UTF-16
//    surrogate (U+D800..U+DFFF) or to a value above U+10FFFF is consumed
//    whole and reported as U+FFFD, as required by RFC 3629.
static unsigned int utf8next(const char*& sString)
{
    const unsigned int iReplacement = 0xFFFD;
    unsigned int iCode = *reinterpret_cast<const unsigned char*>(sString++);

    // Single byte (ASCII) character: 0xxxxxxx.
    if((iCode & 0x80) == 0)
        return iCode;

    // Invalid encoding: character should not start with a continuation byte.
    if((iCode & 0x40) == 0)
        return iReplacement;

    // Work out from the lead byte how many continuation bytes follow, and
    // the smallest code point which genuinely needs a sequence that long
    // (anything smaller is an overlong encoding).
    int iRemaining;
    unsigned int iMinimum;
    if((iCode & 0x20) == 0)
    {
        // 110xxxxx: two byte sequence.
        iCode &= 0x1F;
        iRemaining = 1;
        iMinimum = 0x80;
    }
    else if((iCode & 0x10) == 0)
    {
        // 1110xxxx: three byte sequence.
        iCode &= 0x0F;
        iRemaining = 2;
        iMinimum = 0x800;
    }
    else if((iCode & 0x08) == 0)
    {
        // 11110xxx: four byte sequence.
        iCode &= 0x07;
        iRemaining = 3;
        iMinimum = 0x10000;
    }
    else
    {
        // Invalid encoding: too-long byte sequence.
        return iReplacement;
    }

    for(; iRemaining > 0; --iRemaining)
    {
        unsigned int iContinuation =
            *reinterpret_cast<const unsigned char*>(sString);
        if((iContinuation & 0xC0) != 0x80)
        {
            // Invalid encoding: not enough continuation characters. The
            // offending byte is deliberately not consumed.
            return iReplacement;
        }
        iCode = (iCode << 6) | (iContinuation & 0x3F);
        ++sString;
    }

    // Structurally valid, but the value itself may still be forbidden.
    if(iCode < iMinimum)
        return iReplacement; // Overlong encoding.
    if(iCode >= 0xD800 && iCode <= 0xDFFF)
        return iReplacement; // UTF-16 surrogate.
    if(iCode > 0x10FFFF)
        return iReplacement; // Beyond the Unicode code space.

    return iCode;
}