Re: UTF-8 testing

lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]

Subject: Re: UTF-8 testing
From: Peter Cawley <lua@...>
Date: Thu, 6 Jan 2011 19:41:14 +0000

On Thu, Jan 6, 2011 at 7:34 PM, Eero Pajarre <epajarre@gmail.com> wrote:
> My own UTF-8 character extraction code follows (this is not a char
> counter; it is a character extractor, but you should be able to
> modify it if you want):
>
> inline int in_range(int x,int a,int b)
> {
>  return x>=a && x<=b;
> }
>
> inline int mb(int x)
> {
>  return in_range(x,0x80,0xbf);
> }
>
>
> inline unsigned decode(unsigned char *ptr,unsigned mask,int count)
> {
>  unsigned res=0;
>  for(int i=0;i<count;i++){
>    res <<= 6;
>    res |= ptr[i] & mask;
>    mask=0x3f;
>  }
>  return res;
> }
>
> static unsigned utf_letter(const char **ptr)
> {
>  int skip=1;
>  int res=0;
>  unsigned char *c=(unsigned char *)(*ptr);
>  if (c[0]<=127){
>    skip=1;
>    res=c[0];
>  }else if (in_range(c[0],0xC2,0xDF) && mb(c[1])){
>    res=decode(c,0x1f,2);
>    skip=2;
>  }else if (in_range(c[0],0xE0,0xEF) && mb(c[1]) && mb(c[2])){
>    res=decode(c,0xf,3);
>    skip=3;
>  }else if (in_range(c[0],0xF0,0xF4) && mb(c[1]) && mb(c[2]) && mb(c[3])){
>    res=decode(c,0x7,4);
>    skip=4;
>  }else if (c[0]==0xe4 ||  /* Caution this part is not UTF-8, you
> should assert here if you just want to be compatible*/
>            c[0]==0xe5 ||
>            c[0]==0xf6 ||
>            c[0]==0xc4 ||
>            c[0]==0xc5 ||
>            c[0]==0xd6){
>    assert(0);
>    res=c[0];
>    skip=1;
>  }else{
>    assert(0);
>    res='*';
>    skip=1;
>  }
>  *ptr += skip;
>  return res;
> }

If we're posting decoders, here is one which I wrote just the other
day (in C++, but not too hard to make it into C code):

/**
 * Decode one UTF-8 encoded code point and advance the cursor past it.
 *
 * @param sString  Cursor into a byte string; updated in place. At least one
 *                 byte is always read and consumed, and a NUL byte is decoded
 *                 like any other ASCII byte (returns 0 and steps past it), so
 *                 the caller must detect end-of-string before calling.
 * @return The decoded code point, or 0xFFFD (the Unicode replacement
 *         character) when the bytes at the cursor are not well-formed UTF-8:
 *           - a continuation byte (10xxxxxx) or 0xF8..0xFF in lead position;
 *           - a lead byte followed by too few continuation bytes;
 *           - an overlong encoding (e.g. C0 80 for U+0000);
 *           - a UTF-16 surrogate (U+D800..U+DFFF);
 *           - a value above U+10FFFF.
 *
 * On a truncated sequence the cursor stops at the first byte that is not a
 * continuation byte, so that byte is examined again by the next call. Since
 * a NUL terminator is not a continuation byte, decoding never reads past the
 * end of a NUL-terminated string.
 */
static unsigned int utf8next(const char*& sString)
{
    const unsigned int iReplacement = 0xFFFD;

    unsigned int iCode = *reinterpret_cast<const unsigned char*>(sString++);
    if((iCode & 0x80) == 0)
    {
        // Single-byte (ASCII) character.
        return iCode;
    }

    // Classify the lead byte: how many continuation bytes follow, and the
    // smallest code point that legitimately needs a sequence of that length.
    int iExtra;
    unsigned int iMinimum;
    if((iCode & 0xE0) == 0xC0)
    {
        iExtra = 1;
        iMinimum = 0x80;
        iCode &= 0x1F;
    }
    else if((iCode & 0xF0) == 0xE0)
    {
        iExtra = 2;
        iMinimum = 0x800;
        iCode &= 0x0F;
    }
    else if((iCode & 0xF8) == 0xF0)
    {
        iExtra = 3;
        iMinimum = 0x10000;
        iCode &= 0x07;
    }
    else
    {
        // Invalid encoding: either a continuation byte in lead position or
        // a lead byte announcing a too-long byte sequence.
        return iReplacement;
    }

    for(int i = 0; i < iExtra; ++i)
    {
        unsigned int iContinuation = *reinterpret_cast<const unsigned char*>(sString);
        if((iContinuation & 0xC0) != 0x80)
        {
            // Invalid encoding: not enough continuation characters. The
            // offending byte is left unconsumed.
            return iReplacement;
        }
        iCode = (iCode << 6) | (iContinuation & 0x3F);
        ++sString;
    }

    // Structurally complete sequences can still encode forbidden values.
    if(iCode < iMinimum)
    {
        // Overlong: the value would have fit in a shorter sequence.
        return iReplacement;
    }
    if(iCode >= 0xD800 && iCode <= 0xDFFF)
    {
        // Surrogates are reserved for UTF-16 and are not valid in UTF-8.
        return iReplacement;
    }
    if(iCode > 0x10FFFF)
    {
        // Beyond the last Unicode code point.
        return iReplacement;
    }
    return iCode;
}

Follow-Ups:
- Re: UTF-8 testing, Jo-Philipp Wich

References:
- UTF-8 testing, Henning Diedrich
- Re: UTF-8 testing, Eero Pajarre

Prev by Date: Re: Lua Cookbook
Next by Date: Re: UTF-8 testing
Previous by thread: Re: UTF-8 testing
Next by thread: Re: UTF-8 testing
Index(es):
- Date
- Thread