Re: UTF-8 testing

lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]

Subject: Re: UTF-8 testing
From: Henning Diedrich <hd2010@...>
Date: Thu, 06 Jan 2011 22:03:36 +0100

Hi Eero, Peter, Paul, Jo --

On 1/6/11 8:34 PM, Eero Pajarre wrote:

My own utf-8 character extraction code follows

I adapted Eero's version (original at bottom) to:

    /* Inclusive range test. Arguments are parenthesized so that arbitrary
     * expressions expand safely; note that x is evaluated twice. */
    #define char_in_range(x,a,b) ((x) >= (a) && (x) <= (b))
    /* True when x lies in 0x80..0xbf, the UTF-8 continuation-byte range. */
    #define char_mb(x) char_in_range(x, 0x80, 0xbf)

    /*
     * Count the UTF-8 sequences in the string held by rb.
     * A lead byte followed by the expected number of continuation bytes
     * counts as one character; any other byte is skipped without being
     * counted.
     *
     * The lookahead (c[1]..c[3]) is not checked against q. It stops at the
     * first non-continuation byte because && short-circuits, so this
     * presumably relies on a terminating NUL being stored at q -- confirm.
     * Overlong encodings and surrogates are not rejected here.
     */
    unsigned char *c = (unsigned char *)getstr(rawtsvalue(rb));
    unsigned char *q = c + tsvalue(rb)->len;
    size_t count = 0;
    while (c < q) {
        if (c[0] <= 127) {
            c++;
            count++;
        } else if (char_in_range(c[0], 0xC2, 0xDF) && char_mb(c[1])) {
            c += 2;
            count++;
        } else if (char_in_range(c[0], 0xE0, 0xEF) && char_mb(c[1]) && char_mb(c[2])) {
            c += 3;
            count++;
        } else if (char_in_range(c[0], 0xF0, 0xF4) && char_mb(c[1]) && char_mb(c[2]) && char_mb(c[3])) {
            c += 4;
            count++;
        } else {
            c++;    /* invalid byte: skip it, do not count it */
        }
    }
Funnily enough, it yields the same result on that test file: 37,075. I guess that means certain cases of corruption are not present in the test file.

Does anybody have an alternate count? Or an idea?

And this version, with the exceptions in, gave the same:

    /* Inclusive range test. Arguments are parenthesized so that arbitrary
     * expressions expand safely; note that x is evaluated twice. */
    #define char_in_range(x,a,b) ((x) >= (a) && (x) <= (b))
    /* True when x lies in 0x80..0xbf, the UTF-8 continuation-byte range. */
    #define char_mb(x) char_in_range(x, 0x80, 0xbf)

    /*
     * Count the characters in the string held by rb, tolerating a few
     * single-byte (non-UTF-8) characters.
     * A lead byte followed by the expected number of continuation bytes
     * counts as one character. One of the six exception bytes below also
     * counts as one character when it does not start a valid sequence.
     * Any other byte is skipped without being counted.
     *
     * The lookahead (c[1]..c[3]) is not checked against q. It stops at the
     * first non-continuation byte because && short-circuits, so this
     * presumably relies on a terminating NUL being stored at q -- confirm.
     * Overlong encodings and surrogates are not rejected here.
     */
    unsigned char *c = (unsigned char *)getstr(rawtsvalue(rb));
    unsigned char *q = c + tsvalue(rb)->len;
    size_t count = 0;
    while (c < q) {
        if (c[0] <= 127) {
            c++;
            count++;
        } else if (char_in_range(c[0], 0xC2, 0xDF) && char_mb(c[1])) {
            c += 2;
            count++;
        } else if (char_in_range(c[0], 0xE0, 0xEF) && char_mb(c[1]) && char_mb(c[2])) {
            c += 3;
            count++;
        } else if (char_in_range(c[0], 0xF0, 0xF4) && char_mb(c[1]) && char_mb(c[2]) && char_mb(c[3])) {
            c += 4;
            count++;
        } else if (c[0] == 0xe4 ||
                   c[0] == 0xe5 ||
                   c[0] == 0xf6 ||
                   c[0] == 0xc4 ||
                   c[0] == 0xc5 ||
                   c[0] == 0xd6) {
            /* Not UTF-8: presumably single-byte extended-ASCII letters. */
            c++;
            count++;
        } else {
            c++;    /* invalid byte: skip it, do not count it */
        }
    }
I am not sure what the exceptions are supposed to help with. Allowing a mix of UTF-8 and extended ASCII? Is that even useful in any case?

Thanks,
Henning

(This is not a char counter; it is a character extractor, but you should be able to
modify it if you want.)


/*
 * Return non-zero when a <= x <= b (both bounds inclusive), zero otherwise.
 *
 * Declared static inline: in C99 and later a plain `inline` definition
 * provides no external definition, so a call the compiler chooses not to
 * inline would fail at link time.
 */
static inline int in_range(int x,int a,int b)
{
  return x>=a && x<=b;
}

/*
 * Return non-zero when x lies in 0x80..0xbf, the range of a UTF-8
 * continuation byte (bit pattern 10xxxxxx), zero otherwise.
 *
 * Declared static inline: in C99 and later a plain `inline` definition
 * provides no external definition, so a call the compiler chooses not to
 * inline would fail at link time. The comparison is written out here so
 * the function does not depend on any other helper.
 */
static inline int mb(int x)
{
  return x>=0x80 && x<=0xbf;
}


/*
 * Assemble a value from `count` bytes starting at ptr.
 *
 * ptr   - first byte of the sequence (the lead byte); not modified.
 * mask  - selects the payload bits of the lead byte
 *         (e.g. 0x1f for a 2-byte, 0x0f for a 3-byte, 0x07 for a 4-byte lead).
 * count - number of bytes to consume; 0 yields 0.
 *
 * Each byte after the first contributes its low six bits. No validation
 * is done here; the caller must already have checked the sequence.
 *
 * Declared static inline: in C99 and later a plain `inline` definition
 * provides no external definition, so a call the compiler chooses not to
 * inline would fail at link time.
 */
static inline unsigned decode(const unsigned char *ptr,unsigned mask,int count)
{
  unsigned res=0;
  for(int i=0;i<count;i++){
    res = (res << 6) | (ptr[i] & mask);
    mask=0x3f;  /* every byte after the lead carries six payload bits */
  }
  return res;
}

/*
 * Extract one character from the string at *ptr and advance *ptr past it.
 *
 * Returns the decoded code point for a well-formed 1- to 4-byte UTF-8
 * sequence. Sequences that UTF-8 forbids are treated as malformed:
 *   - overlong 3-byte forms  (E0 followed by 80..9F),
 *   - UTF-16 surrogates      (ED followed by A0..BF),
 *   - overlong 4-byte forms  (F0 followed by 80..8F),
 *   - values above U+10FFFF  (F4 followed by 90..BF).
 *
 * On malformed input the function asserts. With assertions compiled out
 * it consumes exactly one byte and returns either that byte (for the six
 * non-UTF-8 exception bytes below) or '*'.
 *
 * The lookahead (c[1]..c[3]) is not length-checked. It stops at the first
 * non-continuation byte because && short-circuits, so the input is
 * assumed to be NUL-terminated.
 */
static unsigned utf_letter(const char **ptr)
{
  int skip=1;
  unsigned res=0;
  unsigned char *c=(unsigned char *)(*ptr);
  if (c[0]<=127){
    skip=1;
    res=c[0];
  }else if (in_range(c[0],0xC2,0xDF) && mb(c[1])){
    res=decode(c,0x1f,2);
    skip=2;
  }else if (in_range(c[0],0xE0,0xEF) && mb(c[1]) && mb(c[2]) &&
	    !(c[0]==0xE0 && c[1]<0xA0) &&   /* overlong encoding */
	    !(c[0]==0xED && c[1]>0x9F)){    /* surrogate U+D800..U+DFFF */
    res=decode(c,0xf,3);
    skip=3;
  }else if (in_range(c[0],0xF0,0xF4) && mb(c[1]) && mb(c[2]) && mb(c[3]) &&
	    !(c[0]==0xF0 && c[1]<0x90) &&   /* overlong encoding */
	    !(c[0]==0xF4 && c[1]>0x8F)){    /* above U+10FFFF */
    res=decode(c,0x7,4);
    skip=4;
  }else if (c[0]==0xe4 ||
	    c[0]==0xe5 ||
	    c[0]==0xf6 ||
	    c[0]==0xc4 ||
	    c[0]==0xc5 ||
	    c[0]==0xd6){
    /* Caution: this part is not UTF-8 (presumably single-byte
       extended-ASCII letters). Assert here if strict UTF-8 is wanted;
       otherwise the byte is passed through unchanged. */
    assert(0);
    res=c[0];
    skip=1;
  }else{
    /* Malformed byte: flag it in debug builds, substitute '*' otherwise. */
    assert(0);
    res='*';
    skip=1;
  }
  *ptr += skip;
  return res;
}

References:
- UTF-8 testing, Henning Diedrich
- Re: UTF-8 testing, Eero Pajarre

Prev by Date: Re: UTF-8 testing
Next by Date: Re: Lua Cookbook
Previous by thread: Re: UTF-8 testing
Next by thread: Re: UTF-8 testing
Index(es):
- Date
- Thread