lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Hi Eero, Peter, Paul, Jo --

On 1/6/11 8:34 PM, Eero Pajarre wrote:
My own utf-8 character extraction code follows 

I adapted Eero's version (original at bottom) to:

      #define char_in_range(x,a,b) (x>=a && x<=b)
      #define char_mb(x) char_in_range(x,0x80,0xbf)

      unsigned char *c = (unsigned char *)getstr(rawtsvalue(rb));
      unsigned char *q = c + tsvalue(rb)->len;
      size_t count = 0;

      while(c < q) {
     
        if (c[0]<=127) c++;
        else if (char_in_range(c[0],0xC2,0xDF) && char_mb(c[1])) c+=2;
        else if (char_in_range(c[0],0xE0,0xEF) && char_mb(c[1]) && char_mb(c[2])) c+=3;
        else if (char_in_range(c[0],0xF0,0xF4) && char_mb(c[1]) && char_mb(c[2]) && char_mb(c[3])) c+=4;
        else { count--; c++; }
        count++;
      }

Funnily enough, it yields the same result on that test file: 37,075. I guess that means certain kinds of corruption are not present in the test file.

Does anybody have an alternate count? Or an idea?

And this version, with the exceptions in, gave the same:

      #define char_in_range(x,a,b) (x>=a && x<=b)
      #define char_mb(x) char_in_range(x,0x80,0xbf)

      unsigned char *c = (unsigned char *)getstr(rawtsvalue(rb));
      unsigned char *q = c + tsvalue(rb)->len;
      size_t count = 0;

      while(c < q) {
     
        if (c[0]<=127) c++;
        else if (char_in_range(c[0],0xC2,0xDF) && char_mb(c[1])) c+=2;
        else if (char_in_range(c[0],0xE0,0xEF) && char_mb(c[1]) && char_mb(c[2])) c+=3;
        else if (char_in_range(c[0],0xF0,0xF4) && char_mb(c[1]) && char_mb(c[2]) && char_mb(c[3])) c+=4;
        else if (c[0]==0xe4 || 
                 c[0]==0xe5 ||
                 c[0]==0xf6 ||
                 c[0]==0xc4 ||
                 c[0]==0xc5 ||
                 c[0]==0xd6) c++;
        else { count--; c++; }
        count++;
      }

I am not sure what the exceptions are meant to help with. Allowing a mix of UTF-8 and extended ASCII? Is that even useful in any case?

Thanks,
Henning

(This is not a char counter; it is a character extractor, but you should be able to
modify it if you want:)


/*
 * Return 1 when a <= x <= b (both bounds inclusive), otherwise 0.
 *
 * Declared `static inline`: in C99 and later a plain `inline` definition
 * provides no external definition, so any call the compiler chooses not to
 * inline would fail at link time.
 */
static inline int in_range(int x,int a,int b)
{
  return x>=a && x<=b;
}

/*
 * Return 1 when x is a UTF-8 continuation byte (0x80..0xBF), otherwise 0.
 */
static inline int mb(int x)
{
  return in_range(x,0x80,0xbf);
}


/*
 * Assemble a code point from `count` bytes starting at `ptr`.
 *
 * ptr   - first byte of the sequence; the bytes are only read.
 * mask  - bits to keep from the first byte (its payload bits).
 * count - number of bytes to consume; 0 or less yields 0.
 *
 * Every byte after the first contributes its low 6 bits (mask 0x3f).
 * No validation is performed here.
 *
 * Declared `static inline`: in C99 and later a plain `inline` definition
 * provides no external definition, so any call the compiler chooses not to
 * inline would fail at link time.
 */
static inline unsigned decode(const unsigned char *ptr,unsigned mask,int count)
{
  unsigned res=0;
  for(int i=0;i<count;i++){
    res = (res << 6) | (ptr[i] & mask);
    mask=0x3f;  /* all bytes after the first carry 6 payload bits */
  }
  return res;
}

/*
 * Extract one character from the byte string at *ptr and advance *ptr past
 * the bytes consumed.
 *
 * Returns the decoded code point for a well-formed UTF-8 sequence of one to
 * four bytes.  The second byte of three- and four-byte sequences is checked
 * against the narrowed ranges of RFC 3629, so overlong forms (E0 80..9F,
 * F0 80..8F), UTF-16 surrogates (ED A0..BF) and values above U+10FFFF
 * (F4 90..BF) are treated as invalid instead of being decoded.
 *
 * Invalid input trips assert(0).  When assertions are compiled out (NDEBUG)
 * one byte is consumed and either the byte itself (for the lone bytes listed
 * below) or '*' is returned.
 *
 * No length is passed in.  The lookahead stops at the first byte that is not
 * a continuation byte, so it stays in bounds as long as the input ends in
 * such a byte -- assumes a NUL-terminated string.  A NUL at *ptr is returned
 * as 0 and *ptr moves past it, so the caller has to check for the end.
 */
static unsigned utf_letter(const char **ptr)
{
  unsigned char *c=(unsigned char *)(*ptr);
  unsigned res;
  int skip;

  if (c[0]<=127){
    res=c[0];
    skip=1;
  }else if (in_range(c[0],0xC2,0xDF) && mb(c[1])){
    res=decode(c,0x1f,2);
    skip=2;
  }else if (in_range(c[0],0xE0,0xEF) &&
            /* E0 needs A0..BF (no overlongs); ED needs 80..9F (no surrogates) */
            in_range(c[1], c[0]==0xE0 ? 0xA0 : 0x80, c[0]==0xED ? 0x9F : 0xBF) &&
            mb(c[2])){
    res=decode(c,0xf,3);
    skip=3;
  }else if (in_range(c[0],0xF0,0xF4) &&
            /* F0 needs 90..BF (no overlongs); F4 needs 80..8F (<= U+10FFFF) */
            in_range(c[1], c[0]==0xF0 ? 0x90 : 0x80, c[0]==0xF4 ? 0x8F : 0xBF) &&
            mb(c[2]) && mb(c[3])){
    res=decode(c,0x7,4);
    skip=4;
  }else if (c[0]==0xe4 ||
	    c[0]==0xe5 ||
	    c[0]==0xf6 ||
	    c[0]==0xc4 ||
	    c[0]==0xc5 ||
	    c[0]==0xd6){
    /* Caution: this branch is not UTF-8.  It would pass these lone bytes
       through unchanged (presumably Latin-1 letters -- confirm).  The assert
       rejects them for strict UTF-8 compatibility; remove it to allow the
       mixed input. */
    assert(0);
    res=c[0];
    skip=1;
  }else{
    /* Anything else is invalid: substitute '*' and resynchronize one byte on. */
    assert(0);
    res='*';
    skip=1;
  }
  *ptr += skip;
  return res;
}