[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: UTF-8 testing
- From: Eero Pajarre <epajarre@...>
- Date: Thu, 6 Jan 2011 21:34:58 +0200
I did not actually see the counting code in the code snippet you posted.
Perhaps you meant something like this:
/* UTF-8 estimate */
unsigned char *p = (unsigned char *)getstr(rawtsvalue(rb));
unsigned char *q = p + tsvalue(rb)->len;
size_t count = 0;
while(p < q){
if(*p <= 127 || (*p >= 194 && *p <= 244)) /* this can be reversed */
count++;
p++;
}
Notice the addition of the count++ and moving the p++ outside of the
if statement.
.
In my own code I actually do more error checking, although I mostly
use it so that I can kludge in backwards compatibility with a subset of
the ISO Latin-1 character set.
My own UTF-8 character extraction code follows (this is not a character
counter; it is a character extractor, but you should be able to
modify it if you want):
/* Return nonzero when a <= x <= b (both bounds inclusive).
 *
 * Declared static: in C (C99 and later) a plain `inline` definition does
 * not provide an external definition, so any call the compiler chooses
 * not to inline would fail at link time. */
static inline int in_range(int x, int a, int b)
{
    return x >= a && x <= b;
}

/* Return nonzero when x is a UTF-8 continuation byte (0x80..0xBF). */
static inline int mb(int x)
{
    return in_range(x, 0x80, 0xbf);
}
/* Assemble a code point from `count` bytes starting at `ptr`.
 *
 * ptr   - first byte of the sequence (the lead byte); only read, never
 *         written, hence const
 * mask  - bits of the lead byte that carry payload (0x1f, 0x0f or 0x07
 *         for 2-, 3- and 4-byte sequences respectively)
 * count - number of bytes to consume; 0 yields 0
 *
 * Every byte after the first contributes its low 6 bits. No validation
 * is performed here; the caller is expected to have checked the bytes.
 *
 * Declared static: a plain `inline` definition in C provides no external
 * definition and can fail to link when the call is not inlined. */
static inline unsigned decode(const unsigned char *ptr, unsigned mask, int count)
{
    unsigned res = 0;
    for (int i = 0; i < count; i++) {
        res = (res << 6) | (ptr[i] & mask);
        mask = 0x3f; /* all bytes after the lead byte carry 6 payload bits */
    }
    return res;
}
/* Extract one character from the string at *ptr and advance *ptr past it.
 *
 * ptr - in/out: address of a pointer into a NUL-terminated byte string.
 *       On return it has been moved forward by the number of bytes
 *       consumed (1..4). Note that the terminating NUL is itself consumed
 *       as a one-byte character (returning 0), so the caller must stop
 *       when 0 is returned.
 *
 * Returns the decoded code point for a well-formed UTF-8 sequence.
 * Ill-formed input trips assert(0); when asserts are compiled out, a
 * small set of ISO Latin-1 letters is passed through as a single byte
 * and anything else yields '*', consuming one byte in both cases.
 *
 * Besides checking the continuation bytes, the second byte is range
 * checked so that overlong encodings (E0 80..9F, F0 80..8F), UTF-16
 * surrogates (ED A0..BF) and values above U+10FFFF (F4 90..BF) are
 * rejected instead of being decoded.
 *
 * The && chains read c[n+1] only after c[n] has been accepted, so the
 * scan never looks past the NUL terminator. */
static unsigned utf_letter(const char **ptr)
{
    int skip = 1;
    unsigned res = 0;
    unsigned char *c = (unsigned char *)(*ptr);

    if (c[0] <= 127) {
        skip = 1;
        res = c[0];
    } else if (in_range(c[0], 0xC2, 0xDF) && mb(c[1])) {
        res = decode(c, 0x1f, 2);
        skip = 2;
    } else if (in_range(c[0], 0xE0, 0xEF) && mb(c[1]) && mb(c[2]) &&
               !(c[0] == 0xE0 && c[1] < 0xA0) &&   /* overlong form */
               !(c[0] == 0xED && c[1] > 0x9F)) {   /* surrogate range */
        res = decode(c, 0xf, 3);
        skip = 3;
    } else if (in_range(c[0], 0xF0, 0xF4) && mb(c[1]) && mb(c[2]) && mb(c[3]) &&
               !(c[0] == 0xF0 && c[1] < 0x90) &&   /* overlong form */
               !(c[0] == 0xF4 && c[1] > 0x8F)) {   /* above U+10FFFF */
        res = decode(c, 0x7, 4);
        skip = 4;
    } else if (c[0] == 0xe4 || /* Caution: this part is not UTF-8; it is an
                                  ISO Latin-1 fallback. The assert below
                                  fires if you just want to be compatible. */
               c[0] == 0xe5 ||
               c[0] == 0xf6 ||
               c[0] == 0xc4 ||
               c[0] == 0xc5 ||
               c[0] == 0xd6) {
        assert(0);
        res = c[0];
        skip = 1;
    } else {
        assert(0);
        res = '*';
        skip = 1;
    }
    *ptr += skip;
    return res;
}