|
Hi Eero, Peter, Paul, Jo -- On 1/6/11 8:34 PM, Eero Pajarre wrote: "My own utf-8 character extraction code follows." I adapted Eero's version (original at bottom) to: #define char_in_range(x,a,b) (x>=a && x<=b) #define char_mb(x) char_in_range(x,0x80,0xbf) unsigned char *c = (unsigned char *)getstr(rawtsvalue(rb)); unsigned char *q = c + tsvalue(rb)->len; size_t count = 0; while(c < q) { if (c[0]<=127) c++; else if (char_in_range(c[0],0xC2,0xDF) && char_mb(c[1])) c+=2; else if (char_in_range(c[0],0xE0,0xEF) && char_mb(c[1]) && char_mb(c[2])) c+=3; else if (char_in_range(c[0],0xF0,0xF4) && char_mb(c[1]) && char_mb(c[2]) && char_mb(c[3])) c+=4; else { count--; c++; } count++; } Funnily enough, it yields the same result on that test file: 37,075, which means that certain cases of corruption are not present in the test file, I guess. Does anybody have an alternate count? Or an idea? And this version, with the exceptions in, gave the same count: #define char_in_range(x,a,b) (x>=a && x<=b) #define char_mb(x) char_in_range(x,0x80,0xbf) unsigned char *c = (unsigned char *)getstr(rawtsvalue(rb)); unsigned char *q = c + tsvalue(rb)->len; size_t count = 0; while(c < q) { if (c[0]<=127) c++; else if (char_in_range(c[0],0xC2,0xDF) && char_mb(c[1])) c+=2; else if (char_in_range(c[0],0xE0,0xEF) && char_mb(c[1]) && char_mb(c[2])) c+=3; else if (char_in_range(c[0],0xF0,0xF4) && char_mb(c[1]) && char_mb(c[2]) && char_mb(c[3])) c+=4; else if (c[0]==0xe4 || c[0]==0xe5 || c[0]==0xf6 || c[0]==0xc4 || c[0]==0xc5 || c[0]==0xd6) c++; else { count--; c++; } count++; } I am not sure what the exceptions are meant to help with. Allowing a mix of UTF-8 and extended ASCII? Is that even useful in any case? 
/*
 * Thanks, Henning
 *
 * (This is not a char counter; it is a character extractor, but you should
 * be able to modify it if you want.)
 */

/* Returns nonzero when a <= x <= b. */
static inline int in_range(int x, int a, int b)
{
    return x >= a && x <= b;
}

/* Returns nonzero when x is a UTF-8 continuation byte (0x80..0xBF). */
static inline int mb(int x)
{
    return in_range(x, 0x80, 0xbf);
}

/*
 * Assembles a code point from `count` bytes starting at `ptr`.
 *
 * `mask` selects the payload bits of the lead byte; every following byte
 * contributes its low six bits.
 */
static inline unsigned decode(const unsigned char *ptr, unsigned mask, int count)
{
    unsigned res = 0;

    for (int i = 0; i < count; i++) {
        res <<= 6;
        res |= ptr[i] & mask;
        mask = 0x3f; /* continuation bytes carry six payload bits */
    }
    return res;
}

/*
 * Extracts one character from the string at *ptr and advances *ptr past it.
 *
 * ptr: address of a pointer into the text; on return it has been moved
 *      forward by the number of bytes consumed (1 to 4).
 *
 * Returns the decoded code point for a well-formed UTF-8 sequence. Overlong
 * encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF are
 * treated as malformed. Malformed input trips assert(0); when assertions are
 * compiled out, one byte is consumed and the result is either that byte
 * (for the six listed non-UTF-8 bytes) or '*'.
 *
 * There is no length argument: trailing bytes are tested left to right with
 * short-circuit &&, so a NUL terminator fails the continuation test and the
 * bytes after it are never read.
 */
static unsigned utf_letter(const char **ptr)
{
    const unsigned char *c = (const unsigned char *)(*ptr);
    unsigned res;
    int skip = 1;

    if (c[0] <= 0x7f) {
        res = c[0];
    } else if (in_range(c[0], 0xC2, 0xDF) && mb(c[1])) {
        res = decode(c, 0x1f, 2);
        skip = 2;
    } else if (in_range(c[0], 0xE0, 0xEF)
               /* E0 needs A0..BF (no overlongs); ED needs 80..9F (no surrogates) */
               && in_range(c[1], c[0] == 0xE0 ? 0xA0 : 0x80,
                                 c[0] == 0xED ? 0x9F : 0xBF)
               && mb(c[2])) {
        res = decode(c, 0xf, 3);
        skip = 3;
    } else if (in_range(c[0], 0xF0, 0xF4)
               /* F0 needs 90..BF (no overlongs); F4 needs 80..8F (<= U+10FFFF) */
               && in_range(c[1], c[0] == 0xF0 ? 0x90 : 0x80,
                                 c[0] == 0xF4 ? 0x8F : 0xBF)
               && mb(c[2]) && mb(c[3])) {
        res = decode(c, 0x7, 4);
        skip = 4;
    } else if (c[0] == 0xe4 || c[0] == 0xe5 || c[0] == 0xf6 ||
               c[0] == 0xc4 || c[0] == 0xc5 || c[0] == 0xd6) {
        /*
         * Caution: this part is not UTF-8; you should assert here if you
         * just want to be compatible. Without the assertion the raw byte is
         * passed through unchanged.
         */
        assert(0);
        res = c[0];
    } else {
        /* Any other malformed byte is reported as '*'. */
        assert(0);
        res = '*';
    }

    *ptr += skip;
    return res;
}