lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Roberto Ierusalimschy wrote:
> We are deprecating the use of ctype inside the Lua core, (See for
> instance http://lua-users.org/lists/lua-l/2006-12/msg00155.html for a
> discussion about the subject.) Our current replacement does not have
> toupper (the obvious implementation as a macro would need another 257
> bytes), so we have to handle the lowercase "digits" explicitly.

Great, looks like we are heading in the same direction. I've
removed *all* NLS stuff from the VM core of LuaJIT 2.x. The libc
NLS functions only make sense for single-char locales and these
are becoming increasingly useless (every Linux distro released in
the past ~5 years is using a UTF-8 locale).

And you don't need an extra table for uppercasing. Please find
attached my own ctype replacement, carefully optimized to generate
good code. Apart from lj_ctype_toupper() and lj_ctype_tolower() it
also has the lj_ctype_isident() function used in the lexer.

The latter treats chars 128-255 as identifier characters, which works
out nicely in practice for both ISO-8859-* and UTF-8 locales. The main
lexer loop looks like this (paraphrased):

  for (;;) {  /* c holds the current character. */
    if (lj_ctype_isident(c)) {
      if (lj_ctype_isdigit(c))  /* Digits start a number */
        return read_number();
      /* Anything else starts an identifier or reserved word. */
      do {
        get_next_char();
      } while (lj_ctype_isident(c));
      return name_or_reserved();
    }
    switch (c) {  /* Handle all other chars. */
    case '\n': case '\r': ...
    case ' ': case '\t': case '\v': case '\f': ...
    case '-': ...
    ...
    }
  }

--Mike
/*
** Internal CTYPE replacement.
** Donated to the public domain.
*/

#ifndef _LJ_CTYPE_H
#define _LJ_CTYPE_H

#include "lj_def.h"

/*
** Character class flags. Every class owns exactly one bit, so all eight
** fit into a single byte and can be OR-ed together to form combined masks.
*/
#define LJ_CTYPE_CNTRL	(1 << 0)
#define LJ_CTYPE_SPACE	(1 << 1)
#define LJ_CTYPE_PUNCT	(1 << 2)
#define LJ_CTYPE_DIGIT	(1 << 3)
#define LJ_CTYPE_XDIGIT	(1 << 4)
#define LJ_CTYPE_UPPER	(1 << 5)
#define LJ_CTYPE_LOWER	(1 << 6)
#define LJ_CTYPE_IDENT	(1 << 7)

/* Combined masks: a character matches if it has any of the listed bits. */
#define LJ_CTYPE_ALPHA	(LJ_CTYPE_UPPER|LJ_CTYPE_LOWER)
#define LJ_CTYPE_ALNUM	(LJ_CTYPE_DIGIT|LJ_CTYPE_ALPHA)

/*
** Character class tests.
**
** Each macro looks up entry c+1 of lj_ctype_bits (so c == -1 selects
** entry 0) and masks it with the requested class bits. The result is zero
** if c has none of the bits and non-zero otherwise. The non-zero value is
** the matching bits themselves, not necessarily 1.
**
** Only pass -1 or 0..255 to these macros. Never pass a signed char!
** Any other value indexes outside the 257-entry table.
**
** The mask argument of lj_ctype_isa() is parenthesized in the expansion,
** so compound masks such as (A|B) are applied as a whole.
*/
#define lj_ctype_isa(c, t)	(lj_ctype_bits[(c)+1] & (t))
#define lj_ctype_iscntrl(c)	lj_ctype_isa((c), LJ_CTYPE_CNTRL)
#define lj_ctype_isspace(c)	lj_ctype_isa((c), LJ_CTYPE_SPACE)
#define lj_ctype_ispunct(c)	lj_ctype_isa((c), LJ_CTYPE_PUNCT)
#define lj_ctype_isdigit(c)	lj_ctype_isa((c), LJ_CTYPE_DIGIT)
#define lj_ctype_isxdigit(c)	lj_ctype_isa((c), LJ_CTYPE_XDIGIT)
#define lj_ctype_isupper(c)	lj_ctype_isa((c), LJ_CTYPE_UPPER)
#define lj_ctype_islower(c)	lj_ctype_isa((c), LJ_CTYPE_LOWER)
#define lj_ctype_isident(c)	lj_ctype_isa((c), LJ_CTYPE_IDENT)
#define lj_ctype_isalpha(c)	lj_ctype_isa((c), LJ_CTYPE_ALPHA)
#define lj_ctype_isalnum(c)	lj_ctype_isa((c), LJ_CTYPE_ALNUM)

/*
** ASCII case conversion without a second lookup table.
**
** Upper and lower case ASCII letters are 0x20 apart. LJ_CTYPE_UPPER is
** 0x20, so the raw result of the upper-case test is already the offset to
** add. LJ_CTYPE_LOWER is 0x40, so the lower-case test result is halved to
** get the offset to subtract. Characters outside the tested class yield
** an offset of zero and are returned unchanged.
**
** Both macros evaluate c twice: the argument must not have side effects.
** The same input restriction applies as for the class tests (-1 or 0..255).
*/
#define lj_ctype_tolower(c)	((c) + lj_ctype_isa((c), LJ_CTYPE_UPPER))
#define lj_ctype_toupper(c)	((c) - (lj_ctype_isa((c), LJ_CTYPE_LOWER) >> 1))

/*
** Class bits per character. The macros above read entry c+1 for character c,
** so entry 0 is the one used for c == -1 and entries 1..256 for 0..255.
*/
LUAI_DATA const uint8_t lj_ctype_bits[257];

#endif
/*
** Internal CTYPE replacement.
** Donated to the public domain.
**
** This is intended to replace the problematic libc single-byte NLS functions.
** These just don't make sense anymore with UTF-8 locales becoming the norm
** on POSIX systems. They never worked too well on Windows systems either,
** since hardly anyone bothered to call setlocale().
**
** Instead this table is hardcoded for ASCII, except for identifiers. These
** include the characters 128-255, too. This allows all non-ASCII chars to
** be used in identifiers in the lexer. This is a broad definition, but it
** works well in practice for both UTF-8 locales and most single-byte
** locales (such as ISO-8859-*).
**
** If you really need proper ctypes for UTF-8 strings, please use an add-on
** library such as slnunicode: http://luaforge.net/projects/sln/
*/

#define lj_ctype_c
#define LUA_CORE

#include "lj_ctype.h"

/*
** Class bits for every character, indexed by c+1: slot 0 belongs to c == -1
** and carries no class bits, slots 1..256 describe characters 0..255.
**
** Entries are written in hex so the LJ_CTYPE_* bits can be read off directly:
**   0x01 CNTRL   0x02 SPACE   0x04 PUNCT   0x08 DIGIT
**   0x10 XDIGIT  0x20 UPPER   0x40 LOWER   0x80 IDENT
*/
LUAI_DATADEF const uint8_t lj_ctype_bits[257] = {
  /* c == -1 */
  0x00,
  /* 0x00..0x0f: control chars; \t \n \v \f \r are CNTRL|SPACE. */
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x03,0x03,0x03,0x03,0x03,0x01,0x01,
  /* 0x10..0x1f: control chars. */
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
  /* 0x20..0x2f: blank (SPACE), then punctuation. */
  0x02,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x04,
  /* 0x30..0x3f: '0'..'9' are IDENT|XDIGIT|DIGIT, then punctuation. */
  0x98,0x98,0x98,0x98,0x98,0x98,0x98,0x98,0x98,0x98,0x04,0x04,0x04,0x04,0x04,0x04,
  /* 0x40..0x4f: '@', 'A'..'F' are IDENT|UPPER|XDIGIT, 'G'..'O' are IDENT|UPPER. */
  0x04,0xb0,0xb0,0xb0,0xb0,0xb0,0xb0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,
  /* 0x50..0x5f: 'P'..'Z', punctuation, '_' is IDENT|PUNCT. */
  0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0x04,0x04,0x04,0x04,0x84,
  /* 0x60..0x6f: '`', 'a'..'f' are IDENT|LOWER|XDIGIT, 'g'..'o' are IDENT|LOWER. */
  0x04,0xd0,0xd0,0xd0,0xd0,0xd0,0xd0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
  /* 0x70..0x7f: 'p'..'z', punctuation, DEL is CNTRL. */
  0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0x04,0x04,0x04,0x04,0x01,
  /* 0x80..0xff: every non-ASCII byte is IDENT only. */
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
  0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80
};