Below is an implementation of the "tokens" test for APLC that counts the number of tokens in a Lua 5.0.2 script file. It should correctly count all allowed tokens, where:

- (Multi-line) strings and (multi-line) comments each count as a single token.
- Each + and - sign always counts as a single token, so -5 is not a single number but consists of two tokens.
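
For example, applying these rules to the line

	local n = -5 -- init

gives six tokens: "local", "n", "=", "-", "5", and the comment.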

The lexer below only *counts* tokens, so in its current form it is not suited to feed a syntactic parser, but it should not be hard to adapt.
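
For instance (just a sketch, with illustrative names), token() could collect lexemes instead of merely counting them:

	local tokens = {}
	local function token(text)
		table.insert(tokens, text)
	end

Each call site would then pass the matched substring, which the lexer mostly has at hand (e.g. string.sub(chunk, index, t)); tokens that span chunks would need some extra buffering.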

Since this is a fairly tricky business, would someone with some Lua lexer knowledge care to run it through some tests? It has not been submitted to the APLC site yet.

--
Wim



-- print a short description if no filename is provided
if not arg[1] then
	print("usage: " .. arg[0] .. [[ filename

counts the number of tokens in a Lua script file.]])
	return
end


-- parse file in chunks of this size
local chunk_size = 1024

-- total tokens
local count = 0

-- report a token
local function token()
	count = count + 1
end

-- forward declaration of main lexer
local lex

-- forward declaration of current parse state
local parse

-- helper to parse to first newline (for single-line comments)
local function single_line(chunk, index, more)
	local _, t = string.find(chunk, "\n", index)
	if t then
		token()
		parse = lex
		return lex(chunk, t + 1, more)
	elseif more then
		parse = single_line
		return ""
	else
		token()
	end
end

-- nesting level in multi-line comments and strings
local level

-- helper to parse multi-line token (string or comment)
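-- (long brackets nest in Lua 5.0: [[ a [[ b ]] c ]] is a single
-- token, hence the nesting counter)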
local function multi_line(chunk, index, more)
	local len = string.len(chunk)
	while index < len do
		local _, t, c = string.find(chunk, "([%[%]][%[%]])", index)
		if not c then index = len
		elseif c == "[[" then
			level = level + 1
			index = t + 1
		elseif c == "]]" then
			level = level - 1
			index = t + 1
			if level == 0 then
				token()
				parse = lex
				return lex(chunk, index, more)
			end
		else index = t
		end
	end

	if more then
		parse = multi_line
		return string.sub(chunk, index)
	end

	error "]] expected"
end

-- quote character that opened the current string token
local quote

-- helper to parse a quoted string
local function quoted_string(chunk, index, more)
	local len = string.len(chunk)
	local pat = "([\\\n" .. quote .. "])"
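	-- find the earliest backslash, newline, or closing quote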
	while index <= len do
		local _, c
		_, index, c = string.find(chunk, pat, index)
		if not c then index = len + 1
		elseif c == "\n" then
			error("unexpected newline: " .. quote .. " expected")
		elseif c == "\\" then
			if index < len then index = index + 2
			else break
			end
		elseif c == quote then
			token()
			parse = lex
			return lex(chunk, index + 1, more)
		end
	end

	if more then
		parse = quoted_string
		return string.sub(chunk, index)
	else
		error("unexpected end of file: " .. quote .. " expected")
	end
end

-- main lexer function.
-- try to parse as many tokens as possible from the chunk.
-- returns a remaining tail of the chunk that has to be
-- re-examined, or a false value if there is nothing left to do.
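-- the `more` flag indicates whether further input may follow;
-- if true, a token that might continue past the end of the
-- chunk is returned as a tail instead of being counted.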
function lex(chunk, index, more)
	local len = string.len(chunk)
	if index > len then
		-- more input needed
		return more and ""
	elseif len - index < 3 and more then
		-- more lookahead needed: the longest fixed prefix
		-- tested below ("--[[") is four characters long
		return string.sub(chunk, index)
	end

	-- skip white space
	local _, t = string.find(chunk, "^%s+", index)
	if t then return lex(chunk, t + 1, more) end

	-- keywords and identifiers
	_, t = string.find(chunk, "^[_%a][_%w]*", index)
	if t == len and more then
		return string.sub(chunk, index)
	elseif t then
		token()
		return lex(chunk, t + 1, more)
	end

	-- numbers
	_, t = string.find(chunk, "^[0-9]+%.?[0-9]*", index)
	if not t then
		_, t = string.find(chunk, "^%.?[0-9]+", index)
	end

	if t then
		-- "1e" or "1e+" at the chunk end may be a partial
		-- exponent, so wait for more input before deciding
		if more and len - t < 3 then return string.sub(chunk, index) end
		local _, e = string.find(chunk, "^[eE][+-]?[0-9]+", t + 1)
		t = e or t

		if t == len and more then
			return string.sub(chunk, index)
		end
		token()
		return lex(chunk, t + 1, more)
	end

	-- special tokens: comments and strings need their own
	-- parse states, since they may span several chunks
	if string.find(chunk, "^[%-%[%'%\"]", index) then
		if string.find(chunk, "^%-%-%[%[", index) then
			level = 1
			return multi_line(chunk, index + 4, more)
		elseif string.find(chunk, "^%-%-", index) then
			return single_line(chunk, index + 2, more)
		elseif string.find(chunk, "^%[%[", index) then
			level = 1
			return multi_line(chunk, index + 2, more)
		elseif string.find(chunk, "^%'", index) then
			quote = "'"
			return quoted_string(chunk, index + 1, more)
		elseif string.find(chunk, '^%"', index) then
			quote = '"'
			return quoted_string(chunk, index + 1, more)
		end
	end

	-- multi-character tokens: ~=, ==, <=, >=
	if not t then
		_, t = string.find(chunk, "^[%~%=%<%>]=", index)
	end

	if not t then
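		-- ".." (concatenation) and "..." (vararg)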
		_, t = string.find(chunk, "^%.%.%.?", index)
	end

	-- single char tokens
	if not t then
		_, t = string.find(chunk, "^[%^%*%(%)%-%+%=%{%}%[%]%:%;%<%>%,%.%/]", index)
	end

	if t then
		token()
		return lex(chunk, t + 1, more)
	end

	-- still no match?
error("unrecognised token: " .. string.sub(chunk, index, index + 5) .. "...")
end

-- start in the main lexer state
parse = lex

-- read and parse chunks from specified file
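-- any unconsumed tail returned by the lexer is prepended to the
-- next chunk, so tokens may span chunk boundaries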
local function start()
	io.input(arg[1])

	local chunk = io.read(chunk_size)
	-- an empty file yields nil from io.read; nothing to count then
	local res = chunk and parse(chunk, 1, true)
	while res do
		chunk = io.read(chunk_size)
		if chunk then
			res = parse(res .. chunk, 1, true)
		else
			res = parse(res, 1, false)
		end
	end
end

-- start parsing the file and catch any errors
local rc, err = pcall(start)

if rc then
	print(count .. " tokens")
else
	print(err)
end
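
To try it out, save the script (the file name here is just an example) and run it with a Lua 5.0 interpreter:

	lua tokens.lua some_script.lua

It prints the total in the form "<count> tokens".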