lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Hello,

I started writing an NLP toolkit in Lua under Windows some time ago and the 
development was going on perfectly. However, when "porting" to Linux, a 
strange problem started to occur in a piece of code that I wrote weeks ago 
and was running fine in Windows (actually, understanding why the bug takes 
place in only one OS seems to be my biggest problem).
It was a bit hard to isolate the problem, but I am attaching C and Lua code 
that replicates it. I have a host program written in C that, using glua, 
allows Lua code to call some GNU regex functions (it is needed for NLP, 
gsub() is not enough). The problem takes place here, when tokenizing a string 
with regular expression rules: in Windows it runs fast and without problems, 
in Linux it progressively slows down until crashing at around 
byte/character 50,000. I used the non-modified Lua 5.0 distribution in both 
Windows (98SE with MinGW) and Linux (kernel 2.4, gcc 3.2), tweaked a bit the 
'config' file for Lua, rewrote the function from scratch, but nothing. I am 
pretty sure the problem is not directly related to Lua but only to my code 
(the pretty obvious guess is that there is some memory problem -- maybe gc?) 
but no longer have any concrete ideas on how to solve it.
Any ideas? Thanks for the attention.

Tiago Tresoldi

---
/* C main.c, stripped to the code that matters */

#include <stdio.h>     /* printf/puts */
#include <stdlib.h>    /* free */
#include <string.h>    /* memset */

#include <sys/types.h> /* for regex, below */
#include <regex.h>

/* Lua-related includes */
#include <lua.h>
#include <lualib.h>
#include <lauxlib.h>
#include "glua.h" /* Enrico Colombini's glua */

glua_function(__regex)
  const char *string, *rule; /* read from inside Lua */
  int string_len, rule_len, start; /* read from inside Lua */
  struct re_pattern_buffer regex;
  struct re_registers regs;
  const char *s;

  re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;

  string     = glua_getString(1);
  rule       = glua_getString(2);
  string_len = glua_getNumber(3);
  rule_len   = glua_getNumber(4);
  start      = glua_getNumber(5);
  
  regs.num_regs = 1;
  memset (&regex, '\0', sizeof (regex));
  s = re_compile_pattern (rule, rule_len, &regex);
  if (s != NULL)
    {
      puts ("re_compile_pattern return non-NULL value");
      glua_pushNil();
    }
  else
    {
      glua_pushNumber( re_match (&regex, string, string_len, start, &regs) );
    }

  return 1;

}

/*----- List of functions visible from Lua with glua -----*/
/* NOTE(review): the Lua code calls regex() with 3 arguments while
 * __regex reads 5 stack slots (string_len/rule_len/start); presumably a
 * Lua-side wrapper supplies the lengths -- confirm against the host
 * setup, since reading absent arguments would yield garbage offsets. */
static glua_fitem flist[] = {
  glua_fn(__regex)
};

---

-- Lua Tokenization code

-- Tokenize `text` by repeatedly matching `rule` at the current position
-- via the host-provided regex() binding.  Returns the list of tokens.
--
-- Fixes over the original:
--   * a zero-length match (regex() returning 0) satisfied `match ~= -1`
--     but never advanced `start`, causing an infinite loop; any
--     non-positive result is now treated as "no match here"
--   * string.len(text) is computed once, not on every iteration
--   * the repeat/until body always ran at least once, so an empty
--     `text` made one spurious regex() call; a while loop avoids that
function RETokenizer (text, rule, source)
  local start    = 1
  local tlist    = { } -- tokens list
  local loc      = 1   -- temporary location index
  local text_len = string.len(text)

  while start <= text_len do
    local match = regex(text, rule, start)
    if match and match > 0 then
      local token_text = string.sub(text, start, start + match - 1)

      -- nlptk.location.new() and nlptk.token.new() return simple tables with
      -- a few fields; they are kept for parity and then replaced by empty
      -- tables for this bug-tracking purpose (as in the original)
      local l = nlptk.location.new(loc, loc + 1, "w", source)
      local t = nlptk.token.new(token_text, nil, l)
      l = { }
      t = { }

      table.insert(tlist, t)
      loc   = loc + 1
      start = start + match
    else
      -- no (usable) match at this position: advance one char and retry
      start = start + 1
    end
  end

  return tlist
end