Read Defined Chunks

lua-users home
wiki

Difference (from prior major revision) (minor diff, author diff)

Changed: 1,2c1,2
With this function you can read defined chunks (until a given delimiter is found) from an file or from io.stdin
Its a complete rewrite of the first version: now its fast, doesn't concatenate strings unnecessary, needs less memory and is flexible.
With this function you can read defined chunks (until a given delimiter is found) from an file or from io.stdin.
It's a complete rewrite of a prior version: now its fast, doesn't concatenate strings unnecessary, needs less memory and is flexible.

Changed: 4c4
Please note a simple non Lua standard enhancement: I use the number variable lua.maxread to set the chunksizes used for io.reads(chunk of bytes) at a central point. Replace this variable by your preferred chunksize e.g. 2^13 for 8KB
Please note a simple non Lua standard enhancement: I use the number variable lua.maxread to set the chunksizes used for io.reads(chunk of bytes) at a central point. Replace this variable by your preferred chunksize e.g. 2^13 for 8KB.

Changed: 6c6,12
{{{
{{{!Lua
-- a simple example without using all the specials
local Handle=io.open('File','r')
local ReadUntil?=io.readuntil(Handle)
repeat
Line=coroutine.resume(ReadUntil?,'\n',true)
if Line then

Changed: 8c14,43
function io.readuntil(Filehandle,Delimiter,Collect,Limit)
end
until Line==nil
}}}


-- another example
local Handle = io.open('File', 'r')
local ReadUntil = io.readuntil(Handle)
local Chunk, Found
repeat
_,Chunk, Found = coroutine.resume(ReadUntil,
'search this string in a huge file',false)
if Found then
_,Chunk, Found = coroutine.resume(ReadUntil,
'search another string in the same file',true)
if Found then break end
end
until Chunk == nil

-- Now if chunk ~= nil, then chunk is the stuff between
-- 'search this string in a huge file' and 'search another
-- string in the same file'. Yes, it's possible to do the same
-- very simply, but the advantage here is that the large file
-- isn't loaded at once into memory.



Code:

{{{!Lua
function io.readuntil(Filehandle, Delimiter, Collect, Limit)

Changed: 11,14c46,53
-- Delimiter (string, optional); max. length is lua.maxread; optional because coroutine.resume() also accepts <delimiter>
-- Collect (boolean, optional) = true (default); read until <delimiter> is found or end of file or <limit> is reached and return string at once
-- = false; return string also before <delimiter> is found or end of file or <limit> is reached
-- Limit (number, optional); number of bytes to read from <filehandle>; default is unlimited
-- Delimiter (string, optional); max. length is lua.maxread;
-- optional because coroutine.resume() also accepts <delimiter>
-- Collect (boolean, optional) = true (default); read until
-- <delimiter> is found or end of file or <limit> is reached
-- and return string at once = false; return string also before
-- <delimiter> is found or end of file or <limit> is reached
-- Limit (number, optional); number of bytes to read from
-- <filehandle>; default is unlimited

Changed: 16c55
-- <function>=cooroutine.resume(Function,Delimiter,Collect)
-- <function> = cooroutine.resume(Function, Delimiter, Collect)

Changed: 23c62,63
-- = false; an error occured and the second argument returned is the errormessage
-- = false; an error occured and the second
-- argument returned is the errormessage

Changed: 30,40c70,71
-- = true the end of file is reached and the next coroutine.resume returns true,nil(,nil)

if type(Delimiter)=='boolean' then
Collect,Delimiter = Delimiter,Collect
end
if type(Delimiter)=='number' then
Limit,Delimiter = Delimiter,nil
end
if type(Collect)=='number' then
Limit,Collect = Collect,nil
end
-- = true the end of file is reached and the next
-- coroutine.resume returns true,nil(,nil)

Changed: 42c73,81
return coroutine.create(function(NewDelimiter?,NewCollect?)
if type(Delimiter) == 'boolean' then
Collect,Delimiter = Delimiter,Collect
end
if type(Delimiter) == 'number' then
Limit,Delimiter = Delimiter,nil
end
if type(Collect) == 'number' then
Limit,Collect = Collect,nil
end

Changed: 44c83
local Next=function(NewDelimiter?,NewCollect?)
return coroutine.create(function(NewDelimiter?,NewCollect?)

Changed: 46c85,86
if type(NewDelimiter?)=='boolean' then
local Next = function(NewDelimiter?,NewCollect?)
if type(NewDelimiter?) == 'boolean' then

Changed: 48,62c88,102
end

return NewDelimiter? or Delimiter,NewCollect? or Collect

end

Delimiter,Collect = Next(NewDelimiter?,NewCollect?)

local Chunksize,Chunk,Length,First,Second,SearchFrom?,GetFrom?,FoundFrom?,FoundTo? = lua.maxread,{},0,1,2,1,1

if Limit and Length+Chunksize>Limit then
Chunk[First]=Limit-Length>0 and Filehandle:read(Limit-Length)
else
Chunk[First]=Filehandle:read(Chunksize)
end
end
return NewDelimiter? or Delimiter,NewCollect? or Collect
end

Delimiter,Collect = Next(NewDelimiter?,NewCollect?)

local Chunksize,Chunk,Length,First,Second,SearchFrom?,
GetFrom?,FoundFrom?,FoundTo? =
lua.maxread,{},0,1,2,1,1

if Limit and Length+Chunksize>Limit then
Chunk[First] = Limit-Length>0 and Filehandle:read(Limit-Length)
else
Chunk[First] = Filehandle:read(Chunksize)
end

Changed: 64,65c104,105
if Chunk[First] then
Length=Length+string.len(Chunk[First])
if Chunk[First] then
Length = Length + string.len(Chunk[First])

Changed: 67c107,110
while true do
while true do
if string.len(Delimiter)>Chunksize then
error('io.readuntil: delimiter to long')
end

Changed: 69,70c112,123
if string.len(Delimiter)>Chunksize then
error('io.readuntil: delimiter to long')
FoundFrom?,FoundTo? = string.find(
Chunk[First],Delimiter,SearchFrom?,true)
if FoundFrom? then
-- delimiter found in first chunk
Delimiter,Collect = Next(coroutine.yield(
string.sub(Chunk[First],GetFrom?,FoundFrom?-1),true))
SearchFrom?,GetFrom? = FoundTo?+1,FoundTo?+1
else
if Limit and Length+Chunksize > Limit then
Chunk[Second] = Limit-Length>0 and Filehandle:read(Limit-Length)
else
Chunk[Second] = Filehandle:read(Chunksize)

Changed: 73,78c126,127
FoundFrom?,FoundTo? = string.find(Chunk[First],Delimiter,SearchFrom?,true)
if FoundFrom? then
-- delimiter found in first chunk
Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom?,FoundFrom?-1),true))
SearchFrom?,GetFrom? = FoundTo?+1,FoundTo?+1
else
if Chunk[Second] then
Length = Length + string.len(Chunk[Second])

Changed: 80,81c129,144
if Limit and Length+Chunksize>Limit then
Chunk[Second]=Limit-Length>0 and Filehandle:read(Limit-Length)
-- concatenate end of first chunk with start of
-- second chunk so that a possible splitted delimiter
-- must be found
FoundFrom?,FoundTo? = string.find(
string.sub(Chunk[First],
string.len(Chunk[First])-string.len(Delimiter)+2) ..
string.sub(Chunk[Second],1,string.len(Delimiter)-1),
Delimiter,1,true)
if FoundFrom? then
-- delimiter is splitted between first and second chunk
Delimiter,Collect = Next(coroutine.yield(
string.sub(Chunk[First],GetFrom?,string.len(Chunk[First])-
string.len(Delimiter)+FoundFrom?), true
))
First,Second = Second,First
SearchFrom?,GetFrom? = FoundFrom?+1,FoundFrom?+1

Changed: 83,95c146,149
Chunk[Second]=Filehandle:read(Chunksize)
end

if Chunk[Second] then
Length=Length+string.len(Chunk[Second])

-- concatenate end of first chunk with start of second chunk so that a possible splitted delimiter must be found
FoundFrom?,FoundTo? = string.find(string.sub(Chunk[First],string.len(Chunk[First])-string.len(Delimiter)+2)..string.sub(Chunk[Second],1,string.len(Delimiter)-1),Delimiter,1,true)
if FoundFrom? then
-- delimiter is splitted between first and second chunk
Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom?,string.len(Chunk[First])-string.len(Delimiter)+FoundFrom?),true))
First,Second = Second,First
SearchFrom?,GetFrom? = FoundFrom?+1,FoundFrom?+1
-- delimiter isn't splitted between first and second chunk
if Collect then
SearchFrom? = string.len(Chunk[First])+1
Chunk[First] = Chunk[First]..Chunk[Second]

Changed: 97,106c151,153
-- delimiter isn't splitted between first and second chunk
if Collect then
SearchFrom?=string.len(Chunk[First])+1
Chunk[First]=Chunk[First]..Chunk[Second]
else
if string.len(Chunk[First])>=GetFrom? then
Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom?),false))
end
First,Second = Second,First
SearchFrom?,GetFrom? = 1,1
if string.len(Chunk[First]) >= GetFrom? then
Delimiter,Collect = Next(coroutine.yield(
string.sub(Chunk[First],GetFrom?),false))

Added: 107a155,156
First,Second = Second,First
SearchFrom?,GetFrom? = 1,1

Removed: 109,111d157
else
-- no delimiter found and no further input
break

Added: 112a159,161
else
-- no delimiter found and no further input
break

Removed: 115,120d163

if string.len(Chunk[First])>=GetFrom? then
-- return rest of first chunk
coroutine.yield(string.sub(Chunk[First],GetFrom?),false)
end


Changed: 123,126c166,172
end)

-- return (thread); a coroutine

if string.len(Chunk[First]) >= GetFrom? then
-- return rest of first chunk
coroutine.yield(string.sub(Chunk[First],GetFrom?),false)
end
end
end)
-- return (thread); a coroutine

Added: 128a175,176

-- MarkusHuber

With this function you can read defined chunks (up to a given delimiter) from a file or from io.stdin. It's a complete rewrite of a prior version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible. The purpose is to process a) files of many megabytes and b) mixed-format input, e.g. MIME multipart messages, which are a mix of \r\n-terminated lines and binary data. Please note a simple non-standard Lua enhancement: I use the number variable lua.maxread to set the chunk size used for io.read(number of bytes) at a central point. Replace this variable with your preferred chunk size, e.g. 2^13 for 8 KB.

-- a simple example without using all the specials
-- Reads 'File' line by line; every '\n' terminated piece arrives in Line.
local Handle = assert(io.open('File', 'r'))
local ReadUntil = io.readuntil(Handle)
local Ok, Line
repeat
   -- coroutine.resume() returns the status first and the chunk second;
   -- taking only the first value would never see the nil that marks the end
   Ok, Line = coroutine.resume(ReadUntil, '\n', true)
   if not Ok then
      -- on failure the second value is the error message
      Handle:close()
      error(Line)
   end
   if Line then
      -- process Line here
   end
until Line == nil
Handle:close()

-- another example
-- Skips everything up to the first search string (Collect = false, so the
-- skipped data is handed over piecewise and can be thrown away) and then
-- collects everything up to the second search string (Collect = true).
local Handle = assert(io.open('File', 'r'))
local ReadUntil = io.readuntil(Handle)
local Ok, Chunk, Found
repeat
   Ok, Chunk, Found = coroutine.resume(ReadUntil,
                   'search this string in a huge file', false)
   if Ok and Found then
      Ok, Chunk, Found = coroutine.resume(ReadUntil,
                      'search another string in the same file', true)
      if Ok and Found then break end
   end
   if not Ok then
      -- on failure the second value is the error message; without this
      -- check the loop would spin forever on the dead coroutine
      Handle:close()
      error(Chunk)
   end
until Chunk == nil
-- nothing more is read in this example, so the file can be closed
Handle:close()

-- Now if Chunk ~= nil, then Chunk is the stuff between
-- 'search this string in a huge file' and 'search another
-- string in the same file'. Yes, it's possible to do the same
-- very simply, but the advantage here is that the large file
-- isn't loaded at once into memory.

Code:

function io.readuntil(Filehandle, Delimiter, Collect, Limit)

-- Creates a coroutine that reads <filehandle> chunk by chunk and hands
-- back, on every coroutine.resume(), the data in front of the next
-- <delimiter>. The file is never loaded into memory as a whole.

-- Filehandle (userdata); only its :read(<number of bytes>) method is used
-- Delimiter (string, optional); max. length is the chunk size;
--   optional because coroutine.resume() also accepts <delimiter>
-- Collect (boolean, optional) = true (default); read until
--   <delimiter> is found or end of file or <limit> is reached
--   and return string at once
--                             = false; return string also before
--   <delimiter> is found or end of file or <limit> is reached
-- Limit (number, optional); number of bytes to read from
--   <filehandle>; default is unlimited

-- The chunk size is taken from the global number lua.maxread when the
-- coroutine is resumed for the first time; 8192 bytes are used if that
-- variable is not set.

-- <function> = coroutine.resume(Function, Delimiter, Collect)

-- Function (thread); returned from io.readuntil()
-- Delimiter (string, optional); see io.readuntil()
-- Collect (boolean, optional); see io.readuntil()
--   omitted values keep the setting of the previous call

-- return (boolean); = true; no error
--                   = false; an error occurred and the second
--                            argument returned is the errormessage
--        (string or nil) = nil; end of file
--        (boolean) = true; delimiter found
--                  = false; delimiter not found

-- note: if the coroutine returns true,<string>,false then
--          if <collect> = false it does not have to be the end of file
--                       = true  the end of file is reached and the next
--                               coroutine.resume returns true,nil(,nil)

-- errors (reported through coroutine.resume): delimiter missing,
-- delimiter empty, delimiter longer than the chunk size

   -- the optional arguments may be left out, so sort them by type
   if type(Delimiter) == 'boolean' then
      Collect,Delimiter = Delimiter,Collect
   end
   if type(Delimiter) == 'number' then
      Limit,Delimiter = Delimiter,nil
   end
   if type(Collect) == 'number' then
      Limit,Collect = Collect,nil
   end
   if Collect == nil then
      -- documented default; tested against nil because false is a valid value
      Collect = true
   end

   return coroutine.create(function(NewDelimiter,NewCollect)

      -- merges the arguments of a coroutine.resume() into the current
      -- settings; only nil keeps the old value, so that an explicit
      -- false really switches <collect> off again
      local function Next(NewDelimiter,NewCollect)
         if type(NewDelimiter) == 'boolean' then
            NewCollect,NewDelimiter = NewDelimiter,nil
         end
         if NewDelimiter == nil then
            NewDelimiter = Delimiter
         end
         if NewCollect == nil then
            NewCollect = Collect
         end
         return NewDelimiter,NewCollect
      end

      Delimiter,Collect = Next(NewDelimiter,NewCollect)

      -- rawget avoids trouble with environments that forbid reading
      -- undeclared globals
      local Settings = rawget(_G,'lua')
      local Chunksize = type(Settings) == 'table' and Settings.maxread or 8192
      local Length = 0 -- number of bytes read from <filehandle> so far

      -- reads the next chunk and respects <limit>;
      -- returns nil at end of file or when <limit> is reached
      local function ReadChunk()
         local Bytes = Chunksize
         if Limit and Length+Chunksize > Limit then
            Bytes = Limit-Length
            if Bytes <= 0 then
               return nil
            end
         end
         local Data = Filehandle:read(Bytes)
         if Data then
            Length = Length + string.len(Data)
         end
         return Data
      end

      local First = ReadChunk()
      if not First then
         -- nothing to read at all
         return
      end

      -- SearchFrom: where the search for <delimiter> continues in First
      -- GetFrom:    first byte of First that was not handed back yet
      local SearchFrom,GetFrom = 1,1

      while true do
         if Delimiter == nil then
            error('io.readuntil: delimiter missing')
         end
         local DelimiterLength = string.len(Delimiter)
         if DelimiterLength == 0 then
            -- an empty delimiter matches everywhere without consuming input
            error('io.readuntil: delimiter is empty')
         end
         if DelimiterLength > Chunksize then
            error('io.readuntil: delimiter to long')
         end

         local FoundFrom,FoundTo = string.find(First,Delimiter,SearchFrom,true)
         if FoundFrom then
            -- delimiter found in first chunk
            Delimiter,Collect = Next(coroutine.yield(
                 string.sub(First,GetFrom,FoundFrom-1),true))
            SearchFrom,GetFrom = FoundTo+1,FoundTo+1
         else
            local Second = ReadChunk()
            if not Second then
               -- no delimiter found and no further input
               break
            end

            -- concatenate end of first chunk with start of second chunk
            -- so that a possible split delimiter must be found; the tail
            -- never starts before GetFrom, because those bytes were
            -- already handed back or belong to the previous delimiter
            local FirstLength = string.len(First)
            local TailFrom = math.max(FirstLength-DelimiterLength+2,GetFrom)
            local Tail = string.sub(First,TailFrom)
            local SplitFrom,SplitTo = string.find(
              Tail .. string.sub(Second,1,DelimiterLength-1),
              Delimiter,1,true)

            if SplitFrom then
               -- delimiter is split between first and second chunk
               Delimiter,Collect = Next(coroutine.yield(
                 string.sub(First,GetFrom,TailFrom+SplitFrom-2),true))
               -- continue behind the part of the delimiter that lies
               -- in the second chunk
               SearchFrom = SplitTo-string.len(Tail)+1
               GetFrom = SearchFrom
               First = Second
            elseif Collect then
               -- delimiter isn't split: keep collecting and search only
               -- the appended part
               SearchFrom = FirstLength+1
               First = First..Second
            else
               -- delimiter isn't split: hand back what is left of the
               -- first chunk and go on with the second one
               if FirstLength >= GetFrom then
                  Delimiter,Collect = Next(coroutine.yield(
                     string.sub(First,GetFrom),false))
               end
               First = Second
               SearchFrom,GetFrom = 1,1
            end
         end
      end

      if string.len(First) >= GetFrom then
         -- return rest of first chunk
         coroutine.yield(string.sub(First,GetFrom),false)
      end
   end)
   -- return (thread); a coroutine
end

-- MarkusHuber


RecentChanges · preferences
edit · history
Last edited May 28, 2007 10:01 pm GMT (diff)