Read Defined Chunks

lua-users home
wiki

With this function you can read defined chunks (until a given delimiter is found) from a file or from io.stdin. It's a complete rewrite of a prior version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible. The purpose is to process a) files with tons of megabytes, b) mixed-format input, e.g. MIME multipart messages, which are a mix of \r\n-terminated lines and binary data. Please note a simple enhancement that is not part of standard Lua: I use the number variable lua.maxread to set the chunk size used for the reads on the file handle (chunks of bytes) at a central point. Replace this variable with your preferred chunk size, e.g. 2^13 for 8KB.

-- a simple example without using all the specials
local Handle = assert(io.open('File','r'))
local ReadUntil = io.readuntil(Handle)
local Ok, Line
repeat
   -- coroutine.resume returns the status first; the yielded string
   -- is the second result and becomes nil at the end of the file
   Ok, Line = coroutine.resume(ReadUntil,'\n',true)
   if not Ok then
      -- on failure the second result is the error message
      error(Line)
   end
   if Line then
      -- process Line here
   end
until Line == nil
Handle:close()

-- another example
local Handle = assert(io.open('File', 'r'))
local ReadUntil = io.readuntil(Handle)
local Ok, Chunk, Found
repeat
   Ok, Chunk, Found = coroutine.resume(ReadUntil,
                   'search this string in a huge file',false)
   -- on failure Chunk holds the error message and would never become
   -- nil, so stop here instead of looping forever
   if not Ok then error(Chunk) end
   if Found then
      Ok, Chunk, Found = coroutine.resume(ReadUntil,
                      'search another string in the same file',true)
      if not Ok then error(Chunk) end
      if Found then break end
   end
until Chunk == nil

-- Now if Chunk ~= nil, then Chunk is the stuff between
-- 'search this string in a huge file' and 'search another
-- string in the same file'. Yes, it's possible to do the same
-- very simply, but the advantage here is that the large file
-- isn't loaded at once into memory.

Code:

function io.readuntil(Filehandle, Delimiter, Collect, Limit)

-- Creates a coroutine that reads <filehandle> piece by piece and yields
-- the data in front of each occurrence of <delimiter>.

-- Filehandle (userdata); only its :read(<number>) method is used
-- Delimiter (string, optional); max. length is the chunk size;
--   optional because coroutine.resume() also accepts <delimiter>
-- Collect (boolean, optional)
--   = true; read until <delimiter> is found or end of file or
--           <limit> is reached and return the string at once
--   = false or omitted; return a string also before <delimiter>
--           is found or end of file or <limit> is reached
-- Limit (number, optional); number of bytes to read from
--   <filehandle>; default is unlimited

-- Shortened argument lists are accepted: a boolean given as second
-- argument is taken as <collect>, a number given as second or third
-- argument is taken as <limit>.

-- The chunk size is read from the global lua.maxread on the first
-- resume; if that variable is not defined 8192 bytes are used.

-- <status>,<string>,<found> = coroutine.resume(Function, Delimiter, Collect)

-- Function (thread); returned from io.readuntil()
-- Delimiter (string, optional); replaces the current delimiter
-- Collect (boolean, optional); replaces the current collect mode;
--   may also be passed as the only argument after Function

-- return (boolean); = true; no error
--                   = false; an error occurred and the second
--                            value returned is the error message
--        (string or nil) = nil; end of file
--        (boolean) = true; delimiter found
--                  = false; delimiter not found

-- note: if the coroutine returns true,<string>,false then
--          if <collect> = false it does not have to be the end of file
--                       = true  the end of file is reached and the next
--                               coroutine.resume returns only true

   if type(Delimiter) == 'boolean' then
      Collect,Delimiter = Delimiter,Collect
   end
   if type(Delimiter) == 'number' then
      Limit,Delimiter = Delimiter,nil
   end
   if type(Collect) == 'number' then
      Limit,Collect = Collect,nil
   end

   return coroutine.create(function(NewDelimiter,NewCollect)

      -- merge the arguments of a resume into the current settings
      local Next = function(NewDelimiter,NewCollect)
         if type(NewDelimiter) == 'boolean' then
            NewCollect,NewDelimiter = NewDelimiter,nil
         end
         -- compare with nil: an explicit false has to switch
         -- collecting off again, "NewCollect or Collect" cannot do that
         if NewCollect == nil then
            NewCollect = Collect
         end
         return NewDelimiter or Delimiter,NewCollect
      end

      Delimiter,Collect = Next(NewDelimiter,NewCollect)

      -- lua.maxread is not part of standard Lua, so fall back to 8KB
      local Chunksize = type(lua) == 'table' and lua.maxread or 8192
      local Chunk,Length,First,Second,SearchFrom,GetFrom = {},0,1,2,1,1
      local FoundFrom,FoundTo

      -- read the next piece without exceeding <limit>; the result is
      -- false or nil when nothing more can be read
      local Read = function()
         if Limit and Length+Chunksize > Limit then
            return Limit-Length>0 and Filehandle:read(Limit-Length)
         end
         return Filehandle:read(Chunksize)
      end

      Chunk[First] = Read()

      if Chunk[First] then
         Length = Length + string.len(Chunk[First])

         while true do
            -- the delimiter can change with every resume, so it is
            -- checked on every pass
            if Delimiter == nil or Delimiter == '' then
               error('io.readuntil: delimiter missing or empty')
            end
            if string.len(Delimiter)>Chunksize then
               error('io.readuntil: delimiter to long')
            end

            FoundFrom,FoundTo = string.find(
                      Chunk[First],Delimiter,SearchFrom,true)
            if FoundFrom then
               -- delimiter found in first chunk
               Delimiter,Collect = Next(coroutine.yield(
                    string.sub(Chunk[First],GetFrom,FoundFrom-1),true))
               SearchFrom,GetFrom = FoundTo+1,FoundTo+1
            else
               Chunk[Second] = Read()

               if Chunk[Second] then
                  Length = Length + string.len(Chunk[Second])

                  -- concatenate end of first chunk with start of
                  -- second chunk so that a possibly split delimiter
                  -- is found; the window never starts before GetFrom,
                  -- otherwise bytes of a delimiter that was already
                  -- returned could be matched a second time
                  local WindowFrom = math.max(GetFrom,
                     string.len(Chunk[First])-string.len(Delimiter)+2)
                  local Head = string.sub(Chunk[First],WindowFrom)
                  FoundFrom,FoundTo = string.find(
                     Head..string.sub(Chunk[Second],1,string.len(Delimiter)-1),
                     Delimiter,1,true)
                  if FoundFrom then
                     -- delimiter is split between first and second chunk;
                     -- ResumeAt is the first byte after it in the second chunk
                     local ResumeAt = FoundTo-string.len(Head)+1
                     Delimiter,Collect = Next(coroutine.yield(
                        string.sub(Chunk[First],GetFrom,WindowFrom+FoundFrom-2),
                        true))
                     First,Second = Second,First
                     SearchFrom,GetFrom = ResumeAt,ResumeAt
                  else
                     -- delimiter isn't split between first and second chunk
                     if Collect then
                        -- keep everything, continue searching in the new part
                        SearchFrom = string.len(Chunk[First])+1
                        Chunk[First] = Chunk[First]..Chunk[Second]
                     else
                        -- hand out the unreturned rest of the first chunk
                        if string.len(Chunk[First]) >= GetFrom then
                           Delimiter,Collect = Next(coroutine.yield(
                              string.sub(Chunk[First],GetFrom),false))
                        end
                        First,Second = Second,First
                        SearchFrom,GetFrom = 1,1
                     end
                  end
               else
                  -- no delimiter found and no further input
                  break
               end
            end
         end

         if string.len(Chunk[First]) >= GetFrom then
            -- return rest of first chunk
            coroutine.yield(string.sub(Chunk[First],GetFrom),false)
         end
      end
   end)
   -- return (thread); a coroutine
end

-- MarkusHuber


RecentChanges · preferences
edit · history
Last edited May 28, 2007 10:01 pm GMT (diff)