Read Defined Chunks

lua-users home
wiki

Showing revision 13
With this function you can read defined chunks (until a given delimiter is found) from a file or from io.stdin. It's a complete rewrite of the first version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible. The purpose is to process a) files with tons of megabytes and b) mixed-format input, e.g. MIME multipart messages, which are a mix of \r\n-terminated lines and binary data. Please note a simple enhancement that is not standard Lua: I use the number variable lua.maxread to set the chunk size used for the reads (chunks of bytes) at a central point. Replace this variable by your preferred chunk size, e.g. 2^13 for 8KB.


-- a simple example without using all the specials

   -- Read 'File' line by line: every resume hands back the text in front of the next '\n'.
   local Handle=assert(io.open('File','r'))
   local ReadUntil=io.readuntil(Handle)
   local Ok,Line
   repeat
      -- coroutine.resume returns the status first; the yielded chunk is the second value
      Ok,Line=coroutine.resume(ReadUntil,'\n',true)
      if not Ok then
         -- on failure the second value is the error message
         Handle:close()
         error(Line)
      end
      if Line then
         -- process Line here
      end
   until Line==nil
   Handle:close()

-- another example

   -- Skip everything up to the first search string, then collect the text up to the second one.
   local Handle=assert(io.open('File','r'))
   local ReadUntil=io.readuntil(Handle)
   local Ok,Chunk,Found
   repeat
      -- uncollected read: the chunks in front of the first search string are simply dropped
      Ok,Chunk,Found=coroutine.resume(ReadUntil,'search this string in a huge file',false)
      if Ok and Found then
         -- collected read: Chunk becomes everything up to the second search string
         Ok,Chunk,Found=coroutine.resume(ReadUntil,'search another string in the same file',true)
         if Ok and Found then break end
      end
      if not Ok then
         -- on failure the second value returned by coroutine.resume is the error message
         Handle:close()
         error(Chunk)
      end
   until Chunk==nil
   Handle:close()

-- now if Chunk~=nil then Chunk is the stuff between 'search this string in a huge file' and 'search another string in the same file'
-- yes, it's possible to do the same in a much simpler way, but the advantage here is that the large file isn't loaded into memory at once




   function io.readuntil(Filehandle,Delimiter,Collect,Limit)

-- Creates a coroutine that reads <filehandle> chunk by chunk and yields the data found in front of each <delimiter>.
-- The chunk size is lua.maxread if the global table lua exists and has that field, otherwise 8192 bytes.

-- Filehandle (userdata); any value offering Filehandle:read(<number of bytes>) works
-- Delimiter (string, optional); must not be empty; max. length is the chunk size; optional because coroutine.resume() also accepts <delimiter>
-- Collect (boolean, optional) = true (default); read until <delimiter> is found or end of file or <limit> is reached and return string at once
--                             = false; return string also before <delimiter> is found or end of file or <limit> is reached
-- Limit (number, optional); number of bytes to read from <filehandle>; default is unlimited
-- <delimiter> and <collect> may be omitted, in that case the following arguments move up

-- <boolean>,<string>,<boolean>=coroutine.resume(Function,Delimiter,Collect)

-- Function (thread); returned from io.readuntil()
-- Delimiter (string, optional); see io.readuntil(); if omitted the previous delimiter stays in effect
-- Collect (boolean, optional); see io.readuntil(); if omitted the previous setting stays in effect

-- return (boolean); = true; no error
--                   = false; an error occurred and the second value returned is the error message
--        (string or nil) = nil; end of file
--        (boolean) = true; delimiter found
--                  = false; delimiter not found

-- note: if the coroutine returns true,<string>,false then
--          if <collect> = false it does not have to be the end of file
--                       = true  the end of file (or <limit>) is reached and the next coroutine.resume returns only true

      if type(Delimiter)=='boolean' then
         Collect,Delimiter = Delimiter,Collect
      end
      if type(Delimiter)=='number' then
         Limit,Delimiter = Delimiter,nil
      end
      if type(Collect)=='number' then
         Limit,Collect = Collect,nil
      end
      if Collect==nil then
--       documented default
         Collect=true
      end

      return coroutine.create(function(NewDelimiter,NewCollect)

--       merges the arguments of a coroutine.resume() with the settings currently in effect
         local function Next(NewDelimiter,NewCollect)
            if type(NewDelimiter)=='boolean' then
               NewCollect,NewDelimiter = NewDelimiter,nil
            end
            if NewDelimiter==nil then
               NewDelimiter=Delimiter
            end
--          explicit nil test, so that false is able to replace a previous true
            if NewCollect==nil then
               NewCollect=Collect
            end
            return NewDelimiter,NewCollect
         end

         Delimiter,Collect = Next(NewDelimiter,NewCollect)

         local Chunksize = type(lua)=='table' and lua.maxread or 8192
         local Chunk,Length,First,Second,SearchFrom,GetFrom = {},0,1,2,1,1
         local FoundFrom,FoundTo

--       reads the next chunk and honours <limit>; returns nil or false if there is no further input
         local function Read()
            local Data
            if Limit and Length+Chunksize>Limit then
               Data=Limit-Length>0 and Filehandle:read(Limit-Length)
            else
               Data=Filehandle:read(Chunksize)
            end
            if Data then
               Length=Length+string.len(Data)
            end
            return Data
         end

         Chunk[First]=Read()

         if Chunk[First] then

            while true do

--             the delimiter may change with every resume, therefore it is checked in every pass
               if Delimiter==nil then
                  error('io.readuntil: delimiter missing')
               end
               local DelimiterLength=string.len(Delimiter)
               if DelimiterLength==0 then
--                an empty delimiter would be found forever at the same position
                  error('io.readuntil: delimiter is empty')
               end
               if DelimiterLength>Chunksize then
                  error('io.readuntil: delimiter to long')
               end

               FoundFrom,FoundTo = string.find(Chunk[First],Delimiter,SearchFrom,true)
               if FoundFrom then
--                delimiter found in first chunk
                  Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom,FoundFrom-1),true))
                  SearchFrom,GetFrom = FoundTo+1,FoundTo+1
               else

                  Chunk[Second]=Read()

                  if not Chunk[Second] then
--                   no delimiter found and no further input
                     break
                  end

                  local FirstLength=string.len(Chunk[First])

--                a split delimiter can start at most <length of delimiter>-1 bytes in front of the end of the first chunk,
--                but never in front of <SearchFrom>: those bytes are already searched or belong to a consumed delimiter
                  local TailFrom=FirstLength-DelimiterLength+2
                  if TailFrom<SearchFrom then
                     TailFrom=SearchFrom
                  end

--                concatenate end of first chunk with start of second chunk so that a possibly split delimiter is found
                  FoundFrom = string.find(string.sub(Chunk[First],TailFrom)..string.sub(Chunk[Second],1,DelimiterLength-1),Delimiter,1,true)
                  if FoundFrom then
--                   delimiter is split between first and second chunk
                     local DelimiterFrom=TailFrom+FoundFrom-1
--                   position in the second chunk behind the part of the delimiter located there
                     local Continue=DelimiterLength-(FirstLength-DelimiterFrom+1)+1
                     Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom,DelimiterFrom-1),true))
                     First,Second = Second,First
                     SearchFrom,GetFrom = Continue,Continue
                  elseif Collect then
--                   delimiter isn't split; append the second chunk and continue the search at its start
                     SearchFrom=FirstLength+1
                     Chunk[First]=Chunk[First]..Chunk[Second]
                  else
--                   delimiter isn't split; hand out the rest of the first chunk and continue with the second one
                     if FirstLength>=GetFrom then
                        Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom),false))
                     end
                     First,Second = Second,First
                     SearchFrom,GetFrom = 1,1
                  end
               end
            end

            if string.len(Chunk[First])>=GetFrom then
--             return rest of first chunk
               coroutine.yield(string.sub(Chunk[First],GetFrom),false)
            end

         end

      end)

-- return (thread); a coroutine

   end

RecentChanges · preferences
edit · history · current revision
Edited August 7, 2004 12:04 am GMT (diff)