Scite Using Unicode

lua-users home
wiki

The following are examples of writing and reading Unicode characters in a buffer, specifically dealing with UTF-8. If you can't see the Unicode characters in the first code snippet, try viewing using the UTF-8 character set.


Write Some Unicode Characters

-- -*- coding: utf-8 -*-
-- write some UTF-8 chars <khman@users.sf.net> 20061017 public domain
-- (see Markus Kuhn's UTF-8 and Unicode FAQ or RFC3629 for more info)
--
-- Opens a new buffer, switches it to the UTF-8 code page and appends the
-- same two sample strings twice: first as raw UTF-8 text typed directly in
-- this source file, then as decimal byte escapes. Both pairs of lines
-- should render identically. Takes no arguments and returns nothing.
function UnicodeWriteSomething()
  -- áéïöü C3 A1 C3 A9 C3 AF C3 B6 C3 BC
  -- 中文 E4 B8 AD E6 96 87
  -- open a new buffer and set encoding as UTF-8
  scite.Open("")
  editor.CodePage = SC_CP_UTF8
  -- string is in UTF-8 (this source file itself must be saved as UTF-8)
  editor:AppendText("áéïöü\n")
  editor:AppendText("中文\n")
  -- string is encoded as escaped sequences (the same bytes as above)
  editor:AppendText("\195\161\195\169\195\175\195\182\195\188\n")
  editor:AppendText("\228\184\173\230\150\135\n")
end


Read And Write Unicode Values

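The following functions help with reading and writing UTF-8 characters in a buffer. FromUTF8() accepts character sequences of up to 6 bytes to cover the UCS-4 range; ToUTF8() can build sequences of up to 6 bytes as well, but as written it rejects code values above U+FFFF (see the comment in the function).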

-- -*- coding: utf-8 -*-
-- return value of UTF-8 character <khman@users.sf.net> 20061017 public domain
-- (see Markus Kuhn's UTF-8 and Unicode FAQ or RFC3629 for more info)
--
-- Decodes the UTF-8 sequence whose lead byte is at offset `pos`, reading
-- bytes through editor.CharAt.
--   pos: offset of the lead byte of the sequence
-- Returns three values: the decoded code value, the offset of the last
-- byte of the sequence, and the sequence length in bytes (1 to 6).
-- Raises an error when the lead byte is in 0x80-0xBF or 0xFE-0xFF, or when
-- a following byte is outside 0x80-0xBF.
function FromUTF8(pos)
  -- editor.CharAt can hand back negative values (presumably signed chars);
  -- shift those up by 256 so every byte is in 0..255
  local function charat(p)
    local byte = editor.CharAt[p]
    if byte < 0 then byte = byte + 256 end
    return byte
  end
  local v, c, n = 0, charat(pos), 1
  -- The `%` operator is used instead of math.mod, which is not available
  -- in current Lua versions; operands here are never negative, so the
  -- results are the same.
  if c < 128 then v = c
  elseif c < 192 then
    error("Byte values between 0x80 to 0xBF cannot start a multibyte sequence")
  elseif c < 224 then v = c % 32; n = 2
  elseif c < 240 then v = c % 16; n = 3
  elseif c < 248 then v = c %  8; n = 4
  elseif c < 252 then v = c %  4; n = 5
  elseif c < 254 then v = c %  2; n = 6
  else
    error("Byte values between 0xFE and 0xFF cannot start a multibyte sequence")
  end
  -- each following byte contributes its low 6 bits
  for _ = 2, n do
    pos = pos + 1; c = charat(pos)
    if c < 128 or c > 191 then
      error("Following bytes must have values between 0x80 and 0xBF")
    end
    v = v * 64 + c % 64
  end
  return v, pos, n
end

-- return UTF-8 sequence string <khman@users.sf.net> 20061017 public domain
-- (see Markus Kuhn's UTF-8 and Unicode FAQ or RFC3629 for more info)
--
-- Encodes the code value `v` as a UTF-8 byte string.
--   v: non-negative integer code value
-- Returns two values: the encoded string and its length in bytes.
-- Raises an error when v is not a non-negative integer, is above U+FFFF,
-- or lies in the surrogate range U+D800-U+DFFF.
-- While the U+FFFF guard below is in place, the 4-, 5- and 6-byte branches
-- cannot be reached; they take effect only once that guard is removed.
function ToUTF8(v)
  local floor, char = math.floor, string.char
  local n, s, b = 1, "", 0
  if type(v) ~= "number" or v < 0 or v ~= floor(v) then
    error("code value must be a non-negative integer")
  end
  -- delete this if your version of SciTE goes beyond UCS-2
  if v > 65535 then error("SciTE does not support codes above U+FFFF") end
  if v >= 55296 and v <= 57343 then
    error("failed to convert UTF-16 surrogate pairs to UTF-8")
  end
  -- n is the sequence length, b the marker bits of the lead byte
  if    v >= 67108864 then n = 6; b = 252
  elseif v >= 2097152 then n = 5; b = 248
  elseif v >=   65536 then n = 4; b = 240
  elseif v >=    2048 then n = 3; b = 224
  elseif v >=     128 then n = 2; b = 192
  end
  -- Peel off 6 bits at a time, building the string from the last byte
  -- backwards. `%` replaces math.mod, which is not available in current
  -- Lua versions.
  for _ = 2, n do
    local c = v % 64; v = floor(v / 64)
    s = char(c + 128)..s
  end
  -- whatever is left of v goes into the lead byte
  s = char(v + b)..s
  return s, n
end

-- FromUTF8() demo: decode the character that starts at the current caret
-- position (editor.CurrentPos) and print its code value to the output
-- window.
function Demo_FromUTF8()
  -- only the first result (the code value) is wanted here
  local code = FromUTF8(editor.CurrentPos)
  print("Character code: "..code)
end

-- demonstrate use of ToUTF8() function: append two characters, U+4E2D and
-- U+6587, to the current buffer, each built from its Unicode value
function Demo_ToUTF8()
  -- The hex strings carry no "0x" prefix: with an explicit base, tonumber()
  -- on Lua 5.2 and later rejects the prefix and returns nil.
  -- The extra parentheses keep only the string from ToUTF8(), so its
  -- second result (the byte count) is not passed on to AppendText.
  editor:AppendText((ToUTF8(tonumber("4E2D", 16))))
  editor:AppendText((ToUTF8(tonumber("6587", 16))))
end


Display Character Codes from U+0000 to U+FFFF

The following demo function displays a table of Unicode characters. It requires the ToUTF8() function from above.

-- -*- coding: utf-8 -*-
-- write out a UTF-16 table <khman@users.sf.net> 20061017 public domain
--
-- Opens a new buffer in UTF-8 mode and appends a table of the code values
-- 0 to 65535, 32 characters per row. Each row starts with the decimal and
-- hexadecimal value of its first character. Values below 32 and the
-- surrogate range (55296-57343) are shown as "?". Requires ToUTF8().
-- Takes no arguments and returns nothing.
function UTF16Table()
  scite.Open("")
  editor.CodePage = SC_CP_UTF8
  editor:AppendText("-*- coding: utf-8 -*-\n")
  editor:AppendText("   Dec ( Hex ) : 0123456789ABCDEF0123456789ABCDEF\n")
  editor:AppendText("-------------------------------------------------\n")
  for p = 0, 65535, 32 do
    -- collect the row's cells locally and join once, instead of growing a
    -- (previously global) string piece by piece
    local cells = {}
    for q = p, p + 31 do
      local cell
      if q < 32 or (q >= 55296 and q <= 57343) then
        cell = "?"
      else
        -- parentheses keep only the string result of ToUTF8()
        cell = (ToUTF8(q))
      end
      cells[q - p + 1] = cell
    end
    -- %04X zero-pads the hex value so the row label reads 0x0020, not 0x  20
    local label = string.format("%6d (0x%04X): ", p, p)
    editor:AppendText(label..table.concat(cells).."\n")
  end
end

--KeinHongMan


RecentChanges · preferences
edit · history
Last edited October 17, 2006 6:26 pm GMT (diff)