Module:Deranize: Difference between revisions

From The Toaq Wiki
(reformat lookup table)
(Fix tone marks with implicit o'aomo for real)
 
(23 intermediate revisions by 3 users not shown)
Line 1: Line 1:
-- either mw.ustring.toNFD or mw.ustring.match doesn't work properly, so resorting to this for the time being (lol)
-- Converts Latin text to Derani.
local untone = {
-- Supports: syllable splitting, tone, prefixes, hiatus/diphthong marks, punctuation.
  ["a"] = {"a", 1},
-- For cartouches and null-raı, try: deranize("Ruaq sá [poq] sá* da.")
  ["u"] = {"u", 1},
 
  ["ı"] = {"ı", 1},
u = mw.ustring
  ["o"] = {"o", 1},
 
  ["e"] = {"e", 1},
function get_tone(word)
  ["á"] = {"a", 2},
   local nfd = u.toNFD(word)
   ["ú"] = {"u", 2},
   if u.find(nfd, "́") then return 2 end
   ["í"] = {"ı", 2},
   if u.find(nfd, "̈") then return 3 end
  ["ó"] = {"o", 2},
   if u.find(nfd, "̂") then return 4 end
   ["é"] = {"e", 2},
   return 1
  ["ä"] = {"a", 3},
end
   ["ü"] = {"u", 3},
 
  ["ï"] = {"ı", 3},
function strip_tone(word)
   ["ö"] = {"o", 3},
   return u.gsub(u.gsub(u.toNFD(word), "[́̈̂]", ""), "i", "ı")
  ["ë"] = {"e", 3},
end
  ["â"] = {"a", 4},
   ["û"] = {"u", 4},
  ["î"] = {"ı", 4},
  ["ô"] = {"o", 4},
  ["ê"] = {"e", 4},
}


local deranimap = {
local deranimap = {
   -- consonants
   -- consonants
   ["m"]  = "", -- DERANI LETTER MAMEI
   ["m"]  = "󱚰", -- DERANI LETTER MAMEI
   ["m_"] = "", -- DERANI LETTER MAMEI CODA
   ["m_"] = "󱚱", -- DERANI LETTER MAMEI CODA
   ["b"]  = "", -- DERANI LETTER BUBUE
   ["b"]  = "󱚲", -- DERANI LETTER BUBUE
   ["p"]  = "", -- DERANI LETTER PIPOQ
   ["p"]  = "󱚳", -- DERANI LETTER PIPOQ
   ["f"]  = "", -- DERANI LETTER FOFUAQ
   ["f"]  = "󱚴", -- DERANI LETTER FOFUAQ
   ["n"]  = "", -- DERANI LETTER NANAQ
   ["n"]  = "󱚵", -- DERANI LETTER NANAQ
   ["d"]  = "", -- DERANI LETTER DUDEO
   ["d"]  = "󱚶", -- DERANI LETTER DUDEO
   ["t"]  = "", -- DERANI LETTER TITIEQ
   ["t"]  = "󱚷", -- DERANI LETTER TITIEQ
   ["z"]  = "", -- DERANI LETTER ZOZEO
   ["z"]  = "󱚸", -- DERANI LETTER ZOZEO
   ["c"]  = "", -- DERANI LETTER CECOA
   ["c"]  = "󱚹", -- DERANI LETTER CECOA
   ["s"]  = "", -- DERANI LETTER SAQSEOQ
   ["s"]  = "󱚺", -- DERANI LETTER SAQSEOQ
   ["r"]  = "", -- DERANI LETTER RAIRUA
   ["r"]  = "󱚻", -- DERANI LETTER RAIRUA
   ["l"]  = "", -- DERANI LETTER LAOLIQ
   ["l"]  = "󱚼", -- DERANI LETTER LAOLIQ
   ["nh"] = "", -- DERANI LETTER NHANHOQ
   ["nh"] = "󱚽", -- DERANI LETTER NHANHOQ
   ["j"]  = "", -- DERANI LETTER JUJUO
   ["j"]  = "󱚾", -- DERANI LETTER JUJUO
   ["ch"] = "", -- DERANI LETTER CHICHAO
   ["ch"] = "󱚿", -- DERANI LETTER CHICHAO
   ["sh"] = "", -- DERANI LETTER SHOSHIA
   ["sh"] = "󱛀", -- DERANI LETTER SHOSHIA
   ["ꝡ"]  = "", -- DERANI LETTER VEVA
   ["ꝡ"]  = "󱛁", -- DERANI LETTER VEVA
   ["q_"] = "", -- DERANI LETTER AQ-AQ
   ["q_"] = "󱛂", -- DERANI LETTER AQ-AQ
   ["g"]  = "", -- DERANI LETTER GUGUI
   ["g"]  = "󱛃", -- DERANI LETTER GUGUI
   ["k"]  = "", -- DERANI LETTER KIKUE
   ["k"]  = "󱛄", -- DERANI LETTER KIKUE
   ["'"]  = "", -- DERANI LETTER O-AOMO
   ["'"]  = "󱛅", -- DERANI LETTER O-AOMO
   ["h"]  = "", -- DERANI LETTER HEHAQ
   ["h"]  = "󱛆", -- DERANI LETTER HEHAQ
   -- vowels
   -- vowels
   ["a"]  = "", -- DERANI LETTER SAQSEOQ
   ["a"]  = "󱚺", -- DERANI LETTER SAQSEOQ
   ["ı"]  = "", -- DERANI LETTER CECOA
   ["ı"]  = "󱚹", -- DERANI LETTER CECOA
   ["u"]  = "", -- DERANI LETTER BUBUE
   ["u"]  = "󱚲", -- DERANI LETTER BUBUE
   ["o"]  = "", -- DERANI LETTER GUGUI
   ["o"]  = "󱛃", -- DERANI LETTER GUGUI
   ["e"]  = "", -- DERANI LETTER FOFUAQ
   ["e"]  = "󱚴", -- DERANI LETTER FOFUAQ
   -- tone marks
   -- tone marks
   [2]    = "", -- DERANI COMBINING RISING TONE
  [1]    = "",
   [3]    = "", -- DERANI COMBINING LOW GLOTTAL TONE
   [2]    = "󱛊", -- DERANI COMBINING RISING TONE
   [4]    = "", -- DERANI COMBINING RISING-FALLING TONE
   [3]    = "󱛋", -- DERANI COMBINING LOW GLOTTAL TONE
   -- TODO: the rest of the owl
   [4]    = "󱛌", -- DERANI COMBINING RISING-FALLING TONE
   -- prefix
  ["-"] = "󱛒",
  ["hiatus"] = "󱛍",
  ["diphthong"] = "󱛎",
  [""] = "",
  -- diphthong reform
  ["aı"] = "󱚶", -- DERANI LETTER DUDEO
  ["ao"] = "󱚳", -- DERANI LETTER PIPOQ
  ["eı"] = "󱚸", -- DERANI LETTER ZOZEO
  ["oı"] = "󱚽", -- DERANI LETTER NHANHOQ
}
 
local derani_punctuation = {
  ["\""] = "󱛓",
  [":"] = "󱛓",
  [","] = " 󱛔",
  ["."] = " 󱛕",
  ["!"] = " 󱛖",
  ["?"] = " 󱛗",
  ["["] = "󱛘",
  ["]"] = "󱛙",
  ["*"] = " 󱛚",
  ["_"] = "󱛛", -- compatibility nbsp
}
}


-- {{#invoke:Deranize|deranize|arg}} (TODO: turn this into a family friendly template)
local vowel_lookalikes = {
function deranize(frame)
  ["s"] = true,
   assert(frame.args[1] ~= nil and frame.args[2] == nil, "This function requires exactly one argument")
   ["b"] = true,
  local normalized = mw.ustring.gsub(mw.ustring.lower(frame.args[1]), "i", "ı")
  ["c"] = true,
   mw.log(normalized)
  ["g"] = true,
  ["f"] = true,
}
 
local diphthong_lookalikes = {
  ["d"] = true,
  ["p"] = true,
  ["z"] = true,
   ["nh"] = true,
}
 
local reform = false
 
function deranize_word(word)
   local res = {}
   local res = {}
   local ix = 1
   local ix = 1
   local len = mw.ustring.len(normalized)
  local tone = get_tone(word)
   local len = mw.ustring.len(word)
   -- NB. this is not PCRE regex, see https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns
   -- NB. this is not PCRE regex, see https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns
  --              ↓ initpos (these ones eval to an index)                    precoda ↓        ↓ postcoda
   local toaqre = "()([mbpfndtzcsrljꝡgk']?h?)([auıoe])([̣]?)([́̈̂̀]?)([auıoe]?[auıoe]?)()([qm]?)([-·]?)()([mbpfndtzcsrljꝡgk']?h?)([auıoe]?)"
  --                ↓ initial              ↓ medial                ↓ final            ↓ coda  ↓ lingeringvowel
  local is_first_syllable = true
   local toaqre = "()([mbpfndtzcsrljꝡgk']?h?)([auıoeáúíóéäüïöëâûîôê])([auıoe]?[auıoe]?)()([qm]?)()([auıoe]?)"
   while ix <= len do
   while ix < len do
     local shouldbreak = false
     local shouldbreak = false
     local initpos, initial, medial, final, precoda, coda, postcoda, lingeringvowel = mw.ustring.match(normalized, toaqre, ix)
     local pos_init, initial, medial, underdot, diacritic, final, pos_precoda, coda, hyphen, pos_postcoda, next_initial, next_medial = mw.ustring.match(word, toaqre, ix)
     if initpos == nil then
     if pos_init == nil then
       shouldbreak = true
       break
      initpos = nil
     end
     end
    mw.log(table.concat({ix, initpos, initial, medial, final, precoda, coda, postcoda, lingeringvowel}, ", "))
     res[#res+1] = u.toNFC(u.sub(word, ix, pos_init - 1))
     res[#res+1] = mw.ustring.toNFC(mw.ustring.sub(normalized, ix, initpos - 1))
     if coda == "m" and next_initial == "" and next_medial ~= "" then
    if shouldbreak then break end
     if coda == "m" and lingervowel ~= "" then
       coda = ""
       coda = ""
       ix = precoda
      next_initial = "m"
       ix = pos_precoda
    else
      ix = pos_postcoda
    end
 
    local v = medial..final
    local is_diphthong = v == "aı" or v == "ao" or v == "eı" or v == "oı"
    if is_diphthong and reform then medial, final = v, "" end
    local glyphs = {initial, tone, medial}
    if initial == ""
      and is_first_syllable
      and (final == "" or (reform and is_diphthong))
      and coda == ""
      and (vowel_lookalikes[next_initial] or (reform and diphthong_lookalikes[next_initial])) then
      glyphs[1] = "'"
    end
    if not is_first_syllable then glyphs[2] = "" end
    if glyphs[1] == "" or glyphs[1] == "'" or glyphs[1] == "ꝡ" then
      -- Move tone onto the first vowel
      glyphs[2], glyphs[3] = glyphs[3], glyphs[2]
    end
    if is_diphthong and not reform then
      glyphs[#glyphs+1] = "diphthong"
    elseif final ~= "" then
      glyphs[#glyphs+1] = "hiatus"
    end
    if reform then
      glyphs[#glyphs+1] = final
     else
     else
       ix = postcoda
       for j, fin in ipairs(mw.text.split(final, "")) do
        if j > 1 then glyphs[#glyphs+1] = "diphthong" end
        glyphs[#glyphs+1] = fin
      end
     end
     end
    local nucleus, tone = unpack(untone[medial])
    local glyphs = {initial, nucleus, tone}
     if coda ~= "" then
     if coda ~= "" then
       glyphs[#glyphs+1] = coda .. "_"
       glyphs[#glyphs+1] = coda .. "_"
     end
     end
     for _, fin in ipairs(mw.text.split(final, "")) do
     if underdot ~= "" or hyphen ~= "" then
       glyphs[#glyphs+1] = fin
       glyphs[#glyphs+1] = "-"
     end
     end
     for _, glyph in ipairs(glyphs) do
     for _, glyph in ipairs(glyphs) do
       local mapped = deranimap[glyph]
       res[#res+1] = deranimap[glyph] or "(" .. glyph .. "?)"
      if mapped then
      res[#res+1] = mapped
      end
     end
     end
     -- TODO: actually implement derani hiatus symbol, o'aomo, etc. insertion logic here 🤪
     is_first_syllable = false
   end
   end
  res[#res+1] = u.toNFC(u.sub(word, ix, len))
   return table.concat(res)
   return table.concat(res)
end
end


return {deranize = deranize}
function deranize(frame)
  assert(frame.args[1] ~= nil, "This function requires at least one argument")
  reform = frame.args[2] ~= nil
  local text = u.gsub(u.lower(u.toNFD(frame.args[1])), "i", "ı")
  local converted = u.gsub(text, "(%S+)", deranize_word)
  for k, v in pairs(derani_punctuation) do
    converted = u.gsub(converted, "%" .. k, v)
  end
  return converted
end
 
function clean(frame)
  assert(frame.args[1] ~= nil and frame.args[2] == nil, "This function requires exactly one argument")
  local cleaned = u.gsub(frame.args[1], "[:%[%]*]", "")
  local nbsp = string.char(0xC2, 0xA0)
  cleaned = u.gsub(cleaned, "_", nbsp)
  return cleaned
end
 
return {deranize = deranize, clean = clean}

Latest revision as of 02:58, 25 September 2024

Use with Template:Deranize. {{deranize|mamıeq doe}} renders like so: 󱚰󱚺󱚰󱚹󱛍󱚴󱛂 󱚶󱛃󱛍󱚴 (mamıeq doe).


-- Converts Latin text to Derani.
-- Supports: syllable splitting, tone, prefixes, hiatus/diphthong marks, punctuation.
-- For cartouches and null-raı, try: deranize("Ruaq sá [poq] sá* da.")

-- Short alias for Scribunto's mw.ustring library; every function below goes
-- through it for normalisation, searching and substitution.
-- NOTE(review): assigned without `local`, so `u` is a global variable --
-- confirm nothing outside this module relies on it before making it local.
u = mw.ustring

--- Work out the tone number of a word from the diacritic it carries.
-- The word is NFD-decomposed first, so precomposed and decomposed input
-- are treated alike.
-- @param word string to inspect
-- @return 2 if a combining acute is present, otherwise 3 for a combining
--         diaeresis, otherwise 4 for a combining circumflex, otherwise 1
function get_tone(word)
  local decomposed = u.toNFD(word)
  local tone = 1
  -- Checked in this fixed order: the first mark found decides the tone.
  if u.find(decomposed, "́") then
    tone = 2
  elseif u.find(decomposed, "̈") then
    tone = 3
  elseif u.find(decomposed, "̂") then
    tone = 4
  end
  return tone
end

--- Remove tone diacritics from a word and use the dotless "ı".
-- The word is NFD-decomposed, the combining acute, diaeresis and circumflex
-- are deleted, and every "i" is replaced by "ı".
-- @param word string to clean
-- @return the cleaned string, followed by the substitution count reported by
--         the final gsub (the result of that call is returned as-is)
function strip_tone(word)
  local decomposed = u.toNFD(word)
  local toneless = u.gsub(decomposed, "[́̈̂]", "")
  return u.gsub(toneless, "i", "ı")
end

-- Glyph lookup used by deranize_word: maps each token it emits (a consonant,
-- a coda written with a trailing "_", a vowel, a numeric tone, or a named
-- mark) to the Derani string that is written to the output.
local deranimap = {
  -- Consonants. Keys that are not plain identifiers keep the bracket form.
  m         = "󱚰", -- DERANI LETTER MAMEI
  m_        = "󱚱", -- DERANI LETTER MAMEI CODA
  b         = "󱚲", -- DERANI LETTER BUBUE
  p         = "󱚳", -- DERANI LETTER PIPOQ
  f         = "󱚴", -- DERANI LETTER FOFUAQ
  n         = "󱚵", -- DERANI LETTER NANAQ
  d         = "󱚶", -- DERANI LETTER DUDEO
  t         = "󱚷", -- DERANI LETTER TITIEQ
  z         = "󱚸", -- DERANI LETTER ZOZEO
  c         = "󱚹", -- DERANI LETTER CECOA
  s         = "󱚺", -- DERANI LETTER SAQSEOQ
  r         = "󱚻", -- DERANI LETTER RAIRUA
  l         = "󱚼", -- DERANI LETTER LAOLIQ
  nh        = "󱚽", -- DERANI LETTER NHANHOQ
  j         = "󱚾", -- DERANI LETTER JUJUO
  ch        = "󱚿", -- DERANI LETTER CHICHAO
  sh        = "󱛀", -- DERANI LETTER SHOSHIA
  ["ꝡ"]     = "󱛁", -- DERANI LETTER VEVA
  q_        = "󱛂", -- DERANI LETTER AQ-AQ
  g         = "󱛃", -- DERANI LETTER GUGUI
  k         = "󱛄", -- DERANI LETTER KIKUE
  ["'"]     = "󱛅", -- DERANI LETTER O-AOMO
  h         = "󱛆", -- DERANI LETTER HEHAQ

  -- Vowels: each one reuses the glyph of a consonant listed above.
  a         = "󱚺", -- DERANI LETTER SAQSEOQ
  ["ı"]     = "󱚹", -- DERANI LETTER CECOA
  u         = "󱚲", -- DERANI LETTER BUBUE
  o         = "󱛃", -- DERANI LETTER GUGUI
  e         = "󱚴", -- DERANI LETTER FOFUAQ

  -- Tone marks, keyed by the number get_tone returns; tone 1 draws nothing.
  [1]       = "",
  [2]       = "󱛊", -- DERANI COMBINING RISING TONE
  [3]       = "󱛋", -- DERANI COMBINING LOW GLOTTAL TONE
  [4]       = "󱛌", -- DERANI COMBINING RISING-FALLING TONE

  -- Prefix marker, vowel-sequence marks, and the empty token.
  ["-"]     = "󱛒",
  hiatus    = "󱛍",
  diphthong = "󱛎",
  [""]      = "",

  -- Diphthong reform: a whole diphthong is written with one consonant glyph.
  ["aı"]    = "󱚶", -- DERANI LETTER DUDEO
  ["ao"]    = "󱚳", -- DERANI LETTER PIPOQ
  ["eı"]    = "󱚸", -- DERANI LETTER ZOZEO
  ["oı"]    = "󱚽", -- DERANI LETTER NHANHOQ
}

-- Latin punctuation character -> Derani replacement string.
-- deranize() walks this table with pairs() after the words have been
-- converted and substitutes every occurrence of each key (escaped with "%")
-- by its value. Several values begin with a space, so a space is emitted in
-- front of that mark. The double quote and the colon share one replacement.
local derani_punctuation = {
  ["\""] = "󱛓",
  [":"] = "󱛓",
  [","] = " 󱛔",
  ["."] = " 󱛕",
  ["!"] = " 󱛖",
  ["?"] = " 󱛗",
  ["["] = "󱛘",
  ["]"] = "󱛙",
  ["*"] = " 󱛚",
  ["_"] = "󱛛", -- compatibility nbsp
}

-- Set of consonants whose glyph in deranimap is also used for a vowel
-- (s/a, b/u, c/ı, g/o, f/e). deranize_word consults it when deciding whether
-- a word-initial bare vowel needs an explicit o'aomo.
local vowel_lookalikes = { s = true, b = true, c = true, g = true, f = true }

-- Set of consonants whose glyph in deranimap is also used for a reformed
-- diphthong (d/aı, p/ao, z/eı, nh/oı). Only consulted by deranize_word when
-- the diphthong reform is switched on.
local diphthong_lookalikes = { d = true, p = true, z = true, nh = true }

-- Module-level switch for the "diphthong reform" spelling. deranize()
-- overwrites it on every call (true when a second argument was supplied) and
-- deranize_word() reads it while building each syllable.
local reform = false

--- Convert one whitespace-free chunk of Latin text to Derani glyphs.
--
-- The chunk is scanned syllable by syllable with the pattern `toaqre`. Any
-- text the pattern skips over (punctuation, digits, ...) is copied to the
-- output unchanged apart from NFC normalisation, so deranize() can deal with
-- it afterwards. Reads the module-level `reform` flag.
--
-- @param word string; the pattern expects it lowercased, NFD-decomposed and
--        with "i" already written as "ı", which is how deranize() passes it
-- @return string in which each recognised syllable is replaced by its glyphs;
--         a token that has no entry in deranimap is rendered as "(token?)"
function deranize_word(word)
  local res = {}
  local ix = 1
  -- One tone for the whole chunk; it is only drawn on the first syllable.
  local tone = get_tone(word)
  local len = u.len(word)
  -- NB. this is not PCRE regex, see https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns
  -- Captures, in order: start position, initial consonant, first vowel,
  -- underdot, tone diacritic, up to two further vowels, position before the
  -- coda, coda (q/m), hyphen or middle dot, position after those, and then a
  -- lookahead at the next syllable's initial and first vowel.
  local toaqre = "()([mbpfndtzcsrljꝡgk']?h?)([auıoe])([̣]?)([́̈̂̀]?)([auıoe]?[auıoe]?)()([qm]?)([-·]?)()([mbpfndtzcsrljꝡgk']?h?)([auıoe]?)"
  local is_first_syllable = true
  while ix <= len do
    -- `_diacritic` is captured only to keep the positions aligned; the tone
    -- has already been determined for the whole word above.
    local pos_init, initial, medial, underdot, _diacritic, final, pos_precoda, coda, hyphen, pos_postcoda, next_initial, next_medial = u.match(word, toaqre, ix)
    if pos_init == nil then
      -- No further syllable; the remainder is appended after the loop.
      break
    end
    -- Copy through whatever lies between the cursor and this syllable.
    res[#res+1] = u.toNFC(u.sub(word, ix, pos_init - 1))
    if coda == "m" and next_initial == "" and next_medial ~= "" then
      -- An "m" directly followed by a vowel is the next syllable's initial,
      -- not this syllable's coda: hand it back and resume scanning at it.
      coda = ""
      next_initial = "m"
      ix = pos_precoda
    else
      ix = pos_postcoda
    end

    local v = medial..final
    local is_diphthong = v == "aı" or v == "ao" or v == "eı" or v == "oı"
    -- Under the reform a diphthong is a single token with its own glyph.
    if is_diphthong and reform then medial, final = v, "" end
    local glyphs = {initial, tone, medial}
    -- A word-initial bare vowel (or reformed diphthong) without coda gets an
    -- explicit o'aomo when the following consonant shares its glyph with a
    -- vowel (or, under the reform, with a diphthong).
    if initial == ""
      and is_first_syllable
      and (final == "" or (reform and is_diphthong))
      and coda == ""
      and (vowel_lookalikes[next_initial] or (reform and diphthong_lookalikes[next_initial])) then
      glyphs[1] = "'"
    end
    -- Later syllables carry no tone mark.
    if not is_first_syllable then glyphs[2] = "" end
    if glyphs[1] == "" or glyphs[1] == "'" or glyphs[1] == "ꝡ" then
      -- Move tone onto the first vowel
      glyphs[2], glyphs[3] = glyphs[3], glyphs[2]
    end
    -- Mark how the vowels after the first one attach to it.
    if is_diphthong and not reform then
      glyphs[#glyphs+1] = "diphthong"
    elseif final ~= "" then
      glyphs[#glyphs+1] = "hiatus"
    end
    if reform then
      -- The remaining vowels are looked up as one token.
      glyphs[#glyphs+1] = final
    else
      -- One token per remaining vowel, with a diphthong mark between them.
      for j, fin in ipairs(mw.text.split(final, "")) do
        if j > 1 then glyphs[#glyphs+1] = "diphthong" end
        glyphs[#glyphs+1] = fin
      end
    end
    if coda ~= "" then
      -- Codas have their own entries, keyed with a trailing underscore.
      glyphs[#glyphs+1] = coda .. "_"
    end
    -- An underdot on the vowel, or a trailing hyphen / middle dot, adds the
    -- prefix glyph.
    if underdot ~= "" or hyphen ~= "" then
      glyphs[#glyphs+1] = "-"
    end
    for _, glyph in ipairs(glyphs) do
      -- Unknown tokens stay visible in the output instead of vanishing.
      res[#res+1] = deranimap[glyph] or "(" .. glyph .. "?)"
    end
    is_first_syllable = false
  end
  -- Whatever follows the last syllable is passed through as well.
  res[#res+1] = u.toNFC(u.sub(word, ix, len))
  return table.concat(res)
end

--- Entry point: convert Latin text to Derani.
-- The text is NFD-decomposed, lowercased and has "i" replaced by "ı"; every
-- run of non-whitespace is then handed to deranize_word, and finally each
-- punctuation character listed in derani_punctuation is substituted.
-- @param frame frame object; frame.args[1] is the text to convert, and the
--        mere presence of frame.args[2] switches on the diphthong reform
-- @return the converted string
-- Raises an assertion error when frame.args[1] is missing.
function deranize(frame)
  local args = frame.args
  assert(args[1] ~= nil, "This function requires at least one argument")
  -- Stored in the module-level flag that deranize_word reads.
  reform = args[2] ~= nil

  local normalized = u.toNFD(args[1])
  normalized = u.lower(normalized)
  normalized = u.gsub(normalized, "i", "ı")

  local output = u.gsub(normalized, "(%S+)", deranize_word)
  for mark, glyph in pairs(derani_punctuation) do
    -- "%" escapes the punctuation character so it is matched literally.
    output = u.gsub(output, "%" .. mark, glyph)
  end
  return output
end

--- Strip the Derani-only markup characters from Latin text.
-- Deletes every ":", "[", "]" and "*", then turns each "_" into a
-- non-breaking space (U+00A0, built from its two UTF-8 bytes).
-- @param frame frame object; frame.args[1] is the text, and frame.args[2]
--        must be absent
-- @return the cleaned string
-- Raises an assertion error unless exactly one argument was passed.
function clean(frame)
  local args = frame.args
  assert(args[1] ~= nil and args[2] == nil, "This function requires exactly one argument")
  local non_breaking_space = string.char(0xC2, 0xA0)
  local stripped = u.gsub(args[1], "[:%[%]*]", "")
  stripped = u.gsub(stripped, "_", non_breaking_space)
  return stripped
end

-- Module export table: only `deranize` and `clean` are exposed to callers of
-- this module (e.g. via {{#invoke:Deranize|deranize|...}}).
return {deranize = deranize, clean = clean}