Module:Deranize: Difference between revisions
m (untone → detone) |
(Fix tone marks with implicit o'aomo for real) |
||
(21 intermediate revisions by 3 users not shown) | |||
Line 1: | Line 1: | ||
-- | -- Converts Latin text to Derani. | ||
-- Supports: syllable splitting, tone, prefixes, hiatus/diphthong marks, punctuation. | |||
-- For cartouches and null-raı, try: deranize("Ruaq sá [poq] sá* da.") | |||
u = mw.ustring | |||
function get_tone(word) | |||
local nfd = u.toNFD(word) | |||
if u.find(nfd, "́") then return 2 end | |||
if u.find(nfd, "̈") then return 3 end | |||
if u.find(nfd, "̂") then return 4 end | |||
return 1 | |||
end | |||
function strip_tone(word) | |||
return u.gsub(u.gsub(u.toNFD(word), "[́̈̂]", ""), "i", "ı") | |||
end | |||
local deranimap = { | local deranimap = { | ||
-- consonants | -- consonants | ||
["m"] = " | ["m"] = "", -- DERANI LETTER MAMEI | ||
["m_"] = " | ["m_"] = "", -- DERANI LETTER MAMEI CODA | ||
["b"] = " | ["b"] = "", -- DERANI LETTER BUBUE | ||
["p"] = " | ["p"] = "", -- DERANI LETTER PIPOQ | ||
["f"] = " | ["f"] = "", -- DERANI LETTER FOFUAQ | ||
["n"] = " | ["n"] = "", -- DERANI LETTER NANAQ | ||
["d"] = " | ["d"] = "", -- DERANI LETTER DUDEO | ||
["t"] = " | ["t"] = "", -- DERANI LETTER TITIEQ | ||
["z"] = " | ["z"] = "", -- DERANI LETTER ZOZEO | ||
["c"] = " | ["c"] = "", -- DERANI LETTER CECOA | ||
["s"] = " | ["s"] = "", -- DERANI LETTER SAQSEOQ | ||
["r"] = " | ["r"] = "", -- DERANI LETTER RAIRUA | ||
["l"] = " | ["l"] = "", -- DERANI LETTER LAOLIQ | ||
["nh"] = " | ["nh"] = "", -- DERANI LETTER NHANHOQ | ||
["j"] = " | ["j"] = "", -- DERANI LETTER JUJUO | ||
["ch"] = " | ["ch"] = "", -- DERANI LETTER CHICHAO | ||
["sh"] = " | ["sh"] = "", -- DERANI LETTER SHOSHIA | ||
["ꝡ"] = " | ["ꝡ"] = "", -- DERANI LETTER VEVA | ||
["q_"] = " | ["q_"] = "", -- DERANI LETTER AQ-AQ | ||
["g"] = " | ["g"] = "", -- DERANI LETTER GUGUI | ||
["k"] = " | ["k"] = "", -- DERANI LETTER KIKUE | ||
["'"] = " | ["'"] = "", -- DERANI LETTER O-AOMO | ||
["h"] = " | ["h"] = "", -- DERANI LETTER HEHAQ | ||
-- vowels | -- vowels | ||
["a"] = " | ["a"] = "", -- DERANI LETTER SAQSEOQ | ||
["ı"] = " | ["ı"] = "", -- DERANI LETTER CECOA | ||
["u"] = " | ["u"] = "", -- DERANI LETTER BUBUE | ||
["o"] = " | ["o"] = "", -- DERANI LETTER GUGUI | ||
["e"] = " | ["e"] = "", -- DERANI LETTER FOFUAQ | ||
-- tone marks | -- tone marks | ||
[2] = " | [1] = "", | ||
[3] = " | [2] = "", -- DERANI COMBINING RISING TONE | ||
[4] = " | [3] = "", -- DERANI COMBINING LOW GLOTTAL TONE | ||
-- | [4] = "", -- DERANI COMBINING RISING-FALLING TONE | ||
-- prefix | |||
["-"] = "", | |||
["hiatus"] = "", | |||
["diphthong"] = "", | |||
[""] = "", | |||
-- diphthong reform | |||
["aı"] = "", -- DERANI LETTER DUDEO | |||
["ao"] = "", -- DERANI LETTER PIPOQ | |||
["eı"] = "", -- DERANI LETTER ZOZEO | |||
["oı"] = "", -- DERANI LETTER NHANHOQ | |||
} | |||
local derani_punctuation = { | |||
["\""] = "", | |||
[":"] = "", | |||
[","] = " ", | |||
["."] = " ", | |||
["!"] = " ", | |||
["?"] = " ", | |||
["["] = "", | |||
["]"] = "", | |||
["*"] = " ", | |||
["_"] = "", -- compatibility nbsp | |||
} | } | ||
local vowel_lookalikes = { | |||
["s"] = true, | |||
["b"] = true, | |||
["c"] = true, | |||
["g"] = true, | |||
["f"] = true, | |||
} | |||
local diphthong_lookalikes = { | |||
["d"] = true, | |||
["p"] = true, | |||
["z"] = true, | |||
["nh"] = true, | |||
} | |||
local reform = false | |||
function deranize_word(word) | |||
local res = {} | local res = {} | ||
local ix = 1 | local ix = 1 | ||
local len = mw.ustring.len( | local tone = get_tone(word) | ||
local len = mw.ustring.len(word) | |||
-- NB. this is not PCRE regex, see https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns | -- NB. this is not PCRE regex, see https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns | ||
local toaqre = "()([mbpfndtzcsrljꝡgk']?h?)([auıoe])([̣]?)([́̈̂̀]?)([auıoe]?[auıoe]?)()([qm]?)([-·]?)()([mbpfndtzcsrljꝡgk']?h?)([auıoe]?)" | |||
local is_first_syllable = true | |||
local toaqre = "()([mbpfndtzcsrljꝡgk']?h?)([ | while ix <= len do | ||
while ix < len do | |||
local shouldbreak = false | local shouldbreak = false | ||
local | local pos_init, initial, medial, underdot, diacritic, final, pos_precoda, coda, hyphen, pos_postcoda, next_initial, next_medial = mw.ustring.match(word, toaqre, ix) | ||
if | if pos_init == nil then | ||
break | |||
end | end | ||
res[#res+1] = u.toNFC(u.sub(word, ix, pos_init - 1)) | |||
res[#res+1] = | if coda == "m" and next_initial == "" and next_medial ~= "" then | ||
if coda == "m" and | |||
coda = "" | coda = "" | ||
ix = | next_initial = "m" | ||
ix = pos_precoda | |||
else | |||
ix = pos_postcoda | |||
end | |||
local v = medial..final | |||
local is_diphthong = v == "aı" or v == "ao" or v == "eı" or v == "oı" | |||
if is_diphthong and reform then medial, final = v, "" end | |||
local glyphs = {initial, tone, medial} | |||
if initial == "" | |||
and is_first_syllable | |||
and (final == "" or (reform and is_diphthong)) | |||
and coda == "" | |||
and (vowel_lookalikes[next_initial] or (reform and diphthong_lookalikes[next_initial])) then | |||
glyphs[1] = "'" | |||
end | |||
if not is_first_syllable then glyphs[2] = "" end | |||
if glyphs[1] == "" or glyphs[1] == "'" or glyphs[1] == "ꝡ" then | |||
-- Move tone onto the first vowel | |||
glyphs[2], glyphs[3] = glyphs[3], glyphs[2] | |||
end | |||
if is_diphthong and not reform then | |||
glyphs[#glyphs+1] = "diphthong" | |||
elseif final ~= "" then | |||
glyphs[#glyphs+1] = "hiatus" | |||
end | |||
if reform then | |||
glyphs[#glyphs+1] = final | |||
else | else | ||
for j, fin in ipairs(mw.text.split(final, "")) do | |||
if j > 1 then glyphs[#glyphs+1] = "diphthong" end | |||
glyphs[#glyphs+1] = fin | |||
end | |||
end | end | ||
if coda ~= "" then | if coda ~= "" then | ||
glyphs[#glyphs+1] = coda .. "_" | glyphs[#glyphs+1] = coda .. "_" | ||
end | end | ||
if underdot ~= "" or hyphen ~= "" then | |||
glyphs[#glyphs+1] = | glyphs[#glyphs+1] = "-" | ||
end | end | ||
for _, glyph in ipairs(glyphs) do | for _, glyph in ipairs(glyphs) do | ||
res[#res+1] = deranimap[glyph] or "(" .. glyph .. "?)" | |||
end | end | ||
is_first_syllable = false | |||
end | end | ||
res[#res+1] = u.toNFC(u.sub(word, ix, len)) | |||
return table.concat(res) | return table.concat(res) | ||
end | end | ||
return {deranize = deranize} | function deranize(frame) | ||
assert(frame.args[1] ~= nil, "This function requires at least one argument") | |||
reform = frame.args[2] ~= nil | |||
local text = u.gsub(u.lower(u.toNFD(frame.args[1])), "i", "ı") | |||
local converted = u.gsub(text, "(%S+)", deranize_word) | |||
for k, v in pairs(derani_punctuation) do | |||
converted = u.gsub(converted, "%" .. k, v) | |||
end | |||
return converted | |||
end | |||
function clean(frame) | |||
assert(frame.args[1] ~= nil and frame.args[2] == nil, "This function requires exactly one argument") | |||
local cleaned = u.gsub(frame.args[1], "[:%[%]*]", "") | |||
local nbsp = string.char(0xC2, 0xA0) | |||
cleaned = u.gsub(cleaned, "_", nbsp) | |||
return cleaned | |||
end | |||
return {deranize = deranize, clean = clean} |
Latest revision as of 02:58, 25 September 2024
Use with Template:Deranize. {{deranize|mamıeq doe}}
renders like so: (mamıeq doe).
-- Converts Latin text to Derani.
-- Supports: syllable splitting, tone, prefixes, hiatus/diphthong marks, punctuation.
-- For cartouches and null-raı, try: deranize("Ruaq sá [poq] sá* da.")
-- Shorthand for Scribunto's Unicode-aware string library. Declared local so
-- the module does not leak a one-letter global into the shared environment.
local u = mw.ustring
--- Work out the tone number of a word from the combining marks it carries.
-- The word is decomposed to NFD first so that precomposed letters expose
-- their marks. Marks are tested in a fixed order, so a word carrying several
-- of them reports the first one in the list.
-- @param word text to inspect
-- @return 2 for an acute, 3 for a diaeresis, 4 for a circumflex, else 1
function get_tone(word)
    local decomposed = u.toNFD(word)
    -- index in this list + 1 is the tone number
    local marks = { "́", "̈", "̂" }
    for index, mark in ipairs(marks) do
        if u.find(decomposed, mark) then
            return index + 1
        end
    end
    return 1
end
--- Remove tone marks from a word and replace dotted "i" with dotless "ı".
-- The word is decomposed to NFD so the combining acute, diaeresis and
-- circumflex can be deleted on their own.
-- @param word text to clean
-- @return the cleaned text, followed by the replacement count of the final
--         "i" substitution (the gsub results are returned unchanged)
function strip_tone(word)
    local decomposed = u.toNFD(word)
    local toneless = u.gsub(decomposed, "[́̈̂]", "")
    return u.gsub(toneless, "i", "ı")
end
-- Lookup table from romanized units to the strings emitted by deranize_word.
-- Keys come in several sorts: onset consonants (including the digraphs "nh",
-- "ch", "sh"), coda consonants written with a trailing "_" ("m_", "q_"),
-- vowels, the integers 1-4 used as tone numbers, a few named marks, and the
-- two-vowel keys used when the diphthong reform is switched on.
-- NOTE(review): every value shows up as an empty string in this copy;
-- presumably they are private-use code points of a Derani font that were
-- lost in transit. Confirm against the live module before editing values.
local deranimap = {
-- consonants
["m"] = "", -- DERANI LETTER MAMEI
["m_"] = "", -- DERANI LETTER MAMEI CODA
["b"] = "", -- DERANI LETTER BUBUE
["p"] = "", -- DERANI LETTER PIPOQ
["f"] = "", -- DERANI LETTER FOFUAQ
["n"] = "", -- DERANI LETTER NANAQ
["d"] = "", -- DERANI LETTER DUDEO
["t"] = "", -- DERANI LETTER TITIEQ
["z"] = "", -- DERANI LETTER ZOZEO
["c"] = "", -- DERANI LETTER CECOA
["s"] = "", -- DERANI LETTER SAQSEOQ
["r"] = "", -- DERANI LETTER RAIRUA
["l"] = "", -- DERANI LETTER LAOLIQ
["nh"] = "", -- DERANI LETTER NHANHOQ
["j"] = "", -- DERANI LETTER JUJUO
["ch"] = "", -- DERANI LETTER CHICHAO
["sh"] = "", -- DERANI LETTER SHOSHIA
["ꝡ"] = "", -- DERANI LETTER VEVA
["q_"] = "", -- DERANI LETTER AQ-AQ
["g"] = "", -- DERANI LETTER GUGUI
["k"] = "", -- DERANI LETTER KIKUE
["'"] = "", -- DERANI LETTER O-AOMO
["h"] = "", -- DERANI LETTER HEHAQ
-- vowels
-- The letter names below repeat consonant names (a/s, ı/c, u/b, o/g, e/f);
-- the same five consonants are listed in vowel_lookalikes.
["a"] = "", -- DERANI LETTER SAQSEOQ
["ı"] = "", -- DERANI LETTER CECOA
["u"] = "", -- DERANI LETTER BUBUE
["o"] = "", -- DERANI LETTER GUGUI
["e"] = "", -- DERANI LETTER FOFUAQ
-- tone marks
-- Integer keys match the numbers returned by get_tone.
[1] = "",
[2] = "", -- DERANI COMBINING RISING TONE
[3] = "", -- DERANI COMBINING LOW GLOTTAL TONE
[4] = "", -- DERANI COMBINING RISING-FALLING TONE
-- prefix
-- "-" is emitted by deranize_word when a syllable carries an underdot or is
-- followed by a hyphen / middle dot; "hiatus" and "diphthong" are emitted
-- between vowels.
["-"] = "",
["hiatus"] = "",
["diphthong"] = "",
-- The empty key lets empty slots (no onset, cleared tone, empty final) pass
-- through the lookup without hitting the "(…?)" fallback in deranize_word.
[""] = "",
-- diphthong reform
-- Only looked up when reform mode is on. The letter names repeat the
-- consonants listed in diphthong_lookalikes (d, p, z, nh).
["aı"] = "", -- DERANI LETTER DUDEO
["ao"] = "", -- DERANI LETTER PIPOQ
["eı"] = "", -- DERANI LETTER ZOZEO
["oı"] = "", -- DERANI LETTER NHANHOQ
}
-- Replacement table for Latin punctuation, applied by deranize after the
-- words have been converted. Each key is a single character that deranize
-- escapes with "%" and substitutes globally with the value.
-- NOTE(review): values appear as empty strings or a lone space in this copy;
-- presumably the Derani punctuation glyphs were lost in transit — confirm.
local derani_punctuation = {
["\""] = "",
[":"] = "",
[","] = " ",
["."] = " ",
["!"] = " ",
["?"] = " ",
-- NOTE(review): the header comment mentions cartouches and null-raı with
-- the example "[poq]" and "sá*"; presumably these three keys are those
-- markers — confirm.
["["] = "",
["]"] = "",
["*"] = " ",
["_"] = "", -- compatibility nbsp
}
-- Set of onset consonants whose letter names are reused for the vowels in
-- deranimap (s/a, b/u, c/ı, g/o, f/e). deranize_word consults it for the
-- onset of the second syllable: when a word opens with a bare single vowel
-- (no onset, no further vowel, no coda) and the next onset is in this set,
-- an explicit "'" glyph is written in front of the vowel.
local vowel_lookalikes = {
["s"] = true,
["b"] = true,
["c"] = true,
["g"] = true,
["f"] = true,
}
-- Set of onset consonants whose letter names are reused for the reform
-- diphthong keys in deranimap (d/aı, p/ao, z/eı, nh/oı). deranize_word only
-- consults it when reform mode is on, for the same explicit-"'" decision
-- that vowel_lookalikes drives.
local diphthong_lookalikes = {
["d"] = true,
["p"] = true,
["z"] = true,
["nh"] = true,
}
-- Module-level switch for the diphthong reform. deranize overwrites it on
-- every call (true when a second argument is present) and deranize_word
-- reads it while building glyphs.
local reform = false
--- Convert one whitespace-free chunk of romanized text to Derani.
-- The chunk is scanned syllable by syllable with a ustring pattern. Text that
-- the pattern skips over (punctuation, unknown characters) is copied through
-- in NFC form, and each syllable is turned into a list of deranimap keys that
-- are then looked up. A key missing from deranimap is rendered as "(key?)".
--
-- The pattern expects combining marks as separate code points and dotless
-- "ı"; deranize passes text that is already NFD, lower-cased and i→ı mapped.
-- Reads the module-level `reform` flag.
--
-- @param word chunk to convert
-- @return the converted chunk as a single string
function deranize_word(word)
    local res = {}
    local ix = 1
    -- The tone is taken from the whole chunk but only written on its first
    -- syllable (see the is_first_syllable check below).
    local tone = get_tone(word)
    local len = u.len(word)
    -- NB. this is not PCRE regex, see https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Ustring_patterns
    -- Captures, in order: start position, onset (consonant and/or h), first
    -- vowel, optional combining dot below, optional tone mark, up to two more
    -- vowels, position before the coda, coda (q or m), optional hyphen or
    -- middle dot, position after those, and a lookahead at the next
    -- syllable's onset and first vowel.
    local toaqre = "()([mbpfndtzcsrljꝡgk']?h?)([auıoe])([̣]?)([́̈̂̀]?)([auıoe]?[auıoe]?)()([qm]?)([-·]?)()([mbpfndtzcsrljꝡgk']?h?)([auıoe]?)"
    local is_first_syllable = true
    while ix <= len do
        -- _diacritic is captured only to consume the mark; the tone itself
        -- comes from get_tone above.
        local pos_init, initial, medial, underdot, _diacritic, final, pos_precoda, coda, hyphen, pos_postcoda, next_initial, next_medial = u.match(word, toaqre, ix)
        if pos_init == nil then
            break
        end
        -- Pass through whatever lies between the previous syllable and this one.
        res[#res+1] = u.toNFC(u.sub(word, ix, pos_init - 1))
        -- An "m" directly followed by a vowel is the onset of the next
        -- syllable, not a coda. A hyphen or middle dot after the "m" blocks
        -- this: otherwise the scan would restart on "m-", which cannot begin
        -- a syllable, and the raw "m-" would leak into the output.
        if coda == "m" and hyphen == "" and next_initial == "" and next_medial ~= "" then
            coda = ""
            next_initial = "m"
            ix = pos_precoda
        else
            ix = pos_postcoda
        end
        local v = medial..final
        local is_diphthong = v == "aı" or v == "ao" or v == "eı" or v == "oı"
        -- In reform mode a diphthong is looked up as one two-vowel key.
        if is_diphthong and reform then medial, final = v, "" end
        local glyphs = {initial, tone, medial}
        -- A bare word-initial vowel directly followed by a look-alike onset
        -- gets an explicit "'" in the onset slot.
        if initial == ""
            and is_first_syllable
            and (final == "" or (reform and is_diphthong))
            and coda == ""
            and (vowel_lookalikes[next_initial] or (reform and diphthong_lookalikes[next_initial])) then
            glyphs[1] = "'"
        end
        -- Only the first syllable carries the tone mark.
        if not is_first_syllable then glyphs[2] = "" end
        if glyphs[1] == "" or glyphs[1] == "'" or glyphs[1] == "ꝡ" then
            -- Move tone onto the first vowel
            glyphs[2], glyphs[3] = glyphs[3], glyphs[2]
        end
        if is_diphthong and not reform then
            glyphs[#glyphs+1] = "diphthong"
        elseif final ~= "" then
            glyphs[#glyphs+1] = "hiatus"
        end
        if reform then
            glyphs[#glyphs+1] = final
        else
            -- Splitting on the empty pattern yields one entry per vowel; an
            -- empty `final` yields a single "" entry, which deranimap maps
            -- to "".
            for j, fin in ipairs(mw.text.split(final, "")) do
                if j > 1 then glyphs[#glyphs+1] = "diphthong" end
                glyphs[#glyphs+1] = fin
            end
        end
        if coda ~= "" then
            glyphs[#glyphs+1] = coda .. "_"
        end
        -- An underdot on the vowel or a trailing hyphen / middle dot both
        -- produce the prefix mark.
        if underdot ~= "" or hyphen ~= "" then
            glyphs[#glyphs+1] = "-"
        end
        for _, glyph in ipairs(glyphs) do
            res[#res+1] = deranimap[glyph] or "(" .. glyph .. "?)"
        end
        is_first_syllable = false
    end
    -- Copy through anything left after the last syllable.
    res[#res+1] = u.toNFC(u.sub(word, ix, len))
    return table.concat(res)
end
--- Entry point: convert the first argument from Latin text to Derani.
-- The text is decomposed to NFD, lower-cased and has "i" replaced by "ı";
-- every run of non-whitespace is then passed to deranize_word, and finally
-- each key of derani_punctuation is replaced by its value.
-- Side effect: sets the module-level `reform` flag to whether a second
-- argument is present.
-- @param frame object whose `args[1]` is the text and whose optional
--        `args[2]` switches on reform mode
-- @return the converted text
function deranize(frame)
    local input = frame.args[1]
    assert(input ~= nil, "This function requires at least one argument")
    reform = frame.args[2] ~= nil

    local normalized = u.lower(u.toNFD(input))
    local text = u.gsub(normalized, "i", "ı")
    local converted = u.gsub(text, "(%S+)", deranize_word)

    -- Keys are single punctuation characters, so a "%" prefix escapes them.
    for mark, replacement in pairs(derani_punctuation) do
        converted = u.gsub(converted, "%" .. mark, replacement)
    end
    return converted
end
--- Entry point: strip Derani-only markup from Latin text.
-- Deletes every ":", "[", "]" and "*" and turns each "_" into a
-- non-breaking space (U+00A0, written here as its two UTF-8 bytes).
-- @param frame object whose `args` must hold exactly one argument
-- @return the cleaned text
function clean(frame)
    local args = frame.args
    assert(args[1] ~= nil and args[2] == nil, "This function requires exactly one argument")
    local without_markup = u.gsub(args[1], "[:%[%]*]", "")
    local nbsp = "\194\160"
    -- Parentheses drop gsub's replacement count so only the text is returned.
    return (u.gsub(without_markup, "_", nbsp))
end
-- Module export table: only deranize and clean are exposed; both take a
-- frame and read their input from frame.args.
return {deranize = deranize, clean = clean}