-- Module:Ko-rm
--
-- Based on Wiktionary's wikt:Module:ko-translit, wikt:Module:ko-pron,
-- wikt:Module:ko-pron/data and wikt:Module:ko.

-- Table of exported functions; returned at the end of the file.
local p = {}

-- Short aliases for the Unicode-aware string functions used throughout.
local Ugsub = mw.ustring.gsub
local Umatch = mw.ustring.match
local Usub = mw.ustring.sub
local Uchar = mw.ustring.char
local codepoint = mw.ustring.codepoint

-- Read-only lookup tables (FSC, vowels, boundary, AI) used by p.romanize.
local data = mw.loadData('Module:Ko-rm/data')
-- Helper library providing isEmpty / isNotEmpty.
local lib = require('Module:Feature')

--- Template-facing entry point.
-- Collects the arguments through Module:Arguments (parent frame first,
-- Template:Ko-rm registered as wrapper) and hands them to p._main.
-- @param frame  frame object passed to getArgs
-- @return the result of p._main, or '' when the first argument is absent
function p.main(frame)
	local get_args = require('Module:Arguments').getArgs
	local args = get_args(frame, {
		parentFirst = true,
		wrappers = {'Template:Ko-rm'}
	})

	if args[1] then
		return p._main(args)
	end
	return ''
end

-- Runs an ordered list of { pattern, replacement } pairs over `text`
-- with mw.ustring.gsub and returns the resulting string.
local function apply_substitutions(text, rules)
	for _, rule in ipairs(rules) do
		text = Ugsub(text, rule[1], rule[2])
	end
	return text
end

--- Romanizes args[1] and post-processes the result.
-- Steps, in order: clean the input markup, bail out when no precomposed
-- Hangul syllable is left, normalise a few readings and brackets, call
-- p.romanize, then fix capitalization markers and punctuation.
-- @param args  argument table; args[1] is the text, args.capi / args.sent
--              request capitalization through p.capitalizer, and the whole
--              table is forwarded to p.romanize
-- @return the romanized string, or '' when there is nothing to romanize or
--         p.romanize returned a falsy value
function p._main(args)
	local text = mw.text.unstrip(args[1])

	text = apply_substitutions(text, {
		-- drop parenthesised hanja
		{ '%([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+%)', '' },
		-- NOTE(review): as written this pattern matches nothing the previous
		-- one has not already removed; it may have lost characters (possibly
		-- ''' bold markers) during extraction -- confirm against the wiki page.
		{ "%([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]*[一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+[一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]*%)", '' },
		-- hanja followed by a parenthesised reading: keep only the reading
		{ '[一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+%((.-)%)', '%1' },
		-- NOTE(review): as written this removes every space; the pattern may
		-- have lost surrounding markup during extraction -- confirm.
		{ '(.-) ', '%1' },
		{ '<[%d%p]+:(.-)>', '%1' }, -- manual numeral readings
		{ '<%->', '-' }, -- manual hyphenation
		{ '< >', ' ' }, -- manual spacing
	})

	-- Nothing to do unless at least one precomposed Hangul syllable remains.
	if Umatch(text, '[가-힣]') == nil then
		return ''
	end

	text = apply_substitutions(text, {
		-- pronunciation exception(s)
		{ '여덟', '여덜' },
		{ 'Ⅰ', '일' },
		{ 'Ⅱ', '이' },
		{ 'Ⅲ', '삼' },
		{ 'Ⅳ', '사' },
		{ 'Ⅴ', '오' },
		{ 'Ⅵ', '육' },
		{ 'Ⅶ', '칠' },
		{ 'Ⅷ', '팔' },
		{ 'Ⅸ', '구' },
		{ 'Ⅹ', '십' },
		-- pre-romanization punctuation conversion
		{ '[《「『【]', '“' },
		{ '[》」』】]', '”' },
	})

	local revised = p.romanize(text, args)
	if not revised then
		return ''
	end

	-- Text containing sentence punctuation gets sentence-style capitals:
	-- upper-case the first character and mark the letter after each
	-- terminator with '^' for the capitalization pass below.
	if Umatch(revised, '[%.%?%!]') then
		local head = Usub(revised, 1, 1)
		local tail = Usub(revised, 2, -1)
		revised = apply_substitutions(mw.ustring.upper(head) .. tail, {
			{ "([%.%?%!]) ([a-z%'])", '%1 ^%2' },
			{ "^%'%'%'", "'''^" },
		})
	end

	revised = apply_substitutions(revised, {
		{ "([a-z])%-%'([a-z])", '%1-%2' },
		{ "%^%'%'%'", "'''^" }, -- move the marker past a bold opener
		{ '%^%l', mw.ustring.upper }, -- apply the capitalization markers
		{ '%^', '' }, -- discard unused markers
		{ "%-%-", "-" },
		{ '%-%-', '-' },
		-- punctuation fixing
		{ '…', '...' },
		{ '！', '!' },
		{ '？', '?' },
		{ '”([A-Za-z])', '”-%1' },
		{ '(//[^/@]-@@[^/@]-@@//)([A-Za-z])', '%1-%2' },
		{ '[·・]', ' - ' },
		{ '——', '⸺' },
		{ '&mdash;&mdash;', '⸺' },
	})

	-- Secondary romanisation system: resolve each //...// section, one per pass.
	while revised:find('^.-//[^/]-//.-$') do
		local before, primary, secondary, after =
			string.match(revised, '^(.-)//([^@/]-)@@([^@/]-)@@//(.-)$')
		if secondary == nil then
			-- no @@...@@ part in this section: just drop the slashes
			before, primary, after = string.match(revised, '^(.-)//([^/]-)//(.-)$')
		elseif mw.ustring.lower(secondary) ~= mw.ustring.lower(primary) then
			-- NOTE(review): the empty prefix and the bare trailing space look
			-- like remnants of markup lost in extraction -- confirm.
			primary = '' .. p.capitalizer(primary, true) .. ' '
		else
			primary = p.capitalizer(primary, true)
		end
		revised = before .. primary .. after
	end

	-- all case (|capi=1) or sentence case (|sent=1)
	if args.capi or args.sent then
		revised = p.capitalizer(revised, (args.capi or nil))
	end

	-- post-capitalization punctuation fixing
	return apply_substitutions(revised, {
		{ "”'", '”-' },
		{ '[“”]', '"' },
		{ '([%a])(%d+)', '%1-%2' },
		{ '(%d+)([%a])', '%1-%2' },
		{ '(%d+)-[Pp]x', '%1px' }, -- lazy fix for accidental hyphenation of pixel amounts
	})
end

--- Romanizes every Hangul run found in `text_param`.
--
-- Each run matched by the word pattern (Hangul syllables and jamo together
-- with hyphens, apostrophes, spaces, slashes, corner brackets and carets) is
-- split into syllables, adjusted by the pronunciation rules below, mapped
-- through `data.vowels` and `data.boundary`, and substituted back into the
-- text in place of the run.
--
-- @param text_param  text to romanize
-- @param args        argument table; the optional `nn`, `ni` and `bcred`
--                    entries are comma-separated lists of syllable positions
-- @return the text with its Hangul runs replaced, or nil when `data.boundary`
--         has no entry for a final/initial pair (the pair is logged first)
function p.romanize(text_param, args)
	-- P[name][position] = 1 for every position listed in args[name].
	local P, optional_params = {}, { 'nn', 'ni', 'bcred' }
	for _, pm in ipairs(optional_params) do
		P[pm] = { }
		if args[pm] then
			for pp in mw.text.gsplit(args[pm], ',') do
				P[pm][tonumber(pp) or pp] = 1
			end
		end
	end

	-- Syllable counters that keep running across all words of the text; they
	-- are the positions looked up in P.
	local T_index, T_next_index = 0, 0
	-- True while inside a //...// section (secondary romanisation); it is not
	-- reset between words.
	local rom3 = false

	text_param = Ugsub(text_param, '["](.)', '%1')

	for primitive_word in mw.ustring.gmatch(text_param, '[%-ᄀ-ᄒ' .. 'ᅡ-ᅵ' .. 'ᆨ-ᇂ' .. "ㄱ-ㅣ가-힣' /「」%^]+") do
		local the_original = primitive_word

		-- Record where ''' bold markers sit, then strip them from the word.
		primitive_word = Ugsub(primitive_word, "'''", 'ß')
		local bold_position, bold_count = {}, 0
		while Umatch(primitive_word, 'ß') do
			bold_position[(mw.ustring.find(primitive_word, 'ß')) + bold_count] = true
			primitive_word = Ugsub(primitive_word, 'ß', '', 1)
			bold_count = bold_count + 1
		end

		local word_set = { primitive_word }

		local word_set_romanisations = {}
		for _, respelling in ipairs(word_set) do
			local decomposed_syllables = p.decompose_syllable(respelling)
			local romanisation = {}
			local romanisation3 = {}
			local bold_insert_count = 0

			-- Index 0 is a virtual empty syllable before the word, so that the
			-- junction leading into the first real syllable is emitted too.
			for index = 0, #decomposed_syllables, 1 do
				if index ~= 0 then T_index = T_index + 1 end
				local this_syllable_text = index ~= 0 and Usub(respelling, index, index) or ''

				-- Collect marker characters; p.decompose_syllable skips them,
				-- so one character is dropped from `respelling` per marker to
				-- keep its indices aligned with `decomposed_syllables`.
				local forced = ''
				while Umatch(this_syllable_text, '[/「」^]') do
					forced = forced .. this_syllable_text
					respelling = Usub(respelling, 2, -1)
					this_syllable_text = index ~= 0 and Usub(respelling, index, index) or ''
				end
				-- A '//' marker toggles the secondary romanisation mode.
				if forced:find('//') then
					rom3 = not rom3
				end

				if this_syllable_text == '-' then
					-- skip it, it will be handled below
				else
					T_next_index = T_index
					local syllable = decomposed_syllables[index] or { initial = 'Ø', vowel = 'Ø', final = 'X' }

					-- Find the next non-hyphen syllable, remembering whether a
					-- hyphen was skipped on the way.
					local next_index = index
					local next_syllable_text
					local saw_hyphen_after = false
					while true do
						next_index = next_index + 1
						T_next_index = T_next_index + 1
						next_syllable_text = next_index > #decomposed_syllables and '' or Usub(respelling, next_index, next_index)
						if next_syllable_text ~= '-' then break end
						saw_hyphen_after = true
					end
					local next_syllable = decomposed_syllables[next_index] or { initial = 'Ø', vowel = 'Ø', final = 'Ø' }

					syllable.final = data.FSC[syllable.final] or syllable.final

					if this_syllable_text == '넓' then
						if Umatch(next_syllable.initial, '[ᄌᄉ]') then
							syllable.final = 'ᆸ'
						elseif next_syllable.initial == 'ᄃ' then
							if Umatch(next_syllable.vowel, '[^ᅡᅵ]') then
								syllable.final = 'ᆸ'
							end
						end
					end

					local vowel = data.vowels[syllable.vowel][2]

					if P.nn[T_next_index] and Umatch(syllable.final .. next_syllable.initial, 'ᆫᄅ') then
						next_syllable.initial = 'ᄂ'
					end

					if P.ni[T_next_index] and next_syllable.initial == 'ᄋ' and Umatch(next_syllable.vowel, '[ᅵᅣᅧᅭᅲ]') then
						next_syllable.initial = 'ᄂ'
					end

					if P.bcred[T_index] then
						syllable.final = data.boundary[syllable.final .. '-Ø'][1]
					end

					if index ~= 0 and this_syllable_text == '밟' and not Umatch(next_syllable.initial, '[ᄋᄒ]') then
						syllable.final = 'ᆸ'
					end

					if Umatch(this_syllable_text, '[닭뷁삵슭앍줅찱칡탉흙]') and not Umatch(next_syllable.initial .. ';' .. next_syllable.vowel, 'ᄋ;[ᅦᅧᅳᅴᅵ]') then
						syllable.final = 'ᆨ'
					end

					-- Before 없 the final is reduced to a single consonant.
					if next_syllable_text == '없' then
						if Umatch(syllable.final, '[ᆩᆪᆰᆿ]') then
							syllable.final = 'ᆨ'
						elseif Umatch(syllable.final, '[ᆬᆭ]') then
							syllable.final = 'ᆫ'
						elseif Umatch(syllable.final, '[ᆺᆻᆽᆾᇀ]') then
							syllable.final = 'ᆮ'
						elseif Umatch(syllable.final, '[ᆲᆳᆴᆶ]') then
							syllable.final = 'ᆯ'
						elseif syllable.final == 'ᆱ' then
							syllable.final = 'ᆷ'
						elseif Umatch(syllable.final, '[ᆵᆹᇁ]') then
							syllable.final = 'ᆸ'
						end
					end

					if (not P.bcred[T_index]) then
						if Umatch(syllable.final .. next_syllable.initial, 'ᇀᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆾ'
							end
						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆴᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆯ'
								next_syllable.initial = 'ᄎ'
							end
						-- The original also tested `tonumber(s_variation or -1) ~= index`
						-- here. `s_variation` is never defined in this module, so that
						-- test was always true; it was dropped to avoid reading an
						-- undeclared global.
						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆮᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆽ'
							end
						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆮᄒ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆾ'
								next_syllable.initial = 'ᄋ'
							end
						end
					end

					if syllable.final .. next_syllable.initial == 'ᆺᄋ' and not Umatch(next_syllable_text, '[아았어었에으은을음읍의이인일임입있]') then
						syllable.final = 'ᆮ'
					end

					-- Look up how this final + next initial pair is written.
					local bound = syllable.final .. '-' .. next_syllable.initial
					if (not data.boundary[bound]) then
						mw.log('No boundary data for ' .. bound .. '.')
						return nil
					end
					local junction = data.boundary[bound][2]
					local junction3 = data.boundary[bound][3] or data.boundary[bound][2]

					-- Re-insert a bold marker recorded for this position. The
					-- junction has the form "<final>;<initial>"; the marker goes
					-- before the final part when the pair is just an initial
					-- consonant, otherwise after it.
					-- NOTE(review): the ''' and '' literals in this closure were
					-- missing from the mangled source and have been restored from
					-- context -- confirm against the wiki page.
					if bold_position[index + bold_insert_count + 1] then
						junction = Ugsub(junction, '^.*$', function(matched)
							local a, b = string.match(matched, '^(ng);(.*)$')
							if ((not a) and (not b)) then
								a, b = string.match(matched, '^(.?%-?);(.*)$')
							end
							return Umatch(syllable.final .. next_syllable.initial, '^Ø?[ᄀ-ᄒ]$')
								and "'''" .. (a or '') .. ';' .. (b or '')
								or (a or '') .. "'''" .. ';' .. (b or '')
						end)

						bold_insert_count = bold_insert_count + 1
					end

					local final_cons, initial_cons = Umatch(junction, '^(.*);(.*)$')

					-- special romanisation
					if rom3 then
						-- Seed the secondary buffer with the initial consonant
						-- that was last emitted to the primary one.
						if (#romanisation3 == 0 and #romanisation > 0) then
							table.insert(romanisation3, romanisation[#romanisation])
						end
						local final_cons3, initial_cons3 = Umatch(junction3, '^(.*);(.*)$')
						table.insert(romanisation3, vowel)
						table.insert(romanisation3, final_cons3)
						table.insert(romanisation3, (saw_hyphen_after and '-' or ''))
						table.insert(romanisation3, initial_cons3)
					elseif #romanisation3 > 0 then
						-- The section just ended: drop the trailing hyphen and
						-- initial slots, then emit the buffer as @@...@@.
						table.remove(romanisation3)
						table.remove(romanisation3)
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
						romanisation3 = {}
					end

					-- Markers go in front of the previous hyphen/initial slots.
					-- On the first pass the table is empty and the position is 0;
					-- `forced` is '' then, and table.concat ignores slot 0.
					table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), forced)
					table.insert(romanisation, vowel)
					table.insert(romanisation, final_cons)
					table.insert(romanisation, (saw_hyphen_after and '-' or ''))
					table.insert(romanisation, initial_cons)

					-- straggler characters at end of word set
					if index == #decomposed_syllables and lib.isNotEmpty(Usub(respelling, index+1, index+1)) then
						local N = Usub(respelling, index+1, #respelling)
						if (N:find('//') and #romanisation3 > 0) then
							table.remove(romanisation3)
							table.remove(romanisation3)
							table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
							romanisation3 = {}
							table.insert(romanisation, N)
						else
							romanisation3 = {}
						end
					end

					-- Debug aid, left disabled:
					-- local currRom = {
					-- 	syllable = syllable,
					-- 	vowel = vowel,
					-- 	final_cons = final_cons,
					-- 	initial_cons = initial_cons,
					-- 	totalRom = table.concat(romanisation),
					-- 	totalRom3 = table.concat(romanisation3)
					-- }
					-- mw.logObject(currRom, 'currRom')
				end
			end

			local temp_romanisation = table.concat(romanisation)
			-- Two passes, so that overlapping '…' separators are all resolved;
			-- an apostrophe is kept only where data.AI lists the letter pair.
			for _ = 1, 2 do
				temp_romanisation = Ugsub(temp_romanisation, '(.)…(.)', function(a, b)
					return a .. (data.AI[a .. b] and "'" or '') .. b
				end)
				temp_romanisation = Ugsub(temp_romanisation, "wo'e", 'woe')
				temp_romanisation = Ugsub(temp_romanisation, "yo'e", 'yoe')
				temp_romanisation = Ugsub(temp_romanisation, "we'o", 'weo')
				temp_romanisation = Ugsub(temp_romanisation, "we'u", 'weu')
				temp_romanisation = Ugsub(temp_romanisation, "ye'u", 'yeu')
				temp_romanisation = Ugsub(temp_romanisation, "yu'i", 'yui')
			end
			table.insert(word_set_romanisations, temp_romanisation)
		end

		-- Replace the first remaining occurrence of the run with its romanisation.
		text_param = Ugsub(
			text_param,
			p.pattern_escape(the_original),
			table.concat(word_set_romanisations, '/'),
			1
		)
	end

	return text_param
end

--- Splits one character into its initial, vowel and final components.
--
-- A precomposed Hangul syllable (가-힣) is decomposed arithmetically from its
-- offset to 0xAC00: 588 code points per initial, 28 per vowel. A standalone
-- jamo is placed in the slot matching its range, with 'Ø' in the others.
-- Anything else yields the placeholder { 'Ø', ' ', 'X' }.
--
-- @param syllable  the character to decompose
-- @return a table with `initial`, `vowel` and `final` fields; `final` is ''
--         for a precomposed syllable without a final consonant
function p.decompose_jamo(syllable)
	if (not Umatch(syllable, '[가-힣]')) then
		if Umatch(syllable, '[ᄀ-ᄒ]') then
			return { initial = syllable, vowel = 'Ø', final = 'Ø' }
		elseif Umatch(syllable, '[ᅡ-ᅵ]') then
			return { initial = 'Ø', vowel = syllable, final = 'Ø' }
		elseif Umatch(syllable, '[ᆨ-ᇂ]') then
			return { initial = 'Ø', vowel = 'Ø', final = syllable }
		elseif Umatch(syllable, '[ㄱ-ㆎ]') then
			return { initial = 'Ø', vowel = 'Ø', final = syllable }
		else
			return { initial = 'Ø', vowel = ' ', final = 'X' }
		end
	end

	local cp = codepoint(syllable)
	-- NOTE(review): this fallback is positional, unlike every other return
	-- of this function. Its two leading '' literals were missing from the
	-- mangled source and have been restored -- confirm against the wiki page.
	if (not cp) then return { '', '', '' } end

	local relative_cp = cp - 0xAC00
	local final_index = relative_cp % 28
	-- Index 0 means "no final consonant"; other indices start right after 0x11A7.
	local jongseong = ((final_index ~= 0) and Uchar(0x11A7 + final_index)) or ''
	local jungseong = Uchar(0x1161 + math.floor((relative_cp % 588) / 28))
	local choseong = Uchar(0x1100 + math.floor(relative_cp / 588))

	return { initial = choseong, vowel = jungseong, final = jongseong }
end

--- Escapes the Lua pattern magic characters in `text` so that it can be used
-- as a literal search pattern.
--
-- The parentheses are now escaped as well; previously they were left alone,
-- so escaped text containing '(' or ')' still formed captures or a malformed
-- pattern.
--
-- @param text  the string to escape, or a table whose `args[1]` holds it
--              (for example a frame object)
-- @return the escaped string
function p.pattern_escape(text)
	if type(text) == 'table' then
		text = text.args[1]
	end
	-- Magic characters: ^ $ % . [ ] ( ) * + - ?
	text = Ugsub(text, '([%^$%%.%[%]%(%)*+%-?])', '%%%1')
	return text
end

--- Decomposes a word character by character.
-- The marker characters '/', '「', '」' and '^' are left out; every other
-- character is passed through p.decompose_jamo.
-- @param word  the string to split
-- @return a sequence of p.decompose_jamo results, in input order
function p.decompose_syllable(word)
	local parts = {}
	for ch in mw.text.gsplit(word, '') do
		local is_marker = Umatch(ch, '[/「」%^]')
		if not is_marker then
			parts[#parts + 1] = p.decompose_jamo(ch)
		end
	end
	return parts
end

--- Capitalizes letters that follow a boundary character.
--
-- The string is walked one character at a time with a "capitalize next"
-- flag that starts out set:
--   * a boundary character (any character other than an ASCII letter, '-',
--     '"', '_' or an apostrophe, and, when `all` is nil, also other than ','
--     or whitespace) sets the flag; a space leaves an already-set flag set;
--   * '_' while the flag is set is removed and clears the flag, which stops
--     the following letter from being capitalized;
--   * any other character while the flag is set is upper-cased and clears it.
--
-- @param str  the string to process
-- @param all  when not nil, whitespace and commas count as boundaries too,
--             so every word is capitalized
-- @return the processed string; `str` itself when lib.isNotEmpty(str) is false
function p.capitalizer(str, all)
	if not lib.isNotEmpty(str) then
		return str
	end

	-- The hyphen is written with the pattern escape '%-'. The previous '\-'
	-- string escape is only tolerated by Lua 5.1 and is rejected by later
	-- versions; both forms denote a literal hyphen inside the set.
	local boundary_pattern = (all ~= nil)
		and "[^A-Za-z%-\"_]"
		or "[^A-Za-z%-\"_,%s]"

	local chars = mw.text.split(str, '')
	local cap = true
	for index = 1, #chars do
		local ch = chars[index]
		if (ch ~= "'" and ch:find(boundary_pattern)) or (cap and ch == ' ') then
			cap = true
		elseif cap and ch == '_' then
			cap = false
			chars[index] = ''
		elseif cap then
			chars[index] = mw.ustring.upper(ch)
			cap = false
		end
	end

	return table.concat(chars, '')
end

--- Removes this module's input markup from a string.
-- @param str  the string to clean
-- @return the cleaned string, or '' when lib.isEmpty(str) is true
function p.strip(str)
	if lib.isEmpty(str) then
		return ''
	end

	-- Applied in this order.
	local removals = {
		{ '//(.-)//', '%1' }, -- remove given name specifier
		{ '%^', '' }, -- remove capitalization marker
		{ '<.>', '' }, -- remove arbitrary separator
		{ '_', '' }, -- remove capitalization blacklister
		{ '<([%d%p]+):.->', '%1' }, -- reduce number readings to just the number
	}

	local cleaned = str
	for _, step in ipairs(removals) do
		cleaned = Ugsub(cleaned, step[1], step[2])
	end
	return cleaned
end

-- Export the table holding this module's functions.
return p