မော်ဂျူး:Jpan-sortkey

အဝ်ႏ ဝိစ်သိဉ်နရီ ကို

Documentation for this module may be created at မော်ဂျူး:Jpan-sortkey/doc

-- Table that collects this module's public functions; returned at the end of the file.
local export = {}

-- Local aliases for the library functions used below. The find/gsub/match/sub/toNFC
-- aliases point at Scribunto's mw.ustring library, not at the byte-based string library.
local concat = table.concat
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local insert = table.insert
local match = mw.ustring.match
local sub = mw.ustring.sub
local toNFC = mw.ustring.toNFC

-- Character-range data. Each value is spliced between "[" and "]" when patterns are
-- built in makeSortKey, so it is used as the inside of a character class.
local range = mw.loadData("Module:ja/data/range")
local kanji_pattern = range.kanji
local ideograph_pattern = range.ideograph
local kana_graph_pattern = range.kana_graph
local latin_pattern = range.latin

-- Functions taken from other modules: language lookup by code, the Hani sort key
-- generator (used as the final pass in makeSortKey), and the tracking helper.
local get_by_code = require("Module:languages").getByCode
local Hani_sort = require("Module:Hani-sortkey").makeSortKey
local track = require("Module:debug/track")

-- Build a sort key for `text`.
--
-- Unless `lang` is "mul", the function repeatedly treats `text` as a page title, loads
-- that page, isolates the level-2 section headed by the language's canonical name and
-- looks for a reading there:
--   * the 2nd positional argument of {{<lang>-head}} / {{<lang>-pos}};
--   * the argument following a "kana" positional argument of {{head}} / {{head-lite}}
--     whose first argument is `lang`;
--   * the 1st positional argument of {{<lang>-noun}}, {{<lang>-verb}}, etc.;
--   * otherwise, the `sortkey` argument or the per-kanji readings of {{<lang>-kanjitab}}.
-- The first template that yields a reading wins. The loop stops when a text has already
-- been looked up, or when it no longer contains any digit or any character from the
-- kanji / ideograph / kana_graph / latin ranges.
--
-- The resulting text is then run through Module:Hira-sortkey and finally through the
-- Hani sort key generator to deal with any kanji that are still present.
--
-- Parameters:
--   text  string; the term to sort, also used as the title of the page to inspect.
--   lang  string; language code (compared with "mul", passed to getByCode and used as
--         the prefix of the template names).
--   sc    passed through unchanged to the Hira and Hani sort key functions.
-- Returns the value produced by the Hani sort key function.
function export.makeSortKey(text, lang, sc)
	-- Determine reading.
	local seen_pages, langname = {}
	while lang ~= "mul" and (not seen_pages[text]) and find(text, "[0-9" .. kanji_pattern .. ideograph_pattern .. kana_graph_pattern .. latin_pattern .. "]") do
		-- `repeat ... until true` runs exactly once; `break` at this level acts as
		-- "continue" for the enclosing while loop.
		repeat
			-- langname is escaped to ensure pattern safety. The parentheses drop
			-- gsub's second return value (the substitution count).
			langname = langname or (get_by_code(lang)
				:getCanonicalName()
				:gsub("[%^$()%%.[%]*+%-?]", "%%%0"))
			seen_pages[text] = true

			-- mw.title.new returns nil when the string is not a valid title, so guard
			-- before asking for the page content.
			local title = mw.title.new(toNFC(text))
			local content = title and title:getContent()
			if not content then
				break
			end

			local start, heading_end = content:find("%f[^%z\n\r]==[\t ]*" .. langname .. "[\t ]*==[\t ]*%f[%z\n\r]")
			if not start then
				break
			end

			-- Find the next level-2 heading, which closes the language section.
			-- The scan runs over the tail starting at heading_end, so `loc` is relative
			-- to that tail and has to be converted back to an index into `content`
			-- (tail index loc == content index heading_end + loc - 1). One more is
			-- subtracted so that the section ends just before the next heading.
			local section_end
			for loc, level in content:sub(heading_end):gmatch("()%f[^%z\n\r](=+)([^\n\r]+)%2[\t ]*%f[%z\n\r]") do
				if #level == 2 then
					section_end = heading_end + loc - 2
					break
				end
			end
			-- A nil section_end means "to the end of the page".
			content = content:sub(heading_end, section_end)

			local findTemplates = require("Module:templateparser").findTemplates
			-- Template name -> index of the positional argument that holds the reading.
			-- Built once per page rather than once per template.
			local reading_arg = {
				[lang .. "-head"] = 2,
				[lang .. "-pos"] = 2,
				[lang .. "-noun"] = 1,
				[lang .. "-verb"] = 1,
				[lang .. "-adj"] = 1,
				[lang .. "-phrase"] = 1,
				[lang .. "-verb form"] = 1,
				[lang .. "-verb-suru"] = 1,
				[lang .. "-see"] = 1,
				[lang .. "-see-kango"] = 1,
				[lang .. "-gv"] = 1,
			}
			local kanjitab_name = lang .. "-kanjitab"
			-- `br` records that a reading was taken from a headword-style template.
			local kanjitab, br
			for template, args in findTemplates(content) do
				local index = reading_arg[template]
				if index and args[index] then
					-- Strip spaces, hyphens, dots, carets and percent signs.
					text = (args[index]:gsub("[ %-%.^%%]", ""))
					br = true
					break
				elseif (template == "head" or template == "head-lite") and args[1] == lang then
					for i, arg in ipairs(args) do
						if arg == "kana" then
							local kana = args[i+1]
							if kana then
								text = kana
								br = true
								break
							end
						end
					end
					-- The break above only leaves the argument loop; stop scanning
					-- templates too, so that a later template cannot overwrite the
					-- reading that was just found.
					if br then
						break
					end
				elseif template == kanjitab_name then
					-- Only the first kanjitab in the section is kept.
					kanjitab = kanjitab or args
				end
			end

			-- kanjitab is only a fallback for when no headword template gave a reading.
			if br or not kanjitab then
				break
			end

			track{"Jpan-sortkey/kanjitab", "Jpan-sortkey/kanjitab/" .. lang}
			if kanjitab.sortkey then
				text = kanjitab.sortkey
				break
			end

			-- Split the text at every kanji (or 々), collecting the non-kanji runs
			-- before, between and after them. The callback returns nothing, so gsub
			-- is used purely for iteration.
			local non_kanji = {}
			local kanji_border = 1
			gsub(text, "()([" .. kanji_pattern .. "々])()", function(p1, _, p2)
				insert(non_kanji, sub(text, kanji_border, p1 - 1))
				kanji_border = p2
			end)
			insert(non_kanji, sub(text, kanji_border))

			-- Process readings: each positional kanjitab argument is an optional kana
			-- string followed by an optional number; entries without kana get length 1.
			local readings_actual = {}
			for i in ipairs(kanjitab) do
				local reading_kana, reading_length = match(kanjitab[i] or "", "^([^0-9]*)([0-9]*)$")
				reading_kana = reading_kana ~= "" and reading_kana or nil
				-- tonumber("") is nil, so a missing number falls back to 1.
				reading_length = reading_kana and tonumber(reading_length) or 1

				if reading_kana then
					-- k<i> overrides the reading; o<i> is appended to it.
					local actual_reading = kanjitab["k" .. i]
					local okurigana = kanjitab["o" .. i]
					readings_actual[i] = {(actual_reading or reading_kana) .. (okurigana or ""), reading_length}
				else
					readings_actual[i] = {nil, 1}
				end
			end

			-- Interleave the readings with the non-kanji runs. `id` advances by each
			-- reading's length, selecting the non-kanji run that follows that many kanji.
			local sortkey = {non_kanji[1]}
			local id = 1
			for _, v in ipairs(readings_actual) do
				id = id + v[2]
				-- A reading of exactly "-" contributes nothing.
				v[1] = v[1] ~= "-" and v[1]
				insert(sortkey, (v[1] or "") .. (non_kanji[id] or ""))
			end
			sortkey = concat(sortkey)
			if sortkey ~= "" then
				text = sortkey
			end
		until true
	end
	
	-- Use hiragana sort.
	text = require("Module:Hira-sortkey").makeSortKey(text, lang, sc)
	
	-- Run through Hani sort, to catch any stray kanji. This shouldn't happen but often does, and we still want to handle them sensibly in the time before the entry is fixed.
	local ret = Hani_sort(text, lang, sc)
	
	-- Track the cases where the Hani pass actually changed something.
	if not (lang == "mul" or ret == text) then
		track{"Jpan-sortkey/fallback", "Jpan-sortkey/fallback/" .. lang}
	end
	
	return ret
end

-- Hand the table of public functions back to require().
return export