မော်ဂျူး:ru-common
Documentation for this module may be created at မော်ဂျူး:ru-common/doc
--[[
Author: Benwing; some very early work by CodeCat and Atitarev
This module holds some commonly used functions for the Russian language.
It's generally for use from other modules, not #invoke, although some functions
can be invoked from a template (export.iotation(), export.reduce_stem(),
export.dereduce_stem() -- this was actually added to support calling from a
bot script rather than from a user template). There's also export.main(),
which supposedly can be used to invoke most functions in this module from a
template, but it may or may not work. There may also be issues when invoking
such functions from templates when transliteration is present, due to the
need for the transliteration to be decomposed, as mentioned below (all strings
from Wiktionary pages are normally in composed form).
NOTE NOTE NOTE: All functions assume that transliteration (but not Russian)
has had its acute and grave accents decomposed using export.decompose().
This is the first thing that should be done to all user-specified
transliteration and any transliteration we compute that we expect to work with.
]]
local export = {}
local lang = require("Module:languages").getByCode("ru")
local strutils = require("Module:string utilities")
local m_table_tools = require("Module:table tools")
-- Prevents an infinite require loop since ru-translit requires a different function in this module.
local m_ru_translit = require("Module:utilities").require_when_needed("Module:ru-translit")
local u = mw.ustring.char
local rfind = mw.ustring.find
local rsubn = mw.ustring.gsub
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper
local usub = mw.ustring.sub
-- Combining diacritics, built from their code points so that nothing depends
-- on how an editor renders or normalizes combining characters.
local AC = u(0x0301) -- combining acute accent
local GR = u(0x0300) -- combining grave accent
local CFLEX = u(0x0302) -- combining circumflex
local BREVE = u(0x0306) -- combining breve
local DIA = u(0x0308) -- combining diaeresis
local CARON = u(0x030C) -- combining caron
local OGONEK = u(0x0328) -- combining ogonek
-- Placeholder code points standing in for a vowel or a consonant; they are
-- included in the vowel/consonant sets below.
local PSEUDOVOWEL = u(0xFFF1) -- pseudovowel placeholder
local PSEUDOCONS = u(0xFFF2) -- pseudoconsonant placeholder
-- Unless noted otherwise, the exported strings below are plain sets of
-- characters intended to be spliced into a bracket expression, i.e.
-- "[" .. set .. "]" or "[^" .. set .. "]".
-- any accent: acute, grave, diaeresis, breve, caron, ogonek (the circumflex
-- CFLEX is not part of this set)
export.accent = AC .. GR .. DIA .. BREVE .. CARON .. OGONEK
-- complete pattern matching zero or more accents
export.opt_accent = "[" .. export.accent .. "]*"
-- any composed Cyrillic vowel with grave accent
export.composed_grave_vowel = "ѐЀѝЍ"
-- any Cyrillic vowel except ёЁ; includes the pseudovowel placeholder and the
-- composed grave-accented vowels
export.vowel_no_jo = "аеиоуяэыюіѣѵАЕИОУЯЭЫЮІѢѴ" .. PSEUDOVOWEL .. export.composed_grave_vowel
-- any Cyrillic vowel, including ёЁ
export.vowel = export.vowel_no_jo .. "ёЁ"
-- any vowel in transliteration (plus the pseudovowel placeholder)
export.tr_vowel = "aeěɛiouyAEĚƐIOUY" .. PSEUDOVOWEL
-- any consonant in transliteration, omitting soft/hard sign (plus the
-- pseudoconsonant placeholder)
export.tr_cons_no_sign = "bcčdfghjklmnpqrsštvwxzžBCČDFGHJKLMNPQRSŠTVWXZŽ" .. PSEUDOCONS
-- any consonant in transliteration, including soft/hard sign
export.tr_cons = export.tr_cons_no_sign .. "ʹʺ"
-- complete pattern for any consonant in transliteration, including soft/hard
-- sign, optionally followed by any accent(s)
export.tr_cons_acc_re = "[" .. export.tr_cons .. "]" .. export.opt_accent
-- any Cyrillic consonant except sibilants and ц; includes the soft/hard signs
-- and the pseudoconsonant placeholder
export.cons_except_sib_c = "бдфгйклмнпрствхзьъБДФГЙКЛМНПРСТВХЗЬЪ" .. PSEUDOCONS
-- Cyrillic sibilant consonants
export.sib = "шщчжШЩЧЖ"
-- Cyrillic sibilant consonants and ц
export.sib_c = export.sib .. "цЦ"
-- any Cyrillic consonant
export.cons = export.cons_except_sib_c .. export.sib_c
-- Cyrillic velar consonants
export.velar = "кгхКГХ"
-- uppercase Cyrillic letters: the vowels (including Ё) followed by the
-- consonants and the soft/hard signs -- not consonants only
export.uppercase = "АЕИОУЯЭЫЁЮІѢѴБДФГЙКЛМНПРСТВХЗЬЪШЩЧЖЦ"
-- Wrapper around rsubn() that yields only the substituted string and drops
-- the substitution count. The parentheses truncate the call to one value.
local function rsub(term, foo, bar)
	return (rsubn(term, foo, bar))
end
-- "If not empty": turns the empty string (and any falsy value) into nil and
-- passes every other value through unchanged.
local function ine(x)
	if not x or x == "" then
		return nil
	end
	return x
end
-- Template entry point. Looks up the export named by the first frame
-- argument: if it is a function, it is called with the second and third
-- arguments; otherwise it is indexed by the second argument and that value is
-- returned.
-- FIXME, does this actually work?
-- FIXME: Not used. Consider deleting.
function export.main(frame)
	local args = frame.args
	local target = export[args[1]]
	if type(target) ~= 'function' then
		return target[args[2]]
	end
	return target(args[2], args[3])
end
-- Selects the preposition о, об or обо for the phrase `phr` that follows it.
-- The phrase may start with punctuation. Returns the preposition as a string
-- (or nil if no first word could be extracted).
function export.obo(phr)
-- FIXME: Not used. Consider deleting.
-- Algorithm design is mainly inherited from w:ru:template:Обо
-- Extract the first word: skip leading punctuation/whitespace/control
-- characters, then capture lazily up to the next such character; if there is
-- none, capture up to the end of the phrase instead.
local w = rmatch(phr,"[%p%s%c]*(.-)[%p%s%c]") or rmatch(phr,"[%p%s%c]*(.-)$")
if not w then return nil end
-- A fixed list of words takes обо; plain (non-pattern) search of the
-- lowercased word, delimited by spaces, inside the space-separated list.
if string.find(" всей всём всех мне ",' '..ulower(w)..' ',1,true) then return 'обо' end
-- Look at the first two characters only.
local ws=usub(w,1,2)
if ws==uupper(ws) then -- unchanged by uppercasing: treated as an abbreviation
-- For abbreviations the decision depends on the first letter being in this
-- set (presumably letters whose spoken names begin with a vowel sound --
-- TODO confirm against the ru.wikipedia template).
if rmatch(ws,"^[ЙУНФЫАРОЛЭСМИRYUIOASFHLXNMÖÜÄΑΕΟΥΩ]") then return 'об' else return 'о' end
-- Ordinary word: об before the listed Cyrillic, Latin and Greek vowel letters.
elseif rmatch(uupper(w),"^[АОЭИУЫAOIEÖÜÄΑΕΟΥΩ]") then
return 'об'
else
return 'о'
end
end
-- Apply Proto-Slavic iotation, i.e. the change that a Slavic -j- causes in a
-- stem-final consonant.
--   stem: Cyrillic stem, or a frame object when invoked from a template (the
--         three values are then read from its first three arguments, with
--         empty strings treated as nil)
--   tr:   optional transliteration of the stem, iotated in parallel
--   shch: pass "щ" to iotate stem-final т as щ rather than the usual ч
-- Returns stem, tr; for a template call, returns whatever
-- export.combine_russian_tr(stem, tr) returns instead.
function export.iotation(stem, tr, shch)
	-- so this can be called from a template
	local from_template = type(stem) == 'table'
	if from_template then
		local args = stem.args
		stem, tr, shch = ine(args[1]), ine(args[2]), ine(args[3])
	end

	-- normally "т" is iotated as "ч" but there are many verbs that are
	-- iotated with "щ"
	local use_shch = shch == "щ"

	-- Rules are applied in order; each only looks at the end of the stem.
	local cyrillic_rules = {
		{"[сх]$", "ш"},
		{"ск$", "щ"},
		{"ст$", "щ"},
		{"[кц]$", "ч"},
		{"т$", use_shch and "щ" or "ч"},
		{"[гдз]$", "ж"},
		{"([бвмпф])$", "%1л"},
	}
	for _, rule in ipairs(cyrillic_rules) do
		stem = rsub(stem, rule[1], rule[2])
	end

	if tr then
		-- Same rules, in the same order, on the transliteration.
		local translit_rules = {
			{"[sx]$", "š"},
			{"sk$", "šč"},
			{"st$", "šč"},
			{"[kc]$", "č"},
			{"t$", use_shch and "šč" or "č"},
			{"[gdz]$", "ž"},
			{"([bvmpf])$", "%1l"},
		}
		for _, rule in ipairs(translit_rules) do
			tr = rsub(tr, rule[1], rule[2])
		end
	end

	if from_template then
		return export.combine_russian_tr(stem, tr)
	end
	return stem, tr
end
-- Does a run of Cyrillic words in connected text need accents? The text is
-- split on whitespace and each word is checked on its own. A word needs
-- accents if it is unstressed and has more than one vowel, unless it is a
-- prefix or suffix (starts or ends with a hyphen). Returns true as soon as
-- one such word is found, false otherwise.
function export.needs_accents(text)
	for _, word in ipairs(rsplit(text, "%s")) do
		local is_affix = rfind(word, "^%-") or rfind(word, "%-$")
		if not is_affix and export.is_unstressed(word)
			and not export.is_monosyllabic(word) then
			return true
		end
	end
	return false
end
-- Truthy if the Cyrillic word carries a stress mark: a combining acute, a
-- combining diaeresis, or the letter ё/Ё (which is inherently stressed).
-- A standalone diaeresis occurs on ѣ, as in the plural of сѣдло.
-- Returns the results of the underlying find (nil when unstressed).
function export.is_stressed(word)
	local stress_marker = "[́̈ёЁ]"
	return rfind(word, stress_marker)
end
-- True if the Cyrillic word has no stress mark (acute, diaeresis or ё/Ё);
-- the boolean negation of export.is_stressed().
function export.is_unstressed(word)
	if export.is_stressed(word) then
		return false
	end
	return true
end
-- Truthy if the Cyrillic word is stressed on its last syllable: either ё/Ё
-- or a vowel bearing an acute/diaeresis is followed only by non-vowels up to
-- the end of the word. Returns the match position, or nil.
function export.is_ending_stressed(word)
	local vowels = export.vowel
	local non_vowel_tail = "[^" .. vowels .. "]*$"
	local jo_pos = rfind(word, "[ёЁ]" .. non_vowel_tail)
	if jo_pos then
		return jo_pos
	end
	return (rfind(word, "[" .. vowels .. "][́̈]" .. non_vowel_tail))
end
-- Truthy if a Cyrillic word has two or more stresses (acute or diaeresis).
-- ё/Ё is first rewritten as е + acute so that it counts as a stressed vowel.
function export.is_multi_stressed(word)
	local normalized = rsub(word, "[ёЁ]", "е́")
	local stressed_vowel = "[" .. export.vowel .. "][́̈]"
	return rfind(normalized, stressed_vowel .. ".*" .. stressed_vowel)
end
-- Truthy if the Cyrillic word is stressed on its first syllable, i.e. the
-- first vowel of the word is ё/Ё or is directly followed by a stress mark.
-- As in export.is_stressed(), export.is_ending_stressed() and
-- export.is_multi_stressed(), both the acute and the diaeresis count as
-- stress marks (previously only the acute was recognized here, so a word
-- stressed via a diaeresis on its first vowel was wrongly rejected).
-- Returns the match position, or nil.
function export.is_beginning_stressed(word)
	local vowels = export.vowel
	-- any number of non-vowels from the start of the word
	local onset = "^[^" .. vowels .. "]*"
	return rfind(word, onset .. "[ёЁ]") or
		rfind(word, onset .. "[" .. vowels .. "][" .. AC .. DIA .. "]")
end
-- True if the Cyrillic word contains no vowel. Suffixes (words starting with
-- a hyphen) are never treated as nonsyllabic even when they lack a vowel, as
-- they are generally added onto words with vowels.
function export.is_nonsyllabic(word)
	if rfind(word, "^%-") then
		return false
	end
	return not rfind(word, "[" .. export.vowel .. "]")
end
-- True if the Cyrillic word has at most one vowel; this includes
-- non-syllabic stems such as льд-.
function export.is_monosyllabic(word)
	local any_vowel = "[" .. export.vowel .. "]"
	local two_vowels = rfind(word, any_vowel .. ".*" .. any_vowel)
	return not two_vowels
end
-- Lookup table from a base letter followed by one combining diacritic to the
-- corresponding precomposed character. export.decompose() passes it to a
-- substitution as the replacement table, so that letters which should be
-- treated as single units are put back together after NFD decomposition.
local recomposer = {
-- Cyrillic letters
["е" .. DIA] = "ё",
["Е" .. DIA] = "Ё",
["и" .. BREVE] = "й",
["И" .. BREVE] = "Й",
["і" .. DIA] = "ї",
["І" .. DIA] = "Ї",
-- Latin letters
["c" .. CARON] = "č",
["C" .. CARON] = "Č",
["e" .. CARON] = "ě",
["E" .. CARON] = "Ě",
["o" .. CARON] = "ǒ",
["O" .. CARON] = "Ǒ",
["o" .. OGONEK] = "ǫ",
["O" .. OGONEK] = "Ǫ",
["s" .. CARON] = "š",
["S" .. CARON] = "Š",
["z" .. CARON] = "ž",
["Z" .. CARON] = "Ž",
-- the remaining entries are used in ru-pron:
["ж" .. BREVE] = "ӂ", -- used in ru-pron
["Ж" .. BREVE] = "Ӂ",
["j" .. CFLEX] = "ĵ",
["J" .. CFLEX] = "Ĵ",
["j" .. CARON] = "ǰ",
-- no composed uppercase equivalent of J-caron
["ʒ" .. CARON] = "ǯ",
["Ʒ" .. CARON] = "Ǯ",
}
-- Decompose acute, grave, etc. on letters (esp. Latin) into individual
-- character + combining accent. But recompose Cyrillic and Latin characters
-- that we want to treat as units and get caught in the crossfire. We mostly
-- want acute and grave decomposed; perhaps should just explicitly decompose
-- those and no others.
--
-- The text is first put into NFD; then every "character + diacritic" pair
-- that has an entry in `recomposer` is replaced by its precomposed form.
-- Pairs without an entry are left as they are (a table replacement that
-- yields nil keeps the original match). The diacritic set includes the
-- circumflex, since `recomposer` has entries for j/J + circumflex that could
-- otherwise never be matched.
function export.decompose(text)
	text = mw.ustring.toNFD(text)
	local recomposable = ".[" .. BREVE .. DIA .. CARON .. OGONEK .. CFLEX .. "]"
	text = rsub(text, recomposable, recomposer)
	return text
end
-- Raises an assertion error if `text` contains any of the listed precomposed
-- Latin vowels with acute, grave or diaeresis, i.e. if it has evidently not
-- been passed through export.decompose(). Returns nothing.
function export.assert_decomposed(text)
	local composed_pos = rfind(text, "[áéíóúýàèìòùỳäëïöüÿÁÉÍÓÚÝÀÈÌÒÙỲÄËÏÖÜŸ]")
	assert(not composed_pos)
end
-- Transliterate `text` with the ru-translit module and then apply
-- acute/grave decomposition to the result. The fourth argument passed to the
-- transliterator is the negation of `no_include_monosyllabic_jo_accent`.
function export.translit(text, no_include_monosyllabic_jo_accent)
	local include_monosyllabic_jo_accent = not no_include_monosyllabic_jo_accent
	local translit = m_ru_translit.tr(text, nil, nil, include_monosyllabic_jo_accent)
	return export.decompose(translit)
end
-- Recompose acutes and graves into the preceding vowels by normalizing the
-- text to NFC. Probably not necessary.
function export.recompose(text)
	local composed = mw.ustring.toNFC(text)
	return composed
end
-- Maps each precomposed Cyrillic vowel with grave accent to its base vowel
-- followed by the combining grave accent.
local grave_decomposer = {}
for composed, base in pairs({
	["ѐ"] = "е",
	["Ѐ"] = "Е",
	["ѝ"] = "и",
	["Ѝ"] = "И",
}) do
	grave_decomposer[composed] = base .. GR
end

-- Decompose precomposed Cyrillic chars w/grave accent. This is not needed
-- for the acute accent, as there are no precomposed Cyrillic chars w/acute
-- accent, and it is undesirable for precomposed ё and Ё, which are therefore
-- left alone.
function export.decompose_grave(word)
	local decomposed = rsub(word, "[ѐЀѝЍ]", grave_decomposer)
	return decomposed
end
-- Replacement table that strips grave accents: the combining grave maps to
-- the empty string and each precomposed grave-accented Cyrillic vowel maps to
-- its plain base vowel.
local grave_deaccenter = {
[GR] = "", -- grave accent
["ѐ"] = "е", -- composed Cyrillic chars w/grave accent
["Ѐ"] = "Е",
["ѝ"] = "и",
["Ѝ"] = "И",
}
-- Replacement table that strips both grave and acute accents: a copy of
-- grave_deaccenter with the combining acute added.
local deaccenter = mw.clone(grave_deaccenter)
deaccenter[AC] = "" -- acute accent
-- Remove acute and grave accents from a Cyrillic word and, if given, from
-- its transliteration. Neither the composed diaeresis in ёЁ nor an
-- uncomposed diaeresis (as on ѣ in some plural forms) is affected.
-- NOTE: Translit must already be decomposed! See comment at top.
-- Returns the de-accented word and the de-accented translit (nil if no
-- translit was passed).
function export.remove_accents(word, tr)
	local bare_word = rsub(word, "[́̀ѐЀѝЍ]", deaccenter)
	local bare_tr = nil
	if tr then
		bare_tr = rsub(tr, "[" .. AC .. GR .. "]", deaccenter)
	end
	return bare_word, bare_tr
end
-- Remove grave accents from a Cyrillic word and, if given, from its
-- transliteration. Acute accents, the composed diaeresis in ёЁ and an
-- uncomposed diaeresis (as on ѣ in some plural forms) are all left alone.
-- NOTE: Translit must already be decomposed! See comment at top.
-- Returns the resulting word and translit (nil if no translit was passed).
function export.remove_grave_accents(word, tr)
	local bare_word = rsub(word, "[̀ѐЀѝЍ]", grave_deaccenter)
	local bare_tr = nil
	if tr then
		bare_tr = rsub(tr, GR, "")
	end
	return bare_word, bare_tr
end
-- Remove acute and grave accents in monosyllabic words. The diaeresis
-- (composed or uncomposed) is kept because it indicates a change in vowel
-- quality, which still applies to monosyllabic words. Suffixes (words
-- starting with a hyphen) are returned unchanged, since a "monosyllabic"
-- stress is still significant there: it distinguishes a suffix stressed on
-- its own vowel from one that places the stress before itself.
-- NOTE: Translit must already be decomposed! See comment at top.
-- Returns word, tr (de-accented or untouched, as described above).
function export.remove_monosyllabic_accents(word, tr)
	if not export.is_monosyllabic(word) or rfind(word, "^%-") then
		return word, tr
	end
	return export.remove_accents(word, tr)
end