Module:OsmPageTitleParser

From OpenStreetMap Wiki
Jump to navigation Jump to search
[Edit] [Purge] Documentation

This module parses a title object into a language, a key, and an optional tag value. It is designed to be used by other modules. See also the testcases code.

All tests passed.

test_keys_en
Text Expected Actual
☑Y Key:something {key="something",language="en"} {language="en",key="something"}
☑Y Key:some:thing {key="some:thing",language="en"} {language="en",key="some:thing"}
☑Y KEY::some:thing {key=":some:thing",language="en"} {language="en",key=":some:thing"}
☑Y Key:some:thing:aa {key="some:thing:aa",language="en"} {language="en",key="some:thing:aa"}
☑Y Key:some:thing:aa: {key="some:thing:aa:",language="en"} {language="en",key="some:thing:aa:"}
test_keys_international
Text Expected Actual
☑Y ko:key:some:thing {key="some:thing",language="ko"} {language="ko",key="some:thing"}
☑Y kO:key:some:thing:o {key="some:thing:o",language="ko"} {language="ko",key="some:thing:o"}
☑Y kO:key:so me:thing:o: {key="so me:thing:o:",language="ko"} {language="ko",key="so me:thing:o:"}
☑Y No:key:abc {key="abc",language="no"} {language="no",key="abc"}
test_keys_international_known_NS
Text Expected Actual
☑Y ru:Key:something {key="something",language="ru"} {language="ru",key="something"}
☑Y rU:KEY:some:thing:o {key="some:thing:o",language="ru"} {language="ru",key="some:thing:o"}
☑Y rU:keY:so me:thing:o: {key="so me:thing:o:",language="ru"} {language="ru",key="so me:thing:o:"}
test_langPrefix
Text Expected Actual
☑Y en
☑Y EN
☑Y fr FR: FR:
☑Y Fr FR: FR:
☑Y PT Pt: Pt:
☑Y
☑Y (nil)
test_language_pseudonamespaces
Text Expected Actual
☑Y bh:Main {language="bh",_parseFailed=true} {language="bh",_parseFailed=true}
☑Y ca-valencia:Main {language="ca-valencia",_parseFailed=true} {language="ca-valencia",_parseFailed=true}
☑Y gcf:Main {language="gcf",_parseFailed=true} {language="gcf",_parseFailed=true}
☑Y gsw:Main {language="gsw",_parseFailed=true} {language="gsw",_parseFailed=true}
☑Y kbp:Main {language="kbp",_parseFailed=true} {language="kbp",_parseFailed=true}
☑Y kfa:Main {language="kfa",_parseFailed=true} {language="kfa",_parseFailed=true}
☑Y mrw:Main {language="mrw",_parseFailed=true} {language="mrw",_parseFailed=true}
☑Y rcf:Main {language="rcf",_parseFailed=true} {language="rcf",_parseFailed=true}
☑Y sr-cyrl:Main {language="sr-cyrl",_parseFailed=true} {language="sr-cyrl",_parseFailed=true}
☑Y sr-latn:Main {language="sr-latn",_parseFailed=true} {language="sr-latn",_parseFailed=true}
☑Y sxu:Main {language="sxu",_parseFailed=true} {language="sxu",_parseFailed=true}
☑Y swg:Main {language="swg",_parseFailed=true} {language="swg",_parseFailed=true}
☑Y sxu:Main {language="sxu",_parseFailed=true} {language="sxu",_parseFailed=true}
☑Y trp:Main {language="trp",_parseFailed=true} {language="trp",_parseFailed=true}
☑Y tzm:Main {language="tzm",_parseFailed=true} {language="tzm",_parseFailed=true}
☑Y zgh:Main {language="zgh",_parseFailed=true} {language="zgh",_parseFailed=true}
☑Y Nds:Main {language="nds",_parseFailed=true} {language="nds",_parseFailed=true}
☑Y Gcf:Test {language="gcf",_parseFailed=true} {language="gcf",_parseFailed=true}
☑Y Zh-hant:Relation {language="zh-hant",_parseFailed=true} {language="zh-hant",_parseFailed=true}
☑Y Pt-br:Main {language="pt-br",_parseFailed=true} {language="pt-br",_parseFailed=true}
☑Y No:Main {language="no",_parseFailed=true} {language="no",_parseFailed=true}
test_non_language_pseudonamespaces
Text Expected Actual
☑Y POI:Scotiabank {language="en",_parseFailed=true} {language="en",_parseFailed=true}
☑Y Switzerland:Berne {language="en",_parseFailed=true} {language="en",_parseFailed=true}
test_splitKeyValue
Text Expected Actual
☑Y akey=avalue {k="akey",v="avalue"} {k="akey",v="avalue"}
☑Y akey {k="akey"} {k="akey"}
☑Y akey= {k="akey",v=""} {k="akey",v=""}
☑Y akey=ava=lue {k="akey",v="ava=lue"} {k="akey",v="ava=lue"}
☑Y akey==ava=lue= {k="akey",v="=ava=lue="} {k="akey",v="=ava=lue="}
☑Y {k=""} {k=""}
☑Y (nil) {} {}
test_tags_en
Text Expected Actual
☑Y Tag:something=abc {value="abc",key="something",language="en"} {value="abc",key="something",language="en"}
☑Y Tag:some:thing=abc:xyz {value="abc:xyz",key="some:thing",language="en"} {value="abc:xyz",key="some:thing",language="en"}
☑Y TAG::some:thing=aa=bb=c {value="aa=bb=c",key=":some:thing",language="en"} {value="aa=bb=c",key=":some:thing",language="en"}
☑Y taG:some:thing:aa bb=yy {value="yy",key="some:thing:aa bb",language="en"} {value="yy",key="some:thing:aa bb",language="en"}
☑Y Tag:some:thing:aa:=a b {value="a b",key="some:thing:aa:",language="en"} {value="a b",key="some:thing:aa:",language="en"}
test_tags_international
Text Expected Actual
☑Y ko:key:some:thing=abc x:yz {value="abc x:yz",key="some:thing",language="ko"} {value="abc x:yz",key="some:thing",language="ko"}
☑Y kO:key:some:thing:o=:a: {value=":a:",key="some:thing:o",language="ko"} {value=":a:",key="some:thing:o",language="ko"}
☑Y kO:key:so me:thing:o:=* {value="*",key="so me:thing:o:",language="ko"} {value="*",key="so me:thing:o:",language="ko"}
test_tags_international_known_NS
Text Expected Actual
☑Y ru:Key:something=abc {value="abc",key="something",language="ru"} {value="abc",key="something",language="ru"}
☑Y rU:KEY:some:thing:o=a=b:c {value="a=b:c",key="some:thing:o",language="ru"} {value="a=b:c",key="some:thing:o",language="ru"}
☑Y rU:keY:so me:thing:o:=== {value="==",key="so me:thing:o:",language="ru"} {value="==",key="so me:thing:o:",language="ru"}
☑Y RU:Moscow {language="ru",_parseFailed=true} {language="ru",_parseFailed=true}
test_talkpages
Text Expected Actual
☑Y Talk:Main {language="en",_parseFailed=true} {language="en",_parseFailed=true}
☑Y Talk:Pt:Creating an Account {language="pt",_parseFailed=true} {language="pt",_parseFailed=true}
☑Y JA talk:Bus routes in Kanagawa {language="ja",_parseFailed=true} {language="ja",_parseFailed=true}
☑Y Talk:POI:The Church of Jesus Christ of Latter-day Saints {language="en",_parseFailed=true} {language="en",_parseFailed=true}
test_unparsable_titles
Text Expected Actual
☑Y something {language="en",_parseFailed=true} {language="en",_parseFailed=true}
☑Y FR:something {language="fr",_parseFailed=true} {language="fr",_parseFailed=true}
☑Y ko:something {language="ko",_parseFailed=true} {language="ko",_parseFailed=true}
☑Y some:thing {language="en",_parseFailed=true} {language="en",_parseFailed=true}
☑Y FR:some:thing {language="fr",_parseFailed=true} {language="fr",_parseFailed=true}
☑Y KO:some:thing {language="ko",_parseFailed=true} {language="ko",_parseFailed=true}
☑Y {_parseFailed=true} {_parseFailed=true}

-- Table of functions exported by this module (returned at the end of the file).
local p = {}
-- Shared constants. This module reads data.nsToLangCodeMap (namespace number ->
-- language code) and data.customLangCodes (set of extra accepted language codes).
local data = mw.loadData('Module:OSM Constants')

-- Module_talk:OsmPageTitleParser/testcases  has many test cases showing how to use this module
-- Simple debugging:
--  =p.parseTitle(mw.title.new('Key:test')).key

-- Convenience wrapper around p.parseTitleToObj.
-- Parses the given title object into a fresh table and returns that table,
-- whether or not parsing succeeded. See p.parseTitleToObj for the fields.
function p.parseTitle(title)
  local parsed = {}
  p.parseTitleToObj(parsed, title)
  return parsed
end

-- Given a title object whose text has the form
--   (lang-code:)?(tag|key):(tagkey)(=tagvalue)?
-- tries to parse it into a language (object), a key, and an optional value.
-- We have to do it manually because Lua pattern support is not that great.
-- The actual tag and key prefixes are ignored (matched case-insensitively).
-- The value will be nil if there is no equal sign.
-- The result table always gets .language unless title is nil (the content
-- language is used when no language code is found), but .key and .value are
-- only set when the title was successfully parsed.
-- Params:
--   result - table that receives .language, .key and (optionally) .value
--   title  - title object; its .namespace and .text fields are read
-- Returns true if parsed, false otherwise. Parsing fails when there is no
-- tag/key prefix, when nothing follows the prefix, or when p.splitKeyValue
-- returns neither a key nor a value for the part after the prefix.
function p.parseTitleToObj(result, title)
  if not title then return false end

  local language, prefix, keyvalue

  -- If this is one of the known language namespaces, do not allow more
  -- language codes. Talk pages are treated as their corresponding main pages:
  -- subtracting ns % 2 maps an odd namespace number to the even one below it.
  local ns = title.namespace
  ns = ns - ns % 2
  local langCode = data.nsToLangCodeMap[ns]
  if langCode then
    language = mw.getLanguage(langCode)
  end

  for _, val in ipairs(mw.text.split(title.text, ':', true)) do
    if prefix then
      -- the prefix was already found: glue the remaining parts back together
      if keyvalue then
        keyvalue = keyvalue .. ':' .. val
      else
        keyvalue = val
      end
    else
      -- this could be the language code or the prefix (tag or key)
      local lval = string.lower(val)
      if lval == 'tag' or lval == 'key' then
        prefix = lval
      elseif not language and (data.customLangCodes[lval] or mw.language.isSupportedLanguage(lval)) then
        -- mw.getLanguage will create an object even if the language is not
        -- supported, hence the explicit check above
        language = mw.getLanguage(lval)
      else
        -- unrecognized, there was no tag or key as first or second part
        break
      end
    end
  end

  local tagkey, tagvalue
  if keyvalue then
    -- now split the keyvalue into key and (optional) value
    tagkey, tagvalue = p.splitKeyValue(keyvalue)
    if not tagkey and not tagvalue then
      -- nothing usable came back from the split: treat as unparsed
      keyvalue = nil
    end
  end

  -- the language is reported even when the rest of the title does not parse
  result.language = language or mw.language.getContentLanguage()

  if not keyvalue then
    return false
  end

  result.key = tagkey
  if tagvalue then result.value = tagvalue end
  return true
end

-- Given a key=value string, splits it at the first equal sign and returns
-- both parts as (key, value). Later equal signs stay inside the value.
--   no equal sign             -> key is the whole string, value is nil
--   equal sign after the key  -> key, value (value may be an empty string)
--   equal sign at position 1  -> nil, nil (there is no key to return)
--   keyvalue is nil           -> returns nothing
function p.splitKeyValue(keyvalue)
  if not keyvalue then return end

  local tagkey, tagvalue
  local eqlSignPos = mw.ustring.find(keyvalue, '=', 1, true)
  if not eqlSignPos then
    tagkey = keyvalue
  elseif eqlSignPos > 1 then
    tagkey = mw.ustring.sub(keyvalue, 1, eqlSignPos - 1)
    tagvalue = mw.ustring.sub(keyvalue, eqlSignPos + 1)
  end
  return tagkey, tagvalue
end

-- Given a language code, returns the proper title prefix:
--   empty string for nil, an empty code, or English ('en');
--   the code in all caps plus ':' when it is one of the values of
--   data.nsToLangCodeMap (i.e. the language has its own namespace);
--   the code with only its first letter capitalized plus ':' for the rest.
-- The code is lower-cased first, so the input case does not matter.
function p.langPrefix(langCode)
  -- after this line langCode is always a string, so no nil check is needed
  langCode = langCode and mw.ustring.lower(langCode) or ''
  if langCode == '' or langCode == 'en' then
    return ''
  end

  for _, nsLangCode in pairs(data.nsToLangCodeMap) do
    if nsLangCode == langCode then
      return string.upper(langCode) .. ':'
    end
  end

  return mw.getContentLanguage():ucfirst(langCode) .. ':'
end

-- Export the module table so that other modules can use these functions.
return p