Module:string/decodeEntities
Appearance
- This module lacks a documentation subpage. Please create it.
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
local load_module = "Module:load"
local string_char_module = "Module:string/char"
local find = string.find
local gsub = string.gsub
local match = string.match
local require = require
local tonumber = tonumber
-- Lazy loader: the first call requires the module named by
-- `string_char_module`, rebinds `u` to whatever that module returns, and then
-- forwards the arguments to it. Later calls through `u` therefore reach the
-- loaded function directly, and the module is not required at all unless `u`
-- is actually called.
local function u(...)
	local char = require(string_char_module)
	u = char
	return char(...)
end
-- Cache for the entity lookup table; stays nil until first needed.
local entities

-- Loads "Module:data/entities" through the `load_data` function of the module
-- named by `load_module`, stores the result in `entities`, and returns it.
-- The loader then clears its own binding, since callers only reach it while
-- `entities` is still unset.
local function get_entities()
	local data = require(load_module).load_data("Module:data/entities")
	entities = data
	get_entities = nil
	return data
end
-- Replacement callback for gsub(): receives the three captures of one
-- "&...;" candidate and returns its decoded form, or nil so that gsub()
-- leaves the matched text untouched.
--   hash: "#" for a numeric reference, "" otherwise.
--   x:    "x" or "X" if that letter follows the optional "#", "" otherwise.
--   code: the remaining characters before ";".
local function decode_entity(hash, x, code)
	-- "#" can't be matched by "[%w\128-\255]", so an empty `hash` means this
	-- is either a named entity or a false match. A leading "x"/"X" was
	-- captured separately, so it is glued back on to rebuild the name.
	if hash == "" then
		local names = entities or get_entities()
		return names[x .. code]
	end

	-- Numeric reference: hexadecimal if "x"/"X" was present, otherwise
	-- decimal. Anything not made up purely of the expected digits is rejected.
	local cp
	if x ~= "" then
		if match(code, "^()%x+$") then
			cp = tonumber(code, 16)
		end
	elseif match(code, "^()%d+$") then
		cp = tonumber(code)
	end
	if not cp then
		return nil
	end

	-- Reject surrogates (U+D800 to U+DFFF) and anything above U+10FFFF.
	if cp > 0x10FFFF or (cp > 0xD7FF and cp < 0xE000) then
		return nil
	end

	-- `u` is presumably the codepoint-to-string converter from
	-- [[Module:string/char]] (verify there). The `or nil` keeps the result
	-- adjusted to a single value.
	return u(cp) or nil
end
-- Module entry point: returns `str` with every recognised "&...;" entity
-- replaced by its decoded form. Candidates that decode_entity() rejects are
-- left exactly as written.
return function(str)
	-- As an optimisation, the full gsub() pass only runs when plain searches
	-- show that an "&" exists with a ";" somewhere after it.
	local amp = find(str, "&", nil, true)
	if not amp then
		return str
	end
	if not find(str, ";", amp + 1, true) then
		return str
	end
	-- Non-ASCII bytes aren't valid in proper HTML named entities, but
	-- MediaWiki uses them in some nonstandard aliases (which are also listed
	-- in [[Module:data/entities]]), so the name class accepts them as well.
	-- The parentheses drop gsub()'s second result (the substitution count).
	return (gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity))
end