-- Module:string/decodeEntities
--
-- From Wiktionary, the free dictionary.

-- Names of the modules this file loads on demand. Each name is only handed to
-- require() from inside the helper that needs it, not at load time.
local load_module = "Module:load"
local string_char_module = "Module:string/char"

-- Local aliases for the standard-library functions and globals used below.
local find = string.find
local gsub = string.gsub
local match = string.match
local require = require
local tonumber = tonumber

-- Lazy loader for the function exported by the module named in
-- `string_char_module`. The first call requires that module, rebinds the local
-- `u` to the loaded function so later calls go straight to it, and forwards
-- the arguments.
local function u(...)
	local char = require(string_char_module)
	u = char
	return char(...)
end

-- Cache for the entity lookup table; stays nil until first needed.
local entities
-- Loads "Module:data/entities" through the loader module, stores the result in
-- `entities`, and then clears `get_entities` itself, since callers check
-- `entities` first and never need to load it twice.
local function get_entities()
	local data = require(load_module).load_data("Module:data/entities")
	entities = data
	get_entities = nil
	return data
end

-- Replacement callback for gsub(): receives the three captures of one
-- "&...;" match and returns the decoded text, or nil to leave the match as-is.
--   hash: "#" for a numeric reference, "" otherwise.
--   x:    "x" or "X" if present after the "&" / "&#", "" otherwise.
--   code: the remaining characters before the ";".
local function decode_entity(hash, x, code)
	-- "#" isn't part of "[%w\128-\255]", so an empty `hash` means this is
	-- either a named entity or a false match. `x` may have captured the first
	-- letter of the name, so it is glued back on before the lookup; unknown
	-- names simply yield nil.
	if hash == "" then
		local named = entities or get_entities()
		return named[x .. code]
	end
	-- Numeric reference: hexadecimal when an "x"/"X" was captured, decimal
	-- otherwise. Anything that isn't purely digits of that base is rejected.
	local cp
	if x ~= "" then
		if match(code, "^()%x+$") then
			cp = tonumber(code, 16)
		end
	elseif match(code, "^()%d+$") then
		cp = tonumber(code)
	end
	if not cp then
		return nil
	end
	-- Reject surrogates (U+D800 to U+DFFF) and anything above U+10FFFF.
	if cp > 0xD7FF and (cp < 0xE000 or cp > 0x10FFFF) then
		return nil
	end
	-- u() presumably returns the character for this codepoint (see
	-- [[Module:string/char]]); a falsy result is normalised to nil.
	return u(cp) or nil
end

-- Module export: takes a string and returns it with every recognised
-- "&name;", "&#NNN;" and "&#xHHH;" reference replaced by decode_entity()'s
-- result. Matches that decode_entity() rejects are left unchanged.
return function(str)
	-- As an optimisation, the pattern-based gsub() only runs if cheap plain
	-- searches show the string contains "&" followed later by ";".
	local amp = find(str, "&", nil, true)
	if not amp then
		return str
	end
	-- Only a ";" after the first "&" can close an entity.
	if not find(str, ";", amp + 1, true) then
		return str
	end
	-- Non-ASCII characters aren't valid in proper HTML named entities, but
	-- MediaWiki uses them in some nonstandard aliases (which have also been
	-- included in [[Module:data/entities]]), so the pattern accepts them.
	-- Assigning to a single local drops gsub()'s second result (the count).
	local decoded = gsub(str, "&(#?)([xX]?)([%w\128-\255]+);", decode_entity)
	return decoded
end