Module:XML

Documentation for this module may be created at Module:XML/doc
--[[

A pure-Lua XML parser for Wikisource, for the TEI XML formatter.

A simplified subset of XML is supported. The DTD can only declare custom entities and cannot reference an external file.

The returned Lua table may seem cluttered, with children, attributes and fundamental characteristics of a node
(tag, parent) mixed together. Separating them into different tables would make a file hit the MediaWiki Lua
memory limit about twice as fast.

--]]

local p = {}

-- Raises an error if `str` contains anything other than whitespace.
--
-- Callers pass in what is left of a fragment after the constructs they recognise have been
-- stripped out, so any remaining non-whitespace is content the parser does not understand.
--
-- Parameters:
--   str: the leftover text to inspect.
--   err_msg: the message to raise; the trimmed leftover text is appended to it on a new line.
local function assert_unprocessed(str, err_msg)
	if str:find('%S') then
		-- Trim surrounding whitespace so the offending text is easy to spot in the message
		local leftover = str:match('^%s*(.-)%s*$')
		error(err_msg..'\nUnprocessed: "'..leftover..'"')
	end
end

-- Substitutes every `&name;` reference in `str` with the value stored under `name` in the
-- `entities` table. References whose name has no entry in the table are left as they are.
-- Returns the resulting string followed by the number of references that were matched.
local function replace_entities(str, entities)
	local entity_reference = '&([%w_]+);'
	return string.gsub(str, entity_reference, entities)
end

-- Field names the parser itself stores on node tables. Attributes share the same table, so an
-- attribute with one of these names would silently overwrite the tree structure.
local reserved_node_fields = { _tag = true, _parent = true, _index = true, _content = true }

-- Parses attributes written in the key="value" or key='value' syntax and stores each one as a
-- string field on `node`.
--
-- Parameters:
--   attributes: the raw attribute text, for example ' type="poem" n=\'3\''.
--   node: the table that receives one field per attribute.
--   err_msg: the message raised if anything other than attributes and whitespace remains.
--
-- Raises an error if an attribute name collides with one of the parser's own node fields, or if
-- unrecognised non-whitespace text is left over once all attributes have been removed.
local function process_attributes(attributes, node, err_msg)
	-- The quote character is captured so the value must end with the same kind of quote
	local leftover = attributes:gsub('([%w:_%-]+)%s*=%s*(["\'])(.-)%2', function(attr_name, _, attr_value)
		if reserved_node_fields[attr_name] then
			error('Attribute name "'..attr_name..'" is reserved by the XML parser')
		end
		node[attr_name] = attr_value
		return ''
	end)
	assert_unprocessed(leftover, err_msg)
end

-- Parses a simplified subset of XML into a tree of Lua tables.
--
-- Parameters:
--   str: the XML source text.
--
-- Returns a table with two fields:
--   root: the root element. Every element is a table holding `_tag`, `_parent` and `_index`, its
--         attributes as string fields, and its children in the array part. Text runs are child
--         tables holding `_content`, `_parent` and `_index`. The `_parent` of the root element is
--         an otherwise empty "virtual root" table.
--   instructions: maps the name of each processing instruction to a table of its attributes. A
--         later instruction with the same name replaces an earlier one.
--
-- Raises an error when the input is malformed or uses syntax outside the supported subset.
function p.parse(str)

	-- Strip comments. Known issue: this also accepts comments in places where XML forbids them,
	-- such as
	--     < <!-- comment --> tag >
	str = str:gsub('<!%-%-.-%-%->', '')

	-- Collapse whitespace, to avoid it being parsed in weird ways as wikitext. We could add the option
	-- to preserve whitespace if there is convincing need, but I would argue against using whitespace
	-- semantically in any way, as far as TEI is concerned.
	str = str:gsub('%s+', ' ')

	-- Handle processing instructions: record each one's attributes under its name and remove it
	-- (together with the whitespace around it) from the text.
	local instructions = {}
	str = str:gsub('%s*<%?(.-)%?>%s*', function (instruction)
		local name = instruction:match('%S+')
		if not name then
			error('Invalid processing instruction')
		end
		instructions[name] = {}
		process_attributes(instruction:gsub('^%s*%S+', ''), instructions[name], 'Invalid processing instruction')
		return ''
	end)

	-- Collect entity declarations. Only a DOCTYPE with an internal subset ([ ... ]) is recognised.
	local root_tag, internal_subset, rest = str:match('^%s*<!DOCTYPE%s+(%S+)%s*%[(.-)%]>%s*(.*)$')
	if internal_subset then
		-- Parse entity definitions inside the DOCTYPE subset
		local entities = {}
		-- Check that no stray characters exist in subset after stripping the <!ENTITY ... > blocks
		assert_unprocessed(internal_subset:gsub('<!ENTITY%s+([_%w]+)%s+"(.-)"%s*>', function(name, value)
			-- An entity value may itself refer to entities declared before it
			entities[name] = replace_entities(value, entities)
			return ''
		end), 'Only <!ENTITY ... > is supported inside the DTD internal subset')
		-- Replace entities in everything that follows the DOCTYPE
		str = replace_entities(rest, entities)
	end

	local pos = 1

	-- Virtual root: a tagless container, so the real root element can be handled like any other
	local root = { }
	local node = root

	while true do

		local start, stop, start_slash, tag_name, attributes, end_slash = str:find('<%s*(/?)%s*([%w:_%-]+)(.-)(/?)%s*>', pos)

		if not start then
			break
		end

		-- Text before tag
		if pos < start then
			local text = str:sub(pos, start - 1)
			-- Whitespace directly under the virtual root (for instance what a stripped leading
			-- comment leaves behind) is not document content, so it is skipped. Any other text
			-- at that level is kept and reported by the single-root check below.
			if node ~= root or text:find('%S') then
				table.insert(node, {
					_content = text,
					_parent = node,
					_index = #node + 1,
				})
			end
		end

		if start_slash == '' then
			-- Opening tag
			local new_node = {
				_tag = tag_name,
				_parent = node,
				_index = #node + 1,
			}
			-- Process attributes
			process_attributes(attributes, new_node, 'Extraneous characters inside opening tag <'..tag_name..'>')
			table.insert(node, new_node)
			-- Go down the hierarchy unless self-closing
			if end_slash ~= '/' then
				node = new_node
			end
		else
			-- Closing tag
			if node == root then
				-- No element is open: the virtual root has no tag to compare against
				error('Unexpected closing tag </'..tag_name..'> without a matching opening tag')
			elseif node._tag ~= tag_name then
				error('Mismatched </'..tag_name..'>, expected </'..node._tag..'>')
			elseif end_slash ~= '' then
				error('Unexpected slash at the end of a closing tag </'..tag_name..'>')
			end
			assert_unprocessed(attributes, 'Extraneous characters inside closing tag </'..tag_name..'>')
			-- Go up the hierarchy
			node = node._parent
		end
		pos = stop + 1
	end

	-- Check if all tags were closed.
	if node ~= root then
		error('Expected closing tag </'..node._tag..'>')
	end

	-- Check that the document consists of exactly one root element. Whitespace at this level was
	-- never inserted, so any extra child is real content or a second element.
	if #root ~= 1 then
		error('All content should be contained in a single root element')
	end

	root = root[1]

	-- Check if there is any content after what has been processed.
	assert_unprocessed(str:sub(pos), 'Extraneous content after root')

	-- Check if the doctype contains the correct tag.
	if root_tag and (root._tag ~= root_tag) then
		error('DTD does not match root tag')
	end

	return {
		root = root,
		instructions = instructions,
	}

end

return p