-- Module:XML
-- Appearance
-- Documentation for this module may be created at Module:XML/doc
--[[
A pure-Lua XML parser for Wikisource, for the TEI XML formatter.
A simplified subset of XML is supported. The DTD can only declare custom entities and cannot reference an external file.
The returned Lua table may seem cluttered, with children, attributes and fundamental characteristics of a node
(tag, parent) mixed together. Separating them into different tables would make a file hit the MediaWiki Lua
memory limit about twice as fast.
--]]
-- Module table: the public functions are attached to it and it is returned at the end of the file.
local p = {}
-- Raises an error if `str` contains anything other than whitespace.
-- Used after a pattern has consumed all recognised syntax from a fragment: whatever is left over
-- must be blank, otherwise the input contained something the parser does not understand.
--   str     (string) leftover text to inspect
--   err_msg (string) description of the problem; the trimmed leftover text is appended to it
local function assert_unprocessed(str, err_msg)
	if str:find('%S') then
		-- Trim surrounding whitespace for the report. The parentheses keep only the string and
		-- drop gsub's substitution count.
		local leftover = (str:gsub('^%s*(.-)%s*$', '%1'))
		error(err_msg .. '\nUnprocessed: "' .. leftover .. '"')
	end
end
-- Replaces every `&name;` reference in `str` whose name is a key of `entities`.
-- References whose name has no entry in `entities` are left untouched, because gsub keeps the
-- original match when the table lookup yields nil.
--   str      (string) text that may contain entity references
--   entities (table)  maps entity names to their replacement text
-- Returns only the resulting string.
local function replace_entities(str, entities)
	-- The parentheses truncate gsub's (string, count) result to the string alone, so the count
	-- cannot leak into a caller's argument list or table constructor.
	return (str:gsub('&([%w_]+);', entities))
end
-- Parses a run of key="value" (or key='value') pairs and stores each pair as a field of `node`.
--   attributes (string) the raw attribute text, e.g. ' xml:id="a" n="1"'
--   node       (table)  table that receives one string field per attribute
--   err_msg    (string) message to raise when the text cannot be fully parsed
-- Raises an error if an attribute name is already set on `node`, or if anything other than
-- whitespace remains once all key="value" pairs have been consumed.
local function process_attributes(attributes, node, err_msg)
	local leftover = attributes:gsub('([%w:_%-]+)%s*=%s*(["\'])(.-)%2', function(attr_name, _, attr_value)
		-- Attributes share the node table with the parser's own fields, so a name that is already
		-- present is either a repeated attribute (not well-formed XML) or one that would overwrite
		-- a field set by the caller. Both are rejected instead of silently overwriting.
		if node[attr_name] ~= nil then
			error(err_msg .. '\nDuplicate or reserved attribute: "' .. attr_name .. '"')
		end
		node[attr_name] = attr_value
		-- Remove the consumed pair so that only unrecognised text is left over
		return ''
	end)
	assert_unprocessed(leftover, err_msg)
end
-- Parses an XML document held in the string `str`.
-- Returns a table with two fields:
--   root         the root element node
--   instructions a table mapping each processing instruction name to a table of its attributes
-- An element node is a table holding `_tag` (element name), `_parent` (the enclosing node),
-- `_index` (its position among the parent's children), one string field per attribute, and its
-- children in the array part. A text node holds `_content`, `_parent` and `_index`.
-- Raises an error when the input is not in the supported subset of XML.
function p.parse(str)
	-- Strip comments. Known issue is that this allows false positives such as
	-- < <!-- comment --> tag >
	-- which is not compliant to XML guidelines.
	str = str:gsub('<!%-%-.-%-%->', '')
	-- Collapse whitespace, to avoid it being parsed in weird ways as wikitext. We could add the option
	-- to preserve whitespace if there is convincing need, but I would argue against using whitespace
	-- semantically in any way, as far as TEI is concerned.
	str = str:gsub('%s+', ' ')

	-- Extract the processing instructions (<?name key="value" ...?>) and remove them from the text
	local instructions = {}
	str = str:gsub('%s*<%?(.-)%?>%s*', function (instruction)
		local name = instruction:match('%S+')
		if not name then
			error('Invalid processing instruction')
		end
		instructions[name] = {}
		-- Everything after the instruction name is parsed as attributes
		local pseudo_attributes = instruction:gsub('^%s*%S+', '')
		process_attributes(pseudo_attributes, instructions[name], 'Invalid processing instruction')
		return ''
	end)

	-- Collect entity declarations from a leading <!DOCTYPE name [ ... ]>
	local root_tag, internal_subset, rest = str:match('^%s*<!DOCTYPE%s+(%S+)%s*%[(.-)%]>%s*(.*)$')
	if internal_subset then
		local entities = {}
		local leftover = internal_subset:gsub('<!ENTITY%s+([_%w]+)%s+"(.-)"%s*>', function(name, value)
			-- A value may itself reference entities declared earlier in the subset
			entities[name] = replace_entities(value, entities)
			return ''
		end)
		-- Nothing but <!ENTITY ... > declarations may appear in the subset
		assert_unprocessed(leftover, 'Only <!ENTITY ... > is supported inside the DTD internal subset')
		-- Continue with the text after the DOCTYPE, with the declared entities expanded
		str = replace_entities(rest, entities)
	end

	local pos = 1
	-- Virtual root: a tagless container for whatever is found at document level
	local root = { }
	-- The element whose children are currently being collected
	local node = root
	while true do
		local start, stop, start_slash, tag_name, attributes, end_slash = str:find('<%s*(/?)%s*([%w:_%-]+)(.-)(/?)%s*>', pos)
		if not start then
			break
		end
		-- Text before tag
		if pos < start then
			local text = str:sub(pos, start - 1)
			-- Whitespace at document level (for instance what is left in front of the root element
			-- after a leading comment was stripped) carries no content, so it is not stored.
			-- Any other text at document level is kept and rejected by the single-root check below.
			if node ~= root or text:find('%S') then
				table.insert(node, {
					_content = text,
					_parent = node,
					_index = #node + 1,
				})
			end
		end
		if start_slash == '' then
			-- Opening tag
			local new_node = {
				_tag = tag_name,
				_parent = node,
				_index = #node + 1,
			}
			-- Process attributes
			process_attributes(attributes, new_node, 'Extraneous characters inside opening tag <'..tag_name..'>')
			table.insert(node, new_node)
			-- Go down the hierarchy unless self-closing
			if end_slash ~= '/' then
				node = new_node
			end
		else
			-- Closing tag
			if node == root then
				-- The virtual root has no `_tag`, so this case needs its own message
				error('Unexpected closing tag </'..tag_name..'> with no matching opening tag')
			elseif node._tag ~= tag_name then
				error('Mismatched </'..tag_name..'>, expected </'..node._tag..'>')
			elseif end_slash ~= '' then
				error('Unexpected slash at the end of a closing tag </'..tag_name..'>')
			end
			assert_unprocessed(attributes, 'Extraneous characters inside closing tag </'..tag_name..'>')
			-- Go up the hierarchy
			node = node._parent
		end
		pos = stop + 1
	end
	-- Check if all tags were closed.
	if node ~= root then
		error('Expected closing tag </'..node._tag..'>')
	end
	-- Check that the document level holds exactly one node, the root element.
	if #root ~= 1 then
		error('All content should be contained in a single root element')
	end
	root = root[1]
	-- Check if there is any content after what has been processed.
	assert_unprocessed(str:sub(pos), 'Extraneous content after root')
	-- Check if the doctype contains the correct tag.
	if root_tag and (root._tag ~= root_tag) then
		error('DTD does not match root tag')
	end
	return {
		root = root,
		instructions = instructions,
	}
end
-- Hand the module table (carrying `parse`) back to whoever loads this module.
return p