You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

882 lines
33 KiB

5 months ago
# SAX-like interfaces
class XMLSyntaxAssertionError(XMLSyntaxError, AssertionError):
    """
    An XMLSyntaxError that additionally inherits from AssertionError for
    ElementTree / backwards compatibility reasons.

    This class may get replaced by a plain XMLSyntaxError in a future version.
    """
    def __init__(self, message):
        # Only the message is supplied by the caller; the remaining three
        # XMLSyntaxError arguments are fixed to (None, 0, 1).
        # NOTE(review): presumably these are error code, line and column --
        # confirm against the XMLSyntaxError constructor.
        XMLSyntaxError.__init__(self, message, None, 0, 1)
# Bit flags describing which SAX callbacks a parser target wants to receive.
# They are combined into _SaxParserTarget._sax_event_filter and tested by
# _SaxParserContext._connectTarget() when installing libxml2 callbacks.
ctypedef enum _SaxParserEvents:
    SAX_EVENT_START = 1 << 0
    SAX_EVENT_END = 1 << 1
    SAX_EVENT_DATA = 1 << 2
    SAX_EVENT_DOCTYPE = 1 << 3
    SAX_EVENT_PI = 1 << 4
    SAX_EVENT_COMMENT = 1 << 5
    SAX_EVENT_START_NS = 1 << 6
    SAX_EVENT_END_NS = 1 << 7
# Bit flags selecting which parse events get collected into the events list.
# _buildParseEventFilter() maps the event names 'start', 'end', 'start-ns',
# 'end-ns', 'comment' and 'pi' onto these values.
# Note that the bit positions differ from those of _SaxParserEvents, so the
# two masks must not be mixed.
ctypedef enum _ParseEventFilter:
    PARSE_EVENT_FILTER_START = 1 << 0
    PARSE_EVENT_FILTER_END = 1 << 1
    PARSE_EVENT_FILTER_START_NS = 1 << 2
    PARSE_EVENT_FILTER_END_NS = 1 << 3
    PARSE_EVENT_FILTER_COMMENT = 1 << 4
    PARSE_EVENT_FILTER_PI = 1 << 5
cdef int _buildParseEventFilter(events) except -1:
    """Fold an iterable of event names into a PARSE_EVENT_FILTER_* bit mask.

    Recognised names are 'start', 'end', 'start-ns', 'end-ns', 'comment'
    and 'pi'.  Any other item raises ValueError.  An empty iterable
    yields 0.
    """
    cdef int mask = 0
    for event in events:
        if event == 'start':
            mask |= PARSE_EVENT_FILTER_START
        elif event == 'end':
            mask |= PARSE_EVENT_FILTER_END
        elif event == 'start-ns':
            mask |= PARSE_EVENT_FILTER_START_NS
        elif event == 'end-ns':
            mask |= PARSE_EVENT_FILTER_END_NS
        elif event == 'comment':
            mask |= PARSE_EVENT_FILTER_COMMENT
        elif event == 'pi':
            mask |= PARSE_EVENT_FILTER_PI
        else:
            raise ValueError(f"invalid event name '{event}'")
    return mask
cdef class _SaxParserTarget:
    """Base class for SAX parser targets.

    Every handler is a no-op default.  Subclasses override the handlers
    they need and advertise them by setting the matching SAX_EVENT_* bits
    in ``_sax_event_filter``, which starts out as 0 (no events).
    """
    cdef int _sax_event_filter

    def __cinit__(self):
        self._sax_event_filter = 0

    # Handlers that return an object: the default result is None.

    cdef _handleSaxStart(self, tag, attrib, nsmap):
        pass

    cdef _handleSaxEnd(self, tag):
        pass

    # Handlers that return a C int status: 0 on success, -1 signals an
    # exception.

    cdef int _handleSaxData(self, data) except -1:
        return 0

    cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
        return 0

    cdef _handleSaxPi(self, target, data):
        pass

    cdef _handleSaxComment(self, comment):
        pass

    cdef _handleSaxStartNs(self, prefix, uri):
        pass

    cdef _handleSaxEndNs(self, prefix):
        pass
#@cython.final
@cython.internal
@cython.no_gc_clear  # Required because parent class uses it - Cython bug.
cdef class _SaxParserContext(_ParserContext):
    """This class maps SAX2 events to parser target events.

    It operates in one of two modes, chosen in ``_initParserContext()``:

    * with a parser target (``_target`` is set): the libxml2 callbacks are
      replaced by handlers that call into the target, see
      ``_connectTarget()``;
    * without a target but with a non-zero ``_event_filter``: the original
      libxml2 callbacks are kept and wrapped so that parse events can be
      collected, see ``_connectEvents()``.
    """
    # parser target receiving the SAX events, or None
    cdef _SaxParserTarget _target
    # owning parser; reset to None in startDocument()
    cdef _BaseParser _parser
    # Saved libxml2 callbacks.  _connectEvents() stores the original handlers
    # here so that the wrappers can delegate to them; _connectTarget() sets
    # most of them to NULL.
    cdef xmlparser.startElementNsSAX2Func _origSaxStart
    cdef xmlparser.endElementNsSAX2Func _origSaxEnd
    cdef xmlparser.startElementSAXFunc _origSaxStartNoNs
    cdef xmlparser.endElementSAXFunc _origSaxEndNoNs
    cdef xmlparser.charactersSAXFunc _origSaxData
    # NOTE(review): not assigned anywhere in the code of this class
    cdef xmlparser.cdataBlockSAXFunc _origSaxCData
    cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype
    cdef xmlparser.commentSAXFunc _origSaxComment
    cdef xmlparser.processingInstructionSAXFunc _origSaxPI
    cdef xmlparser.startDocumentSAXFunc _origSaxStartDocument

    # for event collecting
    # bit mask of PARSE_EVENT_FILTER_* flags, set by _setEventFilter()
    cdef int _event_filter
    # one entry per open element: its declared [(prefix, uri)] list or None
    cdef list _ns_stack
    # elements for which an 'end' event is still outstanding
    cdef list _node_stack
    # holds the list that collected (event, object) tuples are appended to
    cdef _ParseEventsIterator events_iterator

    # for iterparse
    # root element, cached by pushEvent() once it is available
    cdef _Element _root
    # tag filter for start/end events, or None to match every element
    cdef _MultiTagMatcher _matcher

    def __cinit__(self, _BaseParser parser):
        self._ns_stack = []
        self._node_stack = []
        self._parser = parser
        self.events_iterator = _ParseEventsIterator()

    cdef void _setSaxParserTarget(self, _SaxParserTarget target) noexcept:
        # Must be called before _initParserContext() to have an effect on the
        # callback setup, since that is where the target is inspected.
        self._target = target

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        # Run the base class setup first, then install our own callbacks.
        # A target takes precedence over plain event collection; with neither
        # a target nor an event filter, the callbacks stay untouched.
        _ParserContext._initParserContext(self, c_ctxt)
        if self._target is not None:
            self._connectTarget(c_ctxt)
        elif self._event_filter:
            self._connectEvents(c_ctxt)

    cdef void _connectTarget(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Wrap original SAX2 callbacks to call into parser target.

        Each callback is first cleared (set to NULL, together with the saved
        "original" pointer) and then replaced by a target handler only if the
        target's ``_sax_event_filter`` requests the corresponding event.
        """
        sax = c_ctxt.sax
        self._origSaxStart = sax.startElementNs = NULL
        self._origSaxStartNoNs = sax.startElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_START |
                                             SAX_EVENT_START_NS |
                                             SAX_EVENT_END_NS):
            # intercept => overwrite orig callback
            # FIXME: also intercept on when collecting END events
            # The namespace-aware callback is only installed when the handler
            # struct carries the SAX2 magic marker.
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.startElementNs = _handleSaxTargetStart
            if self._target._sax_event_filter & SAX_EVENT_START:
                sax.startElement = _handleSaxTargetStartNoNs

        self._origSaxEnd = sax.endElementNs = NULL
        self._origSaxEndNoNs = sax.endElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_END |
                                             SAX_EVENT_END_NS):
            # The end handlers are shared with the event collecting mode;
            # they check for a target themselves.
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.endElementNs = _handleSaxEnd
            if self._target._sax_event_filter & SAX_EVENT_END:
                sax.endElement = _handleSaxEndNoNs

        # character data and CDATA blocks go through the same handler
        self._origSaxData = sax.characters = sax.cdataBlock = NULL
        if self._target._sax_event_filter & SAX_EVENT_DATA:
            sax.characters = sax.cdataBlock = _handleSaxData

        # doctype propagation is always required for entity replacement
        # (hence this callback is kept rather than cleared)
        self._origSaxDoctype = sax.internalSubset
        if self._target._sax_event_filter & SAX_EVENT_DOCTYPE:
            sax.internalSubset = _handleSaxTargetDoctype

        self._origSaxPI = sax.processingInstruction = NULL
        if self._target._sax_event_filter & SAX_EVENT_PI:
            sax.processingInstruction = _handleSaxTargetPI

        self._origSaxComment = sax.comment = NULL
        if self._target._sax_event_filter & SAX_EVENT_COMMENT:
            sax.comment = _handleSaxTargetComment

        # enforce entity replacement
        sax.reference = NULL
        c_ctxt.replaceEntities = 1

    cdef void _connectEvents(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Wrap original SAX2 callbacks to collect parse events without parser target.

        The original callbacks are always saved; a wrapper is only installed
        where the event filter (or HTML parsing) requires it.
        """
        sax = c_ctxt.sax
        # startDocument is always wrapped, see _handleSaxStartDocument()
        self._origSaxStartDocument = sax.startDocument
        sax.startDocument = _handleSaxStartDocument

        # only override "start" event handler if needed
        # ('end' events need it as well because the start handler is what
        # pushes the element onto the node stack)
        self._origSaxStart = sax.startElementNs
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_START_NS |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.startElementNs = <xmlparser.startElementNsSAX2Func>_handleSaxStart

        self._origSaxStartNoNs = sax.startElement
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END):
            sax.startElement = <xmlparser.startElementSAXFunc>_handleSaxStartNoNs

        # only override "end" event handler if needed
        self._origSaxEnd = sax.endElementNs
        if self._event_filter == 0 or \
               self._event_filter & (PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.endElementNs = <xmlparser.endElementNsSAX2Func>_handleSaxEnd

        self._origSaxEndNoNs = sax.endElement
        if self._event_filter == 0 or \
               self._event_filter & PARSE_EVENT_FILTER_END:
            sax.endElement = <xmlparser.endElementSAXFunc>_handleSaxEndNoNs

        self._origSaxComment = sax.comment
        if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
            sax.comment = <xmlparser.commentSAXFunc>_handleSaxComment

        self._origSaxPI = sax.processingInstruction
        if self._event_filter & PARSE_EVENT_FILTER_PI:
            sax.processingInstruction = <xmlparser.processingInstructionSAXFunc>_handleSaxPIEvent

    cdef _setEventFilter(self, events, tag):
        # Translate the event names into the bit mask and set up the optional
        # tag matcher.  No matcher is used when no events were requested or
        # when the tag selection is None or the wildcard '*'.
        self._event_filter = _buildParseEventFilter(events)
        if not self._event_filter or tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)

    cdef int startDocument(self, xmlDoc* c_doc) except -1:
        # Wrap the freshly created C document in a document proxy.
        # Returns 0 on success, -1 (with an exception set) on failure.
        try:
            self._doc = _documentFactory(c_doc, self._parser)
        finally:
            self._parser = None  # clear circular reference ASAP
        if self._matcher is not None:
            self._matcher.cacheTags(self._doc, True)  # force entry in libxml2 dict
        return 0

    cdef int pushEvent(self, event, xmlNode* c_node) except -1:
        # Append an (event, element proxy) tuple for c_node to the events
        # list.  Used for 'pi' and 'comment' events in event collecting mode.
        cdef _Element root
        if self._root is None:
            # remember the root element as soon as the document has one
            root = self._doc.getroot()
            if root is not None and root._c_node.type == tree.XML_ELEMENT_NODE:
                self._root = root
        node = _elementFactory(self._doc, c_node)
        self.events_iterator._events.append( (event, node) )
        return 0

    cdef int flushEvents(self) except -1:
        # Emit the outstanding 'end' events for all elements that are still
        # on the node stack, each followed by its namespace end events, then
        # drain whatever remains on the namespace stack.
        # NOTE(review): _pushSaxNsEndEvents() pops _ns_stack unconditionally
        # when 'end-ns' handling is active, so this relies on the namespace
        # stack holding at least as many entries as the node stack.
        events = self.events_iterator._events
        while self._node_stack:
            events.append( ('end', self._node_stack.pop()) )
            _pushSaxNsEndEvents(self)
        while self._ns_stack:
            _pushSaxNsEndEvents(self)

    cdef void _handleSaxException(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        # Called from the 'except' clause of the SAX callbacks: flag the
        # parser context as failed and keep the pending Python exception.
        if c_ctxt.errNo == xmlerror.XML_ERR_OK:
            # do not overwrite an error code that libxml2 already set
            c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR
        # stop parsing immediately
        c_ctxt.wellFormed = 0
        c_ctxt.disableSAX = 1
        c_ctxt.instate = xmlparser.XML_PARSER_EOF
        # NOTE(review): _store_raised() is not defined in this class --
        # presumably inherited and used to re-raise the exception after
        # parsing; confirm in the base class.
        self._store_raised()
@cython.final
@cython.internal
cdef class _ParseEventsIterator:
    """Iterator over a list of parse events that may keep growing.

    Exhaustion is not final: after StopIteration, iteration can resume
    once more events have been appended to ``_events``.
    """
    cdef list _events
    cdef int _event_index

    def __cinit__(self):
        self._event_index = 0
        self._events = []

    def __iter__(self):
        return self

    def __next__(self):
        cdef int pos = self._event_index
        pending = self._events
        # Compact the list once 1024 items were consumed or at least half of
        # the list is behind us, so that it cannot grow without bounds.
        if pos >= 1024 or 2 * pos >= len(pending):
            if pos != 0:
                del pending[:pos]
                pos = 0
                self._event_index = 0
            if len(pending) <= pos:
                raise StopIteration
        event = pending[pos]
        self._event_index = pos + 1
        return event
cdef list _build_prefix_uri_list(_SaxParserContext context, int c_nb_namespaces,
                                 const_xmlChar** c_namespaces):
    """Return the declared namespaces as a list of (prefix, uri) tuples.

    ``c_namespaces`` is read as ``c_nb_namespaces`` consecutive
    (prefix, uri) pointer pairs.  The prefix is converted with
    funicodeOrEmpty(), the URI with funicode().
    """
    cdef int i
    pairs = []
    for i in range(c_nb_namespaces):
        prefix = funicodeOrEmpty(c_namespaces[2 * i])
        uri = funicode(c_namespaces[2 * i + 1])
        pairs.append((prefix, uri))
    return pairs
cdef void _handleSaxStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) noexcept with gil:
    """Namespace-aware start-element callback for event collecting mode.

    Installed by _SaxParserContext._connectEvents().  The arguments are
    those that libxml2 passes to its namespace-aware start-element SAX2
    callback.  The handler

    1. queues 'start-ns' events for the namespaces declared on the element,
    2. delegates to the original libxml2 callback saved in
       ``context._origSaxStart``,
    3. records the declared namespaces for the later 'end-ns' events, and
    4. queues the 'start' event and/or remembers the node for its 'end'
       event via _pushSaxStartEvent().

    Nothing is done if the parser context has no private context object or
    if SAX processing is already disabled.  Exceptions never propagate into
    libxml2; they are passed to ``context._handleSaxException()``.
    """
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return
    context = <_SaxParserContext>c_ctxt._private
    cdef int event_filter = context._event_filter
    try:
        # The (prefix, uri) list is only built when namespace events were
        # requested and the element actually declares namespaces.
        if (c_nb_namespaces and
                event_filter & (PARSE_EVENT_FILTER_START_NS |
                                PARSE_EVENT_FILTER_END_NS)):
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)
            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
        else:
            declared_namespaces = None

        # Run the original callback before queueing the element events:
        # the code below reads the current node from c_ctxt.node.
        context._origSaxStart(c_ctxt, c_localname, c_prefix, c_namespace,
                              c_nb_namespaces, c_namespaces, c_nb_attributes,
                              c_nb_defaulted, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
            # The HTML parser in libxml2 reports the missing opening tags when it finds
            # misplaced ones, but with tag names from C string constants that ignore the
            # parser dict.  Thus, we need to intern the name ourselves.
            c_localname = tree.xmlDictLookup(c_ctxt.dict, c_localname, -1)
            if c_localname is NULL:
                raise MemoryError()

        # One stack entry per element (possibly None), so that the end
        # handler can pop exactly one entry per element.
        if event_filter & PARSE_EVENT_FILTER_END_NS:
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace, c_localname, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
# Namespace-aware start-element callback for parsing with a parser target.
# Installed by _SaxParserContext._connectTarget().  It reports namespace
# declarations and the element start to the target (as far as the target's
# _sax_event_filter asks for them) and additionally queues the parse events
# selected by the context's _event_filter.  The original libxml2 callback is
# not called here.  Exceptions are never propagated into libxml2; they are
# handed to context._handleSaxException().
cdef void _handleSaxTargetStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) noexcept with gil:
    cdef int i
    cdef size_t c_len
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if c_ctxt._private is NULL or c_ctxt.disableSAX:
        return

    context = <_SaxParserContext>c_ctxt._private

    # two independent masks: parse events to collect vs. target callbacks
    cdef int event_filter = context._event_filter
    cdef int sax_event_filter = context._target._sax_event_filter
    try:
        if c_nb_namespaces:
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)

            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))

            if sax_event_filter & SAX_EVENT_START_NS:
                for prefix, uri in declared_namespaces:
                    context._target._handleSaxStartNs(prefix, uri)
                #if not context._target._sax_event_filter & SAX_EVENT_START:
                #    # *Only* collecting start-ns events.
                #    return
        else:
            declared_namespaces = None

        if sax_event_filter & SAX_EVENT_START:
            if c_nb_defaulted > 0:
                # only add default attributes if we asked for them
                # (Python operator precedence: this tests
                #  "(loadsubset & XML_COMPLETE_ATTRS) == 0")
                if c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS == 0:
                    c_nb_attributes -= c_nb_defaulted
            if c_nb_attributes == 0:
                attrib = IMMUTABLE_EMPTY_MAPPING
            else:
                attrib = {}
                # Each attribute occupies five consecutive pointers.  The code
                # uses [0] and [2] to build the qualified name and [3]/[4] as
                # start and end of the value bytes.
                for i in xrange(c_nb_attributes):
                    name = _namespacedNameFromNsName(
                        c_attributes[2], c_attributes[0])
                    if c_attributes[3] is NULL:
                        value = ''
                    else:
                        # the value is delimited by an end pointer, so its
                        # length is the pointer difference
                        c_len = c_attributes[4] - c_attributes[3]
                        value = c_attributes[3][:c_len].decode('utf8')
                    attrib[name] = value
                    c_attributes += 5

            nsmap = dict(declared_namespaces) if c_nb_namespaces else IMMUTABLE_EMPTY_MAPPING

            element = _callTargetSaxStart(
                context, c_ctxt,
                _namespacedNameFromNsName(c_namespace, c_localname),
                attrib, nsmap)
        else:
            element = None

        # one stack entry per element (possibly None) whenever anybody is
        # interested in namespace end events
        if (event_filter & PARSE_EVENT_FILTER_END_NS or
                sax_event_filter & SAX_EVENT_END_NS):
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace,
                               c_localname, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
                              const_xmlChar** c_attributes) noexcept with gil:
    """Start-element callback without namespace support, event collecting mode.

    Delegates to the saved original callback and then queues the element
    events through _pushSaxStartEvent().  Exceptions are handed to the
    context instead of propagating into libxml2.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        # the original callback goes first, parser_ctxt.node is read below
        sax_context._origSaxStartNoNs(parser_ctxt, c_name, c_attributes)
        if parser_ctxt.html:
            _fixHtmlDictNodeNames(parser_ctxt.dict, parser_ctxt.node)
            # libxml2's HTML parser reports implied opening tags with names
            # taken from C string constants instead of the parser dict, so
            # the name has to be interned here.
            c_name = tree.xmlDictLookup(parser_ctxt.dict, c_name, -1)
            if c_name is NULL:
                raise MemoryError()
        if sax_context._event_filter & (PARSE_EVENT_FILTER_END |
                                        PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(sax_context, parser_ctxt, NULL, c_name, None)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
                                    const_xmlChar** c_attributes) noexcept with gil:
    """Start-element callback without namespace support, parser target mode.

    Converts the NULL-terminated (name, value) pointer array into a dict,
    reports the element to the target and queues the element events.
    """
    cdef Py_ssize_t pos = 0
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        if c_attributes is not NULL:
            attrib = {}
            # name at the even index, value right behind it
            while c_attributes[pos] is not NULL:
                name = funicode(c_attributes[pos])
                attrib[name] = funicodeOrEmpty(c_attributes[pos + 1])
                pos += 2
        else:
            attrib = IMMUTABLE_EMPTY_MAPPING
        element = _callTargetSaxStart(
            sax_context, parser_ctxt, funicode(c_name),
            attrib, IMMUTABLE_EMPTY_MAPPING)
        if sax_context._event_filter & (PARSE_EVENT_FILTER_END |
                                        PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(sax_context, parser_ctxt, NULL, c_name, element)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef _callTargetSaxStart(_SaxParserContext context,
                         xmlparser.xmlParserCtxt* c_ctxt,
                         tag, attrib, nsmap):
    """Report an element start to the parser target and return its result.

    If the target returns an _Element and the parser has a current input,
    the input's line number is stored on the element's C node, capped at
    65535.
    """
    result = context._target._handleSaxStart(tag, attrib, nsmap)
    if result is None or c_ctxt.input is NULL:
        return result
    if not isinstance(result, _Element):
        return result
    if c_ctxt.input.line < 65535:
        (<_Element>result)._c_node.line = <unsigned short>c_ctxt.input.line
    else:
        (<_Element>result)._c_node.line = 65535
    return result
cdef int _pushSaxStartEvent(_SaxParserContext context,
                            xmlparser.xmlParserCtxt* c_ctxt,
                            const_xmlChar* c_href,
                            const_xmlChar* c_name, node) except -1:
    """Queue a 'start' event and/or remember the node for its 'end' event.

    Elements rejected by the context's tag matcher are ignored.  Without a
    parser target, a missing ``node`` is created from ``c_ctxt.node``, and
    the node is pushed onto the node stack if 'end' events were requested.
    Returns 0, or -1 with an exception set.
    """
    cdef bint without_target
    if (context._matcher is not None and
            not context._matcher.matchesNsTag(c_href, c_name)):
        return 0

    without_target = context._target is None
    if without_target and node is None:
        assert context._doc is not None
        node = _elementFactory(context._doc, c_ctxt.node)
    if context._event_filter & PARSE_EVENT_FILTER_START:
        context.events_iterator._events.append(('start', node))
    if without_target and context._event_filter & PARSE_EVENT_FILTER_END:
        context._node_stack.append(node)
    return 0
cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
                        const_xmlChar* c_prefix,
                        const_xmlChar* c_namespace) noexcept with gil:
    """Namespace-aware end-element callback, shared by both modes.

    Without a target, the original libxml2 callback is run.  With a target,
    the element end is reported to it if it asked for end events.  After
    that, the 'end' event and the namespace end events are processed.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        node = None
        if sax_context._target is None:
            sax_context._origSaxEnd(
                parser_ctxt, c_localname, c_prefix, c_namespace)
        elif sax_context._target._sax_event_filter & SAX_EVENT_END:
            node = sax_context._target._handleSaxEnd(
                _namespacedNameFromNsName(c_namespace, c_localname))
        _pushSaxEndEvent(sax_context, c_namespace, c_localname, node)
        _pushSaxNsEndEvents(sax_context)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) noexcept with gil:
    """End-element callback without namespace support, shared by both modes.

    Reports the element end to the target if there is one, otherwise runs
    the original libxml2 callback, and then processes the 'end' event.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        if sax_context._target is None:
            sax_context._origSaxEndNoNs(parser_ctxt, c_name)
            node = None
        else:
            node = sax_context._target._handleSaxEnd(funicode(c_name))
        _pushSaxEndEvent(sax_context, NULL, c_name, node)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef int _pushSaxNsEndEvents(_SaxParserContext context) except -1:
    """Process the namespace end events of the innermost open element.

    Pops one entry off the context's namespace stack and, for each declared
    namespace in reverse order, calls the target's end-ns handler and/or
    queues an ('end-ns', None) event.  Does nothing, and leaves the stack
    alone, if neither was requested.  Returns 0, or -1 with an exception
    set.
    """
    cdef list namespaces
    cdef tuple prefix_uri
    cdef bint queue_events = context._event_filter & PARSE_EVENT_FILTER_END_NS
    cdef bint notify_target = (
        context._target is not None
        and context._target._sax_event_filter & SAX_EVENT_END_NS)
    if not (queue_events or notify_target):
        return 0

    namespaces = context._ns_stack.pop()
    if namespaces is not None:
        for prefix_uri in reversed(namespaces):
            if notify_target:
                context._target._handleSaxEndNs(prefix_uri[0])
            if queue_events:
                context.events_iterator._events.append(('end-ns', None))
    return 0
cdef int _pushSaxEndEvent(_SaxParserContext context,
                          const_xmlChar* c_href,
                          const_xmlChar* c_name, node) except -1:
    """Queue an 'end' event if 'end' events were requested and the tag matches.

    Without a parser target, the node is taken from the context's node
    stack instead of the ``node`` argument.  Returns 0, or -1 with an
    exception set.
    """
    if not (context._event_filter & PARSE_EVENT_FILTER_END):
        return 0
    if (context._matcher is not None and
            not context._matcher.matchesNsTag(c_href, c_name)):
        return 0
    if context._target is None:
        node = context._node_stack.pop()
    context.events_iterator._events.append(('end', node))
    return 0
cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) noexcept with gil:
    """Character data callback: pass the decoded text on to the target.

    Can only be called if parsing with a target.  The first ``data_len``
    bytes of ``c_data`` are decoded as UTF-8.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        text = c_data[:data_len].decode('utf8')
        sax_context._target._handleSaxData(text)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
                                  const_xmlChar* c_public,
                                  const_xmlChar* c_system) noexcept with gil:
    """Doctype callback: report name, public ID and system ID to the target.

    Can only be called if parsing with a target.  Each of the three values
    is converted with funicodeOrNone().
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        root_tag = funicodeOrNone(c_name)
        public_id = funicodeOrNone(c_public)
        system_id = funicodeOrNone(c_system)
        sax_context._target._handleSaxDoctype(root_tag, public_id, system_id)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxStartDocument(void* ctxt) noexcept with gil:
    """Start-document callback for event collecting mode.

    Runs the original libxml2 callback and then hands the parser's document
    to the context's startDocument() method.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    # the original callback runs first, myDoc is read afterwards
    sax_context._origSaxStartDocument(ctxt)
    try:
        sax_context.startDocument(parser_ctxt.myDoc)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
                             const_xmlChar* c_data) noexcept with gil:
    """Processing instruction callback for parser target mode.

    Can only be called if parsing with a target.  Reports the PI to the
    target and, if 'pi' events were requested, queues the target's return
    value as a 'pi' event.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        pi_target = funicodeOrNone(c_target)
        pi_data = funicodeOrEmpty(c_data)
        pi_node = sax_context._target._handleSaxPi(pi_target, pi_data)
        if sax_context._event_filter & PARSE_EVENT_FILTER_PI:
            sax_context.events_iterator._events.append(('pi', pi_node))
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
                            const_xmlChar* data) noexcept with gil:
    """Processing instruction callback for event collecting mode.

    Can only be called when collecting pi events.  Runs the original
    libxml2 callback, looks up the node it created and queues a 'pi' event
    for it.  No event is queued if no node is found.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    sax_context._origSaxPI(ctxt, target, data)
    c_pi_node = _findLastEventNode(parser_ctxt)
    if c_pi_node is NULL:
        return
    try:
        sax_context.pushEvent('pi', c_pi_node)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) noexcept with gil:
    """Comment callback for parser target mode.

    Can only be called if parsing with a target.  Reports the comment text
    to the target and, if 'comment' events were requested, queues the
    target's return value as a 'comment' event.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    try:
        text = funicodeOrEmpty(c_data)
        comment_node = sax_context._target._handleSaxComment(text)
        if sax_context._event_filter & PARSE_EVENT_FILTER_COMMENT:
            sax_context.events_iterator._events.append(('comment', comment_node))
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) noexcept with gil:
    """Comment callback for event collecting mode.

    Can only be called when collecting comment events.  Runs the original
    libxml2 callback, looks up the node it created and queues a 'comment'
    event for it.  No event is queued if no node is found.
    """
    parser_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    if parser_ctxt._private is NULL or parser_ctxt.disableSAX:
        return
    sax_context = <_SaxParserContext>parser_ctxt._private
    sax_context._origSaxComment(ctxt, text)
    c_comment_node = _findLastEventNode(parser_ctxt)
    if c_comment_node is NULL:
        return
    try:
        sax_context.pushEvent('comment', c_comment_node)
    except:
        sax_context._handleSaxException(parser_ctxt)
    finally:
        return  # swallow any further exceptions
cdef inline xmlNode* _findLastEventNode(xmlparser.xmlParserCtxt* c_ctxt):
    # Locate the node that libxml2 created last for a comment or PI.
    # This mimics where libxml2 itself puts such nodes, depending on the
    # current parser position.
    if c_ctxt.inSubset == 1:
        # inside the internal DTD subset
        return c_ctxt.myDoc.intSubset.last
    if c_ctxt.inSubset == 2:
        # inside the external DTD subset
        return c_ctxt.myDoc.extSubset.last
    if c_ctxt.node is NULL:
        # no current node: take the last child of the document
        return c_ctxt.myDoc.last
    if c_ctxt.node.type == tree.XML_ELEMENT_NODE:
        return c_ctxt.node.last
    return c_ctxt.node.next
############################################################
## ET compatible XML tree builder
############################################################
cdef class TreeBuilder(_SaxParserTarget):
    """TreeBuilder(self, element_factory=None, parser=None,
                    comment_factory=None, pi_factory=None,
                    insert_comments=True, insert_pis=True)

    Parser target that builds a tree from parse event callbacks.

    The factory arguments can be used to influence the creation of
    elements, comments and processing instructions.

    By default, comments and processing instructions are inserted into
    the tree, but they can be ignored by passing the respective flags.

    The final tree is returned by the ``close()`` method.
    """
    cdef _BaseParser _parser
    cdef object _factory
    cdef object _comment_factory
    cdef object _pi_factory
    cdef list _data
    cdef list _element_stack
    cdef object _element_stack_pop
    cdef _Element _last  # may be None
    cdef bint _in_tail
    cdef bint _insert_comments
    cdef bint _insert_pis

    def __init__(self, *, element_factory=None, parser=None,
                 comment_factory=None, pi_factory=None,
                 bint insert_comments=True, bint insert_pis=True):
        # events that the parser should report to this target
        self._sax_event_filter = \
            SAX_EVENT_START | SAX_EVENT_END | SAX_EVENT_DATA | \
            SAX_EVENT_PI | SAX_EVENT_COMMENT
        self._data = []  # data collector
        self._element_stack = []  # element stack
        self._element_stack_pop = self._element_stack.pop
        self._last = None  # last element
        self._in_tail = 0  # true if we're after an end tag
        self._factory = element_factory
        self._comment_factory = comment_factory if comment_factory is not None else Comment
        self._pi_factory = pi_factory if pi_factory is not None else ProcessingInstruction
        self._insert_comments = insert_comments
        self._insert_pis = insert_pis
        self._parser = parser

    @cython.final
    cdef int _flush(self) except -1:
        # Attach the collected character data to the last node: as its tail
        # if we are behind its end tag, as its text otherwise.  Data that
        # arrives before any node was seen is dropped.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._in_tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            del self._data[:]
        return 0

    # internal SAX event handlers

    @cython.final
    cdef _handleSaxStart(self, tag, attrib, nsmap):
        # Create the new element (through the user factory if there is one),
        # make it a child of the currently open element and open it.
        self._flush()
        if self._factory is not None:
            # note that the user factory does not receive the nsmap
            self._last = self._factory(tag, attrib)
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
        elif self._element_stack:
            self._last = _makeSubElement(
                self._element_stack[-1], tag, None, None, attrib, nsmap, None)
        else:
            self._last = _makeElement(
                tag, NULL, None, self._parser, None, None, attrib, nsmap, None)
        self._element_stack.append(self._last)
        self._in_tail = 0
        return self._last

    @cython.final
    cdef _handleSaxEnd(self, tag):
        # Close the innermost open element.  The tag is not checked here,
        # see end() for the Python level check.
        self._flush()
        self._last = self._element_stack_pop()
        self._in_tail = 1
        return self._last

    @cython.final
    cdef int _handleSaxData(self, data) except -1:
        # only collect the data, it gets attached to a node in _flush()
        self._data.append(data)
        return 0

    @cython.final
    cdef _handleSaxPi(self, target, data):
        # Create a processing instruction, insert it into the tree unless
        # insert_pis is disabled, and return it.
        elem = self._pi_factory(target, data)
        if self._insert_pis:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            self._in_tail = 1
        # Always return the PI that was just created.  Returning self._last
        # would hand out the previous node (or None) when insert_pis is off.
        return elem

    @cython.final
    cdef _handleSaxComment(self, comment):
        # Create a comment, insert it into the tree unless insert_comments
        # is disabled, and return it.
        elem = self._comment_factory(comment)
        if self._insert_comments:
            self._flush()
            self._last = elem
            if self._element_stack:
                _appendChild(self._element_stack[-1], self._last)
            self._in_tail = 1
        return elem

    # Python level event handlers

    def close(self):
        """close(self)

        Flushes the builder buffers, and returns the toplevel document
        element.  Raises XMLSyntaxError on inconsistencies.
        """
        if self._element_stack:
            raise XMLSyntaxAssertionError("missing end tags")
        # TODO: this does not necessarily seem like an error case.  Why not just return None?
        if self._last is None:
            raise XMLSyntaxAssertionError("missing toplevel element")
        return self._last

    def data(self, data):
        """data(self, data)

        Adds text to the current element.  The value should be either an
        8-bit string containing ASCII text, or a Unicode string.
        """
        self._handleSaxData(data)

    def start(self, tag, attrs, nsmap=None):
        """start(self, tag, attrs, nsmap=None)

        Opens a new element.
        """
        if nsmap is None:
            nsmap = IMMUTABLE_EMPTY_MAPPING
        return self._handleSaxStart(tag, attrs, nsmap)

    def end(self, tag):
        """end(self, tag)

        Closes the current element.
        """
        element = self._handleSaxEnd(tag)
        # the element has already been closed when the tag gets checked
        assert self._last.tag == tag,\
            f"end tag mismatch (expected {self._last.tag}, got {tag})"
        return element

    def pi(self, target, data=None):
        """pi(self, target, data=None)

        Creates a processing instruction using the factory, appends it
        (unless disabled) and returns it.
        """
        return self._handleSaxPi(target, data)

    def comment(self, comment):
        """comment(self, comment)

        Creates a comment using the factory, appends it (unless disabled)
        and returns it.
        """
        return self._handleSaxComment(comment)

Powered by BW's shoe-string budget.