You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1995 lines
80 KiB
1995 lines
80 KiB
# Parsers for XML and HTML
|
|
|
|
from lxml.includes cimport xmlparser
|
|
from lxml.includes cimport htmlparser
|
|
|
|
|
|
class ParseError(LxmlSyntaxError):
    """Raised when a document cannot be parsed as XML.

    Provided for compatibility with ElementTree 1.3 and later.

    The location is kept in ``lineno`` and ``offset`` (the column minus
    one); ``position`` exposes it as a ``(line, column)`` pair.
    """
    def __init__(self, message, code, line, column, filename=None):
        super(_ParseError, self).__init__(message)
        # Compute the offset before touching any attribute so that a bad
        # 'column' value leaves the instance untouched.
        offset = column - 1
        self.lineno = line
        self.offset = offset
        self.code = code
        self.filename = filename

    @property
    def position(self):
        # 'offset' is stored as column - 1, so add the 1 back.
        line, column = self.lineno, self.offset + 1
        return line, column

    @position.setter
    def position(self, new_pos):
        line, column = new_pos
        self.lineno = line
        self.offset = column - 1

# Module-private alias, used by ParseError.__init__() in its super() call.
cdef object _ParseError = ParseError
|
|
|
|
|
|
class XMLSyntaxError(ParseError):
    """Reports a syntax error found while parsing an XML document.
    """
|
|
|
|
cdef class ParserError(LxmlError):
    """Signals an error inside the lxml parser machinery itself.
    """
|
|
|
|
|
|
@cython.final
@cython.internal
cdef class _ParserDictionaryContext:
    # Shares the libxml2 string dictionary between parsers.
    #
    # The class acts as a delegating singleton: the module-level instance
    # __GLOBAL_PARSER_CONTEXT (created right below the class) is the only
    # object that other code may call.  Internally, one additional
    # _ParserDictionaryContext is created per thread and stored in that
    # thread's state dict to hold the per-thread data; those per-thread
    # objects must never be used directly.
    #

    cdef tree.xmlDict* _c_dict
    cdef _BaseParser _default_parser
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._implied_parser_contexts = []
        self._c_dict = NULL

    def __dealloc__(self):
        if self._c_dict is NULL:
            return
        xmlparser.xmlDictFree(self._c_dict)

    cdef int initMainParserContext(self) except -1:
        """Register this (global) context in the thread state dict of the
        main thread.  Must be called exactly once, from the main thread."""
        c_state = python.PyThreadState_GetDict()
        if c_state is NULL:
            return 0
        (<dict>c_state)["_ParserDictionaryContext"] = self
        return 0

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        "Look up the current thread's _ParserDictionaryContext, creating it on first use"
        cdef _ParserDictionaryContext fresh
        c_state = python.PyThreadState_GetDict()
        if c_state is NULL:
            # no thread state dict available => fall back to ourselves
            return self
        state = <dict>c_state
        c_entry = python.PyDict_GetItem(state, "_ParserDictionaryContext")
        if c_entry is NULL:
            fresh = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
            state["_ParserDictionaryContext"] = fresh
            return fresh
        return <object>c_entry

    cdef int setDefaultParser(self, _BaseParser parser) except -1:
        "Install 'parser' as the default parser of the current thread"
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        thread_context._default_parser = parser
        return 0

    cdef _BaseParser getDefaultParser(self):
        "Fetch the current thread's default parser, creating it if needed"
        cdef _ParserDictionaryContext thread_context
        thread_context = self._findThreadParserContext()
        if thread_context._default_parser is not None:
            return thread_context._default_parser
        if self._default_parser is None:
            # first request ever => derive our own default from the XML default
            self._default_parser = __DEFAULT_XML_PARSER._copy()
        if thread_context is not self:
            # other threads get their own copy of our default parser
            thread_context._default_parser = self._default_parser._copy()
        return thread_context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        "Fetch the current thread's dict, setting one up if there is none yet."
        cdef _ParserDictionaryContext thread_context
        thread_context = self._findThreadParserContext()
        if thread_context._c_dict is not NULL:
            return thread_context._c_dict

        # no dict for this thread yet => adopt the default or create one
        if default is not NULL:
            thread_context._c_dict = default
            xmlparser.xmlDictReference(default)
            return default
        if self._c_dict is NULL:
            self._c_dict = xmlparser.xmlDictCreate()
        if thread_context is not self:
            thread_context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return thread_context._c_dict

    cdef int initThreadDictRef(self, tree.xmlDict** c_dict_ref) except -1:
        # Makes '*c_dict_ref' point to the thread dict, dropping the reference
        # to a previously stored different dict and taking one on the new dict.
        c_current = c_dict_ref[0]
        c_wanted = self._getThreadDict(c_current)
        if c_current is c_wanted:
            return 0
        if c_current is not NULL:
            xmlparser.xmlDictFree(c_current)
        c_dict_ref[0] = c_wanted
        xmlparser.xmlDictReference(c_wanted)
        return 0

    cdef int initParserDict(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        "Make the parser context use the shared string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        pctxt.dictNames = 1
        return 0

    cdef int initXPathParserDict(self, xpath.xmlXPathContext* pctxt) except -1:
        "Make the XPath context use the shared string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        return 0

    cdef int initDocDict(self, xmlDoc* result) except -1:
        "Make the document use the shared string dictionary."
        # XXX A dict that the document already had gets freed here.  That is
        # only expected for new documents whose dict is still empty; freeing
        # a dict whose strings are in use would segfault.
        self.initThreadDictRef(&result.dict)
        return 0

    cdef _ParserContext findImpliedContext(self):
        """Return the innermost implied parser context of the current
        thread, or None.  Resolver functions need this when libxml2 hands
        them an xmlParserCtxt that it created internally (i.e. one without
        a _ParserContext), as happens for external references in schemas
        and XIncludes."""
        cdef _ParserDictionaryContext thread_context
        cdef _ParserContext innermost

        thread_context = self._findThreadParserContext()
        if not thread_context._implied_parser_contexts:
            return None
        innermost = thread_context._implied_parser_contexts[-1]
        return innermost

    cdef int pushImpliedContextFromParser(self, _BaseParser parser) except -1:
        "Push the parser's context (or None if there is no parser) as implied context."
        cdef _ParserContext parser_context = None
        if parser is not None:
            parser_context = parser._getParserContext()
        self.pushImpliedContext(parser_context)
        return 0

    cdef int pushImpliedContext(self, _ParserContext parser_context) except -1:
        "Push an implied context object onto the current thread's stack."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        thread_context._implied_parser_contexts.append(parser_context)
        return 0

    cdef int popImpliedContext(self) except -1:
        "Remove the innermost implied context object of the current thread."
        cdef _ParserDictionaryContext thread_context = self._findThreadParserContext()
        thread_context._implied_parser_contexts.pop()
        return 0

# The one global instance that all other code goes through.
cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
|
|
|
|
############################################################
|
|
## support for Python unicode I/O
|
|
############################################################
|
|
|
|
# libxml2's name for the encoding of Python's Py_UNICODE buffers
# (stays NULL if no matching encoding handler is found)
cdef const_char* _PY_UNICODE_ENCODING = NULL

cdef int _setupPythonUnicode() except -1:
    """Determine the libxml2 name of the in-memory encoding of Python
    unicode strings and store it in _PY_UNICODE_ENCODING.

    Whether libxml2 can read native Python unicode depends on iconv and on
    the local Python build, so this simply probes for an encoding handler
    and leaves _PY_UNICODE_ENCODING untouched if none is available.
    """
    cdef tree.xmlCharEncodingHandler* c_handler
    cdef const_char* c_name
    cdef Py_UNICODE *probe = [c'<', c't', c'e', c's', c't', c'/', c'>']
    cdef const_xmlChar* raw = <const_xmlChar*>probe
    # libxml2 apparently fails to detect UTF-16 on some systems,
    # so look at the byte pattern ourselves first
    if (raw[0] == c'<' and raw[1] == c'\0' and
            raw[2] == c't' and raw[3] == c'\0'):
        c_name = "UTF-16LE"
    elif (raw[0] == c'\0' and raw[1] == c'<' and
            raw[2] == c'\0' and raw[3] == c't'):
        c_name = "UTF-16BE"
    else:
        # not UTF-16 => ask libxml2
        c_name = _findEncodingName(raw, sizeof(Py_UNICODE) * 7)
        if c_name is NULL:
            # nothing we can do on this system
            return 0
    c_handler = tree.xmlFindCharEncodingHandler(c_name)
    if c_handler is NULL:
        return 0
    global _PY_UNICODE_ENCODING
    tree.xmlCharEncCloseFunc(c_handler)
    _PY_UNICODE_ENCODING = c_name
    return 0
|
|
|
|
cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    "libxml2 bug workaround: map the detected encoding to its iconv name ourselves."
    cdef tree.xmlCharEncoding detected
    cdef bint has_utf32le_bom
    detected = tree.xmlDetectCharEncoding(buffer, size)
    if detected == tree.XML_CHAR_ENCODING_NONE:
        return NULL
    if detected == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    if detected == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    if detected == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"
    if detected == tree.XML_CHAR_ENCODING_UTF16LE:
        # FF FE 00 00 is reported as UTF-16LE, but is the UTF-32LE BOM
        has_utf32le_bom = size >= 4 and (buffer[0] == <const_xmlChar> b'\xFF' and
                                         buffer[1] == <const_xmlChar> b'\xFE' and
                                         buffer[2] == 0 and buffer[3] == 0)
        if has_utf32le_bom:
            return "UTF-32LE"  # according to BOM
        return "UTF-16LE"
    # the returned name is a constant char* that must not be freed
    return tree.xmlGetCharEncodingName(detected)
|
|
|
|
# Python 3.12 removed support for "Py_UNICODE", so the Py_UNICODE encoding
# probe is only run when compiled for an older Python version.
if python.PY_VERSION_HEX < 0x030C0000:
    _setupPythonUnicode()
|
|
|
|
|
|
cdef unicode _find_PyUCS4EncodingName():
    """
    Determine the libxml2 encoding name to use for Py_UCS4 PyUnicode strings.

    Returns the name with an explicit 'LE'/'BE' suffix, or None if libxml2
    does not report a name for the detected encoding.
    """
    # The non-BMP character in the sample string forces the 4-byte layout.
    sample = "<xml>\U0001F92A</xml>"
    cdef const xmlChar* c_data = <const xmlChar*> python.PyUnicode_DATA(sample)
    cdef Py_ssize_t c_length = python.PyUnicode_GET_LENGTH(sample)

    cdef tree.xmlCharEncoding c_enc = tree.xmlDetectCharEncoding(c_data, c_length)
    name = ''
    c_handler = tree.xmlGetCharEncodingHandler(c_enc)
    if c_handler is NULL:
        # no handler => fall back to the plain encoding name
        c_plain_name = tree.xmlGetCharEncodingName(c_enc)
        if c_plain_name:
            name = c_plain_name.decode('UTF-8')
    else:
        try:
            if c_handler.name:
                name = c_handler.name.decode('UTF-8')
        finally:
            tree.xmlCharEncCloseFunc(c_handler)

    # make the byte order explicit if the name does not carry it already
    if name and not name.endswith(('LE', 'BE')):
        if python.PY_BIG_ENDIAN:
            name += 'BE'
        else:
            name += 'LE'
    return name or None

_pyucs4_encoding_name = _find_PyUCS4EncodingName()
|
|
|
|
|
|
############################################################
|
|
## support for file-like objects
|
|
############################################################
|
|
|
|
@cython.final
@cython.internal
cdef class _FileReaderContext:
    # Feeds libxml2 with data read from a Python file-like object.
    cdef object _filelike
    cdef object _encoding
    cdef object _url
    cdef object _bytes                     # chunk returned by the last read()
    cdef _ExceptionContext _exc_context    # collects exceptions raised in callbacks
    cdef Py_ssize_t _bytes_read            # consumed part of _bytes, -1 after EOF
    cdef char* _c_url                      # points into _url
    cdef bint _close_file_after_read

    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
        self._filelike = filelike
        self._exc_context = exc_context
        self._encoding = encoding
        self._close_file_after_read = close_file
        if url is not None:
            url = _encodeFilename(url)
            self._c_url = _cstr(url)
        else:
            self._c_url = NULL
        # keep the (encoded) URL object referenced while _c_url is in use
        self._url = url
        self._bytes_read = 0
        self._bytes = b''

    cdef _close_file(self):
        # Closes the file-like object if closing was requested and drops
        # the reference to it, even if looking up .close() fails.
        if not self._close_file_after_read or self._filelike is None:
            return
        try:
            closer = getattr(self._filelike, 'close', None)
        finally:
            self._filelike = None
        if closer is not None:
            closer()

    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self) noexcept:
        # Returns a new libxml2 input buffer that reads through this object,
        # or NULL if the allocation failed.
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer
        c_input_buffer = xmlparser.xmlAllocParserInputBuffer(0)
        if c_input_buffer is not NULL:
            c_input_buffer.context = <python.PyObject*> self
            c_input_buffer.readcallback = _readFilelikeParser
        return c_input_buffer

    cdef xmlparser.xmlParserInput* _createParserInput(
            self, xmlparser.xmlParserCtxt* ctxt) noexcept:
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer
        c_input_buffer = self._createParserInputBuffer()
        if c_input_buffer is NULL:
            return NULL
        return xmlparser.xmlNewIOInputStream(ctxt, c_input_buffer, 0)

    cdef tree.xmlDtd* _readDtd(self) noexcept:
        cdef tree.xmlDtd* c_dtd
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer
        c_input_buffer = self._createParserInputBuffer()
        if c_input_buffer is NULL:
            return NULL
        with nogil:
            c_dtd = xmlparser.xmlIOParseDTD(NULL, c_input_buffer, 0)
        return c_dtd

    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options) noexcept:
        # Parses a complete document from the file-like object.  Exceptions
        # raised while closing the file are stored in the exception context
        # instead of being propagated.
        cdef xmlDoc* c_doc
        cdef void* c_reader = <python.PyObject*> self
        cdef char* c_url = self._c_url
        cdef char* c_encoding = NULL
        if self._encoding is not None:
            c_encoding = _cstr(self._encoding)

        saved_options = ctxt.options
        with nogil:
            if not ctxt.html:
                c_doc = xmlparser.xmlCtxtReadIO(
                    ctxt, _readFilelikeParser, NULL, c_reader,
                    c_url, c_encoding, options)
            else:
                c_doc = htmlparser.htmlCtxtReadIO(
                    ctxt, _readFilelikeParser, NULL, c_reader,
                    c_url, c_encoding, options)
                if c_doc is not NULL and _fixHtmlDictNames(ctxt.dict, c_doc) < 0:
                    tree.xmlFreeDoc(c_doc)
                    c_doc = NULL
        ctxt.options = saved_options  # work around libxml2 problem

        try:
            self._close_file()
        except:
            self._exc_context._store_raised()
        finally:
            return c_doc  # swallow any exceptions

    cdef int copyToBuffer(self, char* c_buffer, int c_requested) noexcept:
        # Copies up to 'c_requested' bytes into 'c_buffer', reading further
        # chunks from the file-like object as needed.  Returns the number of
        # bytes copied, 0 after EOF, or -1 if an exception occurred (which
        # is then stored in the exception context).
        cdef int c_copied = 0
        cdef char* c_src
        cdef Py_ssize_t c_available
        if self._bytes_read < 0:
            # EOF was already reached
            return 0
        try:
            c_available = python.PyBytes_GET_SIZE(self._bytes) - self._bytes_read
            while c_requested > c_available:
                # hand out what is left of the current chunk ...
                c_src = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_src, c_available)
                c_copied += c_available
                c_buffer += c_available
                c_requested -= c_available

                # ... and fetch the next one
                self._bytes = self._filelike.read(c_requested)
                if isinstance(self._bytes, unicode):
                    if self._encoding is not None:
                        self._bytes = python.PyUnicode_AsEncodedString(
                            self._bytes, _cstr(self._encoding), NULL)
                    else:
                        self._bytes = (<unicode>self._bytes).encode('utf8')
                elif not isinstance(self._bytes, bytes):
                    self._close_file()
                    raise TypeError(
                        "reading from file-like objects must return byte strings or unicode strings")

                c_available = python.PyBytes_GET_SIZE(self._bytes)
                if c_available == 0:
                    # empty chunk => EOF
                    self._bytes_read = -1
                    self._close_file()
                    return c_copied
                self._bytes_read = 0

            if c_requested > 0:
                c_src = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_src, c_requested)
                self._bytes_read += c_requested
                c_copied += c_requested
        except:
            c_copied = -1
            self._exc_context._store_raised()
            try:
                self._close_file()
            except:
                self._exc_context._store_raised()
        finally:
            return c_copied  # swallow any exceptions
|
|
|
|
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) noexcept with gil:
    # libxml2 read callback: 'ctxt' is the _FileReaderContext to read from.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
|
|
|
|
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) noexcept nogil:
    # libxml2 read callback: 'ctxt' is a C FILE* to fread() from.
    cdef stdio.FILE* c_file = <stdio.FILE*>ctxt
    return stdio.fread(c_buffer, 1, c_size, c_file)
|
|
|
|
############################################################
|
|
## support for custom document loaders
|
|
############################################################
|
|
|
|
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) noexcept with gil:
    # External entity loader that consults the resolvers registered on the
    # responsible parser context and falls back to __DEFAULT_ENTITY_LOADER
    # when there is no context or the resolvers did not provide any input.
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # An xmlParserCtxt without an associated _ParserContext may still have
    # an implied context registered for the current thread.
    if c_context._private is NULL:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
    else:
        context = <_ResolverContext>c_context._private

    if context is None:
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        # parsing a related document (DTD etc.) => UTF-8 encoded URL?
        url = None if c_url is NULL else _decodeFilename(<const_xmlChar*>c_url)
        # the public ID is always UTF-8
        pubid = None if c_pubid is NULL else funicode(<const_xmlChar*>c_pubid)

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                # the input refers directly to the bytes object's buffer
                c_input.base = _xcstr(data)
                c_input.cur = c_input.base
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            data = None
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            data = file_context
        else:
            c_input = NULL
            data = None

        if data is not None:
            # let the context hold a reference to whatever backs the input
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    # nothing resolved => let the default loader have a go
    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
|
|
|
|
# The external entity loader that libxml2 reports at the time this
# module-level statement runs; _local_resolver() delegates to it as fallback.
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
|
|
|
|
|
|
cdef xmlparser.xmlExternalEntityLoader _register_document_loader() noexcept nogil:
    # Installs _local_resolver as libxml2's external entity loader and
    # returns the loader that was active before.
    cdef xmlparser.xmlExternalEntityLoader c_previous
    c_previous = xmlparser.xmlGetExternalEntityLoader()
    xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
    return c_previous
|
|
|
|
cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) noexcept nogil:
    # Re-installs 'old' as libxml2's external entity loader.
    xmlparser.xmlSetExternalEntityLoader(old)
|
|
|
|
|
|
############################################################
|
|
## Parsers
|
|
############################################################
|
|
|
|
@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
@cython.internal
cdef class _ParserContext(_ResolverContext):
    # Python-side state that accompanies one libxml2 parser context.
    cdef _ErrorLog _error_log
    cdef _ParserSchemaValidationContext _validator   # may be None
    cdef xmlparser.xmlParserCtxt* _c_ctxt
    cdef xmlparser.xmlExternalEntityLoader _orig_loader
    cdef python.PyThread_type_lock _lock             # NULL without threading
    cdef _Document _doc
    cdef bint _collect_ids

    def __cinit__(self):
        self._c_ctxt = NULL
        self._collect_ids = True
        if not config.ENABLE_THREADING:
            self._lock = NULL
        else:
            self._lock = python.PyThread_allocate_lock()
        self._error_log = _ErrorLog()

    def __dealloc__(self):
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_free_lock(self._lock)
            self._lock = NULL
        if self._c_ctxt is not NULL:
            if <void*>self._validator is not NULL and self._validator is not None:
                # If the parser was not closed correctly (e.g. interrupted iterparse()),
                # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
                # validator plug might still be in place, which will make xmlFreeParserCtxt()
                # crash when trying to xmlFree() a static SAX handler.
                # Thus, make sure we disconnect the handler interceptor here at the latest.
                self._validator.disconnect()
            xmlparser.xmlFreeParserCtxt(self._c_ctxt)

    cdef _ParserContext _copy(self):
        # Creates a new context of the same class with copies of the
        # validator (if any) and the resolver registry.  The copy has no
        # libxml2 parser context attached.
        cdef _ParserContext context
        context = self.__class__()
        context._collect_ids = self._collect_ids
        # '_validator' is None unless a schema was configured, so it must
        # not be dereferenced unconditionally.
        if self._validator is not None:
            context._validator = self._validator.copy()
        _initParserContext(context, self._resolvers._copy(), NULL)
        return context

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        # Links this object and the libxml2 parser context to each other.
        self._c_ctxt = c_ctxt
        c_ctxt._private = <void*>self

    cdef void _resetParserContext(self) noexcept:
        # Brings the libxml2 parser context back into a reusable state.
        if self._c_ctxt is not NULL:
            if self._c_ctxt.html:
                htmlparser.htmlCtxtReset(self._c_ctxt)
                self._c_ctxt.disableSAX = 0  # work around bug in libxml2
            else:
                xmlparser.xmlClearParserCtxt(self._c_ctxt)
                # work around bug in libxml2 [2.9.10 .. 2.9.14]:
                # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
                self._c_ctxt.nsNr = 0

    cdef int prepare(self, bint set_document_loader=True) except -1:
        # Acquires the parser lock (when threading is enabled) and sets up
        # error reporting, the document loader and the validator for a parser
        # run.  Raises ParserError if the lock cannot be acquired.
        cdef int result
        if config.ENABLE_THREADING and self._lock is not NULL:
            with nogil:
                result = python.PyThread_acquire_lock(
                    self._lock, python.WAIT_LOCK)
            if result == 0:
                raise ParserError, "parser locking failed"
        self._error_log.clear()
        self._doc = None
        # Need a cast here because older libxml2 releases do not use 'const' in the functype.
        self._c_ctxt.sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
        self._orig_loader = _register_document_loader() if set_document_loader else NULL
        if self._validator is not None:
            self._validator.connect(self._c_ctxt, self._error_log)
        return 0

    cdef int cleanup(self) except -1:
        # Undoes prepare(): restores the document loader, disconnects the
        # validator, resets the parser context and always releases the lock.
        if self._orig_loader is not NULL:
            _reset_document_loader(self._orig_loader)
        try:
            if self._validator is not None:
                self._validator.disconnect()
            self._resetParserContext()
            self.clear()
            self._doc = None
            self._c_ctxt.sax.serror = NULL
        finally:
            if config.ENABLE_THREADING and self._lock is not NULL:
                python.PyThread_release_lock(self._lock)
        return 0

    cdef object _handleParseResult(self, _BaseParser parser,
                                   xmlDoc* result, filename):
        # Returns the _Document for a parse result: the one already stored
        # in self._doc if it wraps the resulting xmlDoc, a new one otherwise.
        c_doc = self._handleParseResultDoc(parser, result, filename)
        if self._doc is not None and self._doc._c_doc is c_doc:
            return self._doc
        else:
            return _documentFactory(c_doc, parser)

    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
                                       xmlDoc* result, filename) except NULL:
        # Checks the parse result via the module-level _handleParseResult().
        # The xmlDoc may only be freed on errors while no _Document
        # (self._doc) has been created.
        recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
        return _handleParseResult(self, self._c_ctxt, result,
                                  filename, recover,
                                  free_doc=self._doc is None)
|
|
|
|
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    # Sets up the resolver part of 'context' and, if a libxml2 parser
    # context is given, attaches that as well.
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        return
    context._initParserContext(c_ctxt)
|
|
|
|
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, const xmlerror.xmlError* error) noexcept with gil:
    # Hands the error to the error log of the _ParserContext that is stored
    # in the '_private' field of the libxml2 parser context.
    cdef _ParserContext owner = <_ParserContext>_parser_context._private
    owner._error_log._receive(error)
|
|
|
|
cdef void _receiveParserError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
    # Structured error callback for libxml2 parser contexts.  Errors are
    # only forwarded while __DEBUG is set.
    cdef xmlparser.xmlParserCtxt* c_ctxt = <xmlparser.xmlParserCtxt*>c_context
    if not __DEBUG:
        return
    if c_ctxt is not NULL and c_ctxt._private is not NULL:
        # the parser context has a _ParserContext => use its error log
        _forwardParserError(c_ctxt, error)
    else:
        _forwardError(NULL, error)
|
|
|
|
cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    # Raises the exception that describes a failed parser run.  Never
    # returns normally.
    #
    # - IOError if a filename is known and the last libxml2 error came from
    #   the I/O layer,
    # - otherwise the exception built from a non-empty 'error_log',
    # - otherwise an XMLSyntaxError built from the last libxml2 error.
    if filename is not None and \
            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if ctxt.lastError.message is not NULL:
            try:
                message = ctxt.lastError.message.decode('utf-8')
            except UnicodeDecodeError:
                # the filename may be in there => play it safe
                message = ctxt.lastError.message.decode('iso8859-1')
            message = f"Error reading file '{filename}': {message.strip()}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError, message
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, "Document is not well formed")
    elif ctxt.lastError.message is not NULL:
        # Decode the C string the same way as in the I/O branch above.
        # Calling .strip() on the undecoded char* yields a bytes object,
        # which would end up as "b'...'" in the formatted message.
        try:
            message = ctxt.lastError.message.decode('utf-8')
        except UnicodeDecodeError:
            message = ctxt.lastError.message.decode('iso8859-1')
        message = message.strip()
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if ctxt.lastError.line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)
|
|
|
|
cdef xmlDoc* _handleParseResult(_ParserContext context,
                                xmlparser.xmlParserCtxt* c_ctxt,
                                xmlDoc* result, filename,
                                bint recover, bint free_doc) except NULL:
    # Decides whether the outcome of a parser run is acceptable.
    #
    # Returns 'result' after filling in a missing URL/encoding, or raises:
    # an exception stored in 'context' takes precedence, otherwise
    # _raiseParseError() builds one.  A rejected 'result' is freed only if
    # 'free_doc' is set.  'recover' accepts documents that are not well
    # formed.  'context' may be None; every access to it is guarded.
    cdef bint well_formed
    if result is not NULL:
        __GLOBAL_PARSER_CONTEXT.initDocDict(result)

    if c_ctxt.myDoc is not NULL:
        # a document left behind in the parser context that is not the
        # result gets cleaned up here
        if c_ctxt.myDoc is not result:
            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
            tree.xmlFreeDoc(c_ctxt.myDoc)
        c_ctxt.myDoc = NULL

    if result is not NULL:
        if (context is not None and context._validator is not None and
                not context._validator.isvalid()):
            well_formed = 0  # actually not 'valid', but anyway ...
        elif (not c_ctxt.wellFormed and not c_ctxt.html and
                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                context is not None and
                [1 for error in context._error_log
                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
            # An encoding error occurred and libxml2 switched from UTF-8
            # input to (undecoded) Latin-1, at some arbitrary point in the
            # document.  Better raise an error than allowing for a broken
            # tree with mixed encodings. This is fixed in libxml2 2.12.
            well_formed = 0
        elif recover or (c_ctxt.wellFormed and
                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
            well_formed = 1
        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
                and context is not None:
            # in this mode, we ignore errors about undefined entities
            for error in context._error_log.filter_from_errors():
                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                        error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                    well_formed = 0
                    break
            else:
                well_formed = 1
        else:
            well_formed = 0

        if not well_formed:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL

    if context is not None and context._has_raised():
        # an exception raised during parsing overrides any result
        if result is not NULL:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL
        context._raise_if_stored()

    if result is NULL:
        if context is not None:
            _raiseParseError(c_ctxt, filename, context._error_log)
        else:
            _raiseParseError(c_ctxt, filename, None)
    else:
        if result.URL is NULL and filename is not None:
            result.URL = tree.xmlStrdup(_xcstr(filename))
        if result.encoding is NULL:
            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")

    if context is not None and context._validator is not None and \
            context._validator._add_default_attributes:
        # we currently need to do this here as libxml2 does not
        # support inserting default attributes during parse-time
        # validation
        context._validator.inject_default_attributes(result)

    return result
|
|
|
|
cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) noexcept nogil:
    # Moves the tag and attribute names of all element nodes found by the
    # tree traversal into 'c_dict' (see _fixHtmlDictNodeNames()).
    # Returns 0 on success (also for a NULL document) and -1 if a dict
    # lookup failed.
    cdef xmlNode* c_node
    if c_doc is NULL:
        return 0
    c_node = c_doc.children
    # The statements between the BEGIN/END macros form the body of the
    # traversal loop and run with 'c_node' set to the current node.
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
|
|
|
|
cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) noexcept nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node. This is used in incremental parsing after each chunk.

    Returns 0 on success (also if c_doc is NULL) and -1 if a dict lookup
    failed.  Without a start node, the whole document is processed.
    """
    cdef xmlNode* c_node
    if not c_doc:
        return 0
    if not c_start_node:
        return _fixHtmlDictNames(c_dict, c_doc)
    c_node = c_start_node
    # The statements between the BEGIN/END macros form the body of the
    # traversal loop and run with 'c_node' set to the current node.
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
|
|
|
|
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) noexcept nogil:
    # Replaces the name of the node and the names of its attributes by the
    # corresponding entries of 'c_dict', freeing each name string that gets
    # replaced.  Returns -1 if a dict lookup failed, 0 otherwise.
    cdef xmlNode* c_prop
    c_interned = tree.xmlDictLookup(c_dict, c_node.name, -1)
    if c_interned is NULL:
        return -1
    if c_interned is not c_node.name:
        # the old name is not the dict's string => release and swap
        tree.xmlFree(<char*>c_node.name)
        c_node.name = c_interned

    # same procedure for each attribute
    c_prop = <xmlNode*>c_node.properties
    while c_prop is not NULL:
        c_interned = tree.xmlDictLookup(c_dict, c_prop.name, -1)
        if c_interned is NULL:
            return -1
        if c_interned is not c_prop.name:
            tree.xmlFree(<char*>c_prop.name)
            c_prop.name = c_interned
        c_prop = c_prop.next
    return 0
|
|
|
|
|
|
@cython.internal
|
|
cdef class _BaseParser:
|
|
cdef ElementClassLookup _class_lookup
|
|
cdef _ResolverRegistry _resolvers
|
|
cdef _ParserContext _parser_context
|
|
cdef _ParserContext _push_parser_context
|
|
cdef int _parse_options
|
|
cdef bint _for_html
|
|
cdef bint _remove_comments
|
|
cdef bint _remove_pis
|
|
cdef bint _strip_cdata
|
|
cdef bint _collect_ids
|
|
cdef bint _resolve_external_entities
|
|
cdef XMLSchema _schema
|
|
cdef bytes _filename
|
|
cdef readonly object target
|
|
cdef object _default_encoding
|
|
cdef tuple _events_to_collect # (event_types, tag)
|
|
|
|
def __init__(self, int parse_options, bint for_html, XMLSchema schema,
             remove_comments, remove_pis, strip_cdata, collect_ids,
             target, encoding, bint resolve_external_entities=True):
    # Stores the parser configuration.
    #
    # Raises TypeError unless 'self' is an XMLParser or HTMLParser, and
    # LookupError if libxml2 has no handler for 'encoding'.
    cdef tree.xmlCharEncodingHandler* enchandler
    if not isinstance(self, (XMLParser, HTMLParser)):
        raise TypeError, "This class cannot be instantiated"

    self._parse_options = parse_options
    self.target = target
    self._for_html = for_html
    self._remove_comments = remove_comments
    self._remove_pis = remove_pis
    self._strip_cdata = strip_cdata
    self._collect_ids = collect_ids
    self._resolve_external_entities = resolve_external_entities
    self._schema = schema

    self._resolvers = _ResolverRegistry()

    if encoding is None:
        self._default_encoding = None
    else:
        encoding = _utf8(encoding)
        enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
        if enchandler is NULL:
            # Show the encoding name as text.  Formatting the byte string
            # directly would render it as "b'...'" inside the message.
            encoding_name = (encoding.decode('utf8', 'replace')
                             if isinstance(encoding, bytes) else encoding)
            raise LookupError, f"unknown encoding: '{encoding_name}'"
        # the handler was only looked up to validate the name
        tree.xmlCharEncCloseFunc(enchandler)
        self._default_encoding = encoding
|
|
|
|
cdef _setBaseURL(self, base_url):
    # Remembers the base URL in its encoded filename form.
    encoded_url = _encodeFilename(base_url)
    self._filename = encoded_url
|
|
|
|
cdef _collectEvents(self, event_types, tag):
    # Stores which parse events to collect (for the given tag) as a pair
    # of a de-duplicated event type tuple and the tag.
    if event_types is not None:
        event_types = tuple(set(event_types))
        # called purely to validate the event names; result is discarded
        _buildParseEventFilter(event_types)
    else:
        event_types = ()
    self._events_to_collect = (event_types, tag)
|
|
|
|
cdef _ParserContext _getParserContext(self):
    # Returns the context used for one-shot parsing, creating and
    # configuring it on first use.
    cdef xmlparser.xmlParserCtxt* c_ctxt
    cdef _ParserContext context
    if self._parser_context is not None:
        return self._parser_context

    # Store the new context on the parser right away, then configure it.
    context = self._parser_context = self._createContext(self.target, None)
    context._collect_ids = self._collect_ids
    if self._schema is not None:
        context._validator = self._schema._newSaxValidator(
            self._parse_options & xmlparser.XML_PARSE_DTDATTR)
    c_ctxt = self._newParserCtxt()
    _initParserContext(context, self._resolvers, c_ctxt)
    self._configureSaxContext(c_ctxt)
    return self._parser_context
|
|
|
|
cdef _ParserContext _getPushParserContext(self):
    # Returns the context used for push (feed) parsing, creating and
    # configuring it on first use.
    cdef xmlparser.xmlParserCtxt* c_ctxt
    cdef _ParserContext context
    if self._push_parser_context is not None:
        return self._push_parser_context

    # Store the new context on the parser right away, then configure it.
    context = self._push_parser_context = self._createContext(
        self.target, self._events_to_collect)
    context._collect_ids = self._collect_ids
    if self._schema is not None:
        context._validator = self._schema._newSaxValidator(
            self._parse_options & xmlparser.XML_PARSE_DTDATTR)
    c_ctxt = self._newPushParserCtxt()
    _initParserContext(context, self._resolvers, c_ctxt)
    self._configureSaxContext(c_ctxt)
    return self._push_parser_context
|
|
|
|
cdef _ParserContext _createContext(self, target, events_to_collect):
    # Picks the context class: a target context if a parser target is set,
    # a SAX context if events are to be collected, a plain one otherwise.
    cdef _SaxParserContext sax_context
    if target is None and not events_to_collect:
        # nothing special to configure
        return _ParserContext()

    if target is None:
        sax_context = _SaxParserContext(self)
    else:
        sax_context = _TargetParserContext(self)
        (<_TargetParserContext>sax_context)._setTarget(target)

    if events_to_collect:
        event_types, tag = events_to_collect
        sax_context._setEventFilter(event_types, tag)
    return sax_context
|
|
|
|
@cython.final
cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
    """Adapt the SAX callbacks of ``pctxt`` to the options of this parser.

    Callbacks for content that should be dropped are unset, and the
    entity lookup is redirected to ``_getInternalEntityOnly`` unless
    external entities may be resolved.
    """
    cdef xmlparser.xmlSAXHandler* sax = pctxt.sax
    if self._remove_comments:
        sax.comment = NULL
    if self._remove_pis:
        sax.processingInstruction = NULL
    if self._strip_cdata:
        # hard switch-off for CDATA nodes => makes them plain text
        sax.cdataBlock = NULL
    if not self._resolve_external_entities:
        sax.getEntity = _getInternalEntityOnly
    return 0
|
|
|
|
cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
    """Upgrade an initialised SAX1 handler of ``c_ctxt`` to SAX2.

    The SAX2 layout is needed to install ``_receiveParserError`` as
    structured error callback.  Nothing is done if there is no handler,
    if it is not initialised, or if it already is a SAX2 handler.

    Raises MemoryError if a private copy of the shared default HTML
    handler cannot be allocated.
    """
    cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
    if sax is NULL or not sax.initialized or sax.initialized == xmlparser.XML_SAX2_MAGIC:
        return 0

    # need to extend SAX1 context to SAX2 to get proper error reports
    if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
        # The context points at the global default handler: work on a
        # private copy instead of modifying the shared struct.
        sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
        if sax is NULL:
            raise MemoryError()
        cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                         sizeof(htmlparser.htmlDefaultSAXHandler))
        c_ctxt.sax = sax

    sax.initialized = xmlparser.XML_SAX2_MAGIC
    # Need a cast here because older libxml2 releases do not use 'const' in the functype.
    sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
    sax.startElementNs = NULL
    sax.endElementNs = NULL
    sax._private = NULL
    return 0
|
|
|
|
cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
    """Create the libxml2 parser context for one-shot parsing.

    HTML parsers get a memory parser context (created from a dummy
    buffer) with the HTML error handler registered, XML parsers get a
    plain new parser context.  ``_initSaxDocument`` is installed as
    ``startDocument`` callback in both cases.

    Raises MemoryError if libxml2 fails to create the context.
    """
    cdef xmlparser.xmlParserCtxt* c_ctxt
    if not self._for_html:
        c_ctxt = xmlparser.xmlNewParserCtxt()
    else:
        c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
    if c_ctxt is NULL:
        raise MemoryError
    if self._for_html:
        # NOTE(review): if this raises, c_ctxt is not freed here.
        self._registerHtmlErrorHandler(c_ctxt)
    c_ctxt.sax.startDocument = _initSaxDocument
    return c_ctxt
|
|
|
|
cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
    """Create the libxml2 push parser context for the feed interface.

    The context is created without initial data, using the file name of
    this parser if one is set, and receives the parse options of this
    parser.  HTML contexts additionally get the HTML error handler
    registered.  ``_initSaxDocument`` is installed as ``startDocument``
    callback in both cases.

    Raises MemoryError if libxml2 fails to create the context.
    """
    cdef xmlparser.xmlParserCtxt* c_ctxt
    cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
    if not self._for_html:
        c_ctxt = xmlparser.xmlCreatePushParserCtxt(
            NULL, NULL, NULL, 0, c_filename)
    else:
        c_ctxt = htmlparser.htmlCreatePushParserCtxt(
            NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
    if c_ctxt is NULL:
        raise MemoryError()

    if self._for_html:
        self._registerHtmlErrorHandler(c_ctxt)
        htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
    else:
        xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
    c_ctxt.sax.startDocument = _initSaxDocument
    return c_ctxt
|
|
|
|
@property
def error_log(self):
    """The error log of the last parser run.

    A copy of the log kept by the (non-feed) parser context is returned.
    """
    return self._getParserContext()._error_log.copy()
|
|
|
|
@property
def resolvers(self):
    """The custom resolver registry of this parser."""
    registry = self._resolvers
    return registry
|
|
|
|
@property
def version(self):
    """The version of the underlying XML parser."""
    version_string = "libxml2 %d.%d.%d" % LIBXML_VERSION
    return version_string
|
|
|
|
def set_element_class_lookup(self, ElementClassLookup lookup = None):
    """set_element_class_lookup(self, lookup = None)

    Install the lookup scheme that selects the element classes for
    trees generated by this parser.

    Calling this without an argument, or with None, resets the lookup.
    """
    self._class_lookup = lookup
|
|
|
|
cdef _BaseParser _copy(self):
    """Create a new parser with the same configuration.

    A fresh instance of this parser's class is created without
    arguments and then receives the settings of this parser.  The
    resolver registry, target, class lookup and schema are assigned
    as-is, i.e. the copy refers to the same objects.

    The ``_collect_ids`` and ``_resolve_external_entities`` flags are
    carried over as well.  They are not part of ``_parse_options`` and
    would otherwise silently fall back to the defaults of the class
    constructor, making the copy behave differently from the original.
    """
    cdef _BaseParser parser
    parser = self.__class__()
    parser._parse_options = self._parse_options
    parser._for_html = self._for_html
    parser._remove_comments = self._remove_comments
    parser._remove_pis = self._remove_pis
    parser._strip_cdata = self._strip_cdata
    parser._collect_ids = self._collect_ids
    parser._resolve_external_entities = self._resolve_external_entities
    parser._filename = self._filename
    parser._resolvers = self._resolvers
    parser.target = self.target
    parser._class_lookup = self._class_lookup
    parser._default_encoding = self._default_encoding
    parser._schema = self._schema
    parser._events_to_collect = self._events_to_collect
    return parser
|
|
|
|
def copy(self):
    """copy(self)

    Return a newly created parser that carries over the configuration
    of this parser.
    """
    duplicate = self._copy()
    return duplicate
|
|
|
|
def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
    """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

    Build a new element that is associated with this parser.

    ``attrib``, ``nsmap`` and any extra keyword arguments are passed
    on to the element factory.
    """
    element = _makeElement(
        _tag, NULL, None, self, None, None, attrib, nsmap, _extra)
    return element
|
|
|
|
# internal parser methods
|
|
|
|
cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
    """Parse a document from a Python Unicode string.

    The character buffer of the string is handed to libxml2 as-is,
    together with an encoding name that is chosen from the storage
    kind of the string and the platform byte order.  The dictionary of
    the global parser context is set up on the parser context before
    parsing.
    """
    cdef _ParserContext context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef Py_ssize_t py_buffer_len
    cdef int buffer_len, c_kind
    cdef const_char* c_text
    cdef const_char* c_encoding = _PY_UNICODE_ENCODING

    if not python.PyUnicode_IS_READY(utext):
        # old Py_UNICODE string, keeps the default encoding set above
        py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
        c_text = python.PyUnicode_AS_DATA(utext)
    else:
        # PEP-393 string
        c_text = <const_char*>python.PyUnicode_DATA(utext)
        py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
        c_kind = python.PyUnicode_KIND(utext)
        if c_kind == 1:
            # one byte per character: pure ASCII or Latin-1
            if python.PyUnicode_MAX_CHAR_VALUE(utext) > 127:
                c_encoding = 'ISO-8859-1'
            else:
                c_encoding = 'UTF-8'
        elif c_kind == 2:
            # two bytes per character
            py_buffer_len *= 2
            if not python.PY_BIG_ENDIAN:
                c_encoding = 'UTF-16LE'  # actually UCS-2
            else:
                c_encoding = 'UTF-16BE'  # actually UCS-2
        elif c_kind == 4:
            # four bytes per character
            py_buffer_len *= 4
            if not python.PY_BIG_ENDIAN:
                c_encoding = 'UTF-32LE'  # actually UCS-4
            else:
                c_encoding = 'UTF-32BE'  # actually UCS-4
        else:
            assert False, f"Illegal Unicode kind {c_kind}"

    # libxml2 takes the buffer size as C int.
    # NOTE(review): this range check is an assertion and is therefore
    # skipped when assertions are disabled.
    assert 0 <= py_buffer_len <= limits.INT_MAX
    buffer_len = py_buffer_len

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
        orig_options = pctxt.options
        with nogil:
            if not self._for_html:
                result = xmlparser.xmlCtxtReadMemory(
                    pctxt, c_text, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                result = htmlparser.htmlCtxtReadMemory(
                    pctxt, c_text, buffer_len, c_filename, c_encoding,
                    self._parse_options)
                # discard the document if the name fix-up fails
                if result is not NULL and _fixHtmlDictNames(pctxt.dict, result) < 0:
                    tree.xmlFreeDoc(result)
                    result = NULL
        pctxt.options = orig_options  # work around libxml2 problem

        return context._handleParseResultDoc(self, result, None)
    finally:
        context.cleanup()
|
|
|
|
cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
                       char* c_filename) except NULL:
    """Parse a document from a byte buffer of ``c_len`` bytes.

    If the parser has a default encoding, it is passed to libxml2.
    Otherwise, UTF-32 input is detected here (by BOM or by libxml2's
    encoding detection) and all other input is left to libxml2 to
    figure out.  The dictionary of the global parser context is set up
    on the parser context before parsing.
    """
    cdef _ParserContext context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_encoding
    cdef tree.xmlCharEncoding enc
    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        if self._default_encoding is not None:
            c_encoding = _cstr(self._default_encoding)
        else:
            c_encoding = NULL
            # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
            # NOTE: limit to problematic cases because it changes character offsets
            if c_len >= 4 and (c_text[0] == b'\xFF' and c_text[1] == b'\xFE' and
                               c_text[2] == 0 and c_text[3] == 0):
                # little endian BOM: name the encoding and skip the BOM
                c_encoding = "UTF-32LE"
                c_text += 4
                c_len -= 4
            elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                 c_text[2] == b'\xFE' and c_text[3] == b'\xFF'):
                # big endian BOM: name the encoding and skip the BOM
                c_encoding = "UTF-32BE"
                c_text += 4
                c_len -= 4
            else:
                # no BOM => try to determine encoding
                enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                    c_encoding = 'UTF-32LE'
                elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                    c_encoding = 'UTF-32BE'

        orig_options = pctxt.options
        with nogil:
            if not self._for_html:
                result = xmlparser.xmlCtxtReadMemory(
                    pctxt, c_text, c_len, c_filename,
                    c_encoding, self._parse_options)
            else:
                result = htmlparser.htmlCtxtReadMemory(
                    pctxt, c_text, c_len, c_filename,
                    c_encoding, self._parse_options)
                # discard the document if the name fix-up fails
                if result is not NULL and _fixHtmlDictNames(pctxt.dict, result) < 0:
                    tree.xmlFreeDoc(result)
                    result = NULL
        pctxt.options = orig_options  # work around libxml2 problem

        return context._handleParseResultDoc(self, result, None)
    finally:
        context.cleanup()
|
|
|
|
cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
    """Parse the document that libxml2 reads from ``c_filename``.

    The default encoding of the parser is passed on if there is one.
    The dictionary of the global parser context is set up on the
    parser context before parsing.
    """
    cdef _ParserContext context
    cdef xmlDoc* result = NULL
    cdef xmlparser.xmlParserCtxt* pctxt
    cdef char* c_encoding

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        if self._default_encoding is not None:
            c_encoding = _cstr(self._default_encoding)
        else:
            c_encoding = NULL

        orig_options = pctxt.options
        with nogil:
            if not self._for_html:
                result = xmlparser.xmlCtxtReadFile(
                    pctxt, c_filename, c_encoding, self._parse_options)
            else:
                result = htmlparser.htmlCtxtReadFile(
                    pctxt, c_filename, c_encoding, self._parse_options)
                # discard the document if the name fix-up fails
                if result is not NULL and _fixHtmlDictNames(pctxt.dict, result) < 0:
                    tree.xmlFreeDoc(result)
                    result = NULL
        pctxt.options = orig_options  # work around libxml2 problem

        return context._handleParseResultDoc(self, result, c_filename)
    finally:
        context.cleanup()
|
|
|
|
cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
                                   encoding) except NULL:
    """Parse a document that is read from a file-like object.

    Reading is delegated to a ``_FileReaderContext``.  An explicitly
    passed ``encoding`` takes precedence over the default encoding of
    the parser, and a false ``filename`` is treated as None.
    """
    cdef _ParserContext context
    cdef _FileReaderContext file_context
    cdef xmlDoc* result
    cdef xmlparser.xmlParserCtxt* pctxt
    # normalise empty file names to None
    filename = filename or None

    context = self._getParserContext()
    context.prepare()
    try:
        pctxt = context._c_ctxt
        __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
        file_context = _FileReaderContext(
            filelike, context, filename,
            encoding or self._default_encoding)
        result = file_context._readDoc(pctxt, self._parse_options)
        return context._handleParseResultDoc(self, result, filename)
    finally:
        context.cleanup()
|
|
|
|
|
|
cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name) noexcept nogil:
    """
    Callback function to intercept the entity resolution when external entity loading is disabled.

    Looks up the entity through libxml2's SAX2 implementation and returns
    it unless it is one of the external entity types.  External entities
    are reported as a fatal error to the structured error handler (if one
    is set), the document is marked as not well-formed, and NULL is returned.
    """
    cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
    if not entity:
        return NULL
    if entity.etype not in (
            tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
            tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
            tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
        return entity

    # Reject all external entities and fail the parsing instead. There is currently
    # no way in libxml2 to just prevent the entity resolution in this case.
    cdef xmlerror.xmlError c_error
    cdef xmlerror.xmlStructuredErrorFunc err_func
    cdef xmlparser.xmlParserInput* parser_input
    cdef void* err_context

    c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
    err_func = xmlerror.xmlStructuredError
    if err_func:
        parser_input = c_ctxt.input
        # Copied from xmlVErrParser() in libxml2: get current input from stack.
        if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
            parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]

        # 'parser_input' may be NULL at this point, so every field that is
        # read from it must be guarded, including the file name.
        c_error = xmlerror.xmlError(
            domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
            code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
            level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
            message=b"External entity resolution is disabled for security reasons "
                    b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
                    b"if you consider it safe to enable it.",
            file=parser_input.filename if parser_input else NULL,
            node=entity,
            str1=<char*> name,
            str2=NULL,
            str3=NULL,
            line=parser_input.line if parser_input else 0,
            int1=0,
            int2=parser_input.col if parser_input else 0,
        )
        err_context = xmlerror.xmlStructuredErrorContext
        err_func(err_context, &c_error)

    c_ctxt.wellFormed = 0
    # The entity was looked up and does not need to be freed.
    return NULL
|
|
|
|
|
|
cdef void _initSaxDocument(void* ctxt) noexcept with gil:
    """SAX ``startDocument`` callback that prepares the new document.

    Runs libxml2's own SAX2 start handler first, then lets the document
    share the dictionary of the parser context and sets up (or
    disables) the XML ID hash table, depending on the ``_collect_ids``
    flag of the ``_ParserContext`` stored in ``_private``.
    """
    xmlparser.xmlSAX2StartDocument(ctxt)
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        c_doc.dict = c_ctxt.dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if not c_ctxt._private:
        return

    context = <_ParserContext>c_ctxt._private
    if not context._collect_ids:
        c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
        if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
            # already initialised but empty => clear
            tree.xmlHashFree(c_doc.ids, NULL)
            c_doc.ids = NULL
    elif c_doc and not c_doc.ids:
        # keep the global parser dict from filling up with XML IDs
        # memory errors are not fatal here
        c_dict = xmlparser.xmlDictCreate()
        if c_dict:
            c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
            xmlparser.xmlDictFree(c_dict)
        else:
            c_doc.ids = tree.xmlHashCreate(0)
|
|
|
|
|
|
############################################################
|
|
## ET feed parser
|
|
############################################################
|
|
|
|
cdef class _FeedParser(_BaseParser):
    # Adds the incremental feed()/close() interface on top of _BaseParser.
    # It uses the push parser context, not the one-shot parser context.

    # true while a feed()/close() cycle is in progress
    cdef bint _feed_parser_running

    @property
    def feed_error_log(self):
        """The error log of the last (or current) run of the feed parser.

        This log belongs to the feed parser only.  It is therefore not
        the same as the log that the ``error_log`` property returns.
        """
        return self._getPushParserContext()._error_log.copy()

    cpdef feed(self, data):
        """feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as
        long as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  Each call parses as much of the XML stream as possible.
        Call the ``close()`` method to finish parsing or to reset the
        parser.  Both methods may raise ParseError if errors occur in the
        input data.  Once an error was raised, calling ``close()`` is no
        longer needed.

        The feed parser interface is independent of the normal parser
        usage.  The same parser can be used as a feed parser and in the
        ``parse()`` function concurrently.
        """
        cdef _ParserContext context
        cdef bytes utf8_chunk
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef Py_ssize_t ustart = 0
        cdef const_char* char_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error = 0
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER

        if isinstance(data, bytes):
            if self._default_encoding is not None:
                c_encoding = self._default_encoding
            else:
                c_encoding = NULL
            char_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
        elif isinstance(data, unicode):
            # Unicode input is fed as UTF-8 encoded chunks below, which
            # is signalled by a NULL data pointer.
            c_encoding = b"UTF-8"
            char_data = NULL
            py_buffer_len = len(<unicode> data)
        else:
            raise TypeError("Parsing requires string data")

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        if not self._feed_parser_running:
            # first chunk of a new document: reset the push parser
            context.prepare(set_document_loader=False)
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 0
            if char_data is not NULL:
                if py_buffer_len > 4:
                    buffer_len = 4
                else:
                    buffer_len = <int>py_buffer_len
            orig_loader = _register_document_loader()
            if not self._for_html:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding)
            else:
                error = _htmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            _reset_document_loader(orig_loader)
            py_buffer_len -= buffer_len
            char_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        fixup_error = 0
        while py_buffer_len > 0 and (error == 0 or recover):
            if char_data is not NULL:
                # Direct byte string parsing, at most INT_MAX bytes at a time.
                if py_buffer_len <= limits.INT_MAX:
                    buffer_len = <int>py_buffer_len
                else:
                    buffer_len = limits.INT_MAX
                error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len)
                py_buffer_len -= buffer_len
                char_data += buffer_len
            else:
                # Unicode parsing by converting chunks to UTF-8
                buffer_len = 2**19  # len(bytes) <= 4 * (2**19) == 2 MiB
                utf8_chunk = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8')
                ustart += buffer_len
                py_buffer_len -= buffer_len  # may end up < 0
                error, fixup_error = _parse_data_chunk(
                    pctxt, <const char*> utf8_chunk, <int> len(utf8_chunk))

            if fixup_error:
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if (entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and
                            entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY):
                        # some other kind of error => keep the error state
                        break
                else:
                    error = 0

        if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or (not recover and (error or not pctxt.wellFormed)):
            # parsing failed => end the feed cycle and report the result
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()

    cpdef close(self):
        """close(self)

        Terminates feeding data to this parser.  This makes the parser
        process whatever data is left in the feed buffer and then returns
        the root Element of the parsed tree.

        Call this method after the last chunk of data was passed into the
        ``feed()`` method.  It is meant for the feed parser interface
        only, all other usage is undefined.
        """
        if not self._feed_parser_running:
            raise XMLSyntaxError("no element found",
                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                                 self._filename)

        context = self._getPushParserContext()
        pctxt = context._c_ctxt

        self._feed_parser_running = 0
        # an empty chunk with the 'terminate' flag set ends the document
        if not self._for_html:
            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
        else:
            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)

        if (pctxt.recovery and not pctxt.disableSAX and
                isinstance(context, _SaxParserContext)):
            # apply any left-over 'end' events
            (<_SaxParserContext>context).flushEvents()

        try:
            result = context._handleParseResult(self, pctxt.myDoc, None)
        finally:
            context.cleanup()

        if isinstance(result, _Document):
            return (<_Document>result).getroot()
        return result
|
|
|
|
|
|
cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt,
                                  const char* char_data, int buffer_len):
    """Feed one chunk of data into the push parser without the GIL.

    Returns the tuple ``(error, fixup_error)``: the result of the
    libxml2 chunk parser and the result of the HTML name fix-up, which
    stays 0 for XML and for HTML contexts that have no document yet.
    """
    fixup_error = 0
    with nogil:
        if not c_ctxt.html:
            orig_loader = _register_document_loader()
            error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
        else:
            c_node = c_ctxt.node  # last node where the parser stopped
            orig_loader = _register_document_loader()
            error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
            # and now for the fun part: move node names to the dict
            if c_ctxt.myDoc:
                fixup_error = _fixHtmlDictSubtreeNames(
                    c_ctxt.dict, c_ctxt.myDoc, c_node)
                # make the document use the dict of the parser context
                if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict:
                    xmlparser.xmlDictFree(c_ctxt.myDoc.dict)
                    c_ctxt.myDoc.dict = c_ctxt.dict
                    xmlparser.xmlDictReference(c_ctxt.dict)
    return (error, fixup_error)
|
|
|
|
|
|
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    """Reset a push parser context for parsing a new HTML document.

    Returns the error code of ``xmlCtxtResetPush()`` if that is
    non-zero and 0 otherwise.
    """
    # libxml2 lacks an HTML push parser setup function
    reset_error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if reset_error:
        return reset_error

    # fix libxml2 setup for HTML
    c_ctxt.progressive = 1
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
    return 0
|
|
|
|
|
|
############################################################
|
|
## XML parser
|
|
############################################################
|
|
|
|
# Option flags that every XMLParser starts from.  XMLParser.__init__()
# adds further flags to this set or removes some of them again,
# depending on its keyword arguments.
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_BIG_LINES |
    xmlparser.XML_PARSE_COMPACT |
    xmlparser.XML_PARSE_NONET |
    xmlparser.XML_PARSE_NOCDATA |
    xmlparser.XML_PARSE_NOENT
)
|
|
|
|
cdef class XMLParser(_FeedParser):
    """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities='internal', remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)

    The XML parser.

    Parsers can be supplied as additional argument to various parse
    functions of the lxml API.  A default parser is always available
    and can be replaced by a call to the global function
    'set_default_parser'.  New parsers can be created at any time
    without a major run-time overhead.

    The keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if DTD
    validation or attribute default values are requested (unless you
    additionally provide an XMLSchema from which the default
    attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - resolve_entities - replace entities by their text value: False for keeping the
          entity references, True for resolving them, and 'internal' for resolving
          internal definitions only (no external file/URL access).
          The default used to be True and was changed to 'internal' in lxml 5.0.
    - encoding - override the document encoding (note: libiconv encoding name)
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities='internal',
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        cdef int parse_options
        cdef bint resolve_external = True
        parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # Options that add flags to the defaults.
        if load_dtd:
            parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            # validation needs the DTD to be loaded
            parse_options |= (xmlparser.XML_PARSE_DTDVALID |
                              xmlparser.XML_PARSE_DTDLOAD)
        if attribute_defaults:
            parse_options |= xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # without a schema, the defaults must come from the DTD
                parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options |= xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options |= xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options |= xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # Options that remove flags from the defaults.  The flags are
        # masked out explicitly rather than toggled with XOR, so that the
        # result does not depend on the flag currently being set.
        if not no_network:
            parse_options &= ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options &= ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options &= ~xmlparser.XML_PARSE_NOENT
        elif resolve_entities == 'internal':
            # entities get replaced, but external ones are rejected
            resolve_external = False
        if not strip_cdata:
            parse_options &= ~xmlparser.XML_PARSE_NOCDATA

        _BaseParser.__init__(self, parse_options, False, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding, resolve_external)
|
|
|
|
|
|
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, **kwargs)

    XML parser that collects parse events in an iterator.

    It collects the same events as iterparse().  In contrast to that,
    the parser is non-blocking in the sense that data chunks are passed
    in incrementally through the .feed() method, rather than being read
    from a file(-like) object by the parser itself.

    Element end events are collected by default.  Pass any subset of
    the available events into the ``events`` argument to change that:
    ``'start'``, ``'end'``, ``'start-ns'``, ``'end-ns'``,
    ``'comment'``, ``'pi'``.

    Pass the ``base_url`` to support loading external dependencies
    relative to the input source.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        # collect only 'end' events unless requested otherwise
        selected_events = ('end',) if events is None else events
        self._setBaseURL(base_url)
        self._collectEvents(selected_events, tag)

    def read_events(self):
        # Hands out the 'events_iterator' of the push parser context,
        # which must be a _SaxParserContext (checked by the cast).
        cdef _SaxParserContext context
        context = <_SaxParserContext?>self._getPushParserContext()
        return context.events_iterator
|
|
|
|
|
|
cdef class ETCompatXMLParser(XMLParser):
    """ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True)

    An XML parser whose default setup is compatible with ElementTree.

    The options are described in the XMLParser class.

    In contrast to XMLParser, ``remove_comments`` and ``remove_pis``
    are enabled by default, which makes this parser ignore comments and
    processing instructions.
    """
    # NOTE(review): 'resolve_entities' defaults to True here, whereas
    # XMLParser defaults to 'internal' - confirm that this is intended.
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True):
        # All options are passed through to XMLParser unchanged.
        XMLParser.__init__(
            self,
            encoding=encoding,
            attribute_defaults=attribute_defaults,
            dtd_validation=dtd_validation,
            load_dtd=load_dtd,
            no_network=no_network,
            ns_clean=ns_clean,
            recover=recover,
            schema=schema,
            huge_tree=huge_tree,
            remove_blank_text=remove_blank_text,
            resolve_entities=resolve_entities,
            remove_comments=remove_comments,
            remove_pis=remove_pis,
            strip_cdata=strip_cdata,
            target=target,
            compact=compact,
        )
|
|
|
|
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# The XMLParser instance with default settings that is installed as
# default parser of the global parser context at import time.
cdef XMLParser __DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
|
|
|
|
def set_default_parser(_BaseParser parser=None):
    """set_default_parser(parser=None)

    Set a default parser for the current thread.  The various parse
    functions of the lxml API use this parser globally whenever they
    are called without a parser.  Calling this function without a
    parser (or with None) resets the default parser to the original
    configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid
    the default parser in multi-threaded environments.  You can create a
    separate parser for each thread explicitly or use a parser pool.
    """
    if parser is None:
        # fall back to the pre-installed parser
        __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
        return
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
|
|
|
|
def get_default_parser():
    "get_default_parser()"
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
|
|
|
|
############################################################
|
|
## HTML parser
|
|
############################################################
|
|
|
|
# Option flags that every HTMLParser starts from.  HTMLParser.__init__()
# adds further flags to this set or removes some of them again,
# depending on its keyword arguments.
cdef int _HTML_DEFAULT_PARSE_OPTIONS
_HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_COMPACT |
    htmlparser.HTML_PARSE_NONET |
    htmlparser.HTML_PARSE_RECOVER
)
|
|
|
|
cdef class HTMLParser(_FeedParser):
    """HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, strip_cdata=True, \
                   no_network=True, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content (only affects libxml2 2.7+)

    Other keyword arguments:

    - encoding - override the document encoding (note: libiconv encoding name)
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 no_network=True, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS

        # Options that add flags to the defaults.
        if remove_blank_text:
            parse_options |= htmlparser.HTML_PARSE_NOBLANKS
        if not default_doctype:
            # This flag is not part of the defaults and must be *set* here.
            # OR states that intent, where XOR only worked by accident of
            # the flag being unset.
            parse_options |= htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # Options that remove flags from the defaults.  The flags are
        # masked out explicitly rather than toggled with XOR, so that the
        # result does not depend on the flag currently being set.
        if not recover:
            parse_options &= ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options &= ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options &= ~htmlparser.HTML_PARSE_COMPACT

        _BaseParser.__init__(self, parse_options, True, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)
|
|
|
|
|
|
# Module-level HTMLParser instance constructed with all default options.
cdef HTMLParser __DEFAULT_HTML_PARSER = HTMLParser()
|
|
|
|
|
|
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    The events are the same ones that iterparse() produces.  Unlike
    iterparse(), this parser does not read from a file(-like) object
    on its own: data chunks are handed to it incrementally through
    its .feed() method, which makes it non-blocking.

    Only Element end events are collected by default.  Pass any
    subset of ``'start'``, ``'end'``, ``'start-ns'``, ``'end-ns'``,
    ``'comment'``, ``'pi'`` as the ``events`` argument to change that.

    Pass ``base_url`` to support loading external dependencies
    relative to the input source.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # Without an explicit selection, only 'end' events are collected.
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        """Return the events iterator of this parser's push parser context."""
        cdef _SaxParserContext context
        context = <_SaxParserContext?>self._getPushParserContext()
        return context.events_iterator
|
|
|
|
|
|
############################################################
|
|
## helper functions for document creation
|
|
############################################################
|
|
|
|
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    """Parse the in-memory string 'text' into a new xmlDoc.

    Uses the default parser of the global parser context if 'parser' is
    None.  Unicode input goes through the parser's _parseUnicodeDoc(),
    anything else is handled as a bytes object and goes through
    _parseDoc().  Input whose byte size exceeds INT_MAX is wrapped in a
    StringIO/BytesIO object and parsed as a file-like object instead.
    """
    cdef char* c_filename = NULL
    cdef char* c_text
    cdef Py_ssize_t c_len

    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()

    if filename:
        # NOTE(review): c_filename presumably points into filename_utf,
        # so that object is kept in a local until the parser call returns.
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)

    if not isinstance(text, unicode):
        # byte string input
        c_len = python.PyBytes_GET_SIZE(text)
        if c_len > limits.INT_MAX:
            return parser._parseDocFromFilelike(BytesIO(text), filename, None)
        c_text = _cstr(text)
        return parser._parseDoc(c_text, c_len, c_filename)

    # Unicode input: determine the size of the character data in bytes
    if not python.PyUnicode_IS_READY(text):
        # old Py_UNICODE string
        c_len = python.PyUnicode_GET_DATA_SIZE(text)
    else:
        # PEP-393 Unicode string
        c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)

    if c_len > limits.INT_MAX:
        return parser._parseDocFromFilelike(StringIO(text), filename, None)
    return parser._parseUnicodeDoc(text, c_filename)
|
|
|
|
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    """Parse the file named by 'filename8' into a new xmlDoc.

    Uses the default parser of the global parser context if 'parser'
    is None.
    """
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFile(_cstr(filename8))
|
|
|
|
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    """Parse the file-like object 'source' into a new xmlDoc.

    Uses the default parser of the global parser context if 'parser'
    is None.
    """
    cdef _BaseParser active_parser = parser
    if active_parser is None:
        active_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return active_parser._parseDocFromFilelike(source, filename, None)
|
|
|
|
cdef xmlDoc* _newXMLDoc() except NULL:
    """Create an empty XML document.

    The encoding is set to "UTF-8" if the new document has none, and the
    document dict is initialised through the global parser context.
    Raises MemoryError if libxml2 fails to create the document.
    """
    cdef xmlDoc* c_doc = tree.xmlNewDoc(NULL)
    if c_doc is NULL:
        raise MemoryError()
    if c_doc.encoding is NULL:
        c_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
|
|
|
|
cdef xmlDoc* _newHTMLDoc() except NULL:
    """Create an empty HTML document.

    The document dict is initialised through the global parser context.
    Raises MemoryError if libxml2 fails to create the document.
    """
    cdef xmlDoc* c_doc = tree.htmlNewDoc(NULL, NULL)
    if c_doc is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_doc)
    return c_doc
|
|
|
|
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    """Copy 'c_doc' with xmlCopyDoc(), recursively if 'recursive' is non-zero.

    The recursive copy runs without the GIL.  The document dict of the
    copy is initialised through the global parser context.
    Raises MemoryError if the copy fails.
    """
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
|
|
|
|
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    """Copy the document and make a copy of c_new_root the new root node.

    The document itself is copied non-recursively; only the subtree of
    'c_new_root' is copied recursively into the new document, where it
    becomes the root element.  _copyTail() is then called with the node
    that follows 'c_new_root' and the copied root node.

    Raises MemoryError if either copy fails.  In that case, no new
    document is left behind.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0)  # non recursive
    if result is NULL:
        # must not pass a NULL document on to initDocDict()/xmlDocCopyNode()
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1)  # recursive
    if c_node is NULL:
        # the caller never sees 'result', so release it here to avoid a leak
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
|
|
|
|
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    """Recursively copy the element into the document.  c_doc is not modified.

    _copyTail() is called with the node that follows 'c_node' and the
    copy.  Raises MemoryError if the node copy fails.
    """
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1)  # recursive
    if c_copy is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_copy)
    return c_copy
|
|
|
|
|
|
############################################################
|
|
## API level helper functions for _Document creation
|
|
############################################################
|
|
|
|
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    """Parse 'source' into a _Document, dispatching on the kind of source.

    - a string (after _getFSPathOrObject()) is parsed as a file path/URL;
      a given 'base_url' then replaces the URL of the parsed document
    - an object with 'getvalue' and 'tell' that is positioned at 0 is
      parsed from the value returned by getvalue()
    - any other object with a 'read' attribute is parsed as a file-like

    Raises TypeError for anything else.
    """
    cdef _Document doc
    source = _getFSPathOrObject(source)

    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # fix base URL as requested
        base_url = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    url = _getFilenameForFile(source) if base_url is None else base_url

    # StringIO-like object - only take the shortcut when reading from the start
    if (hasattr(source, 'getvalue') and hasattr(source, 'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if not hasattr(source, 'read'):
        raise TypeError(
            f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'")
    return _parseFilelikeDocument(source, url, parser)
|
|
|
|
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    """Parse the file at 'url' and wrap the result via _documentFactory()."""
    cdef xmlDoc* c_parsed_doc = _parseDocFromFile(url, parser)
    return _documentFactory(c_parsed_doc, parser)
|
|
|
|
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    """Parse the string 'text' and wrap the result via _documentFactory().

    Raises ValueError if 'text' is neither a unicode nor a bytes object,
    or if it is a unicode string that carries an encoding declaration.
    """
    cdef xmlDoc* c_parsed_doc
    if not isinstance(text, (unicode, bytes)):
        raise ValueError("can only parse strings")
    if isinstance(text, unicode) and _hasEncodingDeclaration(text):
        raise ValueError(
            "Unicode strings with encoding declaration are not supported. "
            "Please use bytes input or XML fragments without declaration.")
    c_parsed_doc = _parseDoc(text, url, parser)
    return _documentFactory(c_parsed_doc, parser)
|
|
|
|
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    """Parse the file-like 'source' and wrap the result via _documentFactory()."""
    cdef xmlDoc* c_parsed_doc = _parseDocFromFilelike(source, url, parser)
    return _documentFactory(c_parsed_doc, parser)
|