You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
869 lines
33 KiB
869 lines
33 KiB
5 months ago
|
# support for extension functions in XPath and XSLT
|
||
|
|
||
|
cdef class XPathError(LxmlError):
|
||
|
"""Base class of all XPath errors.
|
||
|
"""
|
||
|
|
||
|
cdef class XPathEvalError(XPathError):
|
||
|
"""Error during XPath evaluation.
|
||
|
"""
|
||
|
|
||
|
cdef class XPathFunctionError(XPathEvalError):
|
||
|
"""Internal error looking up an XPath extension function.
|
||
|
"""
|
||
|
|
||
|
cdef class XPathResultError(XPathEvalError):
|
||
|
"""Error handling an XPath result.
|
||
|
"""
|
||
|
|
||
|
|
||
|
# forward declarations
|
||
|
|
||
|
ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf)
|
||
|
cdef class _ExsltRegExp
|
||
|
|
||
|
################################################################################
|
||
|
# Base class for XSLT and XPath evaluation contexts: functions, namespaces, ...
|
||
|
|
||
|
@cython.internal
|
||
|
cdef class _BaseContext:
|
||
|
cdef xpath.xmlXPathContext* _xpathCtxt
|
||
|
cdef _Document _doc
|
||
|
cdef dict _extensions
|
||
|
cdef list _namespaces
|
||
|
cdef list _global_namespaces
|
||
|
cdef dict _utf_refs
|
||
|
cdef dict _function_cache
|
||
|
cdef dict _eval_context_dict
|
||
|
cdef bint _build_smart_strings
|
||
|
# for exception handling and temporary reference keeping:
|
||
|
cdef _TempStore _temp_refs
|
||
|
cdef set _temp_documents
|
||
|
cdef _ExceptionContext _exc
|
||
|
cdef _ErrorLog _error_log
|
||
|
|
||
|
def __cinit__(self):
|
||
|
self._xpathCtxt = NULL
|
||
|
|
||
|
def __init__(self, namespaces, extensions, error_log, enable_regexp,
|
||
|
build_smart_strings):
|
||
|
cdef _ExsltRegExp _regexp
|
||
|
cdef dict new_extensions
|
||
|
cdef list ns
|
||
|
self._utf_refs = {}
|
||
|
self._global_namespaces = []
|
||
|
self._function_cache = {}
|
||
|
self._eval_context_dict = None
|
||
|
self._error_log = error_log
|
||
|
|
||
|
if extensions is not None:
|
||
|
# convert extensions to UTF-8
|
||
|
if isinstance(extensions, dict):
|
||
|
extensions = (extensions,)
|
||
|
# format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function}
|
||
|
new_extensions = {}
|
||
|
for extension in extensions:
|
||
|
for (ns_uri, name), function in extension.items():
|
||
|
if name is None:
|
||
|
raise ValueError, "extensions must have non empty names"
|
||
|
ns_utf = self._to_utf(ns_uri)
|
||
|
name_utf = self._to_utf(name)
|
||
|
new_extensions[(ns_utf, name_utf)] = function
|
||
|
extensions = new_extensions or None
|
||
|
|
||
|
if namespaces is not None:
|
||
|
if isinstance(namespaces, dict):
|
||
|
namespaces = namespaces.items()
|
||
|
if namespaces:
|
||
|
ns = []
|
||
|
for prefix, ns_uri in namespaces:
|
||
|
if prefix is None or not prefix:
|
||
|
raise TypeError, \
|
||
|
"empty namespace prefix is not supported in XPath"
|
||
|
if ns_uri is None or not ns_uri:
|
||
|
raise TypeError, \
|
||
|
"setting default namespace is not supported in XPath"
|
||
|
prefix_utf = self._to_utf(prefix)
|
||
|
ns_uri_utf = self._to_utf(ns_uri)
|
||
|
ns.append( (prefix_utf, ns_uri_utf) )
|
||
|
namespaces = ns
|
||
|
else:
|
||
|
namespaces = None
|
||
|
|
||
|
self._doc = None
|
||
|
self._exc = _ExceptionContext()
|
||
|
self._extensions = extensions
|
||
|
self._namespaces = namespaces
|
||
|
self._temp_refs = _TempStore()
|
||
|
self._temp_documents = set()
|
||
|
self._build_smart_strings = build_smart_strings
|
||
|
|
||
|
if enable_regexp:
|
||
|
_regexp = _ExsltRegExp()
|
||
|
_regexp._register_in_context(self)
|
||
|
|
||
|
cdef _BaseContext _copy(self):
|
||
|
cdef _BaseContext context
|
||
|
if self._namespaces is not None:
|
||
|
namespaces = self._namespaces[:]
|
||
|
else:
|
||
|
namespaces = None
|
||
|
context = self.__class__(namespaces, None, self._error_log, False,
|
||
|
self._build_smart_strings)
|
||
|
if self._extensions is not None:
|
||
|
context._extensions = self._extensions.copy()
|
||
|
return context
|
||
|
|
||
|
cdef bytes _to_utf(self, s):
|
||
|
"Convert to UTF-8 and keep a reference to the encoded string"
|
||
|
cdef python.PyObject* dict_result
|
||
|
if s is None:
|
||
|
return None
|
||
|
dict_result = python.PyDict_GetItem(self._utf_refs, s)
|
||
|
if dict_result is not NULL:
|
||
|
return <bytes>dict_result
|
||
|
utf = _utf8(s)
|
||
|
self._utf_refs[s] = utf
|
||
|
if python.IS_PYPY:
|
||
|
# use C level refs, PyPy refs are not enough!
|
||
|
python.Py_INCREF(utf)
|
||
|
return utf
|
||
|
|
||
|
cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt) noexcept:
|
||
|
self._xpathCtxt = xpathCtxt
|
||
|
xpathCtxt.userData = <void*>self
|
||
|
# Need a cast here because older libxml2 releases do not use 'const' in the functype.
|
||
|
xpathCtxt.error = <xmlerror.xmlStructuredErrorFunc> _receiveXPathError
|
||
|
|
||
|
@cython.final
|
||
|
cdef _register_context(self, _Document doc):
|
||
|
self._doc = doc
|
||
|
self._exc.clear()
|
||
|
|
||
|
@cython.final
|
||
|
cdef _cleanup_context(self):
|
||
|
#xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt)
|
||
|
#self.unregisterGlobalNamespaces()
|
||
|
if python.IS_PYPY:
|
||
|
# clean up double refs in PyPy (see "_to_utf()" method)
|
||
|
for ref in self._utf_refs.itervalues():
|
||
|
python.Py_DECREF(ref)
|
||
|
self._utf_refs.clear()
|
||
|
self._eval_context_dict = None
|
||
|
self._doc = None
|
||
|
|
||
|
@cython.final
|
||
|
cdef _release_context(self):
|
||
|
if self._xpathCtxt is not NULL:
|
||
|
self._xpathCtxt.userData = NULL
|
||
|
self._xpathCtxt = NULL
|
||
|
|
||
|
# namespaces (internal UTF-8 methods with leading '_')
|
||
|
|
||
|
cdef addNamespace(self, prefix, ns_uri):
|
||
|
cdef list namespaces
|
||
|
if prefix is None:
|
||
|
raise TypeError, "empty prefix is not supported in XPath"
|
||
|
prefix_utf = self._to_utf(prefix)
|
||
|
ns_uri_utf = self._to_utf(ns_uri)
|
||
|
new_item = (prefix_utf, ns_uri_utf)
|
||
|
if self._namespaces is None:
|
||
|
self._namespaces = [new_item]
|
||
|
else:
|
||
|
namespaces = []
|
||
|
for item in self._namespaces:
|
||
|
if item[0] == prefix_utf:
|
||
|
item = new_item
|
||
|
new_item = None
|
||
|
namespaces.append(item)
|
||
|
if new_item is not None:
|
||
|
namespaces.append(new_item)
|
||
|
self._namespaces = namespaces
|
||
|
if self._xpathCtxt is not NULL:
|
||
|
xpath.xmlXPathRegisterNs(
|
||
|
self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
|
||
|
|
||
|
cdef registerNamespace(self, prefix, ns_uri):
|
||
|
if prefix is None:
|
||
|
raise TypeError, "empty prefix is not supported in XPath"
|
||
|
prefix_utf = self._to_utf(prefix)
|
||
|
ns_uri_utf = self._to_utf(ns_uri)
|
||
|
self._global_namespaces.append(prefix_utf)
|
||
|
xpath.xmlXPathRegisterNs(self._xpathCtxt,
|
||
|
_xcstr(prefix_utf), _xcstr(ns_uri_utf))
|
||
|
|
||
|
cdef registerLocalNamespaces(self):
|
||
|
if self._namespaces is None:
|
||
|
return
|
||
|
for prefix_utf, ns_uri_utf in self._namespaces:
|
||
|
xpath.xmlXPathRegisterNs(
|
||
|
self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
|
||
|
|
||
|
cdef registerGlobalNamespaces(self):
|
||
|
cdef list ns_prefixes = _find_all_extension_prefixes()
|
||
|
if python.PyList_GET_SIZE(ns_prefixes) > 0:
|
||
|
for prefix_utf, ns_uri_utf in ns_prefixes:
|
||
|
self._global_namespaces.append(prefix_utf)
|
||
|
xpath.xmlXPathRegisterNs(
|
||
|
self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
|
||
|
|
||
|
cdef unregisterGlobalNamespaces(self):
|
||
|
if python.PyList_GET_SIZE(self._global_namespaces) > 0:
|
||
|
for prefix_utf in self._global_namespaces:
|
||
|
xpath.xmlXPathRegisterNs(self._xpathCtxt,
|
||
|
_xcstr(prefix_utf), NULL)
|
||
|
del self._global_namespaces[:]
|
||
|
|
||
|
cdef void _unregisterNamespace(self, prefix_utf) noexcept:
|
||
|
xpath.xmlXPathRegisterNs(self._xpathCtxt,
|
||
|
_xcstr(prefix_utf), NULL)
|
||
|
|
||
|
# extension functions
|
||
|
|
||
|
cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1:
|
||
|
if self._extensions is None:
|
||
|
self._extensions = {}
|
||
|
self._extensions[(ns_utf, name_utf)] = function
|
||
|
return 0
|
||
|
|
||
|
cdef registerGlobalFunctions(self, void* ctxt,
|
||
|
_register_function reg_func):
|
||
|
cdef python.PyObject* dict_result
|
||
|
cdef dict d
|
||
|
for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems():
|
||
|
dict_result = python.PyDict_GetItem(
|
||
|
self._function_cache, ns_utf)
|
||
|
if dict_result is not NULL:
|
||
|
d = <dict>dict_result
|
||
|
else:
|
||
|
d = {}
|
||
|
self._function_cache[ns_utf] = d
|
||
|
for name_utf, function in ns_functions.iteritems():
|
||
|
d[name_utf] = function
|
||
|
reg_func(ctxt, name_utf, ns_utf)
|
||
|
|
||
|
cdef registerLocalFunctions(self, void* ctxt,
|
||
|
_register_function reg_func):
|
||
|
cdef python.PyObject* dict_result
|
||
|
cdef dict d
|
||
|
if self._extensions is None:
|
||
|
return # done
|
||
|
last_ns = None
|
||
|
d = None
|
||
|
for (ns_utf, name_utf), function in self._extensions.iteritems():
|
||
|
if ns_utf is not last_ns or d is None:
|
||
|
last_ns = ns_utf
|
||
|
dict_result = python.PyDict_GetItem(
|
||
|
self._function_cache, ns_utf)
|
||
|
if dict_result is not NULL:
|
||
|
d = <dict>dict_result
|
||
|
else:
|
||
|
d = {}
|
||
|
self._function_cache[ns_utf] = d
|
||
|
d[name_utf] = function
|
||
|
reg_func(ctxt, name_utf, ns_utf)
|
||
|
|
||
|
cdef unregisterAllFunctions(self, void* ctxt,
|
||
|
_register_function unreg_func):
|
||
|
for ns_utf, functions in self._function_cache.iteritems():
|
||
|
for name_utf in functions:
|
||
|
unreg_func(ctxt, name_utf, ns_utf)
|
||
|
|
||
|
cdef unregisterGlobalFunctions(self, void* ctxt,
|
||
|
_register_function unreg_func):
|
||
|
for ns_utf, functions in self._function_cache.items():
|
||
|
for name_utf in functions:
|
||
|
if self._extensions is None or \
|
||
|
(ns_utf, name_utf) not in self._extensions:
|
||
|
unreg_func(ctxt, name_utf, ns_utf)
|
||
|
|
||
|
@cython.final
|
||
|
cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name):
|
||
|
"""Lookup an extension function in the cache and return it.
|
||
|
|
||
|
Parameters: c_ns_uri may be NULL, c_name must not be NULL
|
||
|
"""
|
||
|
cdef python.PyObject* c_dict
|
||
|
cdef python.PyObject* dict_result
|
||
|
c_dict = python.PyDict_GetItem(
|
||
|
self._function_cache, None if c_ns_uri is NULL else c_ns_uri)
|
||
|
if c_dict is not NULL:
|
||
|
dict_result = python.PyDict_GetItem(
|
||
|
<object>c_dict, <unsigned char*>c_name)
|
||
|
if dict_result is not NULL:
|
||
|
return <object>dict_result
|
||
|
return None
|
||
|
|
||
|
# Python access to the XPath context for extension functions
|
||
|
|
||
|
@property
|
||
|
def context_node(self):
|
||
|
cdef xmlNode* c_node
|
||
|
if self._xpathCtxt is NULL:
|
||
|
raise XPathError, \
|
||
|
"XPath context is only usable during the evaluation"
|
||
|
c_node = self._xpathCtxt.node
|
||
|
if c_node is NULL:
|
||
|
raise XPathError, "no context node"
|
||
|
if c_node.doc != self._xpathCtxt.doc:
|
||
|
raise XPathError, \
|
||
|
"document-external context nodes are not supported"
|
||
|
if self._doc is None:
|
||
|
raise XPathError, "document context is missing"
|
||
|
return _elementFactory(self._doc, c_node)
|
||
|
|
||
|
@property
|
||
|
def eval_context(self):
|
||
|
if self._eval_context_dict is None:
|
||
|
self._eval_context_dict = {}
|
||
|
return self._eval_context_dict
|
||
|
|
||
|
# Python reference keeping during XPath function evaluation
|
||
|
|
||
|
@cython.final
|
||
|
cdef _release_temp_refs(self):
|
||
|
"Free temporarily referenced objects from this context."
|
||
|
self._temp_refs.clear()
|
||
|
self._temp_documents.clear()
|
||
|
|
||
|
@cython.final
|
||
|
cdef _hold(self, obj):
|
||
|
"""A way to temporarily hold references to nodes in the evaluator.
|
||
|
|
||
|
This is needed because otherwise nodes created in XPath extension
|
||
|
functions would be reference counted too soon, during the XPath
|
||
|
evaluation. This is most important in the case of exceptions.
|
||
|
"""
|
||
|
cdef _Element element
|
||
|
if isinstance(obj, _Element):
|
||
|
self._temp_refs.add(obj)
|
||
|
self._temp_documents.add((<_Element>obj)._doc)
|
||
|
return
|
||
|
elif _isString(obj) or not python.PySequence_Check(obj):
|
||
|
return
|
||
|
for o in obj:
|
||
|
if isinstance(o, _Element):
|
||
|
#print "Holding element:", <int>element._c_node
|
||
|
self._temp_refs.add(o)
|
||
|
#print "Holding document:", <int>element._doc._c_doc
|
||
|
self._temp_documents.add((<_Element>o)._doc)
|
||
|
|
||
|
@cython.final
|
||
|
cdef _Document _findDocumentForNode(self, xmlNode* c_node):
|
||
|
"""If an XPath expression returns an element from a different
|
||
|
document than the current context document, we call this to
|
||
|
see if it was possibly created by an extension and is a known
|
||
|
document instance.
|
||
|
"""
|
||
|
cdef _Document doc
|
||
|
for doc in self._temp_documents:
|
||
|
if doc is not None and doc._c_doc is c_node.doc:
|
||
|
return doc
|
||
|
return None
|
||
|
|
||
|
|
||
|
# libxml2 keeps these error messages in a static array in its code
|
||
|
# and doesn't give us access to them ...
|
||
|
|
||
|
cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = (
|
||
|
b"Ok",
|
||
|
b"Number encoding",
|
||
|
b"Unfinished literal",
|
||
|
b"Start of literal",
|
||
|
b"Expected $ for variable reference",
|
||
|
b"Undefined variable",
|
||
|
b"Invalid predicate",
|
||
|
b"Invalid expression",
|
||
|
b"Missing closing curly brace",
|
||
|
b"Unregistered function",
|
||
|
b"Invalid operand",
|
||
|
b"Invalid type",
|
||
|
b"Invalid number of arguments",
|
||
|
b"Invalid context size",
|
||
|
b"Invalid context position",
|
||
|
b"Memory allocation error",
|
||
|
b"Syntax error",
|
||
|
b"Resource error",
|
||
|
b"Sub resource error",
|
||
|
b"Undefined namespace prefix",
|
||
|
b"Encoding error",
|
||
|
b"Char out of XML range",
|
||
|
b"Invalid or incomplete context",
|
||
|
b"Stack usage error",
|
||
|
b"Forbidden variable\n",
|
||
|
b"?? Unknown error ??\n",
|
||
|
)
|
||
|
|
||
|
cdef void _forwardXPathError(void* c_ctxt, const xmlerror.xmlError* c_error) noexcept with gil:
|
||
|
cdef xmlerror.xmlError error
|
||
|
cdef int xpath_code
|
||
|
if c_error.message is not NULL:
|
||
|
error.message = c_error.message
|
||
|
else:
|
||
|
xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK
|
||
|
if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES):
|
||
|
error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code])
|
||
|
else:
|
||
|
error.message = b"unknown error"
|
||
|
error.domain = c_error.domain
|
||
|
error.code = c_error.code
|
||
|
error.level = c_error.level
|
||
|
error.line = c_error.line
|
||
|
error.int2 = c_error.int1 # column
|
||
|
error.file = c_error.file
|
||
|
error.node = NULL
|
||
|
|
||
|
(<_BaseContext>c_ctxt)._error_log._receive(&error)
|
||
|
|
||
|
cdef void _receiveXPathError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
|
||
|
if not __DEBUG:
|
||
|
return
|
||
|
if c_context is NULL:
|
||
|
_forwardError(NULL, error)
|
||
|
else:
|
||
|
_forwardXPathError(c_context, error)
|
||
|
|
||
|
|
||
|
def Extension(module, function_mapping=None, *, ns=None):
|
||
|
"""Extension(module, function_mapping=None, ns=None)
|
||
|
|
||
|
Build a dictionary of extension functions from the functions
|
||
|
defined in a module or the methods of an object.
|
||
|
|
||
|
As second argument, you can pass an additional mapping of
|
||
|
attribute names to XPath function names, or a list of function
|
||
|
names that should be taken.
|
||
|
|
||
|
The ``ns`` keyword argument accepts a namespace URI for the XPath
|
||
|
functions.
|
||
|
"""
|
||
|
cdef dict functions = {}
|
||
|
if isinstance(function_mapping, dict):
|
||
|
for function_name, xpath_name in function_mapping.items():
|
||
|
functions[(ns, xpath_name)] = getattr(module, function_name)
|
||
|
else:
|
||
|
if function_mapping is None:
|
||
|
function_mapping = [ name for name in dir(module)
|
||
|
if not name.startswith('_') ]
|
||
|
for function_name in function_mapping:
|
||
|
functions[(ns, function_name)] = getattr(module, function_name)
|
||
|
return functions
|
||
|
|
||
|
################################################################################
|
||
|
# EXSLT regexp implementation
|
||
|
|
||
|
@cython.final
|
||
|
@cython.internal
|
||
|
cdef class _ExsltRegExp:
|
||
|
cdef dict _compile_map
|
||
|
def __cinit__(self):
|
||
|
self._compile_map = {}
|
||
|
|
||
|
cdef _make_string(self, value):
|
||
|
if _isString(value):
|
||
|
return value
|
||
|
elif isinstance(value, list):
|
||
|
# node set: take recursive text concatenation of first element
|
||
|
if python.PyList_GET_SIZE(value) == 0:
|
||
|
return ''
|
||
|
firstnode = value[0]
|
||
|
if _isString(firstnode):
|
||
|
return firstnode
|
||
|
elif isinstance(firstnode, _Element):
|
||
|
c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
|
||
|
if c_text is NULL:
|
||
|
raise MemoryError()
|
||
|
try:
|
||
|
return funicode(c_text)
|
||
|
finally:
|
||
|
tree.xmlFree(c_text)
|
||
|
else:
|
||
|
return unicode(firstnode)
|
||
|
else:
|
||
|
return unicode(value)
|
||
|
|
||
|
cdef _compile(self, rexp, ignore_case):
|
||
|
cdef python.PyObject* c_result
|
||
|
rexp = self._make_string(rexp)
|
||
|
key = (rexp, ignore_case)
|
||
|
c_result = python.PyDict_GetItem(self._compile_map, key)
|
||
|
if c_result is not NULL:
|
||
|
return <object>c_result
|
||
|
py_flags = re.UNICODE
|
||
|
if ignore_case:
|
||
|
py_flags = py_flags | re.IGNORECASE
|
||
|
rexp_compiled = re.compile(rexp, py_flags)
|
||
|
self._compile_map[key] = rexp_compiled
|
||
|
return rexp_compiled
|
||
|
|
||
|
def test(self, ctxt, s, rexp, flags=''):
|
||
|
flags = self._make_string(flags)
|
||
|
s = self._make_string(s)
|
||
|
rexpc = self._compile(rexp, 'i' in flags)
|
||
|
if rexpc.search(s) is None:
|
||
|
return False
|
||
|
else:
|
||
|
return True
|
||
|
|
||
|
def match(self, ctxt, s, rexp, flags=''):
|
||
|
cdef list result_list
|
||
|
flags = self._make_string(flags)
|
||
|
s = self._make_string(s)
|
||
|
rexpc = self._compile(rexp, 'i' in flags)
|
||
|
if 'g' in flags:
|
||
|
results = rexpc.findall(s)
|
||
|
if not results:
|
||
|
return ()
|
||
|
else:
|
||
|
result = rexpc.search(s)
|
||
|
if not result:
|
||
|
return ()
|
||
|
results = [ result.group() ]
|
||
|
results.extend( result.groups('') )
|
||
|
result_list = []
|
||
|
root = Element('matches')
|
||
|
for s_match in results:
|
||
|
if python.PyTuple_CheckExact(s_match):
|
||
|
s_match = ''.join(s_match)
|
||
|
elem = SubElement(root, 'match')
|
||
|
elem.text = s_match
|
||
|
result_list.append(elem)
|
||
|
return result_list
|
||
|
|
||
|
def replace(self, ctxt, s, rexp, flags, replacement):
|
||
|
replacement = self._make_string(replacement)
|
||
|
flags = self._make_string(flags)
|
||
|
s = self._make_string(s)
|
||
|
rexpc = self._compile(rexp, 'i' in flags)
|
||
|
count: object = 0 if 'g' in flags else 1
|
||
|
return rexpc.sub(replacement, s, count)
|
||
|
|
||
|
cdef _register_in_context(self, _BaseContext context):
|
||
|
ns = b"http://exslt.org/regular-expressions"
|
||
|
context._addLocalExtensionFunction(ns, b"test", self.test)
|
||
|
context._addLocalExtensionFunction(ns, b"match", self.match)
|
||
|
context._addLocalExtensionFunction(ns, b"replace", self.replace)
|
||
|
|
||
|
|
||
|
################################################################################
|
||
|
# helper functions
|
||
|
|
||
|
cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc,
|
||
|
_BaseContext context) except NULL:
|
||
|
cdef xpath.xmlNodeSet* resultSet
|
||
|
cdef _Element fake_node = None
|
||
|
cdef xmlNode* c_node
|
||
|
|
||
|
if isinstance(obj, unicode):
|
||
|
obj = _utf8(obj)
|
||
|
if isinstance(obj, bytes):
|
||
|
# libxml2 copies the string value
|
||
|
return xpath.xmlXPathNewCString(_cstr(obj))
|
||
|
if isinstance(obj, bool):
|
||
|
return xpath.xmlXPathNewBoolean(obj)
|
||
|
if python.PyNumber_Check(obj):
|
||
|
return xpath.xmlXPathNewFloat(obj)
|
||
|
if obj is None:
|
||
|
resultSet = xpath.xmlXPathNodeSetCreate(NULL)
|
||
|
elif isinstance(obj, _Element):
|
||
|
resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node)
|
||
|
elif python.PySequence_Check(obj):
|
||
|
resultSet = xpath.xmlXPathNodeSetCreate(NULL)
|
||
|
try:
|
||
|
for value in obj:
|
||
|
if isinstance(value, _Element):
|
||
|
if context is not None:
|
||
|
context._hold(value)
|
||
|
xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node)
|
||
|
else:
|
||
|
if context is None or doc is None:
|
||
|
raise XPathResultError, \
|
||
|
f"Non-Element values not supported at this point - got {value!r}"
|
||
|
# support strings by appending text nodes to an Element
|
||
|
if isinstance(value, unicode):
|
||
|
value = _utf8(value)
|
||
|
if isinstance(value, bytes):
|
||
|
if fake_node is None:
|
||
|
fake_node = _makeElement("text-root", NULL, doc, None,
|
||
|
None, None, None, None, None)
|
||
|
context._hold(fake_node)
|
||
|
else:
|
||
|
# append a comment node to keep the text nodes separate
|
||
|
c_node = tree.xmlNewDocComment(doc._c_doc, <unsigned char*>"")
|
||
|
if c_node is NULL:
|
||
|
raise MemoryError()
|
||
|
tree.xmlAddChild(fake_node._c_node, c_node)
|
||
|
context._hold(value)
|
||
|
c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value))
|
||
|
if c_node is NULL:
|
||
|
raise MemoryError()
|
||
|
tree.xmlAddChild(fake_node._c_node, c_node)
|
||
|
xpath.xmlXPathNodeSetAdd(resultSet, c_node)
|
||
|
else:
|
||
|
raise XPathResultError, \
|
||
|
f"This is not a supported node-set result: {value!r}"
|
||
|
except:
|
||
|
xpath.xmlXPathFreeNodeSet(resultSet)
|
||
|
raise
|
||
|
else:
|
||
|
raise XPathResultError, f"Unknown return type: {python._fqtypename(obj).decode('utf8')}"
|
||
|
return xpath.xmlXPathWrapNodeSet(resultSet)
|
||
|
|
||
|
cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj,
|
||
|
_Document doc, _BaseContext context):
|
||
|
if xpathObj.type == xpath.XPATH_UNDEFINED:
|
||
|
raise XPathResultError, "Undefined xpath result"
|
||
|
elif xpathObj.type == xpath.XPATH_NODESET:
|
||
|
return _createNodeSetResult(xpathObj, doc, context)
|
||
|
elif xpathObj.type == xpath.XPATH_BOOLEAN:
|
||
|
return xpathObj.boolval
|
||
|
elif xpathObj.type == xpath.XPATH_NUMBER:
|
||
|
return xpathObj.floatval
|
||
|
elif xpathObj.type == xpath.XPATH_STRING:
|
||
|
stringval = funicode(xpathObj.stringval)
|
||
|
if context._build_smart_strings:
|
||
|
stringval = _elementStringResultFactory(
|
||
|
stringval, None, None, 0)
|
||
|
return stringval
|
||
|
elif xpathObj.type == xpath.XPATH_POINT:
|
||
|
raise NotImplementedError, "XPATH_POINT"
|
||
|
elif xpathObj.type == xpath.XPATH_RANGE:
|
||
|
raise NotImplementedError, "XPATH_RANGE"
|
||
|
elif xpathObj.type == xpath.XPATH_LOCATIONSET:
|
||
|
raise NotImplementedError, "XPATH_LOCATIONSET"
|
||
|
elif xpathObj.type == xpath.XPATH_USERS:
|
||
|
raise NotImplementedError, "XPATH_USERS"
|
||
|
elif xpathObj.type == xpath.XPATH_XSLT_TREE:
|
||
|
return _createNodeSetResult(xpathObj, doc, context)
|
||
|
else:
|
||
|
raise XPathResultError, f"Unknown xpath result {xpathObj.type}"
|
||
|
|
||
|
cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc,
|
||
|
_BaseContext context):
|
||
|
cdef xmlNode* c_node
|
||
|
cdef int i
|
||
|
cdef list result
|
||
|
result = []
|
||
|
if xpathObj.nodesetval is NULL:
|
||
|
return result
|
||
|
for i in range(xpathObj.nodesetval.nodeNr):
|
||
|
c_node = xpathObj.nodesetval.nodeTab[i]
|
||
|
_unpackNodeSetEntry(result, c_node, doc, context,
|
||
|
xpathObj.type == xpath.XPATH_XSLT_TREE)
|
||
|
return result
|
||
|
|
||
|
cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc,
|
||
|
_BaseContext context, bint is_fragment):
|
||
|
cdef xmlNode* c_child
|
||
|
if _isElement(c_node):
|
||
|
if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
|
||
|
# XXX: works, but maybe not always the right thing to do?
|
||
|
# XPath: only runs when extensions create or copy trees
|
||
|
# -> we store Python refs to these, so that is OK
|
||
|
# XSLT: can it leak when merging trees from multiple sources?
|
||
|
c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
|
||
|
# FIXME: call _instantiateElementFromXPath() instead?
|
||
|
results.append(
|
||
|
_fakeDocElementFactory(doc, c_node))
|
||
|
elif c_node.type == tree.XML_TEXT_NODE or \
|
||
|
c_node.type == tree.XML_CDATA_SECTION_NODE or \
|
||
|
c_node.type == tree.XML_ATTRIBUTE_NODE:
|
||
|
results.append(
|
||
|
_buildElementStringResult(doc, c_node, context))
|
||
|
elif c_node.type == tree.XML_NAMESPACE_DECL:
|
||
|
results.append( (funicodeOrNone((<xmlNs*>c_node).prefix),
|
||
|
funicodeOrNone((<xmlNs*>c_node).href)) )
|
||
|
elif c_node.type == tree.XML_DOCUMENT_NODE or \
|
||
|
c_node.type == tree.XML_HTML_DOCUMENT_NODE:
|
||
|
# ignored for everything but result tree fragments
|
||
|
if is_fragment:
|
||
|
c_child = c_node.children
|
||
|
while c_child is not NULL:
|
||
|
_unpackNodeSetEntry(results, c_child, doc, context, 0)
|
||
|
c_child = c_child.next
|
||
|
elif c_node.type == tree.XML_XINCLUDE_START or \
|
||
|
c_node.type == tree.XML_XINCLUDE_END:
|
||
|
pass
|
||
|
else:
|
||
|
raise NotImplementedError, \
|
||
|
f"Not yet implemented result node type: {c_node.type}"
|
||
|
|
||
|
cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj) noexcept:
|
||
|
"""Free the XPath object, but *never* free the *content* of node sets.
|
||
|
Python dealloc will do that for us.
|
||
|
"""
|
||
|
if xpathObj.nodesetval is not NULL:
|
||
|
xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval)
|
||
|
xpathObj.nodesetval = NULL
|
||
|
xpath.xmlXPathFreeObject(xpathObj)
|
||
|
|
||
|
cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc,
|
||
|
_BaseContext context):
|
||
|
# NOTE: this may copy the element - only call this when it can't leak
|
||
|
if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
|
||
|
# not from the context document and not from a fake document
|
||
|
# either => may still be from a known document, e.g. one
|
||
|
# created by an extension function
|
||
|
node_doc = context._findDocumentForNode(c_node)
|
||
|
if node_doc is None:
|
||
|
# not from a known document at all! => can only make a
|
||
|
# safety copy here
|
||
|
c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
|
||
|
else:
|
||
|
doc = node_doc
|
||
|
return _fakeDocElementFactory(doc, c_node)
|
||
|
|
||
|
################################################################################
|
||
|
# special str/unicode subclasses
|
||
|
|
||
|
@cython.final
|
||
|
cdef class _ElementUnicodeResult(unicode):
|
||
|
cdef _Element _parent
|
||
|
cdef readonly object attrname
|
||
|
cdef readonly bint is_tail
|
||
|
cdef readonly bint is_text
|
||
|
cdef readonly bint is_attribute
|
||
|
|
||
|
def getparent(self):
|
||
|
return self._parent
|
||
|
|
||
|
cdef object _PyElementUnicodeResult
|
||
|
if python.IS_PYPY:
|
||
|
class _PyElementUnicodeResult(unicode):
|
||
|
# we need to use a Python class here, or PyPy will crash on creation
|
||
|
# https://bitbucket.org/pypy/pypy/issues/2021/pypy3-pytype_ready-crashes-for-extension
|
||
|
def getparent(self):
|
||
|
return self._parent
|
||
|
|
||
|
class _ElementStringResult(bytes):
|
||
|
# we need to use a Python class here, bytes cannot be C-subclassed
|
||
|
# in Pyrex/Cython
|
||
|
def getparent(self):
|
||
|
return self._parent
|
||
|
|
||
|
cdef object _elementStringResultFactory(string_value, _Element parent,
|
||
|
attrname, bint is_tail):
|
||
|
cdef _ElementUnicodeResult uresult
|
||
|
cdef bint is_text
|
||
|
cdef bint is_attribute = attrname is not None
|
||
|
if parent is None:
|
||
|
is_text = 0
|
||
|
else:
|
||
|
is_text = not (is_tail or is_attribute)
|
||
|
|
||
|
if type(string_value) is bytes:
|
||
|
result = _ElementStringResult(string_value)
|
||
|
result._parent = parent
|
||
|
result.is_attribute = is_attribute
|
||
|
result.is_tail = is_tail
|
||
|
result.is_text = is_text
|
||
|
result.attrname = attrname
|
||
|
return result
|
||
|
elif python.IS_PYPY:
|
||
|
result = _PyElementUnicodeResult(string_value)
|
||
|
result._parent = parent
|
||
|
result.is_attribute = is_attribute
|
||
|
result.is_tail = is_tail
|
||
|
result.is_text = is_text
|
||
|
result.attrname = attrname
|
||
|
return result
|
||
|
else:
|
||
|
uresult = _ElementUnicodeResult(string_value)
|
||
|
uresult._parent = parent
|
||
|
uresult.is_attribute = is_attribute
|
||
|
uresult.is_tail = is_tail
|
||
|
uresult.is_text = is_text
|
||
|
uresult.attrname = attrname
|
||
|
return uresult
|
||
|
|
||
|
cdef object _buildElementStringResult(_Document doc, xmlNode* c_node,
|
||
|
_BaseContext context):
|
||
|
cdef _Element parent = None
|
||
|
cdef object attrname = None
|
||
|
cdef xmlNode* c_element
|
||
|
cdef bint is_tail
|
||
|
|
||
|
if c_node.type == tree.XML_ATTRIBUTE_NODE:
|
||
|
attrname = _namespacedName(c_node)
|
||
|
is_tail = 0
|
||
|
s = tree.xmlNodeGetContent(c_node)
|
||
|
try:
|
||
|
value = funicode(s)
|
||
|
finally:
|
||
|
tree.xmlFree(s)
|
||
|
c_element = NULL
|
||
|
else:
|
||
|
#assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type"
|
||
|
# may be tail text or normal text
|
||
|
value = funicode(c_node.content)
|
||
|
c_element = _previousElement(c_node)
|
||
|
is_tail = c_element is not NULL
|
||
|
|
||
|
if not context._build_smart_strings:
|
||
|
return value
|
||
|
|
||
|
if c_element is NULL:
|
||
|
# non-tail text or attribute text
|
||
|
c_element = c_node.parent
|
||
|
while c_element is not NULL and not _isElement(c_element):
|
||
|
c_element = c_element.parent
|
||
|
|
||
|
if c_element is not NULL:
|
||
|
parent = _instantiateElementFromXPath(c_element, doc, context)
|
||
|
|
||
|
return _elementStringResultFactory(
|
||
|
value, parent, attrname, is_tail)
|
||
|
|
||
|
################################################################################
|
||
|
# callbacks for XPath/XSLT extension functions
|
||
|
|
||
|
cdef void _extension_function_call(_BaseContext context, function,
|
||
|
xpath.xmlXPathParserContext* ctxt, int nargs) noexcept:
|
||
|
cdef _Document doc
|
||
|
cdef xpath.xmlXPathObject* obj
|
||
|
cdef list args
|
||
|
cdef int i
|
||
|
doc = context._doc
|
||
|
try:
|
||
|
args = []
|
||
|
for i in range(nargs):
|
||
|
obj = xpath.valuePop(ctxt)
|
||
|
o = _unwrapXPathObject(obj, doc, context)
|
||
|
_freeXPathObject(obj)
|
||
|
args.append(o)
|
||
|
args.reverse()
|
||
|
|
||
|
res = function(context, *args)
|
||
|
# wrap result for XPath consumption
|
||
|
obj = _wrapXPathObject(res, doc, context)
|
||
|
# prevent Python from deallocating elements handed to libxml2
|
||
|
context._hold(res)
|
||
|
xpath.valuePush(ctxt, obj)
|
||
|
except:
|
||
|
xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR)
|
||
|
context._exc._store_raised()
|
||
|
finally:
|
||
|
return # swallow any further exceptions
|
||
|
|
||
|
# lookup the function by name and call it
|
||
|
|
||
|
cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt,
|
||
|
int nargs) noexcept with gil:
|
||
|
cdef _BaseContext context
|
||
|
cdef xpath.xmlXPathContext* rctxt = ctxt.context
|
||
|
context = <_BaseContext> rctxt.userData
|
||
|
try:
|
||
|
function = context._find_cached_function(rctxt.functionURI, rctxt.function)
|
||
|
if function is not None:
|
||
|
_extension_function_call(context, function, ctxt, nargs)
|
||
|
else:
|
||
|
xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
|
||
|
context._exc._store_exception(XPathFunctionError(
|
||
|
f"XPath function '{_namespacedNameFromNsName(rctxt.functionURI, rctxt.function)}' not found"))
|
||
|
except:
|
||
|
# may not be the right error, but we need to tell libxml2 *something*
|
||
|
xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
|
||
|
context._exc._store_raised()
|
||
|
finally:
|
||
|
return # swallow any further exceptions
|