1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Helper functions for working with XML."""
22
23 import re
24 from lxml import etree
25
26
xml_preserve_ancestors = etree.XPath("ancestor-or-self::*[attribute::xml:space='preserve']")
"""The context node and all of its ancestors that carry xml:space='preserve'"""

xml_space_ancestors = etree.XPath("ancestor-or-self::*/attribute::xml:space")
"""All xml:space attributes found on the context node and on its ancestors"""

string_xpath = etree.XPath("string()")
"""Return the non-normalized string value of the node subtree"""

string_xpath_normalized = etree.XPath("normalize-space()")
"""Return the whitespace-normalized string value of the node subtree"""
38
def getText(node, xml_space="preserve"):
    """Return the plain text content of ``node`` and its subtree.

    The effective whitespace mode is looked up with
    ``getXMLspace(node, xml_space)``; ``xml_space`` is the fallback handed to
    that lookup for the case where the node does not specify xml:space
    itself.  A mode of "default" yields the ``normalize-space()`` value of
    the node, any other mode yields its raw ``string()`` value.  The result
    is converted with ``unicode()`` before being returned."""
    mode = getXMLspace(node, xml_space)
    # Choose the compiled XPath once, then evaluate it against the node.
    extract = string_xpath_normalized if mode == "default" else string_xpath
    return unicode(extract(node))
49
50
51
52
53
54
55
56
57
58
# Namespace URI that the reserved ``xml`` prefix is bound to.  It is
# interpolated into Clark-notation attribute names such as "{...}lang" and
# "{...}space" by the helpers in this module.
XML_NS = "http://www.w3.org/XML/1998/namespace"
60
62 """Gets the xml:lang attribute on node"""
63 return node.get("{%s}lang" % XML_NS)
64
66 """Sets the xml:lang attribute on node"""
67 node.set("{%s}lang" % XML_NS, lang)
68
75
77 """Sets the xml:space attribute on node"""
78 node.set("{%s}space" % XML_NS, value)
79
81 """Returns name in Clark notation within the given namespace.
82
83 For example namespaced("source") in an XLIFF document might return::
84 {urn:oasis:names:tc:xliff:document:1.1}source
85 This is needed throughout lxml.
86 """
87 if namespace:
88 return "{%s}%s" % (namespace, name)
89 else:
90 return name
91
# One or more consecutive newline, carriage-return, tab or space characters.
MULTIWHITESPACE_PATTERN = r"[\n\r\t ]+"
# Compiled once at import time.  The pattern contains no ^ or $ anchors, so
# the MULTILINE flag does not alter what it matches.
MULTIWHITESPACE_RE = re.compile(MULTIWHITESPACE_PATTERN, flags=re.MULTILINE)
94
99
101 """normalize spaces following the nodes xml:space, or alternatively the
102 given xml_space parameter."""
103 xml_space = getXMLspace(node) or xml_space
104 if xml_space == 'preserve':
105 return
106 if node.text:
107 node.text = normalize_space(node.text)
108 if remove_start and node.text[0] == u" ":
109 node.text = node.text.lstrip()
110 remove_start = False
111 if len(node.text) > 0 and node.text.endswith(u" "):
112 remove_start = True
113 if len(node) == 0:
114 node.text = node.text.rstrip()
115 if node.tail:
116 node.tail = normalize_space(node.tail)
117
118 for child in node:
119 normalize_xml_space(child, remove_start)
120