Package translate :: Package misc :: Module xml_helpers
[hide private]
[frames] | no frames]

Source Code for Module translate.misc.xml_helpers

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Helper functions for working with XML.""" 
 22   
 23  import re 
 24  from lxml import etree 
 25   
# some useful xpath expressions

# Compiled once at import time. Selects the context node and each of its
# ancestors that carries xml:space="preserve".
xml_preserve_ancestors = etree.XPath("ancestor-or-self::*[attribute::xml:space='preserve']")
"""All ancestors with xml:space='preserve'"""

# Selects the xml:space attribute nodes found on the context node and on
# its ancestors (the axis is ancestor-or-self, so the node itself counts).
xml_space_ancestors= etree.XPath("ancestor-or-self::*/attribute::xml:space")
"""All xml:space attributes in the ancestors"""

# Evaluates the XPath string() function on the context node.
string_xpath = etree.XPath("string()")
"""Return a non-normalized string in the node subtree"""

# Evaluates the XPath normalize-space() function on the context node.
string_xpath_normalized = etree.XPath("normalize-space()")
"""Return a (space) normalized string in the node subtree"""
 38   
def getText(node, xml_space="preserve"):
    """Return the plain text content of ``node`` as a unicode string.

    The xml:space attribute set on ``node`` itself decides how whitespace
    is handled; ``xml_space`` is only the fallback used when the node does
    not carry that attribute. When the effective value is "default" the
    text is whitespace-normalized, for any other value it is returned
    unchanged.
    """
    effective_space = getXMLspace(node, xml_space)

    # Pick the compiled XPath expression first, then evaluate it once.
    # Calling these expressions on a node is specific to lxml.etree.
    extract = string_xpath
    if effective_space == "default":
        extract = string_xpath_normalized
    return unicode(extract(node))
# Alternative for getText(): if we want to normalise space and only preserve
# it when the directive xml:space="preserve" is given in node or in parents,
# consider this code:
#xml_preserves = xml_preserve_ancestors(node)
#if xml_preserves and xml_preserves[-1] == "preserve":
#    return unicode(string_xpath(node)) # specific to lxml.etree
#else:
#    return unicode(string_xpath_normalized(node)) # specific to lxml.etree


# Namespace URI of the reserved "xml" prefix. The helpers below combine it
# with "lang" / "space" to build attribute names in Clark notation.
XML_NS = 'http://www.w3.org/XML/1998/namespace'
def getXMLlang(node):
    """Return the value of the xml:lang attribute of ``node``.

    The attribute is looked up by its Clark-notation name; the result is
    whatever ``node.get`` returns for that name.
    """
    lang_attribute = "{%s}lang" % XML_NS
    return node.get(lang_attribute)
64
def setXMLlang(node, lang):
    """Store ``lang`` as the xml:lang attribute of ``node``.

    The attribute is addressed by its Clark-notation name.
    """
    lang_attribute = "{%s}lang" % XML_NS
    node.set(lang_attribute, lang)
68
def getXMLspace(node, default=None):
    """Return the xml:space attribute of ``node``.

    ``default`` is returned instead when the lookup yields ``None``. Any
    other value, including an empty string, is returned as found.
    """
    space = node.get("{%s}space" % XML_NS)
    return default if space is None else space
75
def setXMLspace(node, value):
    """Store ``value`` as the xml:space attribute of ``node``.

    The attribute is addressed by its Clark-notation name.
    """
    space_attribute = "{%s}space" % XML_NS
    node.set(space_attribute, value)
79
def namespaced(namespace, name):
    """Return ``name`` qualified with ``namespace`` in Clark notation.

    For example, with the XLIFF 1.1 namespace,
    namespaced("urn:oasis:names:tc:xliff:document:1.1", "source") gives::

        {urn:oasis:names:tc:xliff:document:1.1}source

    This is the form lxml expects for namespaced tag and attribute names.
    A false ``namespace`` (``None`` or an empty string) leaves ``name``
    unqualified.
    """
    # Guard clause: nothing to qualify without a namespace.
    if not namespace:
        return name
    return "{%s}%s" % (namespace, name)
# Matches a run of one or more newline, carriage return, tab or space
# characters.
MULTIWHITESPACE_PATTERN = r"[\n\r\t ]+"
# Compiled once at import time for reuse by normalize_space().
# NOTE(review): re.MULTILINE only changes how ^ and $ match; this pattern
# uses neither, so the flag has no effect here.
MULTIWHITESPACE_RE = re.compile(MULTIWHITESPACE_PATTERN, re.MULTILINE)
def normalize_space(text):
    """Normalize ``text`` for the implementation of xml:space="default".

    Every run of newlines, carriage returns, tabs and spaces is collapsed
    into a single space. Leading and trailing whitespace is collapsed the
    same way, not stripped.
    """
    return MULTIWHITESPACE_RE.sub(u" ", text)
99
def normalize_xml_space(node, xml_space, remove_start=False):
    """Normalize whitespace in ``node`` and its descendants, in place.

    The xml:space attribute found on ``node`` wins; ``xml_space`` is the
    inherited value used when the node does not set one. If the effective
    value is 'preserve', neither the node nor anything below it (nor the
    node's tail) is touched.

    ``node`` is the element to normalize; its ``text``, its ``tail`` and
    all of its children are rewritten. ``xml_space`` is the xml:space value
    inherited from the context of ``node``. ``remove_start`` should be true
    when the text immediately preceding this node's content already ends in
    a space, so a leading space in the node's text is redundant and gets
    removed.

    Returns ``None``; the tree is modified in place.
    """
    xml_space = getXMLspace(node) or xml_space
    if xml_space == 'preserve':
        return

    if node.text:
        text = normalize_space(node.text)
        if remove_start and text.startswith(u" "):
            text = text.lstrip()
        if text:
            # Whatever follows this text is preceded by a space exactly when
            # the text we emit ends in one. If the text was stripped down to
            # nothing, the incoming flag still describes the preceding text.
            remove_start = text.endswith(u" ")
        if len(node) == 0:
            # Leaf element: trailing whitespace is not significant.
            text = text.rstrip()
        node.text = text

    if node.tail:
        node.tail = normalize_space(node.tail)

    for child in node:
        # Pass the inherited xml:space value and the flag as separate
        # arguments. The previous code passed the flag in the xml_space
        # position, so it never reached the children.
        normalize_xml_space(child, xml_space, remove_start)
        # The next sibling is preceded by this child's tail. Without a tail
        # we cannot tell how the child's own content ended, so we
        # conservatively keep any leading space of the next sibling.
        if child.tail:
            remove_start = child.tail.endswith(u" ")
        else:
            remove_start = False
120