Package translate :: Package misc :: Module xml_helpers
[hide private]
[frames | no frames]

Source Code for Module translate.misc.xml_helpers

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Helper functions for working with XML.""" 
 22   
 23  import re 
 24   
 25  from lxml import etree 
 26   
 27  # XPath expressions compiled once at import time; each is called with a node 
 28  xml_preserve_ancestors = etree.XPath("ancestor-or-self::*[attribute::xml:space='preserve']") 
 29  """All ancestors with xml:space='preserve'""" 
 30   
 31  xml_space_ancestors = etree.XPath("ancestor-or-self::*/attribute::xml:space") 
 32  """All xml:space attributes in the ancestors""" 
 33   
 34  string_xpath = etree.XPath("string()") 
 35  """Return a non-normalized string in the node subtree""" 
 36   
 37  string_xpath_normalized = etree.XPath("normalize-space()") 
 38  """Return a (space) normalized string in the node subtree""" 
 39   
 40   
def getText(node, xml_space="preserve"):
    """Return the plain text contained in ``node`` and its subtree.

    The xml:space attribute of ``node`` decides how whitespace is treated;
    ``xml_space`` is the fallback used when the node carries no such
    attribute.  An effective value of "default" yields the
    whitespace-normalized string, any other value yields the string as is.
    """
    effective_space = getXMLspace(node, xml_space)
    # Both evaluators are precompiled XPath objects, specific to lxml.etree.
    if effective_space == "default":
        extract = string_xpath_normalized
    else:
        extract = string_xpath
    return unicode(extract(node))
# Possible alternative for getText(): if we want to normalise space and only
# preserve it when the directive xml:space="preserve" is given in node or in
# parents, consider this code:
#xml_preserves = xml_preserve_ancestors(node)
#if xml_preserves and xml_preserves[-1] == "preserve":
#    return unicode(string_xpath(node))  # specific to lxml.etree
#else:
#    return unicode(string_xpath_normalized(node))  # specific to lxml.etree


# Namespace URI used to build the Clark-notation names of the xml:lang and
# xml:space attributes.
XML_NS = "http://www.w3.org/XML/1998/namespace"
def getXMLlang(node):
    """Return the value of the xml:lang attribute of ``node``.

    The attribute is looked up under its namespace-qualified name and the
    result of ``node.get`` is returned unchanged.
    """
    lang_attribute = "{%s}lang" % XML_NS
    return node.get(lang_attribute)
67 68
def setXMLlang(node, lang):
    """Store ``lang`` as the xml:lang attribute of ``node``."""
    lang_attribute = "{%s}lang" % XML_NS
    node.set(lang_attribute, lang)
72 73
def getXMLspace(node, default=None):
    """Return the xml:space attribute of ``node``.

    ``default`` is returned instead when looking the attribute up on the
    node yields ``None``.
    """
    space = node.get("{%s}space" % XML_NS)
    return default if space is None else space
80 81
def setXMLspace(node, value):
    """Store ``value`` as the xml:space attribute of ``node``."""
    space_attribute = "{%s}space" % XML_NS
    node.set(space_attribute, value)
85 86
def namespaced(namespace, name):
    """Return ``name`` in Clark notation within the given namespace.

    For example, with the XLIFF 1.1 namespace,
    ``namespaced("urn:oasis:names:tc:xliff:document:1.1", "source")``
    returns::

        {urn:oasis:names:tc:xliff:document:1.1}source

    This is the form needed throughout lxml.  When ``namespace`` is empty
    or ``None``, ``name`` is returned unchanged.
    """
    if not namespace:
        return name
    return "{%s}%s" % (namespace, name)
# One or more consecutive newline, carriage return, tab or space characters.
MULTIWHITESPACE_PATTERN = r"[\n\r\t ]+"
MULTIWHITESPACE_RE = re.compile(MULTIWHITESPACE_PATTERN, re.MULTILINE)


def normalize_space(text):
    """Normalize the given text for implementation of xml:space="default".

    Every run of newline, carriage return, tab and space characters is
    replaced by a single space.  Leading and trailing whitespace is
    collapsed the same way, not removed.
    """
    return MULTIWHITESPACE_RE.sub(u" ", text)
107 108
def normalize_xml_space(node, xml_space, remove_start=False):
    """Normalize whitespace in ``node`` and its descendants, in place.

    The xml:space attribute set on a node takes precedence; ``xml_space`` is
    the inherited value that applies when the node does not set one.  A node
    whose effective value is 'preserve' is left completely untouched: its
    text, its tail and its whole subtree are skipped.

    For every other node, runs of whitespace in the text and in the tail are
    collapsed to a single space, and the text of a node without children
    has its trailing whitespace removed.

    :param node: element to normalize; modified in place
    :param xml_space: xml:space value to assume when ``node`` has none
    :param remove_start: True when the text immediately before this node's
        text ended with a space, in which case a leading space in the
        node's text is a duplicate and is removed
    :return: None
    """
    xml_space = getXMLspace(node) or xml_space
    if xml_space == 'preserve':
        return
    if node.text:
        node.text = normalize_space(node.text)
        # node.text is non-empty here: normalize_space never removes a
        # whitespace run, it only shrinks it to one space.
        if remove_start and node.text[0] == u" ":
            node.text = node.text.lstrip()
        # From now on the flag describes what the first child sees, i.e.
        # whether this node's own text ends with a space.
        remove_start = node.text.endswith(u" ")
        if len(node) == 0:
            node.text = node.text.rstrip()
    if node.tail:
        node.tail = normalize_space(node.tail)

    for child in node:
        # Pass the inherited xml:space value and the flag in their own
        # positions; the flag used to be passed as xml_space, so neither
        # value reached the children.
        normalize_xml_space(child, xml_space, remove_start)
        # The next sibling follows this child's tail rather than our text.
        # With no tail we cannot cheaply tell what precedes it, so nothing
        # is stripped.
        remove_start = bool(child.tail) and child.tail.endswith(u" ")
129