1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42 """
43 Provides general XML-related functionality.
44
45 What I'm trying to do here is abstract much of the functionality that directly
46 accesses the DOM tree. This is not so much to "protect" the other code from
47 the DOM, but to standardize the way it's used. It will also help extension
48 authors write code that easily looks more like the rest of Cedar Backup.
49
50 @sort: createInputDom, createOutputDom, serializeDom, isElement, readChildren,
51 readFirstChild, readStringList, readString, readInteger, readBoolean,
52 addContainerNode, addStringNode, addIntegerNode, addBooleanNode,
53 TRUE_BOOLEAN_VALUES, FALSE_BOOLEAN_VALUES, VALID_BOOLEAN_VALUES
54
55 @var TRUE_BOOLEAN_VALUES: List of boolean values in XML representing C{True}.
56 @var FALSE_BOOLEAN_VALUES: List of boolean values in XML representing C{False}.
57 @var VALID_BOOLEAN_VALUES: List of valid boolean values in XML.
58
59 @author: Kenneth J. Pronovici <pronovic@ieee.org>
60 """
61
62
63
64
65
66
67
68 import sys
69 import re
70 import logging
71 import codecs
72 from types import UnicodeType
73 from StringIO import StringIO
74
75
76 from xml.parsers.expat import ExpatError
77 from xml.dom.minidom import Node
78 from xml.dom.minidom import getDOMImplementation
79 from xml.dom.minidom import parseString
80
81
82
83
84
85
86 logger = logging.getLogger("CedarBackup2.log.xml")
87
88 TRUE_BOOLEAN_VALUES = [ "Y", "y", ]
89 FALSE_BOOLEAN_VALUES = [ "N", "n", ]
90 VALID_BOOLEAN_VALUES = TRUE_BOOLEAN_VALUES + FALSE_BOOLEAN_VALUES
91
92
93
94
95
96
110
112 """
113 Creates a DOM tree used for writing an XML document.
114 @param name: Base name of the document (root node name).
115 @return: Tuple (xmlDom, parentNode) for the new document
116 """
117 impl = getDOMImplementation()
118 xmlDom = impl.createDocument(None, name, None)
119 return (xmlDom, xmlDom.documentElement)
120
121
122
123
124
125
127 """
128 Returns True or False depending on whether the XML node is an element node.
129 """
130 return node.nodeType == Node.ELEMENT_NODE
131
133 """
134 Returns a list of nodes with a given name immediately beneath the
135 parent.
136
137 By "immediately beneath" the parent, we mean from among nodes that are
138 direct children of the passed-in parent node.
139
140 Underneath, we use the Python C{getElementsByTagName} method, which is
141 pretty cool, but which (surprisingly?) returns a list of all children
142 with a given name below the parent, at any level. We just prune that
143 list to include only children whose C{parentNode} matches the passed-in
144 parent.
145
146 @param parent: Parent node to search beneath.
147 @param name: Name of nodes to search for.
148
149 @return: List of child nodes with correct parent, or an empty list if
150 no matching nodes are found.
151 """
152 lst = []
153 if parent is not None:
154 result = parent.getElementsByTagName(name)
155 for entry in result:
156 if entry.parentNode is parent:
157 lst.append(entry)
158 return lst
159
161 """
162 Returns the first child with a given name immediately beneath the parent.
163
164 By "immediately beneath" the parent, we mean from among nodes that are
165 direct children of the passed-in parent node.
166
167 @param parent: Parent node to search beneath.
168 @param name: Name of node to search for.
169
170 @return: First properly-named child of parent, or C{None} if no matching nodes are found.
171 """
172 result = readChildren(parent, name)
173 if result is None or result == []:
174 return None
175 return result[0]
176
178 """
179 Returns a list of the string contents associated with nodes with a given
180 name immediately beneath the parent.
181
182 By "immediately beneath" the parent, we mean from among nodes that are
183 direct children of the passed-in parent node.
184
185 First, we find all of the nodes using L{readChildren}, and then we
186 retrieve the "string contents" of each of those nodes. The returned list
187 has one entry per matching node. We assume that string contents of a
188 given node belong to the first C{TEXT_NODE} child of that node. Nodes
189 which have no C{TEXT_NODE} children are not represented in the returned
190 list.
191
192 @param parent: Parent node to search beneath.
193 @param name: Name of node to search for.
194
195 @return: List of strings as described above, or C{None} if no matching nodes are found.
196 """
197 lst = []
198 result = readChildren(parent, name)
199 for entry in result:
200 if entry.hasChildNodes():
201 for child in entry.childNodes:
202 if child.nodeType == Node.TEXT_NODE:
203 lst.append(child.nodeValue)
204 break
205 if lst == []:
206 lst = None
207 return lst
208
210 """
211 Returns string contents of the first child with a given name immediately
212 beneath the parent.
213
214 By "immediately beneath" the parent, we mean from among nodes that are
215 direct children of the passed-in parent node. We assume that string
216 contents of a given node belong to the first C{TEXT_NODE} child of that
217 node.
218
219 @param parent: Parent node to search beneath.
220 @param name: Name of node to search for.
221
222 @return: String contents of node or C{None} if no matching nodes are found.
223 """
224 result = readStringList(parent, name)
225 if result is None:
226 return None
227 return result[0]
228
230 """
231 Returns integer contents of the first child with a given name immediately
232 beneath the parent.
233
234 By "immediately beneath" the parent, we mean from among nodes that are
235 direct children of the passed-in parent node.
236
237 @param parent: Parent node to search beneath.
238 @param name: Name of node to search for.
239
240 @return: Integer contents of node or C{None} if no matching nodes are found.
241 @raise ValueError: If the string at the location can't be converted to an integer.
242 """
243 result = readString(parent, name)
244 if result is None:
245 return None
246 else:
247 return int(result)
248
250 """
251 Returns float contents of the first child with a given name immediately
252 beneath the parent.
253
254 By "immediately beneath" the parent, we mean from among nodes that are
255 direct children of the passed-in parent node.
256
257 @param parent: Parent node to search beneath.
258 @param name: Name of node to search for.
259
260 @return: Float contents of node or C{None} if no matching nodes are found.
261 @raise ValueError: If the string at the location can't be converted to a
262 float value.
263 """
264 result = readString(parent, name)
265 if result is None:
266 return None
267 else:
268 return float(result)
269
271 """
272 Returns boolean contents of the first child with a given name immediately
273 beneath the parent.
274
275 By "immediately beneath" the parent, we mean from among nodes that are
276 direct children of the passed-in parent node.
277
278 The string value of the node must be one of the values in L{VALID_BOOLEAN_VALUES}.
279
280 @param parent: Parent node to search beneath.
281 @param name: Name of node to search for.
282
283 @return: Boolean contents of node or C{None} if no matching nodes are found.
284 @raise ValueError: If the string at the location can't be converted to a boolean.
285 """
286 result = readString(parent, name)
287 if result is None:
288 return None
289 else:
290 if result in TRUE_BOOLEAN_VALUES:
291 return True
292 elif result in FALSE_BOOLEAN_VALUES:
293 return False
294 else:
295 raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES)
296
297
298
299
300
301
303 """
304 Adds a container node as the next child of a parent node.
305
306 @param xmlDom: DOM tree as from C{impl.createDocument()}.
307 @param parentNode: Parent node to create child for.
308 @param nodeName: Name of the new container node.
309
310 @return: Reference to the newly-created node.
311 """
312 containerNode = xmlDom.createElement(nodeName)
313 parentNode.appendChild(containerNode)
314 return containerNode
315
317 """
318 Adds a text node as the next child of a parent, to contain a string.
319
320 If the C{nodeValue} is None, then the node will be created, but will be
321 empty (i.e. will contain no text node child).
322
323 @param xmlDom: DOM tree as from C{impl.createDocument()}.
324 @param parentNode: Parent node to create child for.
325 @param nodeName: Name of the new container node.
326 @param nodeValue: The value to put into the node.
327
328 @return: Reference to the newly-created node.
329 """
330 containerNode = addContainerNode(xmlDom, parentNode, nodeName)
331 if nodeValue is not None:
332 textNode = xmlDom.createTextNode(nodeValue)
333 containerNode.appendChild(textNode)
334 return containerNode
335
337 """
338 Adds a text node as the next child of a parent, to contain an integer.
339
340 If the C{nodeValue} is None, then the node will be created, but will be
341 empty (i.e. will contain no text node child).
342
343 The integer will be converted to a string using "%d". The result will be
344 added to the document via L{addStringNode}.
345
346 @param xmlDom: DOM tree as from C{impl.createDocument()}.
347 @param parentNode: Parent node to create child for.
348 @param nodeName: Name of the new container node.
349 @param nodeValue: The value to put into the node.
350
351 @return: Reference to the newly-created node.
352 """
353 if nodeValue is None:
354 return addStringNode(xmlDom, parentNode, nodeName, None)
355 else:
356 return addStringNode(xmlDom, parentNode, nodeName, "%d" % nodeValue)
357
359 """
360 Adds a text node as the next child of a parent, to contain a boolean.
361
362 If the C{nodeValue} is None, then the node will be created, but will be
363 empty (i.e. will contain no text node child).
364
365 Boolean C{True}, or anything else interpreted as C{True} by Python, will
366 be converted to a string "Y". Anything else will be converted to a
367 string "N". The result is added to the document via L{addStringNode}.
368
369 @param xmlDom: DOM tree as from C{impl.createDocument()}.
370 @param parentNode: Parent node to create child for.
371 @param nodeName: Name of the new container node.
372 @param nodeValue: The value to put into the node.
373
374 @return: Reference to the newly-created node.
375 """
376 if nodeValue is None:
377 return addStringNode(xmlDom, parentNode, nodeName, None)
378 else:
379 if nodeValue:
380 return addStringNode(xmlDom, parentNode, nodeName, "Y")
381 else:
382 return addStringNode(xmlDom, parentNode, nodeName, "N")
383
384
385
386
387
388
390 """
391 Serializes a DOM tree and returns the result in a string.
392 @param xmlDom: XML DOM tree to serialize
393 @param indent: Number of spaces to indent, as an integer
394 @return: String form of DOM tree, pretty-printed.
395 """
396 xmlBuffer = StringIO()
397 serializer = Serializer(xmlBuffer, "UTF-8", indent=indent)
398 serializer.serialize(xmlDom)
399 xmlData = xmlBuffer.getvalue()
400 xmlBuffer.close()
401 return xmlData
402
404
405 """
406 XML serializer class.
407
408 This is a customized serializer that I hacked together based on what I found
409 in the PyXML distribution. Basically, around release 2.7.0, the only reason
410 I still had around a dependency on PyXML was for the PrettyPrint
411 functionality, and that seemed pointless. So, I stripped the PrettyPrint
412 code out of PyXML and hacked bits of it off until it did just what I needed
413 and no more.
414
415 This code started out being called PrintVisitor, but I decided it makes more
416 sense just calling it a serializer. I've made nearly all of the methods
417 private, and I've added a new high-level serialize() method rather than
418 having clients call C{visit()}.
419
420 Anyway, as a consequence of my hacking with it, this can't quite be called a
421 complete XML serializer any more. I ripped out support for HTML and XHTML,
422 and there is also no longer any support for namespaces (which I took out
423 because this dragged along a lot of extra code, and Cedar Backup doesn't use
424 namespaces). However, everything else should pretty much work as expected.
425
426 @copyright: This code, prior to customization, was part of the PyXML
427 codebase, and before that was part of the 4DOM suite developed by
428 Fourthought, Inc. In its original form, it was Copyright (c) 2000
429 Fourthought Inc, USA; All Rights Reserved.
430 """
431
432 - def __init__(self, stream=sys.stdout, encoding="UTF-8", indent=3):
433 """
434 Initialize a serializer.
435 @param stream: Stream to write output to.
436 @param encoding: Output encoding.
437 @param indent: Number of spaces to indent, as an integer
438 """
439 self.stream = stream
440 self.encoding = encoding
441 self._indent = indent * " "
442 self._depth = 0
443 self._inText = 0
444
446 """
447 Serialize the passed-in XML document.
448 @param xmlDom: XML DOM tree to serialize
449 @raise ValueError: If there's an unknown node type in the document.
450 """
451 self._visit(xmlDom)
452 self.stream.write("\n")
453
458
460 if not self._inText and self._indent:
461 self._write('\n' + self._indent*self._depth)
462 return
463
465 """
466 @raise ValueError: If there's an unknown node type in the document.
467 """
468 if node.nodeType == Node.ELEMENT_NODE:
469 return self._visitElement(node)
470
471 elif node.nodeType == Node.ATTRIBUTE_NODE:
472 return self._visitAttr(node)
473
474 elif node.nodeType == Node.TEXT_NODE:
475 return self._visitText(node)
476
477 elif node.nodeType == Node.CDATA_SECTION_NODE:
478 return self._visitCDATASection(node)
479
480 elif node.nodeType == Node.ENTITY_REFERENCE_NODE:
481 return self._visitEntityReference(node)
482
483 elif node.nodeType == Node.ENTITY_NODE:
484 return self._visitEntity(node)
485
486 elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
487 return self._visitProcessingInstruction(node)
488
489 elif node.nodeType == Node.COMMENT_NODE:
490 return self._visitComment(node)
491
492 elif node.nodeType == Node.DOCUMENT_NODE:
493 return self._visitDocument(node)
494
495 elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
496 return self._visitDocumentType(node)
497
498 elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
499 return self._visitDocumentFragment(node)
500
501 elif node.nodeType == Node.NOTATION_NODE:
502 return self._visitNotation(node)
503
504
505 raise ValueError("Unknown node type: %s" % repr(node))
506
508 for curr in node:
509 curr is not exclude and self._visit(curr)
510 return
511
513 for item in node.values():
514 self._visit(item)
515 return
516
524
526 self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8'))
527 self._inText = 0
528 return
529
535
539
541 self._tryIndent()
542 self._write('<%s' % node.tagName)
543 for attr in node.attributes.values():
544 self._visitAttr(attr)
545 if len(node.childNodes):
546 self._write('>')
547 self._depth = self._depth + 1
548 self._visitNodeList(node.childNodes)
549 self._depth = self._depth - 1
550 not (self._inText) and self._tryIndent()
551 self._write('</%s>' % node.tagName)
552 else:
553 self._write('/>')
554 self._inText = 0
555 return
556
557 - def _visitText(self, node):
558 text = node.data
559 if self._indent:
560 text.strip()
561 if text:
562 text = _translateCDATA(text, self.encoding)
563 self.stream.write(text)
564 self._inText = 1
565 return
566
568 if not doctype.systemId and not doctype.publicId: return
569 self._tryIndent()
570 self._write('<!DOCTYPE %s' % doctype.name)
571 if doctype.systemId and '"' in doctype.systemId:
572 system = "'%s'" % doctype.systemId
573 else:
574 system = '"%s"' % doctype.systemId
575 if doctype.publicId and '"' in doctype.publicId:
576
577
578
579 public = "'%s'" % doctype.publicId
580 else:
581 public = '"%s"' % doctype.publicId
582 if doctype.publicId and doctype.systemId:
583 self._write(' PUBLIC %s %s' % (public, system))
584 elif doctype.systemId:
585 self._write(' SYSTEM %s' % system)
586 if doctype.entities or doctype.notations:
587 self._write(' [')
588 self._depth = self._depth + 1
589 self._visitNamedNodeMap(doctype.entities)
590 self._visitNamedNodeMap(doctype.notations)
591 self._depth = self._depth - 1
592 self._tryIndent()
593 self._write(']>')
594 else:
595 self._write('>')
596 self._inText = 0
597 return
598
600 """Visited from a NamedNodeMap in DocumentType"""
601 self._tryIndent()
602 self._write('<!ENTITY %s' % (node.nodeName))
603 node.publicId and self._write(' PUBLIC %s' % node.publicId)
604 node.systemId and self._write(' SYSTEM %s' % node.systemId)
605 node.notationName and self._write(' NDATA %s' % node.notationName)
606 self._write('>')
607 return
608
610 """Visited from a NamedNodeMap in DocumentType"""
611 self._tryIndent()
612 self._write('<!NOTATION %s' % node.nodeName)
613 node.publicId and self._write(' PUBLIC %s' % node.publicId)
614 node.systemId and self._write(' SYSTEM %s' % node.systemId)
615 self._write('>')
616 return
617
619 self._tryIndent()
620 self._write('<![CDATA[%s]]>' % (node.data))
621 self._inText = 0
622 return
623
629
631 self._write('&%s;' % node.nodeName)
632 self._inText = 1
633 return
634
636 self._tryIndent()
637 self._write('<?%s %s?>' % (node.target, node.data))
638 self._inText = 0
639 return
640
def _encodeText(text, encoding):
    """
    Encodes text using the named codec and returns the encoded result.

    If C{text} is not already a unicode (text) string, it is first decoded
    under the assumption that it contains UTF-8 data.  Subclasses of the text
    type are accepted as-is rather than being decoded again.

    @param text: Text to encode, either a text string or UTF-8 encoded bytes.
    @param encoding: Name of the codec to encode with, as for C{codecs.lookup}.

    @return: Encoded form of the text, as produced by the codec's encoder.
    @raise LookupError: If the encoding is not known to C{codecs.lookup}.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc.  In its original form, it was attributed to Martin v.
    Löwis and was Copyright (c) 2000 Fourthought Inc, USA; All Rights Reserved.
    """
    # The first item of the codec lookup result is the encode function.
    encoder = codecs.lookup(encoding)[0]
    try:
        textType = unicode   # text type where a separate unicode type exists
    except NameError:
        textType = str       # otherwise str is already the text type
    # isinstance (not an exact type match) so that subclasses of the text type
    # are not pushed through a decode step, which would fail for them.
    if not isinstance(text, textType):
        text = textType(text, "utf-8")
    return encoder(text)[0]
652
654 """
655 Handles normalization and some intelligence about quoting.
656
657 @copyright: This code, prior to customization, was part of the PyXML
658 codebase, and before that was part of the 4DOM suite developed by
659 Fourthought, Inc. In its original form, it was Copyright (c) 2000
660 Fourthought Inc, USA; All Rights Reserved.
661 """
662 if not characters:
663 return '', "'"
664 if "'" in characters:
665 delimiter = '"'
666 new_chars = re.sub('"', '"', characters)
667 else:
668 delimiter = "'"
669 new_chars = re.sub("'", ''', characters)
670
671
672
673 if "\n" in characters:
674 new_chars = re.sub('\n', ' ', new_chars)
675 return new_chars, delimiter
676
677
def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
    """
    Escapes character data for inclusion in XML output and encodes it.

    Unless C{markupSafe} is set, every C{&} is replaced with C{&amp;}, every
    C{<} with C{&lt;}, and every C{]]>} sequence with C{]]&gt;}.  If the
    previously-written characters ended with C{]]} and this chunk starts with
    C{>}, that leading C{>} is escaped too, so that a C{]]>} sequence is never
    formed across two chunks.  Afterwards, characters matching the illegal
    character pattern are replaced with numeric character references, and the
    result is encoded via C{_encodeText}.

    @param characters: Character data to translate.
    @param encoding: Encoding passed along to C{_encodeText}.
    @param prev_chars: Characters written just before this chunk, if any.
    @param markupSafe: If true, the markup characters are left alone.

    @return: Translated and encoded data, or an empty string if there was
    nothing to translate.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc.  In its original form, it was Copyright (c) 2000
    Fourthought Inc, USA; All Rights Reserved.
    """
    CDATA_CHAR_PATTERN = re.compile('[&<]|]]>')
    # Each markup-significant sequence maps to its escaped form; mapping them
    # to themselves would leave the output unescaped.
    CHAR_TO_ENTITY = { '&': '&amp;', '<': '&lt;', ']]>': ']]&gt;', }
    ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
    ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
    XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s'%(ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))
    if not characters:
        return ''
    if markupSafe:
        new_string = characters
    else:
        new_string = CDATA_CHAR_PATTERN.sub(lambda m: CHAR_TO_ENTITY[m.group()], characters)
        if prev_chars[-2:] == ']]' and characters[0] == '>':
            # A lone leading '>' is not touched by the pattern above, so the
            # first character of new_string is still that '>'.
            new_string = '&gt;' + new_string[1:]
    # NOTE(review): ord() takes a single character, so a match of the
    # three-character ILLEGAL_HIGH_CHARS alternative would raise TypeError
    # here -- confirm whether that alternative can match in practice.
    new_string = XML_ILLEGAL_CHAR_PATTERN.sub(lambda m: '&#%i;' % ord(m.group()), new_string)
    return _encodeText(new_string, encoding)
708