XML writing tools for Python
我目前正在尝试ElementTree,它看起来还不错,可以转义HTML实体等等。我是不是错过了什么我从未听说过的、真正出色的工具?
这类似于我的实际操作:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
| import xml.etree.ElementTree as ET
root = ET.Element('html')
head = ET.SubElement(root,'head')
script = ET.SubElement(head,'script')
script.set('type','text/javascript')
script.text ="var a = 'I love á letters'"
body = ET.SubElement(root,'body')
h1 = ET.SubElement(body,'h1')
h1.text ="And I like the fact that 3 > 1"
tree = ET.ElementTree(root)
tree.write('foo.xhtml')
# more foo.xhtml
<html><head><script type="text/javascript">var a = 'I love &#225;
letters'</script></head><body><h1>And I like the fact that 3 &gt; 1</h1>
</body></html> |
另一种方法是使用lxml中的E Factory构建器(也可在Elementtree中使用)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
| >>> from lxml import etree
>>> from lxml.builder import E
>>> def CLASS(*args): # class is a reserved word in Python
... return {"class":' '.join(args)}
>>> html = page = (
... E.html( # create an Element called"html"
... E.head(
... E.title("This is a sample document")
... ),
... E.body(
... E.h1("Hello!", CLASS("title")),
... E.p("This is a paragraph with", E.b("bold")," text in it!"),
... E.p("This is another paragraph, with a","
",
... E.a("link", href="http://www.python.org"),"."),
... E.p("Here are some reserved characters: <spam&egg>."),
... etree.XML("<p>
And finally an embedded XHTML fragment.
</p>"),
... )
... )
... )
>>> print(etree.tostring(page, pretty_print=True))
<html>
  <head>
    <title>This is a sample document</title>
  </head>
  <body>
    <h1 class="title">Hello!</h1>
    <p>This is a paragraph with <b>bold</b> text in it!</p>
    <p>This is another paragraph, with a
      <a href="http://www.python.org">link</a>.</p>
    <p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
    <p>And finally an embedded XHTML fragment.</p>
  </body>
</html> |
还有SimpleXMLWriter可以选择,它是ElementTree工具包的一部分,接口非常简单。
这是一个例子:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| from elementtree.SimpleXMLWriter import XMLWriter
import sys
w = XMLWriter(sys.stdout)
html = w.start("html")
w.start("head")
w.element("title","my document")
w.element("meta", name="generator", value="my application 1.0")
w.end()
w.start("body")
w.element("h1","this is a heading")
w.element("p","this is a paragraph")
w.start("p")
w.data("this is")
w.element("b","bold")
w.data(" and")
w.element("i","italic")
w.data(".")
w.end("p")
w.close(html) |
我假设您实际上是在创建XML DOM树,因为您要验证此文件中包含的内容是有效的XML,否则您将只向文件中写入静态字符串。如果验证您的输出确实是您的目标,那么我建议
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
| from xml.dom.minidom import parseString
doc = parseString("""<html>
<head>
<script type="text/javascript">
var a = 'I love á letters'
</head>
<body>
And I like the fact that 3 > 1
</body>
</html>""")
with open("foo.xhtml","w") as f:
f.write( doc.toxml() ) |
这使您可以只编写要输出的XML,验证其正确性(因为parseString如果无效则将引发异常),并使您的代码看起来更好。
大概您并不是每次都写出完全相同的静态XML,而是想要做一些替换。在这种情况下,我会在XML字符串里写上类似下面这样的占位行:
<script type="text/javascript">var a = '%(message)s'</script>
然后使用%运算符进行替换,例如
1
| </html>""" % {"message":"I love á letters"}) |
https://github.com/galvez/xmlwitch:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| import xmlwitch
xml = xmlwitch.Builder(version='1.0', encoding='utf-8')
with xml.feed(xmlns='http://www.w3.org/2005/Atom'):
xml.title('Example Feed')
xml.updated('2003-12-13T18:30:02Z')
with xml.author:
xml.name('John Doe')
xml.id('urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6')
with xml.entry:
xml.title('Atom-Powered Robots Run Amok')
xml.id('urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a')
xml.updated('2003-12-13T18:30:02Z')
xml.summary('Some text.')
print(xml) |
您实际上不是想要这样的东西吗:
1 2
| html(head(script(type='text/javascript', content='var a = ...')),
body(h1('And I like the fact that 3 < 1'), p('just some paragraph')) |
我想我在某处看到过类似的东西。如果真有这样的工具就太好了。
编辑:实际上,我今天去写了一个库来做到这一点:magictree
您可以像这样使用它:
1 2 3 4 5 6 7 8 9 10 11 12
| from magictree import html, head, script, body, h1, p
root = html(
head(
script('''var a = 'I love á letters''',
type='text/javascript')),
body(
h1('And I like the fact that 3 > 1')))
# root is a plain Element object, like those created with ET.Element...
# so you can write it out using ElementTree :)
tree = ET.ElementTree(root)
tree.write('foo.xhtml') |
magictree中的神奇之处在于导入的工作方式:在需要时创建Element工厂。看一下源代码,它基于另一个StackOverflow问题的答案。
对于现在遇到此问题的任何人:Python标准库中其实就隐藏着一种现成的方法,即xml.sax.saxutils.XMLGenerator。这是一个实际的例子:
1 2 3 4 5 6 7 8 9 10 11
| >>> from xml.sax.saxutils import XMLGenerator
>>> import StringIO
>>> w = XMLGenerator(out, 'utf-8')
>>> w.startDocument()
>>> w.startElement("test", {'bar': 'baz'})
>>> w.characters("Foo")
>>> w.endElement("test")
>>> w.endDocument()
>>> print out.getvalue()
<?xml version="1.0" encoding="utf-8"?>
<test bar="baz">Foo</test> |
我最终使用saxutils.escape(str)生成有效的XML字符串,然后使用Eli的方法对其进行验证,以确保我不会错过任何标签
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
| from xml.sax import saxutils
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError
xml = '''<?xml version="1.0" encoding="%s"?>
<contents title="%s" crawl_date="%s" in_text_date="%s"
url="%s">
<main_post>%s</main_post>
</contents>''' %
(self.encoding, saxutils.escape(title), saxutils.escape(time),
saxutils.escape(date), saxutils.escape(url), saxutils.escape(contents))
try:
minidoc = parseString(xml)
catch ExpatError:
print"Invalid xml" |
可以试试http://uche.ogbuji.net/tech/4suite/amara。它相当完整,提供了一组简单直接的访问工具,并且具备正常的Unicode支持等。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
| #
#Output the XML entry
#
def genFileOLD(out,label,term,idval):
filename=entryTime() +".html"
writer=MarkupWriter(out, indent=u"yes")
writer.startDocument()
#Test element and attribute writing
ans=namespace=u'http://www.w3.org/2005/Atom'
xns=namespace=u'http://www.w3.org/1999/xhtml'
writer.startElement(u'entry',
ans,
extraNss={u'x':u'http://www.w3.org/1999/xhtml' ,
u'dc':u'http://purl.org/dc/elements/1.1'})
#u'a':u'http://www.w3.org/2005/Atom',
#writer.attribute(u'xml:lang',unicode("en-UK"))
writer.simpleElement(u'title',ans,content=unicode(label))
#writer.simpleElement(u'a:subtitle',ans,content=u' ')
id=unicode("http://www.dpawson.co.uk/nodesets/"+afn.split(".")[0])
writer.simpleElement(u'id',ans,content=id)
writer.simpleElement(u'updated',ans,content=unicode(dtime()))
writer.startElement(u'author',ans)
writer.simpleElement(u'name',ans,content=u'Dave ')
writer.simpleElement(u'uri',ans,
content=u'http://www.dpawson.co.uk/nodesets/'+afn+".xml")
writer.endElement(u'author')
writer.startElement(u'category', ans)
if (prompt):
label=unicode(raw_input("Enter label"))
writer.attribute(u'label',unicode(label))
if (prompt):
term = unicode(raw_input("Enter term to use"))
writer.attribute(u'term', unicode(term))
writer.endElement(u'category')
writer.simpleElement(u'rights',ans,content=u'\u00A9 Dave 2005-2008')
writer.startElement(u'link',ans)
writer.attribute(u'href',
unicode("http://www.dpawson.co.uk/nodesets/entries/"+afn+".html"))
writer.attribute(u'rel',unicode("alternate"))
writer.endElement(u'link')
writer.startElement(u'published', ans)
dt=dtime()
dtu=unicode(dt)
writer.text(dtu)
writer.endElement(u'published')
writer.simpleElement(u'summary',ans,content=unicode(label))
writer.startElement(u'content',ans)
writer.attribute(u'type',unicode("xhtml"))
writer.startElement(u'div',xns)
writer.simpleElement(u'h3',xns,content=unicode(label))
writer.endElement(u'div')
writer.endElement(u'content')
writer.endElement(u'entry') |
|