Friday, July 30, 2010

使用 Python 的 lxml 库解析 HTML

[cc lang="python"]import urllib2
import lxml.html as H

def getjarinfo(url):
    """Fetch an HTML page, print its download entries and return a description.

    The page at *url* is downloaded and parsed with ``lxml.html``.  Three
    node lists are selected by XPath and walked in lockstep; for every
    position the link's ``href`` and the text of the two other nodes are
    printed, one value per line.

    Args:
        url: Address of the page to download and parse.

    Returns:
        The text content of the element that directly follows the first
        ``div`` matched by the label XPath at the end of this function, or
        ``None`` when no such ``div`` exists or it has no next sibling.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    doc = H.document_fromstring(html)

    # NOTE(review): the original names (pinpai / jixing) suggest "brand" and
    # "model"; the ids in the XPaths look like placeholders -- confirm
    # against the real page markup.
    brand_cells = doc.xpath("//td[@id='music']")
    model_divs = doc.xpath("//div[@id='game']")
    jar_links = doc.xpath("//table[@id='download']//tr[2]/td[1]/a[1]")

    # zip() stops at the shortest list, so a page where the three selections
    # differ in length no longer raises IndexError half-way through.
    for link, brand, model in zip(jar_links, brand_cells, model_divs):
        print(link.get('href'))
        print(brand.text_content())
        print(model.text_content())

    # Locate the label div by its exact text, then read the element after it.
    labels = doc.xpath(u"//div[text()='%s']" % u"游戏")
    if not labels:
        return None
    following = labels[0].getnext()
    if following is None:
        return None
    return following.text_content()

if __name__ == '__main__':
    # Script entry point: run the scraper once against a sample address.
    url = 'http://google.com/'
    getjarinfo(url)
[/cc]

No comments:

Post a Comment