Sunday, December 24, 2006

用正则表达式抓取网页


import re
import urllib
import os
import time
from urllib import urlretrieve
def _read_url(address):
    """Return the raw body served at *address*, always closing the response."""
    response = urllib.urlopen(address)
    try:
        return response.read()
    finally:
        response.close()


def _local_path(link):
    """Map an absolute URL to a relative ``host/path`` file name.

    Returns None when the link has no host part or contains a ``..``
    segment (which could otherwise place the file outside the output root).
    """
    scheme, rest = urllib.splittype(link)
    host, path = urllib.splithost(rest)
    if not host:
        return None
    # Drop the leading "/" -- os.path.join() throws away everything before an
    # absolute component, which would silently discard the host directory.
    path = (path or "").lstrip("/")
    # A URL that names a directory still needs a file name to be stored under.
    if not path or path.endswith("/"):
        path = path + "index.html"
    segments = path.split("/")
    if ".." in segments:
        return None
    return os.path.join(host, *segments)


def spider_url(url, pattern=None, dest="C:/temp/"):
    """Download every page linked from *url* into a directory tree.

    The page at *url* is fetched, links are extracted with a regular
    expression, and each link is saved as ``<dest>/<host>/<path>``.
    ``"saved <link>"`` is printed for every file written; a link that cannot
    be fetched or stored is reported with ``"skipped ..."`` and the loop
    continues with the next one.

    Parameters:
        url: address of the page whose links are harvested.
        pattern: regular expression used with ``findall`` on the page text.
            It must yield absolute URLs and contain at most one capture
            group. Defaults to a pattern matching absolute http(s) ``href``
            values.
        dest: root directory for the saved files (default ``"C:/temp/"``).

    Raises:
        IOError: if the page at *url* itself cannot be retrieved.
    """
    if pattern is None:
        # NOTE(review): the pattern in the original source was an empty
        # string (it appears to have been lost when the code was published),
        # which can never match a link. This default is a reconstruction --
        # confirm it matches the links you intend to follow.
        pattern = r'''(?i)href\s*=\s*["'](https?://[^"'\s>#]+)'''
    link_re = re.compile(pattern)

    page = _read_url(url)
    for link in link_re.findall(page):
        relative = _local_path(link)
        if relative is None:
            continue
        target = os.path.join(dest, relative)
        try:
            # Fetch first so a failed download leaves no empty file behind.
            body = _read_url(link)
            makedirs(os.path.dirname(target))
            with open(target, "wb") as out:
                out.write(body)
        except (IOError, OSError) as exc:
            # Best effort: one bad link must not abort the whole crawl.
            print("skipped %s: %s" % (link, exc))
            continue
        print("saved %s" % link)
def makedirs(dir):
    """Create directory *dir* together with any missing parent directories.

    Does nothing when *dir* is empty or is already a directory. When a plain
    file occupies the name *dir*, the file is moved to ``<dir>/index.html``
    and a directory is created in its place; errors during that swap are
    ignored. A trailing path separator on *dir* is tolerated.

    Raises:
        OSError: if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A file is in the way: turn it into the index page of the new
            # directory so that nothing already saved is lost.
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except OSError:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # "a/b/" splits into ("a/b", ""); create the real directory.
            makedirs(head)
        else:
            print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    os.mkdir(dir, 0o777)

No comments:

Post a Comment