Sunday, December 24, 2006

python抓取网页代码实例


# -*- encoding:UTF-8 -*-
'''
This is geturl.
Written by yuzebin : yuzebin@gmail.com
Important: this script runs under Cygwin or Linux; to run it on Windows
you need the Windows builds of curl and wget.
'''
# Overview of the classes defined in this module. "class" is a reserved word
# and cannot be used as a variable name, so the text is bound to "classes".
classes='''
CGetPage is charge of to get a url , it have three methods to get a page : urllib,curl and wget;
CParsePage is charge of to parse the page , and return the match;
CGetMatch is the forcad class to wrap the CGetPage and CParsePage.
'''
# Change log of this script, newest entry first.
history='''
2006.07.10 version 0.0.0.9 :
Publish this code to internet , ;-)

2006.06.27 version 0.0.0.7 :
refrectoring class CParsePage : return re.match only
refrectoring class CGetCount : rename to CGetMatch

2006.06.26 version 0.0.0.3 :
modify class CParsePage , return re.match

2006.06.22 version 0.0.0.2 :
add class CGetCount
this version is the first workable version.
add cnsky.

2006.06.21 initial version 0.0.0.1 :
add class CGetPage and CParsePage
cannot work ;-)
'''
import string,re,os,fnmatch,sys,copy,gzip,time,datetime,urllib
from types import *

# Global switch: when truthy, funcDebugInfo() prints its messages.
isDebugMode = False


def funcUrlRead(url):
    '''Fetch *url* and return the raw (undecoded) response body.

    The response handle is closed even when reading fails. The opener is
    looked up at call time so the function works on both Python 3
    (urllib.request.urlopen) and Python 2 (urllib.urlopen).
    '''
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

def funcOutputMessage(msg):
    '''Print *msg*, converted with str(), to standard output.'''
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(str(msg))

def funcDebugInfo(msg):
    '''Print *msg*, converted with str(), only while isDebugMode is set.'''
    if isDebugMode:
        # Parenthesised single argument: valid on both Python 2 and 3.
        print(str(msg))

class CGetPage:
    '''Retrieve a URL through urllib, curl or wget.

    Attributes:
        url:  address to fetch, or None when urlCheck() rejected it.
        page: raw content stored by getPage() / curlPage(); None until then.
        path: directory wgetPage() downloads into; None means the current
              working directory.
    '''

    def __init__(self, url):
        # Define every attribute up front so later reads never hit an
        # AttributeError on a half-initialised object.
        self.url = None
        self.page = None
        self.path = None
        if self.urlCheck(url):
            self.url = url

    def urlCheck(self, url):
        '''Report whether *url* is acceptable; currently always True.'''
        # TODO: check that the url is a valid url.
        return True

    def getPage(self):
        '''Fetch self.url with funcUrlRead() and store it in self.page.'''
        self.page = funcUrlRead(self.url)

    def curlPage(self):
        '''Fetch self.url with the external curl program into self.page.

        curl is run silently (-s) with an empty user agent (-A ""). It must
        be installed; OSError is raised when it cannot be started.
        '''
        import subprocess
        # An argument list (no shell) keeps the URL from being interpreted
        # as shell syntax.
        proc = subprocess.Popen(
            ["curl", "-A", "", "-s", self.url],
            stdout=subprocess.PIPE,
        )
        self.page = proc.communicate()[0]

    def setPath(self, path):
        '''Set the directory that wgetPage() downloads into.'''
        self.path = path

    def wgetPage(self):
        '''Download self.url into self.path with the external wget program.

        wget is run with -c (continue a partial download) and must be
        installed. The download directory is passed as the child's working
        directory, so the working directory of this process is not changed.
        '''
        import subprocess
        subprocess.call(["wget", "-c", self.url], cwd=self.path)

class CParsePage:
    '''Search a downloaded page for a regular expression.

    Attributes:
        rule:  compiled pattern (or a pattern string assigned by a caller);
               None when the rule given to the constructor did not compile.
        page:  page content to search; stored only when the rule compiled.
        match: result of the last parsePage() call; None before that or
               when nothing matched.
    '''

    def __init__(self, rule, page):
        # Define every attribute up front so a bad rule leaves a usable
        # (if empty) object instead of one with missing attributes.
        self.rule = None
        self.page = None
        self.match = None
        if self.ruleCompile(rule) is not False:
            self.page = page

    def ruleCompile(self, rule):
        '''Compile *rule* into self.rule.

        Returns True on success and False when *rule* is not a valid
        pattern; self.rule is left unchanged on failure.
        '''
        try:
            self.rule = re.compile(rule)
        except (re.error, TypeError):
            return False
        return True

    def parsePage(self):
        '''Search self.page for self.rule and store the result in self.match.

        Byte content is decoded with the charset reported by getCharset(),
        falling back to UTF-8 when none is reported; text that is already
        decoded is searched as is.
        '''
        text = self.page
        if isinstance(text, bytes):
            charset = self.getCharset(text) or 'utf-8'
            text = text.decode(charset)
        # re.search accepts a compiled pattern as well as a pattern string.
        self.match = re.search(self.rule, text)
        funcDebugInfo(type(self.match))

    def getCharset(self, string):
        '''Return the encoding name chardet detects for *string* (bytes).'''
        # Third-party dependency, only needed when bytes must be decoded.
        import chardet
        # TODO: automatically discern the charset.
        return chardet.detect(string)['encoding']

class CGetMatch:
    '''Facade over CGetPage and CParsePage: fetch a page, search it.

    Attributes:
        url, rule:  the most recently requested address and pattern.
        page:       content of the most recently fetched page.
        match:      result of the last getMatch() call; None before that.
        cgetpage:   the CGetPage instance used for downloading.
        cparsepage: the CParsePage instance used for searching.
    '''

    def __init__(self, url, rule):
        '''Create the helper objects; *url* is fetched immediately.'''
        self.url = url
        self.rule = rule
        self.match = None
        self.cgetpage = CGetPage(self.url)
        self.cgetpage.getPage()
        self.page = self.cgetpage.page
        self.cparsepage = CParsePage(self.rule, self.cgetpage.page)

    def getMatch(self, url, rule):
        '''Fetch *url*, search it for *rule* and store the result.

        The result is kept in self.match and also returned: a match object,
        or None when the pattern was not found.
        '''
        self.url = url
        self.rule = rule
        self.cgetpage.url = url
        # The pattern string is handed over uncompiled.
        self.cparsepage.rule = rule
        self.cgetpage.getPage()
        self.page = self.cgetpage.page
        self.cparsepage.page = self.cgetpage.page
        self.cparsepage.parsePage()
        self.match = self.cparsepage.match
        return self.match

def runTest():
    '''Fetch the download counter of each known site and print it.

    One line is printed per site whose counter could be read. A site that
    fails (network error, pattern not found, ...) is skipped and the reason
    is reported through funcDebugInfo().
    '''
    # (site name, pattern whose group 1 is the counter, page address)
    sites = (
        ('huajun',
         r"hit\[587\]='47588,([0-9]+)",
         'http://www.onlinedown.net/soft/hitjs/hits47.js'),
        ('skycn',
         u'下载次数:  ([0-9]+)',
         'http://www.skycn.com/soft/23265.html'),
    )

    # initialization
    ccount = CGetMatch('http://www.sina.com.cn', '')

    for i, (sitename, rule, url) in enumerate(sites, 1):
        try:
            ccount.getMatch(url, rule)
            # Raises AttributeError when the pattern was not found.
            count = ccount.match.group(1)
        except Exception as err:
            funcDebugInfo(sitename + ' failed: ' + repr(err))
            continue
        funcOutputMessage(str(i).rjust(2) + '.' + sitename.ljust(12) + ':' + str(count))


# The entry point has to come after runTest() is defined; calling it earlier
# raises NameError when the file is run as a script.
if __name__ == '__main__':
    funcOutputMessage('===This is a get url script===')
    runTest()

No comments:

Post a Comment