Fred's Blog: python抓取网页代码实例


# -*- encoding:UTF-8 -*-
'''
    This is geturl.
    Written by yuzebin : yuzebin@gmail.com
    Important: this script runs under Cygwin or Linux; to run it on Windows
        you need the Windows builds of curl and wget.
'''
# Overview of the classes defined in this module, kept as a plain string.
# The variable cannot be called ``class``: that is a reserved word and
# assigning to it is a syntax error, so it is named ``classes``.
classes='''
    CGetPage is charge of to get a url , it have three methods to get a page : urllib,curl and wget;
    CParsePage is charge of to parse the page , and return the match;
    CGetMatch is the forcad class to wrap the CGetPage and CParsePage.
'''
# Changelog kept as a module-level string; entries are listed newest first.
history='''
    2006.07.10 version 0.0.0.9 :
       Publish this code to internet , ;-)

    2006.06.27 version 0.0.0.7 :
        refrectoring class CParsePage : return re.match only
        refrectoring class CGetCount : rename to CGetMatch

    2006.06.26 version 0.0.0.3 :
        modify class CParsePage , return re.match

    2006.06.22 version 0.0.0.2 :
        add class CGetCount
        this version is the first workable version.
        add cnsky.

    2006.06.21 initial version 0.0.0.1 :
        add class CGetPage and CParsePage
        cannot work ;-)
'''
import string,re,os,fnmatch,sys,copy,gzip,time,datetime,urllib
from types import *

# Global switch: when true, funcDebugInfo() prints its messages.
isDebugMode = False


def funcUrlRead(url):
    """Fetch *url* and return the raw response body.

    Works on both Python 2 (``urllib.urlopen``) and Python 3
    (``urllib.request.urlopen``).  The response object is always closed,
    even when reading fails.

    Args:
        url: The address to open.

    Returns:
        The undecoded body as returned by the response's ``read()``.
    """
    # Imported here so the module-level import list does not need to change.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()

def funcOutputMessage(msg):
    """Print ``str(msg)`` followed by a newline on standard output.

    The parenthesised form is used because it is valid on both Python 2
    (a print statement with one parenthesised expression) and Python 3
    (a function call); the bare print statement is a syntax error on
    Python 3.
    """
    print(str(msg))

def funcDebugInfo(msg):
    """Print ``str(msg)`` only while the module-level ``isDebugMode`` is set.

    Uses the parenthesised print form so the function is valid on both
    Python 2 and Python 3.
    """
    if isDebugMode:
        print(str(msg))

class CGetPage:
    """Fetch a single URL using urllib, curl, or wget.

    Attributes:
        url:  The address to fetch, or None if ``urlCheck`` rejected it.
        page: Body stored by ``getPage``/``curlPage``; None until one runs.
        path: Directory ``wgetPage`` downloads into; None means the
              current working directory.
    """

    def __init__(self, url):
        # Define every attribute up front so that a rejected URL, or a
        # method called out of order, never hits a missing attribute.
        self.url = None
        self.page = None
        self.path = None
        if self.urlCheck(url):
            self.url = url

    def urlCheck(self, url):
        """Return True if *url* is acceptable (currently always True)."""
        #todo , check the url is valid url.
        return True

    def getPage(self):
        """Fetch ``self.url`` through ``funcUrlRead`` into ``self.page``."""
        self.page = funcUrlRead(self.url)

    def curlPage(self):
        """Fetch ``self.url`` with the external ``curl`` program.

        Runs ``curl -A "" -s <url>`` (empty user agent, silent mode) and
        stores its standard output in ``self.page``.  The command is passed
        as an argument list, so the URL is never interpreted by a shell.

        Raises:
            OSError: if ``curl`` cannot be started (e.g. not installed).
        """
        # Local import: keeps this block independent of the file's import line.
        import subprocess
        proc = subprocess.Popen(['curl', '-A', '', '-s', self.url],
                                stdout=subprocess.PIPE)
        # communicate() reads all output and waits for curl to exit.
        self.page = proc.communicate()[0]

    def setPath(self, path):
        """Set the directory that ``wgetPage`` downloads into."""
        self.path = path

    def wgetPage(self):
        """Download ``self.url`` into ``self.path`` with ``wget -c``.

        wget is started with ``self.path`` as its working directory, so
        the working directory of this Python process is left unchanged.
        The URL is passed as a separate argument, not through a shell.

        Raises:
            OSError: if ``wget`` cannot be started or ``self.path`` does
                not exist.
        """
        import subprocess
        subprocess.call(['wget', '-c', self.url], cwd=self.path)

class CParsePage:
    """Search a downloaded page with a regular expression.

    Attributes:
        rule:  Compiled pattern (or a pattern string assigned by a caller);
               None if the rule given to the constructor did not compile.
        page:  The page to search; None if the constructor's rule was bad.
        match: Result of the last ``parsePage`` call (``re`` match object
               or None).
    """

    def __init__(self, rule, page):
        # Define every attribute up front so a bad rule leaves a
        # well-formed object instead of one with missing attributes.
        self.rule = None
        self.page = None
        self.match = None
        if self.ruleCompile(rule) != False:
            self.page = page

    def ruleCompile(self, rule):
        """Compile *rule* into ``self.rule``.

        Returns:
            True on success; False if *rule* is not a valid pattern, in
            which case ``self.rule`` keeps its previous value.
        """
        try:
            self.rule = re.compile(rule)
        except (re.error, TypeError):
            return False
        return True

    def parsePage(self):
        """Search ``self.page`` with ``self.rule`` and store ``self.match``.

        A byte-string page is decoded first, using the encoding reported by
        ``getCharset`` (UTF-8 if none is reported); undecodable bytes are
        replaced rather than aborting the search.  A page that is already
        text is searched as-is.

        Raises:
            ValueError: if no rule or no page has been set.
        """
        if self.rule is None or self.page is None:
            raise ValueError('CParsePage needs a rule and a page before parsePage()')
        text = self.page
        if isinstance(text, bytes):
            # The detector may report no encoding at all (None).
            encoding = self.getCharset(text) or 'utf-8'
            text = text.decode(encoding, 'replace')
        self.match = re.search(self.rule, text)
        funcDebugInfo(type(self.match))

    def getCharset(self, string):
        """Return the encoding name chardet detects for *string*.

        NOTE: the parameter name shadows the ``string`` module inside this
        method; it is kept because it is part of the method's signature.
        """
        import chardet
        #todo : automatic discern the charset
        return chardet.detect(string)['encoding']

class CGetMatch:
    """Facade that downloads a URL and searches it with a regular expression.

    Wraps one ``CGetPage`` and one ``CParsePage`` instance, which are
    reused across ``getMatch`` calls.

    Attributes:
        url, rule:  The most recently used URL and pattern.
        page:       The most recently downloaded page.
        match:      Result of the last ``getMatch`` call; None before the
                    first call or when the pattern did not match.
    """

    def __init__(self, url, rule):
        """Create the helpers and download *url* once.

        Note that the page is fetched immediately, but it is not searched
        until ``getMatch`` is called.
        """
        self.url = url
        self.rule = rule
        # Defined here so reading ``match`` before getMatch() yields None
        # instead of raising AttributeError.
        self.match = None
        self.cgetpage = CGetPage(self.url)
        self.cgetpage.getPage()
        self.page = self.cgetpage.page
        self.cparsepage = CParsePage(self.rule, self.cgetpage.page)

    def getMatch(self, url, rule):
        """Download *url*, search it with *rule*, and store the result.

        The outcome is left in ``self.match`` (an ``re`` match object, or
        None when the pattern is not found); nothing is returned.
        """
        self.url = url
        self.rule = rule
        self.cgetpage.url = url
        # The pattern is handed over as given; re.search() in
        # CParsePage.parsePage accepts a pattern string.
        self.cparsepage.rule = rule
        self.cgetpage.getPage()
        self.page = self.cgetpage.page
        self.cparsepage.page = self.page
        self.cparsepage.parsePage()
        self.match = self.cparsepage.match

def runTest():
    """Fetch the download counter of each known site and print it.

    One line is printed per site: its index, its name, and either the
    captured counter, ``no match``, or the reason the lookup failed.  A
    failure on one site does not stop the remaining sites.
    """
    # (site name, pattern whose group 1 is the counter, URL to fetch)
    # NOTE(review): the first pattern was a syntax error in the published
    # code (unescaped quote); presumably the backslashes were lost when it
    # was posted. It is reconstructed here with the brackets escaped —
    # confirm against the real page content.
    sites = [
        ('huajun',
         r"hit\[587\]='47588,([0-9]+)",
         'http://www.onlinedown.net/soft/hitjs/hits47.js'),
        ('skycn',
         u'下载次数：  ([0-9]+)',
         'http://www.skycn.com/soft/23265.html'),
    ]

    #initialization
    ccount = CGetMatch('http://www.sina.com.cn','')

    for i, (sitename, rule, url) in enumerate(sites, 1):
        label = str(i).rjust(2) + '.' + sitename.ljust(12) + ':'
        try:
            ccount.getMatch(url, rule)
            if ccount.match is None:
                funcOutputMessage(label + 'no match')
            else:
                funcOutputMessage(label + str(ccount.match.group(1)))
        except Exception as err:
            # Best effort: report the problem and carry on with the next site.
            funcOutputMessage(label + 'failed (' + str(err) + ')')


# The entry point must come after runTest is defined; calling it earlier
# raises NameError when the file is run as a script.
if __name__ == '__main__':
    funcOutputMessage('===This is a get url script===')
    runTest()
Fred's Blog

Sunday, December 24, 2006

python抓取网页代码实例

No comments:

Post a Comment