基于深度优先遍历的网站爬虫-Python

#encoding=utf-8
from bs4 import BeautifulSoup
import socket
import urllib2
import re
import zlib
import requests

urls = ["http://www.baidu.com"] #自定义要爬去的链接
depth = 10 #自定义爬去的深度
flag = "1234" #自定义要搜索的内容
 
class MyCrawler:
    def __init__(self,seeds):
        #初始化当前抓取的深度
        self.current_deepth = 1
        #使用种子初始化url队列
        self.linkQuence=linkQuence()
        if isinstance(seeds,str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds,list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
    #抓取过程主函数
    def crawling(self,seeds,crawl_deepth,static_url):
        #循环条件:抓取深度不超过crawl_deepth
        while self.current_deepth <= crawl_deepth:
            #循环条件:待抓取的链接不空
            while not self.linkQuence.unVisitedUrlsEnmpy():
                #队头url出队列
                visitUrl=self.linkQuence.unVisitedUrlDeQuence()
                if visitUrl is None or visitUrl=="":
                    continue
                #获取超链接
                links=self.getHyperLinks(visitUrl,static_url)
                #将url放入已访问的url中
                self.linkQuence.addVisitedUrl(visitUrl)
            #未访问的url入列
            for link in links:
                self.linkQuence.addUnvisitedUrl(link)
            self.current_deepth += 1
             
    #获取源码中得超链接
    def getHyperLinks(self,url,static_url):
        result = []
        r = requests.get(url)
        data = r.text
        lines = data.split("\n")
        for i in lines:
            if flag in i:
                print url+"                 "+i
        # 利用正则查找所有连接
        link_list =re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')" ,data)
        for i in link_list:
            
            if "http" not in i:
                result.append(static_url+i)
            else:
                result.append(i)
        return result
    
    #获取网页源码
    def getPageSource(self,url,timeout=100,coding=None):
        try:
            socket.setdefaulttimeout(timeout)
            req = urllib2.Request(url)
            req.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            response = urllib2.urlopen(req)
            page = '' 
            if response.headers.get('Content-Encoding') == 'gzip': 
                page = zlib.decompress(page, 16+zlib.MAX_WBITS) 
            
            if coding is None:   
                coding= response.headers.getparam("charset")   
            #如果获取的网站编码为None
            if coding is None:
                page=response.read()   
            #获取网站编码并转化为utf-8 
            else:           
                page=response.read()   
                page=page.decode(coding).encode('utf-8')   
            return ["200",page]
        except Exception,e:
            print str(e)
            return [str(e),None]
         
class linkQuence:
    def __init__(self):
        #已访问的url集合
        self.visted=[]
        #待访问的url集合
        self.unVisited=[]
    #获取访问过的url队列
    def getVisitedUrl(self):
        return self.visted
    #获取未访问的url队列
    def getUnvisitedUrl(self):
        return self.unVisited
    #添加到访问过得url队列中
    def addVisitedUrl(self,url):
        self.visted.append(url)
    #移除访问过得url
    def removeVisitedUrl(self,url):
        self.visted.remove(url)
    #未访问过得url出队列
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except:
            return None
    #保证每个url只被访问一次
    def addUnvisitedUrl(self,url):
        if url!="" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0,url)
    #获得已访问的url数目
    def getVisitedUrlCount(self):
        return len(self.visted)
    #获得未访问的url数目
    def getUnvistedUrlCount(self):
        return len(self.unVisited)
    #判断未访问的url队列是否为空
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited)==0
     
def main(seeds,crawl_deepth,static_url):
    craw=MyCrawler(seeds)
    craw.crawling(seeds,crawl_deepth,static_url)
     
if __name__=="__main__":
    for i in urls:
        main([i],depth,i) 
    print "Done!"

发表评论

电子邮件地址不会被公开。 必填项已用*标注