0. Preface
For the past few days I had been looking for a Baidu search interface that a program could call directly, but all I could find was the regular web page; Baidu probably isn't going to open up that kind of API. So I wrote a script to fetch search results myself: a simple crawler that uses regular expressions to pull the results out of the page.
1. Code
# -*- coding: utf-8 -*-
import requests
import re

url = "http://www.baidu.com/s"

def Remove_Repeat(List):  # remove duplicates from a list, preserving order
    New_List = []
    for i in List:
        if i not in New_List:
            New_List.append(i)
    return New_List

def Split_Url(Url):  # keep only the scheme and host, e.g. "http://example.com/"
    i = 0
    j = 0
    while Url.find("/", i) > 0 and j < 3:
        i = Url.find("/", i) + 1
        j = j + 1
    return Url[:i]

def Get_Url(wd, pn):  # collect the redirect links from pn pages of Baidu results
    Url_List = []
    payload = {"wd": wd}
    for i in range(0, pn * 10, 10):
        payload["pn"] = i
        result = requests.get(url, params=payload)
        # print(result.text)
        Url_List = Url_List + re.findall(r"http://www\.baidu\.com/link\S*(?=\")", result.text)
    return Url_List

def Get_Host(Url_List):  # resolve each redirect link to the root URL of the real result
    Host = []
    New_Url_List = Remove_Repeat(Url_List)
    for i in New_Url_List:
        temp = requests.get(i, allow_redirects=False)
        location = temp.headers.get("Location")  # the real target is in the Location header
        if location:
            Host.append(Split_Url(location))
    return Remove_Repeat(Host)

if __name__ == "__main__":
    # print(Get_Url("", 1))
    temp = Get_Host(Get_Url("123", 1))  # search keyword and number of pages
    for i in temp:
        print(i)
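For reference, here is a minimal sketch (not part of the original script, Python 3 assumed) of the same "fetch the redirect link, keep only scheme and host" step, using the standard-library urllib.parse.urlsplit instead of the hand-rolled Split_Url. The browser-like User-Agent value is only an assumption, in case Baidu rejects the default requests user agent.

# -*- coding: utf-8 -*-
# Sketch only: resolve a http://www.baidu.com/link?url=... redirect to its root URL.
import requests
from urllib.parse import urlsplit

HEADERS = {"User-Agent": "Mozilla/5.0"}  # assumed browser-like UA; adjust as needed

def root_url(u):
    # urlsplit("http://example.com/a/b") -> scheme "http", netloc "example.com"
    parts = urlsplit(u)
    return "%s://%s/" % (parts.scheme, parts.netloc)

def resolve(link):
    # Request the Baidu redirect link without following it and read the real
    # target from the Location header, then strip it down to scheme + host.
    resp = requests.get(link, headers=HEADERS, allow_redirects=False)
    target = resp.headers.get("Location")
    return root_url(target) if target else None

For example, root_url("http://example.com/some/path") returns "http://example.com/", which is the same shape of output that Split_Url produces in the script above.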