跟我学Python,今天我们来爬取豆瓣的电影
#coding:utf-8
import urllib2
import re
# Landing page of Douban Movies; Movie.urls_add() downloads it first to seed the crawl.
root_url = "https://movie.douban.com/"
class Movie(object):
    """Crawler that walks Douban movie pages and prints each movie's details.

    The crawl starts at the module-level ``root_url``, collects every movie
    subject link found on a page, and keeps following those links until no
    unvisited link is left.

    Attributes:
        url_tmp: list of subject URLs discovered but not yet crawled.
        urls: list of subject URLs that have already been taken for crawling.

    The code targets Python 2 (``urllib2``); ``print`` is always called with a
    single parenthesised argument so the statements parse on Python 3 as well.
    """

    # Browser-like User-Agent sent with every request.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36")

    # Patterns are compiled once instead of on every page.
    _TITLE_RE = re.compile(
        r'<h1>.*?<span property="v:itemreviewed">(.*?)</span>'
        r'.*?<span class="year">(.*?)</span>.*?</h1>', re.S)
    _SCORE_RE = re.compile(
        r'<strong class="ll rating_num" property="v:average">(.*?)</strong>',
        re.S)
    _TAG_RE = re.compile(r'<[^>]+>', re.S)
    # NOTE(review): only ids with exactly eight digits are followed -- confirm
    # that subject pages with shorter ids are meant to be ignored.
    _SUBJECT_URL_RE = re.compile(
        r'https://movie\.douban\.com/subject/\d{8}/')

    def __init__(self):
        """Create a crawler with empty pending and visited lists."""
        self.url_tmp = []  # discovered, not crawled yet
        self.urls = []     # already taken for crawling

    def getpage(self, url):
        """Download ``url`` and return the raw response body.

        Raises:
            urllib2.URLError: if the request fails (``HTTPError`` included).
        """
        req = urllib2.Request(url)
        req.add_header("User-Agent", self.USER_AGENT)
        response = urllib2.urlopen(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def _parse_movie(self, page):
        """Extract ``(title, score, info)`` strings from a movie page.

        Every element falls back to an empty string when the corresponding
        markup is not present in ``page``.
        """
        title_match = self._TITLE_RE.search(page)
        # Title is the movie name immediately followed by the year text.
        title = "".join(title_match.groups()) if title_match else ""
        score = "".join(self._SCORE_RE.findall(page))

        info = ""
        start = page.find('<div id="info">')
        if start != -1:
            end = page.find('</div>', start)
            block = page[start:] if end == -1 else page[start:end]
            # Drop the tags, then every space, leaving the text of the block.
            info = self._TAG_RE.sub('', block).replace(' ', '').strip()
        return title, score, info

    def getcontent(self, url, page=None):
        """Print the title, score and info block of the movie at ``url``.

        Args:
            url: address of the movie page.
            page: already-downloaded source of ``url``; when omitted the page
                is fetched with :meth:`getpage`, so callers that hold the
                source can avoid a second request.
        """
        if page is None:
            page = self.getpage(url)
        title, score, info = self._parse_movie(page)
        print("电影名称:" + title)
        print("电影评分:" + score)
        print(info)

    def url_add(self, page):
        """Queue every movie link in ``page`` that has not been seen before.

        Links are appended to ``url_tmp`` in the order they appear in the
        page; links already in ``urls`` or ``url_tmp`` are skipped.
        """
        # One set per call keeps the membership tests O(1) per link.
        known = set(self.urls)
        known.update(self.url_tmp)
        for link in self._SUBJECT_URL_RE.findall(page):
            if link not in known:
                known.add(link)
                self.url_tmp.append(link)

    def urls_add(self):
        """Crawl from ``root_url`` until no pending movie link remains.

        Each movie page is downloaded once: the same source is used both to
        discover further links and to print the movie details. A movie page
        that cannot be downloaded is reported and skipped; a failure on the
        root page is not caught.
        """
        self.url_add(self.getpage(root_url))
        while self.url_tmp:
            url = self.url_tmp.pop()
            self.urls.append(url)
            if not url:
                continue
            try:
                page = self.getpage(url)
            except urllib2.URLError as err:
                # One unreachable page must not abort the whole crawl.
                print(url + " 抓取失败: " + str(err))
                continue
            self.url_add(page)
            print(url + " 正在爬取...")
            self.getcontent(url, page)
# Start crawling immediately; there is no __main__ guard, so this also runs on import.
Movie().urls_add()
留言与评论(共有 0 条评论) |