import json

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the ranking request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# Endpoint taken from the page's XHR traffic. The `tag` value is the
# percent-encoded UTF-8 string "豆瓣高分"; results are sorted by rank and
# limited to 20 entries starting at offset 0.
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=rank&page_limit=20&page_start=0'
res = requests.get(url, headers=headers, timeout=20)
# Fail with a clear HTTP error instead of a confusing JSON decode error
# when the server rejects the request.
res.raise_for_status()

# A Response object cannot be indexed by key; decode the JSON body into
# plain Python objects first.
js = res.json()
def topCinema(num, subjects=None):
    """Return the titles and URLs of the first ``num`` movies in the list.

    Args:
        num: How many movies to take from the front of the list. Values
            below 1 yield an empty dict.
        subjects: Optional list of movie records, each a mapping with
            ``'title'`` and ``'url'`` keys. When omitted, the
            ``'subjects'`` list of the module-level ``js`` payload is used.

    Returns:
        A dict mapping each movie title to its URL, in list order. If
        fewer than ``num`` movies are available, all of them are returned.
    """
    if subjects is None:
        subjects = js['subjects']
    # Clamp at zero: a negative stop would slice from the end of the list
    # rather than produce "no movies".
    top_info = subjects[:max(num, 0)]
    # Iterate the slice itself so a short list cannot cause an IndexError.
    return {movie['title']: movie['url'] for movie in top_info}
def getComment(movieUrl, pageNum):
    """Scrape one page of short comments for a movie.

    Args:
        movieUrl: URL of the movie's subject page, e.g.
            ``'https://movie.douban.com/subject/1292052/'``. A trailing
            slash is optional.
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        A dict mapping each commenter's displayed name to the text of
        their comment. Commenters sharing the same displayed name collapse
        into a single entry (the later one wins).
    """
    start = (pageNum - 1) * 20
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    url = movieUrl.rstrip('/') + '/comments'
    # Let requests assemble the query string; the hand-built version lost
    # the '&' between `start` and `limit`, corrupting both parameters.
    params = {
        'start': start,
        'limit': 20,
        'status': 'P',
        'sort': 'new_score',
    }
    res = requests.get(url, params=params, headers=headers, timeout=20)
    soup = BeautifulSoup(res.text, 'html.parser')
    comment_list = soup.find_all('span', class_='short')
    user = soup.find_all('span', class_='comment-info')
    # zip stops at the shorter list, so a mismatch between the two
    # find_all results cannot raise an IndexError.
    return {
        info.a.string: comment.string
        for info, comment in zip(user, comment_list)
    }
# Scrape the first two comment pages of each of the top 3 movies
# (adjust the range bounds to fetch more pages).
top3 = topCinema(3)
top3_comment = {}
for name, movie_url in top3.items():
    movie_comments = top3_comment.setdefault(name, {})
    for page in range(1, 3):
        # Merge each page into the movie's dict; plain assignment would
        # let page 2 overwrite page 1.
        movie_comments.update(getComment(movie_url, page))
# Save the scraped comments to a local file, then read the file back and
# print it to confirm the write.
import os  # needed only for this step; imported here so the block is self-contained

# open() does not create missing directories, so make sure it exists.
os.makedirs('./comment', exist_ok=True)
# Explicit UTF-8: the platform default encoding may be unable to represent
# every character in the comments.
with open('./comment/top3_comment.txt', 'w', encoding='utf-8') as f:
    f.write(str(top3_comment))
    print('保存成功')
# The with-statement closes the file; no explicit close() is required.
with open('./comment/top3_comment.txt', 'r', encoding='utf-8') as r:
    print(r.read())
运行结果
Maybe you could buy me a cup of coffee.
Scan this QR code
Open the Alipay app and scan this QR code to buy me a coffee!
Scan this QR code
Open the WeChat app and scan this QR code to buy me a coffee!