Scraping Douban movies with BeautifulSoup and requests
In just 30 lines of code.
The code is as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Five parallel lists, one per field extracted from each movie entry.
hrefs, titles, actors, ratings, quotes = [[] for i in range(5)]
result = [hrefs, titles, actors, ratings, quotes]

# The Top 250 list is paginated 25 movies at a time via the start parameter.
for page in range(0, 250, 25):
    url = f"https://movie.douban.com/top250?start={page}&filter="
    # Send a browser-like User-Agent to avoid being blocked.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
               " AppleWebKit/537.36 (KHTML, like Gecko)"
               " Chrome/108.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    # Each movie's details live in a <div class="info"> block.
    infos = soup.find_all("div", class_="info")
    print("*" * 50)
    for index, info in enumerate(infos, start=1):
        print(index + page)  # overall rank on the Top 250 list
        href = info.find("a")["href"]
        title = info.find("a").get_text().replace("\n", "").replace(" ", "")
        actor = info.find("p").get_text().replace("\n", "").replace(" ", "")
        rating = info.find("span", class_="rating_num").get_text()
        # Not every movie has a one-line quote; fall back to a placeholder.
        if info.find("span", class_="inq") is None:
            quote = "nothing"
        else:
            quote = info.find("span", class_="inq").get_text()
        one_result = [href, title, actor, rating, quote]
        for i in one_result:
            print(i)
        # Append each field to its matching column list.
        for i in range(5):
            result[i].append(one_result[i])

# Assemble the columns into a DataFrame and write everything to CSV.
column_name = ["href", "title", "actor", "rating", "quote"]
df = pd.DataFrame({column_name[i]: result[i] for i in range(5)})
df.to_csv("douban.csv")
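One caveat about the final line: by default pandas writes the numeric row index as an extra unnamed column, and it writes UTF-8 without a BOM, so the Chinese titles can appear garbled when the CSV is opened directly in Excel. A hedged variant of the save step (the parameter choices are mine, not part of the original 30 lines):

# Drop the row index and add a BOM so Excel detects UTF-8 correctly.
df.to_csv("douban.csv", index=False, encoding="utf-8-sig")

# Quick sanity check: reload the file and inspect the first rows.
print(pd.read_csv("douban.csv").head())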
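The loop also fires ten requests back to back with no error handling; if the site rate-limits you, response.content will be an error page and the parse silently yields nothing. Below is a minimal sketch of a more defensive fetch step, assuming the same headers as above; the helper name fetch_page, the 10-second timeout, and the 1-3 second delay are my own choices, not something from the original script:

import random
import time

import requests

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
           " AppleWebKit/537.36 (KHTML, like Gecko)"
           " Chrome/108.0.0.0 Safari/537.36"}

def fetch_page(url):
    # Fail fast on network stalls and non-2xx responses instead of
    # silently parsing an error page.
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response.content

for page in range(0, 250, 25):
    html = fetch_page(f"https://movie.douban.com/top250?start={page}&filter=")
    # ... parse html with BeautifulSoup exactly as above ...
    time.sleep(random.uniform(1, 3))  # polite pause between page requests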