bit杯-爬取抖音热度榜
今天写了一天bit杯的爬虫题,完成是完成了,但是写吐了。
接下来几个星期都不想写爬虫了…
import datetime
import os
import threading

import matplotlib.pyplot as plt
import openpyxl
import pymysql
import requests
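"""
Usage notes:
Third-party dependencies: requests, openpyxl, matplotlib, pymysql
Database settings: db_host is the database server host name, db_user the database user name, db_password the database password, db_port the database port, db_name the database name
Tasks 1 and 2 write the Excel file and the video files (mp4) to relative paths, i.e. into the current working directory (the directory of DY_spider.py when the script is run from there)
"""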
# Database connection settings used by save_to_mysql().
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into the script; the fallbacks are the
# values that were previously hard-coded.
db_host = os.environ.get("DY_DB_HOST", "localhost")  # database server host name
db_user = os.environ.get("DY_DB_USER", "root")  # database user name
db_password = os.environ.get("DY_DB_PASSWORD", "757268")  # database password
db_port = int(os.environ.get("DY_DB_PORT", "3306"))  # kept an int, like the original literal
db_name = os.environ.get("DY_DB_NAME", "Mysql")  # database (schema) name
# Fetch the Douyin hot-search list once and keep the decoded JSON in hot_json.
session1 = requests.session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57",
    "Referer": "https://www.douyin.com/discover",
}
# Query parameters of the hot-list endpoint.
# NOTE(review): webid / msToken / X-Bogus look like values captured from one
# browser session and may stop working over time - confirm before relying on them.
params1 = {
    "device_platform": "webapp",
    "aid": "6383",
    "channel": "channel_pc_web",
    "detail_list": "1",
    "source": "6",
    "board_type": "0",
    "pc_client_type": "1",
    "version_code": "170400",
    "version_name": "17.4.0",
    "cookie_enabled": "true",
    "screen_width": "1536",
    "screen_height": "864",
    "browser_language": "zh-CN",
    "browser_platform": "Win32",
    "browser_name": "Edge",
    "browser_version": "118.0.2088.57",
    "browser_online": "true",
    "engine_name": "Blink",
    "engine_version": "118.0.0.0",
    "os_name": "Windows",
    "os_version": "10",
    "cpu_core_num": "8",
    "device_memory": "8",
    "platform": "PC",
    "downlink": "10",
    "effective_type": "4g",
    "round_trip_time": "50",
    "webid": "7292586728233502227",
    "msToken": "Sg1rnezO2VpibJVFdr3XfGX3aYFjKOc_9XUtOFEso7vWMSaOZZpYrg7xm9OtE9EEea4wIeDAJ3VLowvmfFhJgLq05jpPwPqgQi3TLwQM_6XzTMLcCwMhHdb7kcQXiYi9Yw==",
    "X-Bogus": "DFSzsdVuFxTANtL4tYdtttteJnx4",
}
# A timeout keeps the script from hanging forever on a stalled connection.
hot = session1.get(
    "https://www.douyin.com/aweme/v1/web/hot/search/list/",
    params=params1,
    headers=headers,
    timeout=30,
)
# Fail with a clear HTTP error instead of a confusing JSON decode error.
hot.raise_for_status()
hot_json = hot.json()
# Example conversion of a fixed Unix timestamp into a datetime object.
dt = datetime.datetime.fromtimestamp(1697846660)

# Workbook that receives the hot-list rows; row 1 carries the column titles
# (rank, title, link, time, heat value).
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "DY_spider_info"
for _cell_ref, _column_title in (
    ("A1", "热榜排名"),
    ("B1", "热榜标题"),
    ("C1", "热榜链接"),
    ("D1", "热榜时间"),
    ("E1", "热度值"),
):
    ws[_cell_ref] = _column_title
def _hot_row(entry):
    """Return one spreadsheet row (rank, title, link, time, heat value) for a hot-list entry."""
    return [
        entry["position"],
        entry["word"],
        "https://www.douyin.com/hot/" + entry["sentence_id"],
        datetime.datetime.fromtimestamp(entry["event_time"]),
        entry["hot_value"],
    ]


# The list does not always start with a pinned entry: with 51 or more entries
# index 0 is skipped and entries 1-50 are used, otherwise the first 50 entries
# (or fewer, if the response is shorter) are taken as they are.
_word_list = hot_json["data"]["word_list"]
_ranked_entries = _word_list[1:51] if len(_word_list) > 50 else _word_list[:50]
data_list = [_hot_row(entry) for entry in _ranked_entries]

# Append the rows below the header row and write the workbook to disk.
for each in data_list:
    ws.append(each)
wb.save("DY_spider_info.xlsx")
def save_to_mysql():
    """Store the hot-list rows in MySQL and print the stored rows.

    Drops and recreates the ``DY_spider_info`` table, inserts every row of the
    module-level ``data_list``, commits the inserts, then selects the table
    and prints one line per row. The connection is closed in all cases,
    including when a statement raises.
    """
    create_sql = """CREATE TABLE DY_spider_info (
        热榜排名 INT,
        热榜标题 CHAR(50),
        热榜链接 CHAR(50),
        热榜时间 CHAR(50),
        热度值 INT)
        """
    insert_sql = """
        INSERT INTO DY_spider_info(热榜排名, 热榜标题, 热榜链接, 热榜时间, 热度值)
        VALUES (%s, %s, %s, %s, %s)
        """
    db = pymysql.connect(host=db_host, user=db_user, password=db_password, port=db_port, db=db_name,
                         cursorclass=pymysql.cursors.Cursor)
    try:
        with db.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS DY_spider_info")
            cursor.execute(create_sql)
            # Parameterized insert of the five columns of every collected row.
            cursor.executemany(insert_sql, [tuple(each[:5]) for each in data_list])
            # Without an explicit commit the inserted rows are discarded when
            # the connection closes (autocommit is off by default).
            db.commit()
            cursor.execute("SELECT * FROM DY_spider_info")
            for row in cursor.fetchall():
                paiming, biaoti, lianjie, shijian, redu = row[0], row[1], row[2], row[3], row[4]
                print("排名:%s,标题:%s,链接:%s,时间:%s,热度值:%s" % (paiming, biaoti, lianjie, shijian, redu))
    finally:
        db.close()
# Write the collected hot-list rows to MySQL and print them back.
save_to_mysql()
# Channel name -> feed API URL for that channel. The URLs differ in their
# tag_id query parameter (which selects the channel) and in the
# count / msToken / X-Bogus values.
# NOTE(review): msToken / X-Bogus look like values captured from a browser
# session; the URLs are kept verbatim because changing the query string would
# presumably invalidate them - confirm.
tag_ids = {
"Knowledge": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&count=16&tag_id=300213&Seo-Flag=0&refresh_index=1&awemePcRecRawData=%7B%22is_client%22:false%7D&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=bqKtGt8xkq2FGgQqhOCHU-dwKGI49__WpO62in3slKgX66KChJVqmRST1H4JrSB1sJluUDNO0QSqJjpc7oW7HqfsArKnnwaCi7jFi7xBlGT0jlpM1bGSBwW91Ej9xla1iw==&X-Bogus=DFSzsdVOzKTANJbetYDwJtteJn9t",
"Game": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300205&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=ZECxHUrsM-8pCzB2W415MWusfPvkmNtTcRR3isvW5guIops9GaYK49u19tMFJxPvQvdVM5ZoiTnQoFDITgYYqogUaGiCav3zYFOBwg2wfW0DhtqLnc9vZG70XbuhkWLjP-w=&X-Bogus=DFSzsdVuInXANtL4tYDz2tteJnHU",
"Entertainment": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300201&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=1iK6-9745aVAORvR7XOl0MlCfNMmQjq728vlBXcOcltMAZx8ZLepU_E4jFmpa2qw2QNBrJkRXbheSwwXlkh6uwwJAgh4-rNmlI-KcmkKJsD7JApjV-_4CmRjCnTV61TpnvU=&X-Bogus=DFSzsdVOz6xANJbetYDn/zteJn9o",
"Two-dimensional": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300206&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=k7GBkOEHoe6BRbzpGSHXhsAOddKPuQPl9XREWOAdIznEqSA1Q8rqk_KbhBhpNp-Ct2-PUPVmkzcmK6Vo4M5qXvuTlVOFXwEdGwpH7XeuycvIhwxFAzAZC5uunGc41EFtwJE=&X-Bogus=DFSzsdVu-pGANtL4tYDSjzteJnxh",
"Music": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300209&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuWzvANtL4tYDSFzteJnH0",
"Food": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300204&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuiPzANtL4tYDSitteJnxF",
"Sport": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300207&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuNTUANtL4tYDSLUteJnxU",
"Fashion": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300208&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuQekANtL4tYDtAUteJnx9"
}
# Request headers sent with the channel-feed requests.
# NOTE(review): the Cookie value looks like a captured browser session and
# presumably expires - confirm it is still valid before running.
headers2 = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    # The cookie is written as two adjacent string literals that Python joins
    # into one value; a single literal broken across physical lines is a
    # syntax error.
    "Cookie": (
        "ttwid=1%7Ci2fGDGXdsdlq-AYCufRuIChCwOHFJZyFNzDqV5t46s4%7C1697937678%7Cd20ad716c77f963cf8f62221de28730833027ecafebb31ca730f45cb743563a0; douyin.com; device_web_cpu_core=8; device_web_memory_size=8; architecture=amd64; webcast_local_quality=null; passport_csrf_token=8ce2fc5c3aa393b86a30971d2473ac15; passport_csrf_token_default=8ce2fc5c3aa393b86a30971d2473ac15; strategyABtestKey=%221697937681.37%22; s_v_web_id=verify_lo0s9885_TMVkcvah_KxyI_4uKX_Bww5_6O4wSMTBhFqf; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.6%7D; xgplayer_user_id=610983813274; odin_tt=08c00f74c96d9205776b45b204a7f613588375aceaa88c5799802a8bbc81f4942fbd64b4a2bbad340e0275af01e3c521a8f5b26828a9597b177b6c67f1fdc1137d23e5ffabb6e3af8ff5c7f166f60d84; csrf_session_id=f37f074d96e3fd5322b0fdbdcae0c267; my_rd=2; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%2C%22isForcePopClose%22%3A1%7D; download_guide=%223%2F20231022%2F0%22; pwa2=%220%7C0%7C3%7C0%22; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1698548471397%2C%22type%22%3A1%7D; tt_scid=tl85-fOQNT3S5zZkxyg9BYKd8vchqJdKOA13ThYCW25t9v6Z-qVZGalohQ0gH87X5332; msToken=FZND1BdvqiifujGRTIdsR-6mXI-yJ8KdvmT24cNJHNA_oPkaJ9VUz6oONHSW7K-WwRjdt8byaVy-D1_HMkbbskN5mHozCH2ptl2N4kUd5LlXcfdraLzf6pOIGMmzhOKSRw==; __ac_nonce=06534c07600e372186904; __ac_signature=_02B4Z6wo00f01SBSahAAAIDBK97UVIa8YQUgdm6AAC0jLishCx7etjEPvNfvJNaM0UX3uWFx0rRx6.5K0hX3IyNIKMfJDcEbikWbY-NWLyRmFp7pjmUGkGxpPJQQ9Kc39G-k.0oYmB2eXsmE46; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCTmtkWjZiMG9JTDl4elNFWGlTUGk4b2FSVy96Mkd6UW9ld0Z2SXFIRTAxSG5pazZndGxPcUhFRlZTMldOZjBWeUN2Q1JrTGE0OXhQcUYrUFo0eE44L2s9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; msToken=f7Uv2WorGfJPVzotcIQUc-oj2Yv4OBv0qBex-XaRG5bVPQKRQDGwDxjAq_z0P4Dj68nk5YhuYJDciNlzInwb7cCCzSKRSxiNwVXBlzFvH6sZWrPIRNi-lJ5PS7abujwh-Q==; IsDouyinActive=true; "
        "stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1536%2C%5C%22screen_height%5C%22%3A864%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A8%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A3.75%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A100%7D%22; home_can_add_dy_2_desktop=%221%22"
    ),
    "Referer": "https://www.douyin.com/channel/300203",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57",
}
# Module-level session kept for backward compatibility; video_spider() opens
# its own session for each download.
session2 = requests.session()
def video_spider(tag):
    """Download the first video of a channel feed to ``video_<tag>.mp4``.

    Requests the feed URL stored under ``tag`` in the module-level ``tag_ids``
    mapping, takes the first download address of the first feed entry and
    writes that video to the current working directory.

    Args:
        tag: Key of ``tag_ids`` selecting the channel; also used in the
            output file name.

    Raises:
        requests.HTTPError: If the feed or the video request returns an
            error status.
        KeyError, IndexError: If the feed JSON does not contain the expected
            video entry.
    """
    # A private session per call: this function runs in several threads.
    with requests.session() as session:
        feed = session.get(tag_ids[tag], headers=headers2, timeout=30)
        feed.raise_for_status()
        video_url = feed.json()["aweme_list"][0]["video"]["download_addr"]["url_list"][0]
        # stream=True defers the body download so the file really is written
        # chunk by chunk instead of being buffered in memory first.
        with session.get(video_url, stream=True, timeout=30) as video:
            video.raise_for_status()
            with open("video_{}.mp4".format(tag), "wb") as f:
                for chunk in video.iter_content(chunk_size=1048576 * 5):  # 5 MB per write
                    if chunk:
                        f.write(chunk)
# Start one download thread for each of the first three channels of tag_ids;
# the started threads are collected in `threads`.
threads = []
for tag in list(tag_ids)[:3]:
    worker = threading.Thread(target=video_spider, args=(tag,), name=tag)
    threads.append(worker)
    worker.start()
    print("{}视频写入线程开始".format(worker.name))
# happy = session2.get(
# "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300201&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=3.75&effective_type=4g&round_trip_time=100&webid=7292586728233502227&msToken=FZND1BdvqiifujGRTIdsR-6mXI-yJ8KdvmT24cNJHNA_oPkaJ9VUz6oONHSW7K-WwRjdt8byaVy-D1_HMkbbskN5mHozCH2ptl2N4kUd5LlXcfdraLzf6pOIGMmzhOKSRw==&X-Bogus=DFSzsdVOrnxANJbetYDgxUteJnxb",
# headers=headers2)
# 娱乐频道参数"tag_id": "300201" tag_id区分不同频道
# happy_json = happy.json()
# print(happy_json)
# video_url = happy_json["aweme_list"][0]["video"]["download_addr"]["url_list"][0]
# video_happy = session2.get(video_url)
# with open("video_娱乐.mp4", "wb+") as f:
# for chunk in video_happy.iter_content(chunk_size=1024):
# if chunk:
# f.write(chunk)
# game = session2.get(
# "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300205&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=3.75&effective_type=4g&round_trip_time=100&webid=7292586728233502227&msToken=7sN84iKHfdDIPITZwX7fdY82fPSxr328d1ZKGQLHEa6ej7OAZBFfRCJP-swkXMuzc0v7SM4M-haQ7JYA4f7deY4-r8MPGlZvxaXlzHpoZ7wAOLm9nZx2Fwvu2ekOFlw6g-0=&X-Bogus=DFSzsdVOzKTANJbetYDwJtteJn9t",
# headers=headers2)
#
# game_json = game.json()
# # print(game_json)
# video_url = game_json["aweme_list"][0]["video"]["download_addr"]["url_list"][0]
# video_game = session2.get(video_url)
# with open("video_游戏.mp4", "wb+") as f:
# for chunk in video_game.iter_content(chunk_size=1024):
# if chunk:
# f.write(chunk)
def digg_count(tag_url, video_count=10, max_requests=5):
    """Return the summed like count of the first videos of a channel feed.

    The feed URL is requested, and the ``digg_count`` statistic of each video
    in the response is added until ``video_count`` videos have been counted.
    When a response holds fewer videos than are still needed, the same URL is
    requested again and counting continues with the new response.

    Args:
        tag_url: Feed URL of the channel.
        video_count: Number of videos whose likes are summed.
        max_requests: Upper bound on feed requests, so that a feed which keeps
            returning short or empty pages cannot loop forever.

    Returns:
        The sum of ``statistics.digg_count`` over the counted videos. Fewer
        than ``video_count`` videos contribute when the feed returns an empty
        page or ``max_requests`` is reached first.

    Raises:
        requests.HTTPError: If a feed request returns an error status.
        KeyError: If the response JSON lacks the expected keys.
    """
    total = 0
    counted = 0
    for _ in range(max_requests):
        if counted >= video_count:
            break
        response = requests.get(tag_url, headers=headers2, timeout=30)
        response.raise_for_status()
        videos = response.json()["aweme_list"] or []
        if not videos:
            # Nothing more to count; return what has been collected so far.
            break
        for video in videos[:video_count - counted]:
            total += video["statistics"]["digg_count"]
            counted += 1
    return total
def plot3():
    """Print and plot the summed video likes of every channel in ``tag_ids``.

    Calls ``digg_count`` for each channel URL, prints the per-channel sum and
    shows a bar chart with one bar per channel.
    """
    x_label = []
    y = []
    for tag, tag_url in tag_ids.items():
        _count = digg_count(tag_url)
        x_label.append(tag)
        y.append(_count)
        print("分区:{},视频点赞数总和:{}".format(tag, _count))
    # One bar position per channel, derived from the data so the chart still
    # works when tag_ids gains or loses entries.
    x = list(range(1, len(y) + 1))
    color = ['red', 'peru', 'orchid', 'deepskyblue']
    plt.figure(figsize=(20, 12))
    plt.xticks(x, x_label)
    # Y ticks are placed exactly at the bar heights.
    plt.yticks(y)
    plt.bar(x, y, color=color)
    plt.grid(True, linestyle=':', color='r')
    # NOTE(review): the title says 9 videos while digg_count sums 10 videos
    # per channel on its normal path - confirm which number is intended.
    plt.title("Likes for 9 videos")
    plt.show()
# Show the likes chart first, then wait for every download thread to finish.
plot3()
for worker in threads:
    print("等待{}线程结束".format(worker.name))
    worker.join()
    print("{}线程结束".format(worker.name))