LOADING

加载过慢请开启缓存,浏览器默认开启。

2023/10/22

bit杯-爬取抖音热度榜

今天写了一天bit杯的爬虫题,完成是完成了,但是写吐了。

接下来几个星期都不想写爬虫了…

import requests
import datetime
import openpyxl
import matplotlib.pyplot as plt
import threading
import pymysql

"""
使用说明:
第三方依赖openpyxl、matplotlib、pymysql
数据库参数说明:db_host数据库服务器主机名 db_user数据库用户名 db_password数据库密码 db_port数据库端口 db_name数据库名称
第一、二题默认将excel文件、视频文件(mp4)保存在DY_spider.py文件的同级目录中
"""

# MySQL connection settings passed to pymysql.connect() in save_to_mysql().
# NOTE(review): the credentials are hard-coded in the source — consider loading them
# from the environment before sharing or committing this file.
db_host = 'localhost'  # database server host name
db_user = 'root'  # database user name
db_password = '757268'  # database password
db_port = 3306  # database port
db_name = 'Mysql'  # name of the database to connect to

# Session used for the hot-search board request.
session1 = requests.session()

# Browser-like request headers sent with the hot-search board request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57",
    "Referer": "https://www.douyin.com/discover",
}

# Query-string parameters for the hot-search board endpoint.
# NOTE(review): webid / msToken / X-Bogus look like values captured from one browser
# session; presumably they expire and must be refreshed — confirm.
params1 = {"device_platform": "webapp",
           "aid": "6383",
           "channel": "channel_pc_web",
           "detail_list": "1",
           "source": "6",
           "board_type": "0",
           "pc_client_type": "1",
           "version_code": "170400",
           "version_name": "17.4.0",
           "cookie_enabled": "true",
           "screen_width": "1536",
           "screen_height": "864",
           "browser_language": "zh-CN",
           "browser_platform": "Win32",
           "browser_name": "Edge",
           "browser_version": "118.0.2088.57",
           "browser_online": "true",
           "engine_name": "Blink",
           "engine_version": "118.0.0.0",
           "os_name": "Windows",
           "os_version": "10",
           "cpu_core_num": "8",
           "device_memory": "8",
           "platform": "PC",
           "downlink": "10",
           "effective_type": "4g",
           "round_trip_time": "50",
           "webid": "7292586728233502227",
           "msToken": "Sg1rnezO2VpibJVFdr3XfGX3aYFjKOc_9XUtOFEso7vWMSaOZZpYrg7xm9OtE9EEea4wIeDAJ3VLowvmfFhJgLq05jpPwPqgQi3TLwQM_6XzTMLcCwMhHdb7kcQXiYi9Yw==",
           "X-Bogus": "DFSzsdVuFxTANtL4tYdtttteJnx4"
           }

# Fetch the hot-search board once. The timeout keeps a stalled connection from hanging
# the script, and raise_for_status() surfaces HTTP errors before the body is parsed as
# JSON.
hot = session1.get("https://www.douyin.com/aweme/v1/web/hot/search/list/", params=params1, headers=headers,
                   timeout=10)
hot.raise_for_status()
hot_json = hot.json()
# print(hot_json["data"]["word_list"])

# Example conversion of a Unix timestamp (1697846660) to a datetime object.
# `dt` is kept for reference only; nothing below reads it.
dt = datetime.datetime.fromtimestamp(1697846660)

# Workbook with one sheet; row 1 holds the column headers.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "DY_spider_info"
ws['A1'] = "热榜排名"
ws['B1'] = "热榜标题"
ws['C1'] = "热榜链接"
ws['D1'] = "热榜时间"
ws['E1'] = "热度值"

# The board may or may not include a pinned entry at index 0. With the pinned entry the
# list has more than 50 items and the ranked entries are 1..50; without it they are
# 0..49. Slicing (instead of indexing a fixed range) also tolerates a response that
# contains fewer than 50 entries.
word_list = hot_json["data"]["word_list"]
first_ranked = 1 if len(word_list) > 50 else 0
data_list = [
    [
        entry["position"],
        entry["word"],
        "https://www.douyin.com/hot/" + entry["sentence_id"],
        datetime.datetime.fromtimestamp(entry["event_time"]),
        entry["hot_value"],
    ]
    for entry in word_list[first_ranked:first_ranked + 50]
]

# One spreadsheet row per hot-list entry, then write the file next to the script.
for each in data_list:
    ws.append(each)
wb.save("DY_spider_info.xlsx")


def save_to_mysql():
    """Store the scraped hot list in MySQL and print the stored rows.

    Connects with the module-level ``db_*`` settings, recreates the
    ``DY_spider_info`` table, inserts every row of the module-level
    ``data_list``, commits, then reads the table back and prints each row.
    The connection is always closed, even if a statement fails.
    """
    db = pymysql.connect(host=db_host, user=db_user, password=db_password, port=db_port, db=db_name,
                         cursorclass=pymysql.cursors.Cursor)
    create_sql = """CREATE TABLE DY_spider_info (
             热榜排名  INT,
             热榜标题  CHAR(50),
             热榜链接  CHAR(50),
             热榜时间  CHAR(50),  
             热度值  INT)
             """
    insert_sql = """
            INSERT INTO DY_spider_info(热榜排名, 热榜标题, 热榜链接, 热榜时间, 热度值)
            VALUES (%s, %s, %s, %s, %s)
        """
    try:
        with db.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS DY_spider_info")
            cursor.execute(create_sql)

            # Parameterized batch insert; values are escaped by the driver.
            cursor.executemany(insert_sql, [tuple(each[:5]) for each in data_list])
            # PyMySQL does not autocommit by default: without this the inserted rows
            # are discarded when the connection closes.
            db.commit()

            cursor.execute("SELECT * FROM DY_spider_info")
            for rank, title, link, event_time, hot_value in cursor.fetchall():
                print("排名:%s,标题:%s,链接:%s,时间:%s,热度值:%s" % (rank, title, link, event_time, hot_value))
    finally:
        db.close()


# Write the scraped hot list to MySQL and print the rows read back from the table.
save_to_mysql()

# Channel name -> pre-built channel-feed API URL; the tag_id query parameter selects
# the channel.
# NOTE(review): every URL carries its own msToken / X-Bogus query values; presumably
# these are tied to the exact query string, so the URLs are kept verbatim instead of
# being rebuilt from shared parts — confirm before refactoring.
tag_ids = {
    "Knowledge": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&count=16&tag_id=300213&Seo-Flag=0&refresh_index=1&awemePcRecRawData=%7B%22is_client%22:false%7D&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=bqKtGt8xkq2FGgQqhOCHU-dwKGI49__WpO62in3slKgX66KChJVqmRST1H4JrSB1sJluUDNO0QSqJjpc7oW7HqfsArKnnwaCi7jFi7xBlGT0jlpM1bGSBwW91Ej9xla1iw==&X-Bogus=DFSzsdVOzKTANJbetYDwJtteJn9t",
    "Game": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300205&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=ZECxHUrsM-8pCzB2W415MWusfPvkmNtTcRR3isvW5guIops9GaYK49u19tMFJxPvQvdVM5ZoiTnQoFDITgYYqogUaGiCav3zYFOBwg2wfW0DhtqLnc9vZG70XbuhkWLjP-w=&X-Bogus=DFSzsdVuInXANtL4tYDz2tteJnHU",
    "Entertainment": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300201&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=1iK6-9745aVAORvR7XOl0MlCfNMmQjq728vlBXcOcltMAZx8ZLepU_E4jFmpa2qw2QNBrJkRXbheSwwXlkh6uwwJAgh4-rNmlI-KcmkKJsD7JApjV-_4CmRjCnTV61TpnvU=&X-Bogus=DFSzsdVOz6xANJbetYDn/zteJn9o",
    "Two-dimensional": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300206&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=k7GBkOEHoe6BRbzpGSHXhsAOddKPuQPl9XREWOAdIznEqSA1Q8rqk_KbhBhpNp-Ct2-PUPVmkzcmK6Vo4M5qXvuTlVOFXwEdGwpH7XeuycvIhwxFAzAZC5uunGc41EFtwJE=&X-Bogus=DFSzsdVu-pGANtL4tYDSjzteJnxh",
    "Music": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300209&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuWzvANtL4tYDSFzteJnH0",
    "Food": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300204&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuiPzANtL4tYDSitteJnxF",
    "Sport": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300207&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuNTUANtL4tYDSLUteJnxU",
    "Fashion": "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300208&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=50&webid=7292586728233502227&msToken=J8JDBQ9KhflivKOFrNUjzroORhXGxF4ws0zOratk4Yemqm5fFZ8wbSTTZwKYz9Wo_LMyagNt-eVLrBe8lICrw-d9lgH6ZgmX5c9h9rieKzcmdetcsYml8KVqGmCGGA30y3I=&X-Bogus=DFSzsdVuQekANtL4tYDtAUteJnx9"
}

headers2 = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cookie": "ttwid=1%7Ci2fGDGXdsdlq-AYCufRuIChCwOHFJZyFNzDqV5t46s4%7C1697937678%7Cd20ad716c77f963cf8f62221de28730833027ecafebb31ca730f45cb743563a0; douyin.com; device_web_cpu_core=8; device_web_memory_size=8; architecture=amd64; webcast_local_quality=null; passport_csrf_token=8ce2fc5c3aa393b86a30971d2473ac15; passport_csrf_token_default=8ce2fc5c3aa393b86a30971d2473ac15; strategyABtestKey=%221697937681.37%22; s_v_web_id=verify_lo0s9885_TMVkcvah_KxyI_4uKX_Bww5_6O4wSMTBhFqf; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.6%7D; xgplayer_user_id=610983813274; odin_tt=08c00f74c96d9205776b45b204a7f613588375aceaa88c5799802a8bbc81f4942fbd64b4a2bbad340e0275af01e3c521a8f5b26828a9597b177b6c67f1fdc1137d23e5ffabb6e3af8ff5c7f166f60d84; csrf_session_id=f37f074d96e3fd5322b0fdbdcae0c267; my_rd=2; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%2C%22isForcePopClose%22%3A1%7D; download_guide=%223%2F20231022%2F0%22; pwa2=%220%7C0%7C3%7C0%22; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1698548471397%2C%22type%22%3A1%7D; tt_scid=tl85-fOQNT3S5zZkxyg9BYKd8vchqJdKOA13ThYCW25t9v6Z-qVZGalohQ0gH87X5332; msToken=FZND1BdvqiifujGRTIdsR-6mXI-yJ8KdvmT24cNJHNA_oPkaJ9VUz6oONHSW7K-WwRjdt8byaVy-D1_HMkbbskN5mHozCH2ptl2N4kUd5LlXcfdraLzf6pOIGMmzhOKSRw==; __ac_nonce=06534c07600e372186904; __ac_signature=_02B4Z6wo00f01SBSahAAAIDBK97UVIa8YQUgdm6AAC0jLishCx7etjEPvNfvJNaM0UX3uWFx0rRx6.5K0hX3IyNIKMfJDcEbikWbY-NWLyRmFp7pjmUGkGxpPJQQ9Kc39G-k.0oYmB2eXsmE46; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCTmtkWjZiMG9JTDl4elNFWGlTUGk4b2FSVy96Mkd6UW9ld0Z2SXFIRTAxSG5pazZndGxPcUhFRlZTMldOZjBWeUN2Q1JrTGE0OXhQcUYrUFo0eE44L2s9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoxfQ%3D%3D; msToken=f7Uv2WorGfJPVzotcIQUc-oj2Yv4OBv0qBex-XaRG5bVPQKRQDGwDxjAq_z0P4Dj68nk5YhuYJDciNlzInwb7cCCzSKRSxiNwVXBlzFvH6sZWrPIRNi-lJ5PS7abujwh-Q==; IsDouyinActive=true; 
stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1536%2C%5C%22screen_height%5C%22%3A864%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A8%2C%5C%22device_memory%5C%22%3A8%2C%5C%22downlink%5C%22%3A3.75%2C%5C%22effective_type%5C%22%3A%5C%224g%5C%22%2C%5C%22round_trip_time%5C%22%3A100%7D%22; home_can_add_dy_2_desktop=%221%22",
    "Referer": "https://www.douyin.com/channel/300203",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57",
}

# Module-level session for the channel-feed requests.
# NOTE(review): video_spider() creates its own session instead of using this one, so
# this object appears to be referenced only by the commented-out experiments below.
session2 = requests.session()


def video_spider(tag):
    """Download the first video of one channel to ``video_<tag>.mp4``.

    Fetches the channel feed at ``tag_ids[tag]`` (with ``headers2``), takes the
    first download address of the first feed item and streams it to a file in
    the current working directory.

    :param tag: key of ``tag_ids`` naming the channel to download from.
    :raises requests.HTTPError: if the feed or the video request returns an
        HTTP error status.
    """
    # A dedicated session per call: this function is run from several threads.
    with requests.session() as http:
        feed = http.get(tag_ids[tag], headers=headers2, timeout=10)
        feed.raise_for_status()
        video_url = feed.json()["aweme_list"][0]["video"]["download_addr"]["url_list"][0]

        # stream=True is what makes iter_content() read the body chunk by chunk;
        # without it the whole video is loaded into memory before the first write.
        with http.get(video_url, stream=True, timeout=(10, 60)) as video:
            video.raise_for_status()
            with open("video_{}.mp4".format(tag), "wb") as f:
                for chunk in video.iter_content(chunk_size=1048576 * 5):  # write 5 MB at a time
                    if chunk:
                        f.write(chunk)


# Start one download thread for each of the first three channels in tag_ids; the
# threads are joined at the end of the script.
threads = []
for tag in list(tag_ids)[:3]:
    t = threading.Thread(target=video_spider, args=(tag,), name=tag)
    threads.append(t)
    t.start()
    print("{}视频写入线程开始".format(t.name))


# happy = session2.get(
#     "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300201&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=3.75&effective_type=4g&round_trip_time=100&webid=7292586728233502227&msToken=FZND1BdvqiifujGRTIdsR-6mXI-yJ8KdvmT24cNJHNA_oPkaJ9VUz6oONHSW7K-WwRjdt8byaVy-D1_HMkbbskN5mHozCH2ptl2N4kUd5LlXcfdraLzf6pOIGMmzhOKSRw==&X-Bogus=DFSzsdVOrnxANJbetYDgxUteJnxb",
#     headers=headers2)

# 娱乐频道参数"tag_id": "300201" tag_id区分不同频道
# happy_json = happy.json()
# print(happy_json)
# video_url = happy_json["aweme_list"][0]["video"]["download_addr"]["url_list"][0]
# video_happy = session2.get(video_url)

# with open("video_娱乐.mp4", "wb+") as f:
#     for chunk in video_happy.iter_content(chunk_size=1024):
#         if chunk:
#             f.write(chunk)


# game = session2.get(
#     "https://www.douyin.com/aweme/v1/web/channel/feed/?device_platform=webapp&aid=6383&channel=channel_pc_web&tag_id=300205&count=10&Seo-Flag=0&refresh_index=1&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Edge&browser_version=118.0.2088.57&browser_online=true&engine_name=Blink&engine_version=118.0.0.0&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=3.75&effective_type=4g&round_trip_time=100&webid=7292586728233502227&msToken=7sN84iKHfdDIPITZwX7fdY82fPSxr328d1ZKGQLHEa6ej7OAZBFfRCJP-swkXMuzc0v7SM4M-haQ7JYA4f7deY4-r8MPGlZvxaXlzHpoZ7wAOLm9nZx2Fwvu2ekOFlw6g-0=&X-Bogus=DFSzsdVOzKTANJbetYDwJtteJn9t",
#     headers=headers2)
#
# game_json = game.json()
# # print(game_json)
# video_url = game_json["aweme_list"][0]["video"]["download_addr"]["url_list"][0]
# video_game = session2.get(video_url)

# with open("video_游戏.mp4", "wb+") as f:
#     for chunk in video_game.iter_content(chunk_size=1024):
#         if chunk:
#             f.write(chunk)


def digg_count(tag_url, limit=10, max_requests=2):
    """Return the summed like count of up to ``limit`` videos from a channel feed.

    The feed at ``tag_url`` is requested with ``headers2``. When one response
    holds fewer than ``limit`` videos, the feed is requested again and the
    missing videos are taken from the start of the next response, for at most
    ``max_requests`` requests in total.

    :param tag_url: channel-feed API URL.
    :param limit: number of videos whose ``digg_count`` is summed.
    :param max_requests: upper bound on feed requests, so a feed that keeps
        returning too few videos cannot loop forever.
    :return: sum of ``statistics.digg_count`` over the videos counted; this
        covers fewer than ``limit`` videos if the feed never supplies enough.
    :raises requests.HTTPError: if a feed request returns an HTTP error status.
    """
    count = 0
    counted = 0
    for _ in range(max_requests):
        response = requests.get(tag_url, headers=headers2, timeout=10)
        response.raise_for_status()
        videos = response.json().get("aweme_list") or []
        # Slicing takes only as many videos as are still missing and never
        # indexes past the end of a short response.
        for video in videos[:limit - counted]:
            count += video["statistics"]["digg_count"]
            counted += 1
        if counted >= limit:
            break
    return count


def plot3():
    """Print and plot the total like count of every channel in ``tag_ids``.

    Calls ``digg_count`` once per channel, prints the per-channel total and
    shows a bar chart with one bar per channel.
    """
    x_label = []
    y = []
    for tag, tag_url in tag_ids.items():
        _count = digg_count(tag_url)
        x_label.append(tag)
        y.append(_count)
        print("分区:{},视频点赞数总和:{}".format(tag, _count))

    # Bar positions follow the number of channels actually collected; a fixed list
    # of 8 positions makes plt.bar() fail as soon as tag_ids gains or loses an entry.
    x = list(range(1, len(x_label) + 1))
    # Fewer colors than bars: the colors repeat across the bars.
    color = ['red', 'peru', 'orchid', 'deepskyblue']
    plt.figure(figsize=(20, 12))
    plt.xticks(x, x_label)
    # Put the y ticks exactly at the bar heights.
    plt.yticks(y)
    plt.bar(x, y, color=color)
    plt.grid(True, linestyle=':', color='r')
    # NOTE(review): the title says 9 videos, but digg_count() sums 10 on its main
    # path — confirm which is intended.
    plt.title("Likes for 9 videos")
    plt.show()


# Draw the like-count chart, then wait for every video download thread to finish.
plot3()
for worker in threads:
    print("等待{}线程结束".format(worker.name))
    worker.join()
    print("{}线程结束".format(worker.name))