diff --git a/config/base_config.py b/config/base_config.py
index 78a3bb5..3cece3d 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -57,6 +57,17 @@ START_PAGE = 1
 # Limit on the number of videos/posts to crawl
 CRAWLER_MAX_NOTES_COUNT = 200
 
+# Crawl start date, bilibili keyword search only, YYYY-MM-DD format; if None, no time range is applied and the default keyword search returns at most 1,000 videos
+START_DAY = '2024-01-01'
+
+# Crawl end date, bilibili keyword search only, YYYY-MM-DD format; if None, no time range is applied and the default keyword search returns at most 1,000 videos
+END_DAY = '2024-01-01'
+
+# Whether to enable day-by-day crawling, bilibili keyword search only
+# If False, the values of START_DAY and END_DAY are ignored
+# If True, filter day by day from START_DAY to END_DAY; this bypasses the 1,000-video cap and crawls the keyword's videos as completely as possible
+ALL_DAY = True
+
 # Limit on the number of concurrent crawlers
 MAX_CONCURRENCY_NUM = 1
 
@@ -69,7 +80,6 @@ ENABLE_GET_COMMENTS = True
 # Limit on the number of first-level comments crawled (per video/post)
 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
 
-
 # Whether to crawl second-level comments (disabled by default)
 # Projects still on the old db schema need to add a table column, see schema/tables.sql line 287
 ENABLE_GET_SUB_COMMENTS = False
@@ -87,7 +97,6 @@ XHS_SPECIFIED_NOTE_URL_LIST = [
     # ........................
 ]
 
-
 # Specified list of Douyin IDs to crawl
 DY_SPECIFIED_ID_LIST = [
     "7280854932641664319",
@@ -126,6 +135,7 @@ TIEBA_NAME_LIST = [
     # "盗墓笔记"
 ]
 
+# Specified list of Tieba creator URLs
 TIEBA_CREATOR_URL_LIST = [
     "https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
     # ........................
diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index c8338c8..37c087b 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -147,8 +147,8 @@ class BilibiliClient(AbstractApiClient):
             "page": page,
             "page_size": page_size,
             "order": order.value,
-            "pubtime_begin": pubtime_begin_s,
-            "pubtime_end": pubtime_end_s
+            "pubtime_begin_s": pubtime_begin_s,
+            "pubtime_end_s": pubtime_end_s
         }
         return await self.get(uri, post_data)
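
Note on the client.py hunk above: the search endpoint expects the publish-date filters under the keys `pubtime_begin_s` / `pubtime_end_s`, carrying Unix timestamps in seconds, which is what the rename fixes. A minimal standalone sketch of the conversion these fields expect (the printed values assume the machine's local timezone is UTC+8, matching the docstring examples in core.py below):

```python
# Sketch only: map a YYYY-MM-DD day to the epoch-second values carried by the
# renamed pubtime_begin_s / pubtime_end_s request fields.
from datetime import datetime, timedelta

day = datetime.strptime("2024-01-05", "%Y-%m-%d")
begin_s = int(day.timestamp())                                  # first second of the day
end_s = int((day + timedelta(days=1, seconds=-1)).timestamp())  # last second of the day
print(begin_s, end_s)  # 1704384000 1704470399 when the local timezone is UTC+8
```
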
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index e698bbd..4c2305b 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -19,9 +19,10 @@ import os
 import random
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
+from datetime import datetime, timedelta
+import pandas as pd
 
-from playwright.async_api import (BrowserContext, BrowserType, Page,
-                                  async_playwright)
+from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright)
 
 import config
 from base.base_crawler import AbstractCrawler
@@ -95,56 +96,122 @@ class BilibiliCrawler(AbstractCrawler):
         utils.logger.info(
             "[BilibiliCrawler.start] Bilibili Crawler finished ...")
 
+    async def get_pubtime_datetime(self, start: str = config.START_DAY, end: str = config.END_DAY) -> Tuple[str, str]:
+        """
+        Build the publish-date range timestamps pubtime_begin_s and pubtime_end_s for bilibili search
+        ---
+        :param start: publish-date range start, YYYY-MM-DD
+        :param end: publish-date range end, YYYY-MM-DD
+
+        Note
+        ---
+        - The search range runs from start through end, inclusive of both days
+        - When searching a single day, pubtime_end_s must be pubtime_begin_s plus one day minus one second, i.e. the last second of the start day, so the whole start day is covered
+          e.g. searching only 2024-01-05: pubtime_begin_s = 1704384000, pubtime_end_s = 1704470399
+          as readable datetimes: pubtime_begin_s = datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime(2024, 1, 5, 23, 59, 59)
+        - When searching start through end, pubtime_end_s must be end plus one day minus one second, i.e. the last second of the end day, so the whole end day is covered
+          e.g. searching 2024-01-05 to 2024-01-06: pubtime_begin_s = 1704384000, pubtime_end_s = 1704556799
+          as readable datetimes: pubtime_begin_s = datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime(2024, 1, 6, 23, 59, 59)
+        (The example timestamps assume the local timezone is UTC+8.)
+        """
+        # convert start and end to datetime objects
+        start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
+        end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
+        if start_day > end_day:
+            raise ValueError('Invalid time range: start must not be later than end')
+        elif start_day == end_day:  # searching a single day
+            end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # end_day = start_day + 1 day - 1 second
+        else:  # searching start through end
+            end_day = end_day + timedelta(days=1) - timedelta(seconds=1)  # end_day = end_day + 1 day - 1 second
+        # convert both back to Unix timestamps (seconds)
+        return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
+
     async def search(self):
         """
         search bilibili video with keywords
         :return:
         """
-        utils.logger.info(
-            "[BilibiliCrawler.search] Begin search bilibli keywords")
+        utils.logger.info("[BilibiliCrawler.search] Begin search bilibili keywords")
         bili_limit_count = 20  # bilibili limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
         start_page = config.START_PAGE  # start page number
         for keyword in config.KEYWORDS.split(","):
             source_keyword_var.set(keyword)
-            utils.logger.info(
-                f"[BilibiliCrawler.search] Current search keyword: {keyword}")
-            page = 1
-            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                if page < start_page:
-                    utils.logger.info(
-                        f"[BilibiliCrawler.search] Skip page: {page}")
-                    page += 1
-                    continue
-
-                utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
-                video_id_list: List[str] = []
-                videos_res = await self.bili_client.search_video_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    page_size=bili_limit_count,
-                    order=SearchOrderType.DEFAULT,
-                    pubtime_begin_s=0,  # publish-date range start timestamp
-                    pubtime_end_s=0  # publish-date range end timestamp
-                )
-                video_list: List[Dict] = videos_res.get("result")
-
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_video_info_task(aid=video_item.get(
-                        "aid"), bvid="", semaphore=semaphore)
-                    for video_item in video_list
-                ]
-                video_items = await asyncio.gather(*task_list)
-                for video_item in video_items:
-                    if video_item:
-                        video_id_list.append(video_item.get("View").get("aid"))
-                        await bilibili_store.update_bilibili_video(video_item)
-                        await bilibili_store.update_up_info(video_item)
-                        await self.get_bilibili_video(video_item, semaphore)
-                page += 1
-                await self.batch_get_video_comments(video_id_list)
+            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
+            # by default each keyword search returns at most 1,000 results
+            if not config.ALL_DAY:
+                page = 1
+                while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                    if page < start_page:
+                        utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                        page += 1
+                        continue
+
+                    utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
+                    video_id_list: List[str] = []
+                    videos_res = await self.bili_client.search_video_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        page_size=bili_limit_count,
+                        order=SearchOrderType.DEFAULT,
+                        pubtime_begin_s=0,  # publish-date range start timestamp, 0 means unbounded
+                        pubtime_end_s=0  # publish-date range end timestamp, 0 means unbounded
+                    )
+                    video_list: List[Dict] = videos_res.get("result")
+
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                    video_items = await asyncio.gather(*task_list)
+                    for video_item in video_items:
+                        if video_item:
+                            video_id_list.append(video_item.get("View").get("aid"))
+                            await bilibili_store.update_bilibili_video(video_item)
+                            await bilibili_store.update_up_info(video_item)
+                            await self.get_bilibili_video(video_item, semaphore)
+                    page += 1
+                    await self.batch_get_video_comments(video_id_list)
+            # filter day by day from START_DAY to END_DAY; this bypasses the 1,000-video cap and crawls the keyword's videos as completely as possible
+            else:
+                for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
+                    # timestamp parameters for crawling this single day
+                    pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
+                    page = 1
+                    while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                        # ! catch any error when the response returns nothing, then move on to the next day
+                        try:
+                            # ! don't skip any page, to make sure every video of the day is gathered
+                            # if page < start_page:
+                            #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                            #     page += 1
+                            #     continue
+
+                            utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                            video_id_list: List[str] = []
+                            videos_res = await self.bili_client.search_video_by_keyword(
+                                keyword=keyword,
+                                page=page,
+                                page_size=bili_limit_count,
+                                order=SearchOrderType.DEFAULT,
+                                pubtime_begin_s=pubtime_begin_s,  # publish-date range start timestamp
+                                pubtime_end_s=pubtime_end_s  # publish-date range end timestamp
+                            )
+                            video_list: List[Dict] = videos_res.get("result")
+
+                            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                            video_items = await asyncio.gather(*task_list)
+                            for video_item in video_items:
+                                if video_item:
+                                    video_id_list.append(video_item.get("View").get("aid"))
+                                    await bilibili_store.update_bilibili_video(video_item)
+                                    await bilibili_store.update_up_info(video_item)
+                                    await self.get_bilibili_video(video_item, semaphore)
+                            page += 1
+                            await self.batch_get_video_comments(video_id_list)
+                        # move on to the next day
+                        except Exception as e:
+                            utils.logger.error(f"[BilibiliCrawler.search] Search error on {day.date()}, keyword: {keyword}: {e}")
+                            break
 
     async def batch_get_video_comments(self, video_id_list: List[str]):
         """
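
For reference, a quick usage sketch of the new `get_pubtime_datetime` helper, mirroring the two cases its docstring describes (not part of the diff; `crawler` stands for an already-constructed `BilibiliCrawler`, and the expected strings assume UTC+8):

```python
# Hypothetical demo, not part of the PR.
async def demo(crawler) -> None:
    # single day: covers 2024-01-05 00:00:00 through 23:59:59
    begin, end = await crawler.get_pubtime_datetime(start="2024-01-05", end="2024-01-05")
    assert (begin, end) == ("1704384000", "1704470399")

    # multi-day: the whole end day 2024-01-06 is included
    begin, end = await crawler.get_pubtime_datetime(start="2024-01-05", end="2024-01-06")
    assert (begin, end) == ("1704384000", "1704556799")

    # reversed bounds raise ValueError
    try:
        await crawler.get_pubtime_datetime(start="2024-01-06", end="2024-01-05")
    except ValueError:
        pass
```
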
diff --git a/requirements.txt b/requirements.txt
index 83a9c83..8e40ab5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,5 @@ wordcloud==1.9.3
 matplotlib==3.9.0
 requests==2.32.3
 parsel==1.9.1
-pyexecjs==1.5.1
\ No newline at end of file
+pyexecjs==1.5.1
+pandas==2.2.3
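
The new pandas pin exists solely for `pd.date_range`, which drives the inclusive day-by-day loop in `BilibiliCrawler.search`. A small sketch of what it yields, with a stdlib-only equivalent in case the dependency is ever considered too heavy for this one call:

```python
import pandas as pd
from datetime import date, timedelta

# pandas: one Timestamp per calendar day, both endpoints included
days = pd.date_range(start="2024-01-01", end="2024-01-03", freq="D")
print([d.strftime("%Y-%m-%d") for d in days])  # ['2024-01-01', '2024-01-02', '2024-01-03']

# stdlib-only equivalent of the same inclusive range
start, end = date(2024, 1, 1), date(2024, 1, 3)
print([(start + timedelta(days=i)).isoformat() for i in range((end - start).days + 1)])
# ['2024-01-01', '2024-01-02', '2024-01-03']
```
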