Merge pull request #538 from 2513502304/main
feat: bilibili support date range filter
commit 4b63ea68ec
@@ -57,6 +57,17 @@ START_PAGE = 1
 # Limit on the number of videos/posts to crawl
 CRAWLER_MAX_NOTES_COUNT = 200
 
+# Crawl start date, only supported for bilibili keyword search, YYYY-MM-DD format; if None, no date range is set and the default of at most 1000 returned videos per keyword applies
+START_DAY = '2024-01-01'
+
+# Crawl end date, only supported for bilibili keyword search, YYYY-MM-DD format; if None, no date range is set and the default of at most 1000 returned videos per keyword applies
+END_DAY = '2024-01-01'
+
+# Whether to crawl day by day, only supported for bilibili keyword search
+# If False, the values of START_DAY and END_DAY are ignored
+# If True, filter day by day from START_DAY through END_DAY; this works around the 1000-video cap and crawls as many of the keyword's videos as possible
+ALL_DAY = True
+
 # Concurrency control for the crawler
 MAX_CONCURRENCY_NUM = 1
 
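Read together, ALL_DAY is the switch and START_DAY/END_DAY only take effect when it is True. A minimal pre-run sanity check for these settings, as a sketch (the validate_day_range helper is hypothetical and not part of this PR; config is the module edited in the hunk above):

    from datetime import datetime

    import config  # the module edited in the hunk above

    def validate_day_range() -> None:
        """Fail fast if START_DAY/END_DAY are inconsistent while ALL_DAY is on."""
        if not config.ALL_DAY:
            return  # START_DAY and END_DAY are ignored in this mode
        start = datetime.strptime(config.START_DAY, '%Y-%m-%d')
        end = datetime.strptime(config.END_DAY, '%Y-%m-%d')
        if start > end:
            raise ValueError(f'START_DAY {config.START_DAY} must not exceed END_DAY {config.END_DAY}')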
@@ -69,7 +80,6 @@ ENABLE_GET_COMMENTS = True
 # Limit on the number of first-level comments to crawl (per video/post)
 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
 
-
 # Whether to crawl second-level comments; off by default
 # If an older version of the project used the db, add the table fields per schema/tables.sql line 287
 ENABLE_GET_SUB_COMMENTS = False
@@ -87,7 +97,6 @@ XHS_SPECIFIED_NOTE_URL_LIST = [
     # ........................
 ]
 
-
 # List of Douyin IDs to crawl
 DY_SPECIFIED_ID_LIST = [
     "7280854932641664319",
@@ -126,6 +135,7 @@ TIEBA_NAME_LIST = [
     # "盗墓笔记"
 ]
 
+# List of Tieba creator URLs to crawl
 TIEBA_CREATOR_URL_LIST = [
     "https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
     # ........................
@@ -147,8 +147,8 @@ class BilibiliClient(AbstractApiClient):
             "page": page,
             "page_size": page_size,
             "order": order.value,
-            "pubtime_begin": pubtime_begin_s,
-            "pubtime_end": pubtime_end_s
+            "pubtime_begin_s": pubtime_begin_s,
+            "pubtime_end_s": pubtime_end_s
         }
         return await self.get(uri, post_data)
 
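The renamed keys are the substance of this hunk: the request body now uses the parameter names the bilibili search endpoint expects (pubtime_begin_s / pubtime_end_s), so the date filter actually reaches the API. A sketch of calling the patched method; the import paths and the demo wrapper are assumptions about the project layout, not code from this PR:

    from media_platform.bilibili.client import BilibiliClient
    from media_platform.bilibili.field import SearchOrderType

    async def demo(bili_client: BilibiliClient) -> None:
        # Timestamps are epoch-second strings, as produced by get_pubtime_datetime() below
        videos_res = await bili_client.search_video_by_keyword(
            keyword='python',
            page=1,
            page_size=20,
            order=SearchOrderType.DEFAULT,
            pubtime_begin_s='1704384000',  # 2024-01-05 00:00:00 (UTC+8)
            pubtime_end_s='1704470399',    # 2024-01-05 23:59:59 (UTC+8)
        )
        print(videos_res.get('result'))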
@@ -19,9 +19,10 @@ import os
 import random
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
+from datetime import datetime, timedelta
+import pandas as pd
 
-from playwright.async_api import (BrowserContext, BrowserType, Page,
-                                  async_playwright)
+from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright)
 
 import config
 from base.base_crawler import AbstractCrawler
@@ -95,56 +96,122 @@ class BilibiliCrawler(AbstractCrawler):
         utils.logger.info(
             "[BilibiliCrawler.start] Bilibili Crawler finished ...")
 
+    async def get_pubtime_datetime(self, start: str = config.START_DAY, end: str = config.END_DAY) -> tuple[str, str]:
+        """
+        Get the publish-date start timestamp pubtime_begin_s and end timestamp pubtime_end_s for bilibili works
+        ---
+        :param start: publish-date range start, YYYY-MM-DD
+        :param end: publish-date range end, YYYY-MM-DD
+
+        Note
+        ---
+        - The search range runs from start through end, inclusive of both
+        - To search a single day while including that day's content, pubtime_end_s should be pubtime_begin_s plus one day minus one second, i.e. the last second of the start day
+          e.g. searching only 2024-01-05: pubtime_begin_s = 1704384000, pubtime_end_s = 1704470399
+          as readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
+        - To search from start through end while including the end day's content, pubtime_end_s should be the end day's timestamp plus one day minus one second, i.e. the last second of the end day
+          e.g. searching 2024-01-05 - 2024-01-06: pubtime_begin_s = 1704384000, pubtime_end_s = 1704556799
+          as readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
+        """
+        # Convert start and end to datetime objects
+        start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
+        end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
+        if start_day > end_day:
+            raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
+        elif start_day == end_day:  # searching a single day
+            end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to start_day + 1 day - 1 second
+        else:  # searching from start through end
+            end_day = end_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to end_day + 1 day - 1 second
+        # Convert back to timestamps
+        return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
+
     async def search(self):
         """
         search bilibili video with keywords
         :return:
         """
-        utils.logger.info(
-            "[BilibiliCrawler.search] Begin search bilibli keywords")
+        utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
         bili_limit_count = 20  # bilibili limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
         start_page = config.START_PAGE  # start page number
         for keyword in config.KEYWORDS.split(","):
             source_keyword_var.set(keyword)
-            utils.logger.info(
-                f"[BilibiliCrawler.search] Current search keyword: {keyword}")
-            page = 1
-            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                if page < start_page:
-                    utils.logger.info(
-                        f"[BilibiliCrawler.search] Skip page: {page}")
-                    page += 1
-                    continue
-
-                utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
-                video_id_list: List[str] = []
-                videos_res = await self.bili_client.search_video_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    page_size=bili_limit_count,
-                    order=SearchOrderType.DEFAULT,
-                    pubtime_begin_s=0,  # publish-date start timestamp
-                    pubtime_end_s=0  # publish-date end timestamp
-                )
-                video_list: List[Dict] = videos_res.get("result")
-
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_video_info_task(aid=video_item.get(
-                        "aid"), bvid="", semaphore=semaphore)
-                    for video_item in video_list
-                ]
-                video_items = await asyncio.gather(*task_list)
-                for video_item in video_items:
-                    if video_item:
-                        video_id_list.append(video_item.get("View").get("aid"))
-                        await bilibili_store.update_bilibili_video(video_item)
-                        await bilibili_store.update_up_info(video_item)
-                        await self.get_bilibili_video(video_item, semaphore)
-                page += 1
-                await self.batch_get_video_comments(video_id_list)
+            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
+            # Each keyword returns at most 1000 results
+            if not config.ALL_DAY:
+                page = 1
+                while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                    if page < start_page:
+                        utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                        page += 1
+                        continue
+
+                    utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
+                    video_id_list: List[str] = []
+                    videos_res = await self.bili_client.search_video_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        page_size=bili_limit_count,
+                        order=SearchOrderType.DEFAULT,
+                        pubtime_begin_s=0,  # publish-date start timestamp
+                        pubtime_end_s=0  # publish-date end timestamp
+                    )
+                    video_list: List[Dict] = videos_res.get("result")
+
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                    video_items = await asyncio.gather(*task_list)
+                    for video_item in video_items:
+                        if video_item:
+                            video_id_list.append(video_item.get("View").get("aid"))
+                            await bilibili_store.update_bilibili_video(video_item)
+                            await bilibili_store.update_up_info(video_item)
+                            await self.get_bilibili_video(video_item, semaphore)
+                    page += 1
+                    await self.batch_get_video_comments(video_id_list)
+            # Filter day by day from START_DAY through END_DAY; this works around the 1000-video cap and crawls as many of the keyword's videos as possible
+            else:
+                for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
+                    # Timestamp parameters for crawling this single day
+                    pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
+                    page = 1
+                    while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                        # ! Catch any error if response return nothing, go to next day
+                        try:
+                            # ! Don't skip any page, to make sure gather all video in one day
+                            # if page < start_page:
+                            #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                            #     page += 1
+                            #     continue
+
+                            utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                            video_id_list: List[str] = []
+                            videos_res = await self.bili_client.search_video_by_keyword(
+                                keyword=keyword,
+                                page=page,
+                                page_size=bili_limit_count,
+                                order=SearchOrderType.DEFAULT,
+                                pubtime_begin_s=pubtime_begin_s,  # publish-date start timestamp
+                                pubtime_end_s=pubtime_end_s  # publish-date end timestamp
+                            )
+                            video_list: List[Dict] = videos_res.get("result")
+
+                            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                            video_items = await asyncio.gather(*task_list)
+                            for video_item in video_items:
+                                if video_item:
+                                    video_id_list.append(video_item.get("View").get("aid"))
+                                    await bilibili_store.update_bilibili_video(video_item)
+                                    await bilibili_store.update_up_info(video_item)
+                                    await self.get_bilibili_video(video_item, semaphore)
+                            page += 1
+                            await self.batch_get_video_comments(video_id_list)
+                        # go to next day
+                        except Exception as e:
+                            print(e)
+                            break
 
     async def batch_get_video_comments(self, video_id_list: List[str]):
         """
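Note that both non-error branches of get_pubtime_datetime reduce to the same formula (end day plus one day minus one second), and that datetime.timestamp() interprets naive datetimes in the host's local timezone, so the docstring's numbers hold on a UTC+8 machine. A standalone re-check of the math, assuming that timezone:

    from datetime import datetime, timedelta

    def pubtime_range(start: str, end: str) -> tuple[str, str]:
        # Same math as get_pubtime_datetime, with the two branches collapsed into one
        start_day = datetime.strptime(start, '%Y-%m-%d')
        end_day = datetime.strptime(end, '%Y-%m-%d')
        if start_day > end_day:
            raise ValueError('start cannot exceed end')
        # Push end_day to the last second of its day so the range is inclusive
        end_day = end_day + timedelta(days=1) - timedelta(seconds=1)
        return str(int(start_day.timestamp())), str(int(end_day.timestamp()))

    print(pubtime_range('2024-01-05', '2024-01-05'))  # ('1704384000', '1704470399') on a UTC+8 host
    print(pubtime_range('2024-01-05', '2024-01-06'))  # ('1704384000', '1704556799') on a UTC+8 host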
@@ -16,3 +16,4 @@ matplotlib==3.9.0
 requests==2.32.3
 parsel==1.9.1
 pyexecjs==1.5.1
+pandas==2.2.3
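pandas enters the dependency list only for pd.date_range in the day-by-day loop above. If the extra dependency were unwanted, the same daily sequence could be built from the standard library, e.g.:

    from datetime import date, timedelta

    start, end = date(2024, 1, 1), date(2024, 1, 3)
    days = [start + timedelta(days=i) for i in range((end - start).days + 1)]
    print([d.isoformat() for d in days])  # ['2024-01-01', '2024-01-02', '2024-01-03']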