mirror of
https://github.com/RYDE-WORK/MediaCrawler.git
synced 2026-02-06 08:53:21 +08:00
Update core.py,删除了其它代码贡献者所添加的try-catch语句,该段try-catch语句将会影响其代码的最终逻辑并令其失效,使其仅能爬取当天一天数据而无法跳转到下一天(原先的逻辑就是try-catch捕获异常从而进入下一天,不要再向该语句中添加捕获异常操作或者finally语句!)
This commit is contained in:
parent
b675547aab
commit
af5a393a7a
@ -164,10 +164,8 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
task_list = []
|
task_list = []
|
||||||
try:
|
try:
|
||||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
||||||
except Exception as e :
|
except Exception as e:
|
||||||
utils.logger.warning(
|
utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
|
||||||
f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}"
|
|
||||||
)
|
|
||||||
video_items = await asyncio.gather(*task_list)
|
video_items = await asyncio.gather(*task_list)
|
||||||
for video_item in video_items:
|
for video_item in video_items:
|
||||||
if video_item:
|
if video_item:
|
||||||
@ -177,16 +175,19 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
await self.get_bilibili_video(video_item, semaphore)
|
await self.get_bilibili_video(video_item, semaphore)
|
||||||
page += 1
|
page += 1
|
||||||
await self.batch_get_video_comments(video_id_list)
|
await self.batch_get_video_comments(video_id_list)
|
||||||
# 按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
|
# 按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下每一天的所有视频
|
||||||
else:
|
else:
|
||||||
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
|
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
|
||||||
# 按照每一天进行爬取的时间戳参数
|
# 按照每一天进行爬取的时间戳参数
|
||||||
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
|
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
|
||||||
page = 1
|
page = 1
|
||||||
|
#!该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
|
||||||
|
#!除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
|
||||||
|
#!除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
|
||||||
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
# ! Catch any error raised when the response returns nothing, then move on to the next day
|
#! Catch any error raised when the response returns nothing, then move on to the next day
|
||||||
try:
|
try:
|
||||||
# ! Don't skip any page, to make sure all videos from that day are gathered
|
#! Don't skip any page, to make sure all videos from that day are gathered
|
||||||
# if page < start_page:
|
# if page < start_page:
|
||||||
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
|
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
|
||||||
# page += 1
|
# page += 1
|
||||||
@ -205,11 +206,7 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
video_list: List[Dict] = videos_res.get("result")
|
video_list: List[Dict] = videos_res.get("result")
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = []
|
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
||||||
try:
|
|
||||||
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
|
|
||||||
finally:
|
|
||||||
pass
|
|
||||||
video_items = await asyncio.gather(*task_list)
|
video_items = await asyncio.gather(*task_list)
|
||||||
for video_item in video_items:
|
for video_item in video_items:
|
||||||
if video_item:
|
if video_item:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user