mirror of
https://github.com/RYDE-WORK/MediaCrawler.git
synced 2026-02-05 16:36:44 +08:00
feat: 支持评论模式是否开启爬取选项
This commit is contained in:
parent
2d12ecb930
commit
59cd9f67a0
@ -60,6 +60,7 @@
|
|||||||
### 运行爬虫程序
|
### 运行爬虫程序
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
# 默认没有开启评论爬取模式,有需要请到配置文件中指定
|
||||||
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
|
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
|
||||||
python main.py --platform xhs --lt qrcode --type search
|
python main.py --platform xhs --lt qrcode --type search
|
||||||
|
|
||||||
|
|||||||
@ -30,12 +30,8 @@ CRAWLER_MAX_NOTES_COUNT = 20
|
|||||||
# 并发爬虫数量控制
|
# 并发爬虫数量控制
|
||||||
MAX_CONCURRENCY_NUM = 4
|
MAX_CONCURRENCY_NUM = 4
|
||||||
|
|
||||||
|
# 是否开启爬评论模式, 默认不开启爬评论
|
||||||
# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
|
ENABLE_GET_COMMENTS = False
|
||||||
COMMENT_KEYWORDS = [
|
|
||||||
# "真棒"
|
|
||||||
# ........................
|
|
||||||
]
|
|
||||||
|
|
||||||
# 指定小红书需要爬虫的笔记ID列表
|
# 指定小红书需要爬虫的笔记ID列表
|
||||||
XHS_SPECIFIED_ID_LIST = [
|
XHS_SPECIFIED_ID_LIST = [
|
||||||
|
|||||||
@ -127,6 +127,10 @@ class BilibiliCrawler(AbstractCrawler):
|
|||||||
:param video_id_list:
|
:param video_id_list:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
if not config.ENABLE_GET_COMMENTS:
|
||||||
|
utils.logger.info(f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||||
|
return
|
||||||
|
|
||||||
utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
|
utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list: List[Task] = []
|
task_list: List[Task] = []
|
||||||
|
|||||||
@ -132,6 +132,10 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||||
|
if not config.ENABLE_GET_COMMENTS:
|
||||||
|
utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||||
|
return
|
||||||
|
|
||||||
task_list: List[Task] = []
|
task_list: List[Task] = []
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
for aweme_id in aweme_list:
|
for aweme_id in aweme_list:
|
||||||
@ -145,7 +149,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
async with semaphore:
|
async with semaphore:
|
||||||
try:
|
try:
|
||||||
# 将关键词列表传递给 get_aweme_all_comments 方法
|
# 将关键词列表传递给 get_aweme_all_comments 方法
|
||||||
comments = await self.dy_client.get_aweme_all_comments(
|
await self.dy_client.get_aweme_all_comments(
|
||||||
aweme_id=aweme_id,
|
aweme_id=aweme_id,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
callback=douyin_store.batch_update_dy_aweme_comments
|
callback=douyin_store.batch_update_dy_aweme_comments
|
||||||
|
|||||||
@ -144,6 +144,10 @@ class KuaishouCrawler(AbstractCrawler):
|
|||||||
:param video_id_list:
|
:param video_id_list:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
if not config.ENABLE_GET_COMMENTS:
|
||||||
|
utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||||
|
return
|
||||||
|
|
||||||
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
|
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list: List[Task] = []
|
task_list: List[Task] = []
|
||||||
|
|||||||
@ -166,6 +166,10 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
:param note_id_list:
|
:param note_id_list:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
if not config.ENABLE_GET_COMMENTS:
|
||||||
|
utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||||
|
return
|
||||||
|
|
||||||
utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
|
utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list: List[Task] = []
|
task_list: List[Task] = []
|
||||||
|
|||||||
@ -151,7 +151,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
# save creator info
|
# save creator info
|
||||||
await xhs_store.save_creator(creator, creator_and_notes_info.get('creator'))
|
await xhs_store.save_creator(creator, creator_and_notes_info.get('creator'))
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}")
|
utils.logger.info(
|
||||||
|
f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}")
|
||||||
else:
|
else:
|
||||||
# get notes
|
# get notes
|
||||||
notes = await self.xhs_client.get_notes_by_creator(creator, cursor)
|
notes = await self.xhs_client.get_notes_by_creator(creator, cursor)
|
||||||
@ -164,7 +165,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
cursor = notes.get('cursor')
|
cursor = notes.get('cursor')
|
||||||
has_more_notes = notes.get('has_more_notes')
|
has_more_notes = notes.get('has_more_notes')
|
||||||
notes_res = notes.get('notes')
|
notes_res = notes.get('notes')
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}")
|
utils.logger.info(
|
||||||
|
f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}")
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = [
|
task_list = [
|
||||||
@ -211,6 +213,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
async def batch_get_note_comments(self, note_list: List[str]):
|
async def batch_get_note_comments(self, note_list: List[str]):
|
||||||
"""Batch get note comments"""
|
"""Batch get note comments"""
|
||||||
|
if not config.ENABLE_GET_COMMENTS:
|
||||||
|
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||||
|
return
|
||||||
|
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
|
f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user