Add optional crawling of second-level (sub) comments for xhs.

Review notes:
- Line structure restored: one diff line per row.
- The core.py and xhs_store_impl.py hunks were dropped; they only removed the
  trailing newline at end of file.
- The `index` blob hashes predate the edits made to the added lines and are
  informational only.

diff --git a/config/base_config.py b/config/base_config.py
index 26761f8..7edee27 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -39,6 +39,10 @@ ENABLE_GET_IMAGES = False
 # 是否开启爬评论模式, 默认不开启爬评论
 ENABLE_GET_COMMENTS = False
 
+# Whether to crawl second-level (sub) comments; disabled by default. Currently only supported for xhs.
+# Projects already storing to a db must add the new column first: see the ALTER TABLE on xhs_note_comment in schema/tables.sql.
+ENABLE_GET_SUB_COMMENTS = False
+
 # 指定小红书需要爬虫的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
     "6422c2750000000027000d88",
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index 6b8a393..b682b11 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -7,6 +7,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page
 
+import config
 from base.base_crawler import AbstactApiClient
 from tools import utils
 
@@ -225,7 +226,7 @@ class XiaoHongShuClient(AbstactApiClient):
         }
         return await self.get(uri, params)
 
-    async def get_note_sub_comments(self, note_id: str, root_comment_id: str, num: int = 30, cursor: str = ""):
+    async def get_note_sub_comments(self, note_id: str, root_comment_id: str, num: int = 10, cursor: str = ""):
         """
         获取指定父评论下的子评论的API
         Args:
@@ -274,6 +275,59 @@ class XiaoHongShuClient(AbstactApiClient):
                 await callback(note_id, comments)
             await asyncio.sleep(crawl_interval)
             result.extend(comments)
+            sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback)
+            result.extend(sub_comments)
+        return result
+
+    async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
+                                            callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Fetch all second-level (sub) comments under the given first-level comments.
+
+        Sub-comments embedded in a first-level comment are only passed to callback;
+        further pages are requested until the API reports no more data.
+
+        Args:
+            comments: first-level comments to look up sub-comments for
+            crawl_interval: delay in seconds after each fetched page of sub-comments
+            callback: optional coroutine awaited as callback(note_id, sub_comments)
+
+        Returns:
+            The sub-comments fetched page by page; an empty list when
+            config.ENABLE_GET_SUB_COMMENTS is disabled.
+        """
+        if not config.ENABLE_GET_SUB_COMMENTS:
+            utils.logger.info("[XiaoHongShuClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
+            return []
+
+        result = []
+        for comment in comments:
+            note_id = comment.get("note_id")
+            sub_comments = comment.get("sub_comments")
+            if sub_comments and callback:
+                await callback(note_id, sub_comments)
+
+            sub_comment_has_more = comment.get("sub_comment_has_more")
+            if not sub_comment_has_more:
+                continue
+
+            root_comment_id = comment.get("id")
+            sub_comment_cursor = comment.get("sub_comment_cursor")
+
+            while sub_comment_has_more:
+                comments_res = await self.get_note_sub_comments(note_id, root_comment_id, 10, sub_comment_cursor)
+                sub_comment_has_more = comments_res.get("has_more", False)
+                sub_comment_cursor = comments_res.get("cursor", "")
+                if "comments" not in comments_res:
+                    utils.logger.info(
+                        f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}")
+                    break
+                # Distinct name: do not rebind the `comments` list being iterated.
+                page_sub_comments = comments_res["comments"]
+                if callback:
+                    await callback(note_id, page_sub_comments)
+                await asyncio.sleep(crawl_interval)
+                result.extend(page_sub_comments)
         return result
 
     async def get_creator_info(self, user_id: str) -> Dict:
diff --git a/schema/tables.sql b/schema/tables.sql
index 5ba45e5..007a55e 100644
--- a/schema/tables.sql
+++ b/schema/tables.sql
@@ -281,4 +281,11 @@ CREATE TABLE `xhs_note_comment` (
     KEY `idx_xhs_note_co_create__204f8d` (`create_time`)
 ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记评论';
 
+-- ----------------------------
+-- alter table xhs_note_comment to support parent_comment_id
+-- ----------------------------
+ALTER TABLE `xhs_note_comment`
+ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+
+
 SET FOREIGN_KEY_CHECKS = 1;
diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py
index f10d0ef..ab13482 100644
--- a/store/xhs/__init__.py
+++ b/store/xhs/__init__.py
@@ -74,6 +74,8 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
     user_info = comment_item.get("user_info", {})
     comment_id = comment_item.get("id")
     comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
+    # `or {}` also covers an explicit null target_comment, which a .get() default would not.
+    target_comment = comment_item.get("target_comment") or {}
     local_db_item = {
         "comment_id": comment_id,
         "create_time": comment_item.get("create_time"),
@@ -83,8 +85,9 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         "user_id": user_info.get("user_id"),
         "nickname": user_info.get("nickname"),
         "avatar": user_info.get("image"),
-        "sub_comment_count": comment_item.get("sub_comment_count"),
+        "sub_comment_count": comment_item.get("sub_comment_count", 0),
         "pictures": ",".join(comment_pictures),
+        "parent_comment_id": target_comment.get("id", 0),
         "last_modify_ts": utils.get_current_timestamp(),
     }
     utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}")