From fbbead814a88d8390584464a1729632bcc78ea7f Mon Sep 17 00:00:00 2001 From: Relakkes Date: Thu, 2 Jan 2025 20:29:05 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E8=B4=B4=E5=90=A7=E5=88=9B=E4=BD=9C?= =?UTF-8?q?=E8=80=85bug=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/tieba/client.py | 30 ++++++++++++++++++++++++++---- media_platform/tieba/core.py | 8 ++++++-- media_platform/tieba/help.py | 23 +++++++++++++++++++++++ 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 1adbbe4..39e0d2d 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -301,7 +301,7 @@ class BaiduTieBaClient(AbstractApiClient): page_content = await self.get(uri, return_ori_content=True) return self._page_extractor.extract_tieba_note_list(page_content) - async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator: + async def get_creator_info_by_url(self, creator_url: str) -> str: """ 根据创作者ID获取创作者信息 Args: @@ -311,7 +311,7 @@ class BaiduTieBaClient(AbstractApiClient): """ page_content = await self.request(method="GET", url=creator_url, return_ori_content=True) - return self._page_extractor.extract_creator_info(page_content) + return page_content async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict: """ @@ -335,7 +335,10 @@ class BaiduTieBaClient(AbstractApiClient): async def get_all_notes_by_creator_user_name(self, user_name: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, - max_note_count: int = 0) -> List[TiebaNote]: + max_note_count: int = 0, + creator_page_html_content: str = None, + ) -> List[TiebaNote]: + """ 根据创作者用户名获取创作者所有帖子 Args: @@ -343,11 +346,30 @@ class BaiduTieBaClient(AbstractApiClient): crawl_interval: 爬取一次笔记的延迟单位(秒) callback: 一次笔记爬取结束后的回调函数,是一个awaitable类型的函数 max_note_count: 帖子最大获取数量,如果为0则获取所有 + creator_page_html_content: 创作者主页HTML内容 
Returns: """ - result = [] + # 百度贴吧比较特殊一些,前10个帖子是直接展示在主页上的,要单独处理,通过API获取不到 + result: List[TiebaNote] = [] + if creator_page_html_content: + thread_id_list = ( + self._page_extractor.extract_tieba_thread_id_list_from_creator_page( + creator_page_html_content + ) + ) + utils.logger.info( + f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}" + ) + note_detail_task = [ + self.get_note_by_id(thread_id) for thread_id in thread_id_list + ] + notes = await asyncio.gather(*note_detail_task) + if callback: + await callback(notes) + result.extend(notes) + notes_has_more = 1 page_number = 1 page_per_count = 20 diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index 567a1fa..e83a1eb 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -29,6 +29,7 @@ from var import crawler_type_var, source_keyword_var from .client import BaiduTieBaClient from .field import SearchNoteType, SearchSortType +from .help import TieBaExtractor from .login import BaiduTieBaLogin @@ -40,6 +41,7 @@ class TieBaCrawler(AbstractCrawler): def __init__(self) -> None: self.index_url = "https://tieba.baidu.com" self.user_agent = utils.get_user_agent() + self._page_extractor = TieBaExtractor() async def start(self) -> None: """ @@ -238,7 +240,8 @@ class TieBaCrawler(AbstractCrawler): """ utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators") for creator_url in config.TIEBA_CREATOR_URL_LIST: - creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url) + creator_page_html_content = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url) + creator_info: TiebaCreator = self._page_extractor.extract_creator_info(creator_page_html_content) if creator_info: utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}") if not creator_info: @@ -251,7 +254,8 @@ class TieBaCrawler(AbstractCrawler): 
user_name=creator_info.user_name, crawl_interval=0, callback=tieba_store.batch_update_tieba_notes, - max_note_count=config.CRAWLER_MAX_NOTES_COUNT + max_note_count=config.CRAWLER_MAX_NOTES_COUNT, + creator_page_html_content=creator_page_html_content, ) await self.batch_get_note_comments(all_notes_list) diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 0d2b9e2..539ed11 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -237,6 +237,29 @@ class TieBaExtractor: registration_duration=self.extract_registration_duration(user_content) ) + @staticmethod + def extract_tieba_thread_id_list_from_creator_page( + html_content: str + ) -> List[str]: + """ + Extract the thread IDs listed on a Tieba creator's home page. + Args: + html_content: Raw HTML of the creator's home page. + + Returns: + List of thread IDs, each the last path segment of a thread link with any query string removed. + """ + selector = Selector(text=html_content) + thread_id_list = [] + xpath_selector = ( + "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href" + ) + thread_url_list = selector.xpath(xpath_selector).getall() + for thread_url in thread_url_list: + thread_id = thread_url.split("?")[0].split("/")[-1] + thread_id_list.append(thread_id) + return thread_id_list + def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]: """ 提取IP位置和发布时间