From fbbead814a88d8390584464a1729632bcc78ea7f Mon Sep 17 00:00:00 2001
From: Relakkes <relakkes@gmail.com>
Date: Thu, 2 Jan 2025 20:29:05 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=B4=B4=E5=90=A7=E5=88=9B=E4=BD=9C?=
 =?UTF-8?q?=E8=80=85bug=E4=BF=AE=E5=A4=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 media_platform/tieba/client.py | 30 ++++++++++++++++++++++++++----
 media_platform/tieba/core.py   |  8 ++++++--
 media_platform/tieba/help.py   | 23 +++++++++++++++++++++++
 3 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py
index 1adbbe4..39e0d2d 100644
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -301,7 +301,7 @@ class BaiduTieBaClient(AbstractApiClient):
         page_content = await self.get(uri, return_ori_content=True)
         return self._page_extractor.extract_tieba_note_list(page_content)
 
-    async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
+    async def get_creator_info_by_url(self, creator_url: str) -> str:
         """
         根据创作者ID获取创作者信息
         Args:
@@ -311,7 +311,7 @@ class BaiduTieBaClient(AbstractApiClient):
 
         """
         page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
-        return self._page_extractor.extract_creator_info(page_content)
+        return page_content
 
     async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
         """
@@ -335,7 +335,10 @@ class BaiduTieBaClient(AbstractApiClient):
     async def get_all_notes_by_creator_user_name(self,
                                                  user_name: str, crawl_interval: float = 1.0,
                                                  callback: Optional[Callable] = None,
-                                                 max_note_count: int = 0) -> List[TiebaNote]:
+                                                 max_note_count: int = 0,
+                                                 creator_page_html_content: str = None,
+                                                 ) -> List[TiebaNote]:
+
         """
         根据创作者用户名获取创作者所有帖子
         Args:
@@ -343,11 +346,30 @@ class BaiduTieBaClient(AbstractApiClient):
             crawl_interval: 爬取一次笔记的延迟单位（秒）
             callback: 一次笔记爬取结束后的回调函数，是一个awaitable类型的函数
             max_note_count: 帖子最大获取数量，如果为0则获取所有
+            creator_page_html_content: 创作者主页HTML内容
 
         Returns:
 
         """
-        result = []
+        # 百度贴吧比较特殊一些，前10个帖子是直接展示在主页上的，要单独处理，通过API获取不到
+        result: List[TiebaNote] = []
+        if creator_page_html_content:
+            thread_id_list = (
+                self._page_extractor.extract_tieba_thread_id_list_from_creator_page(
+                    creator_page_html_content
+                )
+            )
+            utils.logger.info(
+                f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}"
+            )
+            note_detail_task = [
+                self.get_note_by_id(thread_id) for thread_id in thread_id_list
+            ]
+            notes = await asyncio.gather(*note_detail_task)
+            if callback:
+                await callback(notes)
+            result.extend(notes)
+
         notes_has_more = 1
         page_number = 1
         page_per_count = 20
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index 567a1fa..e83a1eb 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -29,6 +29,7 @@ from var import crawler_type_var, source_keyword_var
 
 from .client import BaiduTieBaClient
 from .field import SearchNoteType, SearchSortType
+from .help import TieBaExtractor
 from .login import BaiduTieBaLogin
 
 
@@ -40,6 +41,7 @@ class TieBaCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://tieba.baidu.com"
         self.user_agent = utils.get_user_agent()
+        self._page_extractor = TieBaExtractor()
 
     async def start(self) -> None:
         """
@@ -238,7 +240,8 @@ class TieBaCrawler(AbstractCrawler):
         """
         utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
         for creator_url in config.TIEBA_CREATOR_URL_LIST:
-            creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
+            creator_page_html_content = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
+            creator_info: TiebaCreator = self._page_extractor.extract_creator_info(creator_page_html_content)
             if creator_info:
                 utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
                 if not creator_info:
@@ -251,7 +254,8 @@ class TieBaCrawler(AbstractCrawler):
                     user_name=creator_info.user_name,
                     crawl_interval=0,
                     callback=tieba_store.batch_update_tieba_notes,
-                    max_note_count=config.CRAWLER_MAX_NOTES_COUNT
+                    max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
+                    creator_page_html_content=creator_page_html_content,
                 )
 
                 await self.batch_get_note_comments(all_notes_list)
diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py
index 0d2b9e2..539ed11 100644
--- a/media_platform/tieba/help.py
+++ b/media_platform/tieba/help.py
@@ -237,6 +237,29 @@ class TieBaExtractor:
                             registration_duration=self.extract_registration_duration(user_content)
                             )
 
+    @staticmethod
+    def extract_tieba_thread_id_list_from_creator_page(
+        html_content: str
+    ) -> List[str]:
+        """
+        Extract the thread IDs of the threads linked on a Tieba creator's home page.
+        Args:
+            html_content: HTML text of the creator's home page.
+
+        Returns:
+            One ID per matched thread link: the href's last path segment, query string removed.
+        """
+        selector = Selector(text=html_content)
+        thread_id_list = []
+        xpath_selector = (
+            "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
+        )
+        thread_url_list = selector.xpath(xpath_selector).getall()
+        for thread_url in thread_url_list:
+            thread_id = thread_url.split("?")[0].split("/")[-1]  # drop "?query", keep the last "/" segment
+            thread_id_list.append(thread_id)
+        return thread_id_list
+
     def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
         """
         提取IP位置和发布时间