fix: Tieba creator bug fix

Relakkes 2025-01-02 20:29:05 +08:00
parent 7ce1273386
commit fbbead814a
3 changed files with 55 additions and 6 deletions

View File

@@ -301,7 +301,7 @@ class BaiduTieBaClient(AbstractApiClient):
         page_content = await self.get(uri, return_ori_content=True)
         return self._page_extractor.extract_tieba_note_list(page_content)

-    async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
+    async def get_creator_info_by_url(self, creator_url: str) -> str:
         """
         Get creator info by creator ID
         Args:
@@ -311,7 +311,7 @@ class BaiduTieBaClient(AbstractApiClient):
         """
         page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
-        return self._page_extractor.extract_creator_info(page_content)
+        return page_content

     async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
         """
@@ -335,7 +335,10 @@
     async def get_all_notes_by_creator_user_name(self,
                                                  user_name: str, crawl_interval: float = 1.0,
                                                  callback: Optional[Callable] = None,
-                                                 max_note_count: int = 0) -> List[TiebaNote]:
+                                                 max_note_count: int = 0,
+                                                 creator_page_html_content: str = None,
+                                                 ) -> List[TiebaNote]:
         """
         Get all of a creator's posts by user name
         Args:
@@ -343,11 +346,30 @@
             crawl_interval: delay between each note crawl
             callback: callback function invoked after each round of note crawling; must be an awaitable
             max_note_count: maximum number of posts to fetch; 0 fetches them all
+            creator_page_html_content: HTML content of the creator's homepage
         Returns:

         """
-        result = []
+        # Baidu Tieba is special: the first ~10 posts are shown directly on the homepage and cannot be fetched via the API, so they need separate handling
+        result: List[TiebaNote] = []
+        if creator_page_html_content:
+            thread_id_list = (
+                self._page_extractor.extract_tieba_thread_id_list_from_creator_page(
+                    creator_page_html_content
+                )
+            )
+            utils.logger.info(
+                f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}"
+            )
+            note_detail_task = [
+                self.get_note_by_id(thread_id) for thread_id in thread_id_list
+            ]
+            notes = await asyncio.gather(*note_detail_task)
+            if callback:
+                await callback(notes)
+            result.extend(notes)
+
         notes_has_more = 1
         page_number = 1
         page_per_count = 20
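
Taken together, the client changes decouple fetching from parsing: get_creator_info_by_url now returns the creator homepage HTML as a plain string, so one download can be parsed twice, once for the profile fields and once for the homepage-only posts. A minimal sketch of the new call order, assuming an already-initialized BaiduTieBaClient named client and a TieBaExtractor named extractor (both names are illustrative, not from this diff):

async def fetch_creator_and_notes(client, extractor, creator_url: str):
    # One network round-trip: the raw homepage HTML comes back as a str.
    html = await client.get_creator_info_by_url(creator_url=creator_url)
    # Parse the profile fields out of the HTML on the caller side...
    creator = extractor.extract_creator_info(html)
    # ...and hand the same HTML back to the client so it can pick up the
    # first posts, which only render on the homepage and never appear in
    # the paged API results.
    notes = await client.get_all_notes_by_creator_user_name(
        user_name=creator.user_name,
        crawl_interval=0,
        creator_page_html_content=html,
    )
    return creator, notes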

View File

@@ -29,6 +29,7 @@ from var import crawler_type_var, source_keyword_var
 from .client import BaiduTieBaClient
 from .field import SearchNoteType, SearchSortType
+from .help import TieBaExtractor
 from .login import BaiduTieBaLogin

@@ -40,6 +41,7 @@ class TieBaCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://tieba.baidu.com"
         self.user_agent = utils.get_user_agent()
+        self._page_extractor = TieBaExtractor()

     async def start(self) -> None:
         """
@@ -238,7 +240,8 @@
         """
         utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
         for creator_url in config.TIEBA_CREATOR_URL_LIST:
-            creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
+            creator_page_html_content = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
+            creator_info: TiebaCreator = self._page_extractor.extract_creator_info(creator_page_html_content)
             if creator_info:
                 utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
             if not creator_info:

@@ -251,7 +254,8 @@
                     user_name=creator_info.user_name,
                     crawl_interval=0,
                     callback=tieba_store.batch_update_tieba_notes,
-                    max_note_count=config.CRAWLER_MAX_NOTES_COUNT
+                    max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
+                    creator_page_html_content=creator_page_html_content,
                 )
                 await self.batch_get_note_comments(all_notes_list)
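
Per the client docstring, the callback wired in here (tieba_store.batch_update_tieba_notes) must be awaitable and receives each crawled batch of notes, including the homepage batch added by this fix. A hypothetical stand-in showing the expected shape:

from typing import List

async def log_notes_batch(notes: List["TiebaNote"]) -> None:
    # Awaited by the client once per batch before it requests the next page.
    for note in notes:
        print(note)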

View File

@@ -237,6 +237,29 @@ class TieBaExtractor:
             registration_duration=self.extract_registration_duration(user_content)
         )

+    @staticmethod
+    def extract_tieba_thread_id_list_from_creator_page(
+        html_content: str
+    ) -> List[str]:
+        """
+        Extract the thread ID list from a Tieba creator's homepage
+        Args:
+            html_content:
+        Returns:
+
+        """
+        selector = Selector(text=html_content)
+        thread_id_list = []
+        xpath_selector = (
+            "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
+        )
+        thread_url_list = selector.xpath(xpath_selector).getall()
+        for thread_url in thread_url_list:
+            thread_id = thread_url.split("?")[0].split("/")[-1]
+            thread_id_list.append(thread_id)
+        return thread_id_list
+
     def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
         """
         Extract the IP location and publish time
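
The new extractor boils down to one XPath over the homepage's thread list plus a two-step split that peels the thread ID off each href. A self-contained sanity check with parsel against a hand-written snippet (the markup below is illustrative; real Tieba homepages carry more attributes and nesting):

from parsel import Selector

html = """
<ul class="new_list clearfix">
  <li><div class="thread_name"><a href="/p/9001234567?fr=home">Post A</a></div></li>
  <li><div class="thread_name"><a href="/p/9001234568?fr=home">Post B</a></div></li>
</ul>
"""

selector = Selector(text=html)
hrefs = selector.xpath(
    "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
).getall()
# "/p/9001234567?fr=home" -> "/p/9001234567" -> "9001234567"
thread_ids = [href.split("?")[0].split("/")[-1] for href in hrefs]
print(thread_ids)  # ['9001234567', '9001234568']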