Mirror of https://github.com/RYDE-WORK/MediaCrawler.git
Synced 2026-02-05 16:36:44 +08:00
fix: fix a bug in the Tieba creator crawler
This commit is contained in:
parent 7ce1273386
commit fbbead814a
@@ -301,7 +301,7 @@ class BaiduTieBaClient(AbstractApiClient):
         page_content = await self.get(uri, return_ori_content=True)
         return self._page_extractor.extract_tieba_note_list(page_content)

-    async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
+    async def get_creator_info_by_url(self, creator_url: str) -> str:
         """
         Get creator info by creator ID
         Args:
@@ -311,7 +311,7 @@ class BaiduTieBaClient(AbstractApiClient):

         """
         page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
-        return self._page_extractor.extract_creator_info(page_content)
+        return page_content

     async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
         """
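These first two hunks turn `get_creator_info_by_url` into a thin fetcher: it now returns the raw homepage HTML instead of a parsed `TiebaCreator`, so a single request can feed both creator extraction and the homepage thread list. A minimal sketch of the resulting call pattern, assuming an already-initialized client (the method and extractor names come from this commit; the wrapper function is hypothetical):

# Hypothetical helper illustrating the post-refactor flow; only the client and
# TieBaExtractor method names are taken from this commit.
async def fetch_creator_page(client, creator_url: str):
    extractor = TieBaExtractor()
    html = await client.get_creator_info_by_url(creator_url=creator_url)  # raw HTML now
    creator = extractor.extract_creator_info(html)  # parsing moved to the caller
    return creator, html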
@@ -335,7 +335,10 @@ class BaiduTieBaClient(AbstractApiClient):
     async def get_all_notes_by_creator_user_name(self,
                                                  user_name: str, crawl_interval: float = 1.0,
                                                  callback: Optional[Callable] = None,
-                                                 max_note_count: int = 0) -> List[TiebaNote]:
+                                                 max_note_count: int = 0,
+                                                 creator_page_html_content: str = None,
+                                                 ) -> List[TiebaNote]:
+
         """
         Get all of a creator's posts by user name
         Args:
@@ -343,11 +346,30 @@ class BaiduTieBaClient(AbstractApiClient):
             crawl_interval: delay between two note crawls, in seconds
             callback: callback invoked after each crawl round; must be an awaitable function
             max_note_count: maximum number of posts to fetch; 0 fetches all
+            creator_page_html_content: HTML content of the creator's homepage

         Returns:

         """
-        result = []
+        # Baidu Tieba is special: the first 10 posts are shown directly on the homepage, cannot be fetched via the API, and need separate handling
+        result: List[TiebaNote] = []
+        if creator_page_html_content:
+            thread_id_list = (
+                self._page_extractor.extract_tieba_thread_id_list_from_creator_page(
+                    creator_page_html_content
+                )
+            )
+            utils.logger.info(
+                f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}"
+            )
+            note_detail_task = [
+                self.get_note_by_id(thread_id) for thread_id in thread_id_list
+            ]
+            notes = await asyncio.gather(*note_detail_task)
+            if callback:
+                await callback(notes)
+            result.extend(notes)
+
         notes_has_more = 1
         page_number = 1
         page_per_count = 20
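As the new comment notes, the API never returns a creator's first ten or so posts, so callers must hand the homepage HTML to `get_all_notes_by_creator_user_name` via the new keyword argument. A hedged usage sketch (argument values mirror the crawler hunk below; the driver function itself is invented):

# Hypothetical driver for the extended signature.
async def crawl_all_posts(client, extractor, creator_url: str):
    html = await client.get_creator_info_by_url(creator_url=creator_url)
    creator = extractor.extract_creator_info(html)
    return await client.get_all_notes_by_creator_user_name(
        user_name=creator.user_name,
        crawl_interval=0,
        callback=None,                    # optionally an awaitable batch handler
        max_note_count=0,                 # 0 fetches everything
        creator_page_html_content=html,   # picks up the homepage-only posts
    )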
@@ -29,6 +29,7 @@ from var import crawler_type_var, source_keyword_var

 from .client import BaiduTieBaClient
 from .field import SearchNoteType, SearchSortType
+from .help import TieBaExtractor
 from .login import BaiduTieBaLogin


@@ -40,6 +41,7 @@ class TieBaCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://tieba.baidu.com"
         self.user_agent = utils.get_user_agent()
+        self._page_extractor = TieBaExtractor()

     async def start(self) -> None:
         """
@@ -238,7 +240,8 @@ class TieBaCrawler(AbstractCrawler):
         """
         utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
         for creator_url in config.TIEBA_CREATOR_URL_LIST:
-            creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
+            creator_page_html_content = await self.tieba_client.get_creator_info_by_url(creator_url=creator_url)
+            creator_info: TiebaCreator = self._page_extractor.extract_creator_info(creator_page_html_content)
             if creator_info:
                 utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
             if not creator_info:
@@ -251,7 +254,8 @@ class TieBaCrawler(AbstractCrawler):
                 user_name=creator_info.user_name,
                 crawl_interval=0,
                 callback=tieba_store.batch_update_tieba_notes,
-                max_note_count=config.CRAWLER_MAX_NOTES_COUNT
+                max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
+                creator_page_html_content=creator_page_html_content,
             )

             await self.batch_get_note_comments(all_notes_list)
@@ -237,6 +237,29 @@ class TieBaExtractor:
             registration_duration=self.extract_registration_duration(user_content)
         )

+    @staticmethod
+    def extract_tieba_thread_id_list_from_creator_page(
+            html_content: str
+    ) -> List[str]:
+        """
+        Extract the thread ID list from a Tieba creator's homepage
+        Args:
+            html_content:
+
+        Returns:
+
+        """
+        selector = Selector(text=html_content)
+        thread_id_list = []
+        xpath_selector = (
+            "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
+        )
+        thread_url_list = selector.xpath(xpath_selector).getall()
+        for thread_url in thread_url_list:
+            thread_id = thread_url.split("?")[0].split("/")[-1]
+            thread_id_list.append(thread_id)
+        return thread_id_list
+
     def extract_ip_and_pub_time(self, html_content: str) -> Tuple[str, str]:
         """
         Extract IP location and publish time
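The new extractor grabs anchor hrefs such as `/p/1234567890?fr=home` from the homepage thread list and keeps only the numeric thread ID. A standalone check of that parsing rule, assuming parsel's `Selector` as used in the hunk above (the sample HTML and IDs below are invented):

# Requires parsel (pip install parsel); the markup mirrors the XPath this commit targets.
from parsel import Selector

sample_html = """
<ul class="new_list clearfix">
  <li><div class="thread_name"><a href="/p/1234567890?fr=home">post one</a></div></li>
  <li><div class="thread_name"><a href="/p/9876543210">post two</a></div></li>
</ul>
"""

hrefs = Selector(text=sample_html).xpath(
    "//ul[@class='new_list clearfix']//div[@class='thread_name']/a[1]/@href"
).getall()
# Drop the query string, then take the last path segment as the thread ID.
thread_ids = [href.split("?")[0].split("/")[-1] for href in hrefs]
assert thread_ids == ["1234567890", "9876543210"]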