From 1a37df4d5e1bf79fc8e70c78ca117c0067550088 Mon Sep 17 00:00:00 2001 From: lyx0727 <1324938402@qq.com> Date: Fri, 1 Nov 2024 18:52:16 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20xhs=E5=87=BA=E7=8E=B0=E9=AA=8C?= =?UTF-8?q?=E8=AF=81=E7=A0=81=E6=97=B6=E6=8A=A5=E9=94=99=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E7=94=A8=E6=88=B7=E6=89=8B=E5=8A=A8=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/xhs/client.py | 44 +++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 397c290..d3055f9 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -17,7 +17,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page -from tenacity import retry, stop_after_attempt, wait_fixed +from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result import config from base.base_crawler import AbstractApiClient @@ -503,8 +503,40 @@ class XiaoHongShuClient(AbstractApiClient): url = "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}" html = await self.request(method="GET", url=url, return_response=True, headers=self.headers) - state = re.findall(r"window.__INITIAL_STATE__=({.*})", html)[0].replace("undefined", '""') - if state != "{}": - note_dict = transform_json_keys(state) - return note_dict["note"]["note_detail_map"][note_id]["note"] - raise DataFetchError(html) + + def get_note_dict(html): + state = re.findall(r"window.__INITIAL_STATE__=({.*})", html)[ + 0 + ].replace("undefined", '""') + + if state != "{}": + note_dict = transform_json_keys(state) + return note_dict["note"]["note_detail_map"][note_id]["note"] + return {} + + try: + return get_note_dict(html) + except: + href = re.findall(r'href="(.*?)"', html)[0] + utils.logger.info( + f"[XiaoHongShuClient.get_note_by_id_from_html] 出现验证码: {href}, 请手动验证" + ) + await self.playwright_page.goto(href) + # 等待用户完成操作页面重定向 + if await self.check_redirect(): + html = await self.playwright_page.content() + return get_note_dict(html) + else: + raise DataFetchError(html) + + @retry( + stop=stop_after_attempt(100), + wait=wait_fixed(5), + retry=retry_if_result(lambda value: value is False), + ) + async def check_redirect(self): + url = await self.playwright_page.url() + if url.startswith("https://www.xiaohongshu.com/explore"): + return True + return False + From 705b810269a7e392d0e4672c910d31f089eb5788 Mon Sep 17 00:00:00 2001 From: lyx0727 <1324938402@qq.com> Date: Fri, 1 Nov 2024 22:08:46 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20xhs=E9=AA=8C=E8=AF=81=E7=A0=81?= =?UTF-8?q?=E5=A4=84=E7=90=86=EF=BC=8C=E8=B7=B3=E8=BD=AC=E9=93=BE=E6=8E=A5?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/xhs/client.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index d3055f9..f61323e 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -22,6 +22,7 @@ from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result import config from base.base_crawler import AbstractApiClient from tools import utils +from html import unescape from .exception import DataFetchError, IPBlockError from .field import SearchNoteType, SearchSortType @@ -518,12 +519,18 @@ class XiaoHongShuClient(AbstractApiClient): return get_note_dict(html) except: href = re.findall(r'href="(.*?)"', html)[0] + href = unescape(href) + utils.logger.info( f"[XiaoHongShuClient.get_note_by_id_from_html] 出现验证码: {href}, 请手动验证" ) await self.playwright_page.goto(href) # 等待用户完成操作页面重定向 if await self.check_redirect(): + utils.logger.info( + f"[XiaoHongShuClient.get_note_by_id_from_html] 用户完成验证, 重定向到笔记详情页" + ) + html = await self.playwright_page.content() return get_note_dict(html) else: @@ -535,7 +542,7 @@ class XiaoHongShuClient(AbstractApiClient): retry=retry_if_result(lambda value: value is False), ) async def check_redirect(self): - url = await self.playwright_page.url() + url = self.playwright_page.url if url.startswith("https://www.xiaohongshu.com/explore"): return True return False