From 061d1c15e2a9d576b26e3641aa205f6fd33093b8 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Tue, 11 Mar 2025 23:42:34 +0800 Subject: [PATCH] feat: kuaishou search params update --- media_platform/kuaishou/client.py | 93 ++++++++--------- media_platform/kuaishou/core.py | 166 +++++++++++++++++++----------- 2 files changed, 151 insertions(+), 108 deletions(-) diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index e728ed8..a3fd0db 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -1,12 +1,12 @@ -# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: -# 1. 不得用于任何商业用途。 -# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 -# 3. 不得进行大规模爬取或对平台造成运营干扰。 -# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 # 5. 不得用于任何非法或不当的用途。 -# -# 详细许可条款请参阅项目根目录下的LICENSE文件。 -# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # -*- coding: utf-8 -*- @@ -28,13 +28,13 @@ from .graphql import KuaiShouGraphQL class KuaiShouClient(AbstractApiClient): def __init__( - self, - timeout=10, - proxies=None, - *, - headers: Dict[str, str], - playwright_page: Page, - cookie_dict: Dict[str, str], + self, + timeout=10, + proxies=None, + *, + headers: Dict[str, str], + playwright_page: Page, + cookie_dict: Dict[str, str], ): self.proxies = proxies self.timeout = timeout @@ -46,10 +46,7 @@ class KuaiShouClient(AbstractApiClient): async def request(self, method, url, **kwargs) -> Any: async with httpx.AsyncClient(proxies=self.proxies) as client: - response = await client.request( - method, url, timeout=self.timeout, - **kwargs - ) + response = await client.request(method, url, timeout=self.timeout, **kwargs) data: Dict = response.json() if data.get("errors"): raise DataFetchError(data.get("errors", "unkonw error")) @@ -59,14 +56,16 @@ class KuaiShouClient(AbstractApiClient): async def get(self, uri: str, params=None) -> Dict: final_uri = uri if isinstance(params, dict): - final_uri = (f"{uri}?" - f"{urlencode(params)}") - return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers) + final_uri = f"{uri}?" f"{urlencode(params)}" + return await self.request( + method="GET", url=f"{self._host}{final_uri}", headers=self.headers + ) async def post(self, uri: str, data: dict) -> Dict: - json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) - return await self.request(method="POST", url=f"{self._host}{uri}", - data=json_str, headers=self.headers) + json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False) + return await self.request( + method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers + ) async def pong(self) -> bool: """get a note to check if login state is ok""" @@ -78,13 +77,15 @@ class KuaiShouClient(AbstractApiClient): "variables": { "ftype": 1, }, - "query": self.graphql.get("vision_profile_user_list") + "query": self.graphql.get("vision_profile_user_list"), } res = await self.post("", post_data) if res.get("visionProfileUserList", {}).get("result") == 1: ping_flag = True except Exception as e: - utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...") + utils.logger.error( + f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again..." + ) ping_flag = False return ping_flag @@ -93,11 +94,14 @@ class KuaiShouClient(AbstractApiClient): self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict - async def search_info_by_keyword(self, keyword: str, pcursor: str): + async def search_info_by_keyword( + self, keyword: str, pcursor: str, search_session_id: str = "" + ): """ KuaiShou web search api :param keyword: search keyword :param pcursor: limite page curson + :param search_session_id: search session id :return: """ post_data = { @@ -105,9 +109,10 @@ class KuaiShouClient(AbstractApiClient): "variables": { "keyword": keyword, "pcursor": pcursor, - "page": "search" + "page": "search", + "searchSessionId": search_session_id, }, - "query": self.graphql.get("search_query") + "query": self.graphql.get("search_query"), } return await self.post("", post_data) @@ -119,11 +124,8 @@ class KuaiShouClient(AbstractApiClient): """ post_data = { "operationName": "visionVideoDetail", - "variables": { - "photoId": photo_id, - "page": "search" - }, - "query": self.graphql.get("video_detail") + "variables": {"photoId": photo_id, "page": "search"}, + "query": self.graphql.get("video_detail"), } return await self.post("", post_data) @@ -135,11 +137,8 @@ class KuaiShouClient(AbstractApiClient): """ post_data = { "operationName": "commentListQuery", - "variables": { - "photoId": photo_id, - "pcursor": pcursor - }, - "query": self.graphql.get("comment_list") + "variables": {"photoId": photo_id, "pcursor": pcursor}, + "query": self.graphql.get("comment_list"), } return await self.post("", post_data) @@ -165,9 +164,7 @@ class KuaiShouClient(AbstractApiClient): async def get_creator_profile(self, userId: str) -> Dict: post_data = { "operationName": "visionProfile", - "variables": { - "userId": userId - }, + "variables": {"userId": userId}, "query": self.graphql.get("vision_profile"), } return await self.post("", post_data) @@ -175,11 +172,7 @@ class KuaiShouClient(AbstractApiClient): async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict: post_data = { "operationName": "visionProfilePhotoList", - "variables": { - "page": "profile", - "pcursor": pcursor, - "userId": userId - }, + "variables": {"page": "profile", "pcursor": pcursor, "userId": userId}, "query": self.graphql.get("vision_profile_photo_list"), } return await self.post("", post_data) @@ -209,7 +202,7 @@ class KuaiShouClient(AbstractApiClient): pcursor = vision_commen_list.get("pcursor", "") comments = vision_commen_list.get("rootComments", []) if len(result) + len(comments) > max_count: - comments = comments[:max_count - len(result)] + comments = comments[: max_count - len(result)] if callback: # 如果有回调函数,就执行回调函数 await callback(photo_id, comments) result.extend(comments) @@ -260,7 +253,7 @@ class KuaiShouClient(AbstractApiClient): comments_res = await self.get_video_sub_comments( photo_id, root_comment_id, sub_comment_pcursor ) - vision_sub_comment_list = comments_res.get("visionSubCommentList",{}) + vision_sub_comment_list = comments_res.get("visionSubCommentList", {}) sub_comment_pcursor = vision_sub_comment_list.get("pcursor", "no_more") comments = vision_sub_comment_list.get("subComments", {}) diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 609f705..0aa886a 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -1,12 +1,12 @@ -# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: -# 1. 不得用于任何商业用途。 -# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 -# 3. 不得进行大规模爬取或对平台造成运营干扰。 -# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 +# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: +# 1. 不得用于任何商业用途。 +# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 +# 3. 不得进行大规模爬取或对平台造成运营干扰。 +# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 # 5. 不得用于任何非法或不当的用途。 -# -# 详细许可条款请参阅项目根目录下的LICENSE文件。 -# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 +# +# 详细许可条款请参阅项目根目录下的LICENSE文件。 +# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 import asyncio @@ -16,8 +16,7 @@ import time from asyncio import Task from typing import Dict, List, Optional, Tuple -from playwright.async_api import (BrowserContext, BrowserType, Page, - async_playwright) +from playwright.async_api import BrowserContext, BrowserType, Page, async_playwright import config from base.base_crawler import AbstractCrawler @@ -43,18 +42,19 @@ class KuaishouCrawler(AbstractCrawler): async def start(self): playwright_proxy_format, httpx_proxy_format = None, None if config.ENABLE_IP_PROXY: - ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) + ip_proxy_pool = await create_ip_pool( + config.IP_PROXY_POOL_COUNT, enable_validate_ip=True + ) ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() - playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info) + playwright_proxy_format, httpx_proxy_format = self.format_proxy_info( + ip_proxy_info + ) async with async_playwright() as playwright: # Launch a browser context. chromium = playwright.chromium self.browser_context = await self.launch_browser( - chromium, - None, - self.user_agent, - headless=config.HEADLESS + chromium, None, self.user_agent, headless=config.HEADLESS ) # stealth.min.js is a js script to prevent the website from detecting the crawler. await self.browser_context.add_init_script(path="libs/stealth.min.js") @@ -69,10 +69,12 @@ class KuaishouCrawler(AbstractCrawler): login_phone=httpx_proxy_format, browser_context=self.browser_context, context_page=self.context_page, - cookie_str=config.COOKIES + cookie_str=config.COOKIES, ) await login_obj.begin() - await self.ks_client.update_cookies(browser_context=self.browser_context) + await self.ks_client.update_cookies( + browser_context=self.browser_context + ) crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": @@ -96,29 +98,41 @@ class KuaishouCrawler(AbstractCrawler): config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count start_page = config.START_PAGE for keyword in config.KEYWORDS.split(","): + search_session_id = "" source_keyword_var.set(keyword) - utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}") + utils.logger.info( + f"[KuaishouCrawler.search] Current search keyword: {keyword}" + ) page = 1 - while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while ( + page - start_page + 1 + ) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: if page < start_page: utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}") page += 1 continue - utils.logger.info(f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}") + utils.logger.info( + f"[KuaishouCrawler.search] search kuaishou keyword: {keyword}, page: {page}" + ) video_id_list: List[str] = [] videos_res = await self.ks_client.search_info_by_keyword( keyword=keyword, pcursor=str(page), + search_session_id=search_session_id, ) if not videos_res: - utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data") + utils.logger.error( + f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data" + ) continue vision_search_photo: Dict = videos_res.get("visionSearchPhoto") if vision_search_photo.get("result") != 1: - utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data ") + utils.logger.error( + f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data " + ) continue - + search_session_id = vision_search_photo.get("searchSessionId", "") for video_detail in vision_search_photo.get("feeds"): video_id_list.append(video_detail.get("photo", {}).get("id")) await kuaishou_store.update_kuaishou_video(video_item=video_detail) @@ -131,7 +145,8 @@ class KuaishouCrawler(AbstractCrawler): """Get the information and comments of the specified post""" semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.KS_SPECIFIED_ID_LIST + self.get_video_info_task(video_id=video_id, semaphore=semaphore) + for video_id in config.KS_SPECIFIED_ID_LIST ] video_details = await asyncio.gather(*task_list) for video_detail in video_details: @@ -139,18 +154,26 @@ class KuaishouCrawler(AbstractCrawler): await kuaishou_store.update_kuaishou_video(video_detail) await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST) - async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: + async def get_video_info_task( + self, video_id: str, semaphore: asyncio.Semaphore + ) -> Optional[Dict]: """Get video detail task""" async with semaphore: try: result = await self.ks_client.get_video_info(video_id) - utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ...") + utils.logger.info( + f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..." + ) return result.get("visionVideoDetail") except DataFetchError as ex: - utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}") + utils.logger.error( + f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}" + ) return None except KeyError as ex: - utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}") + utils.logger.error( + f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}" + ) return None async def batch_get_video_comments(self, video_id_list: List[str]): @@ -160,14 +183,20 @@ class KuaishouCrawler(AbstractCrawler): :return: """ if not config.ENABLE_GET_COMMENTS: - utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled") + utils.logger.info( + f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled" + ) return - utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}") + utils.logger.info( + f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}" + ) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] for video_id in video_id_list: - task = asyncio.create_task(self.get_comments(video_id, semaphore), name=video_id) + task = asyncio.create_task( + self.get_comments(video_id, semaphore), name=video_id + ) task_list.append(task) comment_tasks_var.set(task_list) @@ -182,17 +211,23 @@ class KuaishouCrawler(AbstractCrawler): """ async with semaphore: try: - utils.logger.info(f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ...") + utils.logger.info( + f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..." + ) await self.ks_client.get_video_all_comments( photo_id=video_id, crawl_interval=random.random(), callback=kuaishou_store.batch_update_ks_video_comments, - max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, ) except DataFetchError as ex: - utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}") + utils.logger.error( + f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}" + ) except Exception as e: - utils.logger.error(f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}") + utils.logger.error( + f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}" + ) # use time.sleeep block main coroutine instead of asyncio.sleep and cacel running comment task # maybe kuaishou block our request, we will take a nap and update the cookie again current_running_tasks = comment_tasks_var.get() @@ -200,10 +235,14 @@ class KuaishouCrawler(AbstractCrawler): task.cancel() time.sleep(20) await self.context_page.goto(f"{self.index_url}?isHome=1") - await self.ks_client.update_cookies(browser_context=self.browser_context) + await self.ks_client.update_cookies( + browser_context=self.browser_context + ) @staticmethod - def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: + def format_proxy_info( + ip_proxy_info: IpInfoModel, + ) -> Tuple[Optional[Dict], Optional[Dict]]: """format proxy info for playwright and httpx""" playwright_proxy = { "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", @@ -217,8 +256,12 @@ class KuaishouCrawler(AbstractCrawler): async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient: """Create ks client""" - utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...") - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) + utils.logger.info( + "[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ..." + ) + cookie_str, cookie_dict = utils.convert_cookies( + await self.browser_context.cookies() + ) ks_client_obj = KuaiShouClient( proxies=httpx_proxy, headers={ @@ -226,7 +269,7 @@ class KuaishouCrawler(AbstractCrawler): "Cookie": cookie_str, "Origin": self.index_url, "Referer": self.index_url, - "Content-Type": "application/json;charset=UTF-8" + "Content-Type": "application/json;charset=UTF-8", }, playwright_page=self.context_page, cookie_dict=cookie_dict, @@ -234,37 +277,41 @@ class KuaishouCrawler(AbstractCrawler): return ks_client_obj async def launch_browser( - self, - chromium: BrowserType, - playwright_proxy: Optional[Dict], - user_agent: Optional[str], - headless: bool = True + self, + chromium: BrowserType, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True, ) -> BrowserContext: """Launch browser and create browser context""" - utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...") + utils.logger.info( + "[KuaishouCrawler.launch_browser] Begin create browser context ..." + ) if config.SAVE_LOGIN_STATE: - user_data_dir = os.path.join(os.getcwd(), "browser_data", - config.USER_DATA_DIR % config.PLATFORM) # type: ignore + user_data_dir = os.path.join( + os.getcwd(), "browser_data", config.USER_DATA_DIR % config.PLATFORM + ) # type: ignore browser_context = await chromium.launch_persistent_context( user_data_dir=user_data_dir, accept_downloads=True, headless=headless, proxy=playwright_proxy, # type: ignore viewport={"width": 1920, "height": 1080}, - user_agent=user_agent + user_agent=user_agent, ) return browser_context else: browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore browser_context = await browser.new_context( - viewport={"width": 1920, "height": 1080}, - user_agent=user_agent + viewport={"width": 1920, "height": 1080}, user_agent=user_agent ) return browser_context async def get_creators_and_videos(self) -> None: """Get creator's videos and retrieve their comment information.""" - utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators") + utils.logger.info( + "[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators" + ) for user_id in config.KS_CREATOR_ID_LIST: # get creator detail info from web html content createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id) @@ -273,12 +320,14 @@ class KuaishouCrawler(AbstractCrawler): # Get all video information of the creator all_video_list = await self.ks_client.get_all_videos_by_creator( - user_id = user_id, - crawl_interval = random.random(), - callback = self.fetch_creator_video_detail + user_id=user_id, + crawl_interval=random.random(), + callback=self.fetch_creator_video_detail, ) - video_ids = [video_item.get("photo", {}).get("id") for video_item in all_video_list] + video_ids = [ + video_item.get("photo", {}).get("id") for video_item in all_video_list + ] await self.batch_get_video_comments(video_ids) async def fetch_creator_video_detail(self, video_list: List[Dict]): @@ -287,7 +336,8 @@ class KuaishouCrawler(AbstractCrawler): """ semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore) for post_item in video_list + self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore) + for post_item in video_list ] video_details = await asyncio.gather(*task_list)