diff --git a/main.py b/main.py index dd3dec8..81ac53a 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ import config import db from base import proxy_account_pool from media_platform.douyin import DouYinCrawler +from media_platform.kuaishou import KuaishouCrawler from media_platform.xhs import XiaoHongShuCrawler @@ -16,6 +17,8 @@ class CrawlerFactory: return XiaoHongShuCrawler() elif platform == "dy": return DouYinCrawler() + elif platform == "ks": + return KuaishouCrawler() else: raise ValueError("Invalid Media Platform Currently only supported xhs or dy ...") @@ -23,8 +26,8 @@ class CrawlerFactory: async def main(): # define command line params ... parser = argparse.ArgumentParser(description='Media crawler program.') - parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', - choices=["xhs", "dy"], default=config.PLATFORM) + parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks)', + choices=["xhs", "dy", "ks"], default=config.PLATFORM) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE) parser.add_argument('--type', type=str, help='crawler type (search | detail)', diff --git a/media_platform/kuaishou/__init__.py b/media_platform/kuaishou/__init__.py index 7c68785..de877e0 100644 --- a/media_platform/kuaishou/__init__.py +++ b/media_platform/kuaishou/__init__.py @@ -1 +1,2 @@ -# -*- coding: utf-8 -*- \ No newline at end of file +# -*- coding: utf-8 -*- +from .core import KuaishouCrawler \ No newline at end of file diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index e89de5b..9346fd4 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -from typing import Dict import asyncio import json +from urllib.parse import urlencode from typing import Dict, Optional import httpx @@ -12,7 +12,6 @@ from tools 
import utils from .exception import DataFetchError, IPBlockError - class KuaishouClient: def __init__( self, @@ -26,7 +25,7 @@ class KuaishouClient: self.proxies = proxies self.timeout = timeout self.headers = headers - self._host = "https://edith.xiaohongshu.com" + self._host = "https://www.kuaishou.com" self.playwright_page = playwright_page self.cookie_dict = cookie_dict @@ -49,7 +48,7 @@ class KuaishouClient: final_uri = uri if isinstance(params, dict): final_uri = (f"{uri}?" - f"{'&'.join([f'{k}={v}' for k, v in params.items()])}") + f"{urlencode(params)}") headers = await self._pre_headers(final_uri) return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers) @@ -59,13 +58,18 @@ class KuaishouClient: return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=headers) - async def ping(self) -> bool: + async def pong(self) -> bool: """get a note to check if login state is ok""" - utils.logger.info("Begin to ping xhs...") + utils.logger.info("Begin pong kuaishou...") ping_flag = False try: pass except Exception as e: - utils.logger.error(f"Ping xhs failed: {e}, and try to login again...") + utils.logger.error(f"Pong kuaishou failed: {e}, and try to login again...") ping_flag = False return ping_flag + + async def update_cookies(self, browser_context: BrowserContext): + cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + self.headers["Cookie"] = cookie_str + self.cookie_dict = cookie_dict \ No newline at end of file diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 7c68785..67d60a6 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -1 +1,150 @@ -# -*- coding: utf-8 -*- \ No newline at end of file +import asyncio +import os +from typing import Dict, List, Optional, Tuple + +from playwright.async_api import (BrowserContext, BrowserType, Page, + async_playwright) + +import config +from base.base_crawler 
import AbstractCrawler +from base.proxy_account_pool import AccountPool +from tools import utils +from var import crawler_type_var + +from .client import KuaishouClient +from .login import KuaishouLogin + + +class KuaishouCrawler(AbstractCrawler): + platform: str + login_type: str + crawler_type: str + context_page: Page + ks_client: KuaishouClient + account_pool: AccountPool + browser_context: BrowserContext + + def __init__(self): + self.index_url = "https://www.kuaishou.com" + self.user_agent = utils.get_user_agent() + + def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str): + self.platform = platform + self.login_type = login_type + self.account_pool = account_pool + self.crawler_type = crawler_type + + async def start(self): + account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info() + async with async_playwright() as playwright: + # Launch a browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, + playwright_proxy, + self.user_agent, + headless=config.HEADLESS + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(f"{self.index_url}?isHome=1") + + # Create a client to interact with the kuaishou website. + self.ks_client = await self.create_ks_client(httpx_proxy) + if not await self.ks_client.pong(): + login_obj = KuaishouLogin( + login_type=self.login_type, + login_phone=account_phone, + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES + ) + await login_obj.begin() + await self.ks_client.update_cookies(browser_context=self.browser_context) + + crawler_type_var.set(self.crawler_type) + if self.crawler_type == "search": + # Search for notes and retrieve their comment information. 
+ await self.search() + elif self.crawler_type == "detail": + # Get the information and comments of the specified post + await self.get_specified_notes() + else: + pass + + utils.logger.info("Kuaishou Crawler finished ...") + + async def search(self): + await asyncio.Event().wait() + + + async def get_specified_notes(self): + pass + + def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]: + """Create proxy info for playwright and httpx""" + # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888 + phone, ip_proxy = self.account_pool.get_account() + if not config.ENABLE_IP_PROXY: + return phone, None, None + utils.logger.info("Begin proxy info for playwright and httpx ...") + playwright_proxy = { + "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}", + "username": config.IP_PROXY_USER, + "password": config.IP_PROXY_PASSWORD, + } + httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}" + return phone, playwright_proxy, httpx_proxy + + async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaishouClient: + """Create kuaishou client""" + utils.logger.info("Begin create kuaishou API client ...") + cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) + ks_client_obj = KuaishouClient( + proxies=httpx_proxy, + headers={ + "User-Agent": self.user_agent, + "Cookie": cookie_str, + "Origin": self.index_url, + "Referer": self.index_url, + "Content-Type": "application/json;charset=UTF-8" + }, + playwright_page=self.context_page, + cookie_dict=cookie_dict, + ) + return ks_client_obj + + async def launch_browser( + self, + chromium: BrowserType, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True + ) -> BrowserContext: + """Launch browser and create browser context""" + utils.logger.info("Begin create browser context ...") + if config.SAVE_LOGIN_STATE: + user_data_dir = os.path.join(os.getcwd(), "browser_data", + config.USER_DATA_DIR % 
self.platform) # type: ignore + browser_context = await chromium.launch_persistent_context( + user_data_dir=user_data_dir, + accept_downloads=True, + headless=headless, + proxy=playwright_proxy, # type: ignore + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + else: + browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore + browser_context = await browser.new_context( + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + + async def close(self): + """Close browser context""" + await self.browser_context.close() + utils.logger.info("Browser context closed ...") \ No newline at end of file diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py index 1564422..3552d80 100644 --- a/media_platform/kuaishou/login.py +++ b/media_platform/kuaishou/login.py @@ -1,11 +1,31 @@ -# -*- coding: utf-8 -*- +import asyncio +import functools +import sys +from typing import Optional +import redis +from playwright.async_api import BrowserContext, Page +from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, + wait_fixed) + +import config from base.base_crawler import AbstractLogin +from tools import utils class KuaishouLogin(AbstractLogin): - def __init__(self): - pass + def __init__(self, + login_type: str, + browser_context: BrowserContext, + context_page: Page, + login_phone: Optional[str] = "", + cookie_str: str = "" + ): + self.login_type = login_type + self.browser_context = browser_context + self.context_page = context_page + self.login_phone = login_phone + self.cookie_str = cookie_str async def begin(self): pass