diff --git a/README.md b/README.md index 4f972db..d04cf12 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,8 @@ | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | -| B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | -| 微博 | ✅ | ✕ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| 微博 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | ## 使用方法 diff --git a/config/base_config.py b/config/base_config.py index d037737..c72471f 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -14,8 +14,8 @@ IP_PROXY_POOL_COUNT = 2 # 重试时间 RETRY_INTERVAL = 60 * 30 # 30 minutes -# 无头浏览器的标识,True:开启 False 关闭(会打开一个浏览器) -HEADLESS = False +# 设置为True不会打开浏览器(无头浏览器),设置False会打开一个浏览器(小红书如果一直扫码登录不通过,打开浏览器手动过一下滑动验证码) +HEADLESS = True # 是否保存登录状态 SAVE_LOGIN_STATE = True diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index a65f24f..fd2a783 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -69,9 +69,12 @@ class WeiboClient: utils.logger.info("[WeiboClient.pong] Begin pong weibo...") ping_flag = False try: - pass + uri = "/api/config" + resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers) + if resp_data.get("login"): + ping_flag = True except Exception as e: - utils.logger.error(f"[BilibiliClient.pong] Pong weibo failed: {e}, and try to login again...") + utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...") ping_flag = False return ping_flag diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 01174f9..ff30e93 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -7,9 +7,8 @@ import asyncio import os import random -import time from asyncio import Task -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright) @@ -19,7 +18,7 @@ from base.base_crawler import AbstractCrawler from models import weibo from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from tools import utils -from var import comment_tasks_var, crawler_type_var +from var import crawler_type_var from .client import WeiboClient from .exception import DataFetchError @@ -37,8 +36,8 @@ class WeiboCrawler(AbstractCrawler): browser_context: BrowserContext def __init__(self): - self.index_url = "https://m.weibo.cn" - self.user_agent = utils.get_mobile_user_agent() + self.index_url = "https://www.weibo.com" + self.user_agent = utils.get_user_agent() def init_config(self, platform: str, login_type: str, crawler_type: str): self.platform = platform @@ -215,7 +214,7 @@ class WeiboCrawler(AbstractCrawler): weibo_client_obj = WeiboClient( proxies=httpx_proxy, headers={ - "User-Agent": self.user_agent, + "User-Agent": utils.get_mobile_user_agent(), "Cookie": cookie_str, "Origin": "https://m.weibo.cn", "Referer": "https://m.weibo.cn", diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py index de80b02..82ccf8f 100644 --- a/media_platform/weibo/login.py +++ b/media_platform/weibo/login.py @@ -32,7 +32,7 @@ class WeiboLogin(AbstractLogin): async def begin(self): """Start login weibo""" - utils.logger.info("[WeiboLogin.begin] Begin login Bilibili ...") + utils.logger.info("[WeiboLogin.begin] Begin login weibo ...") if self.login_type == "qrcode": await self.login_by_qrcode() elif self.login_type == "phone": @@ -44,7 +44,7 @@ class WeiboLogin(AbstractLogin): "[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) - async def check_login_state(self) -> bool: + async def check_login_state(self, no_logged_in_session: str) -> bool: """ Check if the current login status is successful and return True otherwise return False retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second @@ -52,22 +52,45 @@ class WeiboLogin(AbstractLogin): """ current_cookie = await self.browser_context.cookies() _, cookie_dict = utils.convert_cookies(current_cookie) - if cookie_dict.get("SESSDATA", "") or cookie_dict.get("DedeUserID"): + current_web_session = cookie_dict.get("WBPSESS") + if current_web_session != no_logged_in_session: return True return False + async def popup_login_dialog(self): + """If the login dialog box does not pop up automatically, we will manually click the login button""" + dialog_selector = "xpath=//div[@class='woo-modal-main']" + try: + # check dialog box is auto popup and wait for 10 seconds + await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10) + except Exception as e: + utils.logger.error( + f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}") + utils.logger.info( + "[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button") + + # 向下滚动1000像素 + await self.context_page.mouse.wheel(0,500) + await asyncio.sleep(2) + + try: + # click login button + login_button_ele = self.context_page.locator( + "xpath=//a[text()='登录']" + ) + await login_button_ele.click() + await asyncio.sleep(0.5) + except Exception as e: + utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear:{e}") + async def login_by_qrcode(self): """login weibo website and keep webdriver login state""" utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...") - # click login button - login_button_ele = self.context_page.locator( - "xpath=//div[@class='right-entry__outside go-login-btn']//div" - ) - await login_button_ele.click() + await self.popup_login_dialog() # find login qrcode - qrcode_img_selector = "//div[@class='login-scan-box']//img" + qrcode_img_selector = "//div[@class='woo-modal-main']//img" base64_qrcode_img = await utils.find_login_qrcode( self.context_page, selector=qrcode_img_selector @@ -81,8 +104,14 @@ class WeiboLogin(AbstractLogin): asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) utils.logger.info(f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s") + + # get not logged session + current_cookie = await self.browser_context.cookies() + _, cookie_dict = utils.convert_cookies(current_cookie) + no_logged_in_session = cookie_dict.get("WBPSESS") + try: - await self.check_login_state() + await self.check_login_state(no_logged_in_session) except RetryError: utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...") sys.exit() diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 361fc75..f82ad82 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -9,9 +9,12 @@ import re from io import BytesIO from typing import Dict, List, Optional, Tuple +import httpx from PIL import Image, ImageDraw from playwright.async_api import Cookie, Page +from . import utils + async def find_login_qrcode(page: Page, selector: str) -> str: """find login qrcode image from target selector""" @@ -19,8 +22,17 @@ async def find_login_qrcode(page: Page, selector: str) -> str: elements = await page.wait_for_selector( selector=selector, ) - login_qrcode_img = await elements.get_property("src") # type: ignore - return str(login_qrcode_img) + login_qrcode_img = str(await elements.get_property("src")) # type: ignore + if "http://" in login_qrcode_img or "https://" in login_qrcode_img: + async with httpx.AsyncClient(follow_redirects=True) as client: + utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}") + resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()}) + if resp.status_code == 200: + image_data = resp.content + base64_image = base64.b64encode(image_data).decode('utf-8') + return base64_image + raise Exception(f"fetch login image url failed, response message:{resp.text}") + return login_qrcode_img except Exception as e: print(e) @@ -29,7 +41,8 @@ async def find_login_qrcode(page: Page, selector: str) -> str: def show_qrcode(qr_code) -> None: # type: ignore """parse base64 encode qrcode image and show it""" - qr_code = qr_code.split(",")[1] + if "," in qr_code: + qr_code = qr_code.split(",")[1] qr_code = base64.b64decode(qr_code) image = Image.open(BytesIO(qr_code))