mirror of
https://github.com/RYDE-WORK/MediaCrawler.git
synced 2026-02-06 08:53:21 +08:00
feat: 微博二维码登录done
This commit is contained in:
parent
27a2041929
commit
38d6f10bf0
@ -23,8 +23,8 @@
|
|||||||
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
||||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
||||||
| B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
| B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
||||||
| 微博 | ✅ | ✕ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
| 微博 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ |
|
||||||
|
|
||||||
|
|
||||||
## 使用方法
|
## 使用方法
|
||||||
|
|||||||
@ -14,8 +14,8 @@ IP_PROXY_POOL_COUNT = 2
|
|||||||
# 重试时间
|
# 重试时间
|
||||||
RETRY_INTERVAL = 60 * 30 # 30 minutes
|
RETRY_INTERVAL = 60 * 30 # 30 minutes
|
||||||
|
|
||||||
# 无头浏览器的标识,True:开启 False 关闭(会打开一个浏览器)
|
# 设置为True不会打开浏览器(无头浏览器),设置False会打开一个浏览器(小红书如果一直扫码登录不通过,打开浏览器手动过一下滑动验证码)
|
||||||
HEADLESS = False
|
HEADLESS = True
|
||||||
|
|
||||||
# 是否保存登录状态
|
# 是否保存登录状态
|
||||||
SAVE_LOGIN_STATE = True
|
SAVE_LOGIN_STATE = True
|
||||||
|
|||||||
@ -69,9 +69,12 @@ class WeiboClient:
|
|||||||
utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
|
utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
|
||||||
ping_flag = False
|
ping_flag = False
|
||||||
try:
|
try:
|
||||||
pass
|
uri = "/api/config"
|
||||||
|
resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
|
||||||
|
if resp_data.get("login"):
|
||||||
|
ping_flag = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[BilibiliClient.pong] Pong weibo failed: {e}, and try to login again...")
|
utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
|
||||||
ping_flag = False
|
ping_flag = False
|
||||||
return ping_flag
|
return ping_flag
|
||||||
|
|
||||||
|
|||||||
@ -7,9 +7,8 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import time
|
|
||||||
from asyncio import Task
|
from asyncio import Task
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from playwright.async_api import (BrowserContext, BrowserType, Page,
|
from playwright.async_api import (BrowserContext, BrowserType, Page,
|
||||||
async_playwright)
|
async_playwright)
|
||||||
@ -19,7 +18,7 @@ from base.base_crawler import AbstractCrawler
|
|||||||
from models import weibo
|
from models import weibo
|
||||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||||
from tools import utils
|
from tools import utils
|
||||||
from var import comment_tasks_var, crawler_type_var
|
from var import crawler_type_var
|
||||||
|
|
||||||
from .client import WeiboClient
|
from .client import WeiboClient
|
||||||
from .exception import DataFetchError
|
from .exception import DataFetchError
|
||||||
@ -37,8 +36,8 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.index_url = "https://m.weibo.cn"
|
self.index_url = "https://www.weibo.com"
|
||||||
self.user_agent = utils.get_mobile_user_agent()
|
self.user_agent = utils.get_user_agent()
|
||||||
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str):
|
def init_config(self, platform: str, login_type: str, crawler_type: str):
|
||||||
self.platform = platform
|
self.platform = platform
|
||||||
@ -215,7 +214,7 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
weibo_client_obj = WeiboClient(
|
weibo_client_obj = WeiboClient(
|
||||||
proxies=httpx_proxy,
|
proxies=httpx_proxy,
|
||||||
headers={
|
headers={
|
||||||
"User-Agent": self.user_agent,
|
"User-Agent": utils.get_mobile_user_agent(),
|
||||||
"Cookie": cookie_str,
|
"Cookie": cookie_str,
|
||||||
"Origin": "https://m.weibo.cn",
|
"Origin": "https://m.weibo.cn",
|
||||||
"Referer": "https://m.weibo.cn",
|
"Referer": "https://m.weibo.cn",
|
||||||
|
|||||||
@ -32,7 +32,7 @@ class WeiboLogin(AbstractLogin):
|
|||||||
|
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login weibo"""
|
"""Start login weibo"""
|
||||||
utils.logger.info("[WeiboLogin.begin] Begin login Bilibili ...")
|
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
|
||||||
if self.login_type == "qrcode":
|
if self.login_type == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif self.login_type == "phone":
|
||||||
@ -44,7 +44,7 @@ class WeiboLogin(AbstractLogin):
|
|||||||
"[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
"[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||||
|
|
||||||
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||||
async def check_login_state(self) -> bool:
|
async def check_login_state(self, no_logged_in_session: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check if the current login status is successful and return True otherwise return False
|
Check if the current login status is successful and return True otherwise return False
|
||||||
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second
|
||||||
@ -52,22 +52,45 @@ class WeiboLogin(AbstractLogin):
|
|||||||
"""
|
"""
|
||||||
current_cookie = await self.browser_context.cookies()
|
current_cookie = await self.browser_context.cookies()
|
||||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||||
if cookie_dict.get("SESSDATA", "") or cookie_dict.get("DedeUserID"):
|
current_web_session = cookie_dict.get("WBPSESS")
|
||||||
|
if current_web_session != no_logged_in_session:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
async def popup_login_dialog(self):
|
||||||
|
"""If the login dialog box does not pop up automatically, we will manually click the login button"""
|
||||||
|
dialog_selector = "xpath=//div[@class='woo-modal-main']"
|
||||||
|
try:
|
||||||
|
# wait up to 10 seconds for the login dialog box to pop up automatically
|
||||||
|
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
|
||||||
|
except Exception as e:
|
||||||
|
utils.logger.error(
|
||||||
|
f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
|
||||||
|
utils.logger.info(
|
||||||
|
"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
|
||||||
|
|
||||||
|
# 向下滚动500像素
|
||||||
|
await self.context_page.mouse.wheel(0,500)
|
||||||
|
await asyncio.sleep(2)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# click login button
|
||||||
|
login_button_ele = self.context_page.locator(
|
||||||
|
"xpath=//a[text()='登录']"
|
||||||
|
)
|
||||||
|
await login_button_ele.click()
|
||||||
|
await asyncio.sleep(0.5)
|
||||||
|
except Exception as e:
|
||||||
|
utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear:{e}")
|
||||||
|
|
||||||
async def login_by_qrcode(self):
|
async def login_by_qrcode(self):
|
||||||
"""login weibo website and keep webdriver login state"""
|
"""login weibo website and keep webdriver login state"""
|
||||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
|
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
|
||||||
|
|
||||||
# click login button
|
await self.popup_login_dialog()
|
||||||
login_button_ele = self.context_page.locator(
|
|
||||||
"xpath=//div[@class='right-entry__outside go-login-btn']//div"
|
|
||||||
)
|
|
||||||
await login_button_ele.click()
|
|
||||||
|
|
||||||
# find login qrcode
|
# find login qrcode
|
||||||
qrcode_img_selector = "//div[@class='login-scan-box']//img"
|
qrcode_img_selector = "//div[@class='woo-modal-main']//img"
|
||||||
base64_qrcode_img = await utils.find_login_qrcode(
|
base64_qrcode_img = await utils.find_login_qrcode(
|
||||||
self.context_page,
|
self.context_page,
|
||||||
selector=qrcode_img_selector
|
selector=qrcode_img_selector
|
||||||
@ -81,8 +104,14 @@ class WeiboLogin(AbstractLogin):
|
|||||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||||
|
|
||||||
utils.logger.info(f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
|
utils.logger.info(f"[WeiboLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
|
||||||
|
|
||||||
|
# record the WBPSESS cookie value before login, to compare against after the QR code is scanned
|
||||||
|
current_cookie = await self.browser_context.cookies()
|
||||||
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||||
|
no_logged_in_session = cookie_dict.get("WBPSESS")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await self.check_login_state()
|
await self.check_login_state(no_logged_in_session)
|
||||||
except RetryError:
|
except RetryError:
|
||||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
|
utils.logger.info("[WeiboLogin.login_by_qrcode] Login weibo failed by qrcode login method ...")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|||||||
@ -9,9 +9,12 @@ import re
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import httpx
|
||||||
from PIL import Image, ImageDraw
|
from PIL import Image, ImageDraw
|
||||||
from playwright.async_api import Cookie, Page
|
from playwright.async_api import Cookie, Page
|
||||||
|
|
||||||
|
from . import utils
|
||||||
|
|
||||||
|
|
||||||
async def find_login_qrcode(page: Page, selector: str) -> str:
|
async def find_login_qrcode(page: Page, selector: str) -> str:
|
||||||
"""find login qrcode image from target selector"""
|
"""find login qrcode image from target selector"""
|
||||||
@ -19,8 +22,17 @@ async def find_login_qrcode(page: Page, selector: str) -> str:
|
|||||||
elements = await page.wait_for_selector(
|
elements = await page.wait_for_selector(
|
||||||
selector=selector,
|
selector=selector,
|
||||||
)
|
)
|
||||||
login_qrcode_img = await elements.get_property("src") # type: ignore
|
login_qrcode_img = str(await elements.get_property("src")) # type: ignore
|
||||||
return str(login_qrcode_img)
|
if "http://" in login_qrcode_img or "https://" in login_qrcode_img:
|
||||||
|
async with httpx.AsyncClient(follow_redirects=True) as client:
|
||||||
|
utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}")
|
||||||
|
resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()})
|
||||||
|
if resp.status_code == 200:
|
||||||
|
image_data = resp.content
|
||||||
|
base64_image = base64.b64encode(image_data).decode('utf-8')
|
||||||
|
return base64_image
|
||||||
|
raise Exception(f"fetch login image url failed, response message:{resp.text}")
|
||||||
|
return login_qrcode_img
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
@ -29,7 +41,8 @@ async def find_login_qrcode(page: Page, selector: str) -> str:
|
|||||||
|
|
||||||
def show_qrcode(qr_code) -> None: # type: ignore
|
def show_qrcode(qr_code) -> None: # type: ignore
|
||||||
"""parse base64 encode qrcode image and show it"""
|
"""parse base64 encode qrcode image and show it"""
|
||||||
qr_code = qr_code.split(",")[1]
|
if "," in qr_code:
|
||||||
|
qr_code = qr_code.split(",")[1]
|
||||||
qr_code = base64.b64decode(qr_code)
|
qr_code = base64.b64decode(qr_code)
|
||||||
image = Image.open(BytesIO(qr_code))
|
image = Image.open(BytesIO(qr_code))
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user