diff --git a/README.md b/README.md
index 9950b4e..e71f46f 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
> 点击查看更为详细的免责声明。[点击跳转](#disclaimer)
# 仓库描述
-**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**...。
+**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**,**百度贴吧**...。
目前能抓取小红书、抖音、快手、B站、微博的视频、图片、评论、点赞、转发等信息。
原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数
@@ -22,6 +22,7 @@
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+| 贴吧 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
## 使用方法
@@ -99,14 +100,51 @@
- [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
-
-
## 感谢下列Sponsors对本仓库赞助
- 通过注册安装这款免费的Sider ChatGPT插件,可以帮我获得一定奖励💰。这个插件我用了大半年,作为谷歌上最火的一款插件,体验非常不错。
> 安装并注册该浏览器插件之后保留一天即可,我就可以获得3元的推广奖励,谢谢大家,支持我继续开源项目。
成为赞助者,在这里展示你的产品,联系作者wx:yzglan
+## 打赏
+
+如果觉得项目不错的话可以打赏哦。您的支持就是我最大的动力!
+
+打赏时您可以备注名称,我会将您添加至打赏列表中。
+
+
+
+
+
+## 捐赠信息
+
+PS:打赏时请备注捐赠者名称,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉)
+
+| 捐赠者 | 捐赠金额 | 捐赠日期 |
+|-------------|-------|------------|
+| *皓 | 50 元 | 2024-03-18 |
+| *刚 | 50 元 | 2024-03-18 |
+| *乐 | 20 元 | 2024-03-17 |
+| *木 | 20 元 | 2024-03-17 |
+| *诚 | 20 元 | 2024-03-17 |
+| Strem Gamer | 20 元 | 2024-03-16 |
+| *鑫 | 20 元 | 2024-03-14 |
+| Yuzu | 20 元 | 2024-03-07 |
+| **宁 | 100 元 | 2024-03-03 |
+| **媛 | 20 元 | 2024-03-03 |
+| Scarlett | 20 元 | 2024-02-16 |
+| Asun | 20 元 | 2024-01-30 |
+| 何* | 100 元 | 2024-01-21 |
+| allen | 20 元 | 2024-01-10 |
+| llllll | 20 元 | 2024-01-07 |
+| 邝*元 | 20 元 | 2023-12-29 |
+| 50chen | 50 元 | 2023-12-22 |
+| xiongot | 20 元 | 2023-12-17 |
+| atom.hu | 20 元 | 2023-12-16 |
+| 一呆 | 20 元 | 2023-12-01 |
+| 坠落 | 50 元 | 2023-11-08 |
+
+
## MediaCrawler爬虫项目交流群:
> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)
diff --git a/config/base_config.py b/config/base_config.py
index 53dc8bf..cefc711 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -28,7 +28,7 @@ HEADLESS = False
SAVE_LOGIN_STATE = True
# 数据保存类型选项配置,支持三种类型:csv、db、json
-SAVE_DATA_OPTION = "db" # csv or db or json
+SAVE_DATA_OPTION = "csv" # csv or db or json
# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
@@ -46,18 +46,18 @@ MAX_CONCURRENCY_NUM = 1
ENABLE_GET_IMAGES = False
# 是否开启爬评论模式, 默认不开启爬评论
-ENABLE_GET_COMMENTS = True
+ENABLE_GET_COMMENTS = False
# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False
# 指定小红书需要爬虫的笔记ID列表
XHS_SPECIFIED_ID_LIST = [
"6422c2750000000027000d88",
"64ca1b73000000000b028dd2",
"630d5b85000000001203ab41",
- "668fe13000000000030241fa", # 图文混合
+ "668fe13000000000030241fa", # 图文混合
# ........................
]
@@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [
]
+# 指定贴吧名称列表,爬取该贴吧下的帖子
+TIEBA_NAME_LIST = [
+ # "盗墓笔记"
+]
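
The new `TIEBA_NAME_LIST` option feeds the by-tieba-name crawl path added to `media_platform/tieba/core.py` below, which runs in the `search` branch. A minimal sketch of how a filled-in config might look; the tieba names and counts here are illustrative values for this example, not project defaults:

```python
# config/base_config.py -- illustrative values only, not project defaults
CRAWLER_TYPE = "search"        # get_specified_tieba_notes() runs inside the search branch
TIEBA_NAME_LIST = [
    "盗墓笔记",                 # crawl posts from the 盗墓笔记 tieba list pages
    # add more tieba names to crawl them one after another
]
CRAWLER_MAX_NOTES_COUNT = 200  # also caps the Tieba list-page `pn` offset (50 posts per page)
```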
# 指定小红书创作者ID列表
XHS_CREATOR_ID_LIST = [
@@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [
# ........................
]
-
-#词云相关
-#是否开启生成评论词云图
+# 词云相关
+# 是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
# 自定义词语及其分组
-#添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
+# 添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
CUSTOM_WORDS = {
'零几': '年份', # 将“零几”识别为一个整体
'高频词': '专业术语' # 示例自定义词
}
-#停用(禁用)词文件路径
+# 停用(禁用)词文件路径
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
-#中文字体文件路径
-FONT_PATH= "./docs/STZHONGS.TTF"
+# 中文字体文件路径
+FONT_PATH = "./docs/STZHONGS.TTF"
diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py
index 2ae4304..daa1c4c 100644
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -1,17 +1,15 @@
import asyncio
import json
-import random
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext
-from tenacity import (RetryError, retry, stop_after_attempt,
- wait_fixed)
+from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
-from model.m_baidu_tieba import TiebaNote, TiebaComment
+from model.m_baidu_tieba import TiebaComment, TiebaNote
from proxy.proxy_ip_pool import ProxyIpPool
from tools import utils
@@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient):
return res
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
- raise e
+ raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
"""
@@ -248,28 +246,44 @@ class BaiduTieBaClient(AbstractApiClient):
# raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
all_sub_comments: List[TiebaComment] = []
- for comment in comments:
- if comment.sub_comment_count == 0:
+        for parent_comment in comments:
+            if parent_comment.sub_comment_count == 0:
continue
current_page = 1
- max_sub_page_num = comment.sub_comment_count // 10 + 1
+            max_sub_page_num = parent_comment.sub_comment_count // 10 + 1
while max_sub_page_num >= current_page:
params = {
- "tid": comment.note_id, # 帖子ID
- "pid": comment.comment_id, # 父级评论ID
- "fid": comment.tieba_id, # 贴吧ID
+ "tid": parment_comment.note_id, # 帖子ID
+ "pid": parment_comment.comment_id, # 父级评论ID
+ "fid": parment_comment.tieba_id, # 贴吧ID
"pn": current_page # 页码
}
page_content = await self.get(uri, params=params, return_ori_content=True)
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
- parent_comment=comment)
+                                                                                 parent_comment=parent_comment)
if not sub_comments:
break
if callback:
- await callback(comment.note_id, sub_comments)
+                    await callback(parent_comment.note_id, sub_comments)
all_sub_comments.extend(sub_comments)
await asyncio.sleep(crawl_interval)
current_page += 1
return all_sub_comments
+
+ async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
+ """
+ 根据贴吧名称获取帖子列表
+ Args:
+ tieba_name: 贴吧名称
+            page_num: 分页参数 pn,按帖子序号偏移,每页 50 条
+
+ Returns:
+
+ """
+ uri = f"/f?kw={tieba_name}&pn={page_num}"
+ page_content = await self.get(uri, return_ori_content=True)
+ return self._page_extractor.extract_tieba_note_list(page_content)
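
A hypothetical usage sketch of the new `get_notes_by_tieba_name` method, assuming an already-initialized `BaiduTieBaClient` instance (`client`) with a logged-in browser context; Tieba's `pn` query parameter is a post offset rather than a page index, which is why it advances in steps of 50 here:

```python
import asyncio
from typing import List

from model.m_baidu_tieba import TiebaNote


async def fetch_tieba_list_pages(client, tieba_name: str, max_pn: int = 100) -> List[TiebaNote]:
    """Sketch: page through a tieba's list pages until max_pn or an empty page."""
    notes: List[TiebaNote] = []
    for pn in range(0, max_pn + 1, 50):  # one Tieba list page holds 50 posts
        page_notes = await client.get_notes_by_tieba_name(tieba_name=tieba_name, page_num=pn)
        if not page_notes:
            break
        notes.extend(page_notes)
        await asyncio.sleep(1)  # keep a polite interval between list-page requests
    return notes
```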
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index 2d10a0a..c8b8764 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler):
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
+ await self.get_specified_tieba_notes()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
@@ -92,7 +93,7 @@ class TieBaCrawler(AbstractCrawler):
if not notes_list:
utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
break
- utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}")
+ utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}")
await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
page += 1
except Exception as ex:
@@ -100,6 +101,34 @@ class TieBaCrawler(AbstractCrawler):
f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
break
+ async def get_specified_tieba_notes(self):
+ """
+        Get the information and comments of posts under the specified tieba names
+ Returns:
+
+ """
+ tieba_limit_count = 50
+ if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
+ config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
+ for tieba_name in config.TIEBA_NAME_LIST:
+ utils.logger.info(
+                f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin to get notes by tieba name: {tieba_name}")
+ page_number = 0
+ while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
+ note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
+ tieba_name=tieba_name,
+ page_num=page_number
+ )
+ if not note_list:
+ utils.logger.info(
+                        f"[BaiduTieBaCrawler.get_specified_tieba_notes] Note list is empty, tieba_name: {tieba_name}")
+ break
+
+ utils.logger.info(
+ f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
+ await self.get_specified_notes([note.note_id for note in note_list])
+ page_number += tieba_limit_count
+
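
Note that `page_number` is passed straight through as Tieba's `pn` offset, so `CRAWLER_MAX_NOTES_COUNT` acts here as an upper bound on that offset rather than a literal note count. A small illustrative calculation, assuming `CRAWLER_MAX_NOTES_COUNT = 200`, of which offsets the loop above requests:

```python
# Which pn offsets the loop above requests (illustrative values only).
tieba_limit_count = 50             # posts per Tieba list page
CRAWLER_MAX_NOTES_COUNT = 200      # assumed config value for this example

pn_offsets = list(range(0, CRAWLER_MAX_NOTES_COUNT + 1, tieba_limit_count))
print(pn_offsets)  # [0, 50, 100, 150, 200] -> up to 5 list pages, i.e. at most ~250 posts
```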
async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST):
"""
Get the information and comments of the specified post
diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py
index b46081d..4f3fe15 100644
--- a/media_platform/tieba/help.py
+++ b/media_platform/tieba/help.py
@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
-import re
-import json
import html
-from typing import List, Dict, Tuple
+import json
+import re
+from typing import Dict, List, Tuple
from parsel import Selector
-from model.m_baidu_tieba import TiebaNote, TiebaComment
from constant import baidu_tieba as const
+from model.m_baidu_tieba import TiebaComment, TiebaNote
from tools import utils
@@ -43,6 +43,42 @@ class TieBaExtractor:
result.append(tieba_note)
return result
+ def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
+ """
+ 提取贴吧帖子列表
+ Args:
+ page_content:
+
+ Returns:
+
+ """
+        page_content = page_content.replace('<!--', "")
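
The rest of `extract_tieba_note_list`'s body does not appear in this diff. For orientation only, a minimal parsel-based sketch of a Tieba list-page extractor; the XPath expressions, field names, and the plain-dict return type are assumptions about Tieba's forum list markup, not code from this repository (the real method returns `TiebaNote` objects, whose required fields are not visible here):

```python
from typing import Dict, List

from parsel import Selector


def extract_note_list_sketch(page_content: str) -> List[Dict]:
    """Sketch: pull thread id/title/url from a Tieba forum list page."""
    # Tieba hides much of the thread list inside HTML comments, so drop the
    # comment openers before parsing (mirrors the replace('<!--', "") call above).
    selector = Selector(text=page_content.replace("<!--", ""))
    result: List[Dict] = []
    for li in selector.xpath("//ul[@id='thread_list']/li"):
        href = li.xpath(".//a[contains(@class, 'j_th_tit')]/@href").get(default="")
        title = li.xpath(".//a[contains(@class, 'j_th_tit')]/@title").get(default="")
        if not href.startswith("/p/"):
            continue  # skip ads and other non-thread entries
        result.append({
            "note_id": href.split("/p/")[-1],
            "title": title.strip(),
            "note_url": "https://tieba.baidu.com" + href,
        })
    return result
```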