diff --git a/README.md b/README.md
index 56c0c070..dc5557aa 100644
--- a/README.md
+++ b/README.md
@@ -8,10 +8,10 @@
-
+
-
🔥 抖音视频/图集/直播下载工具:基于 Requests 模块实现;批量下载抖音账号发布页或者喜欢页的作品;单独下载抖音链接对应的作品;获取抖音直播推流地址;下载抖音直播视频。
+🔥 抖音视频/图集/直播下载工具:基于 Requests 模块实现;批量下载抖音账号发布页或者喜欢页的作品;单独下载抖音链接对应的作品;获取抖音直播推流地址;下载抖音直播视频;抓取作品评论数据。
⭐ 使用者在使用本项目的代码时,请遵守 GNU General Public License v3.0 开源协议。
@@ -27,15 +27,16 @@
* ✅ 获取直播推流地址
* ✅ 下载抖音直播视频
* ✅ Web UI 交互界面
-* ☑️ 抓取作品评论数据
+* ✅ 抓取作品评论数据
* ☑️ 下载 TikTok 无水印视频/图集
# 📈 项目状态
* 🟢 [Releases](https://github.com/JoeanAmier/TikTokDownloader/releases/latest) 发布的源码已通过测试,功能均可正常使用
* 🟢 已完成 Web UI 交互界面
+* 🟢 已完成作品评论数据抓取功能
+* 🟡 准备开发作品评论回复抓取功能
* 🟡 准备开发多进程模式,提高多账号批量下载效率
-* 🟡 准备加入作品评论抓取功能
* 🔴 最新版本的源码可能存在不稳定的Bug
* 🔴 暂未发现影响使用的Bug,如果在使用过程中发现Bug,请及时告知作者修复
@@ -130,7 +131,7 @@ TikTokDownloader
| time | str | 发布时间的格式,默认值:年-月-日 时.分.秒
(注意:Windows下文件名不能包含英文冒号“:”) |
| split | str | 文件命名的分隔符,默认值:“-” |
| music | list\[bool\] | 是否下载视频和图集的音乐,默认值:False |
-| save | str | 详细数据保存格式,设置为空字符串代表不保存
目前支持: csv、xlsx、sql(SQLite) |
+| save | str | 作品和评论数据保存格式,设置为空字符串代表不保存
目前支持: csv、xlsx、sql(SQLite) |
| cookie | list\[str\] | 抖音网页版Cookie,必需参数
可以使用 Cookie_tool.py 写入配置文件 |
| dynamic | list\[bool\] | 是否下载动态封面图,默认值:False |
| original | list\[bool\] | 是否下载静态封面图,默认值:False |
diff --git a/src/DataAcquirer.py b/src/DataAcquirer.py
index f056852d..d8323f75 100644
--- a/src/DataAcquirer.py
+++ b/src/DataAcquirer.py
@@ -25,8 +25,11 @@ def reset(function):
def inner(self, *args, **kwargs):
if not isinstance(self.url, bool):
self.id_ = None
+ self.data = None
+ self.comment = []
+ self.cursor = 0
self.max_cursor = 0
- self.list = None # 未处理的数据
+ self.list = [] # 未处理的数据
self.name = None # 账号昵称
self.video_data = [] # 视频ID数据
self.image_data = [] # 图集ID数据
@@ -48,8 +51,8 @@ def inner(self, *args, **kwargs):
return inner
-def retry(max_num=3):
- """发生错误时尝试重新执行"""
+def retry(max_num=10):
+ """发生错误时尝试重新执行,装饰的函数需要返回布尔值"""
def inner(function):
def execute(self, *args, **kwargs):
@@ -73,17 +76,27 @@ class UserData:
r"^https://www\.douyin\.com/user/([a-zA-z0-9-_]+)(?:\?modal_id=([0-9]{19}))?.*$") # 账号链接
works_link = re.compile(
r"^https://www\.douyin\.com/(?:video|note)/([0-9]{19})$") # 作品链接
- live_link = re.compile(r"^https://live\.douyin\.com/([0-9]+)$") # 直播链接
+    # Live-room link. Group 1 is the numeric room ID. An optional suffix that
+    # starts with "/", "?" or "#" is tolerated, so bare room URLs still match
+    # and the ID is always captured in full (a trailing ".+" would force the
+    # digit group to give up its last digit on a bare URL).
+    live_link = re.compile(r"^https://live\.douyin\.com/([0-9]+)(?:[/?#].*)?$")
live_api = "https://live.douyin.com/webcast/room/web/enter/" # 直播API
+ comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/" # 评论API
+ reply_api = "https://www.douyin.com/aweme/v1/web/comment/list/reply/" # 评论回复API
+ """评论回复API参数
+ "item_id": "7248064381664136486",
+ "comment_id": "7248089935747449604",
+ """
clean = Cleaner() # 过滤非法字符
+ max_comment = 256 # 评论字数限制
def __init__(self, log: LoggerManager):
self.xb = XBogus() # 加密参数对象
self.log = log # 日志记录对象
+ self.data = None # 数据记录对象,仅评论抓取调用
self._cookie = False # 是否设置了Cookie
self.id_ = None # sec_uid or item_ids
- self.max_cursor = 0
- self.list = None # 未处理的数据
+ self.comment = [] # 评论数据
+ self.cursor = 0 # 评论页使用
+ self.max_cursor = 0 # 发布页和喜欢页使用
+ self.list = [] # 未处理的数据
self.name = None # 账号昵称
self.video_data = [] # 视频ID数据
self.image_data = [] # 图集ID数据
@@ -94,6 +107,7 @@ def __init__(self, log: LoggerManager):
self._url = None # 账号链接
self._api = None # 批量下载类型
self._proxies = None # 代理
+ self._time = None # 创建时间格式
@property
def url(self):
@@ -199,6 +213,24 @@ def proxies(self, value):
"ftp": None,
}
+    @property
+    def time(self):
+        """Return the strftime pattern in use (None until the setter has run)."""
+        return self._time
+
+    @time.setter
+    def time(self, value):
+        """Validate and store the strftime pattern used for timestamps.
+
+        A falsy value, or one that ``time.strftime`` rejects, is replaced by
+        the default pattern and a warning is logged.
+        """
+        default = "%Y-%m-%d %H.%M.%S"
+        if not value:
+            self.log.warning("错误的时间格式,将使用默认时间格式(年-月-日 时.分.秒)")
+            self._time = default
+            return
+        try:
+            # Dry run against the current time; the result itself is not needed.
+            time.strftime(value, time.localtime())
+        except (ValueError, TypeError):
+            # TypeError covers a truthy non-string value (e.g. a number in the
+            # config file), which strftime refuses before looking at directives.
+            self.log.warning(f"时间格式错误: {value},将使用默认时间格式(年-月-日 时.分.秒)")
+            self._time = default
+        else:
+            self._time = value
+            self.log.info(f"时间格式设置成功: {value}", False)
+
@retry(max_num=5)
def get_id(self, value="sec_user_id", url=None):
"""获取账号ID或者作品ID"""
@@ -252,14 +284,13 @@ def get_user_data(self):
proxies=self.proxies,
timeout=10)
except requests.exceptions.ReadTimeout:
- print("请求超时!")
+ self.log.error("请求超时")
return False
sleep()
if response.status_code == 200:
try:
data = response.json()
except requests.exceptions.JSONDecodeError:
- self.list = []
self.log.error("数据接口返回内容异常!疑似接口失效", False)
return False
try:
@@ -267,11 +298,9 @@ def get_user_data(self):
self.list = data["aweme_list"]
return True
except KeyError:
- self.list = []
self.log.error(f"响应内容异常: {data}", False)
return False
else:
- self.list = []
self.log.error(f"响应码异常:{response.status_code},获取JSON数据失败")
return False
@@ -325,7 +354,7 @@ def get_nickname(self):
self.name = str(time.time())[:10]
self.log.warning(
f"请求超时,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}")
- return
+ return False
if response.status_code == 200:
try:
data = response.json()
@@ -333,20 +362,23 @@ def get_nickname(self):
self.name = str(time.time())[:10]
self.log.warning(
f"数据接口返回内容异常,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}")
- return
+ return False
try:
self.name = self.clean.filter(
data["aweme_list"][0]["author"]["nickname"]) or str(
time.time())[
:10]
+ return True
except KeyError:
self.name = str(time.time())[:10]
self.log.warning(
f"响应内容异常,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}")
+ return False
else:
self.name = str(time.time())[:10]
self.log.warning(
f"响应码异常:{response.status_code},获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}")
+ return False
def early_stop(self):
"""如果获取数据的发布日期已经早于限制日期,就不需要再获取下一页的数据了"""
@@ -391,9 +423,6 @@ def run(self, index: int):
@check_cookie
def run_alone(self, text: str):
"""单独下载模式"""
- if not self.cookie:
- self.log.warning("请检查Cookie是否正确")
- return False
url = self.check_url(text)
if not url:
self.log.warning("无效的作品链接")
@@ -455,7 +484,7 @@ def get_live_data(self, link: str):
proxies=self.proxies)
return response.json()
except requests.exceptions.ReadTimeout:
- print("请求超时!")
+ self.log.warning("请求超时")
return False
except requests.exceptions.JSONDecodeError:
self.log.warning("直播数据接口返回内容格式错误")
@@ -472,40 +501,78 @@ def deal_live_data(self, data):
cover = data["data"]["data"][0]["cover"]["url_list"][0]
return nickname, title, url, cover
+    @reset
+    @check_cookie
+    def run_comment(self, id_: str, data):
+        """Fetch the comment pages of one work and pass the rows to ``data``.
+
+        :param id_: ID of the work whose comments are requested.
+        :param data: record writer; ``deal_comment`` calls its ``save`` per row.
+        """
+        self.data = data
+        while not self.finish:
+            # get_comment only assigns self.comment on success. Clear the
+            # previous page first so a failed request yields an empty page
+            # (which ends the loop in deal_comment) instead of the stale page
+            # being logged and saved again on every pass.
+            self.comment = []
+            self.get_comment(id_)
+            self.deal_comment()
-class CommentData:
- comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/"
- params = {
- "device_platform": "webapp",
- "aid": "6383",
- "channel": "channel_pc_web",
- "aweme_id": "7246706009123720503",
- "cursor": "0",
- "count": "20",
- "item_type": "0",
- "insert_ids": "",
- "rcFT": "",
- "pc_client_type": "1",
- "version_code": "170400",
- "version_name": "17.4.0",
- "cookie_enabled": "true",
- "screen_width": "1536",
- "screen_height": "864",
- "browser_language": "zh-CN",
- "browser_platform": "Win32",
- "browser_name": "Edge",
- "browser_version": "114.0.1823.58",
- "browser_online": "true",
- "engine_name": "Blink",
- "engine_version": "114.0.0.0",
- "os_name": "Windows",
- "os_version": "10",
- "cpu_core_num": "16",
- "device_memory": "8",
- "platform": "PC",
- "downlink": "10",
- "effective_type": "4g",
- "round_trip_time": "50",
- "webid": "7248584490175383100",
- "msToken": "FX-6vWAx3sPmINCegC_qzzS46gfcN9LHHoaaKBtf8DYrBSmGXT803q4j0uzx0fDkFFUj1bPrkfA6O1tBTwUJi4RZGz3OkqEqI8RtIBu1X1NBeT60BHItrM2gK3jRVdI=",
- "X-Bogus": "DFSzswVL6lJANSwctnrmvGUClLxV"}
+    @retry(max_num=5)
+    def get_comment(self, id_: str):
+        """Request one page of comments for the work ``id_``.
+
+        On success the page is stored in ``self.comment``, ``self.cursor`` is
+        updated from the response, and True is returned. Every failure path
+        logs the reason and returns False.
+        """
+        query = self.deal_params({
+            "aid": "6383",
+            "aweme_id": id_,
+            "cursor": self.cursor,
+            "count": "20",
+            "cookie_enabled": "true",
+            "platform": "PC",
+        })
+        try:
+            response = requests.get(
+                self.comment_api,
+                params=query,
+                headers=self.headers,
+                proxies=self.proxies,
+                timeout=10)
+        except requests.exceptions.ReadTimeout:
+            self.log.error("请求超时")
+            return False
+        sleep()
+        if response.status_code != 200:
+            self.log.error(f"响应码异常:{response.status_code},获取JSON数据失败")
+            return False
+        try:
+            payload = response.json()
+        except requests.exceptions.JSONDecodeError:
+            self.log.error("数据接口返回内容异常!疑似接口失效", False)
+            return False
+        try:
+            self.comment = payload["comments"]
+            self.cursor = payload["cursor"]
+        except KeyError:
+            self.log.error(f"响应内容异常: {payload}", False)
+            return False
+        return True
+
+    def deal_comment(self):
+        """Convert the fetched comment page into rows; log and save each row.
+
+        An empty ``self.comment`` is treated as the end of the comment list:
+        ``self.finish`` is set and nothing is saved.
+        """
+        if not self.comment:
+            self.log.info("该作品的评论数据获取结束")
+            self.finish = True
+            return
+        for entry in self.comment:
+            # Row layout: comment ID, comment time, user nickname, IP location,
+            # comment text, like count, reply count, reply ID.
+            posted_at = time.strftime(
+                self.time, time.localtime(entry["create_time"]))
+            location = entry["ip_label"]
+            content = entry["text"][:self.max_comment]  # cap at max_comment characters
+            author = entry["user"]["nickname"]
+            likes = str(entry["digg_count"])
+            comment_id = entry["cid"]
+            replies = str(entry["reply_comment_total"])
+            reply_to = entry["reply_id"]
+            row = [
+                comment_id,
+                posted_at,
+                author,
+                location,
+                content,
+                likes,
+                replies,
+                reply_to,
+            ]
+            self.log.info("评论: " + ", ".join(row))
+            self.data.save(row)
diff --git a/src/DataDownloader.py b/src/DataDownloader.py
index 22a1e410..27ea99cf 100644
--- a/src/DataDownloader.py
+++ b/src/DataDownloader.py
@@ -238,25 +238,22 @@ def get_data(self, item):
}
xb = self.xb.get_x_bogus(urlencode(params))
params["X-Bogus"] = xb
- for _ in range(3): # 获取数据为空时重新尝试
- try:
- response = requests.get(
- self.item_ids_api,
- params=params,
- proxies=self.proxies,
- headers=self.headers, timeout=10)
- sleep()
- if response.status_code == 200 and response.text:
- try:
- return response.json()["aweme_detail"]
- except (KeyError, IndexError):
- self.log.error(f"响应内容异常: {response.json()}", False)
- return False
- except requests.exceptions.ReadTimeout:
- continue
- self.log.error(
- f"资源 {item} 获取 item_list 失败")
- return False
+ try:
+ response = requests.get(
+ self.item_ids_api,
+ params=params,
+ proxies=self.proxies,
+ headers=self.headers, timeout=10)
+ sleep()
+ if response.status_code == 200 and response.text:
+ try:
+ return response.json()["aweme_detail"]
+ except (KeyError, IndexError):
+ self.log.error(f"响应内容异常: {response.json()}", False)
+ return False
+ except requests.exceptions.ReadTimeout:
+ self.log.error(f"请求超时,资源 {item} 获取 item_list 失败")
+ return False
def get_info(self, data, type_):
"""
@@ -444,7 +441,7 @@ def run_alone(self, id_: str, download=True):
self.create_folder(self.folder)
data = self.get_data(id_)
if not data:
- self.log.warning("获取作品详细信息失败!")
+ self.log.warning("获取作品详细信息失败")
return False
self.nickname = self.clean.filter(data["author"]["nickname"])
if data["images"]:
diff --git a/src/Recorder.py b/src/Recorder.py
index 5e4a30e3..47dd59c2 100644
--- a/src/Recorder.py
+++ b/src/Recorder.py
@@ -154,7 +154,13 @@ def save(self, *args, **kwargs):
class CSVLogger:
"""CSV格式记录"""
- def __init__(self, root: str, name="Download", title_line=None):
+ def __init__(
+ self,
+ root: str,
+ name="Download",
+ title_line=None,
+ *args,
+ **kwargs):
self.file = None # 文件对象
self.writer = None # CSV对象
self.root = root # 文件路径
@@ -188,7 +194,13 @@ def save(self, data):
class XLSXLogger:
"""XLSX格式"""
- def __init__(self, root: str, name="Download", title_line=None):
+ def __init__(
+ self,
+ root: str,
+ name="Download",
+ title_line=None,
+ *args,
+ **kwargs):
self.book = None # XLSX数据簿
self.sheet = None # XLSX数据表
self.root = root # 文件路径
@@ -228,12 +240,14 @@ def __init__(
self,
root: str,
name="Download",
+ file="TikTokDownloader.db",
title_line=None,
title_type=None):
self.db = None # 数据库
self.cursor = None # 游标对象
self.root = root # 文件路径
self.name = name # 数据表名称
+ self.file = file # 数据库文件名称
self.title_line = title_line or RecordManager.title # 数据表列名
self.title_type = title_type or RecordManager.title_type # 数据表数据类型
@@ -242,8 +256,8 @@ def __enter__(self):
os.mkdir(self.root)
self.db = sqlite3.connect(
os.path.join(
- self.root,
- "TikTokDownloader.db"))
+ self.root, self.file
+ ))
self.cursor = self.db.cursor()
self.create()
return self
@@ -269,7 +283,7 @@ class RecordManager:
"CHARACTER(2) NOT NULL",
"CHARACTER(19) PRIMARY KEY",
"CHARACTER(128) NOT NULL",
- "CHARACTER(19) NOT NULL",
+ "CHARACTER(20) NOT NULL",
"CHARACTER(20) NOT NULL",
"CHARACTER(64)",
)
diff --git a/src/main_complete.py b/src/main_complete.py
index 4d5d6b6f..439c90aa 100644
--- a/src/main_complete.py
+++ b/src/main_complete.py
@@ -19,6 +19,25 @@ class TikTok:
"xlsx": XLSXLogger,
"sql": SQLLogger,
}
+ # Column headers for saved comment rows, passed as title_line to the data logger
+ # in comment_acquisition: comment ID, comment time, user nickname, IP location,
+ # comment text, like count, reply count, reply ID.
+ Comment_Title = (
+ "评论ID",
+ "评论时间",
+ "用户昵称",
+ "IP归属地",
+ "评论内容",
+ "点赞数量",
+ "回复数量",
+ "回复ID")
+ # Column type declarations, one per Comment_Title entry in the same order;
+ # passed as title_type to the data logger in comment_acquisition.
+ Comment_Type = (
+ "CHARACTER(19) PRIMARY KEY",
+ "CHARACTER(20) NOT NULL",
+ "CHARACTER(20) NOT NULL",
+ "CHARACTER(10) NOT NULL",
+ "CHARACTER(256) NOT NULL",
+ "INTEGER NOT NULL",
+ "INTEGER NOT NULL",
+ "CHARACTER(19) NOT NULL",
+ )
def __init__(self):
self.record = None
@@ -114,7 +133,7 @@ def single_acquisition(self):
self.download.data = data
while True:
url = input("请输入分享链接:")
- if url in ("Q", "q", ""):
+ if not url:
break
id_ = self.request.run_alone(url)
if not id_:
@@ -166,7 +185,8 @@ def set_parameters(self):
self.download.folder = self._data["folder"]
self.download.name = self._data["name"]
self.download.music = self._data["music"]
- self.download.time = self._data["time"]
+ self.request.time = self._data["time"]
+ self.download.time = self.request.time
self.download.split = self._data["split"]
self.download.cookie = self._data["cookie"]
self.request.cookie = self._data["cookie"]
@@ -175,13 +195,28 @@ def set_parameters(self):
self.request.proxies = self._data["proxies"]
self.download.proxies = self.request.proxies
+    def comment_acquisition(self):
+        """Prompt for work links and scrape the comments of each one.
+
+        Entering an empty line ends the loop. A link that yields no work ID is
+        reported through ``self.record`` and skipped.
+        """
+        data_root = RecordManager.run(self._data["root"], "Comment")
+        logger_cls = self.DataLogger.get(self._data["save"], NoneLogger)
+        # Two-argument iter(): keep prompting until input() returns "".
+        for link in iter(lambda: input("请输入作品链接:"), ""):
+            work_id = self.request.run_alone(link)
+            if not work_id:
+                self.record.error(f"{link} 获取 aweme_id 失败")
+                continue
+            recorder = logger_cls(
+                data_root,
+                f"作品_{work_id}",
+                file="CommentData.db",
+                title_line=self.Comment_Title,
+                title_type=self.Comment_Type)
+            with recorder as data:
+                self.request.run_comment(work_id, data)
+
def run(self):
if not self.check_config():
return False
self.initialize()
self.set_parameters()
select = input(
- "请选择下载模式:\n1. 批量下载账号作品\n2. 单独下载链接作品\n3. 获取直播推流地址\n输入序号:")
+ "请选择下载模式:\n1. 批量下载账号作品\n2. 单独下载链接作品\n3. 获取直播推流地址\n4. 抓取作品评论数据\n输入序号:")
"""兼容旧版本的Python,版本小于3.10不支持match语法"""
# match select:
# case "1":
@@ -202,6 +237,9 @@ def run(self):
elif select == "3":
self.record.info("已选择直播下载模式")
self.live_acquisition()
+ elif select == "4":
+ self.record.info("已选择评论抓取模式")
+ self.comment_acquisition()
self.record.info("程序运行结束")