diff --git a/README.md b/README.md index 56c0c070..dc5557aa 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,10 @@ TikTokDownloader 轻量级工具 -QQ群聊 +QQ群聊
-

🔥 抖音视频/图集/直播下载工具:基于 Requests 模块实现;批量下载抖音账号发布页或者喜欢页的作品;单独下载抖音链接对应的作品;获取抖音直播推流地址;下载抖音直播视频。

+

🔥 抖音视频/图集/直播下载工具:基于 Requests 模块实现;批量下载抖音账号发布页或者喜欢页的作品;单独下载抖音链接对应的作品;获取抖音直播推流地址;下载抖音直播视频;抓取作品评论数据。

使用者在使用本项目的代码时,请遵守 GNU General Public License v3.0 开源协议。


@@ -27,15 +27,16 @@ * ✅ 获取直播推流地址 * ✅ 下载抖音直播视频 * ✅ Web UI 交互界面 -* ☑️ 抓取作品评论数据 +* ✅ 抓取作品评论数据 * ☑️ 下载 TikTok 无水印视频/图集 # 📈 项目状态 * 🟢 [Releases](https://github.com/JoeanAmier/TikTokDownloader/releases/latest) 发布的源码已通过测试,功能均可正常使用 * 🟢 已完成 Web UI 交互界面 +* 🟢 已完成作品评论数据抓取功能 +* 🟡 准备开发作品评论回复抓取功能 * 🟡 准备开发多进程模式,提高多账号批量下载效率 -* 🟡 准备加入作品评论抓取功能 * 🔴 最新版本的源码可能存在不稳定的Bug * 🔴 暂未发现影响使用的Bug,如果在使用过程中发现Bug,请及时告知作者修复 @@ -130,7 +131,7 @@ TikTokDownloader | time | str | 发布时间的格式,默认值:年-月-日 时.分.秒
(注意:Windows下文件名不能包含英文冒号“:”) | | split | str | 文件命名的分隔符,默认值:“-” | | music | list\[bool\] | 是否下载视频和图集的音乐,默认值:False | -| save | str | 详细数据保存格式,设置为空字符串代表不保存
目前支持: csv、xlsx、sql(SQLite) | +| save | str | 作品和评论数据保存格式,设置为空字符串代表不保存
目前支持: csv、xlsx、sql(SQLite) | | cookie | list\[str\] | 抖音网页版Cookie,必需参数
可以使用 Cookie_tool.py 写入配置文件 | | dynamic | list\[bool\] | 是否下载动态封面图,默认值:False | | original | list\[bool\] | 是否下载静态封面图,默认值:False | diff --git a/src/DataAcquirer.py b/src/DataAcquirer.py index f056852d..d8323f75 100644 --- a/src/DataAcquirer.py +++ b/src/DataAcquirer.py @@ -25,8 +25,11 @@ def reset(function): def inner(self, *args, **kwargs): if not isinstance(self.url, bool): self.id_ = None + self.data = None + self.comment = [] + self.cursor = 0 self.max_cursor = 0 - self.list = None # 未处理的数据 + self.list = [] # 未处理的数据 self.name = None # 账号昵称 self.video_data = [] # 视频ID数据 self.image_data = [] # 图集ID数据 @@ -48,8 +51,8 @@ def inner(self, *args, **kwargs): return inner -def retry(max_num=3): - """发生错误时尝试重新执行""" +def retry(max_num=10): + """发生错误时尝试重新执行,装饰的函数需要返回布尔值""" def inner(function): def execute(self, *args, **kwargs): @@ -73,17 +76,27 @@ class UserData: r"^https://www\.douyin\.com/user/([a-zA-z0-9-_]+)(?:\?modal_id=([0-9]{19}))?.*$") # 账号链接 works_link = re.compile( r"^https://www\.douyin\.com/(?:video|note)/([0-9]{19})$") # 作品链接 - live_link = re.compile(r"^https://live\.douyin\.com/([0-9]+)$") # 直播链接 + live_link = re.compile(r"^https://live\.douyin\.com/([0-9]+)\?*.+") # 直播链接 live_api = "https://live.douyin.com/webcast/room/web/enter/" # 直播API + comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/" # 评论API + reply_api = "https://www.douyin.com/aweme/v1/web/comment/list/reply/" # 评论回复API + """评论回复API参数 + "item_id": "7248064381664136486", + "comment_id": "7248089935747449604", + """ clean = Cleaner() # 过滤非法字符 + max_comment = 256 # 评论字数限制 def __init__(self, log: LoggerManager): self.xb = XBogus() # 加密参数对象 self.log = log # 日志记录对象 + self.data = None # 数据记录对象,仅评论抓取调用 self._cookie = False # 是否设置了Cookie self.id_ = None # sec_uid or item_ids - self.max_cursor = 0 - self.list = None # 未处理的数据 + self.comment = [] # 评论数据 + self.cursor = 0 # 评论页使用 + self.max_cursor = 0 # 发布页和喜欢页使用 + self.list = [] # 未处理的数据 self.name = None # 账号昵称 self.video_data = [] # 视频ID数据 self.image_data = [] # 图集ID数据 @@ -94,6 +107,7 @@ def __init__(self, log: LoggerManager): self._url = None # 账号链接 self._api = None # 批量下载类型 self._proxies = None # 代理 + self._time = None # 创建时间格式 @property def url(self): @@ -199,6 +213,24 @@ def proxies(self, value): "ftp": None, } + @property + def time(self): + return self._time + + @time.setter + def time(self, value): + if value: + try: + _ = time.strftime(value, time.localtime()) + self._time = value + self.log.info(f"时间格式设置成功: {value}", False) + except ValueError: + self.log.warning(f"时间格式错误: {value},将使用默认时间格式(年-月-日 时.分.秒)") + self._time = "%Y-%m-%d %H.%M.%S" + else: + self.log.warning("错误的时间格式,将使用默认时间格式(年-月-日 时.分.秒)") + self._time = "%Y-%m-%d %H.%M.%S" + @retry(max_num=5) def get_id(self, value="sec_user_id", url=None): """获取账号ID或者作品ID""" @@ -252,14 +284,13 @@ def get_user_data(self): proxies=self.proxies, timeout=10) except requests.exceptions.ReadTimeout: - print("请求超时!") + self.log.error("请求超时") return False sleep() if response.status_code == 200: try: data = response.json() except requests.exceptions.JSONDecodeError: - self.list = [] self.log.error("数据接口返回内容异常!疑似接口失效", False) return False try: @@ -267,11 +298,9 @@ def get_user_data(self): self.list = data["aweme_list"] return True except KeyError: - self.list = [] self.log.error(f"响应内容异常: {data}", False) return False else: - self.list = [] self.log.error(f"响应码异常:{response.status_code},获取JSON数据失败") return False @@ -325,7 +354,7 @@ def get_nickname(self): self.name = str(time.time())[:10] self.log.warning( f"请求超时,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}") - return + return False if response.status_code == 200: try: data = response.json() @@ -333,20 +362,23 @@ def get_nickname(self): self.name = str(time.time())[:10] self.log.warning( f"数据接口返回内容异常,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}") - return + return False try: self.name = self.clean.filter( data["aweme_list"][0]["author"]["nickname"]) or str( time.time())[ :10] + return True except KeyError: self.name = str(time.time())[:10] self.log.warning( f"响应内容异常,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}") + return False else: self.name = str(time.time())[:10] self.log.warning( f"响应码异常:{response.status_code},获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: {self.name}") + return False def early_stop(self): """如果获取数据的发布日期已经早于限制日期,就不需要再获取下一页的数据了""" @@ -391,9 +423,6 @@ def run(self, index: int): @check_cookie def run_alone(self, text: str): """单独下载模式""" - if not self.cookie: - self.log.warning("请检查Cookie是否正确") - return False url = self.check_url(text) if not url: self.log.warning("无效的作品链接") @@ -455,7 +484,7 @@ def get_live_data(self, link: str): proxies=self.proxies) return response.json() except requests.exceptions.ReadTimeout: - print("请求超时!") + self.log.warning("请求超时") return False except requests.exceptions.JSONDecodeError: self.log.warning("直播数据接口返回内容格式错误") @@ -472,40 +501,78 @@ def deal_live_data(self, data): cover = data["data"]["data"][0]["cover"]["url_list"][0] return nickname, title, url, cover + @reset + @check_cookie + def run_comment(self, id_: str, data): + self.data = data + while not self.finish: + self.get_comment(id_) + self.deal_comment() -class CommentData: - comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/" - params = { - "device_platform": "webapp", - "aid": "6383", - "channel": "channel_pc_web", - "aweme_id": "7246706009123720503", - "cursor": "0", - "count": "20", - "item_type": "0", - "insert_ids": "", - "rcFT": "", - "pc_client_type": "1", - "version_code": "170400", - "version_name": "17.4.0", - "cookie_enabled": "true", - "screen_width": "1536", - "screen_height": "864", - "browser_language": "zh-CN", - "browser_platform": "Win32", - "browser_name": "Edge", - "browser_version": "114.0.1823.58", - "browser_online": "true", - "engine_name": "Blink", - "engine_version": "114.0.0.0", - "os_name": "Windows", - "os_version": "10", - "cpu_core_num": "16", - "device_memory": "8", - "platform": "PC", - "downlink": "10", - "effective_type": "4g", - "round_trip_time": "50", - "webid": "7248584490175383100", - "msToken": "FX-6vWAx3sPmINCegC_qzzS46gfcN9LHHoaaKBtf8DYrBSmGXT803q4j0uzx0fDkFFUj1bPrkfA6O1tBTwUJi4RZGz3OkqEqI8RtIBu1X1NBeT60BHItrM2gK3jRVdI=", - "X-Bogus": "DFSzswVL6lJANSwctnrmvGUClLxV"} + @retry(max_num=5) + def get_comment(self, id_: str): + params = { + "aid": "6383", + "aweme_id": id_, + "cursor": self.cursor, + "count": "20", + "cookie_enabled": "true", + "platform": "PC", } + params = self.deal_params(params) + try: + response = requests.get( + self.comment_api, + params=params, + headers=self.headers, + proxies=self.proxies, + timeout=10) + except requests.exceptions.ReadTimeout: + self.log.error("请求超时") + return False + sleep() + if response.status_code == 200: + try: + data = response.json() + except requests.exceptions.JSONDecodeError: + self.log.error("数据接口返回内容异常!疑似接口失效", False) + return False + try: + self.comment = data["comments"] + self.cursor = data["cursor"] + return True + except KeyError: + self.log.error(f"响应内容异常: {data}", False) + return False + else: + self.log.error(f"响应码异常:{response.status_code},获取JSON数据失败") + return False + + def deal_comment(self): + if not self.comment: + self.log.info("该作品的评论数据获取结束") + self.finish = True + return + for item in self.comment: + """数据格式: 评论ID, 评论时间, 用户昵称, IP归属地, 评论内容, 点赞数量, 回复数量, 回复ID""" + create_time = time.strftime( + self.time, + time.localtime( + item["create_time"])) + ip_label = item["ip_label"] + text = item["text"][:self.max_comment] + nickname = item["user"]["nickname"] + digg_count = str(item["digg_count"]) + cid = item["cid"] + reply_comment_total = str(item["reply_comment_total"]) + reply_id = item["reply_id"] + result = [ + cid, + create_time, + nickname, + ip_label, + text, + digg_count, + reply_comment_total, + reply_id] + self.log.info("评论: " + ", ".join(result)) + self.data.save(result) diff --git a/src/DataDownloader.py b/src/DataDownloader.py index 22a1e410..27ea99cf 100644 --- a/src/DataDownloader.py +++ b/src/DataDownloader.py @@ -238,25 +238,22 @@ def get_data(self, item): } xb = self.xb.get_x_bogus(urlencode(params)) params["X-Bogus"] = xb - for _ in range(3): # 获取数据为空时重新尝试 - try: - response = requests.get( - self.item_ids_api, - params=params, - proxies=self.proxies, - headers=self.headers, timeout=10) - sleep() - if response.status_code == 200 and response.text: - try: - return response.json()["aweme_detail"] - except (KeyError, IndexError): - self.log.error(f"响应内容异常: {response.json()}", False) - return False - except requests.exceptions.ReadTimeout: - continue - self.log.error( - f"资源 {item} 获取 item_list 失败") - return False + try: + response = requests.get( + self.item_ids_api, + params=params, + proxies=self.proxies, + headers=self.headers, timeout=10) + sleep() + if response.status_code == 200 and response.text: + try: + return response.json()["aweme_detail"] + except (KeyError, IndexError): + self.log.error(f"响应内容异常: {response.json()}", False) + return False + except requests.exceptions.ReadTimeout: + self.log.error(f"请求超时,资源 {item} 获取 item_list 失败") + return False def get_info(self, data, type_): """ @@ -444,7 +441,7 @@ def run_alone(self, id_: str, download=True): self.create_folder(self.folder) data = self.get_data(id_) if not data: - self.log.warning("获取作品详细信息失败!") + self.log.warning("获取作品详细信息失败") return False self.nickname = self.clean.filter(data["author"]["nickname"]) if data["images"]: diff --git a/src/Recorder.py b/src/Recorder.py index 5e4a30e3..47dd59c2 100644 --- a/src/Recorder.py +++ b/src/Recorder.py @@ -154,7 +154,13 @@ def save(self, *args, **kwargs): class CSVLogger: """CSV格式记录""" - def __init__(self, root: str, name="Download", title_line=None): + def __init__( + self, + root: str, + name="Download", + title_line=None, + *args, + **kwargs): self.file = None # 文件对象 self.writer = None # CSV对象 self.root = root # 文件路径 @@ -188,7 +194,13 @@ def save(self, data): class XLSXLogger: """XLSX格式""" - def __init__(self, root: str, name="Download", title_line=None): + def __init__( + self, + root: str, + name="Download", + title_line=None, + *args, + **kwargs): self.book = None # XLSX数据簿 self.sheet = None # XLSX数据表 self.root = root # 文件路径 @@ -228,12 +240,14 @@ def __init__( self, root: str, name="Download", + file="TikTokDownloader.db", title_line=None, title_type=None): self.db = None # 数据库 self.cursor = None # 游标对象 self.root = root # 文件路径 self.name = name # 数据表名称 + self.file = file # 数据库文件名称 self.title_line = title_line or RecordManager.title # 数据表列名 self.title_type = title_type or RecordManager.title_type # 数据表数据类型 @@ -242,8 +256,8 @@ def __enter__(self): os.mkdir(self.root) self.db = sqlite3.connect( os.path.join( - self.root, - "TikTokDownloader.db")) + self.root, self.file + )) self.cursor = self.db.cursor() self.create() return self @@ -269,7 +283,7 @@ class RecordManager: "CHARACTER(2) NOT NULL", "CHARACTER(19) PRIMARY KEY", "CHARACTER(128) NOT NULL", - "CHARACTER(19) NOT NULL", + "CHARACTER(20) NOT NULL", "CHARACTER(20) NOT NULL", "CHARACTER(64)", ) diff --git a/src/main_complete.py b/src/main_complete.py index 4d5d6b6f..439c90aa 100644 --- a/src/main_complete.py +++ b/src/main_complete.py @@ -19,6 +19,25 @@ class TikTok: "xlsx": XLSXLogger, "sql": SQLLogger, } + Comment_Title = ( + "评论ID", + "评论时间", + "用户昵称", + "IP归属地", + "评论内容", + "点赞数量", + "回复数量", + "回复ID") + Comment_Type = ( + "CHARACTER(19) PRIMARY KEY", + "CHARACTER(20) NOT NULL", + "CHARACTER(20) NOT NULL", + "CHARACTER(10) NOT NULL", + "CHARACTER(256) NOT NULL", + "INTEGER NOT NULL", + "INTEGER NOT NULL", + "CHARACTER(19) NOT NULL", + ) def __init__(self): self.record = None @@ -114,7 +133,7 @@ def single_acquisition(self): self.download.data = data while True: url = input("请输入分享链接:") - if url in ("Q", "q", ""): + if not url: break id_ = self.request.run_alone(url) if not id_: @@ -166,7 +185,8 @@ def set_parameters(self): self.download.folder = self._data["folder"] self.download.name = self._data["name"] self.download.music = self._data["music"] - self.download.time = self._data["time"] + self.request.time = self._data["time"] + self.download.time = self.request.time self.download.split = self._data["split"] self.download.cookie = self._data["cookie"] self.request.cookie = self._data["cookie"] @@ -175,13 +195,28 @@ def set_parameters(self): self.request.proxies = self._data["proxies"] self.download.proxies = self.request.proxies + def comment_acquisition(self): + data_root = RecordManager.run(self._data["root"], "Comment") + save_file = self.DataLogger.get(self._data["save"], NoneLogger) + while True: + url = input("请输入作品链接:") + if not url: + break + id_ = self.request.run_alone(url) + if not id_: + self.record.error(f"{url} 获取 aweme_id 失败") + continue + with save_file(data_root, f"作品_{id_}", file="CommentData.db", title_line=self.Comment_Title, + title_type=self.Comment_Type) as data: + self.request.run_comment(id_, data) + def run(self): if not self.check_config(): return False self.initialize() self.set_parameters() select = input( - "请选择下载模式:\n1. 批量下载账号作品\n2. 单独下载链接作品\n3. 获取直播推流地址\n输入序号:") + "请选择下载模式:\n1. 批量下载账号作品\n2. 单独下载链接作品\n3. 获取直播推流地址\n4. 抓取作品评论数据\n输入序号:") """兼容旧版本的Python,版本小于3.10不支持match语法""" # match select: # case "1": @@ -202,6 +237,9 @@ def run(self): elif select == "3": self.record.info("已选择直播下载模式") self.live_acquisition() + elif select == "4": + self.record.info("已选择评论抓取模式") + self.comment_acquisition() self.record.info("程序运行结束")