-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
134 lines (112 loc) · 2.83 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Copyright (c) 2023.
# !/usr/bin/python
# -*- coding: UTF-8 -*-
# @Project: social_media_data_acquisition
# @FileName: Utils/clean.py
# @Author:hz157
# @DateTime: 24/1/2023 下午5:57
import re
def CleanTopic(data):
    """
    Remove every Weibo ``#topic#`` tag from the text.

    The first ``#`` and the next ``#`` after it delimit a tag; every
    occurrence of that exact ``#...#`` span is deleted, and the scan
    repeats until no complete pair is left. An unpaired ``#`` is kept.

    :param data: text to clean; a non-string value is returned unchanged
    :return: the text with all ``#...#`` tags removed
    """
    if not isinstance(data, str):
        # Best effort: missing or non-text values pass through untouched.
        return data
    # Loop instead of recursing once per tag, so a text with a very large
    # number of tags cannot exhaust the call stack and come back only
    # partially cleaned.
    while True:
        start = data.find('#')
        if start == -1:
            return data
        end = data.find('#', start + 1)
        if end == -1:
            return data
        data = data.replace(data[start:end + 1], '')
def Replace(data):
    """
    Strip zero-width spaces and turn HTML line breaks into plain spaces.

    Removes the literal text ``ZWSP`` and the zero-width space character
    (U+200B), then replaces ``<br />`` and ``<br>`` with a single space
    each.

    :param data: text to clean
    :return: the cleaned text
    """
    # The literal 'ZWSP' is still removed first so existing results are
    # unchanged; U+200B is the actual invisible character the name refers to.
    visible = data.replace('ZWSP', '').replace('\u200b', '')
    return visible.replace('<br />', ' ').replace('<br>', ' ')
def CleanAt(data):
    """
    Remove every ``@user`` mention from the text.

    A mention runs from ``@`` up to and including the next space. When no
    space follows (the mention ends the text), it runs to the end of the
    text. Every occurrence of the extracted mention is deleted, and the
    scan repeats until no ``@`` is left.

    :param data: text to clean; a non-string value is returned unchanged
    :return: the text with all mentions removed
    """
    if not isinstance(data, str):
        # Best effort: missing or non-text values pass through untouched.
        return data
    # Loop instead of recursing once per mention, so a text with very many
    # mentions cannot exhaust the call stack.
    while True:
        start = data.find('@')
        if start == -1:
            return data
        space = data.find(' ', start + 1)
        # A trailing mention has no space after it; cut through to the end
        # rather than leaving it in place.
        end = len(data) if space == -1 else space + 1
        data = data.replace(data[start:end], '')
# Strip emoji characters
def CleanEmoji(data):
    """
    Remove emoji by deleting every character outside the Basic
    Multilingual Plane (U+10000 through U+10FFFF).

    Symbols below U+10000 are left in place. If the wide range cannot be
    compiled, a surrogate-pair pattern is used instead.

    :param data: text to clean
    :return: the text without the matched characters
    """
    wide_range = u'[\U00010000-\U0010ffff]'
    surrogate_pairs = u'[\uD800-\uDBFF][\uDC00-\uDFFF]'
    try:
        pattern = re.compile(wide_range)
    except re.error:
        pattern = re.compile(surrogate_pairs)
    cleaned = pattern.sub('', data)
    return cleaned
def CleanALabel(data):
    """
    Remove every HTML ``<a ...>...</a>`` element, including its content.

    The first ``<a`` and the next ``</a>`` after it delimit an element;
    every occurrence of that exact span is deleted, and the scan repeats
    until no complete element is left. An opening tag without a closing
    ``</a>`` is kept. Note that the opening match is the plain prefix
    ``<a``, so any tag whose name merely starts with "a" also counts.

    :param data: text to clean; a non-string value is returned unchanged
    :return: the text with all anchor elements removed
    """
    if not isinstance(data, str):
        # Best effort: missing or non-text values pass through untouched.
        return data
    # Loop instead of recursing once per element, so a text with very many
    # links cannot exhaust the call stack and come back partially cleaned.
    while True:
        start = data.find('<a')
        if start == -1:
            return data
        close = data.find('</a>', start + 1)
        if close == -1:
            return data
        data = data.replace(data[start:close + len('</a>')], '')
def CleanSpanLabel(data):
    """
    Remove every HTML ``<span ...>...</span>`` element, including its content.

    The first ``<span`` and the next ``</span>`` after it delimit an
    element; every occurrence of that exact span of text is deleted, and
    the scan repeats until no complete element is left. An opening tag
    without a closing ``</span>`` is kept.

    :param data: text to clean; a non-string value is returned unchanged
    :return: the text with all span elements removed
    """
    if not isinstance(data, str):
        # Best effort: missing or non-text values pass through untouched.
        return data
    # Loop instead of recursing once per element, so a text with very many
    # spans cannot exhaust the call stack and come back partially cleaned.
    while True:
        start = data.find('<span')
        if start == -1:
            return data
        close = data.find('</span>', start + 1)
        if close == -1:
            return data
        data = data.replace(data[start:close + len('</span>')], '')
def CleanBracket(data):
    """
    Remove every ``【...】`` block, brackets and content included.

    The first ``【`` and the next ``】`` after it delimit a block; every
    occurrence of that exact block is deleted, and the scan repeats until
    no complete pair is left. An unclosed ``【`` is kept.

    :param data: text to clean; a non-string value is returned unchanged
    :return: the text with all ``【...】`` blocks removed
    """
    if not isinstance(data, str):
        # Best effort: missing or non-text values pass through untouched.
        return data
    # Loop instead of recursing once per block, so a text with very many
    # blocks cannot exhaust the call stack and come back partially cleaned.
    while True:
        start = data.find('【')
        if start == -1:
            return data
        close = data.find('】', start + 1)
        if close == -1:
            return data
        data = data.replace(data[start:close + 1], '')
def CleanChar(data):
    """
    Run the marker cleaners in order: CleanTopic, then CleanAt, then
    CleanBracket.

    :param data: text to clean
    :return: the result of the last cleaner
    """
    after_topics = CleanTopic(data)
    after_mentions = CleanAt(after_topics)
    return CleanBracket(after_mentions)
def CleanHTML(data):
    """
    Run the HTML cleaners in order: CleanALabel, then CleanSpanLabel.

    :param data: text to clean
    :return: the result of the last cleaner
    """
    after_anchors = CleanALabel(data)
    return CleanSpanLabel(after_anchors)
def CleanOther(data):
    """
    Run the remaining cleaners in order: Replace, then CleanEmoji.

    :param data: text to clean
    :return: the result of the last cleaner
    """
    replaced = Replace(data)
    return CleanEmoji(replaced)
def CleanData(data):
    """
    Apply the full cleaning pipeline: CleanOther first, then CleanHTML,
    then CleanChar.

    :param data: raw text to clean
    :return: the fully cleaned text
    """
    stage = CleanOther(data)
    stage = CleanHTML(stage)
    stage = CleanChar(stage)
    return stage