# coding=utf-8
try:
import requests
except ImportError:
print("ERROR: You need to install requests module.")
exit()
try:
    from bs4 import BeautifulSoup
except ImportError:
print("ERROR: You need to install bs4 module.")
exit()
import os
import urllib.parse
import math
import argparse
import re
import signal
import sys
from collections import deque
import threading
from time import sleep
from random import randint
class BaiduUrlThread(threading.Thread):
"""
    Thread class that resolves the real target URL behind a Baidu redirect URL.
"""
def __init__(self, title, redirect_url, item_results):
threading.Thread.__init__(self)
self.title = title
self.redirect_url = redirect_url
self.item_results = item_results
def run(self):
try:
loc_res = requests.head(self.redirect_url, allow_redirects=True, timeout=5)
except Exception:
return None
self.item_results.append((self.title, loc_res.url))
class InfoDork(object):
"""
_______. __ .___________. _______ _______ ______ .______ __ ___
/ || | | || ____|| \ / __ \ | _ \ | |/ /
| (----`| | `---| |----`| |__ | .--. | | | | | |_) | | ' /
\ \ | | | | | __| | | | | | | | | / | <
.----) | | | | | | |____ | '--' | `--' | | |\ \----.| . \\
|_______/ |__| |__| |_______||_______/ \______/ | _| `._____||__|\__\\
Author: repoog
Version: v1.2
Create Date: 2018.1.21
Last Update: 2019.5.20
Python Version: v3.6.4
"""
GOOGLE_DORK = {"subdomain": "site:{}",
"install": "site:{} inurl:readme OR inurl:license OR inurl:install OR inurl:setup",
"redirect": "site:{} inurl:redir OR inurl:url OR inurl:redirect OR inurl:return OR inurl:src=http",
"sensitive": "site:{} filetype:bak OR filetype:sql OR filetype:rar OR filetype:zip OR filetype:log",
"document": "site:{} filetype:doc OR filetype:docx OR filetype:csv OR filetype:pdf OR filetype:txt",
"extension": "site:{} filetype:cgi OR filetype:php OR filetype:aspx OR filetype:jsp OR filetype:swf OR filetype:fla OR filetype:xml"
}
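    # Illustrative expansion: GOOGLE_DORK["sensitive"].format("example.com") becomes
    # "site:example.com filetype:bak OR filetype:sql OR filetype:rar OR filetype:zip OR filetype:log"
    # before it is URL-encoded in dork_search().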
GOOGLE_MAX_PAGE = 100 # Max results per page of Google
BAIDU_DORK = {"subdomain": "site:{}",
"install": "site:{} inurl:setup",
"redirect": "site:{} inurl:redirect",
"sensitive": "site:{} filetype:log",
"document": "site:{} filetype:txt",
"extension": "site:{} filetype:php"
}
BAIDU_MAX_PAGE = 50 # Max results per page of Baidu
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
OUTPUT_PATH = "output/"
    def __init__(self, domain, limit):
        self.domain = domain
        self.limit = limit
        # Make sure the output directory exists before any search tries to write results.
        os.makedirs(InfoDork.OUTPUT_PATH, exist_ok=True)
def dork_search(self):
"""
        Search domains, files and emails with Google and Baidu dorks.
:return:
"""
for doc, value in InfoDork.GOOGLE_DORK.items():
print("[-] Executing to Google [%s]" % doc)
query = urllib.parse.quote_plus(value.format(self.domain))
self.google_search(doc, query)
for doc, value in InfoDork.BAIDU_DORK.items():
print("[-] Executing to Baidu [%s]" % doc)
query = urllib.parse.quote_plus(value.format(self.domain))
self.baidu_search(doc, query)
def get_query_pages(self, max_page_result):
"""
Get page count for searching and result count per page.
:param max_page_result: max results per page.
:return:
"""
if self.limit > max_page_result:
pages = int(math.ceil(self.limit / float(max_page_result)))
result_page = max_page_result
else:
pages = 1
result_page = self.limit
return pages, result_page
def google_search(self, doc, query):
"""
        Search Google for result titles and URLs matching the query string.
:param doc: document name
:param query: query string
:return: None
"""
search_results = []
output_file = InfoDork.OUTPUT_PATH + 'g-' + doc + '.txt'
main_url = "https://www.google.com/search?filter=0&start={0}&q={1}&num={2}"
header = {'user-agent': InfoDork.USER_AGENT, 'accept-language': 'en-US,en;q=0.5'}
pages, result_page = self.get_query_pages(InfoDork.GOOGLE_MAX_PAGE)
        result_count = None  # Total number of results reported by Google.
for page in range(pages):
try:
                result_html = requests.get(main_url.format(page * result_page, query, result_page), headers=header, timeout=20)
parse_html = BeautifulSoup(result_html.text, 'lxml')
except Exception as err:
print(err)
continue
# Sleep for random seconds.
sleep(randint(15, 30))
if result_count is None:
try:
count_text = parse_html.select("#resultStats")[0].children.__next__()
except Exception as err:
print(err)
break
result_count = int(re.search(r'([0-9\'\,]+)', count_text).group(1).replace(',', ''))
# Print progress line.
progress = int((page + 1) / float(pages) * 100)
sys.stdout.write("|" + ">" * progress + "|" + str(progress) + "%\r")
sys.stdout.flush() if progress != 100 else print('\n')
results = self.google_result_parse(parse_html.select("div.r a"))
if len(search_results) + len(results) > self.limit:
del results[self.limit - len(search_results):]
search_results += results
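        # A query that is just the bare "site:" operator (no '+'-encoded spaces after
        # quote_plus) is the subdomain dork; route it to the subdomain writer.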
is_subdomain = re.search(r'^site[^\+]*$', query)
if not is_subdomain:
self.output_file(output_file, search_results)
else:
self.output_subdomain(output_file, search_results)
@staticmethod
def google_result_parse(results):
"""
        Extract titles and URLs from search result anchor tags.
        :param results: anchor elements selected from the results page
        :return: list of (title, url) tuples
"""
item_results = []
for result in results:
title = result.get_text()
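            # Google result links are assumed to look like "/url?q=<target>&sa=...":
            # drop the 7-character "/url?q=" prefix and the trailing tracking parameters.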
domain = result.attrs['href'].split('&sa=')[0][7:]
if title and domain:
item_results.append((title, domain))
return item_results
def baidu_search(self, doc, query):
"""
        Search Baidu for result titles and URLs matching the query string.
:param doc: document type
:param query: query string
:return: None
"""
search_results = []
output_file = InfoDork.OUTPUT_PATH + 'b-' + doc + '.txt'
main_url = "https://www.baidu.com/s?ie=utf-8&cl=0&pn={0}&wd={1}&rn={2}"
header = {'user-agent': InfoDork.USER_AGENT, 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8'}
pages, result_page = self.get_query_pages(InfoDork.BAIDU_MAX_PAGE)
        result_count = None  # Total number of results reported by Baidu.
for page in range(pages):
try:
result_html = requests.get(main_url.format(page * result_page, query, result_page), headers=header, timeout=10)
parse_html = BeautifulSoup(result_html.text, 'lxml')
            except Exception as err:
                print(err)
                continue
# Sleep for random seconds.
sleep(randint(5, 10))
            if result_count is None:
                try:
                    count_text = parse_html.select(".nums")[0].get_text()
                    result_count = int(re.search(r'([0-9\'\,]+)', count_text).group(1).replace(',', ''))
                except (IndexError, AttributeError):
                    # No ".nums" element (or an unparsable count) means Baidu returned nothing useful.
                    break
                if result_count == 0:
                    break
# Print progress line.
progress = int((page + 1) / float(pages) * 100)
sys.stdout.write("|" + ">" * progress + "|" + str(progress) + "%\r")
sys.stdout.flush() if progress != 100 else print('\n')
results = self.baidu_result_parse(parse_html.select(".t > a"))
if len(search_results) + len(results) > self.limit:
del results[self.limit - len(search_results):]
search_results += results
is_subdomain = re.search(r'^site[^\+]*$', query)
if not is_subdomain:
self.output_file(output_file, search_results)
else:
self.output_subdomain(output_file, search_results)
@staticmethod
def baidu_result_parse(results):
"""
        Extract titles and URLs from search result anchor tags, resolving Baidu's
        redirect links in background threads.
        :param results: anchor elements selected from the results page
        :return: list of (title, url) tuples
"""
fetcher_threads = deque([])
item_results = []
for result in results:
title = result.get_text()
redirect_url = result.attrs['href']
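            # Throttle: wait until fewer than BAIDU_MAX_PAGE resolver threads are
            # alive before starting another one for this result.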
while 1:
running = 0
for thread in fetcher_threads:
if thread.is_alive():
running += 1
if running < InfoDork.BAIDU_MAX_PAGE:
break
true_url_thread = BaiduUrlThread(title, redirect_url, item_results)
true_url_thread.start()
fetcher_threads.append(true_url_thread)
for thread in fetcher_threads:
thread.join()
return item_results
@staticmethod
def output_file(file_name, results):
"""
Output to document with doc name.
:param file_name: document name
:param results: searched results.
:return: None
"""
if not results:
return
with open(file_name, "+a", encoding='utf-8') as f:
for item in results:
f.write(item[0] + "\n" + item[1] + "\n\n")
def output_subdomain(self, file_name, results):
"""
Output subdomains to document.
:param file_name: document name
:param results: searched results
:return: None
"""
subdomain_list = []
for item in results:
try:
subdomain = re.search(r'([a-zA-Z\d\-]{2,}\.){1,}[a-zA-Z]{2,}', item[1]).group(0)
if subdomain.find(self.domain) != -1:
subdomain_list.append(subdomain)
except Exception as err:
print(err)
subdomain_list = list(set(subdomain_list))
if not subdomain_list:
return
with open(file_name, "+a", encoding='utf-8') as f:
for subdomain in subdomain_list:
f.write(subdomain + "\n")
def sigint_handler(signum, frame):
    print('You pressed Ctrl+C, exiting.')
sys.exit(0)
def domain_valid(domain):
"""
    Validate the domain name argument for the argument parser.
:param domain: domain name
:return: domain name
"""
    is_valid = re.match(r'^([a-zA-Z\d\-]+\.)+[a-zA-Z]{2,}$', domain)
if not is_valid:
error_msg = '%s is not a valid domain.' % domain
raise argparse.ArgumentTypeError(error_msg)
return domain
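# Illustrative checks for the pattern above: "example.com" and "sub.example.co.uk"
# are accepted, while values without a dot such as "localhost" are rejected.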
if __name__ == '__main__':
signal.signal(signal.SIGINT, sigint_handler)
print(InfoDork.__doc__)
    parser = argparse.ArgumentParser(description="Information Dork tool for searching domains, files and emails.")
parser.add_argument('-l', '--limit', type=int, metavar='limit', default=100,
                        help='maximum number of search results (default: %(default)s)')
parser.add_argument('-d', '--domain', type=domain_valid, metavar='domain', required=True,
help='domain name for searching')
args = parser.parse_args()
# search information with Google and Baidu.
engine = InfoDork(args.domain, args.limit)
engine.dork_search()
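# Example run (illustrative):
#   python sitedork.py -d example.com -l 50
# Results are appended to output/g-<dork>.txt (Google) and output/b-<dork>.txt (Baidu);
# subdomain dorks produce a deduplicated list of matching hostnames instead.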