# GH_Crawler.py
# coding:utf-8
'''
@File : GH_Crawler.py
@Time : 2021/10/1
@Author : Jarcis-cy
@Link : https://github.com/Jarcis-cy/Google-Hacking-Crawler
'''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import argparse
import pprint
import time
import pandas as pd
from urllib import parse
# Collect the links that appear on the current results page.
def getUrl():
    """Return the result links found on the page loaded in ``browser``.

    Reads the module-level ``browser`` WebDriver and selects the anchor
    elements matched by the result XPath below.

    Returns:
        list: The ``href`` attribute of every matched anchor, in the order
        Selenium returns them. Anchors whose ``href`` is missing or empty
        are skipped so callers only receive usable URL strings.
    """
    # ``find_elements_by_xpath`` was removed in Selenium 4.3; the generic
    # ``find_elements(By.XPATH, ...)`` form works on both 3.x and 4.x.
    anchors = browser.find_elements(By.XPATH, '//*[@class="g"]/div/div/div/a')
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    return [href for href in hrefs if href]
# Check whether the given element exists on the current page.
def isElementExist(by, element):
    """Return whether ``browser`` can locate an element on the current page.

    Args:
        by: Locator strategy forwarded to ``browser.find_element`` (the
            callers in this file pass ``By.ID``).
        element: Locator value forwarded to ``browser.find_element``.

    Returns:
        bool: ``True`` when the lookup succeeds, ``False`` when Selenium
        raises ``NoSuchElementException``.

    Any other exception (driver failure, ``KeyboardInterrupt``, ...) is
    propagated instead of being silently reported as "not found".
    """
    # Function-scope import: brings in the one new name this block needs
    # without touching the file-level imports.
    from selenium.common.exceptions import NoSuchElementException

    try:
        browser.find_element(by, element)
    except NoSuchElementException:
        return False
    return True
# Crawl up to the requested number of result pages.
def allPage(page, timeSleep):
    """Collect result links page by page, following the "next" button.

    Args:
        page: Maximum number of result pages to crawl.
        timeSleep: Seconds to sleep before processing each page.

    Returns:
        list: One entry per crawled page, each being the list returned by
        ``getUrl()`` for that page. Always a list: it is empty when
        ``page`` is zero or negative, or when the first page has no
        element with id ``search``.

    Crawling stops early when the ``search`` element is missing or when no
    element with id ``pnnext`` is present on the page.
    """
    allUrl = []
    print("开始爬取")
    for i in range(page):
        pageNo = str(i + 1)
        time.sleep(timeSleep)
        print("当前为第" + pageNo + "页")
        if not isElementExist(By.ID, "search"):
            print("不存在元素,全部爬取完成!")
            return allUrl
        allUrl.append(getUrl())
        print("第" + pageNo + "页爬取完成")
        if not isElementExist(By.ID, "pnnext"):
            print("没有下一页,全部爬取完成!")
            return allUrl
        if i == page - 1:
            # Last requested page: finish without clicking "next", since the
            # page it would load is never read.
            print("全部爬取完成!")
            return allUrl
        # ``find_element_by_id`` was removed in Selenium 4.3.
        browser.find_element(By.ID, "pnnext").click()
    # Reached only when the loop body never ran (page <= 0); return the empty
    # list rather than falling through with an implicit None.
    return allUrl
# Split each URL into its domain and write the result as CSV.
def createCsv(aList, csvName):
    """Write the crawled URLs and their domains to a CSV file.

    Args:
        aList: Iterable of per-page URL lists, as produced by ``allPage``.
            ``None`` or an empty list yields a CSV containing only the
            header row.
        csvName: Destination handed to ``DataFrame.to_csv``.

    The CSV has two columns, ``domain`` (the ``netloc`` part of the URL)
    and ``url``, and no index column. Missing or empty URLs are skipped:
    ``urlparse(None)`` would otherwise produce a bytes ``netloc`` that ends
    up in the file as ``b''``.
    """
    domain = []
    urlL = []
    for pageUrls in aList or []:
        for url in pageUrls:
            if not url:
                continue
            domain.append(parse.urlparse(url).netloc)
            urlL.append(url)
    df = pd.DataFrame({"domain": domain, "url": urlL})
    df.to_csv(csvName, index=False)
# Command-line arguments.
# Default output file name: the current Unix timestamp plus ".csv".
csvName = str(int(time.time())) + ".csv"
parser = argparse.ArgumentParser()
# ``store_false`` means ``args.gpu`` is True unless --gpu is passed, so the
# browser runs headless by default and --gpu makes the Chrome window visible.
parser.add_argument('--gpu', action="store_false", help='输入该参数将显示chrome,显示爬取过程,默认为False')
parser.add_argument('-s', type=str, default='site:.com', help='请输入你想搜索的google hacking语句,默认为site:.com,以此作为测试')
parser.add_argument('-p', type=int, default=1, help='请输入你想搜索的页数,默认1页')
parser.add_argument('-t', type=int, default=3, help='请输入翻下一页停顿的时间,默认3秒')
parser.add_argument('-r', type=str, default=csvName, help='请输入你想输出的文件名称,默认为'+csvName)
args = parser.parse_args()

# Browser setup.
chrome_options = Options()
if args.gpu:
    chrome_options.add_argument('--headless')
    # NOTE(review): the original indentation was lost; --disable-gpu is
    # assumed to belong with --headless - confirm.
    chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# The ``chrome_options=`` keyword is deprecated and no longer accepted by
# current Selenium 4 releases; ``options=`` is the supported spelling.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get("https://www.google.com")
    # Type the query and press the search button.
    # NOTE(review): these absolute XPaths depend on the exact layout of the
    # Google home page - confirm they still match.
    # ``find_element_by_xpath`` was removed in Selenium 4.3.
    browser.find_element(By.XPATH, "/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input").send_keys(args.s)
    browser.find_element(By.XPATH, "/html/body/div[1]/div[3]/form/div[1]/div[1]/div[3]/center/input[1]").click()
    # Crawl the result pages and write the URLs to the CSV file.
    createCsv(allPage(args.p, args.t), args.r)
finally:
    # Always shut the browser down, even when crawling fails, so no Chrome
    # process is left running.
    browser.quit()