# -*- coding: utf-8 -*-
"""google_first_page_links.ipynb

Automatically generated by Colaboratory.

Scrapes the result links on the first page of a Google search restricted
to a single site and exports them to a CSV file.
"""
# !pip install lxml
# !pip install bs4
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
# Search word
WORD = "facebook.com"
# (Optional) Google's "site:" operator restricts results to one site
INSITE = "site:"
# Full query, restricted to the site
SEARCH_WORD = INSITE + WORD
# URL, headers and params for searching
url = "https://www.google.com/search"
headers = {"User-Agent": "link_scraper"}
# To search for the word alone (no site restriction), use WORD instead of SEARCH_WORD
params = {"q": SEARCH_WORD}
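# Illustration (not in the original notebook): requests percent-encodes the
# query, so the final URL is roughly
#   https://www.google.com/search?q=site%3Afacebook.com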
# Fetch the results page and grab its HTML text
response = requests.get(url, params=params, headers=headers)
text = response.text
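# Optional hardening (a sketch, not in the original notebook): fail fast if
# Google rejects automated clients, e.g. with HTTP 429
response.raise_for_status()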
# Create soup from the text
soup = BeautifulSoup(text, "lxml")
# Pattern for the links we want to keep
HREF = "https://www." + WORD + "/"
# Collect every anchor whose href contains the target URL
# (re.escape keeps the dots in the domain from matching any character)
link_list = []
for link in soup.find_all("a", href=re.compile(re.escape(HREF))):
    link_list.append(link)
# Clean the links: Google wraps each result as <a href="/url?q=<target>&...">,
# so drop everything from the first "&", then strip the redirect prefix.
# (Use a new name here; "url" already holds the search endpoint above.)
url_list = []
for link in link_list:
    cleaned = re.sub(r"&.+", "", str(link))
    cleaned = re.sub(r'<a href="/url\?q=', "", cleaned)
    url_list.append(cleaned)
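# Alternative sketch (not in the original notebook): read the href attribute
# directly instead of regex-editing the anchor's HTML string. Assumes the
# same classic "/url?q=<target>&..." redirect markup the regexes above target.
alt_urls = []
for link in link_list:
    href = link.get("href", "")
    if href.startswith("/url?q="):
        alt_urls.append(href[len("/url?q="):].split("&")[0])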
# Export one link per row, no header
links_df = pd.DataFrame(url_list)
links_df.to_csv(
    f"{WORD}_links.csv", index=False, header=False, encoding="utf-8"
)
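# Quick sanity check (a sketch, assuming the export above produced a
# non-empty file): load the CSV back and show the first few links
check_df = pd.read_csv(f"{WORD}_links.csv", header=None, names=["url"])
print(check_df.head())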