crawl.py
from abc import ABC, abstractmethod
import json

import requests
from bs4 import BeautifulSoup

from config import BASE_LINK, STORAGE_TYPE
from parser import AdvertisementPageParser
from storage import FileStorage, MongoStorage


class CrawlerBase(ABC):
    """Base class shared by the link crawler and the data crawler."""

    def __init__(self):
        self.storage = self.__set_storage()

    @staticmethod
    def __set_storage():
        # Choose the storage backend based on the STORAGE_TYPE setting in config.
        if STORAGE_TYPE == 'mongo':
            return MongoStorage()
        return FileStorage()

    @abstractmethod
    def start(self, store=False):
        """Start the crawl.

        Saving automatically was not desirable, so the `store` parameter was
        added: pass store=True when the results should be persisted.
        """
        pass

    @abstractmethod
    def store(self, data, filename=None):
        pass

    @staticmethod
    def get(link):
        """Shared HTTP GET helper, needed by both crawler classes."""
        try:
            response = requests.get(link)
        except requests.RequestException:
            # requests.get() raises RequestException subclasses (e.g. ConnectionError),
            # not HTTPError, so catch the base class here.
            return None
        return response
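

# A minimal sketch of what config.py is assumed to provide (the real values are not
# shown in this file, so the examples below are illustrative assumptions only):
# BASE_LINK should contain a `{}` placeholder for the city and end with the query
# parameter that start_crawl_city() appends the pagination offset to, while
# STORAGE_TYPE selects the backend returned by __set_storage().
#
#   BASE_LINK = 'https://{}.example.org/search/?s='  # hypothetical value
#   STORAGE_TYPE = 'mongo'                           # anything else falls back to FileStorage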


class LinkCrawler(CrawlerBase):
    """Crawls all advertisement links on the main (listing) pages."""

    def __init__(self, cities, link=BASE_LINK):
        self.cities = cities
        self.link = link
        super().__init__()

    # Superseded by the shared CrawlerBase.get() helper:
    # def get_page(self, url, start=0):
    #     try:
    #         response = requests.get(url + str(start))
    #     except:
    #         return None
    #     return response

    def find_links(self, html_doc):
        soup = BeautifulSoup(html_doc, 'html.parser')
        return soup.find_all('a', attrs={'class': 'hdrlnk'})

    def start_crawl_city(self, url):
        start = 0
        crawl = True
        list_link_house = list()
        while crawl:
            # Pagination is handled manually by appending the offset to the URL.
            response = self.get(url + str(start))
            if response is None:
                crawl = False
                continue
            new_links = self.find_links(response.text)
            list_link_house.extend(new_links)
            start += 120
            crawl = bool(len(new_links))
        return list_link_house

    def start(self, store=False):
        adv_links = list()
        for city in self.cities:
            links = self.start_crawl_city(self.link.format(city))
            print(f'{city} total: ', len(links))
            adv_links.extend(links)
        # It would be cleaner to move the storing step into a separate method.
        if store:
            # Convert the Tag objects to plain dicts; otherwise they cannot be serialized.
            self.store([{'url': li.get('href'), 'flag': False} for li in adv_links])
        return adv_links

    def store(self, data, *args):
        # filename is taken via *args so that passing it stays optional.
        self.storage.store(data, 'advertisements_links')
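

# Each stored link document (as written by LinkCrawler.store above) is assumed to
# have the shape {'url': '<advertisement url>', 'flag': False}. DataCrawler below
# loads these documents back via storage.load() and is assumed to use
# storage.update_flag() to mark a link as already crawled.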


class DataCrawler(CrawlerBase):
    """Crawls the details (description, id, price, ...) on each advertisement's detail page."""

    def __init__(self):
        # Call the parent __init__ first so that self.storage exists before
        # __load_links() is called on it.
        super().__init__()
        self.links = self.__load_links()
        self.parser = AdvertisementPageParser()  # composition

    def __load_links(self):
        return self.storage.load()

    def start(self, store=False):
        for link in self.links:
            response = self.get(link['url'])
            if response is None:
                # Skip links whose detail page could not be fetched.
                continue
            data_dict = self.parser.parse(response.text)
            if store:
                # If the ad has no post_id, fall back to 'sample' as the filename
                # (a random name could also be derived from the link instead).
                self.store(data_dict, data_dict.get('post_id', 'sample'))
            self.storage.update_flag(link)

    def store(self, data, *args):
        # Note: since detail pages are crawled here, the filename must not be null.
        self.storage.store(data, 'advertisement_data')
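

# A minimal usage sketch. The real entry point presumably lives elsewhere (e.g. a
# separate main.py), and the city slugs below are hypothetical placeholders that
# must match whatever BASE_LINK.format(city) expects.
if __name__ == '__main__':
    link_crawler = LinkCrawler(cities=['paris', 'berlin'])
    link_crawler.start(store=True)  # collect advertisement links and persist them

    data_crawler = DataCrawler()    # loads the previously stored links
    data_crawler.start(store=True)  # crawl each detail page and store the parsed data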