-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMegaChain.py
191 lines (173 loc) · 6.92 KB
/
MegaChain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import os
import re
import gzip
import xml.etree.ElementTree as ET
import requests
from lxml import etree
from loguru import logger
from CustomExceptions import WrongChainFileException, NoStoreException, NoSuchStoreException
from Chain import Chain
# Matches directory-listing hrefs that begin with eight digits followed by a
# slash. Written as a raw string so `\d` is not treated as an (invalid)
# string escape sequence.
dateLinkR = re.compile(r'\d{8}/')
class MegaChain(Chain):
    '''
    Chain implementation for sites that publish their files as a plain
    HTML listing of folders, with the files of each folder listed inside it.
    No credentials are used: username and password are passed to Chain as None.
    '''

    def __init__(self, db, url, name, chainId, manu=None, itemCodes=None, codeCategoryR=None):
        '''
        Set up the chain without credentials
        ---------------------
        Parameters:
            db - forwarded to Chain.__init__
            url - base url of the listing site
            name - chain name
            chainId - expected chain id
            manu, itemCodes, codeCategoryR - forwarded unchanged to Chain.__init__
        '''
        username = None
        password = None
        super().__init__(db, url, username, password, name, chainId,
                         manu=manu, itemCodes=itemCodes, codeCategoryR=codeCategoryR)

    def login(self):
        '''
        Create the HTTP session; no login request is made
        ---------------------
        Parameters:
        Uses:
        =====================
        Return:
            a fresh requests.Session object
        Side effects:
        '''
        return requests.Session()

    def download_page(self, page=1, updateDate=None, firstOfLast=None):
        '''
        Collect links to price files in folders dated updateDate or later
        ---------------------
        Parameters:
            page - not used by this implementation
            updateDate - reference date; when None, self._getLatestDate() is used
            firstOfLast - not used by this implementation
        Uses:
            dateLinkR - picks the date folders out of the root listing
            self.priceR - picks the price files inside each folder
        =====================
        Return:
            1. list of dicts with keys 'link' (download url) and 'name' (file name)
            2. False, shouldn't continue paging
        Side effects:
            one HTTP request for the root listing and one per visited folder
        '''
        if updateDate is None:
            updateDate = self._getLatestDate()

        folders = self._getFolderContent(reFilter=dateLinkR)
        links = []
        # NOTE(review): the walk stops at the first folder older than
        # updateDate, so it relies on the listing being ordered newest
        # first - confirm against the site.
        for folder in folders:
            # folder hrefs end with '/', strip it before parsing the date
            date = self._todatetime(folder[:-1])
            if date < updateDate:
                self._log(f"Stop paging, reached foldr date: {folder}")
                break
            files = self._getFolderContent(folder=folder, reFilter=self.priceR)
            self._log(f"Found {len(files)} links in folder {folder}")
            links.extend({'link': f'{self.url}/{folder}{file}', 'name': file} for file in files)
        return links, False

    def getStoreFile(self, updating):
        '''
        Get file with chain stores for updating
        ---------------------
        Parameters:
            updating - when truthy, an already present local copy of the
                       store file raises NoSuchStoreException
        Uses:
            dateLinkR, self.storeR
        =====================
        Return:
            the result of self._download_gz (presumably the location of the
            stored file - verify against Chain)
        Raises:
            NoSuchStoreException - updating and
                                   "{self.dirname}/{storeFileName}.gz" exists
            IndexError - the root listing has no date folder or the first
                         folder has no file matching self.storeR
        Side effects:
            Download file with stores data
        '''
        folders = self._getFolderContent(reFilter=dateLinkR)
        # NOTE(review): only the first listed folder and its first matching
        # file are considered; an empty listing fails with IndexError.
        folder = folders[0]
        folderFiles = self._getFolderContent(reFilter=self.storeR, folder=folder)
        storeFileName = folderFiles[0]
        link = f'{self.url}/{folder}{storeFileName}'
        if updating and os.path.exists(f"{self.dirname}/{storeFileName}.gz"):
            raise NoSuchStoreException
        return self._download_gz(storeFileName, link)

    def obtainStores(self, fn):
        '''
        Read a gzipped stores XML file and insert the stores not yet known
        ---------------------
        Parameters:
            fn - file name of the gzipped, utf-16 encoded stores XML
        =====================
        Return:
            None
        Raises:
            WrongChainFileException - self.chainId is set and differs from
                                      the ChainId found in the file
        Side effects:
            replaces self.chainId with the value returned by
            self._getChain / self._insertChain
            inserts missing subchains and stores through the
            self._insert* helpers
        '''
        self._log(f"Obtaining stores from {fn}")
        with gzip.open(fn, 'rt', encoding='utf-16') as f:
            data = f.read()
        context = ET.fromstring(data)

        chainId = int(context.find('.//ChainId').text)
        if self.chainId is not None and chainId != self.chainId:
            # chainId in file should be like setup
            logger.error(f"Chain {self.chainId}: file with wrong chain Id {chainId} supplied {fn}")
            raise WrongChainFileException

        # TypeError from _getChain is treated as "chain not registered yet"
        try:
            self.chainId = self._getChain(chainId)
        except TypeError:
            self.chainId = self._insertChain(chainId)

        subchains = self._getSubchains(self.chainId)
        stores = self._getStores(self.chainId)
        subchainsElem = context.find('.//SubChains')
        storesIns = {}
        storeLinks = {}
        for sc in subchainsElem:
            subchainId = int(sc.find('SubChainId').text)
            if subchainId in subchains:
                subchain = subchains[subchainId]
            else:
                # The name is free text; converting it with int() would fail
                # with ValueError for any non-numeric name.
                subchainName = sc.find('SubChainName').text
                subchain = self._insertSubchain(self.chainId, subchainId, subchainName)
                subchains[subchainId] = subchain

            storesElem = sc.find('Stores')
            for store in storesElem:
                storeId = int(store.find("StoreId").text)
                if storeId in stores:
                    # already known, nothing to insert
                    continue
                storeName = store.find("StoreName").text
                city = store.find("City").text
                storesIns[storeId] = [self.chainId, storeId, storeName, city]
                storeLinks[storeId] = subchain

        self._insertStores(storesIns, storeLinks)

    # ========== PRIVATE ==========
    def _getFolderContent(self, reFilter, folder=''):
        '''
        In MegaChain interface get links to data
        ---------------------
        Parameters:
            reFilter - precompiled regex; only hrefs it matches from their
                       start are returned
            folder - folder (relative to self.url) to list, '' for the root
        Uses:
            self.session, self.url
        =====================
        Return:
            list of href strings
            NOTE(review): with reFilter None the raw <a> elements are
            returned instead of href strings - confirm whether any caller
            depends on that
        Side effects:
            one HTTP GET request
        '''
        self._log(f"searching for table for files in folder {folder} regex {reFilter}")
        r = self.session.get(f'{self.url}/{folder}')
        html = etree.HTML(r.text)
        # entries of the listing are anchors inside top-aligned table cells
        linksXml = html.findall(".//td[@valign='top']/a")
        if reFilter is None:
            return linksXml
        return [linkXml.attrib['href'] for linkXml in linksXml
                if reFilter.match(linkXml.attrib['href'])]
class YBitan(MegaChain):
    '''
    MegaChain configuration for the chain named "YBitan"
    '''

    def __init__(self, db):
        '''
        Fix the site url, chain id, item codes and category regex of the chain
        ---------------------
        Parameters:
            db - forwarded to MegaChain.__init__
        '''
        super().__init__(
            db,
            "http://publishprice.ybitan.co.il",
            "YBitan",
            7290725900003,
            itemCodes=[
                7290016334166,
                7290003706167,
                3560071348007,
                7290000011593,
            ],
            # three digits, optionally preceded by a single '1'
            codeCategoryR=re.compile(r'^1?\d{3}$'),
        )
class Mega(MegaChain):
    '''
    MegaChain configuration for the chain named "Mega"
    '''

    def __init__(self, db):
        '''
        Fix the site url, chain id and category regex of the chain
        ---------------------
        Parameters:
            db - forwarded to MegaChain.__init__
        '''
        name = "Mega"
        url = "http://publishprice.mega.co.il"
        chainId = 7290055700007
        # a '1' followed by exactly three digits
        codeCategoryR = re.compile(r'^1\d{3}$')
        # No item codes are defined for this chain. Passing the undefined
        # name `itemCodes` raised NameError on every instantiation, so the
        # base-class default (None) is passed explicitly instead.
        super().__init__(db, url, name, chainId, itemCodes=None, codeCategoryR=codeCategoryR)
class MegaMarket(MegaChain):
    '''
    MegaChain configuration for the chain named "MegaMarket"
    '''

    def __init__(self, db):
        '''
        Fix the site url, chain id and category regex of the chain
        ---------------------
        Parameters:
            db - forwarded to MegaChain.__init__
        '''
        name = "MegaMarket"
        url = "http://publishprice.mega-market.co.il"
        chainId = 7290055700014
        # a '1' followed by exactly three digits
        codeCategoryR = re.compile(r'^1\d{3}$')
        # No item codes are defined for this chain. Passing the undefined
        # name `itemCodes` raised NameError on every instantiation, so the
        # base-class default (None) is passed explicitly instead.
        super().__init__(db, url, name, chainId, itemCodes=None, codeCategoryR=codeCategoryR)