-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
executable file
·154 lines (132 loc) · 4.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python
import json
import os
import requests
import shutil
import hyperlink
import re
import click
from PIL import Image
from fpdf import FPDF
from pathlib import Path
from tqdm import tqdm
# Globals
# Current verbosity threshold: set_log_level() overwrites it and log() only
# echoes messages whose level is less than or equal to this value.
verbosity = 0
def prepare_paths(download_path: str, image_path: str, bin_path: str, cleanup=False)->bool:
    """Remove the download tree, or create it when it does not exist yet.

    The download directory is deleted when ``cleanup`` is true or when it
    already exists on disk; nothing is recreated in that case and ``True`` is
    returned. Otherwise the download, image and binary directories are
    created, in that order, and ``False`` is returned.
    """
    # "cleanup or (exists and not cleanup)" reduces to "cleanup or exists".
    must_remove = cleanup or os.path.isdir(download_path)
    if must_remove:
        log("Cleaning download directory", 1)
        shutil.rmtree(download_path)
        return True

    # (message, log level, directory) in creation order: parent first.
    targets = (
        (f'Storing downloads on: {download_path}', 1, download_path),
        (f'Storing Images on: {image_path}', 2, image_path),
        (f'Storing Binaries on: {bin_path}', 2, bin_path),
    )
    for note, level, directory in targets:
        log(note, level)
        os.mkdir(directory)
    return False
# TODO: Check if the file is already there
def downloadFile(uri: str, path: str, headers: dict):
    """Download ``https://<uri>`` into the directory ``path``.

    Args:
        uri: Scheme-less address (``host/path/to/file``). Its first segment
            is sent as the ``Host`` header and its last segment becomes the
            local file name.
        path: Target directory, joined onto the current working directory.
            It must already exist.
        headers: Base HTTP headers. The mapping is copied, never modified.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never stored as if it were the requested file.
    """
    segments = uri.split('/')
    filename = os.path.join(os.getcwd(), path, segments[-1])

    # Copy first: the per-download Host header must not leak into the
    # caller's dictionary, which is reused for other hosts.
    down_headers = dict(headers)
    down_headers['Host'] = segments[0]

    # Fetch before opening the file so a failed request leaves no empty file
    # (and no dangling handle) behind.
    response = requests.request("GET", "https://" + uri, headers=down_headers)
    response.raise_for_status()

    with open(filename, 'wb') as target:
        target.write(response.content)
def extract_number_from_page(page_uri: str) -> int:
    """Return the page number embedded in an image URI.

    The number is the run of digits that immediately follows ``page_``,
    e.g. ``.../page_12.jpg`` gives ``12``.

    Args:
        page_uri: Image address expected to contain ``page_<digits>``.

    Returns:
        The page number as an integer.

    Raises:
        SystemExit: If no ``page_<digits>`` marker is present. The message
            is written to stderr and the process exit status is 1.
    """
    # The look-behind keeps "page_" out of the match itself.
    # https://regex101.com/r/2ZGOJj/2
    match = re.search(r'(?<=page_)\d+', page_uri)
    if match is None:
        raise SystemExit(f'Could not find a page number in: {page_uri}')
    return int(match.group())
def set_log_level(verbose):
    """Store ``verbose`` as the module-wide verbosity and announce it.

    The announcement is only echoed when the new level is greater than zero.
    """
    global verbosity
    verbosity = verbose
    if not verbosity > 0:
        return
    click.echo(message=f'Verbosity leverl set to: {verbosity}', color=True)
def log(message:str, level=0):
    """Echo ``message`` unless ``level`` exceeds the current verbosity."""
    if not level <= verbosity:
        return
    click.echo(message)
def get_reader_url(url: str) -> str:
    """Build the reader JSON address that belongs to a document URL.

    Every path segment except ``docs`` is kept, ``reader3_4.json`` is
    appended, the host is replaced by ``reader3.isu.pub`` and the query
    string is dropped. The resulting address is logged and returned as text.
    """
    parsed = hyperlink.parse(url)
    segments = [segment for segment in parsed.path if segment != 'docs']
    segments.append('reader3_4.json')
    reader_link = parsed.replace(
        host="reader3.isu.pub",
        path=segments,
        query=[],
    )
    log('URL with no GET Query Params: {u}'.format(u=reader_link))
    return str(reader_link)
def get_json(url: str, headers) -> dict:
    """Fetch ``url`` with a GET request and return the decoded JSON body.

    Args:
        url: Absolute address of the JSON resource.
        headers: HTTP headers to send with the request.

    Returns:
        The parsed JSON document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.request("GET", url, headers=headers)
    response.raise_for_status()
    return response.json()
@click.command()
@click.argument('url')
@click.option('-f', '--file', default='output.pdf', help='Name of the output pdf')
@click.option('-d', '--download', default='download', help='Download Directory')
@click.option('-k', '--keep', default=False, is_flag=True, help='Keep downladed files after generating the file')
@click.option('-v', '--verbose', count=True)
def main(download, file, keep, url, verbose):
    """Download the Issuu document at URL and save its pages as one PDF."""
    # NOTE: click shows the docstring above in --help, so it stays short and
    # user-facing; implementation notes live in comments.
    set_log_level(verbose)
    if os.path.exists(file):
        log(f'PDF file {file} already exists, remove or specify another name with --file')
        raise SystemExit(1)

    reader_url = get_reader_url(url)
    image_path = os.path.join(download, "jpg/")
    bin_path = os.path.join(download, "bin/")
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
        'Accept': "*/*",
        'Accept-Language': "en-US,en;q=0.5",
        # Refer to the document actually being fetched rather than a fixed one.
        'Referer': url,
        'Cache-Control': "max-age=0",
        'DNT': "1",
        'Origin': "https://issuu.com",
        'TE': 'Trailers'
    }

    response = requests.request("GET", reader_url, headers=headers)
    # Fail here on an HTTP error instead of on an unrelated JSON/KeyError.
    response.raise_for_status()
    data = response.json()
    pages = data['document']['pages']
    log(f'Got a document with {len(pages)} pages', 1)

    # prepare_paths() returns True when it removed an existing download
    # directory without recreating it; a second call then creates the tree.
    if prepare_paths(download, image_path, bin_path):
        prepare_paths(download, image_path, bin_path)

    log("Downloading Files....")
    images = []
    for page in tqdm(pages):
        image_uri = page['imageUri']
        imgPath = os.path.join(image_path, image_uri.split('/')[-1])
        page_no = extract_number_from_page(image_uri)
        # Hand out copies so per-download header tweaks cannot accumulate
        # in the shared dictionary.
        downloadFile(uri=image_uri, path=image_path, headers=dict(headers))
        images.append({
            "name": imgPath,
            "page_no": page_no,
            "height": page["height"],
            "width": page["width"],
            "verfied": False
        })
        downloadFile(uri=page['layersInfo']['uri'], path=bin_path, headers=dict(headers))

    # Converting images to PDFs
    # https://pyfpdf.github.io/fpdf2/Tutorial.html
    print("Converting images into PDF")
    pdf = FPDF()
    for img in tqdm(images):
        if not os.path.exists(img['name']):
            print("Something wronh could not find the image")
            raise SystemExit(1)
        elif img['page_no'] == 1:
            print('Using the first page as cover')
            # Restart the document with a page size taken from the cover.
            pdf = FPDF(unit="pt", format=[img['width'], img['height']])
        pdf.add_page()
        pdf.image(img['name'], 0, 0, img['width'], img['height'])
    pdf.output(file)
    log(f'Saved PDF to: {file}')

    # Honour --keep: only remove the downloaded files when not asked to keep them.
    if not keep:
        prepare_paths(download_path=download, image_path=image_path, bin_path=bin_path, cleanup=True)
# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    main()