url2md.py
#!/usr/bin/env python3
'''
Goal: minimise the number of browser tabs I need to open when researching; get an LLM's help to reduce some cognitive load and do preliminary skimming.
Usage:
python3 url2md.py --url https://thereader.mitpress.mit.edu/noam-chomsky-and-andrea-moro-on-the-limits-of-our-comprehension/
python3 url2md.py --ul url_list.txt
python3 url2md.py --ul ul-sora.txt --redo
Behavior:
If both --ul and --url args are provided, all URLs are processed; if --url is not provided, only --ul is processed.
This script gets a JS check on Twitter: https://x.com/dwarkesh_sp/status/1741972913501405435 ("JavaScript is not available.")
A Facebook URL (perhaps a private one as well) also returns some blank / redirection page.
Sample output:
python3 url2md.py --url https://en.wikipedia.org/wiki/The_Revolt_of_the_Masses
retrieving url: https://en.wikipedia.org/wiki/The_Revolt_of_the_Masses
TODO
√1. save these into the same folder; a "research_codename" can be specified as such.
    oh, just via ul: use the txt file's name, so all its URLs are assumed to be of that theme. simple!
2. support providing some DOM selector, e.g. extract only the top comments from an HN thread,
    or skip the navigation on some Wikipedia or blog pages.
    actually similar to shot-scraper, so let's see if that is sufficient (see the commented sketch next to the markdown conversion below).
'''
import os
import argparse
import html2text
import requests_html
from time import sleep
from pathlib import Path
from urllib3.util.url import parse_url
from urllib.parse import parse_qs
from base64 import b64decode
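
# ZYTE_API_KEY is only needed for the Zyte branch below (medium.com / x.com /
# twitter.com); the Zyte request will not authenticate if it is unset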
zyte_api_key = os.getenv("ZYTE_API_KEY")
if __name__ == '__main__':
    session = requests_html.HTMLSession()
    md_converter = html2text.HTML2Text()
    md_converter.body_width = 0  # Disable line wrapping
    url_list = []
    url = ''
    output_dir = 'output/mds/'
    parser = argparse.ArgumentParser(description='get markdown for the url')
    parser.add_argument('--url', type=str, default=url, help='url you\'d like to markdown')
    parser.add_argument('--ul', type=str, help='path to url list, one url per line')
    parser.add_argument('--output_dir', type=str, help='dir to use to save the output file')
    parser.add_argument('--redo', action='store_true', help='if specified, will overwrite existing md files')
    args = parser.parse_args()
    ul = args.ul or ''
    url = args.url  # read --url even when --ul is given, so both are processed (see Behavior above)
    if url:
        url_list.append(url)
    # if ul is provided then the output folder is theme-based
    if ul and os.path.exists(ul):
        with open(ul, 'r') as ful:
            urls = ful.read().split("\n")
            for url in urls:
                if url:
                    url_list.append(url)
        output_dir = output_dir + Path(ul).stem  # os.path.basename(ul) would keep the extension
    for url in url_list:
        pu = parse_url(url)
        netloc = pu.host
        slug = netloc  # fallback in case a mere netloc / homepage was given
        if pu.path:
            # option: keep the netloc in the filename instead of relying on subdirectories to infer the source:
            # slug += '-' + [segment for segment in pu.path.split('/') if segment][-1]
            # but it's better to include the source URL in the MD itself (done below),
            # so replace the slug (filled with netloc) with just the last path segment
            slug = [segment for segment in pu.path.split('/') if segment][-1]
        if pu.query:
            # slugs can get ugly when there are query params like this:
            # url = 'https://www.mas.gov.sg/publications?date=2021-01-01T00%3A00%3A00Z%2C2021-12-31T23%3A59%3A59Z&content_type=Monographs%2FInformation%20Papers&page=1&q=ubin'
            sanitised_qs = {param: values[0] for param, values in parse_qs(pu.query).items()}
            # not efficient and a bit crude; just do string replaces on the final slug
            # alternative: drop any param whose value contains '/':
            # sanitised_qs = {param: parse_qs(pu.query).get(param, [None])[0] for param in parse_qs(pu.query).keys() if '/' not in parse_qs(pu.query).get(param, [None])[0]}
            for k in sanitised_qs.keys():
                slug += k + sanitised_qs[k].replace('/', '').replace(':', '').replace(',', '').replace(' ', '')
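            # e.g. the MAS URL above should come out as:
            # 'publicationsdate2021-01-01T000000Z2021-12-31T235959Zcontent_typeMonographsInformationPaperspage1qubin'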
        if not ul:
            output_dir = output_dir + netloc
        # override output_dir if output_dir is specified as an arg
        if args.output_dir and os.path.exists(args.output_dir):
            output_dir = args.output_dir
        os.makedirs(output_dir, exist_ok=True)  # makedirs, since the 'output/mds/' parent may not exist yet either
        if not args.redo and os.path.exists(output_dir + '/' + slug + '.md'):
            print(output_dir + '/' + slug + '.md' + " file exists")
            continue
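        # a desktop Chrome user agent; some sites serve stripped or blocked pages
        # to the default python-requests user agent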
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
# print(f"retrieving url: {url} with headers: {headers}")
if('medium.com' in url or 'x.com' in url or 'twitter.com' in url):
# use Zyte API
response = session.post("https://api.zyte.com/v1/extract",
auth=(zyte_api_key, ""),
json={
"url": url,
"browserHtml": True,
"screenshot": True,
},
)
browser_html: str = response.json()["browserHtml"]
markdown_content = "Downloaded from: "+url+"\n\n---\n\n"
markdown_content += md_converter.handle(browser_html)
screenshot: bytes = b64decode(response.json()["screenshot"])
with open(output_dir+'/'+slug+'.jpg', "wb") as fp:
fp.write(screenshot)
        else:
            # without the Zyte API
            response = session.get(url, headers=headers)
            response.html.render()
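            # render() executes the page's JavaScript in a headless Chromium
            # (downloaded by pyppeteer on first use, so the first run can be slow)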
markdown_content = "Downloaded from: "+url+"\n\n---\n\n"
markdown_content += md_converter.handle(response.html.html)
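            # a minimal sketch for TODO 2, assuming a hypothetical --selector arg:
            # requests_html supports CSS selectors, so the selected elements could
            # be converted instead of the whole page, e.g.
            #   for element in response.html.find('.commtext'):  # HN comment bodies
            #       markdown_content += md_converter.handle(element.html)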
        with open(output_dir + '/' + slug + '.md', 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        print(output_dir + '/' + slug + '.md')  # so I can pipe it to pbcopy and md2notes.py it easily
        if len(url_list) > 1:
            sleep(1)  # dumb throttling