# attach_all_github_repo_info.py
#
# Renders the Markdown files in ./original_awesomes/ to HTML and appends, after
# each list item that links to a GitHub repository, the star count and days
# since last push read from the `awesome_augmented` MySQL table.
# import mistune
import markdown2
import re
import pymysql
import yaml
import iso8601
import pytz
import subprocess
import html.parser
import os
from bs4 import BeautifulSoup, Tag
from datetime import datetime, timedelta
from dateutil.parser import parse
# Directory scanned for source Markdown files, and directory that receives the
# annotated output. Both end with '/' because file paths are built by plain
# string concatenation.
input_dir = './original_awesomes/'
output_dir = './awesomes/'

# Database credentials come from config.yml (keys: database.user, database.db).
with open('config.yml', 'r') as config_file:
    # safe_load builds only plain Python data, which is all a settings file
    # needs; yaml.load() without an explicit Loader is unsafe on old PyYAML
    # releases and raises TypeError on PyYAML >= 6.
    config = yaml.safe_load(config_file)

# A single connection and cursor are opened at import time and shared by every
# query in this script. The connection goes through the local MySQL socket and
# passes no password.
conn = pymysql.connect(
    host='127.0.0.1',
    charset='utf8',
    use_unicode=True,
    unix_socket='/tmp/mysql.sock',
    user=config['database']['user'],
    passwd=None,
    db=config['database']['db'],
    autocommit=True
)
cur = conn.cursor()
# Matches a link to a repository root (https://github.com/<owner>/<repo>, with
# an optional trailing slash). Deeper URLs such as issues or blob paths do not
# match, and the dot is escaped so only the literal host "github.com" matches.
_GITHUB_REPO_URL = re.compile(r'^https://github\.com/[^/]+/[^/]+/?$')


def attach_github_info(filename, input_directory='./', output_directory='./'):
    """Render a Markdown file to HTML and annotate its GitHub repository links.

    For every ``<li>`` whose first link points at a GitHub repository root, the
    repository is looked up in the ``awesome_augmented`` table through the
    module-level cursor ``cur``. When exactly one row matches, a ``<sup>``
    element with the star count and the number of days since the last push is
    appended to that list item.

    Args:
        filename: Name of the Markdown file, relative to ``input_directory``.
            The same name is used for the output file.
        input_directory: Directory prefix of the source file. It is joined by
            plain concatenation, so it must end with a path separator.
        output_directory: Directory prefix of the output file; same
            trailing-separator requirement.

    Note:
        The output file keeps the input's name but contains prettified HTML;
        no conversion back to Markdown is performed here.
    """
    print('processing ' + filename)
    file_path = input_directory + filename
    # Explicit UTF-8: the annotation contains a non-ASCII star, so relying on
    # the platform default encoding can fail on write.
    with open(file_path, 'r', encoding='utf-8') as source_file:
        content = source_file.read()
    # Repair links written as "[text] (http...)" so they parse as real links.
    content = content.replace('] (http', '](http')

    soup = BeautifulSoup(markdown2.Markdown().convert(content), 'html.parser')
    visited = set()
    for li in soup.find_all('li'):
        anchors = li.find_all('a')
        if not anchors:
            continue
        first_anchor = anchors[0]
        # .get() tolerates anchors without an href (e.g. raw <a name="...">).
        repo_url = first_anchor.get('href', '')
        if not _GITHUB_REPO_URL.search(repo_url):
            continue
        # find_all('li') also yields nested items, and a parent's first link
        # may live inside a nested <li>; annotate each link only once. Checking
        # before the query avoids a pointless database round trip.
        if first_anchor in visited:
            continue

        # Parameterized query: the URL comes from the Markdown file and must
        # never be spliced into the SQL text.
        cur.execute(
            'select stargazers_count, pushed_at from awesome_augmented '
            'where repo_url=%s',
            (repo_url,)
        )
        rows = cur.fetchall()
        # Skip unknown repositories and ambiguous (duplicate) rows alike.
        if len(rows) != 1:
            continue

        stars_count, updated_at = rows[0]
        # NOTE(review): assumes pushed_at is stored as a timezone-aware
        # timestamp string; a naive value would make the subtraction below
        # raise TypeError -- confirm against the table schema.
        updated_days_ago = (datetime.now(pytz.utc) - parse(updated_at)).days

        tag = soup.new_tag('sup')
        tag.string = ' ★ %d, pushed %d days ago ' % (stars_count, updated_days_ago)
        visited.add(first_anchor)
        # Append at the end of the list item rather than right after the link.
        li.append(tag)

    output_filename = output_directory + filename
    # The context manager guarantees the data is flushed and the handle closed.
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(soup.prettify(formatter=None))
def main():
    """Annotate every Markdown file found directly inside ``input_dir``.

    Each ``*.md`` file is processed by ``attach_github_info`` and written under
    the same name into ``output_dir``. Other directory entries are ignored.
    """
    for filename in os.listdir(input_dir):
        # Test the real ".md" extension; comparing only the last two characters
        # also accepted names such as "build.cmd" or "notes.Rmd".
        if not filename.endswith('.md'):
            continue
        attach_github_info(filename, input_dir, output_dir)


# Run the batch only when executed as a script, not when imported.
if __name__ == '__main__':
    main()