-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloadAllPaperInfo.py
119 lines (101 loc) · 5.66 KB
/
DownloadAllPaperInfo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#coding=utf-8
import os
from AnaConJournalHomapage import LoadDatInfo,AnaACMConHomepage,AnaACLConHomepage
from AnaConJournalHomapage import AnaIEEEConHomepage,AnaJMLRConHomepage,AnaNIPSConHomepage
from AnaConJournalHomapage import xmlAPI,usefulAPI
# -------------------------------------------------------------------- #
# Download paper homepages from all top conferences:
# IEEE conferences: AAAI, IJCAI
# ACM conferences: WWW, CIKM, SIGKDD, SIGIR, WSDM
# ACL conferences: ACL, EMNLP, NAACL, COLING,
# ACL Journals: TACL, CL
# 添加时间: 2016-04-06
# Revise Time: 2017-01-19
# -------------------------------------------------------------------- #
class downloadPaperInAllCon:
def __init__(self,con_to_homepage_dict,con_to_org_dict,
data_base_file = './Data/dat_base.xml',
bad_data_file = './Data/dat_base_bad.xml'):
self.con_to_homepage_dict = con_to_homepage_dict
self.con_to_org_dict = con_to_org_dict
self.con_to_homepage_file_dict = {}
self.all_paper_info_list = []
self.paper_info_bad_good_list = []
self.data_base_file = data_base_file
self.bad_data_file = bad_data_file
def download_all_homepage(self,homepage_filefold = './Data/Homepage/'):
print 'DownLoad all conference homepages......'
i = 0
for con_name_year, url in self.con_to_homepage_dict.items():
i = i + 1
print str(i) + ' / ' + str(len(self.con_to_homepage_dict)) + ' --> ' + con_name_year
download_file = homepage_filefold + con_name_year + '.html'
if os.path.exists(download_file) == False:
usefulAPI.download_page(url,download_file)
self.con_to_homepage_file_dict[con_name_year] = download_file
print 'DownLoad all conference homepages Done.'
def analyze_conference(self,home_page_file,con_name_year):
homepage_url = self.con_to_homepage_dict[con_name_year]
if self.con_to_org_dict[con_name_year] == 'ACL':
AnaACLConHomepage.get_paper_info_list_from_con_year(homepage_url,home_page_file,con_name_year,self.all_paper_info_list)
elif self.con_to_org_dict[con_name_year] == 'ACM':
AnaACMConHomepage.get_paper_info_list_from_con_year(home_page_file,con_name_year,self.all_paper_info_list)
elif self.con_to_org_dict[con_name_year] == 'IEEE':
AnaIEEEConHomepage.get_paper_info_list_from_con_year(home_page_file,con_name_year,self.all_paper_info_list)
elif self.con_to_org_dict[con_name_year] == 'JMLR':
AnaJMLRConHomepage.get_paper_info_list_from_con_year(homepage_url,home_page_file,con_name_year,self.all_paper_info_list)
elif self.con_to_org_dict[con_name_year] == 'NIPS':
AnaNIPSConHomepage.get_paper_info_list_from_con_year(home_page_file,con_name_year,self.all_paper_info_list)
def analyze_all_conference(self):
print 'Analyze all conference homepages......'
i = 0
for con_name_year, url in self.con_to_homepage_dict.items():
i = i + 1
print str(i) + ' / ' + str(len(self.con_to_homepage_dict)) + ' --> ' + con_name_year
home_page_file = self.con_to_homepage_file_dict[con_name_year]
self.analyze_conference(home_page_file,con_name_year)
print 'Analyze all conference homepages Finished.'
def get_bad_good_paper_info_list(self):
for paper_info in self.all_paper_info_list:
temp_bad_good_str = 'good'
for key, value in paper_info.items():
#if key == 'title' and value == 'A Whole Page Click Model to Better Interpret Search Engine Click Data':
# print paper_info
# print paper_info['download_url']
# print paper_info['download_url'] == ''
if key == 'author_list':
if len(value) == 0:
temp_bad_good_str = 'lack_of_author'
else:
is_has_author = False
for author in value:
if author != '': is_has_author = True
if is_has_author == False:
temp_bad_good_str = 'lack_of_author'
elif value == '':
temp_bad_good_str = 'lack_of_' + key
self.paper_info_bad_good_list.append(temp_bad_good_str)
def print_out_all_paper_info(self):
print 'print_out_all_paper_info......'
self.get_bad_good_paper_info_list()
good_paper_info_list = []
bad_paper_info_list = []
for i in range(0,len(self.all_paper_info_list)):
if self.paper_info_bad_good_list[i] == 'good':
good_paper_info_list.append(self.all_paper_info_list[i])
else:
bad_paper_info_list.append(self.all_paper_info_list[i])
xmlAPI.print_out_paper_info(good_paper_info_list,self.data_base_file)
xmlAPI.print_out_paper_info(bad_paper_info_list,self.bad_data_file)
print 'print_out_all_paper_info finished......'
if __name__ == '__main__':
con_to_homepage_dict,con_to_org_dict = LoadDatInfo.load_con_to_homepage_org_dict()
data_base_file = './Data/dat_base.xml'
bad_data_file = './Data/dat_base_bad.xml'
down_paper_in_AllCon = downloadPaperInAllCon(con_to_homepage_dict,
con_to_org_dict,
data_base_file=data_base_file,
bad_data_file=bad_data_file)
down_paper_in_AllCon.download_all_homepage()
down_paper_in_AllCon.analyze_all_conference()
down_paper_in_AllCon.print_out_all_paper_info()