# faceplusplus.py
# -*- coding: utf-8 -*-
import csv
import MySQLdb as mdb
import re
import urllib2
import urlparse
import requests
import json
import time
# Put your own API key and secret here
face_api = 'xxxxxx'
face_secret = 'xxxxx'
api_url = 'http://apicn.faceplusplus.com/v2/detection/detect'
out_file = open('detailedfaceresult.csv', 'wb')
csv_file = csv.writer(out_file)
csv_file.writerow(['main_url', 'url', 'face_id', 'gender', 'race', 'age', 'smiling'])
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0',
}
try:
    conn = mdb.connect('localhost', 'root', '123456', 'test_python')
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS faceplusplus")
    cur.execute("""CREATE TABLE faceplusplus(
        id INT PRIMARY KEY AUTO_INCREMENT,
        main_url VARCHAR(255),
        url VARCHAR(255),
        face_id VARCHAR(255),
        gender VARCHAR(10),
        race VARCHAR(50),
        age INT,
        smiling VARCHAR(50))""")
except mdb.Error, e:
    print e
# Query the Face++ detection API for one image url and record every detected face.
def message(url, main_url):
    get_data = {
        'api_key': face_api,
        'api_secret': face_secret,
        'url': url,
        'attribute': 'glass,pose,gender,age,race,smiling'
    }
    response = requests.get(api_url, params=get_data, headers=headers)
    '''
    # equivalent request built by hand with urllib2 instead of requests:
    response = urllib2.urlopen(
        'http://apicn.faceplusplus.com/v2/detection/detect?api_key=' + face_api + '&api_secret=' + face_secret + '&url=' + url + '&attribute=glass,pose,gender,age,race,smiling')
    data = json.loads(response.read())
    '''
    data = response.json()
    try:
        url = data['url']
        for face in data['face']:
            gender = face['attribute']['gender']['value']
            age = face['attribute']['age']['value']
            race = face['attribute']['race']['value']
            smiling = face['attribute']['smiling']['value']
            face_id = face['face_id']
            print main_url, url, face_id, gender, race, age, smiling
            csv_file.writerow([main_url, url, face_id, gender, race, age, smiling])
            try:
                cur.execute("""INSERT INTO faceplusplus(main_url, url, face_id, gender, race, age, smiling)
                    VALUES(%s, %s, %s, %s, %s, %s, %s)""",
                    [main_url, url, face_id, gender, race, age, smiling])
            except mdb.Error, e:
                print e
                conn.rollback()  # rollback is a connection method, not a cursor method
    except Exception, e:
        print e
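
# For reference, a detect response is expected to look roughly like the sketch
# below. The shape is inferred from the fields message() reads above; it is
# not the full Face++ schema, and extra keys are omitted.
'''
{
    "url": "http://example.com/photo.jpg",
    "face": [
        {
            "face_id": "....",
            "attribute": {
                "gender":  {"value": "Male"},
                "age":     {"value": 25},
                "race":    {"value": "Asian"},
                "smiling": {"value": 90.5}
            }
        }
    ]
}
'''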
# Crawl a webpage to collect image urls. We fake the HTTP request headers,
# and we limit the number of images analyzed per page to avoid having the
# requests blocked. A parser-based alternative is sketched after this function.
def imgs(url, limit=10):
    imgcontent = requests.get(url, headers=headers).text  # fetch the page content
    # extract <img> src urls ending in .jpg, .gif or .png
    urllist = re.findall(r'<img.*?src="(.*?\.(?:jpg|gif|png))"', imgcontent, re.I)
    print urllist
    if not urllist:
        print 'no images found for this url...'
    else:
        url_set = list(set(urllist))  # drop duplicate urls
        url_length = len(url_set)
        # a limit of 0 means crawl every picture on the page
        if limit == 0 or limit > url_length:
            limit = url_length
        for x, imgurl in enumerate(url_set[:limit]):
            if 'http' not in imgurl:
                imgurl = urlparse.urljoin(url, imgurl)  # resolve relative urls
            print imgurl
            print u'Processing picture %d...' % (x + 1)
            message(imgurl, url)
            #time.sleep(3)
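
# Regexes are brittle on real-world HTML; an HTML parser is more robust. A
# minimal sketch, assuming BeautifulSoup (bs4) is installed:
'''
from bs4 import BeautifulSoup

def imgs_parsed(url):
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    # keep only src attributes that point to common image formats
    return [img['src'] for img in soup.find_all('img')
            if img.get('src', '').lower().endswith(('.jpg', '.gif', '.png'))]
'''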
if __name__ == "__main__":
    # Read the website urls to analyze from a file: one "url limit" pair per line.
    with open('url_data.txt', 'r') as F:
        for line in F:
            print line
            url, limit = line.split()
            imgs(url, int(limit))
    out_file.close()
    conn.commit()
    conn.close()
'''
url_data.txt should look like this:
http://www.sina.com.cn 10
http://www.sohu.com 15
....
'''
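
# Once a run finishes, the results can also be inspected straight from MySQL;
# a sample query against the table created above (output is illustrative only):
'''
SELECT gender, race, COUNT(*) AS n, AVG(age) AS avg_age
FROM faceplusplus
GROUP BY gender, race;
'''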