Welcome to the CausalDiscoveryToolbox wiki!
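The script below crawls up to 500 listing pages of the xinli001.com Q&A section, opens each question page, and saves the title, description, and tag keywords to an Excel file. The commented blocks labeled Method 1 through Method 5 are alternative ways of fetching and decoding each question page.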
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen, Request
import re
import pandas as pd
from lxml import etree
from io import BytesIO
import gzip
from gzip import GzipFile
# Crawl listing pages 1..500 of the Q&A index.
pages = range(1, 501)
page_url = 'https://www.xinli001.com/qa?page={}&type=question&object_name=last&title=&level2_tag=0&sort=id&from=houye-dh'
base_url = "https://www.xinli001.com"
title_lst = []
describe_lst = []
keyword_lst = []
for page_id in pages:
    url_1 = page_url.format(page_id)
    try:
        html = urlopen(url_1).read().decode('utf-8')
    except Exception:
        # Skip listing pages that fail to load.
        continue
    soup = BeautifulSoup(html, features='lxml')
    # Question detail pages are linked as /qa/<numeric id>...
    module_list = soup.find_all("a", {"href": re.compile("/qa/([0-9]).*")})
    for module in module_list:
        sub_url = base_url + module["href"]
        try:
            # Method 1: plain urlopen with a UTF-8 decode.
            # sub_html = urlopen(sub_url).read().decode('utf-8')
            # Method 2: request gzip explicitly and decompress with GzipFile.
            # req = Request(sub_url)
            # req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")
            # req.add_header("accept-encoding", "gzip")
            # resp = urlopen(req)
            # buf = BytesIO(resp.read())
            # sub_html = GzipFile(fileobj=buf).read()
            # Method 3: decompress the raw body with gzip.decompress.
            # buf = urlopen(sub_url).read()
            # sub_html = gzip.decompress(buf).decode("utf-8")
            # Method 4 (used here): fetch raw bytes and let BeautifulSoup detect the encoding.
            req = Request(sub_url)
            sub_html = urlopen(req).read()
            # Alternative lxml parse of the same bytes:
            # tree = etree.HTML(sub_html)
            # titles = tree.xpath('//div[@class="title"]')
            # Method 5: requests with an explicit encoding.
            # res = requests.get(sub_url)
            # res.encoding = 'utf-8'
            # sub_html = res.text
        except Exception:
            # Skip question pages that fail to load.
            continue
        sub_soup = BeautifulSoup(sub_html, features='lxml')
        # The question title lives in an <h1> inside the first div.title.
        title_module = sub_soup.find("div", {"class": "title"})
        title = title_module.find('h1').get_text()
        describe = sub_soup.find_all("p", {"class": "text"})[0].get_text()
        keyword_module = sub_soup.find_all("ul", {"class": "label detail-tag"})[0].find_all("li")
        keyword_list = [keyword.get_text().replace(" ", "").replace("\n", "") for keyword in keyword_module]
        keyword = ",".join(keyword_list)
        title_lst.append(title)
        describe_lst.append(describe)
        keyword_lst.append(keyword)
# Build one row per question from the accumulated lists, not from the last loop's scalars.
df = pd.DataFrame({"Title": title_lst, "Keywords": keyword_lst, "Description": describe_lst})
df.to_excel("./HealthQA.xlsx", index=False)
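The loop above issues up to 500 listing requests plus one request per question with no delay, retry, or identifying headers, which a site like this may throttle or block. Below is a minimal sketch of a fetch helper that reuses the User-Agent string from Method 2 and adds a pause and a bounded retry; the `polite_fetch` name, the one-second delay, and the retry count are illustrative assumptions, not values from the original script.

import time
from urllib.request import urlopen, Request

# User-Agent copied from Method 2 above; delay and retry counts are assumptions.
UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")

def polite_fetch(url, retries=3, delay=1.0):
    """Fetch url as raw bytes, pausing between requests and retrying on failure."""
    for attempt in range(retries):
        try:
            req = Request(url, headers={"User-Agent": UA})
            body = urlopen(req, timeout=10).read()
            time.sleep(delay)  # pause so consecutive requests are spaced out
            return body
        except Exception:
            time.sleep(delay * (attempt + 1))  # back off a little more each retry
    return None

With a helper like this, `sub_html = urlopen(req).read()` in the inner loop would become `sub_html = polite_fetch(sub_url)`, followed by a `None` check before parsing.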