
Welcome to the CausalDiscoveryToolbox wiki!
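The page below collects a small Python scraper for the Q&A section of xinli001.com. It walks the paginated question index, follows each question link, extracts the title, the free-text description, and the tag list, and writes the results to an Excel file. Several alternative ways of fetching each question page are kept as commented-out methods (Method 1 through Method 5); Method 4 is the one actually in use.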

from bs4 import BeautifulSoup
import requests  # used by Method 5 below
from urllib.request import urlopen, Request
import re
import pandas as pd
from lxml import etree  # used by the lxml alternative in Method 4 below
from io import BytesIO  # used by Method 2 below
import gzip  # used by Method 3 below
from gzip import GzipFile  # used by Method 2 below


# Listing pages 1-500 of the Q&A index.
pages = range(1, 501)
page_url = 'https://www.xinli001.com/qa?page={}&type=question&object_name=last&title=&level2_tag=0&sort=id&from=houye-dh'
base_url = "https://www.xinli001.com"

title_lst = []
describe_lst = []
keyword_lst = []

for page_id in pages:
    list_url = page_url.format(page_id)
    try:
        html = urlopen(list_url).read().decode('utf-8')
    except Exception:
        # Skip listing pages that fail to load.
        continue
    soup = BeautifulSoup(html, features='lxml')
    # Links to individual questions look like /qa/<id>...
    module_list = soup.find_all("a", {"href": re.compile(r"/qa/\d+")})
    for module in module_list:
        sub_url = base_url + module["href"]
        ## Method 1: plain urlopen, decoded as UTF-8
        # sub_html = urlopen(sub_url).read().decode('utf-8')

        ## Method 2: send a browser User-Agent, request gzip, and decompress by hand
        # req = Request(sub_url)
        # req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")
        # req.add_header("Accept-Encoding", "gzip")
        # resp = urlopen(req)
        # buf = BytesIO(resp.read())
        # sub_html = GzipFile(fileobj=buf).read()

        ## Method 3: decompress a gzip-compressed response by hand
        # f = urlopen(sub_url)
        # buf = f.read()
        # sub_html = gzip.decompress(buf).decode("utf-8")

        ## Method 4 (active): plain urlopen via a Request object
        try:
            req = Request(sub_url)
            sub_html = urlopen(req).read()
        except Exception:
            # Skip question pages that fail to load.
            continue
        # lxml alternative for pulling the title directly (the original reassigned
        # sub_html to the parsed tree, which broke the BeautifulSoup parse below):
        # tree = etree.HTML(sub_html)
        # titles = tree.xpath('//div[@class="title"]/h1/text()')

        ## Method 5: requests with explicit UTF-8 decoding
        # res = requests.get(sub_url)
        # res.encoding = 'utf-8'
        # sub_html = res.text
        sub_soup = BeautifulSoup(sub_html, features='lxml')
        # Question title, free-text description, and tag list.
        title_module = sub_soup.find_all("div", {"class": "title"})[0]
        title = title_module.find('h1').get_text()
        describe = sub_soup.find_all("p", {"class": "text"})[0].get_text()
        keyword_module = sub_soup.find_all("ul", {"class": "label detail-tag"})[0].find_all("li")
        keyword_list = [keyword.get_text().replace(" ", "").replace("\n", "") for keyword in keyword_module]
        keyword = ",".join(keyword_list)
        title_lst.append(title)
        describe_lst.append(describe)
        keyword_lst.append(keyword)
df = pd.DataFrame({"title": title_lst, "keywords": keyword_lst, "description": describe_lst})

df.to_excel("./HealthQA.xlsx")
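A note on running this: pandas writes .xlsx files through an Excel engine such as openpyxl, so that package needs to be installed alongside pandas, bs4, lxml, and requests. If the row-index column in the output is unwanted, to_excel accepts index=False:

df.to_excel("./HealthQA.xlsx", index=False)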