"""Download the books listed on shicimingju.com/book/ into text files.

The crawl has three levels: the book index page (``soup_html``), each
book's chapter-list page (``get_txt``) and each chapter page
(``get_data``).  Every book is appended to ``<book title>.txt`` in the
current working directory.
"""
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# Book index page; relative book links are resolved against this URL.
BOOK_INDEX_URL = 'http://www.shicimingju.com/book/'

# Browser-like User-Agent header sent with every request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/67.0.3396.99 Safari/537.36'
)


def main() -> None:
    """Fetch the book index page and download every book linked from it."""
    request = get_request(BOOK_INDEX_URL)
    html = get_response(request)
    soup_html(html)


def get_request(url: str) -> urllib.request.Request:
    """Build a request object for *url* carrying the User-Agent header.

    Args:
        url: Absolute URL to request.

    Returns:
        A ``urllib.request.Request`` ready to pass to ``get_response``.
    """
    headers = {'User-Agent': USER_AGENT}
    return urllib.request.Request(url=url, headers=headers)


def get_response(req: urllib.request.Request) -> str:
    """Send *req* and return the response body decoded as UTF-8.

    Args:
        req: Request object, e.g. one built by ``get_request``.

    Returns:
        The response body as text.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf8')


def soup_html(html: str) -> None:
    """Level 1: download every book linked from the book index page.

    For each book link, all chapters are fetched through ``get_txt`` and
    appended to ``<book title>.txt``.

    Args:
        html: Markup of the book index page.
    """
    soup = BeautifulSoup(html, 'lxml')
    for book_link in soup.select('.bookmark-list ul li h2 a'):
        book_title = book_link.text
        # Resolve the link target against the index page URL so that
        # relative hrefs become fetchable absolute URLs.
        book_url = urllib.parse.urljoin(BOOK_INDEX_URL, book_link['href'])

        # Fetch before opening the file so a failed download does not
        # leave an empty file behind.
        chapters = get_txt(book_url)

        filename = book_title + '.txt'
        with open(filename, 'a', encoding='utf8') as fp:
            for chapter_title, content_tags in chapters:
                print("正在下載%s" % chapter_title)
                # Each content element is written prefixed by the chapter
                # title and a newline (output format kept unchanged).
                for tag in content_tags:
                    fp.write(chapter_title + '\n' + tag.get_text())
                # NOTE: every chapter has already been fetched by get_txt
                # at this point, so this pause only paces the writes.
                time.sleep(2)
                # NOTE(review): the "(jié)" inside this message looks like
                # a text-extraction artifact; the literal is left
                # unchanged -- confirm against the original script.
                print("下載結(jié)束%s" % chapter_title)


def get_txt(href: str) -> list:
    """Level 2: fetch a book's chapter list and download every chapter.

    Args:
        href: Absolute URL of the book's chapter-list page.

    Returns:
        A list with one ``(chapter_title, content_tags)`` tuple per
        chapter link, in page order, as produced by ``get_data``.
    """
    text = get_response(get_request(href))
    soup = BeautifulSoup(text, 'lxml')
    # Marker printed once per chapter-list page (kept from the original).
    print(1)
    chapters = []
    for chapter_link in soup.select('.book-mulu ul li a'):
        # Resolve each chapter link against the book page it came from.
        chapter_url = urllib.parse.urljoin(href, chapter_link['href'])
        chapters.append(get_data(chapter_url))
    return chapters


def get_data(href: str) -> tuple:
    """Level 3: fetch one chapter page and extract its title and content.

    Args:
        href: Absolute URL of the chapter page.

    Returns:
        A ``(title, res)`` tuple.  ``title`` is the text of the first
        ``.www-shadow-card h1`` element, or ``''`` when there is none.
        ``res`` is the list of ``<p>`` elements inside
        ``.chapter_content``; when there are none it is the list of
        ``.chapter_content`` elements themselves, which may be empty.
    """
    html = get_response(get_request(href))
    soup = BeautifulSoup(html, 'lxml')

    headings = soup.select('.www-shadow-card h1')
    title = headings[0].text if headings else ''

    # Prefer paragraph-level elements; fall back to the whole container.
    # Always returning a tuple keeps callers that unpack the result safe.
    res = soup.select('.chapter_content p') or soup.select('.chapter_content')
    return title, res


if __name__ == '__main__':
    main()
|
|
來自: 郭祺迦 > 《實踐案例》