def get_chapters(self, url='https://www./40_40289/'):
    """Yield ``(title, chapter_url)`` for every chapter link on the index page.

    Args:
        url: Address of the novel's chapter index. Defaults to the address
            that was previously hard-coded in this function.

    Yields:
        Tuples of the link's ``title`` attribute and its absolute URL.

    Raises:
        KeyError: If a matched anchor lacks a ``title`` or ``href`` attribute.
    """
    r = self.session.get(url)
    # chardet.detect() always returns an 'encoding' key and sets it to None
    # when detection fails, so a dict.get() default would never be used.
    r.encoding = chardet.detect(r.content).get('encoding') or 'utf-8'
    html = etree.HTML(r.text)
    for item in html.xpath('//dl/dd/a'):
        # urljoin handles relative, root-relative and absolute hrefs alike,
        # whereas plain concatenation is only right for bare file names.
        yield item.attrib['title'], urljoin(url, item.attrib['href'])
# 章節內容獲取也非常簡單,就不分析了 (Fetching the chapter content is just as simple, so it is not analyzed here.)
def get_content(self, url):
    """Fetch one chapter page and extract its heading and body text.

    Args:
        url: Address of the chapter page.

    Raises:
        IndexError: If the page has no ``h1`` under an element whose class
            is ``bookname``.
    """
    r = self.session.get(url)
    # chardet.detect() always returns an 'encoding' key and sets it to None
    # when detection fails, so a dict.get() default would never be used.
    r.encoding = chardet.detect(r.content).get('encoding') or 'utf-8'
    html = etree.HTML(r.text)
    # The XPath literals use double quotes inside the Python string; nesting
    # single quotes inside a single-quoted string was a syntax error.
    title = html.xpath('//*[@class="bookname"]/h1')[0].text
    for info in html.xpath('//div[@id="content"]'):
        # string(.) concatenates all descendant text nodes of the element.
        text = info.xpath('string(.)')
    # NOTE(review): `title` and `text` are extracted but never returned or
    # stored, so this function currently has no observable result.
from urllib.parse import urljoin

import chardet
import requests
from aip import AipSpeech
from lxml import etree
class CollectNovels:
    """Scrape a web novel chapter by chapter and turn its text into speech.

    Pages are downloaded through a shared ``requests`` session, and each
    sentence of chapter text is sent to the Baidu ``AipSpeech`` client. The
    returned audio is appended to a local file.
    """

    def __init__(self, app_id='16416498',
                 api_key='oEWGafQkaUGqmsmPbfkE5OMx',
                 secret_key='6jdsUcH0PXz5TYoELU47u58W5vPV9lwf'):
        """Create the HTTP session and the speech-synthesis client.

        Args:
            app_id: Baidu speech APP ID (your own APPID).
            api_key: Baidu speech API key (AK).
            secret_key: Baidu speech secret key (SK).
        """
        # NOTE(review): the default credentials are committed in source.
        # They are kept only for backward compatibility; pass your own
        # values and consider rotating these.
        self.session = requests.Session()
        self.session.headers['user-agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        self.client = AipSpeech(app_id, api_key, secret_key)

    def _fetch_html(self, url):
        """Download *url* and return it parsed as an lxml HTML tree."""
        r = self.session.get(url)
        # chardet.detect() always returns an 'encoding' key and sets it to
        # None when detection fails, so a dict.get() default is never used.
        r.encoding = chardet.detect(r.content).get('encoding') or 'utf-8'
        return etree.HTML(r.text)

    @staticmethod
    def _split_sentences(text):
        """Split *text* on the full stop '。' and drop blank segments."""
        return [part.strip() for part in text.split('。') if part.strip()]

    def get_chapters(self, url):
        """Yield ``(title, chapter_url)`` for every chapter on the index page.

        Args:
            url: Address of the novel's chapter index.

        Yields:
            Tuples of the link's ``title`` attribute and its absolute URL.

        Raises:
            KeyError: If a matched anchor lacks ``title`` or ``href``.
        """
        html = self._fetch_html(url)
        for item in html.xpath('//dl/dd/a'):
            # urljoin handles relative, root-relative and absolute hrefs,
            # whereas plain concatenation is only right for bare file names.
            yield item.attrib['title'], urljoin(url, item.attrib['href'])

    def get_content(self, url, output_path='auido.mp3'):
        """Synthesize one chapter's text and append the audio to a file.

        Args:
            url: Address of the chapter page.
            output_path: File that receives the audio. The default keeps
                the file name this method has always used.

        Raises:
            RuntimeError: If the speech service returns an error response
                instead of audio data.
        """
        html = self._fetch_html(url)
        # Append mode: the previous 'rb' mode could not be written to at
        # all, and 'wb' would keep only the last sentence of the last call.
        with open(output_path, 'ab') as fp:
            for info in html.xpath('//div[@id="content"]'):
                # string(.) concatenates every descendant text node.
                for sentence in self._split_sentences(info.xpath('string(.)')):
                    audio = self.client.synthesis(sentence, 'zh', 1, {'per': 0})
                    # Per the Baidu SDK docs, failures come back as a dict
                    # rather than audio bytes; never write that to the file.
                    if isinstance(audio, dict):
                        raise RuntimeError(
                            f'speech synthesis failed for {url}: {audio}'
                        )
                    fp.write(audio)
if __name__ == '__main__':
    # Script entry point: walk the chapter index of one novel and convert
    # every chapter to speech, in index order.
    home_url = 'https://www./40_40289/'
    novel = CollectNovels()
    for _title, chapter_url in novel.get_chapters(home_url):
        # The chapter title is yielded by get_chapters but not needed here.
        novel.get_content(chapter_url)