import urllib.request
import urllib.parse

from bs4 import BeautifulSoup


class ZhilianSpider(object):
    def __init__(self, jl, kw, start_page, end_page):
        # Save the arguments as instance attributes.
        self.jl = jl
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        # Base search URL. The original listing was truncated to 'https://sou./...';
        # the zhaopin.com domain is restored here.
        self.url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?'
        # The original left headers empty; a User-Agent is usually required or the
        # site may reject the request (the value below is a generic placeholder).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        }
        self.items = []

    def handle_request(self, page):
        # Build the query string for one result page and wrap it in a Request object.
        data = {
            'jl': self.jl,  # job location
            'kw': self.kw,  # search keyword
            'p': page,      # page number
        }
        query = urllib.parse.urlencode(data)
        url = self.url + query
        request = urllib.request.Request(url=url, headers=self.headers)
        return request

    def parse_content(self, content):
        # Instantiate a soup object and parse the response.
        soup = BeautifulSoup(content, 'lxml')
        # Each posting sits in a <table class="newlist">; the first table is the
        # header row, so skip it.
        table_list = soup.find_all('table', class_='newlist')[1:]
        for table in table_list:
            # Job title
            zwmc = table.select('.zwmc > div > a')[0].text
            # Company name
            gsmc = table.select('.gsmc > a')[0].text
            # Monthly salary
            zwyx = table.select('.zwyx')[0].text
            # Work location
            gzdd = table.select('.gzdd')[0].text
            # Collect the fields into a dict.
            item = {
                'job_title': zwmc,
                'company': gsmc,
                'monthly_salary': zwyx,
                'location': gzdd,
            }
            self.items.append(item)

    def run(self):
        # Loop over the pages: build each page's URL, send the request,
        # and fetch the response.
        for page in range(self.start_page, self.end_page + 1):
            request = self.handle_request(page)
            content = urllib.request.urlopen(request).read().decode('utf8')
            # Parse the downloaded page.
            self.parse_content(content)
        # Write the collected list to a file.
        string = str(self.items)
        with open('work.txt', 'w', encoding='utf8') as fp:
            fp.write(string)


def main():
    # Read the job location.
    jl = input('Enter the job location: ')
    # Read the search keyword.
    kw = input('Enter the search keyword: ')
    # Read the page range.
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    zhilian = ZhilianSpider(jl, kw, start_page, end_page)
    zhilian.run()


if __name__ == '__main__':
    main()
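
# --- Usage sketch (not part of the original listing) ---
# A minimal non-interactive run, assuming hypothetical search values;
# normally main() collects these via input() prompts.
#
#     spider = ZhilianSpider(jl='北京', kw='python', start_page=1, end_page=2)
#     spider.run()  # writes str(self.items) to work.txt
#
# Note: str(self.items) stores a Python repr of the list. If machine-readable
# output is wanted, json.dump(self.items, fp, ensure_ascii=False) is a common
# alternative.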