(所有圖片均對一些信息做了模糊處理。應 CSDN 官方要求,不能出現一些熱點話題和政治相關的東西;配圖只是為了讓大家看得更直觀,故大家能看明白操作(技術)過程就行了。) 效果圖: 1. 工欲善其事,必先利其器。首先我們得下載相應的庫:
pip install requests
pip install lxml
pip install xlwt
requests:向網頁發送請求;lxml:解析 HTML/XML 文件(支持 XPath);xlwt:對 Excel 做寫入操作。 2. 爬取 b 站熱門視頻的信息: 打開 b 站熱門視頻頁面: 按 F12 進入開發者工具,然後點擊選中你要獲取的頁面信息,即可找到該信息在該 HTML 文件中的什麼位置(這對我們用 XPath 獲取元素屬性和元素值很重要),例如: 代碼如下:
# Scrape the bilibili ranking page for the hot-video list.
def spider(video_list=None):
    """Scrape the bilibili ranking page and append one dict per video.

    Args:
        video_list: List to append the results to. A new list is created
            when it is omitted or ``None``.

    Returns:
        The same list, extended with one dict per ranked video. Each dict
        has the keys ``rank``, ``videolink``, ``title``, ``play``,
        ``comment``, ``upname``, ``uplink`` and ``hot``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched, including when the request times out.
    """
    if video_list is None:
        video_list = []
    url = 'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3'
    # A timeout keeps the script from hanging forever on a stalled connection.
    html_data = requests.get(url, timeout=10).text
    selector = html.fromstring(html_data)
    infolist = selector.xpath('//li[@class="rank-item"]/div[@class="content"]/div[@class="info"]')
    # The rank is the 1-based position of the entry on the page.
    for rank, item in enumerate(infolist, start=1):
        videolink = "".join(item.xpath('./a/@href'))
        # Double quotes are dropped so the title can sit inside a formula string.
        title = "".join(item.xpath('./a/text()')).replace('"', '')
        # Read the two counters from their own <span> elements. Joining them
        # and splitting on "萬" (the previous approach) raised IndexError, or
        # produced garbage, whenever the play count had no "萬" suffix.
        stats = [text.strip() for text in item.xpath('./div[@class="detail"]/span/text()')]
        stats = [text for text in stats if text]
        # Assumes the first counter is plays and the second is comments, the
        # order the previous join-and-split logic relied on - TODO confirm
        # against the live page markup.
        play = stats[0] if stats else ''
        comment = stats[1] if len(stats) > 1 else ''
        upname = "".join(item.xpath('./div[@class="detail"]/a/span/text()'))
        # The uploader href is prefixed with a scheme to make it a full URL.
        uplink = "http:" + "".join(item.xpath('./div[@class="detail"]/a/@href'))
        hot = "".join(item.xpath('./div[@class="pts"]/div/text()'))
        video_list.append({
            'rank': rank,
            'videolink': videolink,
            'title': title,
            'play': play,
            'comment': comment,
            'upname': upname,
            'uplink': uplink,
            'hot': hot,
        })
    return video_list
3. 將我們拿到的信息集合(video_list)寫入到 Excel 表格中: xlwt 的基本使用方法:
import xlwt

# Minimal xlwt walkthrough: build a workbook, fill one cell, save it.

# Create the workbook, passing the encoding explicitly.
book = xlwt.Workbook(encoding='utf-8')

# Add a worksheet to the workbook.
first_sheet = book.add_sheet('My Worksheet')

# Write a cell: the arguments are row, column and value (a style may follow).
first_sheet.write(1, 0, label='this is test')

# Save the workbook; the argument is the path of the Excel file to write.
book.save('Excel_test.xls')
如果我們想要點擊視頻名或者 up 主的名字可以跳轉,那麼我們就要使用 Excel 表格的 HYPERLINK 函數: HYPERLINK("http://www.baidu.com"; "百度") 其中「百度」為顯示在單元格的信息,而前面的鏈接為跳轉鏈接。 xlwt.Formula() 需要傳入一個字符串 s,例如 s = 'HYPERLINK("http://www.baidu.com"; "百度")'。 代碼如下:
# Export the scraped data to an Excel workbook (formula helper + entry point).
def _hyperlink_formula(url, text):
    """Return the text of a HYPERLINK formula showing *text* and opening *url*."""
    # Inside a formula string literal a double quote is written as two
    # double quotes; an unescaped one would end the literal early.
    safe_url = url.replace('"', '""')
    safe_text = text.replace('"', '""')
    return 'HYPERLINK("' + safe_url + '";"' + safe_text + '")'


def write_Excel(video_list, path='D:/Test/b站熱門視頻信息.xls'):
    """Write the scraped videos to an ``.xls`` workbook.

    Row 0 holds the header. Each following row holds one video: a clickable
    title, a clickable uploader name, then rank, heat, plays and comments.

    Args:
        video_list: Iterable of dicts with the keys ``videolink``,
            ``title``, ``uplink``, ``upname``, ``rank``, ``hot``, ``play``
            and ``comment``.
        path: Where to save the workbook. An existing file at this path is
            replaced and a missing parent directory is created.
    """
    print("將b站熱門視頻信息導入到Excel表格:")
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('b站熱門視頻')
    xstyle = xlwt.XFStyle()
    xstyle.alignment.horz = 0x02  # centre horizontally
    xstyle.alignment.vert = 0x01  # centre vertically
    head = ['視頻名', 'up主', '排名', '熱度', '播放量', '評論數']
    for col, caption in enumerate(head):
        sheet.write(0, col, caption, xstyle)

    longest = 0  # length of the longest title formula seen so far
    for row, item in enumerate(video_list, start=1):
        title_data = _hyperlink_formula(item["videolink"], item["title"])
        name_data = _hyperlink_formula(item["uplink"], item["upname"])
        longest = max(longest, len(title_data))
        sheet.write(row, 0, xlwt.Formula(title_data), xstyle)
        sheet.write(row, 1, xlwt.Formula(name_data), xstyle)
        sheet.write(row, 2, item['rank'], xstyle)
        sheet.write(row, 3, item['hot'], xstyle)
        sheet.write(row, 4, item['play'], xstyle)
        sheet.write(row, 5, item['comment'], xstyle)

    if longest:
        # Size the columns once, from the longest row. Reassigning the width
        # on every row left only the last row's length in effect. The value
        # is capped to stay within the 16-bit column-width field.
        sheet.col(0).width = min(int(256 * longest * 3 / 5), 65535)
        # NOTE(review): the uploader column is still sized from the title
        # formula length, as before - confirm whether name_data was intended.
        sheet.col(1).width = min(int(256 * longest * 3 / 10), 65535)

    # Make sure the target directory exists before saving.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # save() opens the target for writing, so an existing file is replaced
    # and no separate delete step is needed.
    workbook.save(path)
    print('寫入excel成功')
    print("文件位置:" + path)
4. 在入口 main 中調用上面兩個函數。 完整代碼如下:
import requests
from lxml import html
import xlwt
import os
# Scrape the bilibili ranking page for the hot-video list.
def spider(video_list=None):
    """Scrape the bilibili ranking page and append one dict per video.

    Args:
        video_list: List to append the results to. A new list is created
            when it is omitted or ``None``.

    Returns:
        The same list, extended with one dict per ranked video. Each dict
        has the keys ``rank``, ``videolink``, ``title``, ``play``,
        ``comment``, ``upname``, ``uplink`` and ``hot``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched, including when the request times out.
    """
    if video_list is None:
        video_list = []
    url = 'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3'
    # A timeout keeps the script from hanging forever on a stalled connection.
    html_data = requests.get(url, timeout=10).text
    selector = html.fromstring(html_data)
    infolist = selector.xpath('//li[@class="rank-item"]/div[@class="content"]/div[@class="info"]')
    # The rank is the 1-based position of the entry on the page.
    for rank, item in enumerate(infolist, start=1):
        videolink = "".join(item.xpath('./a/@href'))
        # Double quotes are dropped so the title can sit inside a formula string.
        title = "".join(item.xpath('./a/text()')).replace('"', '')
        # Read the two counters from their own <span> elements. Joining them
        # and splitting on "萬" (the previous approach) raised IndexError, or
        # produced garbage, whenever the play count had no "萬" suffix.
        stats = [text.strip() for text in item.xpath('./div[@class="detail"]/span/text()')]
        stats = [text for text in stats if text]
        # Assumes the first counter is plays and the second is comments, the
        # order the previous join-and-split logic relied on - TODO confirm
        # against the live page markup.
        play = stats[0] if stats else ''
        comment = stats[1] if len(stats) > 1 else ''
        upname = "".join(item.xpath('./div[@class="detail"]/a/span/text()'))
        # The uploader href is prefixed with a scheme to make it a full URL.
        uplink = "http:" + "".join(item.xpath('./div[@class="detail"]/a/@href'))
        hot = "".join(item.xpath('./div[@class="pts"]/div/text()'))
        video_list.append({
            'rank': rank,
            'videolink': videolink,
            'title': title,
            'play': play,
            'comment': comment,
            'upname': upname,
            'uplink': uplink,
            'hot': hot,
        })
    return video_list
# Export the scraped data to an Excel workbook (formula helper + entry point).
def _hyperlink_formula(url, text):
    """Return the text of a HYPERLINK formula showing *text* and opening *url*."""
    # Inside a formula string literal a double quote is written as two
    # double quotes; an unescaped one would end the literal early.
    safe_url = url.replace('"', '""')
    safe_text = text.replace('"', '""')
    return 'HYPERLINK("' + safe_url + '";"' + safe_text + '")'


def write_Excel(video_list, path='D:/Test/b站熱門視頻信息.xls'):
    """Write the scraped videos to an ``.xls`` workbook.

    Row 0 holds the header. Each following row holds one video: a clickable
    title, a clickable uploader name, then rank, heat, plays and comments.

    Args:
        video_list: Iterable of dicts with the keys ``videolink``,
            ``title``, ``uplink``, ``upname``, ``rank``, ``hot``, ``play``
            and ``comment``.
        path: Where to save the workbook. An existing file at this path is
            replaced and a missing parent directory is created.
    """
    print("將b站熱門視頻信息導入到Excel表格:")
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('b站熱門視頻')
    xstyle = xlwt.XFStyle()
    xstyle.alignment.horz = 0x02  # centre horizontally
    xstyle.alignment.vert = 0x01  # centre vertically
    head = ['視頻名', 'up主', '排名', '熱度', '播放量', '評論數']
    for col, caption in enumerate(head):
        sheet.write(0, col, caption, xstyle)

    longest = 0  # length of the longest title formula seen so far
    for row, item in enumerate(video_list, start=1):
        title_data = _hyperlink_formula(item["videolink"], item["title"])
        name_data = _hyperlink_formula(item["uplink"], item["upname"])
        longest = max(longest, len(title_data))
        sheet.write(row, 0, xlwt.Formula(title_data), xstyle)
        sheet.write(row, 1, xlwt.Formula(name_data), xstyle)
        sheet.write(row, 2, item['rank'], xstyle)
        sheet.write(row, 3, item['hot'], xstyle)
        sheet.write(row, 4, item['play'], xstyle)
        sheet.write(row, 5, item['comment'], xstyle)

    if longest:
        # Size the columns once, from the longest row. Reassigning the width
        # on every row left only the last row's length in effect. The value
        # is capped to stay within the 16-bit column-width field.
        sheet.col(0).width = min(int(256 * longest * 3 / 5), 65535)
        # NOTE(review): the uploader column is still sized from the title
        # formula length, as before - confirm whether name_data was intended.
        sheet.col(1).width = min(int(256 * longest * 3 / 10), 65535)

    # Make sure the target directory exists before saving.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # save() opens the target for writing, so an existing file is replaced
    # and no separate delete step is needed.
    workbook.save(path)
    print('寫入excel成功')
    print("文件位置:" + path)
if __name__ == '__main__':
    # Script entry point: scrape into a fresh list, then export the result.
    scraped = spider([])
    write_Excel(scraped)
|