Category: python, Sina Weibo, crawler
2012-08-28 22:08
My lab recently assigned me a task: write a Sina Weibo crawler that collects people's posts for analysis.
I'm about to graduate and am still interning at a telecom equipment vendor (the pay is terrible!), so I could only carve out evenings to write this program.
I originally meant to use an open-source crawler, so I tried Nutch and Heritrix. Nutch was too unstable and kept breaking; Heritrix has many features but is complex and slow to run.
After reading these two blog posts (http://kcclub./home.php?mod=space&uid=93&do=blog&id=890) (http://1.wklken.sinaapp.com/?p=177), I decided to simply write a crawler myself.
The program took me six evenings (most of that time actually went into learning Python... and normalizing the formatting...). I'm sharing it here now.
If you run into any problems, please contact me by email ([email protected]) and I will fix them promptly (after all, I have to turn this in).
How to run: save all the code below, open Main.py, change LoginName to your Sina Weibo account and PassWord to your password. Run Main.py; the program will create a CrawledPages folder in the current directory and save every crawled page there.
Job-hunting season is almost here, so I'm banking some karma with this post; may the offers be plentiful and good.
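For example (the account below is made up), the login line in the __main__ block of Main.py would end up looking like this:

weiboLogin = WeiboCrawl.WeiboLogin('[email protected]', 'my_secret_password')   # hypothetical credentials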
1. Entry script, Main.py
#!/usr/bin/env python
#coding=utf8


'''Author: Zheng Yi
Email: [email protected]'''


import WeiboCrawl


if __name__ == '__main__':
    # Log in first: WeiboLogin installs a cookie-aware opener globally,
    # so every later urllib2 request carries the session cookie.
    weiboLogin = WeiboCrawl.WeiboLogin('LoginName', 'PassWord')
    if weiboLogin.Login():
        print "The WeiboLogin module works well!"

    # Crawl outward from a seed profile page.
    webCrawl = WeiboCrawl.WebCrawl('http://weibo.com/yaochen')
    webCrawl.Crawl()
    del webCrawl
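The WebCrawl constructor also takes the thread count, crawl depth, per-thread join timeout, and output directory; the call below is identical to the one in Main.py, just with the defaults from WeiboCrawl.py spelled out:

import os
import WeiboCrawl

# same as WebCrawl('http://weibo.com/yaochen'); keyword values are the defaults
webCrawl = WeiboCrawl.WebCrawl('http://weibo.com/yaochen',
                               maxThreadNum = 10,                        # threads per download batch
                               maxDepth = 2,                             # link-following rounds
                               thLifetime = 10,                          # seconds to wait in each join()
                               saveDir = '.' + os.sep + 'CrawledPages')  # where pages are written
webCrawl.Crawl()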
2. Main classes, WeiboCrawl.py
#!/usr/bin/env python
#coding=utf8


'''Author: Zheng Yi
Email: [email protected]'''


import urllib2
import cookielib
import threading
import os
import WeiboEncode
import WeiboSearch
import TextAnalyze


# Shared state across crawl threads, guarded by CrawlThread.thLock:
pagesContent = []   # raw html of pages fetched in the current depth
textContent = []    # decoded text of those pages
triedUrl = []       # urls already attempted (success or failure)
toTryUrl = []       # frontier of urls for the next depth
failedUrl = []      # urls that raised during download
class WeiboLogin:
    "WeiboLogin class is for Weibo login, cookie, etc."

    def __init__(self, user, pwd, enableProxy = False):
        "Constructor of class WeiboLogin."

        print "Initializing WeiboLogin..."
        self.userName = user
        self.passWord = pwd
        self.enableProxy = enableProxy
        print "UserName:", user
        print "Password:", pwd

        # The prelogin url returns the servertime & nonce used to encode the
        # password; the login url receives the POST with the encoded credentials.
        self.serverUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939"
        self.loginUrl = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.1)"
        self.postHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'}


    def Login(self):
        "Run this function to launch the login process"

        self.EnableCookie(self.enableProxy)

        timeData = self.GetServerTime()
        if timeData is None:
            return False
        serverTime, nonce = timeData

        postData = WeiboEncode.PostEncode(self.userName, self.passWord, serverTime, nonce)
        print "Post data length:\n", len(postData)

        req = urllib2.Request(self.loginUrl, postData, self.postHeader)
        print "Posting request..."
        result = urllib2.urlopen(req)
        text = result.read()
        print "Post result page length: ", len(text)

        try:
            # The result page redirects via location.replace(); following that
            # url completes the SSO login and sets the session cookies.
            loginUrl = WeiboSearch.sRedirectData(text)
            urllib2.urlopen(loginUrl)
        except:
            print 'Login error!'
            return False

        print 'Login success!'
        return True


    def GetServerTime(self):
        "Get server time and nonce, which are used to encode the password"

        print "Getting server time and nonce..."
        serverData = urllib2.urlopen(self.serverUrl).read()
        print serverData

        try:
            serverTime, nonce = WeiboSearch.sServerData(serverData)
            return serverTime, nonce
        except:
            print 'Get server time & nonce error!'
            return None


    def EnableCookie(self, enableProxy):
        "Enable cookie & proxy (if needed)."

        cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cookiejar)

        if enableProxy:
            proxy_support = urllib2.ProxyHandler({'http': 'http://c'})  # placeholder proxy; fill in your own
            opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
            print "Proxy enabled"
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)

        # Install globally so every urllib2.urlopen() shares the cookie jar.
        urllib2.install_opener(opener)
class WebCrawl:
    "WebCrawl class is for crawling the Weibo"

    def __init__(self, beginUrl, maxThreadNum = 10, maxDepth = 2, thLifetime = 10, saveDir = "." + os.sep + "CrawledPages"):
        "Initialize the class WebCrawl"

        toTryUrl.append(beginUrl)
        self.maxThreadNum = maxThreadNum
        self.saveDir = saveDir
        self.maxDepth = maxDepth
        self.thLifetime = thLifetime

        self.triedPagesNum = 0
        self.threadPool = []

        if not os.path.exists(self.saveDir):
            os.mkdir(self.saveDir)

        self.logFile = open(self.saveDir + os.sep + 'log.txt', 'w')


    def Crawl(self):
        "Run this function to start the crawl process"

        for depth in range(self.maxDepth):
            print 'Searching depth ', depth, '...'
            self.DownloadAll()    # fetch every url in the current frontier
            self.UpdateToTry()    # build the next frontier from the fetched pages


    def DownloadAll(self):
        "Download all urls in current depth"

        global toTryUrl
        iDownloaded = 0

        while iDownloaded < len(toTryUrl):
            # Launch up to maxThreadNum download threads per batch.
            iThread = 0
            while iThread < self.maxThreadNum and iDownloaded + iThread < len(toTryUrl):
                iCurrentUrl = iDownloaded + iThread
                pageNum = str(self.triedPagesNum)
                self.DownloadUrl(toTryUrl[iCurrentUrl], pageNum)

                self.triedPagesNum += 1
                iThread += 1

            iDownloaded += iThread

            # Wait (up to thLifetime seconds each) for the batch to finish.
            for th in self.threadPool:
                th.join(self.thLifetime)
            self.threadPool = []

        toTryUrl = []


    def DownloadUrl(self, url, pageNum):
        "Download a single url and save"

        cTh = CrawlThread(url, self.saveDir, pageNum, self.logFile)
        self.threadPool.append(cTh)
        cTh.start()


    def UpdateToTry(self):
        "Update toTryUrl based on textContent"

        global toTryUrl
        global triedUrl
        global pagesContent   # was missing, so the global list was never cleared
        global textContent

        newUrlList = []

        for textData in textContent:
            newUrlList += WeiboSearch.sUrl(textData)

        toTryUrl = list(set(newUrlList) - set(triedUrl))
        pagesContent = []
        textContent = []
class CrawlThread(threading.Thread):
    "CrawlThread class is derived from threading.Thread, to create a thread."

    thLock = threading.Lock()   # shared by all threads; protects the global lists

    def __init__(self, url, saveDir, pageNum, logFile):
        "Initialize the CrawlThread"

        threading.Thread.__init__(self)
        self.url = url
        self.pageNum = pageNum
        self.fileName = saveDir + os.sep + pageNum + '.htm'   # raw html dump
        self.textName = saveDir + os.sep + pageNum + '.txt'   # decoded text
        self.logFile = logFile
        self.logLine = 'File: ' + pageNum + ' Url: ' + url


    def run(self):
        "rewrite the run() function"

        global failedUrl
        global triedUrl
        global pagesContent
        global textContent

        try:
            htmlContent = urllib2.urlopen(self.url).read()
            transText = TextAnalyze.textTransfer(htmlContent)

            fOut = open(self.fileName, 'w')
            fOut.write(htmlContent)
            fOut.close()
            tOut = open(self.textName, 'w')
            tOut.write(transText)
            tOut.close()

        except:
            self.thLock.acquire()
            triedUrl.append(self.url)
            failedUrl.append(self.url)
            sFailed = 'Failed!  ' + self.logLine
            print sFailed
            self.logFile.write(sFailed + '\n')
            self.thLock.release()
            return None

        self.thLock.acquire()
        pagesContent.append(htmlContent)
        textContent.append(transText)
        triedUrl.append(self.url)
        sSuccess = 'Success! ' + self.logLine
        print sSuccess
        self.logFile.write(sSuccess + '\n')
        self.thLock.release()
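Because EnableCookie() installs its opener globally with urllib2.install_opener(), any urllib2 call made after a successful Login() is already authenticated. A minimal sketch of driving the two classes by hand (the credentials are made up; the profile url is the same seed used in Main.py):

import urllib2
import WeiboCrawl

login = WeiboCrawl.WeiboLogin('[email protected]', 'my_secret_password')   # hypothetical account
if login.Login():
    # The global opener now carries the session cookie, so a plain
    # urlopen() fetches the logged-in version of the page.
    page = urllib2.urlopen('http://weibo.com/yaochen').read()
    print 'Fetched %d bytes' % len(page)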
3. Encoding functions, WeiboEncode.py
#!/usr/bin/env python
#coding=utf8


'''Author: Zheng Yi
Email: [email protected]'''


import urllib
import base64
import hashlib


def PostEncode(userName, passWord, serverTime, nonce):
    "Used to generate POST data"

    encodedUserName = GetUserName(userName)
    encodedPassWord = GetPassword(passWord, serverTime, nonce)
    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'wsse',
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    postData = urllib.urlencode(postPara)
    return postData


def GetUserName(userName):
    "Used to encode user name"

    # url-quote, then base64; [:-1] drops the trailing newline encodestring appends
    userNameTemp = urllib.quote(userName)
    userNameEncoded = base64.encodestring(userNameTemp)[:-1]
    return userNameEncoded


def GetPassword(passWord, serverTime, nonce):
    "Used to encode user password"

    # wsse scheme: sha1(sha1(sha1(password)) + servertime + nonce), hex-encoded
    pwdTemp1 = hashlib.sha1(passWord).hexdigest()
    pwdTemp2 = hashlib.sha1(pwdTemp1).hexdigest()
    pwdTemp3 = pwdTemp2 + serverTime + nonce
    pwdEncoded = hashlib.sha1(pwdTemp3).hexdigest()
    return pwdEncoded
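A quick interactive check of the two encoders; the account, servertime and nonce below are made-up values, purely for illustration:

import WeiboEncode

su = WeiboEncode.GetUserName('[email protected]')                          # made-up account
sp = WeiboEncode.GetPassword('my_secret_password', '1329806375', 'ABCDEF')   # made-up time & nonce

print su   # base64 of the url-quoted name: 'dXNlciU0MGV4YW1wbGUuY29t'
print sp   # a 40-character hex sha1 digest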
4. Search functions, WeiboSearch.py
#!/usr/bin/env python
#coding=utf8


'''Author: Zheng Yi
Email: [email protected]'''


import re
import json


def sServerData(serverData):
    "Search the server time & nonce from server data"

    # The prelogin response is JSON wrapped in a JSONP callback;
    # grab the content between the parentheses and parse it.
    p = re.compile('\((.*)\)')
    jsonData = p.search(serverData).group(1)
    data = json.loads(jsonData)
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    print "Server time is:", serverTime
    print "Nonce is:", nonce
    return serverTime, nonce


def sRedirectData(text):
    "Search the redirect url from the login result page"

    p = re.compile('location\.replace\(\'(.*?)\'\)')
    loginUrl = p.search(text).group(1)
    return loginUrl


def sUrl(htmlData):
    "Collect profile urls from the feed list section of a page"

    iMainBegin = htmlData.find('<div class="feed_lists" node-type="feed_list">')
    iMainEnd = htmlData.find('<div node-type="lazyload" class="W_loading">')
    mainData = htmlData[iMainBegin:iMainEnd]
    p = re.compile('href=\"(\/[a-zA-Z0-9\/\%]*?)\"')

    semiUrlList = p.findall(mainData)
    urlList = []
    for url in semiUrlList:
        urlList.append('http://weibo.com' + url)
    return urlList
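To see the shapes sServerData() and sRedirectData() expect, here they run on hand-written inputs modeled after the real responses (both strings are fabricated for illustration):

import WeiboSearch

# fabricated prelogin response in the JSONP shape the regex unwraps
prelogin = 'sinaSSOController.preloginCallBack({"retcode":0,"servertime":1329806375,"nonce":"ABCDEF"})'
serverTime, nonce = WeiboSearch.sServerData(prelogin)   # -> '1329806375', 'ABCDEF'

# fabricated login result page containing the javascript redirect
resultPage = "<script>location.replace('http://weibo.com/ajaxlogin.php?ticket=xyz');</script>"
print WeiboSearch.sRedirectData(resultPage)             # -> http://weibo.com/ajaxlogin.php?ticket=xyz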
5. Simple html content analysis and format conversion, TextAnalyze.py
#!/usr/bin/env python
#coding=utf8


'''Author: Zheng Yi
Email: [email protected]'''


def textTransfer(htmlContent):
    "Extract and decode the main part of html"

    line = textExtract(htmlContent)
    print 'line:', line
    if line != None:
        transText = textDecode(line)
        return transText
    else:
        return None


def textExtract(htmlContent):
    "Extract the main part from html"

    # The feed content sits in one long <script> line whose pid is either
    # pl_content_homeFeed (your own feed) or pl_content_hisFeed (other users).
    lines = htmlContent.splitlines()
    prefix = '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_content_'
    for line in lines:
        if line.startswith(prefix + 'homeFeed"') or line.startswith(prefix + 'hisFeed"'):
            return line
    return None


def textDecode(line):
    "Decode the main part"

    # The html payload follows 'html":"' and the line ends with '"})</script>'
    # (12 characters); unescape the \uXXXX sequences, then drop leftover backslashes.
    iText = line.find('html":"')
    if iText > 0:
        transText = line[iText + 7: -12].decode('unicode_escape').encode('utf-8').replace('\\', '')
        return transText
    else:
        return None
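A tiny end-to-end check of the extract/decode pipeline. The one-line page below is fabricated, but it follows the same framing the real feed line uses (the pid marker at the start, the payload between 'html":"' and '"})</script>'):

import TextAnalyze

# fabricated single-line page in the shape textExtract() looks for
page = ('<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_content_homeFeed",'
        '"html":"<div class=\\"feed_lists\\">\\u5fae\\u535a<\\/div>"})</script>')

print TextAnalyze.textTransfer(page)
# -> <div class="feed_lists">微博</div> (utf-8 bytes)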