很多SEO軟件有泛采集功能,,只需要指定關(guān)鍵字,就自動抓取相關(guān)文章,。這種抓取技術(shù),,需要用到HTML正文抽取算法,這里分享根據(jù)cx-extractor線性算法php版編寫的VB HTML正文抽取類模塊,。
感謝cx,,感謝xwf_like。
參考資料:http://code.google.com/p/cx-extractor 以及 http://hi.baidu.com/xwf_like
'======================================== '模塊名稱:clsHtmlExtractor '模塊作用:從HTML中抽取正文,,根據(jù)http header或者html自動獲取編碼 '模塊編寫:楚吟風 QQ:112704422 http://www. '模塊更新:2011-03-15 '模塊說明:感謝cx的基于行塊分布函數(shù)的通用網(wǎng)頁正文抽取算法 '========================================
Option Explicit
'======================================== '函數(shù)名稱:ReplaceX '函數(shù)作用:正則替換 '======================================== Public Function ReplaceX(ByVal sSource As String, ByVal sPattern As String, ByVal sTarget As String) As String On Error GoTo ErrHandle Dim RegEx, ReplaceTest As String, sRet As String Set RegEx = CreateObject("VBSCRIPT.REGEXP") RegEx.IgnoreCase = True RegEx.Global = True RegEx.Pattern = sPattern sRet = RegEx.Replace(sSource, sTarget) Set RegEx = Nothing ReplaceX = sRet Exit Function ErrHandle: Set RegEx = Nothing End Function
'======================================== '函數(shù)名稱:InstrX '函數(shù)作用:正則查找 '======================================== Public Function InstrX(ByVal Source As String, ByVal sPattern As String, Optional ByRef strs As Variant) As Integer On Error GoTo ErrHandle Dim i As Integer ReDim strs(i) Dim RegEx, Matches, Match, sCSet As String Set RegEx = CreateObject("VBSCRIPT.REGEXP") RegEx.IgnoreCase = True RegEx.Global = True RegEx.Pattern = sPattern If RegEx.Test(Source) Then Set Matches = RegEx.execute(Source) For Each Match In Matches i = i + 1 ReDim Preserve strs(i) strs(i) = Match.Value Next End If Set Match = Nothing ' Set Matches = Nothing Set RegEx = Nothing InstrX = i Exit Function ErrHandle: Set RegEx = Nothing End Function
'======================================== '函數(shù)名稱:GetCset '函數(shù)作用:根據(jù)給定的字符串獲取html編碼方式 '======================================== Public Function GetCset(ByVal Source As String) As String Dim i As Integer, strs() As String, sCSet As String i = InstrX(Source, "content-type.*?charset.*?=.*", strs) If i > 0 Then sCSet = strs(1) sCSet = ReplaceX(sCSet, ".*charset.*?=", "") sCSet = ReplaceX(sCSet, """|\s|/|>", "") GetCset = sCSet End Function
'======================================== '函數(shù)名稱:LenX '函數(shù)作用:把全角字符做為2字節(jié)計算長度,,忽略空格長度 '======================================== Private Function LenX(ByVal s_str As String) As Integer Dim i_num As Integer, i_index As Integer, i_len As Integer s_str = Replace(s_str, " ", "") i_len = Len(s_str) For i_index = 1 To i_len If Asc(Mid(s_str, i_index, 1)) < 0 Then i_num = i_num + 1 End If Next LenX = i_len + i_num End Function
'======================================== '函數(shù)名稱:Extract '函數(shù)作用:根據(jù)cx-extractor算法抽取正文 '======================================== Public Function Extract(ByVal Source As String, Optional ByVal BlockLine As Integer = 3, Optional ByVal OneLine As Boolean = True) Dim sLine() As String, iLine() As Long, i As Integer, iBlockLen() As Long, sBlock() As String Dim iStart As Long, iEnd As Long, iMaxLen As Long, iTemp As Long Dim sPortion As String, iCurTextLen As Long, sTemp As String, sOneLine As String sOneLine = IIf(OneLine, "", vbCrLf) '初步去噪 '去除DTD信息 Source = ReplaceX(Source, "<!DOCTYPE.*?>", "") '去除注釋 Source = ReplaceX(Source, "<!--(.|\n)*?-->", "") '去除script標簽 Source = ReplaceX(Source, "<script.*?>(.|\n)*?<\/script>", "") '去除style標簽 Source = ReplaceX(Source, "<style.*?>(.|\n)*?<\/style>", "") '去除html tag標簽 Source = ReplaceX(Source, "<(.|\n)*?>", "") '去除特殊字符 Source = ReplaceX(Source, "&.{1,5};|&#.{1,5};", "") '規(guī)范換行 Source = Replace(Source, vbCrLf, vbLf) Source = Replace(Source, vbCr, vbLf) Source = Replace(Source, vbLf, vbCrLf) '分割到行 sLine = Split(Source, vbCrLf) ReDim iBlockLen(0) For i = 0 To UBound(sLine) '將多個空白字符替換為一個空格 sLine(i) = ReplaceX(sLine(i), "\s+", " ") Next '計算第一塊大小 For i = 0 To (BlockLine - 1) iBlockLen(0) = iBlockLen(0) + LenX(sLine(i)) Next '計算其他塊大小 For i = 1 To UBound(sLine) - BlockLine - 1 ReDim Preserve iBlockLen(i) iBlockLen(i) = iBlockLen(i - 1) + LenX(sLine(i - 1 + BlockLine)) - LenX(sLine(i - 1)) Next '根據(jù)各個塊大小變化的峰值峰谷提取正文 iStart = -1: iEnd = -1: i = 0 Do While i < UBound(iBlockLen) Do While (i < UBound(iBlockLen) And iBlockLen(i) = 0) i = i + 1 Loop iTemp = i iCurTextLen = 0 sPortion = "" Do While (i < UBound(iBlockLen) And iBlockLen(i) <> 0) sPortion = sPortion & sLine(i) & sOneLine iCurTextLen = iCurTextLen + iBlockLen(i) i = i + 1 Loop If iCurTextLen > iMaxLen Then sTemp = sPortion iMaxLen = iCurTextLen iStart = iTemp iEnd = i - 1 End If Loop 'MsgBox sLine(iStart - 1), , iStart 'MsgBox sLine(iEnd + 1), , iEnd Extract = sTemp End Function
|