久久国产成人av_抖音国产毛片_a片网站免费观看_A片无码播放手机在线观看,色五月在线观看,亚洲精品m在线观看,女人自慰的免费网址,悠悠在线观看精品视频,一级日本片免费的,亚洲精品久,国产精品成人久久久久久久

分享

SEO之泛采集——HTML正文抽取算法

 doc360sir 2011-12-12

很多SEO軟件有泛采集功能,,只需要指定關(guān)鍵字,就自動抓取相關(guān)文章,。這種抓取技術(shù),,需要用到HTML正文抽取算法,這里分享根據(jù)cx-extractor線性算法php版編寫的VB HTML正文抽取類模塊,。

感謝cx,,感謝xwf_like。

參考資料:http://code.google.com/p/cx-extractor 以及         http://hi.baidu.com/xwf_like

'========================================
'模塊名稱:clsHtmlExtractor
'模塊作用:從HTML中抽取正文,,根據(jù)http header或者html自動獲取編碼
'模塊編寫:楚吟風 QQ:112704422  http://www.
'模塊更新:2011-03-15
'模塊說明:感謝cx的基于行塊分布函數(shù)的通用網(wǎng)頁正文抽取算法
'========================================

Option Explicit

'========================================
'函數(shù)名稱:ReplaceX
'函數(shù)作用:正則替換
'========================================
Public Function ReplaceX(ByVal sSource As String, ByVal sPattern As String, ByVal sTarget As String) As String
On Error GoTo ErrHandle
    Dim RegEx, ReplaceTest As String, sRet As String
    Set RegEx = CreateObject("VBSCRIPT.REGEXP")
    RegEx.IgnoreCase = True
    RegEx.Global = True
    RegEx.Pattern = sPattern
    sRet = RegEx.Replace(sSource, sTarget)
    Set RegEx = Nothing
    ReplaceX = sRet
    Exit Function
ErrHandle:
    Set RegEx = Nothing
End Function

'========================================
'函數(shù)名稱:InstrX
'函數(shù)作用:正則查找
'========================================
Public Function InstrX(ByVal Source As String, ByVal sPattern As String, Optional ByRef strs As Variant) As Integer
On Error GoTo ErrHandle
    Dim i As Integer
    ReDim strs(i)
    Dim RegEx, Matches, Match, sCSet As String
    Set RegEx = CreateObject("VBSCRIPT.REGEXP")
    RegEx.IgnoreCase = True
    RegEx.Global = True
    RegEx.Pattern = sPattern
    If RegEx.Test(Source) Then
        Set Matches = RegEx.execute(Source)
        For Each Match In Matches
            i = i + 1
            ReDim Preserve strs(i)
            strs(i) = Match.Value
        Next
    End If
    Set Match = Nothing '
    Set Matches = Nothing
    Set RegEx = Nothing
   
    InstrX = i
    Exit Function
ErrHandle:
    Set RegEx = Nothing
End Function

'========================================
'函數(shù)名稱:GetCset
'函數(shù)作用:根據(jù)給定的字符串獲取html編碼方式
'========================================
Public Function GetCset(ByVal Source As String) As String
    Dim i As Integer, strs() As String, sCSet As String
    i = InstrX(Source, "content-type.*?charset.*?=.*", strs)
    If i > 0 Then sCSet = strs(1)
    sCSet = ReplaceX(sCSet, ".*charset.*?=", "")
    sCSet = ReplaceX(sCSet, """|\s|/|>", "")
    GetCset = sCSet
End Function

 

'========================================
'函數(shù)名稱:LenX
'函數(shù)作用:把全角字符做為2字節(jié)計算長度,,忽略空格長度
'========================================
Private Function LenX(ByVal s_str As String) As Integer
    Dim i_num As Integer, i_index As Integer, i_len As Integer
    s_str = Replace(s_str, " ", "")
    i_len = Len(s_str)
    For i_index = 1 To i_len
        If Asc(Mid(s_str, i_index, 1)) < 0 Then
            i_num = i_num + 1
        End If
    Next
    LenX = i_len + i_num
End Function


'========================================
'函數(shù)名稱:Extract
'函數(shù)作用:根據(jù)cx-extractor算法抽取正文
'========================================
Public Function Extract(ByVal Source As String, Optional ByVal BlockLine As Integer = 3, Optional ByVal OneLine As Boolean = True)
    Dim sLine() As String, iLine() As Long, i As Integer, iBlockLen() As Long, sBlock() As String
    Dim iStart As Long, iEnd As Long, iMaxLen As Long, iTemp As Long
    Dim sPortion As String, iCurTextLen As Long, sTemp As String, sOneLine As String
    sOneLine = IIf(OneLine, "", vbCrLf)
   
   
    '初步去噪
   
    '去除DTD信息
    Source = ReplaceX(Source, "<!DOCTYPE.*?>", "")
    '去除注釋
    Source = ReplaceX(Source, "<!--(.|\n)*?-->", "")
    '去除script標簽
    Source = ReplaceX(Source, "<script.*?>(.|\n)*?<\/script>", "")
    '去除style標簽
    Source = ReplaceX(Source, "<style.*?>(.|\n)*?<\/style>", "")
    '去除html tag標簽
    Source = ReplaceX(Source, "<(.|\n)*?>", "")
    '去除特殊字符
    Source = ReplaceX(Source, "&.{1,5};|&#.{1,5};", "")
   
   
    '規(guī)范換行
    Source = Replace(Source, vbCrLf, vbLf)
    Source = Replace(Source, vbCr, vbLf)
    Source = Replace(Source, vbLf, vbCrLf)
   
    '分割到行
    sLine = Split(Source, vbCrLf)
    ReDim iBlockLen(0)
    For i = 0 To UBound(sLine)
        '將多個空白字符替換為一個空格
        sLine(i) = ReplaceX(sLine(i), "\s+", " ")
    Next
   
    '計算第一塊大小
    For i = 0 To (BlockLine - 1)
        iBlockLen(0) = iBlockLen(0) + LenX(sLine(i))
    Next
   
    '計算其他塊大小
    For i = 1 To UBound(sLine) - BlockLine - 1
        ReDim Preserve iBlockLen(i)
        iBlockLen(i) = iBlockLen(i - 1) + LenX(sLine(i - 1 + BlockLine)) - LenX(sLine(i - 1))
    Next
   
    '根據(jù)各個塊大小變化的峰值峰谷提取正文
    iStart = -1: iEnd = -1: i = 0
   
   
    Do While i < UBound(iBlockLen)
        Do While (i < UBound(iBlockLen) And iBlockLen(i) = 0)
            i = i + 1
        Loop
       
        iTemp = i
        iCurTextLen = 0
        sPortion = ""
       
        Do While (i < UBound(iBlockLen) And iBlockLen(i) <> 0)
            sPortion = sPortion & sLine(i) & sOneLine
            iCurTextLen = iCurTextLen + iBlockLen(i)
            i = i + 1
        Loop
       
        If iCurTextLen > iMaxLen Then
            sTemp = sPortion
            iMaxLen = iCurTextLen
            iStart = iTemp
            iEnd = i - 1
        End If
       
    Loop
    'MsgBox sLine(iStart - 1), , iStart
    'MsgBox sLine(iEnd + 1), , iEnd
    Extract = sTemp
End Function

    本站是提供個人知識管理的網(wǎng)絡(luò)存儲空間,,所有內(nèi)容均由用戶發(fā)布,不代表本站觀點,。請注意甄別內(nèi)容中的聯(lián)系方式,、誘導購買等信息,謹防詐騙,。如發(fā)現(xiàn)有害或侵權(quán)內(nèi)容,,請點擊一鍵舉報。
    轉(zhuǎn)藏 分享 獻花(0

    0條評論

    發(fā)表

    請遵守用戶 評論公約

    類似文章 更多