楼主 嘉昆2011 |
声明: 1,文章素材参考ExcelHero Daniel Ferry 博客,对常用的网页信息提取方法进行比较和提炼,以适应中文信息的提取:- http://www.excelhero.com/blog/2010/05/multi-threaded-vba.html
- http://www.excelhero.com/blog/2010/05/a-real-time-advanced-excel-chart.html
2,本文仅对各种方法作些初步比较,不对可视化的图表监控和面盘做深入讲解
涉及知识: 1,利用Internet Explorer提取表格等信息; 2,Excel内置宏录制提取信息; 3,利用MSXML2.XMLHTTP 内嵌式对象提取信息(包括“adodb.stream”,正则); 4,VBScript脚本借助VBA平台提取信息。
效果图:
利用柱型图+散点图制作模拟的进度观测条。图中灰色部分表示总共需要提取的记录数目;紫色部分表示已经发送出去的请求;蓝色部分代表已经获取的记录;红色线条代表获取当前记录所运行的速度(纵坐标所示)。
1,IE提取 在前文已经论述了IE提取的方法: http://www.exceltip.net/thread-41437-1-1.html
- Sub IE_Method()
-
- Dim IE As New InternetExplorer
- For i = 1 To Range("Code").Count
- 'IE.Visible = True
- IE.navigate "http://www.cninfo.com.cn/information/fund/netvalue/" & Range("Code").Cells(i, 1) & ".html"
- Do
- DoEvents
- Loop Until IE.readyState = READYSTATE_COMPLETE
- Dim Doc As HTMLDocument
- Set tb1 = IE.document.all.tags("table")(0)
- Range("Name").Cells(i, 1) = Split(tb1.Rows(0).Cells(0).innerText, "£º")(2)
- Set tb2 = IE.document.all.tags("table")(1)
- Range("Date").Cells(i, 1) = tb2.Rows(1).Cells(0).innerText
- Range("NetValue").Cells(i, 1) = tb2.Rows(1).Cells(1).innerText
- Next
- IE.Quit
- Set IE = Nothing
- End Sub
特点及原理:IE作为比较原始的网页信息提取,语法相对简单,但运行缓慢;主要是由于IE本身巨大,更糟的是VBA处理需要同步等待从IE返回的信息,大量浪费用户时间。
2,Excel内置录宏- Sub WebQuery_Method()
- Dim QT As QueryTable
- Dim StrConnect As String
- For i = 1 To Range("Code").Count
- For Each QT In Sheet2.QueryTables
- QT.Delete
- Next QT
- StrConnect = "URL;http://www.cninfo.com.cn/information/fund/netvalue/" & Range("Code").Cells(i, 1) & ".html"
- Set QT = Sheet2.QueryTables.Add(Connection:=StrConnect, Destination:=Sheet2.Range("K1"))
- With QT
- .BackgroundQuery = False
- .AdjustColumnWidth = False
- .RefreshPeriod = 0
- .WebSelectionType = xlAllTables
- .WebFormatting = xlWebFormattingNone
- .Refresh BackgroundQuery:=False
- End With
-
- Sheet1.Range("Name").Cells(i, 1) = Split(Sheet2.Range("WebTable").Cells(1, 1), "£º")(2)
- Sheet1.Range("Date").Cells(i, 1) = Sheet2.Range("WebTable").Cells(4, 1) & " "
- Sheet1.Range("NetValue").Cells(i, 1) = Sheet2.Range("WebTable").Cells(4, 2)
- Sheet2.Range("WebTable").Clear
- Next i
-
- End Sub
特点与原理:通过录制宏的方式来提取网页信息,相对简单(改编代码相对较少),但对本文中的示例网站暂不推荐,运行速度与IE一样缓慢,有时会出现“假死”现象。
3,利用MSXML2.XMLHTTP 内嵌式对象提取信息(包括“adodb.stream”,正则)
- Sub XMLHTTP_Method()
- Dim oXML As MSXML2.XMLHTTP
- Set oXML = New MSXML2.XMLHTTP
- For i = 1 To Range("Code").Count
- oXML.Open "GET", "http://www.cninfo.com.cn/information/fund/netvalue/" & Range("Code").Cells(i, 1) & ".html", True
- oXML.send
- Do
- DoEvents
- Loop Until oXML.readyState = READYSTATE_COMPLETE
-
- ss = TransCode(oXML.responseBody, "gb2312")
- Range("Name").Cells(i, 1) = Trim(Split(Split(ss, "</strong>")(2), "</td")(0))
- Range("Date").Cells(i, 1) = RegExp(Split(Split(ss, "<td class=""zx_data"">")(2), "</td")(0), "\s|[a-z]|\&|\;") & " "
- Range("NetValue").Cells(i, 1) = RegExp(Split(Split(ss, "<td class=""zx_data"">")(3), "</td")(0), "\s|[a-z]|\&|\;")
- Next i
- Set oXML = Nothing
-
- End Sub
特点与原理:作为一种内嵌对象,打个不恰当的比方,仿佛如没有界面的网络浏览器(即,不可见),很很快检索网页文本放入内存,从这点上说,运行速度比前两种快很多。但是仍依赖于Excel,每次只能提取一条记录;而且速度不是很稳定,时高时低。然很多中文网站的字符编码是gb2312,如本例。由于XMLHTTP默认识别的编码是UTF-8,因此直接从.responseText对象得到的字符串会发现其中的汉字为乱码。需要借助Adodb.Stream对象提取字符串,可以参考链接:- http://club.excelhome.net/thread-893760-1-1.html
http://www.exceltip.net/forum.php?mod=viewthread&tid=16447 二楼 随后将得到的字符串经过简单的正则处理,即可得到所需信息。- Function TransCode(StrBody, Cset)
- With CreateObject("adodb.stream")
- .Type = 1
- .Mode = 3
- .Open
- .Write StrBody
- .Position = 0
- .Type = 2
- .Charset = Cset
- TransCode = .ReadText
- .Close
- End With
- End Function
- Function RegExp(StrExp, RegRul)
- With CreateObject("vbscript.regexp")
- .Global = True
- .Pattern = RegRul
- RegExp = .Replace(StrExp, "")
- End With
- End Function
4,VBScript脚本借助VBA平台提取信息- Sub Swarm_Method()
- Dim Rng As Range
- Dim lngAgent As Long
- For i = 1 To Range("Code").Count
- Set Rng = Range("Name").Cells(i, 1)
- CreatVBScriptAgent Rng
- lngAgent = lngAgent + 1
- If lngAgent = [SwarmSize] Then lngAgent = 0
- Next i
- Set Rng = Nothing
- End Sub
- Public Sub CreatVBScriptAgent(Rng As Range)
- Dim sFileName As String
- Dim intFileNum As Integer
- Dim shellWin As New ShellWindows
- Dim S As String, sCode As String, sRecord As String
- Dim k As Long
- sRecord = Rng.Resize(, 3).Address
- sCode = Rng.Offset(, -1)
- k = Rng.Row Mod [SwarmSize]
- Rng = "Agent_" & k
- S = S & "Dim oXL, oXML, cRec, ss, Results(2)" & vbCrLf
- S = S & "cRec=""" & sCode & """" & vbCrLf
- S = S & "Set oXL = GetObject(, ""Excel.Application"") " & vbCrLf
- S = S & "Set oXML = WScript.CreateObject(""MSXML2.XMLHTTP"") " & vbCrLf
- S = S & vbCrLf
- S = S & vbCrLf
- S = S & "oXML.Open ""GET"", ""http://www.cninfo.com.cn/information/fund/netvalue/"" & cRec & "".html"", False" & vbCrLf
- 'S = S & "Wscript.Sleep 50 " & vbCrLf
- S = S & "oXML.Send" & vbCrLf
- 'S = S & "Wscript.Sleep 50 " & vbCrLf
- S = S & vbCrLf
- S = S & vbCrLf
- S = S & "ss = TransCode(oXML.responseBody, ""gb2312"")" & vbCrLf
- S = S & "Results(0) = Trim(Split(Split(ss, ""</strong>"")(2), ""</td"")(0))" & vbCrLf
- S = S & "Results(1) = RegExp(Split(Split(ss, ""<td class=""""zx_data"""">"")(2), ""</td"")(0), ""\s|[a-z]|\&|\;"") & "" """ & vbCrLf
- S = S & "Results(2) = RegExp(Split(Split(ss, ""<td class=""""zx_data"""">"")(3), ""</td"")(0), ""\s|[a-z]|\&|\;"")" & vbCrLf
- S = S & "Wscript.Sleep 50" & vbCrLf
- S = S & "oXL.Workbooks(""" & ThisWorkbook.Name & """).Sheets(""Interface"").Range(""" & sRecord & """) = Results" & vbCrLf
- S = S & vbCrLf
- S = S & vbCrLf
- S = S & "Function TransCode(StrBody, Cset)" & vbCrLf
- S = S & " With WScript.CreateObject(""adodb.stream"")" & vbCrLf
- S = S & " .Type = 1" & vbCrLf
- S = S & " .Mode = 3" & vbCrLf
- S = S & " .Open" & vbCrLf
- S = S & " .Write StrBody" & vbCrLf
- S = S & " .Position = 0" & vbCrLf
- S = S & " .Type = 2" & vbCrLf
- S = S & " .Charset = Cset" & vbCrLf
- S = S & " TransCode = .ReadText" & vbCrLf
- S = S & " .Close" & vbCrLf
- S = S & " End With " & vbCrLf
- S = S & "End Function" & vbCrLf
- S = S & vbCrLf
- S = S & vbCrLf
- S = S & "Function RegExp(StrExp, RegRul)" & vbCrLf
- S = S & " With WScript.CreateObject(""vbscript.regexp"")" & vbCrLf
- S = S & " .Global = True" & vbCrLf
- S = S & " .Pattern = RegRul" & vbCrLf
- S = S & " RegExp = .Replace(StrExp, """")" & vbCrLf
- S = S & " End With " & vbCrLf
- S = S & "End Function" & vbCrLf
-
- sFileName = ActiveWorkbook.Path & "\SwarmAgent_" & k & ".vbs"
- intFileNum = FreeFile
- Open sFileName For Output As intFileNum
- Print #intFileNum, S
- Close intFileNum
- Set wshShell = CreateObject("Wscript.Shell")
- wshShell.Run """" & sFileName & """"
- Set wshShell = Nothing
-
- End Sub
特点与原理:VBScript借助VBA平台运行,速度快而稳定,但是需要相对扎实的基本功和代码调试技巧。同时也依赖这几个因素:1,所选曲的网站;2,运行的操作系统(XP系统上只可以运行几个“蜂包”,而Vista却可以运行几十个--原文作者测试)Win7可以运行更多,但达到一定程度后,速度稳定,不会随“蜂包”的增加而加快。每个“蜂包”都有它的使命:到达目标网站,获取记录,写入Excel,消失;周而复始。
最后,不要干坏事。。。
附件:
VBAWEB.rar
|