楼主 xmyjk |
Q:如何提取证监会网站的保荐人名单以及所有保荐人的简历 http://www.csrc.gov.cn/pub/newsite/fxb/baxyjg/bjdbrmd/
A:首先,运用抓包工具,点击翻页:
可以发现,里面有个page_size属性,我们把他改成2500,并把整个参数带到URL后面。 http://202.106.183.110/csrc_bjr/csrc/wwweb/front_list.esp?begin_num=1&page_size=2500&user_name=&user_org=¤tpage=1 可以得到所有的页面的整个数据。呵呵,这个小技巧,后面就方便很多了。 提取目录后,在根据URL访问每个条目,这样就就可以提取每份简历。- Option Explicit
- Sub test()
- Dim tmp() As String, i As Integer, arr() As String, xmlhttp As Object, N As Long
- [a1].CurrentRegion.Offset(1).Clear
- Set xmlhttp = CreateObject("Microsoft.XMLHTTP")
- With xmlhttp
- .Open "get", "http://202.106.183.110/csrc_bjr/csrc/wwweb/front_list.esp?begin_num=1&page_size=2500&user_name=&user_org=¤tpage=1", False '获取大目录
- .send
- tmp = Filter(Split(Replace(.responsetext, "</a>", ""), "</td>"), "<td align=""center""") '将每个条目切出来
- End With
- ReDim arr(UBound(tmp) \ 6, 6) '切入数组
- For i = 0 To UBound(tmp)
- arr(i \ 6, i Mod 6) = Split(tmp(i), ">")(UBound(Split(tmp(i), ">"))) '整理文本
- If i Mod 6 = 1 Then arr(i \ 6, 6) = "http://202.106.183.110/csrc_bjr/front_detail.esp?user_id=" & Split(Split(tmp(i), "viewUser('")(1), "','")(0)
- Next
- [a2].Resize(UBound(arr) + 1, UBound(arr, 2) + 1) = arr '输入目录
- Erase tmp
- Erase arr
- Set xmlhttp = Nothing
- MsgBox "Ok"
- End Sub
- Sub testt()
- Dim p&, arr, d As New Dictionary
- arr = Range([c2], [g65536].End(3)).Value
- For p = 1 To UBound(arr) '历遍目录
- If Not d.Exists(Left(arr(p, 1), 4)) Then d.Add Left(arr(p, 1), 4), 0: Sheets.Add(, Sheets(Sheets.Count)).Select
- ActiveSheet.Name = Left(arr(p, 1), 4) '当公司名称不存在,则新建表
- With ActiveSheet.QueryTables.Add(Connection:="URL;" & arr(p, 5), Destination:=[a65536].End(3).Offset(2)) '依据URL导入简历
- .BackgroundQuery = True
- .WebSelectionType = xlSpecifiedTables
- .WebTables = "3,5"
- .Refresh BackgroundQuery:=False
- End With
- ActiveSheet.UsedRange.QueryTable.Delete
- Next
- End Sub
保荐人下载.zip |