楼主 kevinchengcw |
Q: 如何利用vba批量从word文档中提取所需信息?(正则方式) A: 代码及解释如下:- Sub test()
- Dim Dic As Object, FN$, Arr, Arrt, Arrt2, Rule, RegEx As Object, mMatch, Str$, Str2$, StrT$, N&, I&, T&, C%, D%, All&
- FN = Dir(ThisWorkbook.Path & "\*.doc") '循环提取当前目录下的全部word文档(本例仅针对03版文档,其他版本可参考使用"\*.doc?")
- All = 0 '初始化统计文件数量为零
- Do While FN <> "" '第一次循环用于计数文件数量
- All = All + 1
- FN = Dir
- Loop
- Set RegEx = CreateObject("vbscript.regexp") '创建正则项目
- With RegEx
- .Global = True '全局有效
- .MultiLine = True '多行有效
- .ignorecase = True '忽略大小写
- End With
- Set Dic = CreateObject("scripting.dictionary") '创建字典项目,用于替换推荐人代码为指定人名(本例为示例,实际使用中可以将item项替换成对应人名)
- With Dic
- .Add "A", "一"
- .Add "B", "二"
- .Add "C", "三"
- .Add "D", "四"
- .Add "E", "五"
- .Add "F", "六"
- .Add "G", "七"
- End With
- With CreateObject("word.application") '创建word进程,用于操作word文档
- .Visible = False '隐藏操作
- .AutomationSecurity = msoAutomationSecurityForceDisable '打开文档时不启用宏
- .DisplayAlerts = False '关闭警告信息
- ReDim Arr(1 To All + 1, 1 To 14) '定义输出的excel表格
- Arrt = Split("序号,姓名,性别,户籍,出生年月,毕业学校,专业,学历,工作年龄,最近三家单位及时长,推荐职位,待遇要求,推荐时间,推荐人", ",") '分配表头对应信息
- For N = LBound(Arrt) To UBound(Arrt) '将表头信息写入结果数组
- Arr(1, N + 1) = Arrt(N)
- Next N
- FN = Dir(ThisWorkbook.Path & "\*.doc") '第二次循环,用于提取各个文件操作
- T = 2 '初始化数据写入结果数组的起始行数(第一次已被表头占用)
- Rule = Array("基本情况:([\s\S]+)(?=二、教育..:)", "教育..:([\s\S]+)(?=三、工作..:)", "工作经历:([\s\S]+)(?=四、优势..:)", "推荐说明:([\s\S]+)(?=调研单位:)") '创建正则表达式规则数组,用于提取各个大项文本
- Do While FN <> "" '当存在有效文件时循环提取
- With .documents.Open(ThisWorkbook.Path & "\" & FN) '打开文件
- Str = .Range.Text '读取文档文字内容
- .Close False '关闭文档(不保存)
- End With
- Arr(T, 14) = Dic(Left(FN, 1)) '先写入推荐人信息
- With RegEx
- Arr(T, 1) = T - 1 '写入序号
- For N = LBound(Rule) To UBound(Rule) '循环各个正则规则表达式项目
- .Pattern = Rule(N) '将规则赋值给正则项目
- Str2 = "" '初始化数据文本为空
- If Rule(N) Like "基本*" Then '通过规则判断取得的结果区段,并进行最终信息提取
- If .test(Str) Then
- Str2 = .Execute(Str)(0).submatches(0)
- .Pattern = "\s*(\S?):\s*"
- Str2 = .Replace(Str2, "$1 ")
- .Pattern = "[\r\n]+"
- Str2 = WorksheetFunction.Trim(.Replace(Str2, " "))
- Arrt = Split(Str2, " ")
- Arr(T, 2) = Arrt(1)
- Arr(T, 3) = Arrt(3)
- Arr(T, 4) = Arrt(11)
- Arr(T, 5) = Arrt(5)
- End If
- ElseIf Rule(N) Like "教育*" Then
- If .test(Str) Then
- Str2 = .Execute(Str)(0).submatches(0)
- .Pattern = "[\r\n]+"
- Str2 = WorksheetFunction.Trim(.Replace(Str2, " "))
- Arrt = Split(Str2, " ")
- Arr(T, 6) = Arrt(1)
- Arr(T, 7) = Arrt(2)
- Arr(T, 8) = Arrt(3)
- End If
- ElseIf Rule(N) Like "工作*" Then
- If .test(Str) Then
- Str2 = WorksheetFunction.Trim(.Execute(Str)(0).submatches(0))
- .Pattern = "\([一二三四五六七**十]+\)(.+?)(?=[\r\n])"
- StrT = ""
- I = Year(Date)
- C = 0
- D = 0
- For Each mMatch In .Execute(Str2)
- If C < 3 Then
- Arrt2 = Split(Replace(mMatch.submatches(0), " ", ""), "—")
- If Arrt2(UBound(Arrt2)) Like "*至今*" Then
- D = Round((Year(Date) + Month(Date) / 12) - (Val(Arrt2(0)) + ((12 - Val(Split(Arrt2(0), "年")(1))) / 12)), 1)
- Else
- D = Round((Val(Arrt2(UBound(Arrt2))) + Val(Split(Arrt2(UBound(Arrt2)), "年")(1)) / 12) - (Val(Arrt2(0)) + ((12 - Val(Split(Arrt2(0), "年")(1))) / 12)), 1)
- End If
- StrT = StrT & Split(mMatch.submatches(0), " ")(UBound(Split(mMatch.submatches(0), " "))) & "(" & D & "年)"
- 'If C < 2 Then StrT = StrT & vbNewLine
- C = C + 1
- End If
- If Val(Split(mMatch.submatches(0), " ")(0)) < I Then I = Val(mMatch.submatches(0))
- Next mMatch
- Arr(T, 9) = Year(Date) - I
- Arr(T, 10) = StrT
- End If
- ElseIf Rule(N) Like "推荐*" Then
- If .test(Str) Then
- Str2 = WorksheetFunction.Trim(.Execute(Str)(0).submatches(0))
- .Pattern = "建议贵.*司授予\s*\S+\s*(\S+)(?=\s*职务)"
- If .test(Str2) Then Arr(T, 11) = .Execute(Str2)(0).submatches(0)
- If Str2 Like "*待遇*面谈*" Then
- Arr(T, 12) = "面谈"
- ElseIf Str2 Like "*期望*待遇不低于现有*" Then
- .Pattern = "原公司\S*待遇是\s*((\d+\.)*\d+.*)(?=元)"
- If .test(Str2) Then Arr(T, 12) = ">" & .Execute(Str2)(0).submatches(0)
- Else
- .Pattern = "期望待遇是\s*((\d+\.)*\d+.*?)(?=\/)"
- If .test(Str2) Then Arr(T, 12) = .Execute(Str2)(0).submatches(0)
- End If
- End If
- End If
- .Pattern = "(\d+)年(\d+)月(\d+)日\s*$"
- If .test(Str) Then
- With .Execute(Str)(0)
- Arr(T, 13) = .submatches(0) & "." & .submatches(1) & "." & .submatches(2)
- End With
- End If
- Next N
- End With
- T = T + 1 '数据行下移一行
- FN = Dir '循环到下一文件
- Loop
- .AutomationSecurity = msoAutomationSecurityByUI '设置打开word文档时询问是否启用宏状态
- .Quit '退出word进程
- End With
- With [a2].Resize(UBound(Arr), UBound(Arr, 2))
- .Value = Arr '结果数组写入单元格区域中
- With Intersect(Rows(2), .Cells) '设置标题行字体及颜色
- .Font.Color = vbRed
- .Font.Bold = True
- End With
- On Error Resume Next '设置容错
- .SpecialCells(xlCellTypeBlanks) = "未知" '空白单元格写入"未知"字样(如果没有空白单元格则会因上一句的容错而继续执行,避免了程序中断)
- End With
- Columns.AutoFit '列宽自适应
- Columns("F:K").ColumnWidth = 18 '对可能会过宽的F:K列设置固定列宽
- Cells.VerticalAlignment = xlCenter '文字剧中显示
- Set RegEx = Nothing '清空创建的项目
- Set Dic = Nothing
- End Sub
因程序提取的文件书写时存在一定的不规范性,为使程序能尽可能多的处理各种情况,需根据数据采集的实际情况不断丰富完善提取规则. 详见附件及素材源帖. JLHZ.rar |