楼主 amulee |
Q:如何对选定的多个文本文件内容求交集? A:如附件。每个文本都含有数量不等的由六个数字组成的六位数字字符串。现在,希望对所选的文本文件针对这一系列六个数字进行交集计算,即选定的文本文件所共有的六位数字字符串。 可参考以下方法: 由于数据量巨大,为提高速度,采取两个字典。基本思想是当读取到N个文本文件时,清空当前使用的字典,然后读取数据,判断当前元素是否存在于前一个文件处理后的字典中。如果存在,则在当前字典添加元素。两字典交替使用,直至文本文件遍历完毕。代码如下:- Sub 交集_字典法()
- Dim ArrFile
- Dim ArrD(1) As New Dictionary
- Dim i&, j&, FileCount%, FS%, Temp$, DNow%, DPre%
- t = Timer
- '选择文件
- With Application.FileDialog(msoFileDialogOpen)
- .Filters.Add "文本文件", "*.txt", 1
- If .Show = -1 Then
- FileCount = .SelectedItems.Count
- If FileCount = 1 Then
- MsgBox "至少选择两个以上文件"
- Exit Sub
- End If
- ReDim ArrFile(1 To FileCount)
- For i = 1 To FileCount
- ArrFile(i) = .SelectedItems(i)
- Next i
- Else
- MsgBox "未选择任何文件"
- Exit Sub
- End If
- End With
- '第一个文件。加入字典
- FS = FreeFile
- Open ArrFile(1) For Input As #FS
- Do While Not EOF(FS)
- Line Input #FS, Temp
- ArrD(1)(Temp) = 1
- Loop
- Close #FS
- '读取剩余文件。并加入字典。两字典交替用
- Set d = CreateObject("Scripting.Dictionary")
- For i = 2 To FileCount
- DNow = i Mod 2
- If DNow = 0 Then DPre = 1 Else DPre = 0
- ArrD(DNow).RemoveAll
- FS = FreeFile
- Open ArrFile(i) For Input As #FS
- Do While Not EOF(FS)
- Line Input #FS, Temp
- If ArrD(DPre).Exists(Temp) Then ArrD(DNow)(Temp) = 1
- Loop
- Close #FS
- Next i
- '清空数组
- Erase ArrFile
- ArrFile = ArrD(DNow).keys
- '输出文件
- FS = FreeFile
- Open ThisWorkbook.Path & "\交集+" & ArrD(DNow).Count & ".txt" For Output As #FS
- Print #FS, Join(ArrFile, vbCrLf)
- Close #FS
- MsgBox "耗时:" & Timer - t
- End Sub
如何对选定的文本文件内容求交集?.rar |
2楼 amulee |
上述代码还可以优化,因为不断对字典对象进行操作将会拖慢速度。所以将第一个文件选为最小的那个文件,即元素最少的文件。让第一次的字典就最接近最终结果,保证字典元素数量减少,也减少每次查找重复的概率。- Sub 交集_字典法()
- Dim ArrFile
- Dim ArrD(1) As New Dictionary
- Dim i&, j&, FileCount%, FS%, Temp$, DNow%, DPre%
- '选择文件
- With Application.FileDialog(msoFileDialogOpen)
- .Filters.Add "文本文件", "*.txt", 1
- If .Show = -1 Then
- t = Timer
- FileCount = .SelectedItems.Count
- If FileCount = 1 Then
- MsgBox "至少选择两个以上文件"
- Exit Sub
- End If
- ReDim ArrFile(1 To 2, 1 To FileCount)
- For i = 1 To FileCount
- ArrFile(1, i) = .SelectedItems(i)
- ArrFile(2, i) = FileLen(ArrFile(1, i))
- Next i
- Else
- MsgBox "未选择任何文件"
- Exit Sub
- End If
- End With
- '设定第一个文件为最小的文件,保证字典访问次数少
- j = 1
- For i = 2 To FileCount
- If ArrFile(2, i) < ArrFile(2, j) Then j = i
- Next i
- If j <> 1 Then
- Temp = ArrFile(1, j)
- ArrFile(1, j) = ArrFile(1, 1)
- ArrFile(1, 1) = Temp
- End If
- '第一个文件。加入字典
- FS = FreeFile
- Open ArrFile(1, 1) For Input As #FS
- Do While Not EOF(FS)
- Line Input #FS, Temp
- ArrD(1)(Temp) = 1
- Loop
- Close #FS
- '读取剩余文件。并加入字典。两字典交替用
- Set d = CreateObject("Scripting.Dictionary")
- For i = 2 To FileCount
- DNow = i Mod 2
- If DNow = 0 Then DPre = 1 Else DPre = 0
- ArrD(DNow).RemoveAll
- FS = FreeFile
- Open ArrFile(1, i) For Input As #FS
- Do While Not EOF(FS)
- Line Input #FS, Temp
- If ArrD(DPre).Exists(Temp) Then ArrD(DNow)(Temp) = 1
- Loop
- Close #FS
- Next i
- '清空数组
- Erase ArrFile
- ArrFile = ArrD(DNow).keys
- '输出文件
- FS = FreeFile
- Open ThisWorkbook.Path & "\交集+" & ArrD(DNow).Count & ".txt" For Output As #FS
- Print #FS, Join(ArrFile, vbCrLf)
- Close #FS
- MsgBox "耗时:" & Timer - t
- End Sub
|
3楼 amulee |
再补充一种方法。SQL法,适用于数据量更大的情形。- Sub 交集_ADO_SQL法()
- Dim AdoConn As Object
- Dim AdoRec As Object
- Dim Sql$, i&, FileCount%, sPath$, FS%
- Dim ArrFile
- On Error Resume Next
- '选择文件
- ReDim ArrFile(1 To 1)
- If Range("B5") <> "" Then ChDir Range("B5").Value '文件路径
- '选择文件
- With Application.FileDialog(msoFileDialogOpen)
- .Filters.Add "文本文件", "*.txt", 1
- If .Show = -1 Then
- t = Timer
- FileCount = .SelectedItems.Count
- sPath = .InitialFileName
- If FileCount < 2 Then
- MsgBox "至少选择两个以上文件"
- Exit Sub
- End If
- ReDim ArrFile(1 To FileCount)
- For i = 1 To FileCount
- ArrFile(i) = .SelectedItems(i)
- ArrFile(i) = Mid(ArrFile(i), InStrRev(ArrFile(i), Application.PathSeparator) + 1)
- Mid(ArrFile(i), Len(ArrFile(i)) - 3, 1) = "#"
- Next i
- Else
- MsgBox "未选择任何文件"
- Exit Sub
- End If
- End With
- '输出文件名
- With Sheet1
- .Range("B3:J3").ClearContents
- .Range("B3").Resize(1, FileCount) = WorksheetFunction.Index(ArrFile, 1, 0)
- End With
- '编写SQL语句
- Sql = "Select F1 from [" & ArrFile(1) & "]"
- For i = 2 To FileCount
- Sql = "Select F1 from [" & ArrFile(i) & "] where F1 in (" & Sql & ")"
- Next i
-
- '创建数据库连接
- Set AdoConn = CreateObject("ADODB.Connection")
- Set AdoRec = CreateObject("ADODB.Recordset")
- AdoConn.Open "Provider=Microsoft.ACE.OLEDB.12.0;" & _
- "Data Source=" & sPath & ";" & _
- "Extended Properties=""Text;HDR=No;IMEX=1"";"
- Set AdoRec = AdoConn.Execute(Sql) '执行查询
- ArrFile = AdoRec.GetRows '获取结果数组
- '关闭ADO对象
- AdoRec.Close
- AdoConn.Close
- Set AdoConn = Nothing
- Set AdoRec = Nothing
- Debug.Print Timer - t
- '写入文本文件
- FS = FreeFile
- Open ThisWorkbook.Path & "\交集+" & UBound(ArrFile, 2) + 1 & ".txt" For Output As #FS
- For i = 0 To UBound(ArrFile, 2)
- Print #FS, Trim(ArrFile(0, i))
- Next i
- Close #FS
- MsgBox "耗时:" & Timer - t
- End Sub
|
4楼 amulee |
再补充一种计数法,此法受内存限制。目前我的电脑只能处理6位数的文件,如果7位数就会溢出。此法从速度上比上述两种方法都要快很多,但是只能适用于数字的情形。- Sub 交集_计数法() '仅限于数字
- Dim ArrFile
- Dim i&, j&, FileCount%, FS%, k&, Temp$
- Dim ArrTxt, ArrTxtTemp, ArrCount() As Integer, LenT%
- Dim ArrJG
- Dim FSO As Object
- Dim Bln As Boolean
- On Error Resume Next
- '选择文件
- ReDim ArrFile(1 To 2, 1 To 1)
- If Range("B5") <> "" Then ChDir Range("B5").Value '文件路径
- '选择文件
- With Application.FileDialog(msoFileDialogOpen)
- .Filters.Add "文本文件", "*.txt", 1
- If .Show = -1 Then
- t = Timer
- FileCount = .SelectedItems.Count
- If FileCount < 2 Then
- MsgBox "至少选择两个以上文件"
- Exit Sub
- End If
- ReDim ArrFile(1 To 2, 1 To FileCount)
- For i = 1 To FileCount
- ArrFile(1, i) = .SelectedItems(i)
- ArrFile(2, i) = FileLen(ArrFile(1, i))
- Next i
- Else
- MsgBox "未选择任何文件"
- Exit Sub
- End If
- End With
- '输出文件名
- With Sheet1
- .Range("B3:J3").ClearContents
- .Range("B3").Resize(1, FileCount) = WorksheetFunction.Index(ArrFile, 1, 0)
- End With
- '读取所有文件
- Set FSO = CreateObject("Scripting.FileSystemObject")
- ReDim ArrTxt(1 To FileCount)
- For i = 1 To FileCount
- With FSO.OpenTextFile(ArrFile(1, i))
- ArrTxt(i) = Split(.ReadAll, vbCrLf)
- .Close
- End With
- Next i
- LenT = Len(ArrTxt(1)(0))
- ReDim ArrCount(1 To 10 ^ LenT - 1)
- Debug.Print Timer - t
- '遍历其他文件
- For i = 1 To FileCount
- '逐个比较
- For j = 0 To UBound(ArrTxt(i))
- k = Val(ArrTxt(i)(j))
- If i - ArrCount(k) = 1 Then ArrCount(k) = ArrCount(k) + 1
- Next j
- Next i
- '输出文件
- FS = FreeFile
- k = 0
- Open ThisWorkbook.Path & "\" & FS & ".txt" For Output As #FS
- For i = 1 To UBound(ArrCount)
- If ArrCount(i) = FileCount Then
- Print #FS, Format(i, String(LenT, "0"))
- k = k + 1
- End If
- Next i
- Close #FS
- Kill ThisWorkbook.Path & "\交集+" & k & ".txt"
- Name ThisWorkbook.Path & "\" & FS & ".txt" As ThisWorkbook.Path & "\交集+" & k & ".txt"
- MsgBox "耗时:" & Timer - t
- End Sub
|
5楼 lrlxxqxa |
精彩! |