function Find-PSOneDuplicateFileFast
{
  <#
      .SYNOPSIS
      Identifies files with duplicate content and uses a partial hash for large files to speed up calculation

      .DESCRIPTION
      Returns a hashtable with the hashes that have at least two files (duplicates). Keys of large files that were only partially hashed are suffixed with a "P".
      Large files with a partial hash can be false positives: they may in fact be different even though the partial hash is the same.
      You either need to calculate the full hash for these files to be absolutely sure, or add -TestPartialHash.
      Calculating a full hash for large files may take a very long time though, so you may be better off using other
      strategies to identify duplicate file content, i.e. looking at identical creation times, etc.

      .EXAMPLE
      $Path = [Environment]::GetFolderPath('MyDocuments')
      Find-PSOneDuplicateFileFast -Path $Path
      Find duplicate files in the user documents folder

      .EXAMPLE
      Find-PSOneDuplicateFileFast -Path c:\windows -Filter *.log
      Find log files in the Windows folder with duplicate content

      .LINK
      https://powershell.one
  #>

  param
  (
    # Path of folder to recursively search
    [String]
    [Parameter(Mandatory)]
    $Path,

    # Filter to apply. Default is '*' (all files)
    [String]
    $Filter = '*',

    # when there are multiple files with the same partial hash,
    # they may still be different. When this switch is set,
    # full hashes are calculated, which may take a very long time
    # for large files and/or slow networks
    [switch]
    $TestPartialHash,

    # use partial hashes for files larger than this:
    [int64]
    $MaxFileSize = 100KB
  )

  # get a hashtable of all files of size greater than 0,
  # grouped by their length


  # ENUMERATE ALL FILES RECURSIVELY
  # call scriptblocks directly and pipe them together
  # this is by far the fastest way and much faster than
  # using Foreach-Object:
  & {
    try
    {
      # try and use the fast API way of enumerating files recursively
      # this FAILS whenever there is an "Access Denied" error
      Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
      [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
      # use PowerShell's own (slow) way of enumerating files if any error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
      Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
  } |
  # EXCLUDE EMPTY FILES:
  # use direct process blocks with IF (which is much faster than Where-Object):
  & {
    process
    {
      # if the file has content...
      if ($_.Length -gt 0)
      {
        # let it pass through:
        $_
      }
    }
  } |
  # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
  # OTHER FILE WITH SAME SIZE
  # use direct scriptblocks with own hashtable (which is much faster than Group-Object)
  & {
    begin
    # start with an empty hashtable
    { $hash = @{} }

    process
    {
      # group files by their length
      # (use "length" as hashtable key)
      $file = $_
      $key = $file.Length.ToString()

      # if we see this key for the first time, create a generic
      # list to hold group items, and store FileInfo objects in this list
      # (specialized generic lists are faster than ArrayList):
      if ($hash.ContainsKey($key) -eq $false)
      {
        $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
      }
      # add file to appropriate hashtable key:
      $hash[$key].Add($file)
    }

    end
    {
      # return only the files from groups with at least two files
      # (if there is only one file with a given length, then it
      # cannot have any duplicates for sure):
      foreach ($pile in $hash.Values)
      {
        # are there at least 2 files in this pile?
        if ($pile.Count -gt 1)
        {
          # yes, add it to the candidates
          $pile
        }
      }
    }
  } |
  # CALCULATE THE NUMBER OF FILES TO HASH:
  # collect all files and hand them over en-bloc so the next stage
  # knows the total count. The leading comma wraps the collected array
  # in a one-element array, so the pipeline passes it on as a single object:
  & {
    end { ,@($input) }
  } |
  # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
  # use a direct scriptblock call with a hashtable (much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}

      # since this is a lengthy procedure, a progress bar is in order
      # keep a counter of processed files:
      $c = 0
    }

    process
    {
      $totalNumber = $_.Count
      foreach ($file in $_)
      {
        # update the counter of processed files
        $c++

        # update the progress bar every 20 files, and for any file larger than 100MB:
        if ($c % 20 -eq 0 -or $file.Length -gt 100MB)
        {
          $percentComplete = $c * 100 / $totalNumber
          Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
        }

        # use the file hash of this file PLUS the file length as a key to the hashtable
        # use the fastest algorithm (SHA1), and use partial hashes for files larger than $MaxFileSize:
        $bufferSize = [Math]::Min(100KB, $MaxFileSize)
        $result = Get-PsOneFileHash -StartPosition 1KB -Length $MaxFileSize -BufferSize $bufferSize -AlgorithmName SHA1 -Path $file.FullName

        # add a "P" to partial hashes:
        if ($result.IsPartialHash)
        {
          $partialHash = 'P'
        }
        else
        {
          $partialHash = ''
        }

        $key = '{0}:{1}{2}' -f $result.Hash, $file.Length, $partialHash

        # if we see this key for the first time, add a generic list to this key:
        if ($hash.ContainsKey($key) -eq $false)
        {
          $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
        }

        # add the file to the appropriate group:
        $hash[$key].Add($file)
      }
    }

    end
    {
      # remove all hashtable keys with only one file in them

      # do a detailed check on partial hashes
      if ($TestPartialHash)
      {
        # first, CLONE the list of hashtable keys
        # (we cannot remove hashtable keys while enumerating the live
        # keys list):
        $keys = @($hash.Keys).Clone()
        $i = 0
        foreach ($key in $keys)
        {
          $i++
          $percentComplete = $i * 100 / $keys.Count
          if ($hash[$key].Count -gt 1 -and $key.EndsWith('P'))
          {
            foreach ($file in $hash[$key])
            {
              Write-Progress -Activity 'Hashing Full File Content' -Status $file.Name -PercentComplete $percentComplete
              $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
              $newkey = '{0}:{1}' -f $result.Hash, $file.Length
              if ($hash.ContainsKey($newkey) -eq $false)
              {
                $hash.Add($newkey, [Collections.Generic.List[System.IO.FileInfo]]::new())
              }
              $hash[$newkey].Add($file)
            }
            $hash.Remove($key)
          }
        }
      }

      # enumerate all keys...
      $keys = @($hash.Keys).Clone()

      foreach ($key in $keys)
      {
        # ...if a key has only one file, remove it:
        if ($hash[$key].Count -eq 1)
        {
          $hash.Remove($key)
        }
      }

      # return the hashtable with only duplicate files left:
      $hash
    }
  }
}
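
# ------------------------------------------------------------------------
# Usage sketch: one way to consume the hashtable returned by
# Find-PSOneDuplicateFileFast. Keys have the form "<hash>:<length>"
# (with a trailing "P" for partial hashes); values are the lists of
# files sharing that key.
$Path = [Environment]::GetFolderPath('MyDocuments')
$duplicates = Find-PSOneDuplicateFileFast -Path $Path

foreach ($key in $duplicates.Keys)
{
  Write-Host "Duplicate group $key"
  foreach ($file in $duplicates[$key])
  {
    Write-Host ("  {0}" -f $file.FullName)
  }
}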
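
# ------------------------------------------------------------------------
# Optional follow-up sketch, assuming the $duplicates hashtable from above:
# if -TestPartialHash was not used, groups built from partial hashes (keys
# ending in "P") can be re-checked with a full SHA1 hash before trusting them.
foreach ($key in @($duplicates.Keys))
{
  if (-not $key.EndsWith('P')) { continue }

  # recompute the full hash for each candidate file and keep only
  # paths whose full hash is shared by at least two files:
  $duplicates[$key] |
    ForEach-Object { Get-FileHash -Path $_.FullName -Algorithm SHA1 } |
    Group-Object -Property Hash |
    Where-Object { $_.Count -gt 1 } |
    ForEach-Object { $_.Group.Path }
}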