function Find-PSOneDuplicateFile
{
  <#
      .SYNOPSIS
      Identifies files with duplicate content

      .DESCRIPTION
      Returns a hashtable with the hashes that have at least two files (duplicates)

      .EXAMPLE
      $Path = [Environment]::GetFolderPath('MyDocuments')
      Find-PSOneDuplicateFile -Path $Path
      Find duplicate files in the user documents folder

      .EXAMPLE
      Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
      find log files in the Windows folder with duplicate content

      .LINK
      https://powershell.one/tricks/filesystem/finding-duplicate-files
  #>


  param
  (
    # Path of folder to recursively search
    [String]
    [Parameter(Mandatory)]
    $Path,

    # Filter to apply. Default is '*' (all files)
    [String]
    $Filter = '*'
  )

  # get a hashtable of all files of size greater than 0
  # grouped by their length


  # ENUMERATE ALL FILES RECURSIVELY
  # call scriptblocks directly and pipe them together
  # this is by far the fastest way and much faster than
  # using ForEach-Object:
  & {
    try
    {
      # try and use the fast API way of enumerating files recursively
      # this FAILS whenever there are any "Access Denied" errors
      Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
      [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
      # use PowerShell's own (slow) way of enumerating files if any error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
      Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
  } |
  # EXCLUDE EMPTY FILES:
  # use direct process blocks with IF (which is much faster than Where-Object):
  & {
    process
    {
      # if the file has content...
      if ($_.Length -gt 0)
      {
        # let it pass through:
        $_
      }
    }
  } |
  # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
  # OTHER FILE WITH SAME SIZE
  # use direct scriptblocks with own hashtable (which is much faster than Group-Object)
  & {
    begin
    # start with an empty hashtable
    { $hash = @{} }

    process
    {
      # group files by their length
      # (use "length" as hashtable key)
      $file = $_
      $key = $file.Length.ToString()

      # if we see this key for the first time, create a generic
      # list to hold group items, and store FileInfo objects in this list
      # (specialized generic lists are faster than ArrayList):
      if ($hash.ContainsKey($key) -eq $false)
      {
        $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
      }
      # add file to appropriate hashtable key:
      $hash[$key].Add($file)
    }

    end
    {
      # return only the files from groups with at least two files
      # (if there is only one file with a given length, then it
      # cannot have any duplicates for sure):
      foreach ($pile in $hash.Values)
      {
        # are there at least 2 files in this pile?
        if ($pile.Count -gt 1)
        {
          # yes, add it to the candidates
          $pile
        }
      }
    }
  } |
  # CALCULATE THE NUMBER OF FILES TO HASH
  # collect all files and hand over en-bloc
  & {
    end { ,@($input) }
  } |
  # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
  # use a direct scriptblock call with a hashtable (much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}

      # since this is a lengthy procedure, a progress bar is in order
      # keep a counter of processed files:
      $c = 0
    }

    process
    {
      $totalNumber = $_.Count
      foreach ($file in $_)
      {

        # update the counter of processed files
        $c++

        # update progress bar every 20 files:
        if ($c % 20 -eq 0)
        {
          $percentComplete = $c * 100 / $totalNumber
          Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
        }

        # use the file hash of this file PLUS file length as a key to the hashtable
        # use the fastest algorithm SHA1
        $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
        $key = '{0}:{1}' -f $result.Hash, $file.Length

        # if we see this key for the first time, add a generic list to this key:
        if ($hash.ContainsKey($key) -eq $false)
        {
          $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
        }

        # add the file to the appropriate group:
        $hash[$key].Add($file)
      }
    }

    end
    {
      # remove all hashtable keys with only one file in them

      # first, CLONE the list of hashtable keys
      # (we cannot remove hashtable keys while enumerating the live
      # keys list):
      $keys = @($hash.Keys).Clone()

      # enumerate all keys...
      foreach ($key in $keys)
      {
        # ...if a key has only one file, remove it:
        if ($hash[$key].Count -eq 1)
        {
          $hash.Remove($key)
        }
      }

      # return the hashtable with only duplicate files left:
      $hash
    }
  }
}
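
# Usage sketch (an illustrative addition, not part of the original commit):
# the function emits a hashtable keyed by "SHA1hash:length", where each value is a
# list of FileInfo objects with identical content. The variable names below
# ($Path, $result, $key, $file) are examples only.
$Path = [Environment]::GetFolderPath('MyDocuments')
$result = Find-PSOneDuplicateFile -Path $Path

foreach ($key in $result.Keys)
{
  Write-Host ('Duplicate group {0}:' -f $key)
  foreach ($file in $result[$key])
  {
    Write-Host ('  {0}' -f $file.FullName)
  }
}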