.EXAMPLE
$Path = [Environment]::GetFolderPath('MyDocuments')
Find-PSOneDuplicateFile -Path $Path
Find duplicate files in the user documents folder

.EXAMPLE
Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
Find log files in the Windows folder with duplicate content

.LINK

#>

param
(
    # Path of the folder to search recursively
    [String]
    [Parameter(Mandatory)]
    $Path,

    # Filter to apply. Default is '*' (all files)
    [String]
    $Filter = '*'
)

# get a hashtable of all files with a size greater than 0,
# grouped by their length

# ENUMERATE ALL FILES RECURSIVELY
# call script blocks directly and pipe them together;
# this is by far the fastest way, and much faster than
# using ForEach-Object:
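# (for comparison only - a conventional pipeline for the same job might
#  look roughly like this; an illustrative sketch, not part of this function:
#    Get-ChildItem -Path $Path -Filter $Filter -File -Recurse |
#      Where-Object { $_.Length -gt 0 } |
#      Group-Object -Property Length
#  every Where-Object/ForEach-Object stage invokes a script block per
#  item, an overhead the direct script-block calls below avoid)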
& {
    try
    {
        # try to use the fast API way of enumerating files recursively;
        # this FAILS whenever there are any "Access Denied" errors:
        Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
        [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
        # fall back to PowerShell's own (slow) way of enumerating files
        # if any error occurs (this path honors $Filter, too):
        Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
        Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
} |
# EXCLUDE EMPTY FILES:
# use direct process blocks with IF (which is much faster than Where-Object):
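# (the conventional equivalent would be something like
#    ... | Where-Object { $_.Length -gt 0 } | ...
#  which invokes a script block for every single file)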
& {
    process
    {
        # only pass on files that actually have content:
        if ($_.Length -gt 0)
        {
            $_
        }
    }
} |
# GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
# OTHER FILE WITH THE SAME SIZE
# use direct script blocks with their own hashtable (much faster than Group-Object):
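# (the built-in equivalent would be roughly
#    ... | Group-Object -Property Length |
#          Where-Object Count -gt 1 |
#          ForEach-Object { $_.Group } | ...
#  Group-Object materializes GroupInfo wrapper objects for every group,
#  which is noticeably slower on large file sets)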
& {
    begin
    {
        # start with an empty hashtable:
        $hash = @{}
    }

    process
    {
        # group files by their length
        # (use "Length" as the hashtable key):
        $file = $_
        $key = $file.Length.ToString()

        # if we see this key for the first time, create a generic
        # list to hold the group items, and store FileInfo objects in it
        # (specialized generic lists are faster than ArrayList):
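        # (also, unlike ArrayList, List[T] needs no casts, and its Add()
        #  returns void rather than the new index, so no stray output
        #  leaks into the pipeline)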
        if ($hash.ContainsKey($key) -eq $false)
        {
            $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
        }
        # add the file to the appropriate hashtable key:
        $hash[$key].Add($file)
    }

    end
    {
        # return only the files from groups with at least two files
        # (if there is only one file with a given length, it
        # cannot have any duplicates):
        foreach($pile in $hash.Values)
        {
            # are there at least two files of this size?
            if ($pile.Count -gt 1)
            {
                # emit the whole group for content hashing:
                $pile
            }
        }
    }
} |
# CALCULATE THE NUMBER OF FILES TO HASH
# collect all files and hand them over en-bloc, as a single array:
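# (emitting the collected input with the unary comma operator keeps it
#  as ONE array object in the pipeline, so the next block can read its
#  .Count for the progress bar before looping)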
& {
    end { ,@($input) }
} |
# GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
# use a direct scriptblock call with a hashtable (much faster than Group-Object):
& {
    begin
    {
        # start with an empty hashtable
        $hash = @{}

        # since this is a lengthy procedure, a progress bar is in order;
        # keep a counter of processed files:
        $c = 0
    }

    process
    {
        $totalNumber = $_.Count
        foreach($file in $_)
        {
            # advance the file counter:
            $c++

            # update the progress bar every 20 files:
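            # (Write-Progress is comparatively expensive; calling it for
            #  every single file can dominate the total runtime)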
            if ($c % 20 -eq 0)
            {
                $percentComplete = $c * 100 / $totalNumber
                Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
            }

            # use the file hash of this file PLUS the file length as the hashtable key
            # use the fastest algorithm, SHA1:
            $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
            $key = '{0}:{1}' -f $result.Hash, $file.Length
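            # (SHA1 is used here for grouping, not for security; appending
            #  the file length to the key makes it even less likely that a
            #  hash collision pairs two different files)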

            # if we see this key for the first time, add a generic list to it:
            if ($hash.ContainsKey($key) -eq $false)
            {
                $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
            }

            # add the file to the appropriate group:
            $hash[$key].Add($file)
        }
    }

    end
    {
        # remove all hashtable keys with only one file in them:

        # first, CLONE the list of hashtable keys
        # (we cannot remove hashtable keys while enumerating the live
        # keys list):
        $keys = @($hash.Keys).Clone()
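        # (removing a key while enumerating the live .Keys collection would
        #  throw "Collection was modified; enumeration operation may not execute")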

        # enumerate all keys...
        foreach($key in $keys)
        {
            # ...and remove every key that has only one file attached:
            if ($hash[$key].Count -eq 1)
            {
                $hash.Remove($key)
            }
        }

        # return the hashtable with only duplicate files left:
        $hash
    }
}
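
# typical use (illustrative only; names are examples):
#   $duplicates = Find-PSOneDuplicateFile -Path $env:windir -Filter *.log
#   foreach($key in $duplicates.Keys)
#   {
#     # list the full paths in one group of identical files:
#     $duplicates[$key].FullName
#   }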