
Commit d6528a1

Version 1.7
added Find-PsOneDuplicateFile
1 parent 7734809

14 files changed: +189 −1

Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
function Find-PSOneDuplicateFile
{
  <#
      .SYNOPSIS
      Identifies files with duplicate content

      .DESCRIPTION
      Returns a hashtable with the hashes that have at least two files (duplicates)

      .EXAMPLE
      $Path = [Environment]::GetFolderPath('MyDocuments')
      Find-PSOneDuplicateFile -Path $Path
      Find duplicate files in the user documents folder

      .EXAMPLE
      Find-PSOneDuplicateFile -Path c:\windows -Filter *.log
      Find log files in the Windows folder with duplicate content

      .LINK
      https://powershell.one/tricks/filesystem/finding-duplicate-files
  #>

  param
  (
    # Path of folder to recursively search
    [String]
    [Parameter(Mandatory)]
    $Path,

    # Filter to apply. Default is '*' (all files)
    [String]
    $Filter = '*'
  )

  # get a hashtable of all files of size greater than 0,
  # grouped by their length

  # ENUMERATE ALL FILES RECURSIVELY
  # call scriptblocks directly and pipe them together
  # this is by far the fastest way and much faster than
  # using Foreach-Object:
  & {
    try
    {
      # try and use the fast API way of enumerating files recursively
      # this FAILS whenever there are any "Access Denied" errors
      Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
      [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
      # fall back to PowerShell's own (slow) way of enumerating files if any error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
      Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
  } |
  # EXCLUDE EMPTY FILES:
  # use direct process blocks with IF (which is much faster than Where-Object):
  & {
    process
    {
      # if the file has content...
      if ($_.Length -gt 0)
      {
        # ...let it pass through:
        $_
      }
    }
  } |
  # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
  # OTHER FILE WITH THE SAME SIZE
  # use direct scriptblocks with own hashtable (which is much faster than Group-Object):
  & {
    # start with an empty hashtable
    begin { $hash = @{} }

    process
    {
      # group files by their length
      # (use "Length" as the hashtable key)
      $file = $_
      $key = $file.Length.ToString()

      # if we see this key for the first time, create a generic
      # list to hold group items, and store FileInfo objects in this list
      # (specialized generic lists are faster than ArrayList):
      if ($hash.ContainsKey($key) -eq $false)
      {
        $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
      }
      # add file to appropriate hashtable key:
      $hash[$key].Add($file)
    }

    end
    {
      # return only the files from groups with at least two files
      # (if there is only one file with a given length, then it
      # cannot have any duplicates for sure):
      foreach($pile in $hash.Values)
      {
        # are there at least 2 files in this pile?
        if ($pile.Count -gt 1)
        {
          # yes, add them to the candidates
          $pile
        }
      }
    }
  } |
  # CALCULATE THE NUMBER OF FILES TO HASH
  # collect all files and hand them over en bloc
  & {
    end { ,@($input) }
  } |
  # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
  # use a direct scriptblock call with a hashtable (much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}

      # since this is a lengthy procedure, a progress bar is in order
      # keep a counter of processed files:
      $c = 0
    }

    process
    {
      $totalNumber = $_.Count
      foreach($file in $_)
      {
        # count processed files
        $c++

        # update the progress bar every 20 files:
        if ($c % 20 -eq 0)
        {
          $percentComplete = $c * 100 / $totalNumber
          Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
        }

        # use the file hash of this file PLUS the file length as a key to the hashtable
        # use the fastest algorithm, SHA1:
        $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
        $key = '{0}:{1}' -f $result.Hash, $file.Length

        # if we see this key for the first time, add a generic list to this key:
        if ($hash.ContainsKey($key) -eq $false)
        {
          $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
        }

        # add the file to the appropriate group:
        $hash[$key].Add($file)
      }
    }

    end
    {
      # remove all hashtable keys with only one file in them

      # first, CLONE the list of hashtable keys
      # (we cannot remove hashtable keys while enumerating the live
      # keys list):
      $keys = @($hash.Keys).Clone()

      # enumerate all keys...
      foreach($key in $keys)
      {
        # ...and if a key has only one file, remove it:
        if ($hash[$key].Count -eq 1)
        {
          $hash.Remove($key)
        }
      }

      # return the hashtable with only duplicate files left:
      $hash
    }
  }
}
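
For context, a minimal usage sketch (the path, filter, and output loop below are illustrative and not part of this commit): the function returns a hashtable whose keys are "hash:length" strings and whose values are lists of FileInfo objects with identical content.

# dot-source the script so Find-PSOneDuplicateFile is available,
# then scan a folder (path and filter are just examples):
$result = Find-PSOneDuplicateFile -Path $env:windir -Filter *.log

# list each group of identical files:
foreach ($key in $result.Keys)
{
  'Duplicates for {0}:' -f $key
  foreach ($file in $result[$key])
  {
    '  {0}' -f $file.FullName
  }
}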
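
The comments above claim that piping directly into scriptblocks with process blocks beats Foreach-Object; a rough way to check that claim yourself (a sketch, not part of the commit; absolute timings vary by machine and PowerShell version):

$data = 1..100000

# classic cmdlet-based pipeline:
(Measure-Command { $data | ForEach-Object { $_ * 2 } }).TotalMilliseconds

# direct scriptblock call with a process block, as used in Find-PSOneDuplicateFile:
(Measure-Command { $data | & { process { $_ * 2 } } }).TotalMilliseconds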
File renamed without changes.
Binary file not shown.
File renamed without changes.
File renamed without changes.
