
Commit c0951fc (v2.1)
1 parent: 4db4fdc
20 files changed: +796 -0 lines
Lines changed: 245 additions & 0 deletions
@@ -0,0 +1,245 @@
function Find-PSOneDuplicateFileFast
{
  <#
      .SYNOPSIS
      Identifies files with duplicate content and uses a partial hash for large files to speed up calculation

      .DESCRIPTION
      Returns a hashtable with the hashes that have at least two files (duplicates). Keys for large files that were
      hashed partially are suffixed with a "P".
      Large files with a partial hash can produce false positives: they may in fact differ even though their partial
      hashes match. To be absolutely sure, either calculate the full hash for these files yourself or add -TestPartialHash.
      Calculating a full hash for large files may take a very long time, though, so you may be better off using other
      strategies to identify duplicate file content, i.e. looking at identical creation times, etc.

      .EXAMPLE
      $Path = [Environment]::GetFolderPath('MyDocuments')
      Find-PSOneDuplicateFileFast -Path $Path
      Finds duplicate files in the user documents folder

      .EXAMPLE
      Find-PSOneDuplicateFileFast -Path c:\windows -Filter *.log
      Finds log files in the Windows folder with duplicate content

      .LINK
      https://powershell.one
  #>

  param
  (
    # Path of folder to recursively search
    [String]
    [Parameter(Mandatory)]
    $Path,

    # Filter to apply. Default is '*' (all files)
    [String]
    $Filter = '*',

    # When there are multiple files with the same partial hash,
    # they may still be different. When this switch is set,
    # full hashes are calculated, which may take a very long time
    # for large files and/or slow networks
    [switch]
    $TestPartialHash,

    # use partial hashes for files larger than this:
    [int64]
    $MaxFileSize = 100KB
  )

  # get a hashtable of all files of size greater than 0,
  # grouped by their length

  # ENUMERATE ALL FILES RECURSIVELY
  # call scriptblocks directly and pipe them together;
  # this is by far the fastest way and much faster than
  # using Foreach-Object:
  & {
    try
    {
      # try and use the fast API way of enumerating files recursively;
      # this FAILS whenever an "Access Denied" error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Fast Method'
      [IO.DirectoryInfo]::new($Path).GetFiles($Filter, 'AllDirectories')
    }
    catch
    {
      # use PowerShell's own (slow) way of enumerating files if any error occurs:
      Write-Progress -Activity 'Acquiring Files' -Status 'Falling Back to Slow Method'
      Get-ChildItem -Path $Path -Filter $Filter -File -Recurse -ErrorAction Ignore
    }
  } |
  # EXCLUDE EMPTY FILES:
  # use direct process blocks with IF (which is much faster than Where-Object):
  & {
    process
    {
      # if the file has content...
      if ($_.Length -gt 0)
      {
        # ...let it pass through:
        $_
      }
    }
  } |
  # GROUP FILES BY LENGTH, AND RETURN ONLY FILES WHERE THERE IS AT LEAST ONE
  # OTHER FILE WITH THE SAME SIZE
  # use direct scriptblocks with their own hashtable (which is much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}
    }

    process
    {
      # group files by their length
      # (use "Length" as the hashtable key)
      $file = $_
      $key = $file.Length.ToString()

      # if we see this key for the first time, create a generic
      # list to hold group items, and store FileInfo objects in this list
      # (specialized generic lists are faster than ArrayList):
      if ($hash.ContainsKey($key) -eq $false)
      {
        $hash[$key] = [Collections.Generic.List[System.IO.FileInfo]]::new()
      }
      # add the file to the appropriate hashtable key:
      $hash[$key].Add($file)
    }

    end
    {
      # return only the files from groups with at least two files
      # (if there is only one file with a given length, then it
      # cannot have any duplicates for sure):
      foreach ($pile in $hash.Values)
      {
        # are there at least 2 files in this pile?
        if ($pile.Count -gt 1)
        {
          # yes, add it to the candidates
          $pile
        }
      }
    }
  } |
  # CALCULATE THE NUMBER OF FILES TO HASH
  # collect all files and hand them over en bloc:
  & {
    end { ,@($input) }
  } |
  # GROUP FILES BY HASH, AND RETURN ONLY HASHES THAT HAVE AT LEAST TWO FILES:
  # use a direct scriptblock call with a hashtable (much faster than Group-Object):
  & {
    begin
    {
      # start with an empty hashtable
      $hash = @{}

      # since this is a lengthy procedure, a progress bar is in order;
      # keep a counter of processed files:
      $c = 0
    }

    process
    {
      $totalNumber = $_.Count
      foreach ($file in $_)
      {
        # advance the file counter
        $c++

        # update the progress bar every 20 files (and always for very large files):
        if ($c % 20 -eq 0 -or $file.Length -gt 100MB)
        {
          $percentComplete = $c * 100 / $totalNumber
          Write-Progress -Activity 'Hashing File Content' -Status $file.Name -PercentComplete $percentComplete
        }

        # use the file hash of this file PLUS the file length as a key for the hashtable;
        # use the fastest algorithm, SHA1, and use partial hashes for files larger than $MaxFileSize:
        $bufferSize = [Math]::Min(100KB, $MaxFileSize)
        $result = Get-PsOneFileHash -StartPosition 1KB -Length $MaxFileSize -BufferSize $bufferSize -AlgorithmName SHA1 -Path $file.FullName

        # add a "P" to partial hashes:
        if ($result.IsPartialHash)
        {
          $partialHash = 'P'
        }
        else
        {
          $partialHash = ''
        }

        $key = '{0}:{1}{2}' -f $result.Hash, $file.Length, $partialHash

        # if we see this key for the first time, add a generic list to this key:
        if ($hash.ContainsKey($key) -eq $false)
        {
          $hash.Add($key, [Collections.Generic.List[System.IO.FileInfo]]::new())
        }

        # add the file to the appropriate group:
        $hash[$key].Add($file)
      }
    }

    end
    {
      # remove all hashtable keys with only one file in them

      # do a detail check on partial hashes
      if ($TestPartialHash)
      {
        # first, CLONE the list of hashtable keys
        # (we cannot remove hashtable keys while enumerating the live
        # keys list):
        $keys = @($hash.Keys).Clone()
        $i = 0
        foreach ($key in $keys)
        {
          $i++
          $percentComplete = $i * 100 / $keys.Count
          if ($hash[$key].Count -gt 1 -and $key.EndsWith('P'))
          {
            foreach ($file in $hash[$key])
            {
              Write-Progress -Activity 'Hashing Full File Content' -Status $file.Name -PercentComplete $percentComplete
              $result = Get-FileHash -Path $file.FullName -Algorithm SHA1
              $newkey = '{0}:{1}' -f $result.Hash, $file.Length
              if ($hash.ContainsKey($newkey) -eq $false)
              {
                $hash.Add($newkey, [Collections.Generic.List[System.IO.FileInfo]]::new())
              }
              $hash[$newkey].Add($file)
            }
            $hash.Remove($key)
          }
        }
      }

      # enumerate all keys...
      $keys = @($hash.Keys).Clone()

      foreach ($key in $keys)
      {
        # ...if a key has only one file, remove it:
        if ($hash[$key].Count -eq 1)
        {
          $hash.Remove($key)
        }
      }

      # return the hashtable with only duplicate files left:
      $hash
    }
  }
}
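
A minimal usage sketch, assuming the function above has been dot-sourced into the current session; the sample path and the output formatting are illustrative only, while the key format (hash, length, optional trailing "P" for partial hashes) follows the description above:

# list every duplicate group and flag groups that were matched by partial hash only
$duplicates = Find-PSOneDuplicateFileFast -Path ([Environment]::GetFolderPath('MyDocuments'))

foreach ($key in $duplicates.Keys)
{
    # keys ending in "P" were grouped by a partial hash only and may be false positives:
    $isPartial = $key.EndsWith('P')

    '{0} files share key {1} (partial hash: {2})' -f $duplicates[$key].Count, $key, $isPartial
    $duplicates[$key] | ForEach-Object { '  {0}' -f $_.FullName }
}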
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
function Get-PSOneClipboardListenerStatus
{
  <#
      .SYNOPSIS
      Gets information from the background thread that monitors the clipboard

      .DESCRIPTION
      Outputs the current content of the shared hashtable that always reflects the current state of the clipboard monitor.
      This information can be used for debugging and to better understand how the clipboard monitor works.

      .EXAMPLE
      Get-PSOneClipboardListenerStatus
      Returns the current state of the clipboard monitor
  #>

  # take the script-global object and return its hashtable, and
  # select the status, possible exception messages, and the last text
  # that was read from the clipboard:
  $script:backgroundThread.Hash |
    Select-Object -Property Status, Error, Text
}
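
A short usage sketch, assuming the clipboard monitor has already been started so that $script:backgroundThread exists; the warning and output text are illustrative only:

# inspect the monitor state and surface any error it has recorded
$status = Get-PSOneClipboardListenerStatus

if ($status.Error)
{
    Write-Warning ('Clipboard listener reported an error: {0}' -f $status.Error)
}
else
{
    'Listener status: {0}; last clipboard text: {1}' -f $status.Status, $status.Text
}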