Skip to content

Commit 352591a

Browse files
authored
Fast inclusion operation on hashmaps and hashsets (#282)
* add fast subset operation * add explanation for subkey offset in lookupCont * rename subset operation for compatibility with containers API * update docs: union is not a least upper bound operator for `isSubmapOf`. * explain runtime complexity of isSubmapOf. * isSubmapOfBy: move `Empty` case to top * isSubmapOfBy: fix comments * isSubsetOf: add example * isSubmapOf: quickcheck test for compatibility with containers * isSubmapOf: use arbitrary instance of HashMap * isSubmapOf: fix comments again * isSubmapOf: update doc for runtime complexity * remove mathematical symbols from user doc * add difference subset quickcheck property * add `all` function for arrays * fix comments in `isSubmapOf` * fix wrong runtime complexity of set inclusion * delete unused property * fix error in `isSubmapOf` based on wrong assumption * add benchmarks * change a few recursive `isSubmap` cases to `False` * add strictness annotations * make isSubmapOf and isSubmapOfBy INLINABLE
1 parent f508e18 commit 352591a

File tree

9 files changed

+282
-29
lines changed

9 files changed

+282
-29
lines changed

Data/HashMap/Internal.hs

+133-8
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ module Data.HashMap.Internal
4949
, update
5050
, alter
5151
, alterF
52+
, isSubmapOf
53+
, isSubmapOfBy
5254

5355
-- * Combine
5456
-- ** Union
@@ -148,7 +150,7 @@ import qualified Data.Foldable as Foldable
148150
import Data.Bifoldable
149151
#endif
150152
import qualified Data.List as L
151-
import GHC.Exts ((==#), build, reallyUnsafePtrEquality#)
153+
import GHC.Exts ((==#), build, reallyUnsafePtrEquality#, inline)
152154
import Prelude hiding (filter, foldl, foldr, lookup, map, null, pred)
153155
import Text.Read hiding (step)
154156

@@ -590,12 +592,12 @@ lookup k m = case lookup# k m of
590592
{-# INLINE lookup #-}
591593

592594
lookup# :: (Eq k, Hashable k) => k -> HashMap k v -> (# (# #) | v #)
593-
lookup# k m = lookupCont (\_ -> (# (# #) | #)) (\v _i -> (# | v #)) (hash k) k m
595+
lookup# k m = lookupCont (\_ -> (# (# #) | #)) (\v _i -> (# | v #)) (hash k) k 0 m
594596
{-# INLINABLE lookup# #-}
595597

596598
#else
597599

598-
lookup k m = lookupCont (\_ -> Nothing) (\v _i -> Just v) (hash k) k m
600+
lookup k m = lookupCont (\_ -> Nothing) (\v _i -> Just v) (hash k) k 0 m
599601
{-# INLINABLE lookup #-}
600602
#endif
601603

@@ -614,7 +616,7 @@ lookup' h k m = case lookupRecordCollision# h k m of
614616
(# | (# a, _i #) #) -> Just a
615617
{-# INLINE lookup' #-}
616618
#else
617-
lookup' h k m = lookupCont (\_ -> Nothing) (\v _i -> Just v) h k m
619+
lookup' h k m = lookupCont (\_ -> Nothing) (\v _i -> Just v) h k 0 m
618620
{-# INLINABLE lookup' #-}
619621
#endif
620622

@@ -649,13 +651,13 @@ lookupRecordCollision h k m = case lookupRecordCollision# h k m of
649651
-- into lookupCont because inlining takes care of that.
650652
lookupRecordCollision# :: Eq k => Hash -> k -> HashMap k v -> (# (# #) | (# v, Int# #) #)
651653
lookupRecordCollision# h k m =
652-
lookupCont (\_ -> (# (# #) | #)) (\v (I# i) -> (# | (# v, i #) #)) h k m
654+
lookupCont (\_ -> (# (# #) | #)) (\v (I# i) -> (# | (# v, i #) #)) h k 0 m
653655
-- INLINABLE to specialize to the Eq instance.
654656
{-# INLINABLE lookupRecordCollision# #-}
655657

656658
#else /* GHC < 8.2 so there are no unboxed sums */
657659

658-
lookupRecordCollision h k m = lookupCont (\_ -> Absent) Present h k m
660+
lookupRecordCollision h k m = lookupCont (\_ -> Absent) Present h k 0 m
659661
{-# INLINABLE lookupRecordCollision #-}
660662
#endif
661663

@@ -667,6 +669,10 @@ lookupRecordCollision h k m = lookupCont (\_ -> Absent) Present h k m
667669
-- so we can be representation-polymorphic in the result type. Since
668670
-- this whole thing is always inlined, we don't have to worry about
669671
-- any extra CPS overhead.
672+
--
673+
-- The @Int@ argument is the offset of the subkey in the hash. When looking up
674+
-- keys at the top-level of a hashmap, the offset should be 0. When looking up
675+
-- keys at level @n@ of a hashmap, the offset should be @n * bitsPerSubkey@.
670676
lookupCont ::
671677
#if __GLASGOW_HASKELL__ >= 802
672678
forall rep (r :: TYPE rep) k v.
@@ -677,8 +683,10 @@ lookupCont ::
677683
=> ((# #) -> r) -- Absent continuation
678684
-> (v -> Int -> r) -- Present continuation
679685
-> Hash -- The hash of the key
680-
-> k -> HashMap k v -> r
681-
lookupCont absent present !h0 !k0 !m0 = go h0 k0 0 m0
686+
-> k
687+
-> Int -- The offset of the subkey in the hash.
688+
-> HashMap k v -> r
689+
lookupCont absent present !h0 !k0 !s0 !m0 = go h0 k0 s0 m0
682690
where
683691
go :: Eq k => Hash -> k -> Int -> HashMap k v -> r
684692
go !_ !_ !_ Empty = absent (# #)
@@ -1409,6 +1417,116 @@ alterFEager f !k m = (<$> f mv) $ \fres ->
14091417
{-# INLINABLE alterFEager #-}
14101418
#endif
14111419

1420+
-- | /O(n*log m)/ Inclusion of maps. A map is included in another map if the keys
1421+
-- are subsets and the corresponding values are equal:
1422+
--
1423+
-- > isSubmapOf m1 m2 = keys m1 `isSubsetOf` keys m2 &&
1424+
-- > and [ v1 == v2 | (k1,v1) <- toList m1; let v2 = m2 ! k1 ]
1425+
--
1426+
-- ==== __Examples__
1427+
--
1428+
-- >>> fromList [(1,'a')] `isSubmapOf` fromList [(1,'a'),(2,'b')]
1429+
-- True
1430+
--
1431+
-- >>> fromList [(1,'a'),(2,'b')] `isSubmapOf` fromList [(1,'a')]
1432+
-- False
1433+
isSubmapOf :: (Eq k, Hashable k, Eq v) => HashMap k v -> HashMap k v -> Bool
1434+
isSubmapOf = (inline isSubmapOfBy) (==)
1435+
{-# INLINABLE isSubmapOf #-}
1436+
1437+
-- | /O(n*log m)/ Inclusion of maps with value comparison. A map is included in
1438+
-- another map if the keys are subsets and if the comparison function is true
1439+
-- for the corresponding values:
1440+
--
1441+
-- > isSubmapOfBy cmpV m1 m2 = keys m1 `isSubsetOf` keys m2 &&
1442+
-- > and [ v1 `cmpV` v2 | (k1,v1) <- toList m1; let v2 = m2 ! k1 ]
1443+
--
1444+
-- ==== __Examples__
1445+
--
1446+
-- >>> isSubmapOfBy (<=) (fromList [(1,'a')]) (fromList [(1,'b'),(2,'c')])
1447+
-- True
1448+
--
1449+
-- >>> isSubmapOfBy (<=) (fromList [(1,'b')]) (fromList [(1,'a'),(2,'c')])
1450+
-- False
1451+
isSubmapOfBy :: (Eq k, Hashable k) => (v1 -> v2 -> Bool) -> HashMap k v1 -> HashMap k v2 -> Bool
1452+
-- For maps without collisions the complexity is O(n*log m), where n is the size
1453+
-- of m1 and m the size of m2: the inclusion operation visits every leaf in m1 at least once.
1454+
-- For each leaf in m1, it looks up the key in m2.
1455+
--
1456+
-- The worst case complexity is O(n*m). The worst case is when both hashmaps m1
1457+
-- and m2 are collision nodes for the same hash. Since collision nodes are
1458+
-- unsorted arrays, it requires for every key in m1 a linear search to to find a
1459+
-- matching key in m2, hence O(n*m).
1460+
isSubmapOfBy comp !m1 !m2 = go 0 m1 m2
1461+
where
1462+
-- An empty map is always a submap of any other map.
1463+
go _ Empty _ = True
1464+
1465+
-- If the second map is empty and the first is not, it cannot be a submap.
1466+
go _ _ Empty = False
1467+
1468+
-- If the first map contains only one entry, lookup the key in the second map.
1469+
go s (Leaf h1 (L k1 v1)) t2 = lookupCont (\_ -> False) (\v2 _ -> comp v1 v2) h1 k1 s t2
1470+
1471+
-- In this case, we need to check that for each x in ls1, there is a y in
1472+
-- ls2 such that x `comp` y. This is the worst case complexity-wise since it
1473+
-- requires a O(m*n) check.
1474+
go _ (Collision h1 ls1) (Collision h2 ls2) =
1475+
h1 == h2 && subsetArray comp ls1 ls2
1476+
1477+
-- In this case, we only need to check the entries in ls2 with the hash h1.
1478+
go s t1@(Collision h1 _) (BitmapIndexed b ls2)
1479+
| b .&. m == 0 = False
1480+
| otherwise =
1481+
go (s+bitsPerSubkey) t1 (A.index ls2 (sparseIndex b m))
1482+
where m = mask h1 s
1483+
1484+
-- Similar to the previous case we need to traverse l2 at the index for the hash h1.
1485+
go s t1@(Collision h1 _) (Full ls2) =
1486+
go (s+bitsPerSubkey) t1 (A.index ls2 (index h1 s))
1487+
1488+
-- In cases where the first and second map are BitmapIndexed or Full,
1489+
-- traverse down the tree at the appropriate indices.
1490+
go s (BitmapIndexed b1 ls1) (BitmapIndexed b2 ls2) =
1491+
submapBitmapIndexed (go (s+bitsPerSubkey)) b1 ls1 b2 ls2
1492+
go s (BitmapIndexed b1 ls1) (Full ls2) =
1493+
submapBitmapIndexed (go (s+bitsPerSubkey)) b1 ls1 fullNodeMask ls2
1494+
go s (Full ls1) (Full ls2) =
1495+
submapBitmapIndexed (go (s+bitsPerSubkey)) fullNodeMask ls1 fullNodeMask ls2
1496+
1497+
-- Collision and Full nodes always contain at least two entries. Hence it
1498+
-- cannot be a map of a leaf.
1499+
go _ (Collision {}) (Leaf {}) = False
1500+
go _ (BitmapIndexed {}) (Leaf {}) = False
1501+
go _ (Full {}) (Leaf {}) = False
1502+
go _ (BitmapIndexed {}) (Collision {}) = False
1503+
go _ (Full {}) (Collision {}) = False
1504+
go _ (Full {}) (BitmapIndexed {}) = False
1505+
{-# INLINABLE isSubmapOfBy #-}
1506+
1507+
-- | /O(min n m))/ Checks if a bitmap indexed node is a submap of another.
1508+
submapBitmapIndexed :: (HashMap k v1 -> HashMap k v2 -> Bool) -> Bitmap -> A.Array (HashMap k v1) -> Bitmap -> A.Array (HashMap k v2) -> Bool
1509+
submapBitmapIndexed comp !b1 !ary1 !b2 !ary2 = subsetBitmaps && go 0 0 (b1Orb2 .&. negate b1Orb2)
1510+
where
1511+
go :: Int -> Int -> Bitmap -> Bool
1512+
go !i !j !m
1513+
| m > b1Orb2 = True
1514+
1515+
-- In case a key is both in ary1 and ary2, check ary1[i] <= ary2[j] and
1516+
-- increment the indices i and j.
1517+
| b1Andb2 .&. m /= 0 = comp (A.index ary1 i) (A.index ary2 j) &&
1518+
go (i+1) (j+1) (m `unsafeShiftL` 1)
1519+
1520+
-- In case a key occurs in ary1, but not ary2, only increment index j.
1521+
| b2 .&. m /= 0 = go i (j+1) (m `unsafeShiftL` 1)
1522+
1523+
-- In case a key neither occurs in ary1 nor ary2, continue.
1524+
| otherwise = go i j (m `unsafeShiftL` 1)
1525+
1526+
b1Andb2 = b1 .&. b2
1527+
b1Orb2 = b1 .|. b2
1528+
subsetBitmaps = b1Orb2 == b2
1529+
{-# INLINABLE submapBitmapIndexed #-}
14121530

14131531
------------------------------------------------------------------------
14141532
-- * Combine
@@ -2076,6 +2194,13 @@ updateOrConcatWithKey f ary1 ary2 = A.run $ do
20762194
return mary
20772195
{-# INLINABLE updateOrConcatWithKey #-}
20782196

2197+
-- | /O(n*m)/ Check if the first array is a subset of the second array.
2198+
subsetArray :: Eq k => (v1 -> v2 -> Bool) -> A.Array (Leaf k v1) -> A.Array (Leaf k v2) -> Bool
2199+
subsetArray cmpV ary1 ary2 = A.length ary1 <= A.length ary2 && A.all inAry2 ary1
2200+
where
2201+
inAry2 (L k1 v1) = lookupInArrayCont (\_ -> False) (\v2 _ -> cmpV v1 v2) k1 ary2
2202+
{-# INLINE inAry2 #-}
2203+
20792204
------------------------------------------------------------------------
20802205
-- Manually unrolled loops
20812206

Data/HashMap/Internal/Array.hs

+8-2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ module Data.HashMap.Internal.Array
5959
, foldr
6060
, foldr'
6161
, foldMap
62+
, all
6263

6364
, thaw
6465
, map
@@ -79,9 +80,9 @@ import GHC.ST (ST(..))
7980
import Control.Monad.ST (stToIO)
8081

8182
#if __GLASGOW_HASKELL__ >= 709
82-
import Prelude hiding (filter, foldMap, foldr, foldl, length, map, read, traverse)
83+
import Prelude hiding (filter, foldMap, foldr, foldl, length, map, read, traverse, all)
8384
#else
84-
import Prelude hiding (filter, foldr, foldl, length, map, read)
85+
import Prelude hiding (filter, foldr, foldl, length, map, read, all)
8586
#endif
8687

8788
#if __GLASGOW_HASKELL__ >= 710
@@ -461,6 +462,11 @@ foldMap f = \ary0 -> case length ary0 of
461462
in go 0
462463
{-# INLINE foldMap #-}
463464

465+
-- | Verifies that a predicate holds for all elements of an array.
466+
all :: (a -> Bool) -> Array a -> Bool
467+
all p = foldr (\a acc -> p a && acc) True
468+
{-# INLINE all #-}
469+
464470
undefinedElem :: a
465471
undefinedElem = error "Data.HashMap.Internal.Array: Undefined element"
466472
{-# NOINLINE undefinedElem #-}

Data/HashMap/Internal/Strict.hs

+2
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ module Data.HashMap.Internal.Strict
6363
, update
6464
, alter
6565
, alterF
66+
, isSubmapOf
67+
, isSubmapOfBy
6668

6769
-- * Combine
6870
-- ** Union

Data/HashMap/Lazy.hs

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ module Data.HashMap.Lazy
4949
, update
5050
, alter
5151
, alterF
52+
, isSubmapOf
53+
, isSubmapOfBy
5254

5355
-- * Combine
5456
-- ** Union

Data/HashMap/Strict.hs

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ module Data.HashMap.Strict
4848
, update
4949
, alter
5050
, alterF
51+
, isSubmapOf
52+
, isSubmapOfBy
5153

5254
-- * Combine
5355
-- ** Union

Data/HashSet.hs

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ module Data.HashSet
111111
, member
112112
, insert
113113
, delete
114+
, isSubsetOf
114115

115116
-- * Transformations
116117
, map

Data/HashSet/Internal.hs

+13
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ module Data.HashSet.Internal
5555
, member
5656
, insert
5757
, delete
58+
, isSubsetOf
5859

5960
-- * Transformations
6061
, map
@@ -310,6 +311,18 @@ fromMap = HashSet
310311
keysSet :: HashMap k a -> HashSet k
311312
keysSet m = fromMap (() <$ m)
312313

314+
-- | /O(n*log m)/ Inclusion of sets.
315+
--
316+
-- ==== __Examples__
317+
--
318+
-- >>> fromList [1,3] `isSubsetOf` fromList [1,2,3]
319+
-- True
320+
--
321+
-- >>> fromList [1,2] `isSubsetOf` fromList [1,3]
322+
-- False
323+
isSubsetOf :: (Eq a, Hashable a) => HashSet a -> HashSet a -> Bool
324+
isSubsetOf s1 s2 = H.isSubmapOfBy (\_ _ -> True) (asMap s1) (asMap s2)
325+
313326
-- | /O(n+m)/ Construct a set containing all elements from both sets.
314327
--
315328
-- To obtain good performance, the smaller set must be presented as

0 commit comments

Comments
 (0)