Skip to content

Improve fromList for IntSet and IntMap #1137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions containers-tests/benchmarks/IntMap.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ import qualified Data.IntMap as M
import qualified Data.IntMap.Strict as MS
import qualified Data.IntSet as S
import Data.Maybe (fromMaybe)
import System.Random (StdGen, mkStdGen, randoms, randomRs)
import Data.Word (Word8)
import System.Random (StdGen, mkStdGen, randoms)
import Prelude hiding (lookup)

import Utils.Fold (foldBenchmarks, foldWithKeyBenchmarks)
Expand All @@ -23,7 +24,8 @@ main = do
m_random = M.fromList elems_random
s = S.fromList keys
s_random2 = S.fromList keys_random2
evaluate $ rnf [elems_asc, elems_random, elems_randomDups]
evaluate $
rnf [elems_asc, elems_random, elems_randomDups, elems_fromListWorstCase]
evaluate $ rnf [m, m', m'', m''', m'''']
evaluate $ rnf m_random
evaluate $ rnf [s, s_random2]
Expand Down Expand Up @@ -58,9 +60,17 @@ main = do
, bench "fromList:random" $ whnf M.fromList elems_random
, bench "fromList:random:fusion" $
whnf (\(n,g) -> M.fromList (take n (unitValues (randoms g)))) (bound,gen)
, bench "fromListWith:randomDups" $ whnf (M.fromListWith const) elems_randomDups
, bench "fromList:randomDups" $ whnf M.fromList elems_randomDups
, bench "fromList:randomDups:fusion" $
whnf
(\(n,g) -> M.fromList (take n (unitValues (map word8ToInt (randoms g)))))
(bound,gen)
, bench "fromListWith:randomDups" $ whnf (M.fromListWith seq) elems_randomDups
, bench "fromListWith:randomDups:fusion" $
whnf (\(n,g) -> M.fromListWith const (take n (unitValues (randomRs (0,255) g)))) (bound,gen)
whnf
(\(n,g) -> M.fromListWith seq (take n (unitValues (map word8ToInt (randoms g)))))
(bound,gen)
, bench "fromList:worstCase" $ whnf M.fromList elems_fromListWorstCase
, bench "fromAscList" $ whnf M.fromAscList elems_asc
, bench "fromAscList:fusion" $
whnf (\n -> M.fromAscList (unitValues [1..n])) bound
Expand Down Expand Up @@ -93,7 +103,17 @@ main = do
elems_random = take bound (unitValues (randoms gen))
elems_asc = unitValues [1..bound]
-- Random elements in a small range to produce duplicates
elems_randomDups = take bound (unitValues (randomRs (0,255) gen))
elems_randomDups = take bound (unitValues (map word8ToInt (randoms gen)))
-- Worst case for the current fromList algorithm. Consider removing this
-- test case if the algorithm changes.
elems_fromListWorstCase =
unitValues $
take bound $
concat
[ take 63 (iterate (*2) 1)
, take 63 (map negate (iterate (*2) 1))
, interleave [1..] (map negate [1..])
]

--------------------------------------------------------
!bound = 2^12
Expand Down Expand Up @@ -167,3 +187,6 @@ unitValues = map (flip (,) ())
gen, gen2 :: StdGen
gen = mkStdGen 42
gen2 = mkStdGen 90

word8ToInt :: Word8 -> Int
word8ToInt = fromIntegral
44 changes: 39 additions & 5 deletions containers-tests/benchmarks/IntSet.hs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ import qualified Data.IntSet as IS
import qualified Data.Set as S
import qualified Data.IntMap as IM
import qualified Data.Map.Strict as M
import System.Random (StdGen, mkStdGen, randoms)
import Data.Word (Word8)
import System.Random (StdGen, mkStdGen, randoms, randomRs)

import Utils.Fold (foldBenchmarks)

Expand All @@ -23,7 +24,14 @@ main = do
s_even = IS.fromAscList elems_even :: IS.IntSet
s_odd = IS.fromAscList elems_odd :: IS.IntSet
s_sparse = IS.fromAscList elems_sparse :: IS.IntSet
evaluate $ rnf [elems_asc, elems_asc_sparse, elems_random]
evaluate $
rnf
[ elems_asc
, elems_asc_sparse
, elems_random
, elems_randomDups
, elems_fromListWorstCase
]
evaluate $ rnf [s, s_even, s_odd, s_sparse]
defaultMain
[ bench "member" $ whnf (member elems) s
Expand All @@ -40,14 +48,18 @@ main = do
, bench "union" $ whnf (IS.union s_even) s_odd
, bench "difference" $ whnf (IS.difference s) s_even
, bench "intersection" $ whnf (IS.intersection s) s_even
, bench "fromList:asc" $ whnf IS.fromList elems_asc
, bench "fromList:asc" $ whnf fromListNoinline elems_asc
, bench "fromList:asc:fusion" $ whnf (\n -> IS.fromList [1..n]) bound
, bench "fromList:asc:sparse" $ whnf IS.fromList elems_asc_sparse
, bench "fromList:asc:sparse" $ whnf fromListNoinline elems_asc_sparse
, bench "fromList:asc:sparse:fusion" $
whnf (\n -> IS.fromList (map (*64) [1..n])) bound
, bench "fromList:random" $ whnf IS.fromList elems_random
, bench "fromList:random" $ whnf fromListNoinline elems_random
, bench "fromList:random:fusion" $
whnf (\(n,g) -> IS.fromList (take n (randoms g))) (bound,gen)
, bench "fromList:randomDups" $ whnf fromListNoinline elems_randomDups
, bench "fromList:randomDups:fusion" $
whnf (\(n,g) -> IS.fromList (take n (map word8ToInt (randoms g)))) (bound,gen)
, bench "fromList:worstCase" $ whnf fromListNoinline elems_fromListWorstCase
, bench "fromRange" $ whnf IS.fromRange (1,bound)
, bench "fromRange:small" $ whnf IS.fromRange (-1,0)
, bench "fromAscList" $ whnf fromAscListNoinline elems
Expand Down Expand Up @@ -86,6 +98,17 @@ main = do
elems_asc = elems
elems_asc_sparse = elems_sparse
elems_random = take bound (randoms gen)
-- Random elements in a small range to produce duplicates
elems_randomDups = take bound (map word8ToInt (randoms gen))
-- Worst case for the current fromList algorithm. Consider removing this
-- test case if the algorithm changes.
elems_fromListWorstCase =
take bound $
concat
[ take 63 (iterate (*2) 1)
, take 63 (map negate (iterate (*2) 1))
, interleave [1..] (map negate [1..])
]

member :: [Int] -> IS.IntSet -> Int
member xs s = foldl' (\n x -> if IS.member x s then n + 1 else n) 0 xs
Expand All @@ -102,9 +125,20 @@ fromAscListNoinline :: [Int] -> IS.IntSet
fromAscListNoinline = IS.fromAscList
{-# NOINLINE fromAscListNoinline #-}

fromListNoinline :: [Int] -> IS.IntSet
fromListNoinline = IS.fromList
{-# NOINLINE fromListNoinline #-}

interleave :: [a] -> [a] -> [a]
interleave [] ys = ys
interleave (x:xs) (y:ys) = x : y : interleave xs ys

gen :: StdGen
gen = mkStdGen 42

word8ToInt :: Word8 -> Int
word8ToInt = fromIntegral

-- | Automata contain just the transitions
type NFA = IM.IntMap (IM.IntMap IS.IntSet)
type DFA = IM.IntMap (M.Map IS.IntSet IS.IntSet)
Expand Down
14 changes: 7 additions & 7 deletions containers-tests/tests/intmap-properties.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1426,15 +1426,15 @@ prop_ascDescList :: [Int] -> Bool
prop_ascDescList xs = toAscList m == reverse (toDescList m)
where m = fromList $ zip xs $ repeat ()

prop_fromList :: [Int] -> Property
prop_fromList :: [(Int, A)] -> Property
prop_fromList xs
= case fromList (zip xs xs) of
= case fromList xs of
t -> valid t .&&.
t === fromAscList (zip sort_xs sort_xs) .&&.
t === fromDistinctAscList (zip nub_sort_xs nub_sort_xs) .&&.
t === List.foldr (uncurry insert) empty (zip xs xs)
where sort_xs = sort xs
nub_sort_xs = List.map List.head $ List.group sort_xs
t === fromAscList sort_xs .&&.
t === fromDistinctAscList nub_sort_xs .&&.
t === List.foldl' (\t' (k,x) -> insert k x t') empty xs
where sort_xs = List.sortBy (comparing fst) xs
nub_sort_xs = List.map NE.last $ NE.groupBy ((==) `on` fst) sort_xs

----------------------------------------------------------------

Expand Down
158 changes: 150 additions & 8 deletions containers/src/Data/IntMap/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,13 @@ module Data.IntMap.Internal (
, Stack(..)
, ascLinkTop
, ascLinkAll
, IntMapBuilder(..)
, BStack(..)
, emptyB
, insertB
, finishB
, moveToB
, MoveResult(..)

-- * Used by "IntMap.Merge.Lazy" and "IntMap.Merge.Strict"
, mapWhenMissing
Expand Down Expand Up @@ -3307,20 +3314,25 @@ foldlFB = foldlWithKey


-- | \(O(n \min(n,W))\). Create a map from a list of key\/value pairs.
-- If the list contains more than one value for the same key, the last value
-- for the key is retained.
--
-- If the keys are in sorted order, ascending or descending, this function
-- takes \(O(n)\) time.
--
-- > fromList [] == empty
-- > fromList [(5,"a"), (3,"b"), (5, "c")] == fromList [(5,"c"), (3,"b")]
-- > fromList [(5,"c"), (3,"b"), (5, "a")] == fromList [(5,"a"), (3,"b")]

fromList :: [(Key,a)] -> IntMap a
fromList xs
= Foldable.foldl' ins empty xs
where
ins t (k,x) = insert k x t
fromList xs = finishB (Foldable.foldl' (\b (kx,x) -> insertB kx x b) emptyB xs)
{-# INLINE fromList #-} -- Inline for list fusion

-- | \(O(n \min(n,W))\). Build a map from a list of key\/value pairs with a combining function. See also 'fromAscListWith'.
--
-- If the keys are in sorted order, ascending or descending, this function
-- takes \(O(n)\) time.
--
-- > fromListWith (++) [(5,"a"), (5,"b"), (3,"x"), (5,"c")] == fromList [(3, "x"), (5, "cba")]
-- > fromListWith (++) [] == empty
--
Expand Down Expand Up @@ -3362,17 +3374,18 @@ fromListWith f xs

-- | \(O(n \min(n,W))\). Build a map from a list of key\/value pairs with a combining function. See also fromAscListWithKey'.
--
-- If the keys are in sorted order, ascending or descending, this function
-- takes \(O(n)\) time.
--
-- > let f key new_value old_value = show key ++ ":" ++ new_value ++ "|" ++ old_value
-- > fromListWithKey f [(5,"a"), (5,"b"), (3,"b"), (3,"a"), (5,"c")] == fromList [(3, "3:a|b"), (5, "5:c|5:b|a")]
-- > fromListWithKey f [] == empty
--
-- Also see the performance note on 'fromListWith'.

fromListWithKey :: (Key -> a -> a -> a) -> [(Key,a)] -> IntMap a
fromListWithKey f xs
= Foldable.foldl' ins empty xs
where
ins t (k,x) = insertWithKey f k x t
fromListWithKey f xs =
finishB (Foldable.foldl' (\b (kx,x) -> insertWithB (f kx) kx x b) emptyB xs)
{-# INLINE fromListWithKey #-} -- Inline for list fusion

-- | \(O(n)\). Build a map from a list of key\/value pairs where
Expand Down Expand Up @@ -3477,6 +3490,135 @@ ascLinkStack stk !rk r = case stk of
where
p = mask rk m

{--------------------------------------------------------------------
IntMapBuilder
--------------------------------------------------------------------}

-- Note [IntMapBuilder]
-- ~~~~~~~~~~~~~~~~~~~~
-- IntMapBuilder serves as an accumulator for element-by-element construction
-- of an IntMap. It can be used in folds to construct IntMaps. This plays nicely
-- with list fusion when the structure folded over is a list, as in fromList and
-- friends.
--
-- An IntMapBuilder is either empty (BNil) or has the recently inserted Tip
-- together with a stack of trees (BTip). The structure is effectively a
-- [zipper](https://en.wikipedia.org/wiki/Zipper_(data_structure)). It always
-- has its "focus" at the last inserted entry. To insert a new entry, we need
-- to move the focus to the new entry. To do this we move up the stack to the
-- lowest common ancestor of the currest position and the position of the
-- new key (implemented as moveUpB), then down to the position of the new key
-- (implemented as moveDownB).
--
-- When we are done inserting entries, we link the trees up the stack and get
-- the final result.
--
-- The advantage of this implementation is that we take the shortest path in
-- the tree from one key to the next. Unlike `insert`, we don't need to move
-- up to the root after every insertion. This is very beneficial when we have
-- runs of sorted keys, without many keys already in the tree in that range.
-- If the keys are fully sorted, inserting them all takes O(n) time instead
-- of O(n min(n,W)). But these benefits come at a small cost: when moving up
-- the tree we have to check at every point if it is time to move down. These
-- checks are absent in `insert`. So, in case we need to move up quite a lot,
-- repeated `insert` is slightly faster, but the trade-off is worthwhile since
-- such cases are pathological.

data IntMapBuilder a
= BNil
| BTip {-# UNPACK #-} !Int a !(BStack a)

-- BLeft: the IntMap is the left child
-- BRight: the IntMap is the right child
data BStack a
= BNada
| BLeft {-# UNPACK #-} !Prefix !(IntMap a) !(BStack a)
| BRight {-# UNPACK #-} !Prefix !(IntMap a) !(BStack a)

-- Empty builder.
emptyB :: IntMapBuilder a
emptyB = BNil

-- Insert a key and value. Replaces the old value if one already exists for
-- the key.
insertB :: Key -> a -> IntMapBuilder a -> IntMapBuilder a
insertB !ky y b = case b of
BNil -> BTip ky y BNada
BTip kx x stk -> case moveToB ky kx x stk of
MoveResult _ stk' -> BTip ky y stk'
{-# INLINE insertB #-}

-- Insert a key and value. The new value is combined with the old value if one
-- already exists for the key.
insertWithB :: (a -> a -> a) -> Key -> a -> IntMapBuilder a -> IntMapBuilder a
insertWithB f !ky y b = case b of
BNil -> BTip ky y BNada
BTip kx x stk -> case moveToB ky kx x stk of
MoveResult m stk' -> case m of
Nothing -> BTip ky y stk'
Just x' -> BTip ky (f y x') stk'
{-# INLINE insertWithB #-}

-- GHC >=9.6 supports unpacking sums, so we unpack the Maybe and avoid
-- allocating Justs. GHC optimizes the workers for moveUpB and moveDownB to
-- return (# (# (# #) | a #), BStack a #).
data MoveResult a
= MoveResult
#if __GLASGOW_HASKELL__ >= 906
{-# UNPACK #-}
#endif
!(Maybe a)
!(BStack a)

moveToB :: Key -> Key -> a -> BStack a -> MoveResult a
moveToB ky kx x stk
| kx == ky = MoveResult (Just x) stk
| otherwise = moveUpB ky kx (Tip kx x) stk

moveUpB :: Key -> Key -> IntMap a -> BStack a -> MoveResult a
moveUpB !ky !kx !tx stk = case stk of
BNada -> MoveResult Nothing (linkB ky kx tx BNada)
BLeft p l stk'
| nomatch ky p -> moveUpB ky kx (Bin p l tx) stk'
| left ky p -> moveDownB ky l (BRight p tx stk')
| otherwise -> MoveResult Nothing (linkB ky kx tx stk)
BRight p r stk'
| nomatch ky p -> moveUpB ky kx (Bin p tx r) stk'
| left ky p -> MoveResult Nothing (linkB ky kx tx stk)
| otherwise -> moveDownB ky r (BLeft p tx stk')

moveDownB :: Key -> IntMap a -> BStack a -> MoveResult a
moveDownB !ky tx !stk = case tx of
Bin p l r
| nomatch ky p -> MoveResult Nothing (linkB ky (unPrefix p) tx stk)
| left ky p -> moveDownB ky l (BRight p r stk)
| otherwise -> moveDownB ky r (BLeft p l stk)
Tip kx x
| kx == ky -> MoveResult (Just x) stk
| otherwise -> MoveResult Nothing (linkB ky kx tx stk)
Nil -> error "moveDownB Tip"

linkB :: Key -> Key -> IntMap a -> BStack a -> BStack a
linkB ky kx tx stk
| i2w ky < i2w kx = BRight p tx stk
| otherwise = BLeft p tx stk
where
p = branchPrefix ky kx
{-# INLINE linkB #-}

-- Finalize the builder into a Map.
finishB :: IntMapBuilder a -> IntMap a
finishB b = case b of
BNil -> Nil
BTip kx x stk -> finishUpB (Tip kx x) stk
{-# INLINABLE finishB #-}

finishUpB :: IntMap a -> BStack a -> IntMap a
finishUpB !t stk = case stk of
BNada -> t
BLeft p l stk' -> finishUpB (Bin p l t) stk'
BRight p r stk' -> finishUpB (Bin p t r) stk'

{--------------------------------------------------------------------
Eq
--------------------------------------------------------------------}
Expand Down
Loading