diff --git a/.ghci b/.ghci new file mode 100644 index 0000000..b9f2a7d --- /dev/null +++ b/.ghci @@ -0,0 +1 @@ +:set -ilib diff --git a/.stylish-haskell.yaml b/.stylish-haskell.yaml index 56c1a1c..51a508f 100644 --- a/.stylish-haskell.yaml +++ b/.stylish-haskell.yaml @@ -15,6 +15,14 @@ steps: # # true. # add_language_pragma: true + # Align the right hand side of some elements. This is quite conservative + # and only applies to statements where each element occupies a single + # line. + - simple_align: + cases: true + top_level_patterns: true + records: true + # Import cleanup - imports: # There are different ways we can align names and lists. @@ -22,13 +30,83 @@ steps: # - global: Align the import names and import list throughout the entire # file. # + # - file: Like global, but don't add padding when there are no qualified + # imports in the file. + # # - group: Only align the imports per group (a group is formed by adjacent # import lines). # # - none: Do not perform any alignment. # # Default: global. - align: none + align: group + + # Folowing options affect only import list alignment. + # + # List align has following options: + # + # - after_alias: Import list is aligned with end of import including + # 'as' and 'hiding' keywords. + # + # > import qualified Data.List as List (concat, foldl, foldr, head, + # > init, last, length) + # + # - with_alias: Import list is aligned with start of alias or hiding. + # + # > import qualified Data.List as List (concat, foldl, foldr, head, + # > init, last, length) + # + # - new_line: Import list starts always on new line. + # + # > import qualified Data.List as List + # > (concat, foldl, foldr, head, init, last, length) + # + # Default: after_alias + list_align: after_alias + + # Long list align style takes effect when import is too long. This is + # determined by 'columns' setting. + # + # - inline: This option will put as much specs on same line as possible. + # + # - new_line: Import list will start on new line. + # + # - new_line_multiline: Import list will start on new line when it's + # short enough to fit to single line. Otherwise it'll be multiline. + # + # - multiline: One line per import list entry. + # Type with contructor list acts like single import. + # + # > import qualified Data.Map as M + # > ( empty + # > , singleton + # > , ... + # > , delete + # > ) + # + # Default: inline + long_list_align: inline + + # List padding determines indentation of import list on lines after import. + # This option affects 'list_align' and 'long_list_align'. + list_padding: 4 + + # Separate lists option affects formating of import list for type + # or class. The only difference is single space between type and list + # of constructors, selectors and class functions. + # + # - true: There is single space between Foldable type and list of it's + # functions. + # + # > import Data.Foldable (Foldable (fold, foldl, foldMap)) + # + # - false: There is no space between Foldable type and list of it's + # functions. + # + # > import Data.Foldable (Foldable(fold, foldl, foldMap)) + # + # Default: true + separate_lists: true # Language pragmas - language_pragmas: @@ -38,16 +116,26 @@ steps: # # - compact: A more compact style. # + # - compact_line: Similar to compact, but wrap each line with + # `{-#LANGUAGE #-}'. + # # Default: vertical. style: vertical + # Align affects alignment of closing pragma brackets. + # + # - true: Brackets are aligned in same collumn. + # + # - false: Brackets are not aligned together. There is only one space + # between actual import and closing bracket. + # + # Default: true + align: true + # stylish-haskell can detect redundancy of some language pragmas. If this # is set to true, it will remove those redundant pragmas. Default: true. remove_redundant: true - # Align the types in record declarations - - records: {} - # Replace tabs by spaces. This is disabled by default. - tabs: # Number of spaces to use for each tab. Default: 8, as specified by the @@ -61,6 +149,18 @@ steps: # to. Different steps take this into account. Default: 80. columns: 78 +# By default, line endings are converted according to the OS. You can override +# preferred format here. +# +# - native: Native newline format. CRLF on Windows, LF on other OSes. +# +# - lf: Convert to LF ("\n"). +# +# - crlf: Convert to CRLF ("\r\n"). +# +# Default: native. +newline: native + # Sometimes, language extensions are specified in a cabal file or from the # command line instead of using language pragmas in the file. stylish-haskell # needs to be aware of these, so it can parse the file correctly. diff --git a/.travis.yml b/.travis.yml index 370915f..a3d63b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,55 @@ +# Include Haskell tools language: haskell ghc: 7.8 + +# Use the docker infrastructure +sudo: false + +# Cache the installed Haskell stuff +cache: + directories: + - $HOME/.stack + notifications: email: on_success: change on_failure: change + +# The following enables several GHC versions to be tested; often it's enough to +# test only against the last release in a major GHC version. Feel free to omit +# lines listings versions you don't need/want testing for. +env: + global: + - PKG_CONFIG_PATH=$HOME/.stack/local/lib/pkgconfig + - LD_LIBRARY_PATH=$HOME/.stack/local/lib + - LD_RUN_PATH=$HOME/.stack/local/lib + - CFLAGS=-I$HOME/.stack/local/include + - LDFLAGS=-L$HOME/.stack/local/lib + matrix: + - STACK_RESOLVER=lts-3 + - STACK_RESOLVER=lts-5 + - STACK_RESOLVER=lts-6 + - STACK_RESOLVER=lts-7 + +# Download the latest stack command. before_install: - - cabal sandbox init + - travis_retry wget https://www.stackage.org/stack/linux-x86_64 + - tar -xvf linux-x86_64 + - mv stack-*-linux-x86_64/stack stack + +install: + - echo "$(ghc --version) [$(ghc --print-project-git-commit-id 2> /dev/null || echo '?')]" + +# Here starts the actual work to be performed for the package under test; any +# command which exits with a non-zero exit code causes the build to fail. +script: +# - cabal check +# - cabal sdist +# - export SRC=$(cabal info . | awk '{print $2;exit}') +# - tar -xzf "dist/$SRC.tar.gz" +# - cd "$SRC" + - travis_retry ./stack --no-terminal --resolver $STACK_RESOLVER setup + - travis_retry ./stack --no-terminal --resolver $STACK_RESOLVER install --only-snapshot -j4 --verbosity info + - ./stack --no-terminal --resolver $STACK_RESOLVER build + - ./stack --no-terminal --resolver $STACK_RESOLVER haddock --no-haddock-deps + - ./stack --no-terminal --resolver $STACK_RESOLVER test diff --git a/README.md b/README.md index 5c11c84..93eb342 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,19 @@ Rawr ==== -[![Build Status][3]][2] +[![Build status][travis-badge]][travis-link] -This is an implementation of the [roaring bitmaps][1] data structure in -Haskell. Roaring bitmaps is a compressed bitmap data structure which offers -better compression and performance than other compressed bitmaps in many -situations. +[*Rawr*][1] is a Haskell implementation of the [Roaring Bitmaps][2] +data structure and [serialisation format][3]. Roaring Bitmaps are a +compressed bitmap data structure offering better space and time +performance than other compressed bitmaps in many situations. -[1]: http://www.roaringbitmap.org/ -[2]: https://travis-ci.org/thsutton/rawr -[3]: https://travis-ci.org/thsutton/rawr.svg?branch=master +**Please note:** This is a work in progress and is not yet ready for +use. When complete it will be released on Hackage. + +[1]: https://github.com/thsutton/rawr +[2]: http://www.roaringbitmaps.org/ +[3]: https://github.com/RoaringBitmap/RoaringFormatSpec + +[travis-link]: https://travis-ci.org/thsutton/rawr +[travis-badge]: https://travis-ci.org/thsutton/rawr.svg?branch=master diff --git a/lib/Data/BitMap/Roaring/Chunk.hs b/lib/Data/BitMap/Roaring/Chunk.hs deleted file mode 100644 index 43d50b6..0000000 --- a/lib/Data/BitMap/Roaring/Chunk.hs +++ /dev/null @@ -1,113 +0,0 @@ -module Data.BitMap.Roaring.Chunk where - -import Control.Applicative -import Data.Bits -import Data.Monoid -import qualified Data.Vector.Unboxed as U -import Data.Word - -import Data.BitMap.Roaring.Utility - --- | A chunk representing the keys which share particular 16 high-order bits. --- --- Chunk with low density (i.e. no more than 4096 members) are represented as a --- sorted array of their low 16 bits. Chunks with high density (i.e. more than --- 4096 members) are represented by a bit vector. --- --- Both high and low density chunks include the high order bits shared by all --- entries in the chunk, and the cardinality of the chunk. -data Chunk - = LowDensity - { chunkIndex :: Word16 - , chunkCardinality :: Int - , chunkArray :: U.Vector Word16 - } - | HighDensity - { chunkIndex :: Word16 - , chunkCardinality :: Int - , chunkBits :: U.Vector Word64 - } - deriving (Eq,Show) - --- | 'Chunk's are ordered by their index. -instance Ord Chunk where - compare c1 c2 = compare (chunkIndex c1) (chunkIndex c2) - --- | Create a new chunk. -chunkNew :: Word16 -> Word16 -> Chunk -chunkNew i v = LowDensity i 1 (U.singleton v) - --- | Extract the 'Word32's stored in a 'Chunk'. -chunkToBits :: Chunk -> [Word32] -chunkToBits (LowDensity i _ a) = combineWord i <$> U.toList a -chunkToBits (HighDensity i _ a) = U.toList . U.concatMap f $ U.indexed a - where - f :: (Int, Word64) -> U.Vector Word32 - f (_p,_bs) = U.map (combineWord i) U.empty - - --- | Get a bit from a 'Chunk'. -chunkGet :: Word16 -> Chunk -> Bool -chunkGet v chunk = case chunk of - LowDensity _ _ a -> U.elem v a - HighDensity{} -> False -- TODO(thsutton) implement - --- | Set a bit in a chunk. --- --- TODO(thsutton) Promote LowDensity chunk when it rises above threshold. -chunkSet :: Word16 -> Chunk -> Chunk -chunkSet v chunk = case chunk of - LowDensity i c a -> LowDensity i c (setL v a) - HighDensity i c a -> HighDensity i c (setH v a) - where - setL :: Word16 -> U.Vector Word16 -> U.Vector Word16 - setL i a = uvInsert a i - setH :: Word16 -> U.Vector Word64 -> U.Vector Word64 - setH _ a = a -- TODO(thsutton) implement - --- | Clear a bit in a 'Chunk'. --- --- TODO(thsutton) Demote HighDensity chunk when it falls below threshold. -chunkClear :: Word16 -> Chunk -> Chunk -chunkClear v chunk = case chunk of - LowDensity i _ a -> - let a' = clearL v a - c' = U.length a' - in LowDensity i c' a' - HighDensity i _ a -> - let a' = clearH v a - c' = U.sum $ U.map popCount a' - in HighDensity i c' a' - where - clearL :: Word16 -> U.Vector Word16 -> U.Vector Word16 - clearL i a = uvDelete a i - clearH :: Word16 -> U.Vector Word64 -> U.Vector Word64 - clearH _ a = a -- TODO(thsutton) implement - --- | Take the union of two 'Chunk's, raising an 'error' if they do not share an --- index. -mergeChunks :: Chunk -> Chunk -> Chunk -mergeChunks c1 c2 = - if chunkIndex c1 == chunkIndex c2 - then merge c1 c2 - else error "Attempting to merge incompatible chunks!" - where - aPop :: U.Vector Word64 -> Int - aPop = U.sum . U.map popCount - aSet :: Word16 -> U.Vector Word64 -> U.Vector Word64 - aSet _i v = v - packA :: U.Vector Word16 -> U.Vector Word64 - packA _ = mempty - merge (HighDensity i _ a1) (HighDensity _ _ a2) = - let a' = U.zipWith (.|.) a1 a2 in HighDensity i (aPop a') a' - merge (HighDensity i _ ah) (LowDensity _ _ al) = - let a' = U.foldr' aSet ah al in HighDensity i (aPop a') a' - merge (LowDensity i _ al) (HighDensity _ _ ah) = - let a' = U.foldr' aSet ah al in HighDensity i (aPop a') a' - merge (LowDensity i _ a1) (LowDensity _ _ a2) = - let a' = vMerge a1 a2 - n' = U.length a' - -- TODO(thsutton): Is this eager enough? - in if n' <= 4096 - then LowDensity i n' a' - else HighDensity i n' (packA a') diff --git a/LICENSE b/rawr-format/LICENSE similarity index 97% rename from LICENSE rename to rawr-format/LICENSE index d80172e..f284b4b 100644 --- a/LICENSE +++ b/rawr-format/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2015, Thomas Sutton +Copyright Thomas Sutton (c) 2017 All rights reserved. diff --git a/rawr-format/README.md b/rawr-format/README.md new file mode 100644 index 0000000..a22cc20 --- /dev/null +++ b/rawr-format/README.md @@ -0,0 +1,20 @@ +Rawr Format +=========== + +[![Build status][travis-badge]][travis-link] + +[*Rawr Format*][1] is a Haskell implementation of the Roaring Bitmaps +[serialisation format][3]. Roaring Bitmaps are a compressed bitmap +data structure offering better space and time performance than other +compressed bitmaps in many situations. This library implements the +interoperable serialisation format supported by many implementations. + +**Please note:** This is a work in progress and is not yet ready for +use. When complete it will be released on Hackage. + +[1]: https://github.com/thsutton/rawr +[2]: http://www.roaringbitmaps.org/ +[3]: https://github.com/RoaringBitmap/RoaringFormatSpec + +[travis-link]: https://travis-ci.org/thsutton/rawr +[travis-badge]: https://travis-ci.org/thsutton/rawr.svg?branch=master diff --git a/Setup.hs b/rawr-format/Setup.hs similarity index 100% rename from Setup.hs rename to rawr-format/Setup.hs diff --git a/rawr-format/lib/Lib.hs b/rawr-format/lib/Lib.hs new file mode 100644 index 0000000..d36ff27 --- /dev/null +++ b/rawr-format/lib/Lib.hs @@ -0,0 +1,6 @@ +module Lib + ( someFunc + ) where + +someFunc :: IO () +someFunc = putStrLn "someFunc" diff --git a/rawr-format/rawr-format.cabal b/rawr-format/rawr-format.cabal new file mode 100644 index 0000000..0c54cc5 --- /dev/null +++ b/rawr-format/rawr-format.cabal @@ -0,0 +1,53 @@ +name: rawr-format +version: 0.1.0.0 +synopsis: Interoperable Roaring Bitmaps serialisation for rawr +description: Please see README.md +homepage: https://github.com/thsutton/rawr#readme +license: BSD3 +license-file: LICENSE +author: Thomas Sutton +maintainer: me@thomas-sutton.id.au +copyright: Copyright: (c) 2016 Thomas Sutton +category: Web +build-type: Simple +extra-source-files: README.md +cabal-version: >=1.10 + +source-repository head + type: git + location: https://github.com/thsutton/rawr + +library + default-language: Haskell2010 + hs-source-dirs: lib + exposed-modules: Lib + build-depends: base >= 4.5 && < 5 + , rawr + +test-suite doctests + default-language: Haskell2010 + hs-source-dirs: test + type: exitcode-stdio-1.0 + ghc-options: -threaded + main-is: doctests.hs + build-depends: base + , QuickCheck + , doctest >= 0.9 + +test-suite zzz-check-hlint + default-language: Haskell2010 + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: check-hlint.hs + build-depends: base + , hlint + +test-suite rawr-format-test + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: Spec.hs + build-depends: base + , rawr-format + ghc-options: -threaded -rtsopts -with-rtsopts=-N + default-language: Haskell2010 + diff --git a/rawr-format/test/Spec.hs b/rawr-format/test/Spec.hs new file mode 100644 index 0000000..cd4753f --- /dev/null +++ b/rawr-format/test/Spec.hs @@ -0,0 +1,2 @@ +main :: IO () +main = putStrLn "Test suite not yet implemented" diff --git a/rawr-format/test/check-hlint.hs b/rawr-format/test/check-hlint.hs new file mode 100644 index 0000000..b1b488e --- /dev/null +++ b/rawr-format/test/check-hlint.hs @@ -0,0 +1,17 @@ +module Main (main) where + +import Language.Haskell.HLint (hlint) +import System.Exit (exitFailure, exitSuccess) + +arguments :: [String] +arguments = + [ "lib" + , "test" + ] + +main :: IO () +main = do + hints <- hlint arguments + if null hints + then exitSuccess + else exitFailure diff --git a/rawr-format/test/doctests.hs b/rawr-format/test/doctests.hs new file mode 100644 index 0000000..bd8b9ee --- /dev/null +++ b/rawr-format/test/doctests.hs @@ -0,0 +1,4 @@ +import Test.DocTest + +main :: IO () +main = doctest ["-ilib", "lib"] diff --git a/test/properties.hs b/rawr-format/test/properties.hs similarity index 61% rename from test/properties.hs rename to rawr-format/test/properties.hs index 4bb73d4..2f7ddbd 100644 --- a/test/properties.hs +++ b/rawr-format/test/properties.hs @@ -2,15 +2,16 @@ module Main where -import Control.Monad -import Data.List -import Data.Monoid -import qualified Data.Set as S -import Data.Word -import System.Exit -import Test.QuickCheck - -import qualified Data.BitMap.Roaring as R +import Control.Monad +import Data.List +import Data.Monoid +import qualified Data.Set as S +import Data.Word +import System.Exit +import Test.QuickCheck + +import qualified Data.BitMap.Roaring as R +import qualified Data.BitMap.Roaring.Chunk as C import qualified Data.BitMap.Roaring.Utility as R -- * Check utility functions @@ -40,6 +41,10 @@ prop_size_empty = 0 == R.size R.empty prop_size_singleton :: Word32 -> Bool prop_size_singleton i = 1 == R.size (R.singleton i) +-- | Sets have size of list length. +prop_size_fromList :: NonEmptyList Word32 -> Bool +prop_size_fromList (NonEmpty is) = length (nub is) == R.size (R.fromList is) + -- | Singletons have size 1, then size 0 when deleted. prop_size_delete_singleton :: Word32 -> Bool prop_size_delete_singleton i = @@ -55,17 +60,17 @@ prop_null_delete_singleton i = -- | 'toAscList' produces sorted lists. prop_toAscList_sorted :: NonEmptyList Word32 -> Bool prop_toAscList_sorted (NonEmpty l) = - let l' = S.toAscList (S.fromList l) + let l' = R.toAscList (R.fromList l) in l' == sort l' -- | 'toDescList' produces sorted lists. prop_toDescList_sorted :: NonEmptyList Word32 -> Bool prop_toDescList_sorted (NonEmpty l) = - let l' = S.toDescList (S.fromList l) + let l' = R.toDescList (R.fromList l) in l' == sortBy (flip compare) l' --- | "Data.IntSet" and "Data.BitMap.Roaring" agree about a set when building --- from the same list of inputs. +-- | "Data.Set" and "Data.BitMap.Roaring" agree about a set when +-- building from the same list of inputs. prop_intset_roaring_agree :: NonEmptyList Word32 -> Bool prop_intset_roaring_agree (NonEmpty l) = let r = R.toAscList $ R.fromList l @@ -78,13 +83,31 @@ prop_map_elem_fromList (NonEmpty l) = let r = R.fromList l in all (`R.member` r) l --- | union s1 s2 == fromList (toList s1 <> toList s2) +-- | union (fromList s1) (fromList s2) == fromList (s1 <> s2) prop_union_fromList :: NonEmptyList Word32 -> NonEmptyList Word32 -> Bool prop_union_fromList (NonEmpty as) (NonEmpty bs) = let q = R.fromList as r = R.fromList bs qr = R.fromList (as <> bs) - in (R.toAscList qr == R.toAscList (q `R.union` r)) + in R.toAscList qr == R.toAscList (q `R.union` r) + +prop_intersection_fromList :: NonEmptyList Word32 -> NonEmptyList Word32 -> Bool +prop_intersection_fromList (NonEmpty al) (NonEmpty bl) = + let am = R.fromList al + bm = R.fromList bl + im = R.intersection am bm + as = S.fromList al + bs = S.fromList bl + is = S.intersection as bs + in R.toList im == S.toList is + +prop_ld_chunk_intersection :: NonEmptyList Word16 -> NonEmptyList Word16 -> Bool +prop_ld_chunk_intersection (NonEmpty al) (NonEmpty bl) = + let is = S.intersection (S.fromList al) (S.fromList bl) + cc = C.chunkClear 0 (C.chunkNew 0 0) + fromList = foldl' (flip C.chunkSet) cc + ms = C.intersection (fromList al) (fromList bl) + in C.toList ms == (fromIntegral <$> S.toList is) -- -- Use Template Haskell to automatically run all of the properties above. diff --git a/rawr.cabal b/rawr.cabal deleted file mode 100644 index 674fd6c..0000000 --- a/rawr.cabal +++ /dev/null @@ -1,43 +0,0 @@ -name: rawr -version: 0.1.0.0 -synopsis: Roaring Bitmaps compressed bitmap data-structure. -description: Roaring Bitmaps compressed bitmap data-structure. -homepage: https://github.com/thsutton/rawr/ -license: BSD3 -license-file: LICENSE -author: Thomas Sutton -maintainer: me@thomas-sutton.id.au --- copyright: -category: Data -build-type: Simple -extra-source-files: README.md -cabal-version: >=1.10 - -source-repository HEAD - type: git - location: https://github.com/thsutton/rawr.git - -library - default-language: Haskell2010 - hs-source-dirs: lib - exposed-modules: - Data.BitMap.Roaring - Data.BitMap.Roaring.Chunk - Data.BitMap.Roaring.Utility - build-depends: - base >=4.7 && <4.8 - , convertible - , containers - , vector - , vector-algorithms - -test-suite properties - type: exitcode-stdio-1.0 - default-language: Haskell2010 - hs-source-dirs: test - main-is: properties.hs - build-depends: - base - , QuickCheck - , containers - , rawr diff --git a/HLint.hs b/rawr/HLint.hs similarity index 93% rename from HLint.hs rename to rawr/HLint.hs index 3ab0a2e..83f8b90 100644 --- a/HLint.hs +++ b/rawr/HLint.hs @@ -9,3 +9,4 @@ import "hint" HLint.Generalise ignore "Use if" ignore "Use liftM" +ignore "Use &&&" diff --git a/rawr/LICENSE b/rawr/LICENSE new file mode 100644 index 0000000..f284b4b --- /dev/null +++ b/rawr/LICENSE @@ -0,0 +1,30 @@ +Copyright Thomas Sutton (c) 2017 + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Thomas Sutton nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/rawr/README.md b/rawr/README.md new file mode 100644 index 0000000..65d95d2 --- /dev/null +++ b/rawr/README.md @@ -0,0 +1,42 @@ +Rawr +==== + +[![Build status][travis-badge]][travis-link] + +[*Rawr*][1] is a Haskell implementation of the [Roaring Bitmaps][2] +data structure. Roaring Bitmaps are a compressed bitmap data structure +offering better space and time performance than other compressed +bitmaps in many situations. + +For more information about *rawr* see the [documentation][3] or refer +to the [Roaring Bitmaps][2] web-site for other implementations and +publications about the data structure. + +**Please note:** This is a work in progress and is not yet ready for +use. When complete it will be released on Hackage. + +Structure +--------- + +The *Roaring Bitmap* structure divides the 32-bit keys into two 16-bit +values. One, the high order bits, identifies a *chunk* within the +map and the other, the low order bits, identifies a *bit* within +the chunk. + +There are two chunk representations: + +1. A sparse chunk contains a `Word16` for each *bit* present in the + chunk. + +2. A dense chunk contains 4096 `Word16`s which contains exactly one + bit for every possible *bit* which can be present in the chunk. + +The structure will convert the representation of each chunk as *bit*s +are set and cleared from the map. + +[1]: https://github.com/thsutton/rawr +[2]: http://www.roaringbitmaps.org/ +[3]: https://hackage.haskell.org/package/rawr/docs/Data-BitMap-Roaring.html + +[travis-link]: https://travis-ci.org/thsutton/rawr +[travis-badge]: https://travis-ci.org/thsutton/rawr.svg?branch=master diff --git a/rawr/Setup.hs b/rawr/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/rawr/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/lib/Data/BitMap/Roaring.hs b/rawr/lib/Data/BitMap/Roaring.hs similarity index 57% rename from lib/Data/BitMap/Roaring.hs rename to rawr/lib/Data/BitMap/Roaring.hs index 022ed71..7bb26d8 100644 --- a/lib/Data/BitMap/Roaring.hs +++ b/rawr/lib/Data/BitMap/Roaring.hs @@ -1,23 +1,23 @@ -- | --- Module: Data.BitMap.Roaring +-- Module: Data.BitMap.Roaring -- Description: Compressed bitmap data structure with good performance. --- Copyright: (c) Thomas Sutton 2015 --- License: BSD3 --- Maintainer: me@thomas-sutton.id.au --- Stability: experimental +-- Copyright: (c) Thomas Sutton 2015 +-- License: BSD3 +-- Maintainer: me@thomas-sutton.id.au +-- Stability: experimental -- -- A compressed bitmaps with good space and time performance. -- -- These modules are intended to be imported qualified, to avoid name clashes -- with Prelude functions, e.g. -- --- > import Data.BitMap.Roaring (BitMap) +-- > import Data.BitMap.Roaring (BitMap) -- > import qualified Data.BitMap.Roaring as Roaring -- --- The implementation paritions values into chunks based on their high 16 bits. --- Chunks are represented differently based on their density: low-density --- chunks are stored as packed arrays of the low-order bits while high-density --- chunks are stored as bit vectors. +-- The implementation partitions values into chunks based on their +-- high 16 bits. Chunks are represented according to their density: +-- low-density chunks are stored as packed arrays of the low-order +-- bits while high-density chunks are stored as bit vectors. -- -- * Samy Chambi, Daniel Lemire, Owen Kaser, Robert Godin, -- \"/Better bitmap performance with Roaring bitmaps/\", Software: Practice @@ -25,20 +25,47 @@ -- module Data.BitMap.Roaring where -import Data.Monoid -import Data.Vector (Vector) +import Data.Bits +import Data.Monoid +import Data.Vector (Vector) import qualified Data.Vector as V -import Data.Word +import Data.Word -import Data.BitMap.Roaring.Chunk -import Data.BitMap.Roaring.Utility +import Data.BitMap.Roaring.Chunk (Chunk) +import qualified Data.BitMap.Roaring.Chunk as C +import Data.BitMap.Roaring.Utility -- | A set of bits. data BitMap = BitMap (Vector Chunk) - deriving (Show) + deriving (Show, Eq) type Key = Word32 +instance Bits BitMap where + bitSize _ = 2^32 + bitSizeMaybe _ = Just (2^32) + isSigned _ = False + + (.&.) = intersection + (.|.) = union + xor = const -- TODO + complement a = a + + shift x i = x + rotate x i = x + + zeroBits = BitMap V.empty + bit i = singleton (fromIntegral i) + + popCount x = 0 + testBit x i = False + setBit x i = x + clearBit x i = x + complementBit x i = x + +instance FiniteBits BitMap where + finiteBitSize _ = 2^32 + -- * Query -- | /O(1)./ Is the set empty? @@ -47,15 +74,15 @@ null (BitMap v) = V.null v -- | Cardinality of the set. size :: BitMap -> Int -size (BitMap cs) = V.sum $ V.map chunkCardinality cs +size (BitMap cs) = V.foldl' (\s c-> s + popCount c) 0 cs -- | Is the value a member of the set? member :: Key -> BitMap -> Bool member k (BitMap cs) = let (i,b) = splitWord k - in case vLookup (\c -> i == chunkIndex c) cs of - Nothing -> False - Just (_,c) -> chunkGet b c + in case vLookup (\c -> i == C.chunkIndex c) cs of + Nothing -> False + Just (_, c) -> C.chunkCheck b c -- | Is this a subset? -- @(s1 `isSubsetOf` s2)@ tells whether @s1@ is a subset of @s2. @@ -88,8 +115,8 @@ singleton k = insert k empty insert :: Key -> BitMap -> BitMap insert k (BitMap v) = let (i,b) = splitWord k - f = Just . maybe (chunkNew i b) (chunkSet b) - v' = vAlter f (\c -> i == chunkIndex c) v + f = Just . maybe (C.chunkNew i b) (C.chunkSet b) + v' = vAlter f (\c -> i == C.chunkIndex c) v in BitMap v' -- | Delete a value in the set. @@ -98,13 +125,13 @@ insert k (BitMap v) = delete :: Key -> BitMap -> BitMap delete k (BitMap v) = let (i,b) = splitWord k - v' = vAlter (f b) (\c -> i == chunkIndex c) v + v' = vAlter (f b) (\c -> i == C.chunkIndex c) v in BitMap v' where f _ Nothing = Nothing f b (Just c) = - let c' = chunkClear b c - in if 0 == chunkCardinality c' + let c' = C.chunkClear b c + in if 0 == popCount c' then Nothing else Just c' @@ -112,20 +139,33 @@ delete k (BitMap v) = -- | The union of two sets. union :: BitMap -> BitMap -> BitMap -union (BitMap cs) (BitMap ds) = BitMap $ mergeWith f cs ds +union (BitMap cs) (BitMap ds) = + BitMap (vMergeWith merge cs ds) where - f :: Maybe Chunk -> Maybe Chunk -> Maybe Chunk - f Nothing b = b - f a Nothing = a - f (Just a) (Just b) = Just $ mergeChunks a b - --- | The difference between two sets. -difference :: BitMap -> BitMap -> BitMap -difference _ _ = empty + merge :: Maybe Chunk -> Maybe Chunk -> Maybe Chunk + merge (Just a) (Just b) = + let c = a `C.union` b + in if C.null c then Nothing else Just c + merge (Just a) Nothing = Just a + merge Nothing (Just b) = Just b + merge Nothing Nothing = Nothing -- | The intersection of two sets. intersection :: BitMap -> BitMap -> BitMap -intersection _ _ = empty +intersection (BitMap as) (BitMap bs) = + BitMap (vMergeWith merge as bs) + where + merge (Just a) (Just b) = + let c = a `C.intersection` b + in if C.null c then Nothing else Just c + merge _ _ = Nothing + +-- | The difference between two sets. +difference :: BitMap -> BitMap -> BitMap +difference (BitMap as) (BitMap bs) = + BitMap (vMergeWith merge as bs) + where + merge _ _ = Nothing -- * Conversion @@ -147,7 +187,7 @@ toAscList :: BitMap -> [Key] toAscList (BitMap cs) = work cs [] where work cs' l | V.null cs' = l - | otherwise = let c = chunkToBits $ V.head cs' + | otherwise = let c = C.toList $ V.head cs' cs'' = V.tail cs' in work cs'' (l <> c) @@ -163,33 +203,3 @@ toDescList = reverse . toAscList -- TODO(thsutton) Implement fromAscList :: [Key] -> BitMap fromAscList _ = empty - --- * Utility - - --- | Merge two 'Vector's of 'Chunk's. --- --- Precondition: Both vectors are sorted by 'chunkIndex'. --- Postcondition: Output vector sorted by 'chunkIndex'. --- Postcondition: length(output) >= max(length(a),length(b)) -mergeWith - :: (Maybe Chunk -> Maybe Chunk -> Maybe Chunk) - -- ^ Merge two chunks with the same index. - -> Vector Chunk - -> Vector Chunk - -> Vector Chunk -mergeWith f v1 v2 - | V.null v1 = v2 - | V.null v2 = v1 - | otherwise = - let a = V.head v1 - b = V.head v2 - in work a v1 b v2 - where - -- Note: we take the head and the *entirety* of each vector; NOT the head - -- and the tail! - work :: Chunk -> Vector Chunk -> Chunk -> Vector Chunk -> Vector Chunk - work a as b bs = case a `compare` b of - LT -> a `V.cons` mergeWith f (V.tail as) bs - EQ -> mergeChunks a b `V.cons` mergeWith f (V.tail as) (V.tail bs) - GT -> b `V.cons` mergeWith f as (V.tail bs) diff --git a/rawr/lib/Data/BitMap/Roaring/Chunk.hs b/rawr/lib/Data/BitMap/Roaring/Chunk.hs new file mode 100644 index 0000000..7edb203 --- /dev/null +++ b/rawr/lib/Data/BitMap/Roaring/Chunk.hs @@ -0,0 +1,189 @@ +module Data.BitMap.Roaring.Chunk where + +import Control.Applicative +import Control.Monad +import Control.Monad.Trans.State +import Data.Bits +import Data.Function +import Data.Monoid +import qualified Data.Vector.Algorithms.Heap as S +import qualified Data.Vector.Unboxed as U +import qualified Data.Vector.Unboxed.Mutable as M +import Data.Word + +import Data.BitMap.Roaring.Chunk.High (HDVector (..)) +import qualified Data.BitMap.Roaring.Chunk.High as H +import Data.BitMap.Roaring.Chunk.Low (LDVector (..)) +import qualified Data.BitMap.Roaring.Chunk.Low as L +import Data.BitMap.Roaring.Utility + +-- | Chunks are identified by a 'Word16' index. +type Index = Word16 + +-- | A chunk representing the keys which share particular 16 high-order bits. +-- +-- Chunk with low density (i.e. no more than 4096 members) are represented as a +-- sorted array of their low 16 bits. Chunks with high density (i.e. more than +-- 4096 members) are represented by a bit vector. +data Chunk + = LowDensity + { chunkIndex :: Index + , chunkArray :: LDVector + } + | HighDensity + { chunkIndex :: Index + , chunkBits :: HDVector + } + deriving (Eq, Show) + +-- | 'Chunk's are ordered by their index. +instance Ord Chunk where + compare = compare `on` chunkIndex + +instance Bits Chunk where + bitSize _ = 2^16 + bitSizeMaybe _ = Just (2^16) + isSigned _ = False + + (.|.) = union + (.&.) = intersection + + testBit (LowDensity ix a) i = L.testBit a (fromIntegral i) + testBit (HighDensity ix a) i = H.testBit a (fromIntegral i) + + bit i = singleton (fromIntegral i) + + popCount (LowDensity ix a) = L.popCount a + popCount (HighDensity ix a) = H.popCount a + +singleton :: Word32 -> Chunk +singleton i = + let (ix, b) = splitWord i + in chunkNew ix b + +-- | Create a new chunk. +chunkNew :: Index -> Word16 -> Chunk +chunkNew i v = LowDensity i (L.singleton v) + +-- | Add a word into a chunk. +set :: Word16 -> Chunk -> Chunk +set b c@(HighDensity i bs) + | H.testBit bs b = c + | otherwise = HighDensity i (H.setBit bs b) +set b c@(LowDensity i bs) + | L.testBit bs b = c + | otherwise = repackChunk $ LowDensity i (L.setBit bs b) + +toList :: Chunk -> [Word32] +toList (LowDensity i bs) = combineWord i <$> L.toList bs +toList (HighDensity i bs) = combineWord i <$> H.toList bs + +bits :: Word64 -> [Word16] +bits w = foldr abit [] [0..63] + where + abit :: Int -> [Word16] -> [Word16] + abit i l = if testBit w i + then fromIntegral i : l + else l + +chunkCheck :: Word16 -> Chunk -> Bool +chunkCheck w (LowDensity _ bs) = L.testBit bs w +chunkCheck w (HighDensity _ bs) = H.testBit bs w + +chunkSet :: Word16 -> Chunk -> Chunk +chunkSet w c@(HighDensity i bs) + | H.testBit bs w = c + | otherwise = HighDensity i (H.setBit bs w) +chunkSet w c@(LowDensity i bs) + | L.testBit bs w = c + | otherwise = LowDensity i (L.setBit bs w) + +chunkClear :: Word16 -> Chunk -> Chunk +chunkClear w c + | chunkCheck w c = + case c of + LowDensity i bs -> LowDensity i (L.clearBit bs w) + HighDensity i bs -> HighDensity i (H.clearBit bs w) + | otherwise = c + +-- | Take the union of two 'Chunk's. +-- +-- Postcondition: popCount (a `union` b) >= (popCount a) + (popCount b) +union :: Chunk -> Chunk -> Chunk +union a b + | chunkIndex a == chunkIndex b = work a b + | otherwise = error "Cannot take union of chunks with different indexes!" + where + work (HighDensity i as) (HighDensity _ bs) = + HighDensity i (as `H.union` bs) + work (HighDensity i as) (LowDensity _ bs) = + HighDensity i (as `H.union` toHDVector bs) + work (LowDensity i as) (HighDensity _ bs) = + HighDensity i (toHDVector as `H.union` bs) + work (LowDensity i as) (LowDensity _ bs) = + repackChunk $ LowDensity i (as `L.union` bs) + +-- | Take the intersection of two 'Chunk's. +-- +-- TODO: Maintain the density invariant. +intersection a b + | chunkIndex a == chunkIndex b = work a b + | otherwise = error "Cannot take intersection of chunks with different indexes!" + where + work (LowDensity ia a) (LowDensity ib b) = + LowDensity ia (a `L.intersection` b) + work (HighDensity ia a) (LowDensity ib b) = + LowDensity ia (toLDVector a `L.intersection` b) + work (LowDensity ia a) (HighDensity ib b) = + LowDensity ia (a `L.intersection` toLDVector b) + work (HighDensity ia a) (HighDensity ib b) = + repackChunk $ HighDensity ia (a `H.intersection` b) + +xor :: Chunk -> Chunk -> Chunk +xor a b + | chunkIndex a == chunkIndex b = work a b + | otherwise = error "Cannot take xor of chunks with different indexes!" + where + work :: Chunk -> Chunk -> Chunk + work (LowDensity ia as) (LowDensity ib bs) = + repackChunk $ LowDensity ia (as `L.xor` bs) + work (LowDensity ia as) (HighDensity ib bs) = + repackChunk $ HighDensity ia (toHDVector as `H.xor` bs) + work (HighDensity ia as) (LowDensity ib bs) = + repackChunk $ HighDensity ia (as `H.xor` toHDVector bs) + work (HighDensity ia as) (HighDensity ib bs) = + repackChunk $ HighDensity ia (as `H.xor` bs) + +-- * Queries + +null :: Chunk -> Bool +null c = popCount c == 0 + +-- * Utility + +-- | Repack a 'Chunk' to enforce the density invariant. +repackChunk :: Chunk -> Chunk +repackChunk c@(LowDensity ix v) + | L.popCount v >= 4096 = HighDensity ix (toHDVector v) + | otherwise = c +repackChunk c@(HighDensity ix v) + | H.popCount v < 4096 = LowDensity ix (toLDVector v) + | otherwise = c + +-- | Pack a low-density vector into a high-density vector. +toHDVector :: LDVector -> HDVector +toHDVector (LDVector bs) = U.foldl' H.setBit H.empty bs + +-- | Unpack a high-density vector to a low-density vector. +toLDVector :: HDVector -> LDVector +toLDVector v@(HDVector ws) = + let n = H.popCount v + bs = U.generate n (\i -> fromIntegral $ select i v) + in LDVector bs + where + -- | Select the nth set bit. + select :: Int -> HDVector -> Int + select i (HDVector v) = + let runningCount = evalState (U.mapM (\c -> modify (+ popCount c) >> get) v) 0 + (p,r) = U.span (< i) runningCount + in -1 diff --git a/rawr/lib/Data/BitMap/Roaring/Chunk/High.hs b/rawr/lib/Data/BitMap/Roaring/Chunk/High.hs new file mode 100644 index 0000000..20ad52f --- /dev/null +++ b/rawr/lib/Data/BitMap/Roaring/Chunk/High.hs @@ -0,0 +1,94 @@ +-- | +module Data.BitMap.Roaring.Chunk.High where + +import Data.Bits ((.&.), (.|.)) +import qualified Data.Bits as B +import qualified Data.Vector.Algorithms.Heap as S +import qualified Data.Vector.Unboxed as U +import qualified Data.Vector.Unboxed.Mutable as M +import Data.Word + +-- | "High density" bits packed into a vector. +newtype HDVector = HDVector (U.Vector Word64) + deriving (Eq, Show) + +-- * Construct + +-- | Empty high-density vector. +empty :: HDVector +empty = + HDVector (U.replicate 1024 0) + +-- | New high-density vector with a single bit set. +singleton :: Word16 -> HDVector +singleton i = + let (wi, bi) = fromIntegral i `divMod` 64 + mk i = if i == wi + then B.bit bi + else 0 + in HDVector (U.generate 1024 mk) + +-- * Modify + +-- | Set a bit in a high-density vector. +setBit :: HDVector -> Word16 -> HDVector +setBit (HDVector bs) ix = + let (w, b) = fromIntegral ix `divMod` 64 + in HDVector (U.modify (\v -> M.read v w >>= M.write v w . flip B.setBit b) bs) + +-- | Clear a bit in a high-density vector. +clearBit :: HDVector -> Word16 -> HDVector +clearBit (HDVector bs) ix = + let (w, b) = fromIntegral ix `divMod` 64 + in HDVector (U.modify (\v -> M.read v w >>= M.write v w . flip B.clearBit b) bs) + +-- | Flip a bit in a high-density vector. +-- +-- TODO Specialise implementation. +complementBit :: HDVector -> Word16 -> HDVector +complementBit v ix + | testBit v ix = clearBit v ix + | otherwise = setBit v ix + +-- * Operators + +-- | Take the intersection of two high-density vectors. +-- +-- NOTE: Callers must check and enforce the density invariant. +intersection :: HDVector -> HDVector -> HDVector +intersection (HDVector as) (HDVector bs) = + HDVector (U.zipWith (.&.) as bs) + +-- | Take the union of two high-density vectors. +union :: HDVector -> HDVector -> HDVector +union (HDVector as) (HDVector bs) = + HDVector (U.zipWith (.|.) as bs) + +-- | Take the exclusive or of two high-density vectors. +-- +-- NOTE: Callers must check and enforce the density invariant. +xor :: HDVector -> HDVector -> HDVector +xor (HDVector as) (HDVector bs) = + HDVector (U.zipWith B.xor as bs) + +-- * Query + +-- | Test a bit in a high-density vector. +testBit :: HDVector -> Word16 -> Bool +testBit (HDVector bs) ix = + let (wi, bi) = fromIntegral ix `divMod` 64 + in B.testBit (bs U.! wi) bi + +-- | Query number of set bits. +popCount :: HDVector -> Int +popCount (HDVector v) = U.foldl' (\a b -> a + B.popCount b) 0 v + +-- * Conversions + +-- | Unpack the 'Word16's set in a 'HDVector'. +toList :: HDVector -> [Word16] +toList (HDVector v) = [] + where + unpackWord :: Word64 -> [Word16] + unpackWord w = fmap fst . filter snd $ fmap (\bi -> (fromIntegral bi, B.testBit w bi)) [0..63] + diff --git a/rawr/lib/Data/BitMap/Roaring/Chunk/Low.hs b/rawr/lib/Data/BitMap/Roaring/Chunk/Low.hs new file mode 100644 index 0000000..d7a7fd5 --- /dev/null +++ b/rawr/lib/Data/BitMap/Roaring/Chunk/Low.hs @@ -0,0 +1,100 @@ +module Data.BitMap.Roaring.Chunk.Low where + +import qualified Data.Vector.Algorithms.Heap as S +import qualified Data.Vector.Unboxed as U +import qualified Data.Vector.Unboxed.Mutable as M +import Data.Word + +import Data.BitMap.Roaring.Utility + +-- | "Low density" bits stored in a vector. +newtype LDVector = LDVector { unwrapVector :: U.Vector Word16 } + deriving (Eq, Show) + +-- * Constructors + +-- | An empty low-density vector. +empty :: LDVector +empty = LDVector U.empty + +-- | A singleton low-density vector. +singleton :: Word16 -> LDVector +singleton b = LDVector (U.singleton b) + +-- * Modifying vectors + +-- | Set a word in a low-density vector. +-- +-- TODO: Implement in O(log n). +setBit :: LDVector -> Word16 -> LDVector +setBit lv@(LDVector bs) ix + | testBit lv ix = lv + | otherwise = LDVector (U.modify (S.sortBy compare) $ U.cons ix bs) + +-- | Clear a word in a low-density vector. +-- +-- Pre-condition: The word is present in the vector. +-- Post-condition: The word is not present in the vector. +-- +-- TODO: Implement in O(log n) +clearBit :: LDVector -> Word16 -> LDVector +clearBit (LDVector bs) ix = + LDVector (U.filter (/= ix) bs) + +-- | Flip a bit in a low-density vector. +-- +-- TODO Implement in O(log n) +complementBit :: LDVector -> Word16 -> LDVector +complementBit lv ix + | testBit lv ix = clearBit lv ix + | otherwise = setBit lv ix + +-- * Queries + +-- | Check whether a word is present in a low-density vector. +-- +-- TODO: Implement in O(log n) +testBit :: LDVector -> Word16 -> Bool +testBit (LDVector bs) ix = U.elem ix bs + +-- | Query the number of set bits in a low-density vector. +popCount :: LDVector -> Int +popCount (LDVector v) = U.length v + +-- * Operators + +-- | Take the intersection of two low-density vectors. +intersection :: LDVector -> LDVector -> LDVector +intersection (LDVector as) (LDVector bs) = + LDVector (vMergeWith merge as bs) + where + merge (Just a) (Just b) = Just a + merge (Just a) Nothing = Nothing + merge Nothing (Just b) = Nothing + merge Nothing Nothing = Nothing + +-- | Take the union of two low-density vectors. +-- +-- NOTE: Callers must check and enforce the density invariant. +union :: LDVector -> LDVector -> LDVector +union (LDVector v1) (LDVector v2) = + LDVector (vMergeWith merge v1 v2) + where + merge (Just a) (Just b) = Just a + merge Nothing a = a + merge a Nothing = a + +-- | Take the exclusive-or of two low-density vectors. +xor :: LDVector -> LDVector -> LDVector +xor (LDVector as) (LDVector bs) = + LDVector (vMergeWith merge as bs) + where + merge a Nothing = a + merge Nothing b = b + merge _ _ = Nothing + +-- * Conversions + +-- | Unpack the 'Word16's set in a 'HDVector'. +toList :: LDVector -> [Word16] +toList (LDVector v) = U.toList v diff --git a/lib/Data/BitMap/Roaring/Utility.hs b/rawr/lib/Data/BitMap/Roaring/Utility.hs similarity index 60% rename from lib/Data/BitMap/Roaring/Utility.hs rename to rawr/lib/Data/BitMap/Roaring/Utility.hs index f19cd2f..4f4da35 100644 --- a/lib/Data/BitMap/Roaring/Utility.hs +++ b/rawr/lib/Data/BitMap/Roaring/Utility.hs @@ -1,12 +1,14 @@ +{-# LANGUAGE MultiParamTypeClasses #-} module Data.BitMap.Roaring.Utility where -import Data.Bits -import Data.Convertible -import Data.Monoid -import qualified Data.Vector as V +import Data.Bits +import Data.Convertible +import Data.Monoid +import qualified Data.Vector as V import qualified Data.Vector.Algorithms.Heap as VAH -import qualified Data.Vector.Unboxed as U -import Data.Word +import qualified Data.Vector.Generic as G +import qualified Data.Vector.Unboxed as U +import Data.Word -- * Words @@ -24,17 +26,44 @@ combineWord h l = rotate (convert h) (-16) .|. convert l -- * Vectors -vMerge :: (U.Unbox e, Ord e) => U.Vector e -> U.Vector e -> U.Vector e -vMerge as bs - | U.null as = bs - | U.null bs = as +-- | Merge two sorted vectors. +-- +-- The right element of a pair which are equal according to 'compare' +-- will be discarded. +-- +-- Postcondition: sorted (vMerge xs ys) +-- Postcondition: length (vMerge xs ys) >= max (length xs) (length ys) +vMerge :: (G.Vector vector e, Ord e) => vector e -> vector e -> vector e +vMerge = vMergeWith merge + where + merge :: Maybe e -> Maybe e -> Maybe e + merge Nothing a = a + merge a Nothing = a + merge (Just a) (Just b) = Just a + +-- | Merge two sorted vectors. +vMergeWith + :: (G.Vector vector e, G.Vector vector r, Ord e) + => (Maybe e -> Maybe e -> Maybe r) + -> vector e + -> vector e + -> vector r +vMergeWith f as bs + | G.null as = G.concatMap (maybe G.empty G.singleton . f Nothing . Just) bs + | G.null bs = G.concatMap (\e -> maybe G.empty G.singleton $ f (Just e) Nothing) as | otherwise = - let a = U.head as - b = U.head bs + let a = G.head as + b = G.head bs in case a `compare` b of - LT -> a `U.cons` vMerge (U.tail as) bs - EQ -> a `U.cons` vMerge (U.tail as) (U.tail bs) - GT -> b `U.cons` vMerge as (U.tail bs) + LT -> case f (Just a) Nothing of + Nothing -> vMergeWith f (G.tail as) bs + Just r -> r `G.cons` vMergeWith f (G.tail as) bs + EQ -> case f (Just a) (Just b) of + Nothing -> vMergeWith f (G.tail as) (G.tail bs) + Just r -> r `G.cons` vMergeWith f (G.tail as) (G.tail bs) + GT -> case f Nothing (Just b) of + Nothing -> vMergeWith f as (G.tail bs) + Just r -> r `G.cons` vMergeWith f as (G.tail bs) -- | Alter the 'Chunk' with the given index in a vector of 'Chunk's. -- @@ -58,7 +87,9 @@ vAlter f p v = case vLookup p v of -- | Search for a 'Chunk' with a specific index. -- --- TODO(thsutton) better search algorithm. +-- /O(log n)/ +-- +-- TODO(thsutton) Make the complexity claim be true. vLookup :: Ord a => (a -> Bool) -> V.Vector a -> Maybe (Int, a) vLookup p v = case V.findIndex p v of Nothing -> Nothing diff --git a/rawr/rawr.cabal b/rawr/rawr.cabal new file mode 100644 index 0000000..23cbe4c --- /dev/null +++ b/rawr/rawr.cabal @@ -0,0 +1,61 @@ +name: rawr +version: 0.1.0.0 +synopsis: Roaring Bitmaps compressed bitmap data-structure. +description: Roaring Bitmaps compressed bitmap data-structure. +homepage: https://github.com/thsutton/rawr#readme +license: BSD3 +license-file: LICENSE +author: Thomas Sutton +maintainer: me@thomas-sutton.id.au +copyright: (c) 2015 Thomas Sutton +category: Data +build-type: Simple +extra-source-files: README.md +cabal-version: >=1.10 + +source-repository HEAD + type: git + location: https://github.com/thsutton/rawr/ + +library + default-language: Haskell2010 + hs-source-dirs: lib + exposed-modules: Data.BitMap.Roaring + Data.BitMap.Roaring.Chunk + Data.BitMap.Roaring.Chunk.High + Data.BitMap.Roaring.Chunk.Low + Data.BitMap.Roaring.Utility + build-depends: base >=4.5 && <4.10 + , containers + , convertible + , transformers + , vector + , vector-algorithms + +test-suite properties + type: exitcode-stdio-1.0 + default-language: Haskell2010 + hs-source-dirs: test + main-is: properties.hs + build-depends: base + , QuickCheck + , containers + , rawr + +test-suite doctests + default-language: Haskell2010 + hs-source-dirs: test + type: exitcode-stdio-1.0 + ghc-options: -threaded + main-is: doctests.hs + build-depends: base + , QuickCheck + , doctest >= 0.9 + +test-suite zzz-check-hlint + default-language: Haskell2010 + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: check-hlint.hs + build-depends: base + , hlint diff --git a/rawr/test/check-hlint.hs b/rawr/test/check-hlint.hs new file mode 100644 index 0000000..b1b488e --- /dev/null +++ b/rawr/test/check-hlint.hs @@ -0,0 +1,17 @@ +module Main (main) where + +import Language.Haskell.HLint (hlint) +import System.Exit (exitFailure, exitSuccess) + +arguments :: [String] +arguments = + [ "lib" + , "test" + ] + +main :: IO () +main = do + hints <- hlint arguments + if null hints + then exitSuccess + else exitFailure diff --git a/rawr/test/doctests.hs b/rawr/test/doctests.hs new file mode 100644 index 0000000..bd8b9ee --- /dev/null +++ b/rawr/test/doctests.hs @@ -0,0 +1,4 @@ +import Test.DocTest + +main :: IO () +main = doctest ["-ilib", "lib"] diff --git a/rawr/test/properties.hs b/rawr/test/properties.hs new file mode 100644 index 0000000..2f7ddbd --- /dev/null +++ b/rawr/test/properties.hs @@ -0,0 +1,123 @@ +{-# LANGUAGE TemplateHaskell #-} + +module Main where + +import Control.Monad +import Data.List +import Data.Monoid +import qualified Data.Set as S +import Data.Word +import System.Exit +import Test.QuickCheck + +import qualified Data.BitMap.Roaring as R +import qualified Data.BitMap.Roaring.Chunk as C +import qualified Data.BitMap.Roaring.Utility as R + +-- * Check utility functions + +-- | id == uncurry combineWord . splitWord +prop_splitWord_combineWord_id :: Word32 -> Bool +prop_splitWord_combineWord_id w = + w == (uncurry R.combineWord . R.splitWord $ w) + +-- | The empty set is null. +prop_null_empty :: Bool +prop_null_empty = R.null R.empty + +-- | Singleton sets are not null. +prop_not_null_singleton :: Word32 -> Bool +prop_not_null_singleton i = not . R.null $ R.singleton i + +-- | Larger sets are not null. +prop_not_null_fromList :: NonEmptyList Word32 -> Bool +prop_not_null_fromList (NonEmpty is) = not . R.null $ R.fromList is + +-- | Empty sets have size zero. +prop_size_empty :: Bool +prop_size_empty = 0 == R.size R.empty + +-- | Singletons have size one. +prop_size_singleton :: Word32 -> Bool +prop_size_singleton i = 1 == R.size (R.singleton i) + +-- | Sets have size of list length. +prop_size_fromList :: NonEmptyList Word32 -> Bool +prop_size_fromList (NonEmpty is) = length (nub is) == R.size (R.fromList is) + +-- | Singletons have size 1, then size 0 when deleted. +prop_size_delete_singleton :: Word32 -> Bool +prop_size_delete_singleton i = + let s = R.singleton i + s' = R.delete i s + in R.size s == 1 && R.size s' == 0 + +-- | Singletons are empty when the sole item is deleted. +prop_null_delete_singleton :: Word32 -> Bool +prop_null_delete_singleton i = + R.null . R.delete i $ R.singleton i + +-- | 'toAscList' produces sorted lists. +prop_toAscList_sorted :: NonEmptyList Word32 -> Bool +prop_toAscList_sorted (NonEmpty l) = + let l' = R.toAscList (R.fromList l) + in l' == sort l' + +-- | 'toDescList' produces sorted lists. +prop_toDescList_sorted :: NonEmptyList Word32 -> Bool +prop_toDescList_sorted (NonEmpty l) = + let l' = R.toDescList (R.fromList l) + in l' == sortBy (flip compare) l' + +-- | "Data.Set" and "Data.BitMap.Roaring" agree about a set when +-- building from the same list of inputs. +prop_intset_roaring_agree :: NonEmptyList Word32 -> Bool +prop_intset_roaring_agree (NonEmpty l) = + let r = R.toAscList $ R.fromList l + s = S.toAscList $ S.fromList l + in r == s + +-- | Every item in the source list should be an element. +prop_map_elem_fromList :: NonEmptyList Word32 -> Bool +prop_map_elem_fromList (NonEmpty l) = + let r = R.fromList l + in all (`R.member` r) l + +-- | union (fromList s1) (fromList s2) == fromList (s1 <> s2) +prop_union_fromList :: NonEmptyList Word32 -> NonEmptyList Word32 -> Bool +prop_union_fromList (NonEmpty as) (NonEmpty bs) = + let q = R.fromList as + r = R.fromList bs + qr = R.fromList (as <> bs) + in R.toAscList qr == R.toAscList (q `R.union` r) + +prop_intersection_fromList :: NonEmptyList Word32 -> NonEmptyList Word32 -> Bool +prop_intersection_fromList (NonEmpty al) (NonEmpty bl) = + let am = R.fromList al + bm = R.fromList bl + im = R.intersection am bm + as = S.fromList al + bs = S.fromList bl + is = S.intersection as bs + in R.toList im == S.toList is + +prop_ld_chunk_intersection :: NonEmptyList Word16 -> NonEmptyList Word16 -> Bool +prop_ld_chunk_intersection (NonEmpty al) (NonEmpty bl) = + let is = S.intersection (S.fromList al) (S.fromList bl) + cc = C.chunkClear 0 (C.chunkNew 0 0) + fromList = foldl' (flip C.chunkSet) cc + ms = C.intersection (fromList al) (fromList bl) + in C.toList ms == (fromIntegral <$> S.toList is) + +-- +-- Use Template Haskell to automatically run all of the properties above. +-- + +return [] +runTests :: IO Bool +runTests = $quickCheckAll + +main :: IO () +main = do + result <- runTests + unless result exitFailure diff --git a/stack.yaml b/stack.yaml index 02519c1..ad27add 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,5 +1,7 @@ -flags: {} +resolver: lts-7.14 packages: -- '.' +- 'rawr' +- 'rawr-format' extra-deps: [] -resolver: lts-2.18 +flags: {} +extra-package-dbs: []