{- For a fair benchmark it is important that both BLAS and FFTW run in a single thread. You can check it by running only BLAS or only FFTW benchmarks and watch the output of 'top'. Set OpenBLAS to single thread with: export OPENBLAS_NUM_THREADS=1 Set FFTW to single thread with: export FFTW_NUM_THREADS=1 Both OpenBLAS and FFTW may not react to those settings, but to this one: export OMP_NUM_THREADS=1 For benchmarking better turn off runtime allocation checks in @cabal.project@: package guarded-allocation flags: -debug -} module Main where import Criterion.Main (Benchmark, defaultMain, bgroup, bench, whnf, nf) import qualified Knead import qualified LLVM.Core as LLVM -- import Foreign.ForeignPtr (ForeignPtr, castForeignPtr) import qualified Numeric.BLAS.Rank1.Convolution as ConvNaive import qualified Numeric.BLAS.Rank1.Fourier as TrafoNaive import qualified Numeric.BLAS.Rank1.Unit as Unit import qualified Numeric.FFTW.Extra.Rank1 as Driver import qualified Numeric.FFTW.Extra.Rank1.Convolution as ConvFourier import qualified Numeric.FFTW.Extra.Rank1.Transform as TrafoFourier import qualified Numeric.FFTW.Batch as Batch import qualified Numeric.FFTW.Rank1 as Rank1 import Numeric.FFTW.Extra.Utility (toCyclic) import Numeric.FFTW.Extra.NumberTheory (ceilingPowerOfTwo, larger5Smooth, ceilingEfficient) import qualified Numeric.BLAS.Matrix.RowMajor as Matrix import qualified Numeric.BLAS.Vector.Slice as VectorSlice import qualified Numeric.BLAS.Vector as Vector import Numeric.BLAS.Vector (Vector) -- import qualified Data.Array.Comfort.Storable.Unchecked as UncheckedArray import qualified Data.Array.Comfort.Storable as Array import qualified Data.Array.Comfort.Shape as Shape import qualified Data.NonEmpty as NonEmpty import Data.Complex (Complex) import Data.Int (Int64) convolveCyclicNaive, convolveCyclicDrv, convolveCyclicFFT, convolveCyclicViaTable, convolveCyclicViaAcyclic :: Int -> Vector (Shape.Cyclic Int) (Complex Float) convolveCyclicNaive n = let x = Vector.zero $ Shape.Cyclic n in ConvNaive.convolveCyclic x x convolveCyclicDrv n = let x = Vector.zero $ Shape.Cyclic n in Driver.convolve x x -- Driver.convolveCyclic x x convolveCyclicFFT n = let x = Vector.zero $ Shape.Cyclic n in ConvFourier.convolveCyclic x x convolveCyclicViaTable n = let x = Vector.zero $ Shape.ZeroBased n in toCyclic n $ ConvNaive.convolveViaTable x x convolveCyclicViaAcyclic n = let x = Vector.zero $ Shape.ZeroBased n in toCyclic n $ ConvFourier.convolve x x -- convolveCyclicOneManyNaive, convolveCyclicOneManyDrv, convolveCyclicOneManyFFT :: Int -> Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float) {- convolveCyclicOneManyNaive m n = ConvNaive.convolveCyclicOneMany (Vector.zero (Shape.Cyclic n)) (Vector.zero (Shape.ZeroBased m, Shape.Cyclic n)) -} convolveCyclicOneManyDrv m n = Driver.convolveOneMany (Vector.zero (Shape.Cyclic n)) (Vector.zero (Shape.ZeroBased m, Shape.Cyclic n)) -- Driver.convolveCyclic x x convolveCyclicOneManyFFT m n = ConvFourier.convolveCyclicOneMany (Vector.zero (Shape.Cyclic n)) (Vector.zero (Shape.ZeroBased m, Shape.Cyclic n)) convolveAcyclicFFT, convolveAcyclicShortLong, convolveAcyclicDrv :: Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float) convolveAcyclicFFT n m = ConvFourier.convolve (Vector.zero $ Shape.ZeroBased n) (Vector.zero $ Shape.ZeroBased m) convolveAcyclicShortLong n m = ConvFourier.convolveShortLong (Vector.zero $ Shape.ZeroBased n) (Vector.zero $ Shape.ZeroBased m) convolveAcyclicDrv n m = Driver.convolve (Vector.zero $ Shape.ZeroBased n) (Vector.zero $ Shape.ZeroBased m) bconvolution :: Benchmark bconvolution = bgroup "Convolution" [ let sizes = [4,8,16,32,64,67,73,79,83,89,94,97] in bgroup "CyclicSmall" [ bgroup "Naive" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicNaive n , bgroup "ViaTable" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicViaTable n , bgroup "Fourier" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicFFT n , bgroup "ViaAcyclic" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicViaAcyclic n , bgroup "Driver" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicDrv n ] {- bgroup "CyclicSmall" [ bgroup "1" [ bgroup "Naive" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicNaive n , bgroup "Fourier" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicFFT n ] , bgroup "2" [ bgroup "Naive" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicOneManyNaive 2 n , bgroup "Fourier" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicOneManyFFT 2 n ] , bgroup "3" [ bgroup "Naive" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicOneManyNaive 3 n , bgroup "Fourier" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicOneManyFFT 3 n ] ] -} , -- let sizes = [22690 .. 22700] in let sizes = ceilingPowerOfTwo 130000 : [150060 .. 150070] ++ larger5Smooth 150070 : [] in {- let sizes = ceilingPowerOfTwo 500000 : [590120 .. 590130] ++ larger5Smooth 590130 : [] in -} bgroup "Cyclic" [ bgroup "Driver" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicDrv n , bgroup "Fourier" $ flip map sizes $ \n -> bench (show n) $ whnf convolveCyclicFFT n ] , bgroup "Acyclic" $ flip map [2048,3072,4096,8192,65536,1024000::Int] $ \m -> bgroup (show m) $ flip map (map (div m) [1,4,16,64,256,1024]) $ \n -> bgroup (show n) [ bench "Fourier" $ whnf (convolveAcyclicFFT n) m, bench "ShortLong" $ whnf (convolveAcyclicShortLong n) m, bench "Driver" $ whnf (convolveAcyclicDrv n) m ] ] fourier :: Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float) fourier = Batch.fourier Rank1.Forward . Vector.zero . (,) (Shape.ZeroBased 10000) . Shape.Cyclic bfourierFull :: Benchmark bfourierFull = bgroup "Full" [ bgroup "Range" $ flip map [1..256] $ \n -> bench (show n) $ whnf fourier n , bgroup "Smooth" $ flip map (take 30 $ iterate (larger5Smooth . (1+)) 10) $ \n -> bench (show n) $ whnf fourier n , bgroup "Efficient" $ flip map (take 40 $ iterate (ceilingEfficient . (1+)) 10) $ \n -> bench (show n) $ whnf fourier n ] fourierFull :: Int -> Vector (Shape.Cyclic Int) (Complex Float) fourierFull = Rank1.fourier Rank1.Forward . flip Vector.unit 1 . Shape.Cyclic fourierBatch :: Int -> Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float) fourierBatch n m = Batch.fourier Rank1.Forward . Array.reshape (Shape.ZeroBased m, Shape.Cyclic (div n m)) . flip Vector.unit 1 . Shape.Cyclic $ n bfourierBatch :: Benchmark bfourierBatch = bgroup "Batch" $ (flip map [1024,2048,4096,8192,16384,32768,65536,131072,262144] $ \n -> bgroup (show n) $ bench "Full" (whnf fourierFull n) : (flip map [1,2,4,8,16,32,64,128,256,512] $ \m -> bench (show m) $ whnf (fourierBatch n) m)) ++ (flip map [1000,1200,1400,1600,1800,2000,5000,10000,20000,50000,100000] $ \n -> bgroup (show n) $ bench "Full" (whnf fourierFull n) : (flip map [1,2,4,5,8,10,20,50,100,200] $ \m -> bench (show m) $ whnf (fourierBatch n) m)) fourierFull2Batch :: Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float) fourierFull2Batch n = fourierBatch n 2 fourierFullBatch2 :: Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float) fourierFullBatch2 n = fourierBatch n (div n 2) {- ToDo: this is prohibitively slow - benchmark twiddle factor generation taking 10 coefficients requires twice as much time as taking 100 coefficients -} fourierPartial :: Int -> Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float) fourierPartial n m mcut = TrafoFourier.takeDivisorFourier Rank1.Forward mcut m $ Vector.unit (Shape.Cyclic n) 1 fourierPartialNaive :: Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float) fourierPartialNaive n k = TrafoNaive.takeFourier Rank1.Forward (Shape.ZeroBased k) $ Vector.unit (Shape.Cyclic n) 1 unitVector :: Int -> Vector (Shape.Cyclic Int) (Complex Float) unitVector = flip Vector.unit 1 . Shape.Cyclic transpose2Rows :: Int -> Vector (Shape.Cyclic Int, Shape.ZeroBased Int) (Complex Float) transpose2Rows = Matrix.transpose . Array.mapShape (\(Shape.Cyclic n) -> (Shape.ZeroBased 2, Shape.Cyclic (div n 2))) . flip Vector.unit 1 . Shape.Cyclic transpose2Columns :: Int -> Vector (Shape.Cyclic Int, Shape.ZeroBased Int) (Complex Float) transpose2Columns = Matrix.transpose . Array.mapShape (\(Shape.Cyclic n) -> (Shape.ZeroBased (div n 2), Shape.Cyclic 2)) . flip Vector.unit 1 . Shape.Cyclic twiddleVector :: Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float) twiddleVector n d = VectorSlice.toVector $ Unit.twiddleVectorAutoLoop d (2*pi / fromIntegral n) n (Shape.ZeroBased n) 1 dcOffset :: Int -> Complex Float dcOffset = Vector.sum . flip Vector.unit 1 . Shape.Cyclic realDCOffset :: Int -> Float realDCOffset = Vector.sum . flip Vector.unit 1 . Shape.Cyclic . (2*) vectorAdd :: Int -> Vector (Shape.Cyclic Int) (Complex Float) vectorAdd n = Vector.add (Vector.unit (Shape.Cyclic n) 0) (Vector.unit (Shape.Cyclic n) 1) dcKnead :: (Vector (Shape.ZeroBased Int64) (Knead.Packed Float) -> Knead.Packed Float) -> Int -> Knead.Packed Float dcKnead kneadSum = kneadSum . -- flip Array.replicate LLVM.zero . flip Array.replicate (LLVM.cyclicVector $ NonEmpty.singleton 0) . Shape.ZeroBased . flip div 4 . fromIntegral {- dcKnead kneadSum n = kneadSum . (\(UncheckedArray.Array _sh fptr) -> UncheckedArray.Array (Shape.ZeroBased (div (fromIntegral n) 4)) (castForeignPtr (fptr :: ForeignPtr (Complex Float)))) . flip Vector.unit 1 . Shape.ZeroBased $ n -} dcKneadLarge :: (Vector (Shape.ZeroBased Int64) (Knead.Large Float) -> Knead.Large Float) -> Int -> Knead.Large Float dcKneadLarge kneadSum = kneadSum . -- flip Array.replicate LLVM.zero . flip Array.replicate (LLVM.cyclicVector $ NonEmpty.singleton 0) . Shape.ZeroBased . flip div (4*8) . fromIntegral bfourierBasic :: (Vector (Shape.ZeroBased Int64) (Knead.Packed Float) -> Knead.Packed Float) -> (Vector (Shape.ZeroBased Int64) (Knead.Large Float) -> Knead.Large Float) -> Benchmark bfourierBasic kneadSum kneadSumLarge = bgroup "Basic" $ flip map [4096,4410,4800,8192,8820,9600,16384] $ \n -> bgroup (show n) [ bench "Full" $ nf fourierFull n, bench "Full2Batch" $ nf fourierFull2Batch n, bench "FullBatch2" $ nf fourierFullBatch2 n, bench "Transpose2Rows" $ nf transpose2Rows n, bench "Transpose2Columns" $ nf transpose2Columns n, bench "DCReal" $ whnf realDCOffset n, bench "DCComplex" $ whnf dcOffset n, bench "DCKnead" $ whnf (dcKnead kneadSum) n, bench "DCKneadLarge" $ whnf (dcKneadLarge kneadSumLarge) n, bench "VectorAdd" $ whnf vectorAdd n, bench "UnitVector" $ whnf unitVector n, bench "Twiddle4" $ whnf (twiddleVector n) 4, bench "Twiddle8" $ whnf (twiddleVector n) 8, bench "Twiddle16" $ whnf (twiddleVector n) 16, bench "Twiddle32" $ whnf (twiddleVector n) 32 ] bfourierPartialSmall :: Benchmark bfourierPartialSmall = bgroup "PartialSmall" [ bgroup "44100" [ bench "Full" $ whnf fourierFull 44100, bench "1000" $ whnf (fourierPartial 44100 1050) 1000, bench "100" $ whnf (fourierPartial 44100 100) 100, bench "10" $ whnf (fourierPartial 44100 10) 10, bench "1" $ whnf (fourierPartial 44100 1) 1, bench "10" $ whnf (fourierPartialNaive 44100) 10, bench "5" $ whnf (fourierPartialNaive 44100) 5, bench "2" $ whnf (fourierPartialNaive 44100) 2, bench "1" $ whnf (fourierPartialNaive 44100) 1, bench "Twiddle" $ whnf (twiddleVector 44100) 32, bench "DCReal" $ whnf realDCOffset 44100, bench "DCComplex" $ whnf dcOffset 44100 ] ] bfourierPartial :: Benchmark bfourierPartial = bgroup "Partial" [ bgroup "441000" [ bench "Full" $ whnf fourierFull 441000, bench "1000" $ whnf (fourierPartial 441000 1050) 1000, bench "100" $ whnf (fourierPartial 441000 100) 100, bench "10" $ whnf (fourierPartial 441000 10) 10, bench "1" $ whnf (fourierPartial 441000 1) 1, bench "10" $ whnf (fourierPartialNaive 441000) 10, bench "5" $ whnf (fourierPartialNaive 441000) 5, bench "2" $ whnf (fourierPartialNaive 441000) 2, bench "1" $ whnf (fourierPartialNaive 441000) 1, bench "Twiddle" $ whnf (twiddleVector 441000) 32, bench "DCReal" $ whnf realDCOffset 441000, bench "DCComplex" $ whnf dcOffset 441000 ] ] bfourier :: Benchmark bfourier = bgroup "Fourier" [bfourierFull, bfourierBatch, {-bfourierBasic,-} bfourierPartialSmall, bfourierPartial] btwiddle :: Benchmark btwiddle = bgroup "Twiddle" [ bgroup "Vector" $ flip map [441,4410,44100,441000, 480,4800,48000,480000] $ \n -> bgroup (show n) $ flip map [2,4,8,16,32,64] $ \d -> bench (show d) $ whnf (twiddleVector n) d ] main :: IO () main = do kneadSum <- Knead.runSum kneadSumLarge <- Knead.runSumLarge defaultMain $ bconvolution : bfourier : bgroup "Fourier" [bfourierBasic kneadSum kneadSumLarge] : btwiddle : []