{-
For a fair benchmark it is important
that both BLAS and FFTW run in a single thread.
You can check it by running only BLAS or only FFTW benchmarks
and watch the output of 'top'.

Set OpenBLAS to single thread with:
export OPENBLAS_NUM_THREADS=1

Set FFTW to single thread with:
export FFTW_NUM_THREADS=1

Both OpenBLAS and FFTW may not react to those settings, but to this one:
export OMP_NUM_THREADS=1


For benchmarking better turn off runtime allocation checks in @cabal.project@:

package guarded-allocation
  flags: -debug
-}
module Main where

import Criterion.Main (Benchmark, defaultMain, bgroup, bench, whnf, nf)

import qualified Knead
import qualified LLVM.Core as LLVM
-- import Foreign.ForeignPtr (ForeignPtr, castForeignPtr)

import qualified Numeric.BLAS.Rank1.Convolution as ConvNaive
import qualified Numeric.BLAS.Rank1.Fourier as TrafoNaive
import qualified Numeric.BLAS.Rank1.Unit as Unit
import qualified Numeric.FFTW.Extra.Rank1 as Driver
import qualified Numeric.FFTW.Extra.Rank1.Convolution as ConvFourier
import qualified Numeric.FFTW.Extra.Rank1.Transform as TrafoFourier
import qualified Numeric.FFTW.Batch as Batch
import qualified Numeric.FFTW.Rank1 as Rank1
import Numeric.FFTW.Extra.Utility (toCyclic)
import Numeric.FFTW.Extra.NumberTheory
         (ceilingPowerOfTwo, larger5Smooth, ceilingEfficient)

import qualified Numeric.BLAS.Matrix.RowMajor as Matrix
import qualified Numeric.BLAS.Vector.Slice as VectorSlice
import qualified Numeric.BLAS.Vector as Vector
import Numeric.BLAS.Vector (Vector)

-- import qualified Data.Array.Comfort.Storable.Unchecked as UncheckedArray
import qualified Data.Array.Comfort.Storable as Array
import qualified Data.Array.Comfort.Shape as Shape
import qualified Data.NonEmpty as NonEmpty
import Data.Complex (Complex)
import Data.Int (Int64)


convolveCyclicNaive, convolveCyclicDrv, convolveCyclicFFT,
      convolveCyclicViaTable, convolveCyclicViaAcyclic ::
   Int -> Vector (Shape.Cyclic Int) (Complex Float)
convolveCyclicNaive n =
   let x = Vector.zero $ Shape.Cyclic n in
   ConvNaive.convolveCyclic x x

convolveCyclicDrv n =
   let x = Vector.zero $ Shape.Cyclic n in
   Driver.convolve x x
--   Driver.convolveCyclic x x

convolveCyclicFFT n =
   let x = Vector.zero $ Shape.Cyclic n in
   ConvFourier.convolveCyclic x x

convolveCyclicViaTable n =
   let x = Vector.zero $ Shape.ZeroBased n in
   toCyclic n $ ConvNaive.convolveViaTable x x

convolveCyclicViaAcyclic n =
   let x = Vector.zero $ Shape.ZeroBased n in
   toCyclic n $ ConvFourier.convolve x x


-- convolveCyclicOneManyNaive,
convolveCyclicOneManyDrv,  convolveCyclicOneManyFFT ::
   Int -> Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float)
{-
convolveCyclicOneManyNaive m n =
   ConvNaive.convolveCyclicOneMany
      (Vector.zero (Shape.Cyclic n))
      (Vector.zero (Shape.ZeroBased m, Shape.Cyclic n))
-}

convolveCyclicOneManyDrv m n =
   Driver.convolveOneMany
      (Vector.zero (Shape.Cyclic n))
      (Vector.zero (Shape.ZeroBased m, Shape.Cyclic n))
--   Driver.convolveCyclic x x

convolveCyclicOneManyFFT m n =
   ConvFourier.convolveCyclicOneMany
      (Vector.zero (Shape.Cyclic n))
      (Vector.zero (Shape.ZeroBased m, Shape.Cyclic n))


convolveAcyclicFFT, convolveAcyclicShortLong, convolveAcyclicDrv ::
   Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float)
convolveAcyclicFFT n m =
   ConvFourier.convolve
      (Vector.zero $ Shape.ZeroBased n)
      (Vector.zero $ Shape.ZeroBased m)

convolveAcyclicShortLong n m =
   ConvFourier.convolveShortLong
      (Vector.zero $ Shape.ZeroBased n)
      (Vector.zero $ Shape.ZeroBased m)

convolveAcyclicDrv n m =
   Driver.convolve
      (Vector.zero $ Shape.ZeroBased n)
      (Vector.zero $ Shape.ZeroBased m)


bconvolution :: Benchmark
bconvolution =
   bgroup "Convolution" [
      let sizes = [4,8,16,32,64,67,73,79,83,89,94,97] in
      bgroup "CyclicSmall" [
         bgroup "Naive" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicNaive n
      ,
         bgroup "ViaTable" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicViaTable n
      ,
         bgroup "Fourier" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicFFT n
      ,
         bgroup "ViaAcyclic" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicViaAcyclic n
      ,
         bgroup "Driver" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicDrv n
      ]
{-
      bgroup "CyclicSmall" [
         bgroup "1" [
            bgroup "Naive" $
            flip map sizes $ \n ->
            bench (show n) $ whnf convolveCyclicNaive n
         ,
            bgroup "Fourier" $
            flip map sizes $ \n ->
            bench (show n) $ whnf convolveCyclicFFT n
         ]
      ,
         bgroup "2" [
            bgroup "Naive" $
            flip map sizes $ \n ->
            bench (show n) $ whnf convolveCyclicOneManyNaive 2 n
         ,
            bgroup "Fourier" $
            flip map sizes $ \n ->
            bench (show n) $ whnf convolveCyclicOneManyFFT 2 n
         ]
      ,
         bgroup "3" [
            bgroup "Naive" $
            flip map sizes $ \n ->
            bench (show n) $ whnf convolveCyclicOneManyNaive 3 n
         ,
            bgroup "Fourier" $
            flip map sizes $ \n ->
            bench (show n) $ whnf convolveCyclicOneManyFFT 3 n
         ]
      ]
-}
   ,
--      let sizes = [22690 .. 22700] in
      let sizes =
            ceilingPowerOfTwo 130000 :
            [150060 .. 150070] ++ larger5Smooth 150070 : [] in
{-
      let sizes =
            ceilingPowerOfTwo 500000 :
            [590120 .. 590130] ++ larger5Smooth 590130 : [] in
-}
      bgroup "Cyclic" [
         bgroup "Driver" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicDrv n
      ,
         bgroup "Fourier" $
         flip map sizes $ \n ->
         bench (show n) $ whnf convolveCyclicFFT n
      ]
   ,
      bgroup "Acyclic" $ flip map [2048,3072,4096,8192,65536,1024000::Int] $ \m ->
      bgroup (show m) $ flip map (map (div m) [1,4,16,64,256,1024]) $ \n ->
      bgroup (show n) [
         bench "Fourier"   $ whnf (convolveAcyclicFFT n) m,
         bench "ShortLong" $ whnf (convolveAcyclicShortLong n) m,
         bench "Driver"    $ whnf (convolveAcyclicDrv n) m
      ]
   ]

fourier :: Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float)
fourier =
   Batch.fourier Rank1.Forward . Vector.zero .
   (,) (Shape.ZeroBased 10000) . Shape.Cyclic

bfourierFull :: Benchmark
bfourierFull =
   bgroup "Full" [
      bgroup "Range" $
      flip map [1..256] $ \n ->
      bench (show n) $ whnf fourier n
   ,
      bgroup "Smooth" $
      flip map (take 30 $ iterate (larger5Smooth . (1+)) 10) $ \n ->
      bench (show n) $ whnf fourier n
   ,
      bgroup "Efficient" $
      flip map (take 40 $ iterate (ceilingEfficient . (1+)) 10) $ \n ->
      bench (show n) $ whnf fourier n
   ]

fourierFull :: Int -> Vector (Shape.Cyclic Int) (Complex Float)
fourierFull =
   Rank1.fourier Rank1.Forward . flip Vector.unit 1 . Shape.Cyclic


fourierBatch ::
   Int -> Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float)
fourierBatch n m =
   Batch.fourier Rank1.Forward .
   Array.reshape (Shape.ZeroBased m, Shape.Cyclic (div n m)) .
   flip Vector.unit 1 . Shape.Cyclic $ n

bfourierBatch :: Benchmark
bfourierBatch =
   bgroup "Batch" $
   (flip map [1024,2048,4096,8192,16384,32768,65536,131072,262144] $ \n ->
      bgroup (show n) $
         bench "Full" (whnf fourierFull n) :
         (flip map [1,2,4,8,16,32,64,128,256,512] $ \m ->
            bench (show m) $ whnf (fourierBatch n) m))
   ++
   (flip map [1000,1200,1400,1600,1800,2000,5000,10000,20000,50000,100000] $
         \n ->
      bgroup (show n) $
         bench "Full" (whnf fourierFull n) :
         (flip map [1,2,4,5,8,10,20,50,100,200] $ \m ->
            bench (show m) $ whnf (fourierBatch n) m))


fourierFull2Batch ::
   Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float)
fourierFull2Batch n = fourierBatch n 2

fourierFullBatch2 ::
   Int -> Vector (Shape.ZeroBased Int, Shape.Cyclic Int) (Complex Float)
fourierFullBatch2 n = fourierBatch n (div n 2)

{- ToDo:
this is prohibitively slow - benchmark twiddle factor generation
taking 10 coefficients requires twice as much time as taking 100 coefficients
-}
fourierPartial ::
   Int -> Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float)
fourierPartial n m mcut =
   TrafoFourier.takeDivisorFourier Rank1.Forward mcut m $
   Vector.unit (Shape.Cyclic n) 1

fourierPartialNaive ::
   Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float)
fourierPartialNaive n k =
   TrafoNaive.takeFourier Rank1.Forward (Shape.ZeroBased k) $
   Vector.unit (Shape.Cyclic n) 1

unitVector :: Int -> Vector (Shape.Cyclic Int) (Complex Float)
unitVector = flip Vector.unit 1 . Shape.Cyclic

transpose2Rows ::
   Int -> Vector (Shape.Cyclic Int, Shape.ZeroBased Int) (Complex Float)
transpose2Rows =
   Matrix.transpose .
   Array.mapShape
      (\(Shape.Cyclic n) -> (Shape.ZeroBased 2, Shape.Cyclic (div n 2))) .
   flip Vector.unit 1 . Shape.Cyclic

transpose2Columns ::
   Int -> Vector (Shape.Cyclic Int, Shape.ZeroBased Int) (Complex Float)
transpose2Columns =
   Matrix.transpose .
   Array.mapShape
      (\(Shape.Cyclic n) -> (Shape.ZeroBased (div n 2), Shape.Cyclic 2)) .
   flip Vector.unit 1 . Shape.Cyclic

twiddleVector :: Int -> Int -> Vector (Shape.ZeroBased Int) (Complex Float)
twiddleVector n d =
   VectorSlice.toVector $
   Unit.twiddleVectorAutoLoop d (2*pi / fromIntegral n) n (Shape.ZeroBased n) 1

dcOffset :: Int -> Complex Float
dcOffset = Vector.sum . flip Vector.unit 1 . Shape.Cyclic

realDCOffset :: Int -> Float
realDCOffset = Vector.sum . flip Vector.unit 1 . Shape.Cyclic . (2*)

vectorAdd :: Int -> Vector (Shape.Cyclic Int) (Complex Float)
vectorAdd n =
   Vector.add
      (Vector.unit (Shape.Cyclic n) 0)
      (Vector.unit (Shape.Cyclic n) 1)

dcKnead ::
   (Vector (Shape.ZeroBased Int64) (Knead.Packed Float) ->
      Knead.Packed Float) ->
   Int -> Knead.Packed Float
dcKnead kneadSum =
   kneadSum .
--   flip Array.replicate LLVM.zero .
   flip Array.replicate (LLVM.cyclicVector $ NonEmpty.singleton 0) .
   Shape.ZeroBased . flip div 4 . fromIntegral
{-
dcKnead kneadSum n =
   kneadSum .
   (\(UncheckedArray.Array _sh fptr) ->
      UncheckedArray.Array
         (Shape.ZeroBased (div (fromIntegral n) 4))
         (castForeignPtr (fptr :: ForeignPtr (Complex Float)))) .
   flip Vector.unit 1 . Shape.ZeroBased $ n
-}

dcKneadLarge ::
   (Vector (Shape.ZeroBased Int64) (Knead.Large Float) ->
      Knead.Large Float) ->
   Int -> Knead.Large Float
dcKneadLarge kneadSum =
   kneadSum .
--   flip Array.replicate LLVM.zero .
   flip Array.replicate (LLVM.cyclicVector $ NonEmpty.singleton 0) .
   Shape.ZeroBased . flip div (4*8) . fromIntegral

bfourierBasic ::
   (Vector (Shape.ZeroBased Int64) (Knead.Packed Float) ->
      Knead.Packed Float) ->
   (Vector (Shape.ZeroBased Int64) (Knead.Large Float) ->
      Knead.Large Float) ->
   Benchmark
bfourierBasic kneadSum kneadSumLarge =
   bgroup "Basic" $
      flip map [4096,4410,4800,8192,8820,9600,16384] $ \n ->
      bgroup (show n) [
         bench "Full"       $   nf fourierFull n,
         bench "Full2Batch" $   nf fourierFull2Batch n,
         bench "FullBatch2" $   nf fourierFullBatch2 n,
         bench "Transpose2Rows"    $ nf transpose2Rows n,
         bench "Transpose2Columns" $ nf transpose2Columns n,
         bench "DCReal"    $ whnf realDCOffset n,
         bench "DCComplex" $ whnf dcOffset n,
         bench "DCKnead"   $ whnf (dcKnead kneadSum) n,
         bench "DCKneadLarge" $ whnf (dcKneadLarge kneadSumLarge) n,
         bench "VectorAdd" $ whnf vectorAdd n,
         bench "UnitVector" $ whnf unitVector n,
         bench "Twiddle4"  $ whnf (twiddleVector n) 4,
         bench "Twiddle8"  $ whnf (twiddleVector n) 8,
         bench "Twiddle16" $ whnf (twiddleVector n) 16,
         bench "Twiddle32" $ whnf (twiddleVector n) 32
      ]

bfourierPartialSmall :: Benchmark
bfourierPartialSmall =
   bgroup "PartialSmall" [
      bgroup "44100" [
         bench    "Full" $ whnf fourierFull 44100,
         bench    "1000" $ whnf (fourierPartial 44100 1050) 1000,
         bench     "100" $ whnf (fourierPartial 44100  100)  100,
         bench      "10" $ whnf (fourierPartial 44100   10)   10,
         bench       "1" $ whnf (fourierPartial 44100    1)    1,
         bench      "10" $ whnf (fourierPartialNaive 44100) 10,
         bench       "5" $ whnf (fourierPartialNaive 44100)  5,
         bench       "2" $ whnf (fourierPartialNaive 44100)  2,
         bench       "1" $ whnf (fourierPartialNaive 44100)  1,
         bench "Twiddle" $ whnf (twiddleVector 44100) 32,
         bench "DCReal"    $ whnf realDCOffset 44100,
         bench "DCComplex" $ whnf dcOffset 44100
      ]
   ]

bfourierPartial :: Benchmark
bfourierPartial =
   bgroup "Partial" [
      bgroup "441000" [
         bench "Full" $ whnf fourierFull 441000,
         bench "1000" $ whnf (fourierPartial 441000 1050) 1000,
         bench  "100" $ whnf (fourierPartial 441000  100)  100,
         bench   "10" $ whnf (fourierPartial 441000   10)   10,
         bench    "1" $ whnf (fourierPartial 441000    1)    1,
         bench   "10" $ whnf (fourierPartialNaive 441000) 10,
         bench    "5" $ whnf (fourierPartialNaive 441000)  5,
         bench    "2" $ whnf (fourierPartialNaive 441000)  2,
         bench    "1" $ whnf (fourierPartialNaive 441000)  1,
         bench "Twiddle" $ whnf (twiddleVector 441000) 32,
         bench "DCReal"    $ whnf realDCOffset 441000,
         bench "DCComplex" $ whnf dcOffset 441000
      ]
   ]

bfourier :: Benchmark
bfourier =
   bgroup "Fourier"
      [bfourierFull, bfourierBatch,
       {-bfourierBasic,-} bfourierPartialSmall, bfourierPartial]


btwiddle :: Benchmark
btwiddle =
   bgroup "Twiddle" [
      bgroup "Vector" $
         flip map [441,4410,44100,441000, 480,4800,48000,480000] $ \n ->
         bgroup (show n) $
            flip map [2,4,8,16,32,64] $ \d ->
            bench (show d) $ whnf (twiddleVector n) d
   ]


main :: IO ()
main = do
   kneadSum <- Knead.runSum
   kneadSumLarge <- Knead.runSumLarge
   defaultMain $
      bconvolution :
      bfourier :
      bgroup "Fourier" [bfourierBasic kneadSum kneadSumLarge] :
      btwiddle :
      []