{-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-}
module Data.SearchEngine.DocTermIds (
    DocTermIds,
    TermId,
    fieldLength,
    fieldTermCount,
    fieldElems,
    create,
    denseTable,
    vecIndexIx,
    vecCreateIx,
  ) where

import Data.SearchEngine.TermBag (TermBag, TermId)
import qualified Data.SearchEngine.TermBag as TermBag

import Data.Vector (Vector, (!))
import qualified Data.Vector as Vec
import qualified Data.Vector.Unboxed as UVec
import Data.Ix (Ix)
import qualified Data.Ix as Ix


-- | The 'TermId's for the 'Term's that occur in a document. Documents may have
-- multiple fields and the 'DocTermIds' type holds them separately for each
-- field.
--
-- The wrapped vector holds one 'TermBag' per field value. It is built with
-- 'vecCreateIx' and read with 'vecIndexIx', so a field's slot is its 'Ix'
-- index within @(minBound, maxBound)@. The @field@ parameter is phantom: it
-- only records which field type the vector is indexed by.
--
newtype DocTermIds field = DocTermIds (Vector TermBag)
  deriving (Show)

-- | Fetch the 'TermBag' stored for one field of the document.
getField :: (Ix field, Bounded field) => DocTermIds field -> field -> TermBag
getField (DocTermIds bags) field = bags `vecIndexIx` field

-- | Build a 'DocTermIds' from a function giving the term ids of each field.
-- The function is applied to every value of the field type and each result
-- list is turned into a 'TermBag' with 'TermBag.fromList'.
create :: (Ix field, Bounded field) =>
          (field -> [TermId]) -> DocTermIds field
create termIdsOf = DocTermIds fieldBags
  where
    fieldBags = vecCreateIx (\field -> TermBag.fromList (termIdsOf field))

-- | The number of terms in a field within the document, i.e. the
-- 'TermBag.size' of that field's 'TermBag'.
fieldLength :: (Ix field, Bounded field) => DocTermIds field -> field -> Int
fieldLength docterms = TermBag.size . getField docterms

-- | /O(log n)/ The frequency of a particular term in a field within the
-- document. The count reported by 'TermBag.termCount' is widened to 'Int'.
--
fieldTermCount :: (Ix field, Bounded field) =>
                  DocTermIds field -> field -> TermId -> Int
fieldTermCount docterms field =
    fromIntegral . TermBag.termCount (getField docterms field)

-- | The 'TermId's held in a field's 'TermBag', as listed by 'TermBag.elems'.
fieldElems :: (Ix field, Bounded field) => DocTermIds field -> field -> [TermId]
fieldElems docterms = TermBag.elems . getField docterms

-- | The 'DocTermIds' is really a sparse 2d array, and doing lookups with
-- 'fieldTermCount' has an /O(log n)/ cost. This function converts to a dense
-- tabular representation which then enables linear scans.
--
-- The result is a triple of:
--
-- * the length of the term-id vector produced by 'TermBag.denseTable';
--
-- * a lookup from a position in that vector to its 'TermId';
--
-- * a lookup from a position and a field to the count stored for that pair.
--   Counts live in one flat vector and are addressed as
--   @fieldIndex * numTerms + position@.
--
denseTable :: (Ix field, Bounded field) => DocTermIds field ->
              (Int, Int -> TermId, Int -> field -> Int)
denseTable (DocTermIds fieldVec) =
    -- Matching with bang patterns forces both vectors (and then the length)
    -- before the result tuple is built.
    case TermBag.denseTable (Vec.toList fieldVec) of
      (!termids, !termcounts) ->
        let !numTerms = UVec.length termids
         in ( numTerms
            , (termids UVec.!)
            , \i ix ->
                let fieldIx = Ix.index (minBound, maxBound) ix
                 in fromIntegral (termcounts UVec.! (fieldIx * numTerms + i))
            )

---------------------------------
-- Vector indexed by Ix Bounded
--

-- | Index a vector by a bounded 'Ix' value. The position used is the value's
-- 'Ix.index' within the full range @(minBound, maxBound)@ of its type.
vecIndexIx  :: (Ix ix, Bounded ix) => Vector a -> ix -> a
vecIndexIx vec ix = vec ! offset
  where
    offset = Ix.index (minBound, maxBound) ix

-- | Build a vector with one element for every value of a bounded 'Ix' type,
-- in 'Ix.range' order over @(minBound, maxBound)@. Each element is evaluated
-- to weak head normal form before it is added to the list handed to
-- 'Vec.fromListN'.
vecCreateIx :: (Ix ix, Bounded ix) => (ix -> a) -> Vector a
vecCreateIx f =
    Vec.fromListN (Ix.rangeSize bounds) (foldr consForced [] (Ix.range bounds))
  where
    bounds = (minBound, maxBound)

    -- Force the element before producing its cons cell.
    consForced ix rest = let !y = f ix in y : rest