{-# LANGUAGE DeriveLift #-}
{-# LANGUAGE UnboxedTuples #-}

module C.Char
  ( CharValue(..)
  , charValueFromCodeUnit
  , charValueFromCodePoint
  , charValueFromAddr
  , fromHaskellChar
  , utf8SingleByteCodeUnit
  ) where

-- base
import Data.Array.Byte
  ( ByteArray(..) )
import Data.Bits
  ( Bits(..) )
import Data.Char
  ( ord, chr )
import Data.List
  ( unfoldr )
import Data.Maybe
  ( fromMaybe )
import Data.Word
  ( Word8, Word64 )
import GHC.Exts
  ( Addr#, Int(..)
  , newByteArray#, copyAddrToByteArray#, unsafeFreezeByteArray#
  )
import GHC.Exts qualified as IsList
  ( IsList(..) )
import GHC.Generics
  ( Generic )
import GHC.ST
  ( ST(..), runST )


-- template-haskell
import Language.Haskell.TH.Syntax ( Lift )

--------------------------------------------------------------------------------

-- | A character, represented both as a Unicode code point (if applicable)
-- and as the underlying character code units.
data CharValue
  = CharValue
    { charValue :: !ByteArray
      -- ^ The **code units** for a character: some bytes whose interpretation
      -- depends on the text encoding chosen.
    , unicodeCodePoint :: !( Maybe Char )
       -- ^ The (optional) **Unicode code point** for this character. This is
       -- an unsigned integer less than @0x10FF@ that uniquely identifies a
       -- Unicode character. For example, @永@ has the Unicode code point
       -- @0x6C38@ (also written @U+6C38@).
       --
       -- The representation of this code point as actual bits depends on the
       -- text encoding, e.g. the UTF-8 encoding of @永@ is @0xE6B0B8@
       -- (or @15118520@ in decimal notation).
    }
  deriving stock ( Eq, Ord, Show, Generic, Lift )

-- | Create a 'CharValue' from an 'Addr#' (e.g. a primitive @Addr#@ literal).
charValueFromAddr :: Addr# -> Int -> Maybe Char -> CharValue
charValueFromAddr addr ( I# len ) mbUnicode = runST $ ST $
  -- Copied from 'Data.Array.Byte.addrToByteArray'
  \s -> case newByteArray# len s of
    (# s', mb #) ->
      case copyAddrToByteArray# addr mb 0# len s' of
        s'' ->
          case unsafeFreezeByteArray# mb s'' of
            (# s''', ret #) ->
              (# s''', CharValue ( ByteArray ret ) mbUnicode #)

-- | Turn a Haskell 'Char' into a @C@ 'CharValue'.
fromHaskellChar :: Char -> CharValue
fromHaskellChar c =
  CharValue
    { charValue = IsList.fromList
                $ fromMaybe [ 0xEF, 0xBF, 0xBD ] -- Unicode replacement character
                $ utf8ByteCodeUnits ( fromIntegral ( ord c ) )
    , unicodeCodePoint = Just c
      -- A Haskell 'Char' value is precisely a Unicode code point.
    }


-- | Does this character fit into a single byte?
utf8SingleByteCodeUnit :: CharValue -> Maybe Word8
utf8SingleByteCodeUnit ( CharValue { charValue = ba } ) =
  case IsList.toList ba of
    [ b ] -> Just b
    _     -> Nothing

-- | Convert a number to a 'CharValue' as raw bytes.
charValueFromCodeUnit :: ( Integral i, Bits i ) => i -> CharValue
charValueFromCodeUnit u =
   CharValue
    { charValue = IsList.fromList $ bytes u
    , unicodeCodePoint = Nothing }
  where
    bytes = reverse . unfoldr step
    step 0 = Nothing
    step x = Just (fromIntegral (x .&. 0xFF), x `shiftR` 8)
{-# INLINEABLE charValueFromCodeUnit #-}

-- | Convert a numeric Unicode code point to a 'CharValue'.
charValueFromCodePoint :: ( Integral i, Bits i ) => i -> CharValue
charValueFromCodePoint i =
  CharValue
    { charValue = IsList.fromList
                $ fromMaybe [ 0xEF, 0xBF, 0xBD ] -- Unicode replacement character
                $ utf8ByteCodeUnits ( fromIntegral i )
    , unicodeCodePoint = Just $ chr $ fromIntegral i
    }
{-# INLINEABLE charValueFromCodePoint #-}

-- | UTF-8 encoding of a Unicode code point.
utf8ByteCodeUnits :: Word64 -> Maybe [ Word8 ]
utf8ByteCodeUnits p
  | p <= 0x7F
  = Just [ fromIntegral p ]
  | p <= 0x7FF
  = Just
      [ 0xC0 .|. ( fromIntegral ( p `shiftR` 6 ) .&. 0x1F )
      , 0X80 .|. ( fromIntegral   p              .&. 0X3F )
      ]
  | p <= 0xFFFF
  = Just
      [ 0xE0 .|. ( fromIntegral ( p `shiftR` 12 ) .&. 0x0F )
      , 0x80 .|. ( fromIntegral ( p `shiftR` 6  ) .&. 0x3F )
      , 0X80 .|. ( fromIntegral   p               .&. 0X3F )
      ]
  | p <= 0x10FFFF
  = Just
      [ 0xF0 .|. ( fromIntegral ( p `shiftR` 18 ) .&. 0x07 )
      , 0x80 .|. ( fromIntegral ( p `shiftR` 12 ) .&. 0x3F )
      , 0x80 .|. ( fromIntegral ( p `shiftR` 6  ) .&. 0x3F )
      , 0X80 .|. ( fromIntegral   p               .&. 0X3F )
      ]
  | otherwise
  = Nothing