{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DoAndIfThenElse #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE EmptyDataDecls #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE NoImplicitPrelude #-}

-------------------------------------------------------------------------------
-- |
-- Module      :  GHC.Event.Windows.FFI
-- Copyright   :  (c) Tamar Christina 2019
-- License     :  BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  stable
-- Portability :  non-portable
--
-- WinIO Windows API Foreign Function imports
--
-------------------------------------------------------------------------------

module GHC.Event.Windows.FFI (
    -- * IOCP
    IOCP(..),
    CompletionKey,
    newIOCP,
    associateHandleWithIOCP,
    getQueuedCompletionStatusEx,
    postQueuedCompletionStatus,
    getOverlappedResult,

    -- * Completion Data
    CompletionData(..),
    CompletionCallback,
    withRequest,

    -- * Overlapped
    OVERLAPPED,
    LPOVERLAPPED,
    OVERLAPPED_ENTRY(..),
    LPOVERLAPPED_ENTRY,
    HASKELL_OVERLAPPED,
    LPHASKELL_OVERLAPPED,
    allocOverlapped,
    zeroOverlapped,
    pokeOffsetOverlapped,
    overlappedIOStatus,
    overlappedIONumBytes,

    -- * Cancel pending I/O
    cancelIoEx,
    cancelIoEx',

    -- * Monotonic time

    -- ** GetTickCount
    getTickCount64,

    -- ** QueryPerformanceCounter
    queryPerformanceCounter,
    queryPerformanceFrequency,

    -- ** Miscellaneous
    throwWinErr,
    setLastError
) where

#include <ntstatus.h>
#include <windows.h>
#include "winio_structs.h"

##include "windows_cconv.h"

import Data.Maybe
import Foreign
import GHC.Base
import GHC.Num ((*))
import GHC.Real (fromIntegral)
import GHC.Show
import GHC.Windows
import qualified GHC.Event.Array as A
import qualified GHC.Windows     as Win32
import GHC.IO.Handle.Internals (debugIO)

------------------------------------------------------------------------
-- IOCP

-- | An I/O completion port.
newtype IOCP = IOCP HANDLE
    deriving (Eq, Ord, Show)

type CompletionKey = ULONG_PTR

-- | This function has two distinct purposes depending on the value of
-- The completion port handle:
--
--  - When the IOCP port is NULL then the function creates a new I/O completion
--    port.  See `newIOCP`.
--
--  - When The port contains a valid handle then the given handle is
--    associated with he given completion port handle.  Once associated it
--    cannot be easily changed.  Associating a Handle with a Completion Port
--    allows the I/O manager's worker threads to handle requests to the given
--    handle.
foreign import WINDOWS_CCONV unsafe "windows.h CreateIoCompletionPort"
    c_CreateIoCompletionPort :: HANDLE -> IOCP -> ULONG_PTR -> DWORD
                             -> IO IOCP

-- | Create a new I/O completion port.
newIOCP :: IO IOCP
newIOCP = failIf (== IOCP nullPtr) "newIOCP" $
          c_CreateIoCompletionPort iNVALID_HANDLE_VALUE (IOCP nullPtr) 0 0

-- | Associate a HANDLE with an I/O completion port.
associateHandleWithIOCP :: IOCP -> HANDLE -> CompletionKey -> IO ()
associateHandleWithIOCP iocp handle completionKey =
    failIf_ (/= iocp) "associateHandleWithIOCP" $
        c_CreateIoCompletionPort handle iocp completionKey 0

foreign import WINDOWS_CCONV safe "windows.h GetOverlappedResult"
    c_GetOverlappedResult :: HANDLE -> LPOVERLAPPED -> Ptr DWORD -> BOOL
                          -> IO BOOL

-- | Get the result of a single overlap operation without the IO manager
getOverlappedResult :: HANDLE -> Ptr OVERLAPPED -> BOOL -> IO (Maybe DWORD)
getOverlappedResult handle lp block
  = alloca $ \bytes ->
        do res <- c_GetOverlappedResult handle lp bytes block
           if res
              then fmap Just $ peek bytes
              else return Nothing

foreign import WINDOWS_CCONV safe "windows.h GetQueuedCompletionStatusEx"
    c_GetQueuedCompletionStatusEx :: IOCP -> LPOVERLAPPED_ENTRY -> Word32
                                  -> Ptr ULONG -> DWORD -> BOOL -> IO BOOL

-- | Note [Completion Ports]
--   ~~~~~~~~~~~~~~~~~~~~~~~
-- When an I/O operation has been queued by an operation
-- (ReadFile/WriteFile/etc) it is placed in a queue that the driver uses when
-- servicing IRQs.  This queue has some important properties:
--
-- 1.) It is not an ordered queue.  Requests may be performed out of order as
--     as the OS's native I/O manager may try to re-order requests such that as
--     few random seeks as possible are needed to complete the pending
--     operations.  As such do not assume a fixed order between something being
--     queued and dequeued.
--
-- 2.) Operations may skip the queue entirely.  In which case they do not end in
--     in this function. (This is an optimization flag we have turned on. See
--     `openFile`.)
--
-- 3.) Across this call the specified OVERLAPPED_ENTRY buffer MUST remain live,
--     and the buffer for an I/O operation cannot be freed or moved until
--     `getOverlappedResult` says it's done.  The reason is the kernel may not
--     have fully released the buffer, or finished writing to it when this
--     operation returns.  Failure to adhere to this will cause your IRQs to be
--     silently dropped and your program will never receive a completion for it.
--     This means that the OVERLAPPED buffer must also remain valid for the
--     duration of the call and as such must be allocated on the unmanaged heap.
--
-- 4.) When a thread calls this method it is associated with the I/O manager's
--     worker threads pool.  You should always use dedicated threads for this
--     since the OS I/O manager will now monitor the threads.  If the thread
--     becomes blocked for whatever reason, the Haskell I/O manager will wake up
--     another threads from it's pool to service the remaining results.
--     A new thread will also be woken up from the pool when the previous thread
--     is busy servicing requests and new requests have finished.  For this
--     reason the Haskell I/O manager multiplexes I/O operations from N haskell
--     threads into 1 completion port, which is serviced by M native threads in
--     an asynchronous method. This allows it to scale efficiently.
getQueuedCompletionStatusEx :: IOCP
                            -> A.Array OVERLAPPED_ENTRY
                            -> DWORD  -- ^ Timeout in milliseconds (or
                                      -- 'GHC.Windows.iNFINITE')
                            -> IO Int
getQueuedCompletionStatusEx iocp arr timeout =
    alloca $ \num_removed_ptr ->do
        A.unsafeLoad arr $ \oes cap -> do
            -- TODO: remove after debugging
            fillBytes oes 0 (cap * (sizeOf (undefined :: OVERLAPPED_ENTRY)))
            debugIO $ "-- call getQueuedCompletionStatusEx "
            -- don't block the call if the rts is not supporting threads.
            -- this would block the entire program.
            let alertable = False -- not rtsSupportsBoundThreads
            ok <- c_GetQueuedCompletionStatusEx iocp oes (fromIntegral cap)
                  num_removed_ptr timeout alertable
            debugIO $ "-- call getQueuedCompletionStatusEx: " ++ show ok
            err <- getLastError
            nc <- (peek num_removed_ptr)
            debugIO $ "-- getQueuedCompletionStatusEx: n=" ++ show nc ++ " ,err=" ++ show err
            if ok then fromIntegral `fmap` peek num_removed_ptr
            else do debugIO $ "failed getQueuedCompletionStatusEx: " ++ show err
                    if err == #{const WAIT_TIMEOUT} || alertable then return 0
                    else failWith "GetQueuedCompletionStatusEx" err

overlappedIOStatus :: LPOVERLAPPED -> IO NTSTATUS
overlappedIOStatus lpol = do
  status <- #{peek OVERLAPPED, Internal} lpol
  -- TODO: Map NTSTATUS to ErrCode?
  -- See https://github.com/libuv/libuv/blob/b12624c13693c4d29ca84b3556eadc9e9c0936a4/src/win/winsock.c#L153
  return status
{-# INLINE overlappedIOStatus #-}

overlappedIONumBytes :: LPOVERLAPPED -> IO ULONG_PTR
overlappedIONumBytes lpol = do
  bytes <- #{peek OVERLAPPED, InternalHigh} lpol
  return bytes
{-# INLINE overlappedIONumBytes #-}

foreign import WINDOWS_CCONV unsafe "windows.h PostQueuedCompletionStatus"
    c_PostQueuedCompletionStatus :: IOCP -> DWORD -> ULONG_PTR -> LPOVERLAPPED
                                 -> IO BOOL

-- | Manually post a completion to the specified I/O port.  This will wake up
-- a thread waiting `GetQueuedCompletionStatusEx`.
postQueuedCompletionStatus :: IOCP -> DWORD -> CompletionKey -> LPOVERLAPPED
                           -> IO ()
postQueuedCompletionStatus iocp numBytes completionKey lpol =
    failIfFalse_ "PostQueuedCompletionStatus" $
    c_PostQueuedCompletionStatus iocp numBytes completionKey lpol

------------------------------------------------------------------------
-- Completion Data

-- | Called when the completion is delivered.
type CompletionCallback a = ErrCode   -- ^ 0 indicates success
                          -> DWORD     -- ^ Number of bytes transferred
                          -> IO a

-- | Callback type that will be called when an I/O operation completes.
type IOCallback = CompletionCallback ()

-- | Structure that the I/O manager uses to associate callbacks with
-- additional payload such as their OVERLAPPED structure and Win32 handle
-- etc.  *Must* be kept in sync with that in `winio_structs.h` or horrible things
-- happen.
--
-- We keep the handle around for the benefit of ghc-external libraries making
-- use of the manager.
data CompletionData = CompletionData { cdHandle   :: !HANDLE
                                     , cdCallback :: !(StablePtr IOCallback)
                                     }

instance Storable CompletionData where
    sizeOf _    = #{size CompletionData}
    alignment _ = #{alignment CompletionData}

    peek ptr = do
      cdCallback <- #{peek CompletionData, cdCallback} ptr
      cdHandle   <- #{peek CompletionData, cdHandle} ptr
      let !cd = CompletionData{..}
      return cd

    poke ptr CompletionData{..} = do
      #{poke CompletionData, cdCallback} ptr cdCallback
      #{poke CompletionData, cdHandle} ptr cdHandle

------------------------------------------------------------------------
-- Overlapped

-- | Tag type for @LPOVERLAPPED@.
data OVERLAPPED

-- | Tag type for the extended version of @OVERLAPPED@ containg some book
--   keeping information.
data HASKELL_OVERLAPPED

-- | Identifies an I/O operation.  Used as the @LPOVERLAPPED@ parameter
-- for overlapped I/O functions (e.g. @ReadFile@, @WSASend@).
type LPOVERLAPPED = Ptr OVERLAPPED

-- | Pointer to the extended HASKELL_OVERLAPPED function.
type LPHASKELL_OVERLAPPED = Ptr HASKELL_OVERLAPPED

-- | An array of these is passed to GetQueuedCompletionStatusEx as an output
-- argument.
data OVERLAPPED_ENTRY = OVERLAPPED_ENTRY {
      lpCompletionKey            :: ULONG_PTR,
      lpOverlapped               :: LPOVERLAPPED,
      dwNumberOfBytesTransferred :: DWORD
    }

type LPOVERLAPPED_ENTRY = Ptr OVERLAPPED_ENTRY

instance Storable OVERLAPPED_ENTRY where
    sizeOf _    = #{size OVERLAPPED_ENTRY}
    alignment _ = #{alignment OVERLAPPED_ENTRY}

    peek ptr = do
      lpCompletionKey <- #{peek OVERLAPPED_ENTRY, lpCompletionKey} ptr
      lpOverlapped    <- #{peek OVERLAPPED_ENTRY, lpOverlapped} ptr
      dwNumberOfBytesTransferred <-
          #{peek OVERLAPPED_ENTRY, dwNumberOfBytesTransferred} ptr
      let !oe = OVERLAPPED_ENTRY{..}
      return oe

    poke ptr OVERLAPPED_ENTRY{..} = do
      #{poke OVERLAPPED_ENTRY, lpCompletionKey} ptr lpCompletionKey
      #{poke OVERLAPPED_ENTRY, lpOverlapped} ptr lpOverlapped
      #{poke OVERLAPPED_ENTRY, dwNumberOfBytesTransferred}
        ptr dwNumberOfBytesTransferred

-- | Allocate a new
-- <http://msdn.microsoft.com/en-us/library/windows/desktop/ms684342%28v=vs.85%29.aspx
-- OVERLAPPED> structure on the unmanaged heap. This also zeros the memory to
-- prevent the values inside the struct to be incorrectlt interpreted as data
-- payload.
--
-- We extend the overlapped structure with some extra book keeping information
-- such that we don't have to do a lookup on the Haskell side.
--
-- Future: We can gain some performance here by using a pool instead of calling
--         malloc for each request. A simple block allocator would be very
--         useful here, especially when we implement sockets support.
allocOverlapped :: Word64 -- ^ Offset/OffsetHigh
                -> IO (Ptr HASKELL_OVERLAPPED)
allocOverlapped offset = do
  lpol <- mallocBytes #{size HASKELL_OVERLAPPED}
  zeroOverlapped lpol
  pokeOffsetOverlapped (castPtr lpol) offset
  return lpol

-- | Zero-fill an HASKELL_OVERLAPPED structure.
zeroOverlapped :: LPHASKELL_OVERLAPPED -> IO ()
zeroOverlapped lpol = fillBytes lpol 0 #{size HASKELL_OVERLAPPED}
{-# INLINE zeroOverlapped #-}

-- | Set the offset field in an OVERLAPPED structure.
pokeOffsetOverlapped :: LPOVERLAPPED -> Word64 -> IO ()
pokeOffsetOverlapped lpol offset = do
  let (offsetHigh, offsetLow) = Win32.ddwordToDwords offset
  #{poke OVERLAPPED, Offset} lpol offsetLow
  #{poke OVERLAPPED, OffsetHigh} lpol offsetHigh
{-# INLINE pokeOffsetOverlapped #-}

-- | Set the event field in an OVERLAPPED structure.
pokeEventOverlapped :: LPOVERLAPPED -> HANDLE -> IO ()
pokeEventOverlapped lpol event = do
  #{poke OVERLAPPED, hEvent} lpol event
{-# INLINE pokeEventOverlapped #-}

------------------------------------------------------------------------
-- Request management

-- Note [AsyncHandles]
-- ~~~~~~~~~~~~~~~~~~~
-- In `winio` we have designed it to work in asynchronous mode always.
-- According to the MSDN documentation[1][2], when a handle is not opened
-- in asynchronous mode then the operation would simply work but operate
-- synchronously.
--
-- This seems to happen as documented for `File` handles, but `pipes` don't
-- seem to follow this documented behavior and so are a problem.
-- Under `msys2` your standard handles are actually pipes, not console
-- handles or files.  As such running under an msys2 console causes a hang
-- as the pipe read never returns.
--
-- [1] https://docs.microsoft.com/en-us/windows/win32/fileio/synchronous-and-asynchronous-i-o
-- [2] https://docs.microsoft.com/en-us/windows/win32/sync/synchronization-and-overlapped-input-and-output
--
-- As such we need to annotate all NativeHandles with a Boolean to indicate
-- wether it's an asynchronous handle or not.
-- This allows us to manually wait for the completion instead of relying
-- on the I/O system to do the right thing.  As we have been using the
-- buffers in async mode we may not have moved the file pointer on the kernel
-- object, as such we still need to give an `OVERLAPPED` structure, but we
-- instead create an event object that we can wait on.
--
-- As documented in MSDN this even object must be in manual reset mode.  This
-- approach gives us the flexibility, with minimum impact to support both
-- synchronous and asynchronous access.
--
-- Additional approaches explored
--
-- Normally the I/O system is in full control of all Handles it creates, with
-- one big exception: inheritance.
--
-- For any `HANDLE` we inherit we don't know how it's been open.  A different
-- solution I have explored was to try to detect the `HANDLE` mode.
-- But this approach would never work for a few reasons:
--
-- 1. The presence of an asynchronous flag does not indicate that we are able
--    to handle the operation asynchronously.  In particular, just because a
--    `HANDLE` is open in async mode, it may not be associated with our
--    completion port.
-- 2. One can only associate a `HANDLE` to a *single* completion port.  As
--    such, if the handle is opened in async mode but already registered to a
--    completion port then we can't use it asynchronously.
-- 3. You can only associate a completion port once, even if it's the same
--    port.  This means were we to strap a `HANDLE` of it's `NativeHandle`
--    wrapper and then wrap it again, we can't retest as the result would be
--    invalid.  This is an issue because to pass `HANDLE`s we have to pass
--    the native OS Handle not the Haskell one. i.e. remote-iserv.

-- See Note [AsyncHandles]
withRequest :: Bool -> Word64 -> HANDLE -> IOCallback
            -> (Ptr HASKELL_OVERLAPPED -> Ptr CompletionData -> IO a)
            -> IO a
withRequest async offset hdl cb f = do
    -- Create the completion record and store it.
    -- We only need the record when we enqueue a request, however if we
    -- delay creating it then we will run into a race condition where the
    -- driver may have finished servicing the request before we were ready
    -- and so the request won't have the book keeping information to know
    -- what to do.  So because of that we always create the payload,  If we
    -- need it ok, if we don't that's no problem.  This approach prevents
    -- expensive lookups in hash-tables.
    --
    -- Todo: Use a memory pool for this so we don't have to hit malloc every
    --       time.  This would allow us to scale better.
    cb_sptr <- newStablePtr cb
    let cbData :: CompletionData
        cbData = CompletionData hdl cb_sptr
    r <- allocaBytes #{size HASKELL_OVERLAPPED} $ \hs_lpol ->
      with cbData $ \cdData -> do
        zeroOverlapped hs_lpol
        let lpol = castPtr hs_lpol
        pokeOffsetOverlapped lpol offset
        -- If doing a synchronous request then register an event object.
        -- This event object MUST be manual reset per MSDN.
        case async of
          True -> f hs_lpol cdData
          False -> do
            event <- failIfNull "withRequest (create)" $
                       c_CreateEvent nullPtr True False nullPtr
            debugIO $ "{{ event " ++ show event ++ " for " ++ show hs_lpol
            pokeEventOverlapped lpol event
            res <- f hs_lpol cdData
            -- Once the request has finished, close the object and free it.
            failIfFalse_ "withRequest (free)" $ c_CloseHandle event
            return res

    freeStablePtr cb_sptr
    return r


-- | Create an event object for use when the HANDLE isn't asynchronous
foreign import WINDOWS_CCONV unsafe "windows.h CreateEventW"
    c_CreateEvent :: Ptr () -> Bool -> Bool -> LPCWSTR -> IO HANDLE

-- | Close a handle object
foreign import WINDOWS_CCONV unsafe "windows.h CloseHandle"
    c_CloseHandle :: HANDLE -> IO Bool

------------------------------------------------------------------------
-- Cancel pending I/O

-- | CancelIo shouldn't block, but cancellation happens infrequently,
-- so we might as well be on the safe side.
foreign import WINDOWS_CCONV unsafe "windows.h CancelIoEx"
    c_CancelIoEx :: HANDLE -> LPOVERLAPPED -> IO BOOL

-- | Cancel all pending overlapped I/O for the given file that was initiated by
-- the current OS thread.  Cancelling is just a request for cancellation and
-- before the OVERLAPPED struct is freed we must make sure that the IRQ has been
-- removed from the queue.  See `getOverlappedResult`.
cancelIoEx :: HANDLE -> LPOVERLAPPED -> IO ()
cancelIoEx h o = failIfFalse_ "CancelIoEx" . c_CancelIoEx h $ o

cancelIoEx' :: HANDLE -> LPOVERLAPPED -> IO Bool
cancelIoEx' = c_CancelIoEx

------------------------------------------------------------------------
-- Monotonic time

foreign import WINDOWS_CCONV "windows.h GetTickCount64"
    c_GetTickCount64 :: IO #{type ULONGLONG}

-- | Call the @GetTickCount64@ function, which returns a monotonic time in
-- milliseconds.
--
-- Problems:
--
--  * Low resolution (10 to 16 milliseconds).
--
-- <http://msdn.microsoft.com/en-us/library/windows/desktop/ms724408%28v=vs.85%29.aspx>
getTickCount64 :: IO Word64
getTickCount64 = c_GetTickCount64

-- | Call the @QueryPerformanceCounter@ function.
--
-- Problems:
--
--  * Might not be available on some hardware.  Use 'queryPerformanceFrequency'
--    to test for availability before calling this function.
--
--  * On a multiprocessor computer, may produce different results on
--    different processors due to hardware bugs.
--
-- To get a monotonic time in seconds, divide the result of
-- 'queryPerformanceCounter' by that of 'queryPerformanceFrequency'.
--
-- <http://msdn.microsoft.com/en-us/library/windows/desktop/ms644904%28v=vs.85%29.aspx>
queryPerformanceCounter :: IO Int64
queryPerformanceCounter =
    callQP c_QueryPerformanceCounter
    >>= maybe (throwGetLastError "QueryPerformanceCounter") return

-- | Call the @QueryPerformanceFrequency@ function.  Return 'Nothing' if the
-- hardware does not provide a high-resolution performance counter.
--
-- <http://msdn.microsoft.com/en-us/library/windows/desktop/ms644905%28v=vs.85%29.aspx>
queryPerformanceFrequency :: IO (Maybe Int64)
queryPerformanceFrequency = do
    m <- callQP c_QueryPerformanceFrequency
    case m of
        Nothing   -> return Nothing
        Just 0    -> return Nothing -- Shouldn't happen; just a safeguard to
                                    -- avoid a zero denominator.
        Just freq -> return (Just freq)

type QPFunc = Ptr Int64 -> IO BOOL

foreign import WINDOWS_CCONV "Windows.h QueryPerformanceCounter"
    c_QueryPerformanceCounter :: QPFunc

foreign import WINDOWS_CCONV "Windows.h QueryPerformanceFrequency"
    c_QueryPerformanceFrequency :: QPFunc

callQP :: QPFunc -> IO (Maybe Int64)
callQP qpfunc =
    allocaBytes #{size LARGE_INTEGER} $ \ptr -> do
        ok <- qpfunc ptr
        if ok then do
            n <- #{peek LARGE_INTEGER, QuadPart} ptr
            return (Just n)
        else
            return Nothing

------------------------------------------------------------------------
-- Miscellaneous

type ULONG_PTR  = #type ULONG_PTR

throwWinErr :: String -> ErrCode -> IO a
throwWinErr loc err = do
    c_SetLastError err
    Win32.failWith loc err

setLastError :: ErrCode -> IO ()
setLastError = c_SetLastError

foreign import WINDOWS_CCONV unsafe "windows.h SetLastError"
    c_SetLastError :: ErrCode -> IO ()