{-# LANGUAGE CPP #-}
--------------------------------------------------------------------
-- |
-- Module   : Text.Regex.PCRE.Light
-- Copyright: Copyright (c) 2007-2008, Don Stewart
-- License  : BSD3
--
-- Maintainer:  Don Stewart <dons@galois.com>
-- Stability :  experimental
-- Portability: H98 + CPP
--
--------------------------------------------------------------------
-- 
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via strict ByteStrings.
--

module Text.Regex.PCRE.Light (

        -- * The abstract PCRE Regex type
          Regex

        -- * ByteString interface
        , compile, compileM
        , match
        , captureCount

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any     -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import Text.Regex.PCRE.Light.Base

-- Strings
import qualified Data.ByteString          as S

#if __GLASGOW_HASKELL__ >= 608
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Unsafe   as S
#else
import qualified Data.ByteString.Base     as S
#endif

import System.IO.Unsafe (unsafePerformIO)

-- Foreigns
import Foreign (newForeignPtr, withForeignPtr)
import Foreign.Ptr
import Foreign.C.Types
import Foreign.C.String
import Foreign.Storable
import Foreign.Marshal.Alloc

-- | 'compile'
--
-- Compile a perl-compatible regular expression stored in a strict bytestring.
--
-- An example
--
-- > let r = compile (pack "^(b+|a){1,2}?bc") []
--
-- Or using GHC's -XOverloadedStrings flag, and importing
-- Data.ByteString.Char8, we can avoid the pack:
--
-- > let r = compile "^(b+|a){1,2}?bc" []
--
-- If the regular expression is invalid, an exception is thrown.
-- If this is unsuitable, 'compileM' is availlable, which returns failure 
-- in a monad.
--
-- To do case insentive matching,
--
-- > compile "^(b+|a){1,2}?bc" [caseless]
--
-- Other flags are documented below.
--
-- The resulting abstract regular expression can be passed to 'match'
-- for matching against a subject string.
--
-- The arguments are:
--
-- * 'pat': A ByteString containing the regular expression to be compiled. 
--
-- * 'flags', optional bit flags. If 'Nothing' is provided, defaults are used.
--
-- Valid compile-time flags are:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
--
-- * 'bsr_unicode'     - \\R matches all Unicode line endings
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - '$' not to match newline at end
--
-- * 'dotall'          - matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be  before  newline
--
-- * 'multiline'       - '^' and '$' match newlines within data
--
-- * 'newline_any'     - Recognize any Unicode newline sequence
--
-- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- The regex is allocated via malloc on the C side, and will be
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- See 'man pcreapi for more details.
--
-- Caveats: patterns with embedded nulls, such as "\0*" seem to be
-- mishandled, as this won't currently match the subject "\0\0\0".
--
compile :: S.ByteString -> [PCREOption] -> Regex
compile :: ByteString -> [PCREOption] -> Regex
compile ByteString
s [PCREOption]
o = case ByteString -> [PCREOption] -> Either String Regex
compileM ByteString
s [PCREOption]
o of
    Right Regex
r -> Regex
r
    Left  String
e -> String -> Regex
forall a. HasCallStack => String -> a
error (String
"Text.Regex.PCRE.Light: Error in regex: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
e)

------------------------------------------------------------------------

-- | 'compileM'
-- A safe version of 'compile' with failure wrapped in an Either.
--
-- Examples,
--
-- > > compileM ".*" [] :: Either String Regex
-- > Right (Regex 0x000000004bb5b980 ".*")
--
-- > > compileM "*" [] :: Either String Regex
-- > Left "nothing to repeat"
--
compileM :: S.ByteString -> [PCREOption] -> Either String Regex
compileM :: ByteString -> [PCREOption] -> Either String Regex
compileM ByteString
str [PCREOption]
os = IO (Either String Regex) -> Either String Regex
forall a. IO a -> a
unsafePerformIO (IO (Either String Regex) -> Either String Regex)
-> IO (Either String Regex) -> Either String Regex
forall a b. (a -> b) -> a -> b
$
  ByteString
-> (CString -> IO (Either String Regex))
-> IO (Either String Regex)
forall a. ByteString -> (CString -> IO a) -> IO a
S.useAsCString ByteString
str ((CString -> IO (Either String Regex)) -> IO (Either String Regex))
-> (CString -> IO (Either String Regex))
-> IO (Either String Regex)
forall a b. (a -> b) -> a -> b
$ \CString
pattern -> do
    (Ptr CString -> IO (Either String Regex))
-> IO (Either String Regex)
forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca ((Ptr CString -> IO (Either String Regex))
 -> IO (Either String Regex))
-> (Ptr CString -> IO (Either String Regex))
-> IO (Either String Regex)
forall a b. (a -> b) -> a -> b
$ \Ptr CString
errptr       -> do
    (Ptr CInt -> IO (Either String Regex)) -> IO (Either String Regex)
forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca ((Ptr CInt -> IO (Either String Regex))
 -> IO (Either String Regex))
-> (Ptr CInt -> IO (Either String Regex))
-> IO (Either String Regex)
forall a b. (a -> b) -> a -> b
$ \Ptr CInt
erroffset    -> do
        Ptr PCRE
pcre_ptr <- CString
-> PCREOption
-> Ptr CString
-> Ptr CInt
-> Ptr Word8
-> IO (Ptr PCRE)
c_pcre_compile CString
pattern ([PCREOption] -> PCREOption
combineOptions [PCREOption]
os) Ptr CString
errptr Ptr CInt
erroffset Ptr Word8
forall a. Ptr a
nullPtr
        if Ptr PCRE
pcre_ptr Ptr PCRE -> Ptr PCRE -> Bool
forall a. Eq a => a -> a -> Bool
== Ptr PCRE
forall a. Ptr a
nullPtr
            then do
                String
err <- CString -> IO String
peekCString (CString -> IO String) -> IO CString -> IO String
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< Ptr CString -> IO CString
forall a. Storable a => Ptr a -> IO a
peek Ptr CString
errptr
                Either String Regex -> IO (Either String Regex)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (String -> Either String Regex
forall a b. a -> Either a b
Left String
err)
            else do
                ForeignPtr PCRE
reg <- FinalizerPtr PCRE -> Ptr PCRE -> IO (ForeignPtr PCRE)
forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr FinalizerPtr PCRE
forall a. FinalizerPtr a
finalizerFree Ptr PCRE
pcre_ptr -- release with free()
                Either String Regex -> IO (Either String Regex)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (Regex -> Either String Regex
forall a b. b -> Either a b
Right (ForeignPtr PCRE -> ByteString -> Regex
Regex ForeignPtr PCRE
reg ByteString
str))

-- Possible improvements: an 'IsString' instance could be defined
-- for 'Regex', which would allow the compiler to insert calls to
-- 'compile' based on the type:
--
-- The following would be valid:
--
-- > match "a.*b" "abcdef" []
--
-- and equivalent to:
--
-- > match (either error id (compile "a.*b")) "abcdef" []

-- | 'match'
--
-- Matches a compiled regular expression against a given subject string,
-- using a matching algorithm that is similar to Perl's. If the subject
-- string doesn't match the regular expression, 'Nothing' is returned,
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- The arguments are:
--
-- * 'regex', a PCRE regular expression value produced by compile
--
-- * 'subject', the subject string to match against
--
-- * 'options', an optional set of exec-time flags to exec.
--
-- Available runtime options are:
--
-- * 'exec_anchored'        - Match only at the first position
--
-- * 'exec_newline_any'     - Recognize any Unicode newline sequence
--
-- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'exec_newline_cr'      - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'      - Set LF as the newline sequence
--
-- * 'exec_notbol'          - Subject is not the beginning of a line
--
-- * 'exec_noteol'          - Subject is not the end of a line
--
-- * 'exec_notempty'        - An empty string is not a valid match
--
-- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
--
-- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The result value, and any captured subpatterns, are returned.
-- If the regex is invalid, or the subject string is empty, Nothing
-- is returned.
--
match :: Regex -> S.ByteString -> [PCREExecOption] -> Maybe [S.ByteString]
match :: Regex -> ByteString -> [PCREExecOption] -> Maybe [ByteString]
match (Regex ForeignPtr PCRE
pcre_fp ByteString
_) ByteString
subject [PCREExecOption]
os = IO (Maybe [ByteString]) -> Maybe [ByteString]
forall a. IO a -> a
unsafePerformIO (IO (Maybe [ByteString]) -> Maybe [ByteString])
-> IO (Maybe [ByteString]) -> Maybe [ByteString]
forall a b. (a -> b) -> a -> b
$ do
  ForeignPtr PCRE
-> (Ptr PCRE -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString])
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr PCRE
pcre_fp ((Ptr PCRE -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString]))
-> (Ptr PCRE -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString])
forall a b. (a -> b) -> a -> b
$ \Ptr PCRE
pcre_ptr -> do
    Int
n_capt <- Ptr PCRE -> IO Int
forall {b}. Num b => Ptr PCRE -> IO b
captureCount' Ptr PCRE
pcre_ptr

    -- The smallest  size  for ovector that will allow for n captured
    -- substrings, in addition to the offsets  of  the  substring
    -- matched by the whole pattern, is (n+1)*3. (man pcreapi)

    let ovec_size :: Int
ovec_size = (Int
n_capt Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
3
        ovec_bytes :: Int
ovec_bytes = Int
ovec_size Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
size_of_cint

    Int
-> (Ptr CInt -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString])
forall a b. Int -> (Ptr a -> IO b) -> IO b
allocaBytes Int
ovec_bytes ((Ptr CInt -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString]))
-> (Ptr CInt -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString])
forall a b. (a -> b) -> a -> b
$ \Ptr CInt
ovec -> do

        let (ForeignPtr Word8
str_fp, Int
off, Int
len) = ByteString -> (ForeignPtr Word8, Int, Int)
S.toForeignPtr ByteString
subject
        ForeignPtr Word8
-> (Ptr Word8 -> IO (Maybe [ByteString]))
-> IO (Maybe [ByteString])
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
str_fp ((Ptr Word8 -> IO (Maybe [ByteString])) -> IO (Maybe [ByteString]))
-> (Ptr Word8 -> IO (Maybe [ByteString]))
-> IO (Maybe [ByteString])
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
cstr -> do
            CInt
r <- Ptr PCRE
-> Ptr PCRE
-> Ptr Word8
-> CInt
-> CInt
-> PCREExecOption
-> Ptr CInt
-> CInt
-> IO CInt
c_pcre_exec
                         Ptr PCRE
pcre_ptr
                         Ptr PCRE
forall a. Ptr a
nullPtr
                         (Ptr Word8
cstr Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
off) -- may contain binary zero bytes.
                         (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len)
                         CInt
0
                         ([PCREExecOption] -> PCREExecOption
combineExecOptions [PCREExecOption]
os)
                         Ptr CInt
ovec
                         (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
ovec_size)

            if CInt
r CInt -> CInt -> Bool
forall a. Ord a => a -> a -> Bool
< CInt
0 -- errors, or error_no_match
                then Maybe [ByteString] -> IO (Maybe [ByteString])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe [ByteString]
forall a. Maybe a
Nothing
                else let loop :: CInt -> Int -> [ByteString] -> IO (Maybe [ByteString])
loop CInt
n Int
o [ByteString]
acc =
                            if CInt
n CInt -> CInt -> Bool
forall a. Eq a => a -> a -> Bool
== CInt
r
                              then Maybe [ByteString] -> IO (Maybe [ByteString])
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ([ByteString] -> Maybe [ByteString]
forall a. a -> Maybe a
Just ([ByteString] -> [ByteString]
forall a. [a] -> [a]
reverse [ByteString]
acc))
                              else do
                                    CInt
i <- Ptr CInt -> Int -> IO CInt
forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr CInt
ovec (Int -> IO CInt) -> Int -> IO CInt
forall a b. (a -> b) -> a -> b
$! Int
o
                                    CInt
j <- Ptr CInt -> Int -> IO CInt
forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr CInt
ovec (Int
oInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
                                    let s :: ByteString
s = CInt -> CInt -> ByteString -> ByteString
substring CInt
i CInt
j ByteString
subject
                                    ByteString
s ByteString -> IO (Maybe [ByteString]) -> IO (Maybe [ByteString])
forall a b. a -> b -> b
`seq` CInt -> Int -> [ByteString] -> IO (Maybe [ByteString])
loop (CInt
nCInt -> CInt -> CInt
forall a. Num a => a -> a -> a
+CInt
1) (Int
oInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
2) (ByteString
s ByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
: [ByteString]
acc)
                     in CInt -> Int -> [ByteString] -> IO (Maybe [ByteString])
loop CInt
0 Int
0 []

    -- The  first  two-thirds  of ovec is used to pass back captured
    -- substrings When  a  match  is  successful, information about captured
    -- substrings is returned in pairs of integers,  starting  at the
    -- beginning of ovector, and continuing up to two-thirds of its length at
    -- the most.  The first pair, ovector[0] and ovector[1], identify the
    -- portion of the subject string matched  by  the entire pattern.  The next
    -- pair is used for the first capturing subpattern,  and  so on.  The
    -- value returned  by pcre_exec() is one more than the highest num- bered
    -- pair that has been set. For  example,  if  two  sub- strings  have been
    -- captured, the returned value is 3. 

  where
    -- The first element of a pair is set  to  the offset of the first
    -- character in a substring, and the second is set to the offset of the
    -- first character after  the  end of a substring.
    substring :: CInt -> CInt -> S.ByteString -> S.ByteString
    substring :: CInt -> CInt -> ByteString -> ByteString
substring CInt
x CInt
y ByteString
_ | CInt
x CInt -> CInt -> Bool
forall a. Eq a => a -> a -> Bool
== CInt
y = ByteString
S.empty -- XXX an unset subpattern
    substring CInt
a CInt
b ByteString
s = ByteString
end -- note that we're not checking...
        where
            start :: ByteString
start = Int -> ByteString -> ByteString
S.unsafeDrop (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CInt
a) ByteString
s
            end :: ByteString
end   = Int -> ByteString -> ByteString
S.unsafeTake (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt
bCInt -> CInt -> CInt
forall a. Num a => a -> a -> a
-CInt
a)) ByteString
start


captureCount :: Regex -> Int
captureCount :: Regex -> Int
captureCount (Regex ForeignPtr PCRE
pcre_fp ByteString
_) = IO Int -> Int
forall a. IO a -> a
unsafePerformIO (IO Int -> Int) -> IO Int -> Int
forall a b. (a -> b) -> a -> b
$ do
  ForeignPtr PCRE -> (Ptr PCRE -> IO Int) -> IO Int
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr PCRE
pcre_fp ((Ptr PCRE -> IO Int) -> IO Int) -> (Ptr PCRE -> IO Int) -> IO Int
forall a b. (a -> b) -> a -> b
$ \Ptr PCRE
pcre_ptr -> do
    Ptr PCRE -> IO Int
forall {b}. Num b => Ptr PCRE -> IO b
captureCount' Ptr PCRE
pcre_ptr

captureCount' :: Ptr PCRE -> IO b
captureCount' Ptr PCRE
pcre_fp =
    (Ptr CInt -> IO b) -> IO b
forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca ((Ptr CInt -> IO b) -> IO b) -> (Ptr CInt -> IO b) -> IO b
forall a b. (a -> b) -> a -> b
$ \Ptr CInt
n_ptr -> do -- (st :: Ptr CInt)
      Ptr PCRE -> Ptr PCRE -> CInt -> Ptr CInt -> IO CInt
forall a. Ptr PCRE -> Ptr PCRE -> CInt -> Ptr a -> IO CInt
c_pcre_fullinfo Ptr PCRE
pcre_fp Ptr PCRE
forall a. Ptr a
nullPtr CInt
info_capturecount Ptr CInt
n_ptr
      b -> IO b
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (b -> IO b) -> (CInt -> b) -> CInt -> IO b
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CInt -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt -> IO b) -> IO CInt -> IO b
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< Ptr CInt -> IO CInt
forall a. Storable a => Ptr a -> IO a
peek (Ptr CInt
n_ptr :: Ptr CInt)