{-# OPTIONS_GHC -fno-warn-orphans -fno-warn-unused-binds #-} {-# LANGUAGE NoImplicitPrelude, UndecidableInstances, FlexibleInstances, FlexibleContexts, BangPatterns #-} {-# LANGUAGE TemplateHaskell, QuasiQuotes, UnicodeSyntax #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ForeignFunctionInterface, CPP #-} #if __GLASGOW_HASKELL__ < 710 {-# LANGUAGE OverlappingInstances #-} #endif -- | A usable regular expressions library on top of pcre-light. module Text.Regex.PCRE.Heavy ( -- * Matching (=~) , (≈) , scan , scanO , scanRanges , scanRangesO -- * Replacement , RegexReplacement , sub , subO , gsub , gsubO -- * Splitting , split , splitO -- * QuasiQuoter , re , mkRegexQQ -- * Building regexes , escape -- * Types and stuff from pcre-light , Regex , PCREOption , PCRE.compileM -- * Advanced raw stuff , rawMatch , rawSub ) where import Prelude.Compat import Language.Haskell.TH hiding (match) import Language.Haskell.TH.Quote import Language.Haskell.TH.Syntax import qualified Text.Regex.PCRE.Light as PCRE import Text.Regex.PCRE.Light.Base import Data.Maybe (isJust, fromMaybe) import Data.List (unfoldr, mapAccumL) import qualified Data.List.NonEmpty as NE import Data.String.Conversions import Data.String.Conversions.Monomorphic import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Internal as BS import System.IO.Unsafe (unsafePerformIO) import Foreign (withForeignPtr, allocaBytes, nullPtr, plusPtr, peekElemOff) -- $setup -- >>> :set -XQuasiQuotes -XFlexibleContexts -XOverloadedStrings -- >>> :m + Text.Regex.PCRE.Heavy -- >>> import qualified Text.Regex.PCRE.Light as PCRE substr ∷ SBS → (Int, Int) → SBS substr s (f, t) = BS.take (t - f) . BS.drop f $ s behead ∷ NE.NonEmpty a → (a, [a]) behead l = (NE.head l, NE.tail l) reMatch ∷ ConvertibleStrings a SBS ⇒ Regex → a → Bool reMatch r s = isJust $ PCRE.match r (cs s) [] -- | Checks whether a string matches a regex. -- -- >>> "https://val.packett.cool" =~ [re|^http.*|] -- True (=~), (≈) ∷ ConvertibleStrings a SBS ⇒ a → Regex → Bool (=~) = flip reMatch -- | Same as =~. (≈) = (=~) -- | Does raw PCRE matching (you probably shouldn't use this directly). -- -- >>> rawMatch [re|\w{2}|] "a a ab abc ba" 0 [] -- Just [(4,6)] -- >>> rawMatch [re|\w{2}|] "a a ab abc ba" 6 [] -- Just [(7,9)] -- >>> rawMatch [re|(\w)(\w)|] "a a ab abc ba" 0 [] -- Just [(4,6),(4,5),(5,6)] rawMatch ∷ Regex → SBS → Int → [PCREExecOption] → Maybe [(Int, Int)] rawMatch r@(Regex pcreFp _) s offset opts = unsafePerformIO $ do withForeignPtr pcreFp $ \pcrePtr → do let nCapt = PCRE.captureCount r ovecSize = (nCapt + 1) * 3 ovecBytes = ovecSize * size_of_cint allocaBytes ovecBytes $ \ovec → do let (strFp, off, len) = BS.toForeignPtr s withForeignPtr strFp $ \strPtr → do results ← c_pcre_exec pcrePtr nullPtr (strPtr `plusPtr` off) (fromIntegral len) (fromIntegral offset) (combineExecOptions opts) ovec (fromIntegral ovecSize) if results < 0 then return Nothing else let loop n o acc = if n == results then return $ Just $ reverse acc else do i ← peekElemOff ovec $! o j ← peekElemOff ovec (o + 1) loop (n + 1) (o + 2) ((fromIntegral i, fromIntegral j) : acc) in loop 0 0 [] nextMatch ∷ Regex → [PCREExecOption] → SBS → Int → Maybe (NE.NonEmpty (Int, Int), Int) nextMatch r opts str offset = rawMatch r str offset opts >>= NE.nonEmpty >>= \ms → return (ms, maximum $ fmap snd ms) -- | Searches the string for all matches of a given regex. -- -- >>> scan [re|\s*entry (\d+) (\w+)\s*&?|] (" entry 1 hello &entry 2 hi" :: String) -- [(" entry 1 hello &",["1","hello"]),("entry 2 hi",["2","hi"])] -- -- It is lazy! If you only need the first match, just apply 'head' (or -- 'headMay' from the "safe" library) -- no extra work will be performed! -- -- >>> take 1 $ scan [re|\s*entry (\d+) (\w+)\s*&?|] (" entry 1 hello &entry 2 hi" :: String) -- [(" entry 1 hello &",["1","hello"])] scan ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ Regex → a → [(a, [a])] scan r s = scanO r [] s -- | Exactly like 'scan', but passes runtime options to PCRE. scanO ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ Regex → [PCREExecOption] → a → [(a, [a])] scanO r opts s = map behead $ fmap (cs . substr str) <$> unfoldr (nextMatch r opts str) 0 where str = toSBS s -- | Searches the string for all matches of a given regex, like 'scan', but -- returns positions inside of the string. -- -- >>> scanRanges [re|\s*entry (\d+) (\w+)\s*&?|] (" entry 1 hello &entry 2 hi" :: String) -- [((0,17),[(7,8),(9,14)]),((17,27),[(23,24),(25,27)])] -- -- And just like 'scan', it's lazy. scanRanges ∷ ConvertibleStrings a SBS ⇒ Regex → a → [((Int, Int), [(Int, Int)])] scanRanges r s = scanRangesO r [] s -- | Exactly like 'scanRanges', but passes runtime options to PCRE. scanRangesO ∷ ConvertibleStrings a SBS ⇒ Regex → [PCREExecOption] → a → [((Int, Int), [(Int, Int)])] scanRangesO r opts s = map behead $ unfoldr (nextMatch r opts str) 0 where str = toSBS s -- | Class of types that can serve as the replacement argument in the -- 'sub' family of functions. class RegexReplacement a where performReplacement ∷ SBS → [SBS] → a → SBS -- | A replacement string. instance {-# OVERLAPPABLE #-} ConvertibleStrings a SBS ⇒ RegexReplacement a where performReplacement _ _ to = cs to -- | A function mapping the matched string and groups to a replacement string. instance (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ RegexReplacement (a → [a] → a) where performReplacement from groups replacer = cs $ replacer (cs from) (map cs groups) -- | A function mapping the matched string to a replacement string. instance (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ RegexReplacement (a → a) where performReplacement from _ replacer = cs $ replacer (cs from) -- | A function mapping the matched groups to a replacement string. instance (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ RegexReplacement ([a] → a) where performReplacement _ groups replacer = cs $ replacer (map cs groups) rawSub ∷ RegexReplacement r ⇒ Regex → r → SBS → Int → [PCREExecOption] → Maybe (SBS, Int) rawSub r t s offset opts = case rawMatch r s offset opts of Just ((begin, end):groups) → let replacement = performReplacement (substr s (begin, end)) (map (substr s) groups) t in Just (BS.concat [ substr s (0, begin) , replacement , substr s (end, BS.length s)], begin + BS.length replacement) _ → Nothing -- | Replaces the first occurence of a given regex. -- -- >>> sub [re|thing|] "world" "Hello, thing thing" :: String -- "Hello, world thing" -- -- >>> sub [re|a|] "b" "c" :: String -- "c" -- -- >>> sub [re|bad|] "xxxbad" "this is bad, right?" :: String -- "this is xxxbad, right?" -- -- You can use functions! -- A function of ConvertibleStrings SBS gets the full match. -- A function of [ConvertibleStrings SBS] gets the groups. -- A function of ConvertibleStrings SBS → [ConvertibleStrings SBS] gets both. -- -- >>> sub [re|%(\d+)(\w+)|] (\(d:w:_) -> "{" ++ d ++ " of " ++ w ++ "}" :: String) "Hello, %20thing" :: String -- "Hello, {20 of thing}" sub ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS, RegexReplacement r) ⇒ Regex → r → a → a sub r t s = subO r [] t s -- | Exactly like 'sub', but passes runtime options to PCRE. subO ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS, RegexReplacement r) ⇒ Regex → [PCREExecOption] → r → a → a subO r opts t s = fromMaybe s $ cs <$> fst <$> rawSub r t (cs s) 0 opts -- | Replaces all occurences of a given regex. -- -- See 'sub' for more documentation. -- -- >>> gsub [re|thing|] "world" "Hello, thing thing" :: String -- "Hello, world world" -- -- >>> gsub [re||] "" "Hello, world" :: String -- "Hello, world" -- -- https://codeberg.org/valpackett/pcre-heavy/issues/2 -- >>> gsub [re|good|] "bad" "goodgoodgood" :: String -- "badbadbad" -- -- >>> gsub [re|bad|] "xxxbad" "this is bad, right? bad" :: String -- "this is xxxbad, right? xxxbad" -- -- >>> gsub [re|a|] "" "aaa" :: String -- "" gsub ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS, RegexReplacement r) ⇒ Regex → r → a → a gsub r t s = gsubO r [] t s -- | Exactly like 'gsub', but passes runtime options to PCRE. gsubO ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS, RegexReplacement r) ⇒ Regex → [PCREExecOption] → r → a → a gsubO r opts t s = cs $ loop 0 str where str = toSBS s loop offset acc | offset >= l = acc | otherwise = case rawSub r t acc offset opts of Just (result, newOffset) → if newOffset == offset && l == BS.length result then acc else loop newOffset result _ → acc where l = BS.length acc -- | Splits the string using the given regex. -- -- Is lazy. -- -- >>> split [re|%(begin|next|end)%|] ("%begin%hello%next%world%end%" :: String) -- ["","hello","world",""] -- -- >>> split [re|%(begin|next|end)%|] ("" :: String) -- [""] split ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ Regex → a → [a] split r s = splitO r [] s -- | Exactly like 'split', but passes runtime options to PCRE. splitO ∷ (ConvertibleStrings SBS a, ConvertibleStrings a SBS) ⇒ Regex → [PCREExecOption] → a → [a] splitO r opts s = map cs $ map' (substr str) partRanges where map' f = foldr ((:) . f) [f (lastL, BS.length str)] -- avoiding the snoc operation (lastL, partRanges) = mapAccumL invRange 0 ranges invRange acc (xl, xr) = (xr, (acc, xl)) ranges = map fst $ scanRangesO r opts str str = toSBS s instance Lift PCREOption where -- well, the constructor isn't exported, but at least it implements Read/Show :D liftTyped o = let o' = show o in [|| read o' ∷ PCREOption ||] quoteExpRegex ∷ [PCREOption] → String → ExpQ quoteExpRegex opts txt = [| PCRE.compile (cs (txt ∷ String)) opts |] where !_ = PCRE.compile (cs txt) opts -- check at compile time -- | Returns a QuasiQuoter like 're', but with given PCRE options. mkRegexQQ ∷ [PCREOption] → QuasiQuoter mkRegexQQ opts = QuasiQuoter { quoteExp = quoteExpRegex opts , quotePat = undefined , quoteType = undefined , quoteDec = undefined } -- | A QuasiQuoter for regular expressions that does a compile time check. re ∷ QuasiQuoter re = mkRegexQQ [utf8] -- Metacharacters used in PCRE syntax. Taken from pcrepattern(3) man -- page. pcreMetachars ∷ SBS pcreMetachars = "\\^$.[|()?*+{" -- Start and end quote markers in PCRE syntax. startQuoteMarker, endQuoteMarker ∷ SBS startQuoteMarker = "\\Q" endQuoteMarker = "\\E" -- | Escapes the regex metacharacters in a string. In other words, -- given a string, produces a regex that matches just that string (or -- case variations of that string, if case-insenstive matching is -- enabled). -- -- >>> ("foo*bar"::String) =~ PCRE.compile (escape "foo*bar") [] -- True escape ∷ (ConvertibleStrings a SBS, ConvertibleStrings SBS a) ⇒ a → a escape = convertString . escapeSBS . convertString where escapeSBS s -- Handle the special case where \Q...\E doesn't work. | endQuoteMarker `BS.isInfixOf` s = BS.concatMap step s -- Handle the typical case. | otherwise = BS.concat [startQuoteMarker, s, endQuoteMarker] step c | c `BS.elem` pcreMetachars = BS.pack ['\\', c] | otherwise = BS.singleton c