{-# LINE 1 "Text/Regex/PCRE/Light/Base.hsc" #-} {-# LANGUAGE CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving #-} -------------------------------------------------------------------- -- | -- Module : Text.Regex.PCRE.Light.Base -- Copyright: Copyright (c) 2007-2008, Don Stewart -- -- Documentation based on /man pcreapi/, written by Philip Hazel, 2007. -- -- License : BSD3 -- Maintainer: Don Stewart <dons@galois.com> -- Stability : experimental -- Portability: CPP, FFI -- Tested with: GHC 6.8.2 -- -- Raw FFI bindings to PCRE functions and constants. -- module Text.Regex.PCRE.Light.Base ( -- * A PCRE structure PCRE , Regex(..) -- * C exports , c_pcre_compile , c_pcre_exec , c_pcre_fullinfo ------------------------------------------------------------------------ -- * PCRE Options, an abstract newtyped Num wrapper over a CInt , PCREOption , combineOptions , anchored , auto_callout {-, bsr_anycrlf-} {-, bsr_unicode-} , caseless , dollar_endonly , dotall , dupnames , extended , extra , firstline , multiline {-, newline_any-} {-, newline_anycrlf-} , newline_cr , newline_crlf , newline_lf , no_auto_capture , ungreedy , utf8 , no_utf8_check -- * PCRE exec-time options, an abstract, newtyped Num wrapper over CInt , PCREExecOption , combineExecOptions , exec_anchored {-, exec_newline_any , exec_newline_anycrlf-} , exec_newline_cr , exec_newline_crlf , exec_newline_lf , exec_notbol , exec_noteol , exec_notempty , exec_no_utf8_check , exec_partial ------------------------------------------------------------------------ -- * PCRE Errors , PCREError , error_nomatch , error_null , error_badoption , error_badmagic {-, error_unknown_opcode-} , error_unknown_node , error_nomemory , error_nosubstring , error_matchlimit , error_callout , error_badutf8 , error_badutf8_offset , error_partial , error_badpartial , error_internal , error_badcount , error_dfa_uitem , error_dfa_ucond , error_dfa_umlimit , error_dfa_wssize , error_dfa_recurse , error_recursionlimit {-, error_nullwslimit-} {-, error_badnewline-} -- * PCRE Info , PCREInfo , info_options , info_size , info_capturecount , info_backrefmax , info_firstbyte , info_firstchar , info_firsttable , info_lastliteral , info_nameentrysize , info_namecount , info_nametable , info_studysize , info_default_tables {-, info_okpartial-} {-, info_jchanged-} {-, info_hascrorlf-} -- * PCRE Configuration , PCREConfig , config_utf8 , config_newline , config_link_size , config_posix_malloc_threshold , config_match_limit , config_stackrecurse , config_unicode_properties , config_match_limit_recursion {-, config_bsr-} -- * PCRE Extra , PCREExtraFlags , extra_study_data , extra_match_limit , extra_callout_data , extra_tables , extra_match_limit_recursion, ------------------------------------------------------------------------ size_of_cint ) where -- Foreigns import Foreign import Foreign.Ptr import Foreign.C.Types import Foreign.C.String import qualified Data.ByteString.Char8 as S -- | Get sizeof CInt from hsc2hs size_of_cint :: Int size_of_cint :: Int size_of_cint = Int 4 {-# LINE 108 "Text/Regex/PCRE/Light/Base.hsc" #-} ------------------------------------------------------------------------ -- Types -- | An abstract pointer to a compiled PCRE Regex structure -- The structure allocated by the PCRE library will be deallocated -- automatically by the Haskell storage manager. -- data Regex = Regex {-# UNPACK #-} !(ForeignPtr PCRE) {-# UNPACK #-} !S.ByteString deriving (Regex -> Regex -> Bool (Regex -> Regex -> Bool) -> (Regex -> Regex -> Bool) -> Eq Regex forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a $c== :: Regex -> Regex -> Bool == :: Regex -> Regex -> Bool $c/= :: Regex -> Regex -> Bool /= :: Regex -> Regex -> Bool Eq, Eq Regex Eq Regex => (Regex -> Regex -> Ordering) -> (Regex -> Regex -> Bool) -> (Regex -> Regex -> Bool) -> (Regex -> Regex -> Bool) -> (Regex -> Regex -> Bool) -> (Regex -> Regex -> Regex) -> (Regex -> Regex -> Regex) -> Ord Regex Regex -> Regex -> Bool Regex -> Regex -> Ordering Regex -> Regex -> Regex forall a. Eq a => (a -> a -> Ordering) -> (a -> a -> Bool) -> (a -> a -> Bool) -> (a -> a -> Bool) -> (a -> a -> Bool) -> (a -> a -> a) -> (a -> a -> a) -> Ord a $ccompare :: Regex -> Regex -> Ordering compare :: Regex -> Regex -> Ordering $c< :: Regex -> Regex -> Bool < :: Regex -> Regex -> Bool $c<= :: Regex -> Regex -> Bool <= :: Regex -> Regex -> Bool $c> :: Regex -> Regex -> Bool > :: Regex -> Regex -> Bool $c>= :: Regex -> Regex -> Bool >= :: Regex -> Regex -> Bool $cmax :: Regex -> Regex -> Regex max :: Regex -> Regex -> Regex $cmin :: Regex -> Regex -> Regex min :: Regex -> Regex -> Regex Ord, Int -> Regex -> ShowS [Regex] -> ShowS Regex -> String (Int -> Regex -> ShowS) -> (Regex -> String) -> ([Regex] -> ShowS) -> Show Regex forall a. (Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a $cshowsPrec :: Int -> Regex -> ShowS showsPrec :: Int -> Regex -> ShowS $cshow :: Regex -> String show :: Regex -> String $cshowList :: [Regex] -> ShowS showList :: [Regex] -> ShowS Show) type PCRE = () ------------------------------------------------------------------------ -- | A type for PCRE compile-time options. These are newtyped CInts, -- which can be bitwise-or'd together, using '(Data.Bits..|.)' -- newtype PCREOption = PCREOption { PCREOption -> PCREExtraFlags unPCREOption :: PCREOption_ } {-# LINE 129 "Text/Regex/PCRE/Light/Base.hsc" #-} deriving (Eq,Ord,Show,Read) {-# LINE 131 "Text/Regex/PCRE/Light/Base.hsc" #-} -- | Combine a list of options into a single option, using bitwise (.|.) combineOptions :: [PCREOption] -> PCREOption combineOptions :: [PCREOption] -> PCREOption combineOptions = PCREExtraFlags -> PCREOption PCREOption (PCREExtraFlags -> PCREOption) -> ([PCREOption] -> PCREExtraFlags) -> [PCREOption] -> PCREOption forall b c a. (b -> c) -> (a -> b) -> a -> c . (PCREOption -> PCREExtraFlags -> PCREExtraFlags) -> PCREExtraFlags -> [PCREOption] -> PCREExtraFlags forall a b. (a -> b -> b) -> b -> [a] -> b forall (t :: * -> *) a b. Foldable t => (a -> b -> b) -> b -> t a -> b foldr (PCREExtraFlags -> PCREExtraFlags -> PCREExtraFlags forall a. Bits a => a -> a -> a (.|.) (PCREExtraFlags -> PCREExtraFlags -> PCREExtraFlags) -> (PCREOption -> PCREExtraFlags) -> PCREOption -> PCREExtraFlags -> PCREExtraFlags forall b c a. (b -> c) -> (a -> b) -> a -> c . PCREOption -> PCREExtraFlags unPCREOption) PCREExtraFlags 0 -- Now follows the user-visible options to _exec and _compile. -- To avoid type errors, we newtype the underlying CInts, and -- statically differentiate PCREOptions from non-PCREOptions -- -- The safety can still be defeated using numeric literals though, -- and other Num operations. We could do more to protect against this. -- (a smart constructor for .|.) -- | 'anchored' -- -- If this bit is set, the pattern is forced to be /anchored/, that is, -- it is constrained to match only at the first matching point in the -- string that is being searched (the /subject string/). This effect can -- also be achieved by appropriate constructs in the pattern itself, which -- is the only way to do it in Perl. -- anchored :: PCREOption anchored :: PCREOption anchored = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags anchored_cint -- | 'auto_callout' -- -- If this bit is set, "compile" automatically inserts callout -- items, all with number 255, before each pattern item. For discussion -- of the callout facility, see the man pcrecallout documentation -- auto_callout :: PCREOption auto_callout :: PCREOption auto_callout = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags auto_callout_cint -- | 'bsr_anycrlf' and 'bsr_unicode' -- -- These options (which are mutually exclusive) control what the \\R escape -- sequence matches. The choice is either to match only CR, LF, or CRLF, or to -- match any Unicode new- line sequence. The default is specified when PCRE is -- built. It can be overridden from within the pattern, or by setting an option -- when a compiled pattern is matched. -- -- bsr_anycrlf :: PCREOption -- bsr_anycrlf = PCREOption bsr_anycrlf_cint -- | 'bsr_unicode'. See 'bse_anycrlf' -- -- bsr_unicode :: PCREOption -- bsr_unicode = PCREOption bsr_unicode_cint -- | 'caseless' -- -- If this bit is set, letters in the pattern match both upper and lower case -- letters. It is equivalent to Perl's \/i option, and it can be changed within a -- pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the -- concept of case for characters whose values are less than 128, so caseless -- matching is always possible. For characters with higher values, the concept of -- case is supported if PCRE is compiled with Unicode property sup- port, but not -- otherwise. If you want to use caseless matching for characters 128 and above, -- you must ensure that PCRE is compiled with Unicode property support as well as -- with UTF-8 support. -- caseless :: PCREOption caseless :: PCREOption caseless = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags caseless_cint -- | 'dollar_endonly' -- -- If this bit is set, a dollar metacharacter in the pattern matches only at -- the end of the subject string. Without this option, a dollar also matches -- immediately before a newline at the end of the string (but not before any other -- newlines). The 'dollar_endonly' option is ignored if 'multiline' -- is set. There is no equivalent to this option in Perl, and no way to set it -- within a pattern. -- dollar_endonly :: PCREOption dollar_endonly :: PCREOption dollar_endonly = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags dollar_endonly_cint -- | 'dotall' -- -- If this bit is set, a dot metacharater in the pattern matches all -- characters, including those that indicate newline. Without it, a dot does -- not match when the current position is at a newline. This option is -- equivalent to Perl's \/s option, and it can be changed within a pattern by a -- (?s) option setting. A negative class such as [^a] always matches newline -- characters, independent of the setting of this option. -- dotall :: PCREOption dotall :: PCREOption dotall = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags dotall_cint -- | 'dupnames' -- -- If this bit is set, names used to identify capturing subpatterns need not be -- unique. This can be helpful for certain types of pattern when it is known -- that only one instance of the named subpattern can ever be matched. There are -- more details of named subpatterns in the /man pcreapi/ documentation. -- dupnames :: PCREOption dupnames :: PCREOption dupnames = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags dupnames_cint -- | 'extended' -- -- If this bit is set, whitespace data characters in the pattern are totally -- ignored except when escaped or inside a character class. Whitespace does not -- include the VT character (code 11). In addition, characters between an -- unescaped \# outside a character class and the next newline, inclusive, are -- also ignored. This is equivalent to Perl's \/x option, and it can be changed --within a pattern by a (?x) option setting. -- -- This option makes it possible to include comments inside complicated -- patterns. Note, however, that this applies only to data characters. Whitespace -- characters may never appear within special character sequences in a pattern, -- for example within the sequence (?( which introduces a conditional subpattern. -- extended :: PCREOption extended :: PCREOption extended = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags extended_cint -- | 'extra' -- -- This option was invented in order to turn on additional functionality of -- PCRE that is incompatible with Perl, but it is currently of very little use. -- When set, any backslash in a pattern that is followed by a letter that has no -- special meaning causes an error, thus reserving these combinations for future -- expansion. By default, as in Perl, a backslash followed by a letter with no -- special meaning is treated as a literal. (Perl can, however, be persuaded to -- give a warning for this.) There are at present no other features controlled by -- this option. It can also be set by a (?X) option setting within a pattern. -- extra :: PCREOption extra :: PCREOption extra = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags extra_cint -- | 'firstline' -- -- If this option is set, an unanchored pattern is required to match before or -- at the first newline in the subject string, though the matched text may --continue over the newline. -- firstline :: PCREOption firstline :: PCREOption firstline = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags firstline_cint -- | 'multiline' -- -- By default, PCRE treats the subject string as consisting of a single line -- of characters (even if it actually contains newlines). The /start of line/ -- metacharacter (^) matches only at the start of the string, while the /end of line/ -- metacharacter ($) matches only at the end of the string, or before a -- terminating newline (unless 'dollar_endonly' is set). This is the same -- as Perl. -- -- When 'multiline' it is set, the /start of line/ and /end of line/ -- constructs match immediately following or immediately before internal newlines -- in the subject string, respectively, as well as at the very start and end. This -- is equivalent to Perl's \/m option, and it can be changed within a pattern by a -- (?m) option setting. If there are no newlines in a subject string, or no occur- -- rences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect. -- multiline :: PCREOption multiline :: PCREOption multiline = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags multiline_cint -- | newline_cr', 'newline_lf', 'newline_crlf', -- 'newline_anycrlf', 'newline_any' -- -- These options override the default newline definition that -- was chosen when PCRE was built. Setting the first or the -- second specifies that a newline is indicated by a single -- character (CR or LF, respectively). Setting 'newline_crlf' specifies -- that a newline is indicated by the two-character CRLF sequence. -- Setting 'newline_anycrlf' -- specifies that any of the three preceding sequences should -- be recognized. Setting 'newline_any' specifies that any -- Unicode newline sequence should be recognized. The Unicode -- newline sequences are the three just mentioned, plus the -- single characters VT (vertical tab, U+000B), FF (formfeed, -- U+000C), NEL (next line, U+0085), LS (line separator, -- U+2028), and PS (paragraph separator, U+2029). The last -- two are recognized only in UTF-8 mode. -- -- The newline setting in the options word uses three bits -- that are treated as a number, giving eight possibilities. -- Currently only six are used (default plus the five values -- above). This means that if you set more than one newline -- option, the combination may or may not be sensible. For -- example, 'newline_cr' with 'newline_lf' is equivalent to -- 'newline_crlf', but other combinations may yield unused numbers and -- cause an error. -- -- The only time that a line break is specially recognized -- when compiling a pattern is if 'extended' is set, and -- an unescaped \# outside a character class is encountered. -- This indicates a comment that lasts until after the next -- line break sequence. In other circumstances, line break -- sequences are treated as literal data, except that in -- 'extended' mode, both CR and LF are treated as whitespace characters -- and are therefore ignored. -- -- -- The newline option that is set at compile time becomes the -- default that is used for 'exec' but it can be overridden. -- -- newline_any :: PCREOption -- newline_any = PCREOption newline_any_cint -- | 'newline_anycrlf', see 'newline_any' -- newline_anycrlf :: PCREOption -- newline_anycrlf = PCREOption newline_anycrlf_cint -- | 'newline_cr', see 'newline_any' newline_cr :: PCREOption newline_cr :: PCREOption newline_cr = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags newline_cr_cint -- | 'newline_crlf', see 'newline_any' newline_crlf :: PCREOption newline_crlf :: PCREOption newline_crlf = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags newline_crlf_cint -- | 'newline_lf', see 'newline_any' newline_lf :: PCREOption newline_lf :: PCREOption newline_lf = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags newline_lf_cint -- | 'no_auto_capture' -- -- If this option is set, it disables the use of numbered -- capturing parentheses in the pattern. Any opening paren- -- thesis that is not followed by ? behaves as if it were -- followed by ?: but named parentheses can still be used for -- capturing (and they acquire numbers in the usual way). -- There is no equivalent of this option in Perl. -- no_auto_capture :: PCREOption no_auto_capture :: PCREOption no_auto_capture = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags no_auto_capture_cint -- | 'ungreedy' -- -- This option inverts the /greediness/ of the quantifiers so -- that they are not greedy by default, but become greedy if -- followed by /?/. It is not compatible with Perl. It can -- also be set by a (?U) option setting within the pattern. -- ungreedy :: PCREOption ungreedy :: PCREOption ungreedy = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags ungreedy_cint -- | 'utf8' -- -- This option causes PCRE to regard both the pattern and the -- subject as strings of UTF-8 characters instead of single-byte character -- strings. However, it is available only when -- PCRE is built to include UTF-8 support. If not, the use of -- this option provokes an error. Details of how this option -- changes the behaviour of PCRE are given in the section on -- UTF-8 support in the main pcre page. -- utf8 :: PCREOption utf8 :: PCREOption utf8 = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags utf8_cint -- | 'no_utf8_check' -- -- When PCRE_UTF8 is set, the validity of the pattern as a -- UTF-8 string is automatically checked. There is a discussion -- about the validity of UTF-8 strings in the main pcre -- page. If an invalid UTF-8 sequence of bytes is found, -- compile() returns an error. If you already know that -- your pattern is valid, and you want to skip this check for -- performance reasons, you can set the 'no_utf8_check' -- option. When it is set, the effect of passing an invalid -- UTF-8 string as a pattern is undefined. It may cause your -- program to crash. Note that this option can also be passed -- to 'exec', to suppress the UTF-8 validity checking of subject strings. -- no_utf8_check :: PCREOption no_utf8_check :: PCREOption no_utf8_check = PCREExtraFlags -> PCREOption PCREOption PCREExtraFlags no_utf8_check_cint -- Internal name for hsc2hs to bind to. type PCREOption_ = CInt -- PCRE compile options, as CInts anchored_cint :: PCREOption_ anchored_cint :: PCREExtraFlags anchored_cint = PCREExtraFlags 16 auto_callout_cint :: PCREOption_ auto_callout_cint :: PCREExtraFlags auto_callout_cint = PCREExtraFlags 16384 caseless_cint :: PCREOption_ caseless_cint :: PCREExtraFlags caseless_cint = PCREExtraFlags 1 dollar_endonly_cint :: PCREOption_ dollar_endonly_cint :: PCREExtraFlags dollar_endonly_cint = PCREExtraFlags 32 dotall_cint :: PCREOption_ dotall_cint :: PCREExtraFlags dotall_cint = PCREExtraFlags 4 dupnames_cint :: PCREOption_ dupnames_cint :: PCREExtraFlags dupnames_cint = PCREExtraFlags 524288 extended_cint :: PCREOption_ extended_cint :: PCREExtraFlags extended_cint = PCREExtraFlags 8 extra_cint :: PCREOption_ extra_cint :: PCREExtraFlags extra_cint = PCREExtraFlags 64 firstline_cint :: PCREOption_ firstline_cint :: PCREExtraFlags firstline_cint = PCREExtraFlags 262144 multiline_cint :: PCREOption_ multiline_cint :: PCREExtraFlags multiline_cint = PCREExtraFlags 2 newline_cr_cint :: PCREOption_ newline_cr_cint :: PCREExtraFlags newline_cr_cint = PCREExtraFlags 1048576 newline_crlf_cint :: PCREOption_ newline_crlf_cint :: PCREExtraFlags newline_crlf_cint = PCREExtraFlags 3145728 newline_lf_cint :: PCREOption_ newline_lf_cint :: PCREExtraFlags newline_lf_cint = PCREExtraFlags 2097152 no_auto_capture_cint :: PCREOption_ no_auto_capture_cint :: PCREExtraFlags no_auto_capture_cint = PCREExtraFlags 4096 ungreedy_cint :: PCREOption_ ungreedy_cint :: PCREExtraFlags ungreedy_cint = PCREExtraFlags 512 utf8_cint :: PCREOption_ utf8_cint :: PCREExtraFlags utf8_cint = PCREExtraFlags 2048 no_utf8_check_cint :: PCREOption_ no_utf8_check_cint :: PCREExtraFlags no_utf8_check_cint = PCREExtraFlags 8192 {-# LINE 421 "Text/Regex/PCRE/Light/Base.hsc" #-} -- , bsr_anycrlf_cint = PCRE_BSR_ANYCRLF -- , bsr_unicode_cint = PCRE_BSR_UNICODE -- , newline_any_cint = PCRE_NEWLINE_ANY -- , newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF ------------------------------------------------------------------------ -- | PCRE exec options, to be passed to exec newtype PCREExecOption = PCREExecOption { unPCREExecOption :: PCREExecOption_ } {-# LINE 432 "Text/Regex/PCRE/Light/Base.hsc" #-} deriving (Eq,Ord,Show,Read) {-# LINE 434 "Text/Regex/PCRE/Light/Base.hsc" #-} -- | Combine a list of exec options into a single option, using bitwise (.|.) combineExecOptions :: [PCREExecOption] -> PCREExecOption combineExecOptions = PCREExecOption . foldr ((.|.) . unPCREExecOption) 0 -- | 'anchored'. -- -- The 'anchored' option limits 'exec' to matching at -- the first matching position. If a pattern was compiled -- with 'anchored', or turned out to be anchored by virtue -- of its contents, it cannot be made unachored at matching -- time. exec_anchored :: PCREExecOption exec_anchored :: PCREExecOption exec_anchored = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_anchored_cint -- | 'newline_cr', 'newline_lf', -- 'newline_crlf', 'newline_anycrlf', 'newline_any' -- -- These options override the newline definition that was -- chosen or defaulted when the pattern was compiled. For -- details, see the description of 'compile' above. Dur- -- ing matching, the newline choice affects the behaviour of -- the dot, circumflex, and dollar metacharacters. It may -- also alter the way the match position is advanced after a -- match failure for an unanchored pattern. -- -- When 'newline_crlf', 'newline_anycrlf', or 'newline_any' -- is set, and a match attempt for an unanchored -- pattern fails when the current position is at a CRLF -- sequence, and the pattern contains no explicit matches for -- CR or LF characters, the match position is advanced by two -- characters instead of one, in other words, to after the -- CRLF. -- -- The above rule is a compromise that makes the most common -- cases work as expected. For example, if the pattern is .+A -- (and the 'dotall' option is not set), it does not match -- the string /\\r\\nA/ because, after failing at the start, it -- skips both the CR and the LF before retrying. However, the -- pattern /[\\r\\n]A/ does match that string, because it contains -- an explicit CR or LF reference, and so advances only -- by one character after the first failure. -- -- An explicit match for CR of LF is either a literal appear- -- ance of one of those characters, or one of the \\r or \\n -- escape sequences. Implicit matches such as [^X] do not -- count, nor does \\s (which includes CR and LF in the char- -- acters that it matches). -- -- Notwithstanding the above, anomalous effects may still -- occur when CRLF is a valid newline sequence and explicit -- \\r or \\n escapes appear in the pattern. -- -- exec_newline_any :: PCREExecOption -- exec_newline_any = PCREExecOption exec_newline_any_cint -- | 'exec_newline_anycrlf', see 'exec_newline_any' -- exec_newline_anycrlf :: PCREExecOption -- exec_newline_anycrlf = PCREExecOption exec_newline_anycrlf_cint -- | 'exec_newline_cr', see 'exec_newline_any' exec_newline_cr :: PCREExecOption exec_newline_cr :: PCREExecOption exec_newline_cr = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_newline_cr_cint -- | 'exec_newline_crlf', see 'exec_newline_any' exec_newline_crlf :: PCREExecOption exec_newline_crlf :: PCREExecOption exec_newline_crlf = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_newline_crlf_cint -- | 'exec_newline_lf', see 'exec_newline_any' exec_newline_lf :: PCREExecOption exec_newline_lf :: PCREExecOption exec_newline_lf = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_newline_lf_cint -- | 'PCRE_NOTBOL' -- -- This option specifies that first character of the subject -- string is not the beginning of a line, so the circumflex -- metacharacter should not match before it. Setting this -- without 'multiline' (at compile time) causes circumflex -- never to match. This option affects only the behaviour of -- the circumflex metacharacter. It does not affect \\A. -- exec_notbol :: PCREExecOption exec_notbol :: PCREExecOption exec_notbol = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_notbol_cint -- | 'noteol' -- -- This option specifies that the end of the subject string -- is not the end of a line, so the dollar metacharacter -- should not match it nor (except in multiline mode) a newline -- immediately before it. Setting this without 'multiline' -- (at compile time) causes dollar never to match. -- This option affects only the behaviour of the dollar -- metacharacter. It does not affect \\Z or \\z. -- exec_noteol :: PCREExecOption exec_noteol :: PCREExecOption exec_noteol = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_noteol_cint -- | PCRE_NOTEMPTY -- -- An empty string is not considered to be a valid match if -- this option is set. If there are alternatives in the pattern, -- they are tried. If all the alternatives match the -- empty string, the entire match fails. For example, if the -- pattern -- -- > a?b? -- -- is applied to a string not beginning with /a/ or /b/, it -- matches the empty string at the start of the subject. With -- 'notempty' set, this match is not valid, so 'PCRE -- searches further into the string for occurrences of /a/ or -- /b/. -- -- Perl has no direct equivalent of 'notempty', but it -- does make a special case of a pattern match of the empty -- string within its split() function, and when using the \/g -- modifier. It is possible to emulate Perl's behaviour after -- matching a null string by first trying the match again at -- the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and -- then if that fails by advancing the starting offset (see -- below) and trying an ordinary match again. There is some -- code that demonstrates how to do this in the pcredemo.c -- sample program. -- exec_notempty :: PCREExecOption exec_notempty :: PCREExecOption exec_notempty = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_notempty_cint -- | 'no_utf8_check' -- -- When 'utf8' is set at compile time, the validity of the -- subject as a UTF-8 string is automatically checked when -- exec() is subsequently called. The value of -- startoffset is also checked to ensure that it points to -- the start of a UTF-8 character. There is a discussion -- about the validity of UTF-8 strings in the section on -- UTF-8 support in the main pcre page. If an invalid UTF-8 -- sequence of bytes is found, exec() returns the error -- 'error_badutf8'. If startoffset contains an invalid -- value, 'error_badutf8_offset' is returned. -- -- If you already know that your subject is valid, and you -- want to skip these checks for performance reasons, you can -- set the 'no_utf8_check' option when calling -- 'exec'. You might want to do this for the second and -- subsequent calls to exec() if you are making repeated -- calls to find all the matches in a single subject string. -- However, you should be sure that the value of startoffset -- points to the start of a UTF-8 character. When -- 'no_utf8_check' is set, the effect of passing an -- invalid UTF-8 string as a subject, or a value of startoff- -- set that does not point to the start of a UTF-8 character, -- is undefined. Your program may crash. -- exec_no_utf8_check :: PCREExecOption exec_no_utf8_check :: PCREExecOption exec_no_utf8_check = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_no_utf8_check_cint -- | 'partial' -- -- This option turns on the partial matching feature. If the -- subject string fails to match the pattern, but at some -- point during the matching process the end of the subject -- was reached (that is, the subject partially matches the -- pattern and the failure to match occurred only because -- there were not enough subject characters), 'exec' -- returns 'error_partial' instead of 'error_nomatch'. -- When 'partial' is used, there are restrictions on what -- may appear in the pattern. These are discussed in the -- pcrepartial documentation. -- exec_partial :: PCREExecOption exec_partial :: PCREExecOption exec_partial = PCREExtraFlags -> PCREExecOption PCREExecOption PCREExtraFlags exec_partial_cint -- Internal name for hsc2hs to bind to. type PCREExecOption_ = CInt -- PCRE exec options exec_anchored_cint :: PCREExecOption_ exec_anchored_cint :: PCREExtraFlags exec_anchored_cint = PCREExtraFlags 16 exec_newline_cr_cint :: PCREExecOption_ exec_newline_cr_cint :: PCREExtraFlags exec_newline_cr_cint = PCREExtraFlags 1048576 exec_newline_crlf_cint :: PCREExecOption_ exec_newline_crlf_cint :: PCREExtraFlags exec_newline_crlf_cint = PCREExtraFlags 3145728 exec_newline_lf_cint :: PCREExecOption_ exec_newline_lf_cint :: PCREExtraFlags exec_newline_lf_cint = PCREExtraFlags 2097152 exec_notbol_cint :: PCREExecOption_ exec_notbol_cint :: PCREExtraFlags exec_notbol_cint = PCREExtraFlags 128 exec_noteol_cint :: PCREExecOption_ exec_noteol_cint :: PCREExtraFlags exec_noteol_cint = PCREExtraFlags 256 exec_notempty_cint :: PCREExecOption_ exec_notempty_cint :: PCREExtraFlags exec_notempty_cint = PCREExtraFlags 1024 exec_no_utf8_check_cint :: PCREExecOption_ exec_no_utf8_check_cint :: PCREExtraFlags exec_no_utf8_check_cint = PCREExtraFlags 8192 exec_partial_cint :: PCREExecOption_ exec_partial_cint :: PCREExtraFlags exec_partial_cint = PCREExtraFlags 32768 {-# LINE 621 "Text/Regex/PCRE/Light/Base.hsc" #-} -- , exec_newline_any_cint = PCRE_NEWLINE_ANY -- , exec_newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF -- , dfa_shortest = PCRE_DFA_SHORTEST -- , dfa_restart = PCRE_DFA_RESTART ------------------------------------------------------------------------ -- | A type for PCRE Errors: exec-time error codes. type PCREError = CInt error_nomatch :: PCREError error_nomatch :: PCREExtraFlags error_nomatch = (-PCREExtraFlags 1) error_null :: PCREError error_null :: PCREExtraFlags error_null = (-PCREExtraFlags 2) error_badoption :: PCREError error_badoption :: PCREExtraFlags error_badoption = (-PCREExtraFlags 3) error_badmagic :: PCREError error_badmagic :: PCREExtraFlags error_badmagic = (-PCREExtraFlags 4) error_unknown_node :: PCREError error_unknown_node :: PCREExtraFlags error_unknown_node = (-PCREExtraFlags 5) error_nomemory :: PCREError error_nomemory :: PCREExtraFlags error_nomemory = (-PCREExtraFlags 6) error_nosubstring :: PCREError error_nosubstring :: PCREExtraFlags error_nosubstring = (-PCREExtraFlags 7) error_matchlimit :: PCREError error_matchlimit :: PCREExtraFlags error_matchlimit = (-PCREExtraFlags 8) error_callout :: PCREError error_callout :: PCREExtraFlags error_callout = (-PCREExtraFlags 9) error_badutf8 :: PCREError error_badutf8 :: PCREExtraFlags error_badutf8 = (-PCREExtraFlags 10) error_badutf8_offset :: PCREError error_badutf8_offset :: PCREExtraFlags error_badutf8_offset = (-PCREExtraFlags 11) error_partial :: PCREError error_partial :: PCREExtraFlags error_partial = (-PCREExtraFlags 12) error_badpartial :: PCREError error_badpartial :: PCREExtraFlags error_badpartial = (-PCREExtraFlags 13) error_internal :: PCREError error_internal :: PCREExtraFlags error_internal = (-PCREExtraFlags 14) error_badcount :: PCREError error_badcount :: PCREExtraFlags error_badcount = (-PCREExtraFlags 15) error_dfa_uitem :: PCREError error_dfa_uitem :: PCREExtraFlags error_dfa_uitem = (-PCREExtraFlags 16) error_dfa_ucond :: PCREError error_dfa_ucond :: PCREExtraFlags error_dfa_ucond = (-PCREExtraFlags 17) error_dfa_umlimit :: PCREError error_dfa_umlimit :: PCREExtraFlags error_dfa_umlimit = PCREExtraFlags (-PCREExtraFlags 18) error_dfa_wssize :: PCREError error_dfa_wssize :: PCREExtraFlags error_dfa_wssize = PCREExtraFlags (-PCREExtraFlags 19) error_dfa_recurse :: PCREError error_dfa_recurse :: PCREExtraFlags info_capturecount :: PCREExtraFlags error_dfa_recurse = (-PCREExtraFlags 20) error_recursionlimit :: PCREError error_recursionlimit :: PCREExtraFlags error_recursionlimit = PCREExtraFlags (-PCREExtraFlags 21) {-# LINE 655 "Text/Regex/PCRE/Light/Base.hsc" #-} -- , error_unknown_opcode = PCRE_ERROR_UNKNOWN_OPCODE -- , error_nullwslimit = PCRE_ERROR_NULLWSLIMIT -- , error_badnewline = PCRE_ERROR_BADNEWLINE ------------------------------------------------------------------------ -- Request types for fullinfo() */ -- | PCRE Info requests -- provides information about the compiled pattern. type PCREInfo = CInt info_options :: PCREInfo info_options = 0 info_size :: PCREInfo info_size = 1 info_capturecount :: PCREInfo info_capturecount = 2 info_backrefmax :: PCREInfo info_backrefmax = 3 info_firstbyte :: PCREInfo info_firstbyte = 4 info_firstchar :: PCREInfo info_firstchar :: PCREExtraFlags info_firstchar = PCREExtraFlags 4 info_firsttable :: PCREInfo info_firsttable :: PCREExtraFlags info_firsttable = PCREExtraFlags 5 info_lastliteral :: PCREInfo info_lastliteral :: PCREExtraFlags info_lastliteral = PCREExtraFlags 6 info_nameentrysize :: PCREInfo info_nameentrysize :: PCREExtraFlags info_nameentrysize = PCREExtraFlags 7 info_namecount :: PCREInfo info_namecount :: PCREExtraFlags info_namecount = PCREExtraFlags 8 info_nametable :: PCREInfo info_nametable :: PCREExtraFlags info_nametable = PCREExtraFlags 9 info_studysize :: PCREInfo info_studysize :: PCREExtraFlags info_studysize = PCREExtraFlags 10 info_default_tables :: PCREInfo info_default_tables :: PCREExtraFlags info_default_tables = PCREExtraFlags 11 {-# LINE 681 "Text/Regex/PCRE/Light/Base.hsc" #-} -- , info_okpartial = PCRE_INFO_OKPARTIAL -- , info_jchanged = PCRE_INFO_JCHANGED -- , info_hascrorlf = PCRE_INFO_HASCRORLF ------------------------------------------------------------------------ -- | Request types for config() type PCREConfig = CInt config_utf8 :: PCREConfig config_utf8 = 0 config_newline :: PCREConfig config_newline :: PCREExtraFlags config_newline = PCREExtraFlags 1 config_link_size :: PCREConfig config_link_size :: PCREExtraFlags config_link_size = PCREExtraFlags 2 config_posix_malloc_threshold :: PCREConfig config_posix_malloc_threshold :: PCREExtraFlags config_posix_malloc_threshold = PCREExtraFlags 3 config_match_limit :: PCREConfig config_match_limit :: PCREExtraFlags config_match_limit = PCREExtraFlags 4 config_stackrecurse :: PCREConfig config_stackrecurse :: PCREExtraFlags config_stackrecurse = PCREExtraFlags 5 config_unicode_properties :: PCREConfig config_unicode_properties :: PCREExtraFlags config_unicode_properties = PCREExtraFlags 6 config_match_limit_recursion :: PCREConfig config_match_limit_recursion :: PCREExtraFlags config_match_limit_recursion = PCREExtraFlags 7 {-# LINE 701 "Text/Regex/PCRE/Light/Base.hsc" #-} -- Not portable -- , config_bsr = PCRE_CONFIG_BSR ------------------------------------------------------------------------ -- | PCREExtra. -- A extra structure contains the following fields: -- -- * flags Bits indicating which fields are set -- * study_data Opaque data from study() -- * match_limit Limit on internal resource use -- * match_limit_recursion Limit on internal recursion depth -- * callout_data Opaque data passed back to callouts -- * tables Points to character tables or is NULL -- type PCREExtra = () -- | PCREExtraFlags. bit flags for extra structure. type PCREExtraFlags = CInt -- Bit flags for the extra structure. Do not re-arrange or redefine -- these bits, just add new ones on the end, in order to remain compatible. */ extra_study_data :: PCREExtraFlags extra_study_data :: PCREExtraFlags extra_study_data = PCREExtraFlags 1 extra_match_limit :: PCREExtraFlags extra_match_limit :: PCREExtraFlags extra_match_limit = PCREExtraFlags 2 extra_callout_data :: PCREExtraFlags extra_callout_data :: PCREExtraFlags extra_callout_data = PCREExtraFlags 4 extra_tables :: PCREExtraFlags extra_tables :: PCREExtraFlags extra_tables = PCREExtraFlags 8 extra_match_limit_recursion :: PCREExtraFlags extra_match_limit_recursion :: PCREExtraFlags extra_match_limit_recursion = PCREExtraFlags 16 {-# LINE 732 "Text/Regex/PCRE/Light/Base.hsc" #-} -- PCRE_EXP_DECL pcre *compile(const char *, int, const char **, int *, const unsigned char *); -- PCRE_EXP_DECL int config(int, void *); -- PCRE_EXP_DECL int exec(const pcre *, const extra *, PCRE_SPTR, int, int, int, int *, int); ------------------------------------------------------------------------ -- C api {- pcre *pcre_compile(const char *pattern, int options, const char **errptr, int *erroffset, const unsigned char *tableptr); -} -- | Compile a pattern to an internal form. The pattern is a C string -- terminated by a binary zero. A pointer to a single block of memory that is -- obtained via pcre_malloc is returned. It is up to the caller to free -- the memory (via pcre_free) when it is no longer required -- -- The options argument contains various bit settings that affect the -- compilation. It should be zero if no options are required. -- -- If errptr is NULL, pcre_compile() returns NULL immediately. -- Otherwise, if compilation of a pattern fails, pcre_compile() returns NULL, -- and sets the variable pointed to by errptr to point to a textual error -- message. -- -- The offset from the start of the pattern to the character where the error -- was discovered is placed in the variable pointed to by erroffset, which must -- not be NULL. -- foreign import ccall unsafe "pcre.h pcre_compile" c_pcre_compile :: CString -> PCREOption -> Ptr CString -> Ptr CInt -> Ptr Word8 -> IO (Ptr PCRE) -- Additional fields to c_pcre_compile: -- -- errptr Where to put an error message -- erroffset Offset in pattern where error was found -- tableptr Pointer to character tables, or NULL to to use built in {- int pcre_exec(const pcre *code, const pcre_extra *extra, const char *subject, int length, int startoffset, int options, int *ovector, int ovecsize); -} -- | This function matches a compiled regular expression -- against a given subject string, using a matching algorithm -- that is similar to Perl's. It returns offsets to captured -- substrings. -- -- Its arguments are, in order: -- -- * 'code' Points to the compiled pattern (result of pcre_compile) -- -- * 'extra' Points to an associated pcre_extra structure (result of pcre_study), or is NULL -- -- * 'subject' Points to the subject string -- -- * 'length' Length of the subject string, in bytes -- -- * 'startoffset' Offset in bytes in the subject at which to start matching -- -- * 'options' Option bits -- -- * 'ovector' Points to a vector of ints for result substrings -- -- * 'ovecsize' Number of elements in the vector (a multiple of 3) -- -- Note, subject not required to be null terminated. -- foreign import ccall unsafe "pcre.h pcre_exec" c_pcre_exec :: Ptr PCRE -> Ptr PCREExtra -> Ptr Word8 -> CInt -> CInt -> PCREExecOption -> Ptr CInt -> CInt -> IO CInt -- | Return information about a compiled pattern foreign import ccall unsafe "pcre.h pcre_fullinfo" c_pcre_fullinfo :: Ptr PCRE -> Ptr PCREExtra -> PCREInfo -> Ptr a -> IO CInt