Copyright	(C) 2026 - Eitan Chatav
License	BSD-style (see the file LICENSE)
Maintainer	Eitan Chatav <eitan.chatav@gmail.com>
Stability	provisional
Portability	non-portable
Safe Haskell	None
Language	Haskell2010

Control.Lens.Grammar

Contents

Regular grammar
Context-free grammar
Context-sensitive grammar
Utility
Re-exports

Description

See Chomsky, On Certain Formal Properties of Grammars

Synopsis

type RegGrammar token a = forall (p :: Type -> Type -> Type). (Lexical token p, Alternator p) => p a a
type Lexical token (p :: Type -> Type -> Type) = (forall x y. (x ~ (), y ~ ()) => TerminalSymbol token (p x y), forall x y. (x ~ token, y ~ token) => TokenAlgebra token (p x y))
newtype RegString = RegString {
- runRegString :: RegEx Char
}
regstringG :: RegGrammar Char a -> RegString
regexGrammar :: Grammar Char RegString
type Grammar token a = forall (p :: Type -> Type -> Type). (Lexical token p, Alternator p, forall x. BackusNaurForm (p x x)) => p a a
newtype RegBnf = RegBnf {
- runRegBnf :: Bnf RegString
}
regbnfG :: Grammar Char a -> RegBnf
regbnfGrammar :: Grammar Char RegBnf
applicativeG :: (Alternative f, TokenAlgebra token (f token), TerminalSymbol token (f ()), forall x. BackusNaurForm (f x)) => Grammar token a -> f a
transducerG :: Categorized token => Grammar token a -> Transducer token
type CtxGrammar token a = forall (p :: Type -> Type -> Type). (Lexical token p, Alternator p, Filtrator p, MonadicTry p) => p a a
printG :: (Cons string string token token, IsList string, Item string ~ token, Categorized token, Alternative m, Monad m, Filterable m) => CtxGrammar token a -> a -> m (string -> string)
parseG :: (Cons string string token token, Snoc string string token token, IsList string, Item string ~ token, Categorized token, Alternative m, Monad m, Filterable m) => CtxGrammar token a -> string -> m (a, string)
unparseG :: (Cons string string token token, Snoc string string token token, IsList string, Item string ~ token, Categorized token, Alternative m, Monad m, Filterable m) => CtxGrammar token a -> a -> string -> m string
parsecG :: (Cons string string token token, Snoc string string token token, Item string ~ token, Categorized token) => CtxGrammar token a -> string -> ParsecState string a
unparsecG :: (Cons string string token token, Snoc string string token token, Item string ~ token, Categorized token) => CtxGrammar token a -> a -> string -> ParsecState string a
readG :: CtxGrammar Char a -> ReadP a
monadG :: (MonadTry m, TokenAlgebra token (m token), TerminalSymbol token (m ())) => CtxGrammar token a -> m a
putStringLn :: (IsList string, Item string ~ Char) => string -> IO ()
module Control.Lens.Grammar.BackusNaur
module Control.Lens.Grammar.Boole
module Control.Lens.Grammar.Kleene
module Control.Lens.Grammar.Machine
module Control.Lens.Grammar.Symbol
module Control.Lens.Grammar.Token
module Control.Lens.PartialIso
module Control.Monad.Fail.Try
module Data.Profunctor.Distributor
module Data.Profunctor.Filtrator
module Data.Profunctor.Grammar
module Data.Profunctor.Grammar.Parsector
module Data.Profunctor.Monoidal
module Data.Profunctor.Separator
module Data.Traversable.Homogeneous

Regular grammar

type RegGrammar token a = forall (p :: Type -> Type -> Type). (Lexical token p, Alternator p) => p a a Source #

A regular grammar may be constructed using Lexical and Alternator combinators. Let's see an example using semantic versioning syntax.

>>> import Numeric.Natural (Natural)
>>> :{
data SemVer = SemVer          -- e.g., 2.1.5-rc.1+build.123
  { major         :: Natural  -- e.g., 1
  , minor         :: Natural  -- e.g., 2
  , patch         :: Natural  -- e.g., 3
  , preRelease    :: [String] -- e.g., "alpha.1", "rc.2"
  , buildMetadata :: [String] -- e.g., "build.123", "20130313144700"
  }
  deriving (Eq, Ord, Show, Read)
:}

We'd like to define an optic _SemVer, corresponding to the constructor pattern SemVer. You could generate it with the TemplateHaskell combinator, makeNestedPrisms.

makeNestedPrisms ''SemVer

Unfortunately, we can't use TemplateHaskell to generate it in GHCi, which is used to test this documenation. Here is equivalent Haskell code instead. Since SemVer has only one constructor, _SemVer can be an Iso.

>>> :set -XRecordWildCards
>>> import Control.Lens (Iso', iso)
>>> :{
_SemVer :: Iso' SemVer (Natural, (Natural, (Natural, ([String], [String]))))
_SemVer = iso
  (\SemVer {..} -> (major, (minor, (patch, (preRelease, buildMetadata)))))
  (\(major, (minor, (patch, (preRelease, buildMetadata)))) -> SemVer {..})
:}

Now we can build a RegGrammar for SemVer using the "idiom" style of Applicative parsing with a couple modifications.

>>> :{
semverGrammar :: RegGrammar Char SemVer
semverGrammar = _SemVer
  >?  numberG
  >*< terminal "." >* numberG
  >*< terminal "." >* numberG
  >*< optionP _Empty (terminal "-" >* identifiersG)
  >*< optionP _Empty (terminal "+" >* identifiersG)
  where
    numberG = iso show read >~ someP (asIn @Char DecimalNumber)
    identifiersG = several1 (sepWith ".") (someP charG)
    charG = asIn LowercaseLetter
      <|> asIn UppercaseLetter
      <|> asIn DecimalNumber
      <|> token '-'
:}

Instead of using the constructor SemVer with the Functor applicator <$>, we use the optic _SemVer we defined and the Choice applicator >?; although, we could have used the Profunctor applicator >~ instead, because _SemVer is an Iso. A few Alternative combinators like <|> work both Functorially and Profunctorially.

Functorial	Profunctorial
`SemVer`	`_SemVer`
`<$>`	`>?`
`pure`	`pureP`
`*>`	`>*`
`<*`	`*<`
`<*>`	`>*<`
`empty`	`empty`
`<\|>`	`<\|>`
`choice`	`choice`
`many`	`manyP`
`some`	`someP`
`optional`	`optionalP`

You can generate a RegString from a RegGrammar with regstringG.

>>> putStringLn (regstringG semverGrammar)
\p{Nd}+(.\p{Nd}+(.\p{Nd}+((-((\p{Ll}|\p{Lu}|\p{Nd}|-)+(.(\p{Ll}|\p{Lu}|\p{Nd}|-)+)*))?(\+((\p{Ll}|\p{Lu}|\p{Nd}|-)+(.(\p{Ll}|\p{Lu}|\p{Nd}|-)+)*))?)))

You can also generate parsers and printers.

>>> [parsed | (parsed, "") <- parseG semverGrammar "2.1.5-rc.1+build.123"]
[SemVer {major = 2, minor = 1, patch = 5, preRelease = ["rc","1"], buildMetadata = ["build","123"]}]

Parsing unconses tokens left-to-right, from the beginning of a string. Unparsing, on the other hand, snocs tokens left-to-right, to the end of a string.

>>> unparseG semverGrammar (SemVer 1 0 0 ["alpha"] []) "SemVer: " :: Maybe String
Just "SemVer: 1.0.0-alpha"

Printing, on the gripping hand, conses tokens right-to-left, to the beginning of a string.

>>> ($ " is the SemVer.") <$> printG semverGrammar (SemVer 1 2 3 [] []) :: Maybe String
Just "1.2.3 is the SemVer."

Profunctorial combinators give us correct-by-construction invertible parsers. New RegGrammar generators can be defined with new instances of Lexical Alternators.

type Lexical token (p :: Type -> Type -> Type) = (forall x y. (x ~ (), y ~ ()) => TerminalSymbol token (p x y), forall x y. (x ~ token, y ~ token) => TokenAlgebra token (p x y)) Source #

Lexical combinators include terminal symbols, Tokenized combinators and tokenClasses.

newtype RegString Source #

RegStrings are an embedded domain specific language of regular expression strings.

Since they are strings, they have a string-like interface.

>>> let rex = fromString "ab|c" :: RegString
>>> putStringLn rex
ab|c
>>> rex
"ab|c"

RegStrings can be generated from RegGrammars with regstringG.

>>> regstringG (terminal "a" >* terminal "b" <|> terminal "c")
"ab|c"

RegStrings are actually stored as an algebraic datatype, RegEx.

>>> runRegString rex
RegExam (Alternate (Sequence (RegExam (OneOf (fromList "a"))) (RegExam (OneOf (fromList "b")))) (RegExam (OneOf (fromList "c"))))

RegStrings are similar to regular expression strings in many other programming languages. We can use them to see if a word and pattern are Matching.

>>> "ab" =~ rex
True
>>> "c" =~ rex
True
>>> "xyz" =~ rex
False

Like RegGrammars, RegStrings can use all the Lexical combinators. Unlike RegGrammars, instead of using Monoidal and Alternator combinators, RegStrings use Monoid and KleeneStarAlgebra combinators.

>>> terminal "a" <> terminal "b" >|< terminal "c" :: RegString
"ab|c"
>>> mempty :: RegString
""

Since RegStrings are a KleeneStarAlgebra, they support Kleene quantifiers.

>>> starK rex
"(ab|c)*"
>>> plusK rex
"(ab|c)+"
>>> optK rex
"(ab|c)?"

Like other regular expression languages, RegStrings support character classes.

>>> oneOf "abc" :: RegString
"[abc]"
>>> notOneOf "abc" :: RegString
"[^abc]"

The character classes are used for failure, matching no character or string, as well as the wildcard, matching any single character.

>>> zeroK :: RegString
"[]"
>>> anyToken :: RegString
"[^]"

Additional forms of character classes test for a character's GeneralCategory.

>>> asIn LowercaseLetter :: RegString
"\\p{Ll}"
>>> notAsIn Control :: RegString
"\\P{Cc}"

KleeneStarAlgebras support alternation >|<, and the Tokenized combinators are all negatable. However, we'd like to be able to take the intersection of character classes as well. RegStrings can combine characters' tokenClasses using BooleanAlgebra combinators.

>>> tokenClass (notOneOf "abc" >&&< notOneOf "xyz") :: RegString
"[^abcxyz]"
>>> tokenClass (oneOf "abcxyz" >&&< notOneOf "xyz") :: RegString
"[abc]"
>>> tokenClass (notOneOf "#$%" >&&< notAsIn Control) :: RegString
"[^#$%\\P{Cc}]"
>>> tokenClass (allB notAsIn [MathSymbol, Control]) :: RegString
"\\P{Sm|Cc}"
>>> tokenClass (notB (oneOf "xyz")) :: RegString
"[^xyz]"

Ill-formed RegStrings normalize to failure.

>>> fromString ")(" :: RegString
"[]"

Constructors

RegString
Fields runRegString :: RegEx Char

Instances

Instances details

IsString RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

fromString :: String -> RegString #

Monoid RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

mempty :: RegString #

mappend :: RegString -> RegString -> RegString #

mconcat :: [RegString] -> RegString #

Semigroup RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

(<>) :: RegString -> RegString -> RegString #

sconcat :: NonEmpty RegString -> RegString #

stimes :: Integral b => b -> RegString -> RegString #

IsList RegString Source #

Instance details

Defined in Control.Lens.Grammar

Associated Types

type Item RegString
Instance details Defined in Control.Lens.Grammar type Item RegString = Char

Methods

fromList :: [Item RegString] -> RegString #

fromListN :: Int -> [Item RegString] -> RegString #

toList :: RegString -> [Item RegString] #

Read RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

readsPrec :: Int -> ReadS RegString #

readList :: ReadS [RegString] #

readPrec :: ReadPrec RegString #

readListPrec :: ReadPrec [RegString] #

Show RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

showsPrec :: Int -> RegString -> ShowS #

show :: RegString -> String #

showList :: [RegString] -> ShowS #

KleeneStarAlgebra RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

starK :: RegString -> RegString Source #

plusK :: RegString -> RegString Source #

optK :: RegString -> RegString Source #

(>|<) :: RegString -> RegString -> RegString Source #

zeroK :: RegString Source #

NonTerminalSymbol RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

nonTerminal :: String -> RegString Source #

Eq RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

(==) :: RegString -> RegString -> Bool #

(/=) :: RegString -> RegString -> Bool #

Ord RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

compare :: RegString -> RegString -> Ordering #

(<) :: RegString -> RegString -> Bool #

(<=) :: RegString -> RegString -> Bool #

(>) :: RegString -> RegString -> Bool #

(>=) :: RegString -> RegString -> Bool #

max :: RegString -> RegString -> RegString #

min :: RegString -> RegString -> RegString #

TokenAlgebra Char RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

tokenClass :: TokenClass Char -> RegString Source #

Matching String RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

(=~) :: String -> RegString -> Bool Source #

TerminalSymbol Char RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

terminal :: [Char] -> RegString Source #

Tokenized Char RegString Source #

Instance details

Defined in Control.Lens.Grammar

Methods

anyToken :: RegString Source #

token :: Char -> RegString Source #

oneOf :: Foldable f => f Char -> RegString Source #

notOneOf :: Foldable f => f Char -> RegString Source #

asIn :: Categorize Char -> RegString Source #

notAsIn :: Categorize Char -> RegString Source #

type Item RegString Source #

Instance details

Defined in Control.Lens.Grammar

type Item RegString = Char

regstringG :: RegGrammar Char a -> RegString Source #

regstringG generates a RegString from a regular grammar. Since context-free Grammars and CtxGrammars aren't necessarily regular, the type system will prevent regstringG from being applied to them.

regexGrammar :: Grammar Char RegString Source #

regexGrammar is a context-free Grammar for RegStrings. It can't be a RegGrammar, since RegStrings include parenthesization. But balanced parentheses are a context-free language.

>>> putStringLn (regbnfG regexGrammar)
{start} = \q{regex}
{alternate} = \q{sequence}(\|\q{sequence})*
{atom} = \\q\q{nonterminal}|\q{class}|\(\q{regex}\)
{category} = Ll|Lu|Lt|Lm|Lo|Mn|Mc|Me|Nd|Nl|No|Pc|Pd|Ps|Pe|Pi|Pf|Po|Sm|Sc|Sk|So|Zs|Zl|Zp|Cc|Cf|Cs|Co|Cn
{char} = [^\(\)\*\+\?\[\\\]\^\{\|\}\P{Cc}]|\\\q{char-escaped}
{char-control} = NUL|SOH|STX|ETX|EOT|ENQ|ACK|BEL|BS|HT|LF|VT|FF|CR|SO|SI|DLE|DC1|DC2|DC3|DC4|NAK|SYN|ETB|CAN|EM|SUB|ESC|FS|GS|RS|US|DEL|PAD|HOP|BPH|NBH|IND|NEL|SSA|ESA|HTS|HTJ|VTS|PLD|PLU|RI|SS2|SS3|DCS|PU1|PU2|STS|CCH|MW|SPA|EPA|SOS|SGCI|SCI|CSI|ST|OSC|PM|APC
{char-escaped} = [\(\)\*\+\?\[\\\]\^\{\|\}]|\q{char-control}
{class} = \q{class-one-of}|\q{class-not-one-of}
{class-category} = \\p\{\q{category}\}|\\P\{(\q{category}(\|\q{category})*)\}
{class-not-one-of} = \q{class-category}|\[\^\q{char}*(\q{class-category}?\])
{class-one-of} = \q{char}|\[\q{char}*\]
{expression} = \q{atom}\?|\q{atom}\*|\q{atom}\+|\q{atom}
{nonterminal} = \{\q{char}*\}
{regex} = \q{alternate}
{sequence} = \q{expression}*

Context-free grammar

type Grammar token a = forall (p :: Type -> Type -> Type). (Lexical token p, Alternator p, forall x. BackusNaurForm (p x x)) => p a a Source #

Context-free Grammars add two capabilities to RegGrammars, coming from the BackusNaurForm interface

rule abstraction,
and general recursion.

regexGrammar and regbnfGrammar are examples of context-free Grammars. Regular expressions are a form of expression algebra. Let's see a similar but simpler example, the algebra of arithmetic expressions of natural numbers.

>>> import Numeric.Natural (Natural)
>>> :{
data Arith
  = Num Natural
  | Add Arith Arith
  | Mul Arith Arith
  deriving stock (Eq, Ord, Show, Read)
:}

Here are Prisms for the constructor patterns.

>>> import Control.Lens (Prism', prism')
>>> :{
_Num :: Prism' Arith Natural
_Num = prism' Num (\case Num n -> Just n; _ -> Nothing)
_Add, _Mul :: Prism' Arith (Arith, Arith)
_Add = prism' (uncurry Add) (\case Add x y -> Just (x,y); _ -> Nothing)
_Mul = prism' (uncurry Mul) (\case Mul x y -> Just (x,y); _ -> Nothing)
:}

Now we can build a Grammar for Arith by combining "idiom" style with named rules, and tying the recursive loop (caused by parenthesization) with ruleRec.

>>> :{
arithGrammar :: Grammar Char Arith
arithGrammar = ruleRec "arith" sumG
  where
    sumG arith = rule "sum" $
      chain1 Left _Add (sepWith "+") (prodG arith)
    prodG arith = rule "product" $
      chain1 Left _Mul (sepWith "*") (factorG arith)
    factorG arith = rule "factor" $
      numberG <|> terminal "(" >* arith *< terminal ")"
    numberG = rule "number" $
      _Num . iso show read >? someP (asIn @Char DecimalNumber)
:}

We can generate grammar strings, printers and parsers from arithGrammar.

>>> putStringLn (regbnfG arithGrammar)
{start} = \q{arith}
{arith} = \q{sum}
{factor} = \q{number}|\(\q{arith}\)
{number} = \p{Nd}+
{product} = \q{factor}(\*\q{factor})*
{sum} = \q{product}(\+\q{product})*
>>> [x | (x,"") <- parseG arithGrammar "1+2*3+4"]
[Add (Add (Num 1) (Mul (Num 2) (Num 3))) (Num 4)]
>>> unparseG arithGrammar (Add (Num 1) (Mul (Num 2) (Num 3))) "" :: Maybe String
Just "1+2*3"
>>> do pr <- printG arithGrammar (Num 69); pure (pr "") :: Maybe String
Just "69"

If all rules are non-recursive, then a Grammar can be rewritten as a RegGrammar. Since Haskell permits general recursion, and RegGrammars are embedded in Haskell, you can define context-free grammars with them. But it's recommended to use Grammars for rule abstraction and generator support for ruleRec.

newtype RegBnf Source #

RegBnfs are an embedded domain specific language of Backus-Naur forms extended by regular expression strings.

A RegBnf consists of a distinguished RegString "start" rule, and a set of named RegString rules.

>>> putStringLn (rule "baz" (terminal "foo" >|< terminal "bar") :: RegBnf)
{start} = \q{baz}
{baz} = foo|bar

Like RegStrings they have a string-like interface.

>>> let bnf = fromString "{start} = foo|bar" :: RegBnf
>>> putStringLn bnf
{start} = foo|bar
>>> bnf
"{start} = foo|bar"
>>> :type toList bnf
toList bnf :: [Char]

RegBnfs can be generated from context-free Grammars with regbnfG.

>>> :type regbnfG regbnfGrammar
regbnfG regbnfGrammar :: RegBnf

Like RegStrings, RegBnfs can be constructed using Lexical, Monoid and KleeneStarAlgebra combinators. But they also support BackusNaurForm rules and ruleRecs.

>>> putStringLn (rule "baz" (bnf >|< terminal "baz"))
{start} = \q{baz}
{baz} = foo|bar|baz
>>> putStringLn (ruleRec "∞-loop" (\x -> x) :: RegBnf)
{start} = \q{∞-loop}
{∞-loop} = \q{∞-loop}

Constructors

RegBnf
Fields runRegBnf :: Bnf RegString

Instances

Instances details

IsString RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

fromString :: String -> RegBnf #

Monoid RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

mempty :: RegBnf #

mappend :: RegBnf -> RegBnf -> RegBnf #

mconcat :: [RegBnf] -> RegBnf #

Semigroup RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

(<>) :: RegBnf -> RegBnf -> RegBnf #

sconcat :: NonEmpty RegBnf -> RegBnf #

stimes :: Integral b => b -> RegBnf -> RegBnf #

IsList RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Associated Types

type Item RegBnf
Instance details Defined in Control.Lens.Grammar type Item RegBnf = Char

Methods

fromList :: [Item RegBnf] -> RegBnf #

fromListN :: Int -> [Item RegBnf] -> RegBnf #

toList :: RegBnf -> [Item RegBnf] #

Read RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

readsPrec :: Int -> ReadS RegBnf #

readList :: ReadS [RegBnf] #

readPrec :: ReadPrec RegBnf #

readListPrec :: ReadPrec [RegBnf] #

Show RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

showsPrec :: Int -> RegBnf -> ShowS #

show :: RegBnf -> String #

showList :: [RegBnf] -> ShowS #

BackusNaurForm RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

rule :: String -> RegBnf -> RegBnf Source #

ruleRec :: String -> (RegBnf -> RegBnf) -> RegBnf Source #

KleeneStarAlgebra RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

starK :: RegBnf -> RegBnf Source #

plusK :: RegBnf -> RegBnf Source #

optK :: RegBnf -> RegBnf Source #

(>|<) :: RegBnf -> RegBnf -> RegBnf Source #

zeroK :: RegBnf Source #

NonTerminalSymbol RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

nonTerminal :: String -> RegBnf Source #

Eq RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

(==) :: RegBnf -> RegBnf -> Bool #

(/=) :: RegBnf -> RegBnf -> Bool #

Ord RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

compare :: RegBnf -> RegBnf -> Ordering #

(<) :: RegBnf -> RegBnf -> Bool #

(<=) :: RegBnf -> RegBnf -> Bool #

(>) :: RegBnf -> RegBnf -> Bool #

(>=) :: RegBnf -> RegBnf -> Bool #

max :: RegBnf -> RegBnf -> RegBnf #

min :: RegBnf -> RegBnf -> RegBnf #

TokenAlgebra Char RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

tokenClass :: TokenClass Char -> RegBnf Source #

Matching String RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

(=~) :: String -> RegBnf -> Bool Source #

TerminalSymbol Char RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

terminal :: [Char] -> RegBnf Source #

Tokenized Char RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

Methods

anyToken :: RegBnf Source #

token :: Char -> RegBnf Source #

oneOf :: Foldable f => f Char -> RegBnf Source #

notOneOf :: Foldable f => f Char -> RegBnf Source #

asIn :: Categorize Char -> RegBnf Source #

notAsIn :: Categorize Char -> RegBnf Source #

type Item RegBnf Source #

Instance details

Defined in Control.Lens.Grammar

type Item RegBnf = Char

regbnfG :: Grammar Char a -> RegBnf Source #

regbnfG generates a RegBnf from a context-free Grammar. Since CtxGrammars aren't context-free, the type system will prevent regbnfG from being applied to a CtxGrammar. It can apply to a RegGrammar.

regbnfGrammar :: Grammar Char RegBnf Source #

regbnfGrammar is a context-free Grammar for RegBnfs. That means that it can generate a self-hosted definition.

>>> putStringLn (regbnfG regbnfGrammar)
{start} = \q{regbnf}
{alternate} = \q{sequence}(\|\q{sequence})*
{atom} = \\q\q{nonterminal}|\q{class}|\(\q{regex}\)
{category} = Ll|Lu|Lt|Lm|Lo|Mn|Mc|Me|Nd|Nl|No|Pc|Pd|Ps|Pe|Pi|Pf|Po|Sm|Sc|Sk|So|Zs|Zl|Zp|Cc|Cf|Cs|Co|Cn
{char} = [^\(\)\*\+\?\[\\\]\^\{\|\}\P{Cc}]|\\\q{char-escaped}
{char-control} = NUL|SOH|STX|ETX|EOT|ENQ|ACK|BEL|BS|HT|LF|VT|FF|CR|SO|SI|DLE|DC1|DC2|DC3|DC4|NAK|SYN|ETB|CAN|EM|SUB|ESC|FS|GS|RS|US|DEL|PAD|HOP|BPH|NBH|IND|NEL|SSA|ESA|HTS|HTJ|VTS|PLD|PLU|RI|SS2|SS3|DCS|PU1|PU2|STS|CCH|MW|SPA|EPA|SOS|SGCI|SCI|CSI|ST|OSC|PM|APC
{char-escaped} = [\(\)\*\+\?\[\\\]\^\{\|\}]|\q{char-control}
{class} = \q{class-one-of}|\q{class-not-one-of}
{class-category} = \\p\{\q{category}\}|\\P\{(\q{category}(\|\q{category})*)\}
{class-not-one-of} = \q{class-category}|\[\^\q{char}*(\q{class-category}?\])
{class-one-of} = \q{char}|\[\q{char}*\]
{expression} = \q{atom}\?|\q{atom}\*|\q{atom}\+|\q{atom}
{nonterminal} = \{\q{char}*\}
{regbnf} = \{start\} = \q{regex}(\LF\q{nonterminal}( = )\q{regex})*
{regex} = \q{alternate}
{sequence} = \q{expression}*

applicativeG Source #

Arguments

:: (Alternative f, TokenAlgebra token (f token), TerminalSymbol token (f ()), forall x. BackusNaurForm (f x))
=> Grammar token a	context-free grammar
-> f a

Generate any Applicative parser backend from a Grammar with applicativeG. It works the same way as monadG, for parsers without Monad instances. That permits backends to use algorithms that can only parse context-free Grammars.

transducerG :: Categorized token => Grammar token a -> Transducer token Source #

Compile a Grammar into a Transducer.

>>> let regexMachine = transducerG @Char regexGrammar

A transducer is a form of finite state machine, usable as an intermediary for further generators like =~, expectNext, languageSample, parseForest & unreachableRules.

>>> import Test.QuickCheck
>>> let regexLang = languageSample @Char regexMachine
>>> words100 <- generate (take 100 <$> regexLang)
>>> quickCheck (property (all (=~ regexMachine) words100))
+++ OK, passed 1 test.
>>> import Control.Monad.State
>>> import System.Random
>>> let gen = mkStdGen 69
>>> evalState (take 15 <$> regexLang) gen
["","|","\776269","()","[]","\\[","||","|\249908","\770923*","\1008821+","\318904?","\845807|","\477898\1026934","()*","()+"]

>>> import Data.Tree (drawForest)

>>> let (forest, _) = parseForest regexMachine "xy|z" in putStr (drawForest (map (fmap show) forest))
("regex",0,4,"xy|z")
|
`- ("alternate",0,4,"xy|z")
   |
   +- ("sequence",0,2,"xy")
   |  |
   |  +- ("expression",0,1,"x")
   |  |  |
   |  |  `- ("atom",0,1,"x")
   |  |     |
   |  |     `- ("class",0,1,"x")
   |  |        |
   |  |        `- ("class-one-of",0,1,"x")
   |  |           |
   |  |           `- ("char",0,1,"x")
   |  |
   |  `- ("expression",1,2,"y")
   |     |
   |     `- ("atom",1,2,"y")
   |        |
   |        `- ("class",1,2,"y")
   |           |
   |           `- ("class-one-of",1,2,"y")
   |              |
   |              `- ("char",1,2,"y")
   |
   `- ("sequence",3,4,"z")
      |
      `- ("expression",3,4,"z")
         |
         `- ("atom",3,4,"z")
            |
            `- ("class",3,4,"z")
               |
               `- ("class-one-of",3,4,"z")
                  |
                  `- ("char",3,4,"z")

Context-sensitive grammar

type CtxGrammar token a = forall (p :: Type -> Type -> Type). (Lexical token p, Alternator p, Filtrator p, MonadicTry p) => p a a Source #

For context-sensitivity, the Monadic interface is used by importing Data.Profunctor.Monadic qualified and using a "bonding" notation which mixes "idiom" style with qualified do-notation. Let's use length-encoded vectors of numbers as an example.

>>> import Numeric.Natural (Natural)
>>> import Control.Lens.Iso (Iso', iso)
>>> :set -XRecordWildCards
>>> :{
data LenVec = LenVec {length :: Natural, vector :: [Natural]}
  deriving (Eq, Ord, Show, Read)
_LenVec :: Iso' LenVec (Natural, [Natural])
_LenVec = iso (\LenVec {..} -> (length, vector)) (\(length, vector) -> LenVec {..})
:}

>>> :set -XQualifiedDo
>>> import qualified Data.Profunctor.Monadic as P
>>> :{
lenvecGrammar :: CtxGrammar Char LenVec
lenvecGrammar = _LenVec >? P.do
  let
    numberG = iso show read >~ someP (asIn @Char DecimalNumber)
    vectorG n = intercalateP n (sepWith ",") numberG
  len <- numberG             -- bonds to _LenVec
  terminal ";"               -- doesn't bond
  vectorG (fromIntegral len) -- bonds to _LenVec
:}

The qualified do-notation changes the signature of P.>>=, so that we must apply the constructor pattern _LenVec to the do-block with the >? applicator. Any scoped bound action, var <- action, gets "bonded" to the constructor pattern. Any unbound actions, except for the last action in the do-block, does not get bonded to the pattern. The last action does get bonded to the pattern. Any unscoped bound action, _ <- action, also gets bonded to the pattern, but being unscoped means it isn't added to the context. If all bound actions are unscoped, and filtration & failure handling aren't used, then a CtxGrammar can be rewritten as a Grammar since it is context-free. We can't generate a RegBnf from a CtxGrammar since the rules aren't static, but dynamic and contextual. We can generate parsers and printers as expected.

>>> [vec | (vec, "") <- parseG lenvecGrammar "3;1,2,3"] :: [LenVec]
[LenVec {length = 3, vector = [1,2,3]}]
>>> [vec | (vec, "") <- parseG lenvecGrammar "0;1,2,3"] :: [LenVec]
[]
>>> [pr "" | pr <- printG lenvecGrammar (LenVec 2 [6,7])] :: [String]
["2;6,7"]
>>> [pr "" | pr <- printG lenvecGrammar (LenVec 200 [100])] :: [String]
[]

In addition to context-sensitivity via Monadic combinators, CtxGrammars add unrestricted filtration to Grammars. The satisfy combinator is an unrestricted token filter. And the satisfied pattern is used together with the Choice & Cochoice applicator >?< for unrestricted filtration.

>>> :{
palindromeG :: CtxGrammar Char String
palindromeG = rule "palindrome" $
  satisfied (\wrd -> reverse wrd == wrd) >?< manyP (anyToken @Char)
:}

>>> [pal | word <- ["racecar", "word"], (pal, "") <- parseG palindromeG word]
["racecar"]

Since CtxGrammars are embedded in Haskell, permitting computable predicates, and Filtrator has a default definition for Monadic Alternators, the context-sensitivity of CtxGrammar implies unrestricted filtration of grammars by computable predicates, which can recognize the larger class of recursively enumerable languages.

Finally, CtxGrammars support failure reporting and backtracking. This has no effect on printG, parseG or unparseG; but it effects parsecG and unparsecG. For context, an LL grammar can be (un)parsed by an LL parser. An LL parser (un)parses from left to right, and constucts leftmost derivations. An LL(k) parser can look k tokens ahead. Parsor is an LL(∞) parser. Parsector is an LL(1) parser. The backtracking try combinator restores full lookahead to Parsector. Since both Parsor & Parsector are LL parsers they diverge if the CtxGrammar they're run on is left-recursive.

>>> parsecG (rule "foo" (fail "bar") <|> fail "baz") "abc"
ParsecState {parsecLooked = False, parsecOffset = 0, parsecStream = "abc", parsecFailure = ParsecFailure {parsecExpect = TokenClass (OneOf (fromList "")), parsecLabels = [Node {rootLabel = "foo", subForest = [Node {rootLabel = "bar", subForest = []}]},Node {rootLabel = "baz", subForest = []}]}, parsecResult = Nothing}

>>> parsecG (manyP (token 'a') >*< asIn @Char DecimalNumber) "aaab"
ParsecState {parsecLooked = True, parsecOffset = 3, parsecStream = "b", parsecFailure = ParsecFailure {parsecExpect = TokenClass (Alternate (TokenClass (OneOf (fromList "a"))) (TokenClass (NotOneOf (fromList "") (AndAsIn DecimalNumber)))), parsecLabels = []}, parsecResult = Nothing}

>>> unparsecG (tokens "abc") "abx" ""
ParsecState {parsecLooked = True, parsecOffset = 2, parsecStream = "ab", parsecFailure = ParsecFailure {parsecExpect = TokenClass (OneOf (fromList "c")), parsecLabels = []}, parsecResult = Nothing}

printG Source #

Arguments

:: (Cons string string token token, IsList string, Item string ~ token, Categorized token, Alternative m, Monad m, Filterable m)
=> CtxGrammar token a
-> a	syntax
-> m (string -> string)

printG generates a printer from a CtxGrammar. Since both RegGrammars and context-free Grammars are CtxGrammars, the type system will allow printG to be applied to them. Running the printer on a syntax value returns a function that conses tokens at the beginning of an input string, from right to left.

parseG Source #

Arguments

:: (Cons string string token token, Snoc string string token token, IsList string, Item string ~ token, Categorized token, Alternative m, Monad m, Filterable m)
=> CtxGrammar token a
-> string	input
-> m (a, string)

parseG generates a parser from a LL(∞) CtxGrammar. Since both RegGrammars and context-free Grammars are CtxGrammars, the type system will allow parseG to be applied to them. Running the parser on an input string value unconses tokens from the beginning of an input string from left to right, returning a syntax value and the remaining output string.

unparseG Source #

Arguments

:: (Cons string string token token, Snoc string string token token, IsList string, Item string ~ token, Categorized token, Alternative m, Monad m, Filterable m)
=> CtxGrammar token a
-> a	syntax
-> string	input
-> m string

unparseG generates a printer from a LL(∞) CtxGrammar. Since both RegGrammars and context-free Grammars are CtxGrammars, the type system will allow unparseG to be applied to them. Running the printer on a syntax value and an input string snocs tokens at the end of the string, from left to right, returning the output string.

parsecG Source #

Arguments

:: (Cons string string token token, Snoc string string token token, Item string ~ token, Categorized token)
=> CtxGrammar token a
-> string	input
-> ParsecState string a

parsecG generates a parser from a LL(1) CtxGrammar, with try for restoring full LL(∞) lookahead. Since both RegGrammars and context-free Grammars are CtxGrammars, the type system will allow parsecG to be applied to them. Running the parser on an input string value unconses tokens from the beginning of an input string from left to right, returning parsecResult as Nothing on failure or Just an output syntax value, with parse failure stored in parsecFailure, and a remaining output parsecStream.

unparsecG Source #

Arguments

:: (Cons string string token token, Snoc string string token token, Item string ~ token, Categorized token)
=> CtxGrammar token a
-> a	syntax
-> string	input
-> ParsecState string a

unparsecG generates a printer from a LL(1) CtxGrammar, with try for restoring full LL(∞) lookahead. Since both RegGrammars and context-free Grammars are CtxGrammars, the type system will allow unparsecG to be applied to them. Running the printer on a syntax value and an input string snocs tokens at the end of the string, from left to right, returning parsecResult as Nothing on failure or Just the input syntax value, with print success stored in parsecStream.

readG :: CtxGrammar Char a -> ReadP a Source #

Generate a ReadP backend from a CtxGrammar Char.

monadG Source #

Arguments

:: (MonadTry m, TokenAlgebra token (m token), TerminalSymbol token (m ()))
=> CtxGrammar token a	context-sensitive grammar
-> m a

Generate any parser Monad backend from a CtxGrammar with monadG. Let's see how to do this without orphan instances, using the Megaparsec library.

import qualified Text.Megaparsec as M
import qualified Text.Megaparsec.Char as M
import Control.Lens.Grammar

newtype WrapMega a = WrapMega {unwrapMega :: M.Parsec String String a}
  deriving newtype
    ( Functor, Applicative, Alternative
    , Monad, MonadPlus, MonadFail
    )
instance TerminalSymbol Char (WrapMega ()) where
  terminal str = WrapMega (M.chunk str *> pure ())
instance TokenAlgebra Char (WrapMega Char) where
  tokenClass exam = WrapMega $ M.label (show exam) (M.satisfy (tokenClass exam))
instance Tokenized Char (WrapMega Char) where
  anyToken = WrapMega M.anySingle
  token = WrapMega . M.single
  oneOf = WrapMega . M.oneOf
  notOneOf = WrapMega . M.noneOf
  asIn cat = WrapMega $ M.label ("in category " ++ show cat) (M.satisfy (asIn cat))
  notAsIn cat = WrapMega $ M.label ("not in category " ++ show cat) (M.satisfy (notAsIn cat))
instance BackusNaurForm (WrapMega a) where
  rule lbl (WrapMega p) = WrapMega (M.label lbl p)
  ruleRec lbl = rule lbl . fix
instance Filterable WrapMega where
  catMaybes m = m >>= maybe (fail "unrestricted filtration") pure
instance MonadTry WrapMega where
  try (WrapMega p) = WrapMega (M.try p)

megaparsecG
  :: CtxGrammar Char a
  -> M.Parsec String String a
megaparsecG gram = unwrapMega (monadG gram)