{-# LANGUAGE OverloadedStrings
           , TupleSections
           , StandaloneDeriving #-}

{-| Utilities for encoding arbitrary data as Bourne shell fragments that
    stream the data to standard output, using HERE documents and simple shell
    decoders.
 -}
module System.Posix.ARX.HEREDat where

import Control.Applicative
import Control.Arrow (first)
import Control.Monad
import Data.ByteString (ByteString)
import qualified Data.ByteString as Bytes
import qualified Data.ByteString.Char8
import qualified Data.ByteString.Internal as Bytes (c2w)
import qualified Data.List as List
import Data.Monoid
import Data.Ord
import Data.String
import Data.Word
import Numeric (showOct, showHex)

import qualified Blaze.ByteString.Builder as Blaze
import qualified Blaze.ByteString.Builder.Char8 as Blaze


{-| A chunk describes a block of binary data ready for inclusion in a shell
    script. For many data blocks, no encoding or decoding is necessary; these
    are stored in a 'SafeChunk'. Those blocks needing byte-translation are
    stored in an 'EncodedChunk'.
 -}
data Chunk                   =  SafeChunk !ByteString
                             |  EncodedChunk !ByteString -- Encoded data.
                                             !Int        -- Original length.
                                             !EscapeChar -- Null replacer.
                                             !EscapeChar -- Escaper.
deriving instance Show Chunk
instance IsString Chunk where
  fromString :: String -> Chunk
fromString                 =  ByteString -> Chunk
chunk (ByteString -> Chunk) -> (String -> ByteString) -> String -> Chunk
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ByteString
Data.ByteString.Char8.pack

{-| Converts a 'ByteString' into a string safe for inclusion in a shell HERE
    document and annotates with information to construct a shell decoder for
    that document, if necessary.

    A 'ByteString' with nulls is rewritten in a complicated way. Two escape
    characters are chosen from a class of ASCII printable characters that look
    like reasonable escape characters; the two that show up least frequently
    in the document (including 0 times) become the null replacer and the
    escaper. All instances of these two characters are rewritten to escape
    sequences formed with the escaper, while nulls are rewritten to the null
    replacer. Given the two characters thus chosen, a command line with @tr@
    and @sed@ in sequence can be constructed to decode the document.

    This encoding doubles the amount of space consumed by the escape
    characters. In the worst case, where the data is made of all 20 potential
    escapes, evenly distributed, and one null (so we can't punt on escaping),
    the data will grow in size by 10 percent. For data that is more evenly
    distributed over the bytes -- as we might expect of compressed tarballs --
    we expect a size growth of two 256ths, or less than 0.8 percent.
 -}
chunk                       ::  ByteString -> Chunk
chunk :: ByteString -> Chunk
chunk ByteString
block                  =  ByteString -> Int -> EscapeChar -> EscapeChar -> Chunk
EncodedChunk (Word8 -> Word8 -> ByteString -> ByteString
encode Word8
nW Word8
eW ByteString
block)
                                             (ByteString -> Int
Bytes.length ByteString
block) EscapeChar
nEsc EscapeChar
eEsc
--  | safeForHereDoc block     =  SafeChunk block
--  | otherwise                =  EncodedChunk (encode nW eW block)
--                                             (Bytes.length block) nEsc eEsc
 where
  nEsc :: EscapeChar
nEsc@(EscapeChar Word8
nW ByteString
_ ByteString
_ ByteString
_) :
    eEsc :: EscapeChar
eEsc@(EscapeChar Word8
eW ByteString
_ ByteString
_ ByteString
_) : [EscapeChar]
_ = (Int, EscapeChar) -> EscapeChar
forall a b. (a, b) -> b
snd ((Int, EscapeChar) -> EscapeChar)
-> [(Int, EscapeChar)] -> [EscapeChar]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ((Int, EscapeChar) -> (Int, EscapeChar) -> Ordering)
-> [(Int, EscapeChar)] -> [(Int, EscapeChar)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
List.sortBy (((Int, EscapeChar) -> (Int, Word8))
-> (Int, EscapeChar) -> (Int, EscapeChar) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Int, EscapeChar) -> (Int, Word8)
forall {a}. (a, EscapeChar) -> (a, Word8)
cmp) [(Int, EscapeChar)]
counts
  cmp :: (a, EscapeChar) -> (a, Word8)
cmp (a
count, EscapeChar Word8
w ByteString
_ ByteString
_ ByteString
_) = (a
count, Word8
w)
  counts :: [(Int, EscapeChar)]
counts                     =  EscapeChar -> (Int, EscapeChar)
countAndBundle (EscapeChar -> (Int, EscapeChar))
-> [EscapeChar] -> [(Int, EscapeChar)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [EscapeChar]
escapes
   where
    countAndBundle :: EscapeChar -> (Int, EscapeChar)
countAndBundle e :: EscapeChar
e@(EscapeChar Word8
w ByteString
_ ByteString
_ ByteString
_) = (Word8 -> ByteString -> Int
Bytes.count Word8
w ByteString
block, EscapeChar
e)

{-| Given a byte to replace nulls and an escape byte, rewrites the data such
    that nulls are mapped to the replace byte, replace bytes are mapped to a
    pair of escape bytes and the escape byte is is mapped to an escape byte
    followed by an underscore. For example, if the null replace byte is @!@
    and the escape byte is @\#@ then all nulls become @!@, any @!@ become
    @\#\#@ and all @\#@ become @\#_@.

    This escaping scheme is dictated by the needs of our Sed decoder, which is
    just two global substitions, one after another. If the escaping were such
    that, with our characters above, @\#@ escaped to @\#\#@ and @!@ to @\#_@,
    then @\#_@ in the input becomes @\#\#_@. We want to run the subsitution
    for @\#@ first, to catch this; it produces @\#_@; then Sed feeds the input
    to the second substitution which unfortunately renders @!@. In the
    alternate scheme, the input is encoded @\#__@, the @!@ decoder runs first
    and ignores it, then the @\#@ decoder runs and catches it. When using a
    pipeline of stream processors to interpret escape sequences, it seems best
    to ensure that only the very last processor inserts escape characters, to
    prevent their further interpretation.
 -}
encode                      ::  Word8 -> Word8 -> ByteString -> ByteString
encode :: Word8 -> Word8 -> ByteString -> ByteString
encode Word8
nullReplaceByte Word8
escapeByte ByteString
bytes =
  (ByteString, Maybe (Maybe Word8, ByteString)) -> ByteString
forall a b. (a, b) -> a
fst ((ByteString, Maybe (Maybe Word8, ByteString)) -> ByteString)
-> (ByteString, Maybe (Maybe Word8, ByteString)) -> ByteString
forall a b. (a -> b) -> a -> b
$ Int
-> ((Maybe Word8, ByteString)
    -> Maybe (Word8, (Maybe Word8, ByteString)))
-> (Maybe Word8, ByteString)
-> (ByteString, Maybe (Maybe Word8, ByteString))
forall a.
Int -> (a -> Maybe (Word8, a)) -> a -> (ByteString, Maybe a)
Bytes.unfoldrN Int
len (Maybe Word8, ByteString)
-> Maybe (Word8, (Maybe Word8, ByteString))
f (Maybe Word8
forall a. Maybe a
Nothing, ByteString
bytes)
 where
  --  The encoding should introduce at most 10% overhead; we allocate a little
  --  more just to be safe. This allows us to make use of the somewhat faster
  --  unfoldrN function (which probably pre-allocates).
  len :: Int
len = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (ByteString -> Int
Bytes.length ByteString
bytes) Double -> Double -> Double
forall a. Num a => a -> a -> a
* Double
1.25)
  -- The worker sometimes floats up a byte, sometimes escapes a byte and
  -- introduces a byte to be 'carried' (like carryies in arithmetic) and
  -- sometimes floats up the carried byte.
  f :: (Maybe Word8, ByteString)
-> Maybe (Word8, (Maybe Word8, ByteString))
f (Just Word8
carried, ByteString
bytes)    =  (Word8, (Maybe Word8, ByteString))
-> Maybe (Word8, (Maybe Word8, ByteString))
forall a. a -> Maybe a
Just (Word8
carried, (Maybe Word8
forall a. Maybe a
Nothing, ByteString
bytes))
  f (Maybe Word8
Nothing     , ByteString
bytes)    =  do
    ((Word8
b, Maybe Word8
carry), ByteString
t)         <-  (Word8 -> (Word8, Maybe Word8))
-> (Word8, ByteString) -> ((Word8, Maybe Word8), ByteString)
forall b c d. (b -> c) -> (b, d) -> (c, d)
forall (a :: * -> * -> *) b c d.
Arrow a =>
a b c -> a (b, d) (c, d)
first Word8 -> (Word8, Maybe Word8)
rewrite ((Word8, ByteString) -> ((Word8, Maybe Word8), ByteString))
-> Maybe (Word8, ByteString)
-> Maybe ((Word8, Maybe Word8), ByteString)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ByteString -> Maybe (Word8, ByteString)
Bytes.uncons ByteString
bytes
    (Word8, (Maybe Word8, ByteString))
-> Maybe (Word8, (Maybe Word8, ByteString))
forall a. a -> Maybe a
Just (Word8
b, (Maybe Word8
carry, ByteString
t))
  rewrite :: Word8 -> (Word8, Maybe Word8)
rewrite Word8
b
    | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x00              =  (Word8
nullReplaceByte, Maybe Word8
forall a. Maybe a
Nothing)
    | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
escapeByte        =  (Word8
escapeByte     , Word8 -> Maybe Word8
forall a. a -> Maybe a
Just Word8
underscore)
    | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
nullReplaceByte   =  (Word8
escapeByte     , Word8 -> Maybe Word8
forall a. a -> Maybe a
Just Word8
escapeByte)
    | Bool
otherwise              =  (Word8
b              , Maybe Word8
forall a. Maybe a
Nothing)
  underscore :: Word8
underscore                 =  Char -> Word8
Bytes.c2w Char
'_'

{-| Given the byte used to replace nulls and the escape byte, undoes the result
    of the encode operation -- rewriting null replacers to literal nulls and
    escape patterns to the original bytes. This function is not intended to be
    used in practice -- it will be shell commands that unpack the data -- but
    serves to document the ideas behind decoding as well as offering a way to
    check the encoder.
 -}
decode                      ::  Word8 -> Word8 -> ByteString -> ByteString
decode :: Word8 -> Word8 -> ByteString -> ByteString
decode Word8
nullReplaceByte Word8
escapeByte = (ByteString -> ByteString
unEscape (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Word8) -> ByteString -> ByteString
Bytes.map Word8 -> Word8
unReplace)
 where
  unReplace :: Word8 -> Word8
unReplace Word8
b
    | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
nullReplaceByte   =  Word8
0x00
    | Bool
otherwise              =  Word8
b
  unEscape :: ByteString -> ByteString
unEscape                   =  [ByteString] -> ByteString
Bytes.concat ([ByteString] -> ByteString)
-> (ByteString -> [ByteString]) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [ByteString] -> [ByteString]
forall a. [a] -> [a]
List.reverse ([ByteString] -> [ByteString])
-> (ByteString -> [ByteString]) -> ByteString -> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([ByteString], Bool) -> [ByteString]
forall a b. (a, b) -> a
fst
                             (([ByteString], Bool) -> [ByteString])
-> (ByteString -> ([ByteString], Bool))
-> ByteString
-> [ByteString]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.  (([ByteString], Bool) -> ByteString -> ([ByteString], Bool))
-> ([ByteString], Bool) -> [ByteString] -> ([ByteString], Bool)
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
List.foldl' ([ByteString], Bool) -> ByteString -> ([ByteString], Bool)
f ([], Bool
False)
                             ([ByteString] -> ([ByteString], Bool))
-> (ByteString -> [ByteString])
-> ByteString
-> ([ByteString], Bool)
forall b c a. (b -> c) -> (a -> b) -> a -> c
.  Word8 -> ByteString -> [ByteString]
Bytes.split Word8
escapeByte
   where
    nS :: ByteString
nS                       =  Word8 -> ByteString
Bytes.singleton Word8
nullReplaceByte
    f :: ([ByteString], Bool) -> ByteString -> ([ByteString], Bool)
f ([ByteString]
strings, Bool
True)  ByteString
""    =  (ByteString
nSByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
strings , Bool
False)
    f ([ByteString]
strings, Bool
False) ByteString
""    =  ([ByteString]
strings    , Bool
True)
    f ([ByteString]
strings, Bool
False) ByteString
s     =  (ByteString
sByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
strings  , Bool
True)
    f ([ByteString]
strings, Bool
True)  ByteString
s
      | Bool
underscore           =  (ByteString
eStByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
strings, Bool
True)
      | Bool
otherwise            =  (ByteString
sByteString -> [ByteString] -> [ByteString]
forall a. a -> [a] -> [a]
:[ByteString]
strings  , Bool
True)
     where
      underscore :: Bool
underscore             =  HasCallStack => ByteString -> Word8
ByteString -> Word8
Bytes.head ByteString
s Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Word8
Bytes.c2w Char
'_'
      eSt :: ByteString
eSt                    =  Word8 -> ByteString -> ByteString
Bytes.cons Word8
escapeByte (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
Bytes.tail ByteString
s)
   {- The second field of the tuple is the "escaped" flag and the reasoning
    - behind it's setting and unsetting is tricky. We start unescaped. If a
    - string follows another string in the list of splits, there must have
    - been an escape character to make us split it; therefore, seeing a string
    - makes us set escaping to True. However, if we see an empty string, it
    - means there were two escape characters next to one another. We
    - interpret the double escape sequence and unset the escape flag.
    -}

data EscapeChar = EscapeChar !Word8 !ByteString -- For @tr@ char list.
                                    !ByteString -- For @sed@ pattern.
                                    !ByteString -- For @sed@ replacement.
deriving instance Show EscapeChar

{-| The candidate escape characters, with the forms to be used in constructed
    @tr@ and @sed@ commands.
 -}
escapes                     ::  [EscapeChar]
escapes :: [EscapeChar]
escapes                      =  [Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x21  ByteString
"!"    ByteString
"!"    ByteString
"!",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x22  ByteString
"\""   ByteString
"\""   ByteString
"\"",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x23  ByteString
"#"    ByteString
"#"    ByteString
"#",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x24  ByteString
"$"    ByteString
"[$]"  ByteString
"$",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x25  ByteString
"%"    ByteString
"%"    ByteString
"%",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x26  ByteString
"&"    ByteString
"&"    ByteString
"\\&",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x2a  ByteString
"*"    ByteString
"[*]"  ByteString
"*",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x2b  ByteString
"+"    ByteString
"[+]"  ByteString
"+",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x2c  ByteString
","    ByteString
","    ByteString
",",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x2d  ByteString
"-"    ByteString
"-"    ByteString
"-",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x2e  ByteString
"."    ByteString
"[.]"  ByteString
".",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x2f  ByteString
"/"    ByteString
"/"    ByteString
"/",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x3a  ByteString
":"    ByteString
":"    ByteString
":",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x3b  ByteString
";"    ByteString
";"    ByteString
";",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x3d  ByteString
"="    ByteString
"="    ByteString
"=",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x3f  ByteString
"?"    ByteString
"[?]"  ByteString
"?",
                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x40  ByteString
"@"    ByteString
"@"    ByteString
"@",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x5c  ByteString
"\\\\" ByteString
"\\\\" ByteString
"\\\\",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x60  ByteString
"`"    ByteString
"`"    ByteString
"`",

                                 Word8 -> ByteString -> ByteString -> ByteString -> EscapeChar
EscapeChar  Word8
0x7e  ByteString
"~"    ByteString
"~"    ByteString
"~"]
{- We use character classes instead of \ for many characters on the pattern
 - side because \ turns special behaviour on in basic mode and off in extended
 - mode, an ambiguity that, I feel, is best not to have to think about.
 -}

{-| Many binary strings can be embedded as-is in a HEREDOC, without escaping.
 -}
safeForHereDoc              ::  ByteString -> Bool
safeForHereDoc :: ByteString -> Bool
safeForHereDoc               =  Bool -> Bool
not (Bool -> Bool) -> (ByteString -> Bool) -> ByteString -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Bool) -> ByteString -> Bool
Bytes.any (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
0x00)

{-| Predicate to determine whether data is represented as an encoded chunk or
    is unencoded.
 -}
encoded                     ::  Chunk -> Bool
encoded :: Chunk -> Bool
encoded (SafeChunk ByteString
_)        =  Bool
False
encoded (EncodedChunk ByteString
_ Int
_ EscapeChar
_ EscapeChar
_) = Bool
True

{-|
 -}
script :: Chunk -> Builder
script Chunk
block                 =  [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat ([Builder] -> Builder) -> [Builder] -> Builder
forall a b. (a -> b) -> a -> b
$ case Chunk
block of
  SafeChunk ByteString
bytes           ->  [Chunk -> Builder
script (ByteString -> Chunk
chunk ByteString
bytes)] -- Convert to Encoded
  EncodedChunk ByteString
bytes Int
len
               (EscapeChar Word8
_ ByteString
trN ByteString
_ ByteString
sedRN) (EscapeChar Word8
b ByteString
_ ByteString
sedPE ByteString
sedRE) ->
    [ Builder
"{ ", [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat [Builder]
tr, Builder
" | ", [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat [Builder]
sed, Builder
" | ", Int -> Builder
forall {a}. Show a => a -> Builder
clip Int
len, Builder
" ;}",
      Builder -> ByteString -> Builder
dataSection (Word8 -> Builder
Blaze.fromWord8 Word8
b) ByteString
bytes ]
   where
    tr :: [Builder]
tr                       =  [Builder
"tr '", ByteString -> Builder
blz ByteString
trN, Builder
"' '\\000'"]
    (Builder
e, Builder
e', Builder
n)               =  (ByteString -> Builder
blz ByteString
sedPE, ByteString -> Builder
blz ByteString
sedRE, ByteString -> Builder
blz ByteString
sedRN)
    sed :: [Builder]
sed                      =  [Builder
"sed '",Builder
"s|",Builder
e, Builder
e, Builder
"|",Builder
n, Builder
"|g",
                                   Builder
" ; ",Builder
"s|",Builder
e,Builder
"_",Builder
"|",Builder
e',Builder
"|g",Builder
"'"]
 where
  blz :: ByteString -> Builder
blz                        =  ByteString -> Builder
Blaze.fromByteString
  nl :: Builder
nl                         =  Char -> Builder
Blaze.fromChar Char
'\n'
  dataSection :: Builder -> ByteString -> Builder
dataSection Builder
eof ByteString
bytes = [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat [Builder
" <<\\", Builder
eof, Builder
nl, ByteString -> Builder
blz ByteString
bytes, Builder
nl, Builder
eof, Builder
nl]
  clip :: a -> Builder
clip a
len                   =  Builder
"head -c " Builder -> Builder -> Builder
forall a. Monoid a => a -> a -> a
`mappend` a -> Builder
forall {a}. Show a => a -> Builder
Blaze.fromShow a
len


 {- Catting a tarball escaped this way to a shell behind a TTY won't work very
  - well: a ^C or ^Z is passed literally and would cause the TTY to kill or
  - suspend the shell.
  -
  - One reason users might care about this is the 'requiretty' option in
  - sudoers, an option set by default on many systems. It prevents one from
  - running `sudo ...' over SSH without a TTY (enabled through the -t flag to
  - SSH).
  -
  - There are 33 control characters, counting delete with the leading 32. Some
  - don't need to be escaped at all -- for example, newline -- whereas for
  - others, it's unclear (like carriage return). We can trust, I think, that
  - bytes higher than 127 don't need to be escaped. In principle, we have a
  - base-222 alphabet in which to encode the data so it should still be more
  - more compact than base 64; but whether shell decoders can effectively
  - realize this efficiency is another matter.
  -}