{-# OPTIONS_GHC -Wno-partial-fields #-}
module Bio.TwoBit.Tool
    ( EncodeProgress(..)
    , buildFasta
    , faToTwoBit
    , formatCdna
    , parseAnno
    , twoBitToFa
    , vcfToTwoBit
    )
where

import           Bio.TwoBit
import           Control.Applicative
import           Control.Exception
import           Control.Monad
import           Data.Bits
import           Data.Bool
import qualified Data.ByteString                    as B
import qualified Data.ByteString.Builder            as B
import qualified Data.ByteString.Char8              as C
import qualified Data.ByteString.Lazy.Char8         as L
import           Data.ByteString.Short                      ( ShortByteString, toShort )
import qualified Data.ByteString.Short              as H
import           Data.Char                                  ( isSpace, isUpper )
import           Data.Foldable
import qualified Data.HashMap.Strict                as M
import           Data.Int                                   ( Int64 )
import           Data.Word                                  ( Word8, Word32 )
import           System.IO                                  ( stdout )

type Bytes = B.ByteString
type LazyBytes = L.ByteString

-- | A cDNA or mRNA or transcript (these are all synonymous), with some
-- metainformation collected from the annotation.  Whatever the input
-- was called, we call it 'cdna' in the transciptome.
data Cdna = Cdna
    { Cdna -> Bytes
c_id           :: !Bytes           -- identifier (typically an ENST number)
    , Cdna -> Range
c_pos          :: !Range           -- genomic position
    , Cdna -> Bytes
c_gene_id      :: !Bytes           -- gene identifier (typically an ENSG number)
    , Cdna -> Bytes
c_gene_symbol  :: !Bytes           -- colloquial name, aka locus
    , Cdna -> Bytes
c_gene_biotype :: !Bytes           -- whatever, just pass it on
    , Cdna -> Bytes
c_biotype      :: !Bytes           -- whatever, just pass it on
    , Cdna -> Bytes
c_description  :: !Bytes           -- unclear; always empty for now
    , Cdna -> [Range]
c_exons        :: [Range]         -- list of exon coordinates (sorted backwards)
    }
  deriving Int -> Cdna -> ShowS
[Cdna] -> ShowS
Cdna -> [Char]
(Int -> Cdna -> ShowS)
-> (Cdna -> [Char]) -> ([Cdna] -> ShowS) -> Show Cdna
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Cdna -> ShowS
showsPrec :: Int -> Cdna -> ShowS
$cshow :: Cdna -> [Char]
show :: Cdna -> [Char]
$cshowList :: [Cdna] -> ShowS
showList :: [Cdna] -> ShowS
Show

data Range = Range
    { Range -> Bytes
r_chrom :: !C.ByteString
    , Range -> Int
r_start :: !Int
    , Range -> Int
r_len   :: !Int }
  deriving Int -> Range -> ShowS
[Range] -> ShowS
Range -> [Char]
(Int -> Range -> ShowS)
-> (Range -> [Char]) -> ([Range] -> ShowS) -> Show Range
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Range -> ShowS
showsPrec :: Int -> Range -> ShowS
$cshow :: Range -> [Char]
show :: Range -> [Char]
$cshowList :: [Range] -> ShowS
showList :: [Range] -> ShowS
Show

reverseRange :: Range -> Range
reverseRange :: Range -> Range
reverseRange (Range Bytes
sq Int
pos Int
len) = Bytes -> Int -> Int -> Range
Range Bytes
sq (-Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
len) Int
len

null_cdna :: Cdna
null_cdna :: Cdna
null_cdna = Bytes
-> Range
-> Bytes
-> Bytes
-> Bytes
-> Bytes
-> Bytes
-> [Range]
-> Cdna
Cdna Bytes
"" (Bytes -> Int -> Int -> Range
Range Bytes
"" Int
0 Int
0) Bytes
"" Bytes
"" Bytes
"" Bytes
"" Bytes
"" []


formatCdna :: TwoBitFile -> Cdna -> B.Builder
formatCdna :: TwoBitFile -> Cdna -> Builder
formatCdna TwoBitFile
tbf Cdna{[Range]
Bytes
Range
c_id :: Cdna -> Bytes
c_pos :: Cdna -> Range
c_gene_id :: Cdna -> Bytes
c_gene_symbol :: Cdna -> Bytes
c_gene_biotype :: Cdna -> Bytes
c_biotype :: Cdna -> Bytes
c_description :: Cdna -> Bytes
c_exons :: Cdna -> [Range]
c_id :: Bytes
c_pos :: Range
c_gene_id :: Bytes
c_gene_symbol :: Bytes
c_gene_biotype :: Bytes
c_biotype :: Bytes
c_description :: Bytes
c_exons :: [Range]
..} = Builder
descr Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Int -> [Word8] -> Builder
buildFasta Int
60 [Word8]
getExons
  where
    (Bytes
_,Bytes
tbf_fn) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'/') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ TwoBitFile -> Bytes
tbf_path TwoBitFile
tbf
    (Bytes
tbf_base,Bytes
_) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'.') Bytes
tbf_fn
    genome_id :: Bytes
genome_id = if Bytes -> Bool
C.null Bytes
tbf_base then Bytes
tbf_fn else HasCallStack => Bytes -> Bytes
Bytes -> Bytes
C.init Bytes
tbf_base

    descr :: Builder
descr = Char -> Builder
B.char7 Char
'>' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Bytes -> Builder
B.byteString Bytes
c_id Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
" cdna chromosome:" Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
            Bytes -> Builder
B.byteString Bytes
genome_id Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
':' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Range -> Builder
formatRange Range
c_pos Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
            Builder
" gene:" Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Bytes -> Builder
B.byteString Bytes
c_gene_id Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
            Builder -> Bytes -> Builder
maybeBS Builder
" gene_biotype:" Bytes
c_gene_biotype Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
            Builder -> Bytes -> Builder
maybeBS Builder
" transcript_biotype:" Bytes
c_biotype Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
            Builder -> Bytes -> Builder
maybeBS Builder
" gene_symbol:" Bytes
c_gene_symbol Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
            Builder -> Bytes -> Builder
maybeBS Builder
" description:" Bytes
c_description Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
'\n'

    formatRange :: Range -> Builder
formatRange Range
r | Range -> Int
r_start Range
r Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0  = Range -> Builder
formatRange1 (Range -> Range
reverseRange Range
r) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
":-1"
                  | Bool
otherwise      = Range -> Builder
formatRange1 Range
r Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
":1"

    formatRange1 :: Range -> Builder
formatRange1 Range
r = Bytes -> Builder
B.byteString (Range -> Bytes
r_chrom Range
r) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
':' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                     Int -> Builder
B.intDec (Range -> Int
r_start Range
r) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
':' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                     Int -> Builder
B.intDec (Range -> Int
r_start Range
r Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Range -> Int
r_len Range
r Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)

    maybeBS :: Builder -> Bytes -> Builder
maybeBS Builder
p Bytes
s = if Bytes -> Bool
B.null Bytes
s then Builder
forall a. Monoid a => a
mempty else Builder
p Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Bytes -> Builder
B.byteString Bytes
s

    getExons :: [Word8]
getExons | Range -> Int
r_start Range
c_pos Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0  =  (Range -> [Word8]) -> [Range] -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Range -> [Word8]
getExon [Range]
c_exons
             | Bool
otherwise          =  (Range -> [Word8]) -> [Range] -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Range -> [Word8]
getExon ([Range] -> [Range]
forall a. [a] -> [a]
reverse [Range]
c_exons)

    getExon :: Range -> [Word8]
    getExon :: Range -> [Word8]
getExon (Range Bytes
ch Int
start Int
len) =
        case Bytes -> TwoBitFile -> Maybe TwoBitChromosome
findChrom Bytes
ch TwoBitFile
tbf of
            Just TwoBitChromosome
tbs | Int
start Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 -> Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take Int
len ([Word8] -> [Word8]) -> [Word8] -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitSequence' Unidrectional -> [Word8]
forall dir. TwoBitSequence' dir -> [Word8]
unpackRS (TwoBitSequence' Unidrectional -> [Word8])
-> TwoBitSequence' Unidrectional -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitChromosome -> Int -> TwoBitSequence' Unidrectional
tbc_fwd_seq TwoBitChromosome
tbs Int
start
                     | Bool
otherwise  -> Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take Int
len ([Word8] -> [Word8]) -> [Word8] -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitSequence' Bidirectional -> [Word8]
forall dir. TwoBitSequence' dir -> [Word8]
unpackRS (TwoBitSequence' Bidirectional -> [Word8])
-> TwoBitSequence' Bidirectional -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitChromosome -> Int -> TwoBitSequence' Bidirectional
tbc_rev_seq TwoBitChromosome
tbs (-Int
startInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
len)
            Maybe TwoBitChromosome
Nothing               -> [Char] -> [Word8]
forall a. HasCallStack => [Char] -> a
error ([Char] -> [Word8]) -> [Char] -> [Word8]
forall a b. (a -> b) -> a -> b
$ [Char]
"unknown reference " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Bytes -> [Char]
forall a. Show a => a -> [Char]
show Bytes
ch




buildFasta :: Int -> [Word8] -> B.Builder
buildFasta :: Int -> [Word8] -> Builder
buildFasta Int
n = [Word8] -> Builder
go
  where
    go :: [Word8] -> Builder
go [   ] = Builder
forall a. Monoid a => a
mempty
    go [Word8]
s = let ([Word8]
u,[Word8]
v) = Int -> [Word8] -> ([Word8], [Word8])
forall a. Int -> [a] -> ([a], [a])
splitAt Int
n [Word8]
s
               in (Word8 -> Builder) -> [Word8] -> Builder
forall m a. Monoid m => (a -> m) -> [a] -> m
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap Word8 -> Builder
B.word8 [Word8]
u Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
'\n' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
v
{-# INLINE buildFasta #-}

twoBitToFa :: Int -> TwoBitSequence' dir -> IO ()
twoBitToFa :: forall dir. Int -> TwoBitSequence' dir -> IO ()
twoBitToFa Int
ln = Handle -> Builder -> IO ()
B.hPutBuilder Handle
stdout (Builder -> IO ())
-> (TwoBitSequence' dir -> Builder) -> TwoBitSequence' dir -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Word8] -> Builder
buildFasta Int
60 ([Word8] -> Builder)
-> (TwoBitSequence' dir -> [Word8])
-> TwoBitSequence' dir
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take Int
ln ([Word8] -> [Word8])
-> (TwoBitSequence' dir -> [Word8])
-> TwoBitSequence' dir
-> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TwoBitSequence' dir -> [Word8]
forall dir. TwoBitSequence' dir -> [Word8]
unpackRSMasked


data EncodeProgress
    = EncodeProgress
        { EncodeProgress -> ShortByteString
ep_seqname    :: !ShortByteString
        , EncodeProgress -> Word32
ep_position   :: !Word32
        , EncodeProgress -> Word32
ep_hardmasked :: !Word32
        , EncodeProgress -> Word32
ep_softmasked :: !Word32
        , EncodeProgress -> Int64
ep_enclength  :: !Int64
        , EncodeProgress -> EncodeProgress
ep_tail       ::  EncodeProgress }
    | Encoded B.Builder


-- Strategy:  We can only write the packedDNA after we wrote the nBlocks
-- and mBlocks.  So packedDNA needs to be buffered.  We have to do three
-- simultaneous strict folds of the input, all of which result in reasonably
-- compact structures (name table, mask table, encoded dna), which get
-- concatenated at the end.
--
-- We also have to buffer everything, since the header with the sequence
-- names must be written first.  Oh joy.
--
-- We return a list of progress notifications terminated by the
-- 'B.Builder' for the whole 2bit file. The progress messages can be
-- printed or ignored; in either case, they should ensure enough
-- strictness to not waste more memory than necessary.

faToTwoBit :: L.ByteString -> EncodeProgress
faToTwoBit :: LazyBytes -> EncodeProgress
faToTwoBit = [(ShortByteString, LazyBytes)] -> LazyBytes -> EncodeProgress
get_each []
  where
    get_each :: [(ShortByteString, LazyBytes)] -> LazyBytes -> EncodeProgress
get_each [(ShortByteString, LazyBytes)]
acc LazyBytes
inp = case LazyBytes -> Maybe (Char, LazyBytes)
L.uncons (LazyBytes -> Maybe (Char, LazyBytes))
-> LazyBytes -> Maybe (Char, LazyBytes)
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> LazyBytes -> LazyBytes
L.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'>') LazyBytes
inp of
                        Maybe (Char, LazyBytes)
Nothing     -> Builder -> EncodeProgress
Encoded (Builder -> EncodeProgress) -> Builder -> EncodeProgress
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> Builder
seqs_to_twobit ([(ShortByteString, LazyBytes)] -> Builder)
-> [(ShortByteString, LazyBytes)] -> Builder
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. [a] -> [a]
reverse [(ShortByteString, LazyBytes)]
acc
                        Just (Char
_,LazyBytes
s2) ->
                            let (LazyBytes
nm, LazyBytes
s') = (Char -> Bool) -> LazyBytes -> (LazyBytes, LazyBytes)
L.break (Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
' ') LazyBytes
s2
                            in [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc (Bytes -> ShortByteString
toShort (LazyBytes -> Bytes
L.toStrict LazyBytes
nm)) Word32
0 (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil)
                                    (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil) (Int -> Word8 -> Accu -> BaseAccu
BaseAccu Int
0 Word8
0 Accu
emptyAccu)
                                    ((Char -> Bool) -> LazyBytes -> LazyBytes
L.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\n') LazyBytes
s')

    get_one :: [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc !ShortByteString
nm !Word32
pos !GapList
ns !GapList
ms !BaseAccu
bs LazyBytes
inp = case LazyBytes -> Maybe (Char, LazyBytes)
L.uncons LazyBytes
inp of
        Maybe (Char, LazyBytes)
Nothing            -> LazyBytes -> EncodeProgress
fin LazyBytes
L.empty
        Just (Char
c,LazyBytes
s')
            | Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
' '     -> [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc ShortByteString
nm Word32
pos GapList
ns GapList
ms BaseAccu
bs LazyBytes
s'
            | Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'>'     -> LazyBytes -> EncodeProgress
fin (Char -> LazyBytes -> LazyBytes
L.cons Char
c LazyBytes
s')
            | Bool
otherwise    -> [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc ShortByteString
nm (Word32 -> Word32
forall a. Enum a => a -> a
succ Word32
pos)
                                      (GapList -> Word32 -> Char -> GapList
collect_Ns GapList
ns Word32
pos Char
c)
                                      (GapList -> Word32 -> Char -> GapList
collect_ms GapList
ms Word32
pos Char
c)
                                      (BaseAccu -> Char -> BaseAccu
collect_bases BaseAccu
bs Char
c) LazyBytes
s'
      where
        fin :: LazyBytes -> EncodeProgress
fin LazyBytes
k = let !r :: LazyBytes
r = Word32 -> GapList -> GapList -> BaseAccu -> LazyBytes
encode_seq Word32
pos GapList
ns GapList
ms BaseAccu
bs
                in ShortByteString
-> Word32
-> Word32
-> Word32
-> Int64
-> EncodeProgress
-> EncodeProgress
EncodeProgress ShortByteString
nm Word32
pos (Word32 -> GapList -> Word32
sum_L2i Word32
pos GapList
ns) (Word32 -> GapList -> Word32
sum_L2i Word32
pos GapList
ms) (LazyBytes -> Int64
L.length LazyBytes
r) (EncodeProgress -> EncodeProgress)
-> EncodeProgress -> EncodeProgress
forall a b. (a -> b) -> a -> b
$
                   [(ShortByteString, LazyBytes)] -> LazyBytes -> EncodeProgress
get_each ((ShortByteString
nm,LazyBytes
r)(ShortByteString, LazyBytes)
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. a -> [a] -> [a]
:[(ShortByteString, LazyBytes)]
acc) LazyBytes
k

-- | Extracts the reference from a VCF.  This assumes the presence of at
-- least one record per site.  The VCF must be sorted by position.  When
-- writing out, we try to match the order of the contigs as listed in
-- the header.  Unlisted contigs follow at the end with their order
-- preserved; contigs without data are not written at all.
vcfToTwoBit :: [B.ByteString] -> EncodeProgress
vcfToTwoBit :: [Bytes] -> EncodeProgress
vcfToTwoBit [Bytes]
s0 = let ([(ShortByteString, Word32)]
lns, [Bytes]
s1) = [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header [] [Bytes]
s0
                 in [(ShortByteString, Word32)]
-> [(ShortByteString, LazyBytes)] -> [Bytes] -> EncodeProgress
get_each [(ShortByteString, Word32)]
lns [] ([Bytes] -> EncodeProgress) -> [Bytes] -> EncodeProgress
forall a b. (a -> b) -> a -> b
$ (Bytes -> Bool) -> [Bytes] -> [Bytes]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Bytes
s -> Bool -> Bool
not (Bytes -> Bool
B.null Bytes
s) Bool -> Bool -> Bool
&& Bytes -> Char
C.head Bytes
s Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#') [Bytes]
s1
  where
    -- Collects the "contig" stanzas, parses their lengths.  Returns the
    -- length map and the remaining stream.
    read_header :: [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header [(ShortByteString, Word32)]
acc [    ]                                         = ([(ShortByteString, Word32)] -> [(ShortByteString, Word32)]
forall a. [a] -> [a]
reverse [(ShortByteString, Word32)]
acc, [])
    read_header [(ShortByteString, Word32)]
acc (Bytes
l:[Bytes]
ls) | Bytes
"##contig=" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
l
                           , (Just !ShortByteString
nm, Just !Word32
ln) <- Bytes -> (Maybe ShortByteString, Maybe Word32)
parse_cline Bytes
l = [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header ((ShortByteString
nm,Word32
ln)(ShortByteString, Word32)
-> [(ShortByteString, Word32)] -> [(ShortByteString, Word32)]
forall a. a -> [a] -> [a]
:[(ShortByteString, Word32)]
acc) [Bytes]
ls
                           | Bytes
"#" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
l                  = [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header [(ShortByteString, Word32)]
acc [Bytes]
ls
                           | Bool
otherwise                             = ([(ShortByteString, Word32)] -> [(ShortByteString, Word32)]
forall a. [a] -> [a]
reverse [(ShortByteString, Word32)]
acc, Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
ls)

    parse_cline :: Bytes -> (Maybe ShortByteString, Maybe Word32)
parse_cline = Bytes -> (Maybe ShortByteString, Maybe Word32)
forall {a}. Num a => Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe Word32))
-> (Bytes -> Bytes)
-> Bytes
-> (Maybe ShortByteString, Maybe Word32)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.filter (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'>') (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'<')
      where
        p1 :: Bytes -> (Maybe ShortByteString, Maybe a)
p1 Bytes
s | Bytes
"ID=" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
s = let (Bytes
nm,Bytes
t) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
',') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
3 Bytes
s
                                            (Maybe ShortByteString
_,Maybe a
ln) = Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 Bytes
t
                                        in (ShortByteString -> Maybe ShortByteString
forall a. a -> Maybe a
Just (Bytes -> ShortByteString
toShort Bytes
nm),Maybe a
ln)

             | Bytes
"length=" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
s = case Bytes -> Maybe (Int, Bytes)
C.readInt (Bytes -> Maybe (Int, Bytes)) -> Bytes -> Maybe (Int, Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
7 Bytes
s of
                    Just (Int
ln,Bytes
u) -> let (Maybe ShortByteString
nm,Maybe a
_) = Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
',') Bytes
u in (Maybe ShortByteString
nm,a -> Maybe a
forall a. a -> Maybe a
Just (Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
ln))
                    Maybe (Int, Bytes)
Nothing     -> Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
',') Bytes
s

             | Bytes -> Bool
C.null Bytes
s = (Maybe ShortByteString
forall a. Maybe a
Nothing,Maybe a
forall a. Maybe a
Nothing)
             | Bool
otherwise = Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
',') Bytes
s

    get_each :: [(ShortByteString,Word32)]
             -> [(ShortByteString, LazyBytes)]
             -> [B.ByteString]
             -> EncodeProgress
    get_each :: [(ShortByteString, Word32)]
-> [(ShortByteString, LazyBytes)] -> [Bytes] -> EncodeProgress
get_each [(ShortByteString, Word32)]
lns [(ShortByteString, LazyBytes)]
acc [    ] = Builder -> EncodeProgress
Encoded (Builder -> EncodeProgress) -> Builder -> EncodeProgress
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> Builder
seqs_to_twobit ([(ShortByteString, LazyBytes)] -> Builder)
-> [(ShortByteString, LazyBytes)] -> Builder
forall a b. (a -> b) -> a -> b
$ [ShortByteString]
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder (((ShortByteString, Word32) -> ShortByteString)
-> [(ShortByteString, Word32)] -> [ShortByteString]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString, Word32) -> ShortByteString
forall a b. (a, b) -> a
fst [(ShortByteString, Word32)]
lns) ([(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)])
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. [a] -> [a]
reverse [(ShortByteString, LazyBytes)]
acc
    get_each [(ShortByteString, Word32)]
lns [(ShortByteString, LazyBytes)]
acc (Bytes
l:[Bytes]
s2) = ShortByteString
-> Word32
-> Word32
-> Word32
-> Int64
-> EncodeProgress
-> EncodeProgress
EncodeProgress ShortByteString
nm' Word32
ln' (Word32 -> GapList -> Word32
sum_L2i Word32
ln' GapList
ns') Word32
0 (LazyBytes -> Int64
L.length LazyBytes
r) (EncodeProgress -> EncodeProgress)
-> EncodeProgress -> EncodeProgress
forall a b. (a -> b) -> a -> b
$
                              [(ShortByteString, Word32)]
-> [(ShortByteString, LazyBytes)] -> [Bytes] -> EncodeProgress
get_each [(ShortByteString, Word32)]
lns ((ShortByteString
nm',LazyBytes
r)(ShortByteString, LazyBytes)
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. a -> [a] -> [a]
:[(ShortByteString, LazyBytes)]
acc) [Bytes]
s3
      where
        nm :: Bytes
nm = (Word8 -> Bool) -> Bytes -> Bytes
B.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
9) Bytes
l
        (Word32
pos,GapList
ns,BaseAccu
bs,[Bytes]
s3) = Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm Word32
0 (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil) (Int -> Word8 -> Accu -> BaseAccu
BaseAccu Int
0 Word8
0 Accu
emptyAccu) (Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
s2)
        !nm' :: ShortByteString
nm' = Bytes -> ShortByteString
toShort Bytes
nm
        (GapList
ns',BaseAccu
bs',Word32
ln') = case ((ShortByteString, Word32) -> Bool)
-> [(ShortByteString, Word32)] -> Maybe (ShortByteString, Word32)
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find (ShortByteString -> ShortByteString -> Bool
forall a. Eq a => a -> a -> Bool
(==) ShortByteString
nm' (ShortByteString -> Bool)
-> ((ShortByteString, Word32) -> ShortByteString)
-> (ShortByteString, Word32)
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShortByteString, Word32) -> ShortByteString
forall a b. (a, b) -> a
fst) [(ShortByteString, Word32)]
lns of
                            Just (ShortByteString
_,Word32
ln) | Word32
ln Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
> Word32
pos -> (GapList -> Word32 -> GapList
extend_gap GapList
ns Word32
ln, BaseAccu -> Int -> BaseAccu
pad_bases BaseAccu
bs (Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int) -> Word32 -> Int
forall a b. (a -> b) -> a -> b
$ Word32
lnWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
pos), Word32
ln)
                            Maybe (ShortByteString, Word32)
_                      -> (GapList
ns,BaseAccu
bs,Word32
pos)
        !r :: LazyBytes
r = Word32 -> GapList -> GapList -> BaseAccu -> LazyBytes
encode_seq Word32
ln' GapList
ns' (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil) BaseAccu
bs'


    -- important: 1-based coordinates!
    get_one :: Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one !Bytes
_nm !Word32
pos !GapList
ns !BaseAccu
bs [    ]     =  (Word32
pos,GapList
ns,BaseAccu
bs,[])
    get_one  !Bytes
nm !Word32
pos !GapList
ns !BaseAccu
bs (Bytes
l:[Bytes]
s')
            | (Word8 -> Bool) -> Bytes -> Bytes
B.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) Bytes
l Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
/= Bytes
nm  =  (Word32
pos,GapList
ns,BaseAccu
bs,Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
s')

            | Just (Int
pos',Bytes
l3) <- Bytes -> Maybe (Int, Bytes)
C.readInt (Bytes -> Maybe (Int, Bytes))
-> (Bytes -> Bytes) -> Bytes -> Maybe (Int, Bytes)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Maybe (Int, Bytes)) -> Bytes -> Maybe (Int, Bytes)
forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Bytes -> Bytes
B.dropWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) Bytes
l
            , Bytes
ref <- (Word8 -> Bool) -> Bytes -> Bytes
B.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Bytes -> Bytes
B.drop Int
1 (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Bool) -> Bytes -> Bytes
B.dropWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
B.drop Int
1 Bytes
l3
            , Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pos' Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word32
pos Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ Word32
1
            , Bool -> Bool
not (Bytes -> Bool
C.null Bytes
ref) =
                if Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pos' Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
pos Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ Word32
1
                    -- record in sequence
                    then Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm (Word32 -> Word32
forall a. Enum a => a -> a
succ Word32
pos) (GapList -> Word32 -> Char -> GapList
collect_Ns GapList
ns Word32
pos (Char -> GapList) -> Char -> GapList
forall a b. (a -> b) -> a -> b
$ Bytes -> Char
C.head Bytes
ref)
                                               (BaseAccu -> Char -> BaseAccu
collect_bases BaseAccu
bs  (Char -> BaseAccu) -> Char -> BaseAccu
forall a b. (a -> b) -> a -> b
$ Bytes -> Char
C.head Bytes
ref) [Bytes]
s'
                    -- gap:  handle the gap, reprocess the record
                    else let gap_len :: Int
gap_len = Int
pos' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
pos Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
                         in Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pos' Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
- Word32
1) (GapList -> Word32 -> GapList
extend_gap GapList
ns Word32
pos)
                                       (BaseAccu -> Int -> BaseAccu
pad_bases BaseAccu
bs Int
gap_len) (Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
s')

            -- anything else can be ignored (parse errors or additional records)
            | Bool
otherwise                  =  Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm Word32
pos GapList
ns BaseAccu
bs [Bytes]
s'


    pad_bases :: BaseAccu -> Int -> BaseAccu
pad_bases BaseAccu
bs Int
n = (BaseAccu -> Char -> BaseAccu) -> BaseAccu -> [Char] -> BaseAccu
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' BaseAccu -> Char -> BaseAccu
collect_bases BaseAccu
bs ([Char] -> BaseAccu) -> [Char] -> BaseAccu
forall a b. (a -> b) -> a -> b
$ Int -> Char -> [Char]
forall a. Int -> a -> [a]
replicate Int
n Char
'T'

    -- Reorder a key-value list so it matches the order of a list of
    -- keys.  Missing keys are ignored, leftover pairs retain their
    -- original order.
    reorder :: Eq a => [a] -> [(a,b)] -> [(a,b)]
    reorder :: forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder [    ] [(a, b)]
vs = [(a, b)]
vs
    reorder (a
k:[a]
ks) [(a, b)]
vs = [(a, b)] -> [(a, b)] -> [(a, b)]
forall {b}. [(a, b)] -> [(a, b)] -> [(a, b)]
go [] [(a, b)]
vs
      where
        go :: [(a, b)] -> [(a, b)] -> [(a, b)]
go [(a, b)]
xs ((a
k1,b
v1):[(a, b)]
ys) | a
k  a -> a -> Bool
forall a. Eq a => a -> a -> Bool
==  a
k1 = (a
k1,b
v1) (a, b) -> [(a, b)] -> [(a, b)]
forall a. a -> [a] -> [a]
: [a] -> [(a, b)] -> [(a, b)]
forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder [a]
ks ([(a, b)] -> [(a, b)]
forall a. [a] -> [a]
reverse [(a, b)]
xs [(a, b)] -> [(a, b)] -> [(a, b)]
forall a. [a] -> [a] -> [a]
++ [(a, b)]
ys)
                           | Bool
otherwise = [(a, b)] -> [(a, b)] -> [(a, b)]
go ((a
k1,b
v1)(a, b) -> [(a, b)] -> [(a, b)]
forall a. a -> [a] -> [a]
:[(a, b)]
xs) [(a, b)]
ys
        go [(a, b)]
xs [          ]             = [a] -> [(a, b)] -> [(a, b)]
forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder [a]
ks ([(a, b)] -> [(a, b)]
forall a. [a] -> [a]
reverse [(a, b)]
xs)


-- List of pairs of 'Word32's.  Specialized and unpacked to conserve space.  Probably overkill...
data L2i = L2i {-# UNPACK #-} !Word32 {-# UNPACK #-} !Word32 L2i | L2i_Nil

data GapList = GapList !Word32 !L2i

sum_L2i :: Word32 -> GapList -> Word32
sum_L2i :: Word32 -> GapList -> Word32
sum_L2i Word32
p (GapList Word32
q L2i
xs) = Word32 -> L2i -> Word32
go (if Word32
q Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound then Word32
0 else Word32
pWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
q) L2i
xs
  where
    go :: Word32 -> L2i -> Word32
go !Word32
a (L2i Word32
x Word32
y L2i
z) = Word32 -> L2i -> Word32
go (Word32
aWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+Word32
yWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
x) L2i
z
    go !Word32
a  L2i
L2i_Nil    = Word32
a

encodeL2i :: L2i -> B.Builder
encodeL2i :: L2i -> Builder
encodeL2i = Word32 -> Builder -> Builder -> L2i -> Builder
go Word32
0 Builder
forall a. Monoid a => a
mempty Builder
forall a. Monoid a => a
mempty
  where
    go :: Word32 -> Builder -> Builder -> L2i -> Builder
go !Word32
n Builder
ss Builder
ls  L2i
L2i_Nil     = Word32 -> Builder
B.word32LE Word32
n Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ss Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ls
    go !Word32
n Builder
ss Builder
ls (L2i Word32
s Word32
e L2i
rs) = Word32 -> Builder -> Builder -> L2i -> Builder
go (Word32 -> Word32
forall a. Enum a => a -> a
succ Word32
n) (Word32 -> Builder
B.word32LE Word32
s Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ss) (Word32 -> Builder
B.word32LE (Word32
eWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
s) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ls) L2i
rs

seqs_to_twobit :: [(ShortByteString, LazyBytes)] -> B.Builder
seqs_to_twobit :: [(ShortByteString, LazyBytes)] -> Builder
seqs_to_twobit [(ShortByteString, LazyBytes)]
seqs = Word32 -> Builder
B.word32LE Word32
0x1A412743 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
B.word32LE Word32
0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                      Word32 -> Builder
B.word32LE (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word32) -> Int -> Word32
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(ShortByteString, LazyBytes)]
seqs) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
B.word32LE Word32
0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                      [Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat ((ShortByteString -> Int -> Builder)
-> [ShortByteString] -> [Int] -> [Builder]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (\ShortByteString
nm Int
off -> Word8 -> Builder
B.word8 (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (ShortByteString -> Int
H.length ShortByteString
nm)) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                                                   ShortByteString -> Builder
B.shortByteString ShortByteString
nm Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                                                   Word32 -> Builder
B.word32LE (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
off))
                                       (((ShortByteString, LazyBytes) -> ShortByteString)
-> [(ShortByteString, LazyBytes)] -> [ShortByteString]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString, LazyBytes) -> ShortByteString
forall a b. (a, b) -> a
fst [(ShortByteString, LazyBytes)]
seqs) [Int]
offsets) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
                      ((ShortByteString, LazyBytes) -> Builder)
-> [(ShortByteString, LazyBytes)] -> Builder
forall m a. Monoid m => (a -> m) -> [a] -> m
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap (LazyBytes -> Builder
B.lazyByteString (LazyBytes -> Builder)
-> ((ShortByteString, LazyBytes) -> LazyBytes)
-> (ShortByteString, LazyBytes)
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShortByteString, LazyBytes) -> LazyBytes
forall a b. (a, b) -> b
snd) [(ShortByteString, LazyBytes)]
seqs
  where
    offset0 :: Int
offset0 = Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
5 Int -> Int -> Int
forall a. Num a => a -> a -> a
* [(ShortByteString, LazyBytes)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(ShortByteString, LazyBytes)]
seqs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ [Int] -> Int
forall a. Num a => [a] -> a
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum (((ShortByteString, LazyBytes) -> Int)
-> [(ShortByteString, LazyBytes)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString -> Int
H.length (ShortByteString -> Int)
-> ((ShortByteString, LazyBytes) -> ShortByteString)
-> (ShortByteString, LazyBytes)
-> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShortByteString, LazyBytes) -> ShortByteString
forall a b. (a, b) -> a
fst) [(ShortByteString, LazyBytes)]
seqs)
    offsets :: [Int]
offsets = (Int -> LazyBytes -> Int) -> Int -> [LazyBytes] -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl (\Int
a LazyBytes
b -> Int
a Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (LazyBytes -> Int64
L.length LazyBytes
b)) Int
offset0 ([LazyBytes] -> [Int]) -> [LazyBytes] -> [Int]
forall a b. (a -> b) -> a -> b
$ ((ShortByteString, LazyBytes) -> LazyBytes)
-> [(ShortByteString, LazyBytes)] -> [LazyBytes]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString, LazyBytes) -> LazyBytes
forall a b. (a, b) -> b
snd [(ShortByteString, LazyBytes)]
seqs


-- | A way to accumulate bytes.  If the accumulated bytes will hang
-- around in memory, this has much lower overhead than 'Builder'.  If it
-- has short lifetime, 'Builder' is much more convenient.
newtype Accu = Accu [Bytes]

emptyAccu :: Accu
emptyAccu :: Accu
emptyAccu = [Bytes] -> Accu
Accu []

-- | Appends bytes to a collection of 'Bytes' in such a way that the
-- 'Bytes' keep doubling in size.  This ensures O(n) time and space
-- complexity and fairly low overhead.
grow :: Word8 -> Accu -> Accu
grow :: Word8 -> Accu -> Accu
grow Word8
w = Int -> [Bytes] -> Accu -> Accu
go Int
1 [Word8 -> Bytes
B.singleton Word8
w]
  where
    go :: Int -> [Bytes] -> Accu -> Accu
go Int
l [Bytes]
acc (Accu (Bytes
s:[Bytes]
ss))
        | Bytes -> Int
B.length Bytes
s Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
l  = Int -> [Bytes] -> Accu -> Accu
go (Int
lInt -> Int -> Int
forall a. Num a => a -> a -> a
+Bytes -> Int
B.length Bytes
s) (Bytes
sBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
acc) ([Bytes] -> Accu
Accu [Bytes]
ss)
        | Bool
otherwise        = let !s' :: Bytes
s' = [Bytes] -> Bytes
B.concat [Bytes]
acc in  [Bytes] -> Accu
Accu (Bytes
s' Bytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
: Bytes
s Bytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
: [Bytes]
ss)
    go Int
_ [Bytes]
acc (Accu [    ]) = let !s' :: Bytes
s' = [Bytes] -> Bytes
B.concat [Bytes]
acc in  [Bytes] -> Accu
Accu [Item [Bytes]
Bytes
s']

buildAccu :: Accu -> B.Builder
buildAccu :: Accu -> Builder
buildAccu (Accu [Bytes]
ss) = (Bytes -> Builder) -> [Bytes] -> Builder
forall m a. Monoid m => (a -> m) -> [a] -> m
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap Bytes -> Builder
B.byteString ([Bytes] -> Builder) -> [Bytes] -> Builder
forall a b. (a -> b) -> a -> b
$ [Bytes] -> [Bytes]
forall a. [a] -> [a]
reverse [Bytes]
ss

encode_seq :: Word32                                    -- ^ length
           -> GapList                                   -- ^ list of N stretches
           -> GapList                                   -- ^ list of mask stretches
           -> BaseAccu                                  -- ^ accumulated bases
           -> LazyBytes

encode_seq :: Word32 -> GapList -> GapList -> BaseAccu -> LazyBytes
encode_seq Word32
pos GapList
ns GapList
ms BaseAccu
bs = LazyBytes -> Int64
L.length LazyBytes
r Int64 -> LazyBytes -> LazyBytes
forall a b. a -> b -> b
`seq` LazyBytes
r
  where
    ss' :: Accu
ss' = case BaseAccu
bs of (BaseAccu Int
0 Word8
_ Accu
ss) -> Accu
ss
                     (BaseAccu Int
n Word8
w Accu
ss) -> Word8 -> Accu -> Accu
grow (Word8
w Word8 -> Int -> Word8
forall a. Bits a => a -> Int -> a
`shiftL` (Int
8Int -> Int -> Int
forall a. Num a => a -> a -> a
-Int
2Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
n)) Accu
ss
    r :: LazyBytes
r = Builder -> LazyBytes
B.toLazyByteString (Builder -> LazyBytes) -> Builder -> LazyBytes
forall a b. (a -> b) -> a -> b
$
              Word32 -> Builder
B.word32LE Word32
pos Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
              L2i -> Builder
encodeL2i (case GapList
ns of GapList Word32
p L2i
rs | Word32
p Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound -> L2i
rs ; GapList Word32
p L2i
rs -> Word32 -> Word32 -> L2i -> L2i
L2i Word32
p Word32
pos L2i
rs) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
              L2i -> Builder
encodeL2i (case GapList
ms of GapList Word32
p L2i
rs | Word32
p Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound -> L2i
rs ; GapList Word32
p L2i
rs -> Word32 -> Word32 -> L2i -> L2i
L2i Word32
p Word32
pos L2i
rs) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
              Word32 -> Builder
B.word32LE Word32
0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
              Accu -> Builder
buildAccu Accu
ss'

-- | Collects stretches of Ns by looking at one character at a time.  In
-- reality, anything that isn't one of \"ACGT\" is treated as an N.
collect_Ns :: GapList -> Word32 -> Char -> GapList
collect_Ns :: GapList -> Word32 -> Char -> GapList
collect_Ns (GapList Word32
spos L2i
rs) Word32
pos Char
c
    | Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound Bool -> Bool -> Bool
&& Char
c Char -> Bytes -> Bool
`C.elem` Bytes
"ACGTacgt" = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
rs
    | Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound                          = Word32 -> L2i -> GapList
GapList      Word32
pos L2i
rs
    |                     Char
c Char -> Bytes -> Bool
`C.elem` Bytes
"ACGTacgt" = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound (Word32 -> Word32 -> L2i -> L2i
L2i Word32
spos Word32
pos L2i
rs)
    | Bool
otherwise                                 = Word32 -> L2i -> GapList
GapList     Word32
spos L2i
rs

-- | Collects stretches of masked dna by looking at one letter at a
-- time.  Anything lowercase is considered masked.
collect_ms :: GapList -> Word32 -> Char -> GapList
collect_ms :: GapList -> Word32 -> Char -> GapList
collect_ms (GapList Word32
spos L2i
rs) Word32
pos Char
c
    | Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound Bool -> Bool -> Bool
&& Char -> Bool
isUpper Char
c = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
rs
    | Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound              = Word32 -> L2i -> GapList
GapList      Word32
pos L2i
rs
    |                     Char -> Bool
isUpper Char
c = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound (Word32 -> Word32 -> L2i -> L2i
L2i Word32
spos Word32
pos L2i
rs)
    | Bool
otherwise                     = Word32 -> L2i -> GapList
GapList     Word32
spos L2i
rs

extend_gap :: GapList -> Word32 -> GapList
extend_gap :: GapList -> Word32 -> GapList
extend_gap (GapList Word32
spos L2i
rs) Word32
pos
    | Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound = Word32 -> L2i -> GapList
GapList  Word32
pos L2i
rs
    | Bool
otherwise        = Word32 -> L2i -> GapList
GapList Word32
spos L2i
rs


data BaseAccu = BaseAccu !Int !Word8 !Accu

-- | Collects bases in 2bit format.  It accumulates 4 bases in one word,
-- then collects bytes in an 'Accu'.  From the 2bit spec:
--
-- packedDna - the DNA packed to two bits per base, represented as
--             so: T - 00, C - 01, A - 10, G - 11. The first base is
--             in the most significant 2-bit byte; the last base is
--             in the least significant 2 bits. For example, the
--             sequence TCAG is represented as 00011011.
collect_bases :: BaseAccu -> Char -> BaseAccu
collect_bases :: BaseAccu -> Char -> BaseAccu
collect_bases (BaseAccu Int
n Word8
w Accu
ss) Char
c
    = let code :: Word8
code = case Char
c of Char
'C'->Word8
1;Char
'c'->Word8
1;Char
'A'->Word8
2;Char
'a'->Word8
2;Char
'G'->Word8
3;Char
'g'->Word8
3;Char
_->Word8
0
          w' :: Word8
w'   = Word8 -> Int -> Word8
forall a. Bits a => a -> Int -> a
shiftL Word8
w Int
2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. Word8
code
      in if Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
3 then Int -> Word8 -> Accu -> BaseAccu
BaseAccu Int
0 Word8
0 (Word8 -> Accu -> Accu
grow Word8
w' Accu
ss) else Int -> Word8 -> Accu -> BaseAccu
BaseAccu (Int -> Int
forall a. Enum a => a -> a
succ Int
n) Word8
w' Accu
ss

data Gene = Gene { Gene -> Bytes
g_id :: Bytes, Gene -> Bytes
g_symbol :: Bytes, Gene -> Bytes
g_biotype :: Bytes }

null_gene :: Gene
null_gene :: Gene
null_gene = Bytes -> Bytes -> Bytes -> Gene
Gene Bytes
"" Bytes
"" Bytes
""

data GffError = GffError String Int GffErrorDetail deriving Int -> GffError -> ShowS
[GffError] -> ShowS
GffError -> [Char]
(Int -> GffError -> ShowS)
-> (GffError -> [Char]) -> ([GffError] -> ShowS) -> Show GffError
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GffError -> ShowS
showsPrec :: Int -> GffError -> ShowS
$cshow :: GffError -> [Char]
show :: GffError -> [Char]
$cshowList :: [GffError] -> ShowS
showList :: [GffError] -> ShowS
Show
data GffErrorDetail = GffParseError | GffIdMismatch | GffUnknownRef Bytes deriving Int -> GffErrorDetail -> ShowS
[GffErrorDetail] -> ShowS
GffErrorDetail -> [Char]
(Int -> GffErrorDetail -> ShowS)
-> (GffErrorDetail -> [Char])
-> ([GffErrorDetail] -> ShowS)
-> Show GffErrorDetail
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GffErrorDetail -> ShowS
showsPrec :: Int -> GffErrorDetail -> ShowS
$cshow :: GffErrorDetail -> [Char]
show :: GffErrorDetail -> [Char]
$cshowList :: [GffErrorDetail] -> ShowS
showList :: [GffErrorDetail] -> ShowS
Show

instance Exception GffError where
    displayException :: GffError -> [Char]
displayException (GffError [Char]
fp Int
ln GffErrorDetail
dt) = GffErrorDetail -> [Char]
displayDetail GffErrorDetail
dt [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" in line " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
ln [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" of gff file " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
fp
      where
        displayDetail :: GffErrorDetail -> [Char]
displayDetail  GffErrorDetail
GffParseError     = [Char]
"parse error"
        displayDetail  GffErrorDetail
GffIdMismatch     = [Char]
"identifier does not match"
        displayDetail (GffUnknownRef Bytes
ch) = [Char]
"unknown reference " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Bytes -> [Char]
forall a. Show a => a -> [Char]
show Bytes
ch

-- | Parses annotations in GFF format.  We want to turn an annotation
-- and a 2bit file into a FastA of the transcriptome (one sequence per
-- annotated transcript), that looks like the stuff Lior Pachter feeds
-- into Kallisto.  Annotations come in two dialects of GFF, either GFF3
-- or GTF.  We autodetect and understand both.

parseAnno :: String -> L.ByteString -> [Either GffError Cdna]
parseAnno :: [Char] -> LazyBytes -> [Either GffError Cdna]
parseAnno [Char]
fp = (Either GffError Cdna -> Bool)
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. (a -> Bool) -> [a] -> [a]
filter ((GffError -> Bool)
-> (Cdna -> Bool) -> Either GffError Cdna -> Bool
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Bool -> GffError -> Bool
forall a b. a -> b -> a
const Bool
True) (Bool -> Bool
not (Bool -> Bool) -> (Cdna -> Bool) -> Cdna -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Range] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null ([Range] -> Bool) -> (Cdna -> [Range]) -> Cdna -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Cdna -> [Range]
c_exons)) ([Either GffError Cdna] -> [Either GffError Cdna])
-> (LazyBytes -> [Either GffError Cdna])
-> LazyBytes
-> [Either GffError Cdna]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
               Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
null_gene Cdna
null_cdna ([(Int, [Bytes])] -> [Either GffError Cdna])
-> (LazyBytes -> [(Int, [Bytes])])
-> LazyBytes
-> [Either GffError Cdna]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
               ((Int, Bytes) -> (Int, [Bytes]))
-> [(Int, Bytes)] -> [(Int, [Bytes])]
forall a b. (a -> b) -> [a] -> [b]
map ((Bytes -> [Bytes]) -> (Int, Bytes) -> (Int, [Bytes])
forall a b. (a -> b) -> (Int, a) -> (Int, b)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Char -> Bytes -> [Bytes]
C.split Char
'\t')) ([(Int, Bytes)] -> [(Int, [Bytes])])
-> (LazyBytes -> [(Int, Bytes)]) -> LazyBytes -> [(Int, [Bytes])]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
               ((Int, Bytes) -> Bool) -> [(Int, Bytes)] -> [(Int, Bytes)]
forall a. (a -> Bool) -> [a] -> [a]
filter (\(Int
_,Bytes
s) -> Bool -> Bool
not (Bytes -> Bool
B.null Bytes
s) Bool -> Bool -> Bool
&& Bytes -> Char
C.head Bytes
s Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#') ([(Int, Bytes)] -> [(Int, Bytes)])
-> (LazyBytes -> [(Int, Bytes)]) -> LazyBytes -> [(Int, Bytes)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
               [Int] -> [Bytes] -> [(Int, Bytes)]
forall a b. [a] -> [b] -> [(a, b)]
zip (Int -> [Int]
forall a. Enum a => a -> [a]
enumFrom Int
1) ([Bytes] -> [(Int, Bytes)])
-> (LazyBytes -> [Bytes]) -> LazyBytes -> [(Int, Bytes)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
               (LazyBytes -> Bytes) -> [LazyBytes] -> [Bytes]
forall a b. (a -> b) -> [a] -> [b]
map LazyBytes -> Bytes
L.toStrict ([LazyBytes] -> [Bytes])
-> (LazyBytes -> [LazyBytes]) -> LazyBytes -> [Bytes]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
               LazyBytes -> [LazyBytes]
L.lines
  where
    go :: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript ((Int
ln, Bytes
ch:Bytes
_:Bytes
tp:Bytes
fro_:Bytes
tho_:Bytes
_:Bytes
strand:Bytes
_:Bytes
stuff_:[Bytes]
_) : [(Int, [Bytes])]
strm)
        | Just (Int
fro,Bytes
"") <- Bytes -> Maybe (Int, Bytes)
C.readInt Bytes
fro_
        , Just (Int
tho,Bytes
"") <- Bytes -> Maybe (Int, Bytes)
C.readInt Bytes
tho_
        , Just Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
stuff <- Bytes -> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
parseStuff Bytes
stuff_  =
                    let rng :: Range
rng = (Range -> Range) -> (Range -> Range) -> Bool -> Range -> Range
forall a. a -> a -> Bool -> a
bool Range -> Range
forall a. a -> a
id Range -> Range
reverseRange (Bytes
strand Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"-") (Range -> Range) -> Range -> Range
forall a b. (a -> b) -> a -> b
$ Bytes -> Int -> Int -> Range
Range Bytes
ch (Int
froInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1) (Int
thoInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
froInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
                    in Int
-> Gene
-> Cdna
-> [(Int, [Bytes])]
-> Range
-> Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
-> [Either GffError Cdna]
go2 Int
ln Gene
gene Cdna
xscript [(Int, [Bytes])]
strm Range
rng ((Word8 -> Word8) -> Bytes -> Bytes
B.map (Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. Word8
32) Bytes
tp) Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
stuff

    go  Gene
gene Cdna
xscript ((Int
ln, [Bytes]
_) : [(Int, [Bytes])]
strm)  =  GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
    go Gene
_gene Cdna
xscript [              ]  =  Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: []

    go2 :: Int
-> Gene
-> Cdna
-> [(Int, [Bytes])]
-> Range
-> Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
-> [Either GffError Cdna]
go2 Int
ln Gene
gene Cdna
xscript [(Int, [Bytes])]
strm Range
rng Bytes
tp (Left HashMap Bytes Bytes
stuff)
        | Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"exon" = if Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"Parent" HashMap Bytes Bytes
stuff Maybe Bytes -> Maybe Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes -> Maybe Bytes
forall a. a -> Maybe a
Just (Cdna -> Bytes
c_id Cdna
xscript)
                         then Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript { c_exons = rng : c_exons xscript } [(Int, [Bytes])]
strm
                         else GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffIdMismatch) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm

        | Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"transcript" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"cdna" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"mrna" =
                Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
:
                case (Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"ID" HashMap Bytes Bytes
stuff, Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"Parent" HashMap Bytes Bytes
stuff) of
                    (Just Bytes
tid, Just Bytes
gid)
                        | Bytes
gid Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Gene -> Bytes
g_id Gene
gene -> let xscript' :: Cdna
xscript' = Cdna { c_id :: Bytes
c_id = Bytes
tid
                                                                  , c_pos :: Range
c_pos = Range
rng
                                                                  , c_gene_id :: Bytes
c_gene_id = Bytes
gid
                                                                  , c_gene_symbol :: Bytes
c_gene_symbol = Gene -> Bytes
g_symbol Gene
gene
                                                                  , c_gene_biotype :: Bytes
c_gene_biotype = Gene -> Bytes
g_biotype Gene
gene
                                                                  , c_biotype :: Bytes
c_biotype = Bytes -> Bytes -> HashMap Bytes Bytes -> Bytes
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault Bytes
"" Bytes
"biotype" HashMap Bytes Bytes
stuff
                                                                  , c_description :: Bytes
c_description = Bytes
"" -- XXX
                                                                  , c_exons :: [Range]
c_exons = [] }
                                              in Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript' [(Int, [Bytes])]
strm

                        | Bool
otherwise -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffIdMismatch) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
null_cdna [(Int, [Bytes])]
strm

                    (Maybe Bytes, Maybe Bytes)
_ -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
null_cdna [(Int, [Bytes])]
strm

        | Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"gene" =
                Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
:
                case Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"ID" HashMap Bytes Bytes
stuff of
                    Just Bytes
gid -> let gene' :: Gene
gene' = Gene { g_id :: Bytes
g_id = Bytes
gid
                                                 , g_symbol :: Bytes
g_symbol = Bytes
""    -- XXX
                                                 , g_biotype :: Bytes
g_biotype = Bytes -> Bytes -> HashMap Bytes Bytes -> Bytes
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault Bytes
"" Bytes
"biotype" HashMap Bytes Bytes
stuff }
                                in Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene' Cdna
null_cdna [(Int, [Bytes])]
strm

                    Maybe Bytes
Nothing -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
null_gene Cdna
null_cdna [(Int, [Bytes])]
strm

        | Bool
otherwise = Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm

    go2 Int
ln Gene
gene Cdna
xscript [(Int, [Bytes])]
strm Range
rng Bytes
tp (Right HashMap Bytes Bytes
stuff)
        | Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"exon" =
                case Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"transcript_id" HashMap Bytes Bytes
stuff of
                    Just Bytes
tid
                        | Bytes
tid Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Cdna -> Bytes
c_id Cdna
xscript -> Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript { c_exons = rng : c_exons xscript } [(Int, [Bytes])]
strm

                        | Bool
otherwise -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffIdMismatch) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm

                    Maybe Bytes
Nothing -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm


        | Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"transcript" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"cdna" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"mrna" =
                Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
:
                case (Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"transcript_id" HashMap Bytes Bytes
stuff, Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"gene_id" HashMap Bytes Bytes
stuff) of
                    (Just Bytes
tid, Just Bytes
gid) -> let xscript' :: Cdna
xscript' = Cdna { c_id :: Bytes
c_id = Bytes
tid
                                                                , c_pos :: Range
c_pos = Range
rng
                                                                , c_gene_id :: Bytes
c_gene_id = Bytes
gid
                                                                , c_gene_symbol :: Bytes
c_gene_symbol = Bytes -> Bytes -> HashMap Bytes Bytes -> Bytes
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault Bytes
"" Bytes
"gene_name" HashMap Bytes Bytes
stuff
                                                                , c_gene_biotype :: Bytes
c_gene_biotype = Bytes
""   -- XXX
                                                                , c_biotype :: Bytes
c_biotype = Bytes
"" -- XXX
                                                                , c_description :: Bytes
c_description = Bytes
"" -- XXX
                                                                , c_exons :: [Range]
c_exons = [] }
                                            in Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript' [(Int, [Bytes])]
strm

                    (Maybe Bytes, Maybe Bytes)
_ -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
null_cdna [(Int, [Bytes])]
strm

        | Bool
otherwise = Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm


-- | Parses the random stuff in GFF into a hash table.  Returns 'Just
-- (Left _)' if the file uses assignment style ("foo=bar;"), returns
-- 'Just (Right _)' if the file uses statement style ("foo \"bar\";"),
-- otherwise returns Nothing.
parseStuff :: Bytes -> Maybe (Either (M.HashMap Bytes Bytes) (M.HashMap Bytes Bytes))
parseStuff :: Bytes -> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
parseStuff Bytes
s = HashMap Bytes Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
forall a b. a -> Either a b
Left  (HashMap Bytes Bytes
 -> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (HashMap Bytes Bytes)
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_assignments HashMap Bytes Bytes
forall k v. HashMap k v
M.empty Bytes
s Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
forall a. Maybe a -> Maybe a -> Maybe a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
               HashMap Bytes Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
forall a b. b -> Either a b
Right (HashMap Bytes Bytes
 -> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (HashMap Bytes Bytes)
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_quoted HashMap Bytes Bytes
forall k v. HashMap k v
M.empty Bytes
s
  where
    parse_assignments :: HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_assignments !HashMap Bytes Bytes
h Bytes
s0
        | Bytes -> Bool
C.null Bytes
s0 = HashMap Bytes Bytes -> Maybe (HashMap Bytes Bytes)
forall a. a -> Maybe a
Just HashMap Bytes Bytes
h
        | Bool
otherwise = do let (Bytes
k,Bytes
s1) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'=') Bytes
s0
                         Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
k
                         Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
s1
                         let (Bytes
v,Bytes
s2) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
';') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ HasCallStack => Bytes -> Bytes
Bytes -> Bytes
C.tail Bytes
s1
                         HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_assignments (Bytes -> Bytes -> HashMap Bytes Bytes -> HashMap Bytes Bytes
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert Bytes
k Bytes
v HashMap Bytes Bytes
h) (Int -> Bytes -> Bytes
C.drop Int
1 Bytes
s2)

    parse_quoted :: HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_quoted !HashMap Bytes Bytes
h Bytes
s0
        | Bytes -> Bool
C.null Bytes
s0 Bool -> Bool -> Bool
|| Bytes -> Char
C.head Bytes
s0 Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' = HashMap Bytes Bytes -> Maybe (HashMap Bytes Bytes)
forall a. a -> Maybe a
Just HashMap Bytes Bytes
h
        | Bool
otherwise = do let (Bytes
k,Bytes
s1) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ') Bytes
s0
                         Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
k
                         Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bytes -> Bool
C.isPrefixOf Bytes
" \"" Bytes
s1
                         let (Bytes
v,Bytes
s2) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'"') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
2 Bytes
s1
                         Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
s2
                         let s3 :: Bytes
s3 = (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
';') Bytes
s2
                         HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_quoted (Bytes -> Bytes -> HashMap Bytes Bytes -> HashMap Bytes Bytes
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert Bytes
k Bytes
v HashMap Bytes Bytes
h) (Bytes -> Maybe (HashMap Bytes Bytes))
-> (Bytes -> Bytes) -> Bytes -> Maybe (HashMap Bytes Bytes)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.dropWhile Char -> Bool
isSpace (Bytes -> Maybe (HashMap Bytes Bytes))
-> Bytes -> Maybe (HashMap Bytes Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 Bytes
s3