{-# OPTIONS_GHC -Wno-partial-fields #-}
module Bio.TwoBit.Tool
( EncodeProgress(..)
, buildFasta
, faToTwoBit
, formatCdna
, parseAnno
, twoBitToFa
, vcfToTwoBit
)
where
import Bio.TwoBit
import Control.Applicative
import Control.Exception
import Control.Monad
import Data.Bits
import Data.Bool
import qualified Data.ByteString as B
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Lazy.Char8 as L
import Data.ByteString.Short ( ShortByteString, toShort )
import qualified Data.ByteString.Short as H
import Data.Char ( isSpace, isUpper )
import Data.Foldable
import qualified Data.HashMap.Strict as M
import Data.Int ( Int64 )
import Data.Word ( Word8, Word32 )
import System.IO ( stdout )
type Bytes = B.ByteString
type LazyBytes = L.ByteString
data Cdna = Cdna
{ Cdna -> Bytes
c_id :: !Bytes
, Cdna -> Range
c_pos :: !Range
, Cdna -> Bytes
c_gene_id :: !Bytes
, Cdna -> Bytes
c_gene_symbol :: !Bytes
, Cdna -> Bytes
c_gene_biotype :: !Bytes
, Cdna -> Bytes
c_biotype :: !Bytes
, Cdna -> Bytes
c_description :: !Bytes
, Cdna -> [Range]
c_exons :: [Range]
}
deriving Int -> Cdna -> ShowS
[Cdna] -> ShowS
Cdna -> [Char]
(Int -> Cdna -> ShowS)
-> (Cdna -> [Char]) -> ([Cdna] -> ShowS) -> Show Cdna
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Cdna -> ShowS
showsPrec :: Int -> Cdna -> ShowS
$cshow :: Cdna -> [Char]
show :: Cdna -> [Char]
$cshowList :: [Cdna] -> ShowS
showList :: [Cdna] -> ShowS
Show
data Range = Range
{ Range -> Bytes
r_chrom :: !C.ByteString
, Range -> Int
r_start :: !Int
, Range -> Int
r_len :: !Int }
deriving Int -> Range -> ShowS
[Range] -> ShowS
Range -> [Char]
(Int -> Range -> ShowS)
-> (Range -> [Char]) -> ([Range] -> ShowS) -> Show Range
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Range -> ShowS
showsPrec :: Int -> Range -> ShowS
$cshow :: Range -> [Char]
show :: Range -> [Char]
$cshowList :: [Range] -> ShowS
showList :: [Range] -> ShowS
Show
reverseRange :: Range -> Range
reverseRange :: Range -> Range
reverseRange (Range Bytes
sq Int
pos Int
len) = Bytes -> Int -> Int -> Range
Range Bytes
sq (-Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
len) Int
len
null_cdna :: Cdna
null_cdna :: Cdna
null_cdna = Bytes
-> Range
-> Bytes
-> Bytes
-> Bytes
-> Bytes
-> Bytes
-> [Range]
-> Cdna
Cdna Bytes
"" (Bytes -> Int -> Int -> Range
Range Bytes
"" Int
0 Int
0) Bytes
"" Bytes
"" Bytes
"" Bytes
"" Bytes
"" []
formatCdna :: TwoBitFile -> Cdna -> B.Builder
formatCdna :: TwoBitFile -> Cdna -> Builder
formatCdna TwoBitFile
tbf Cdna{[Range]
Bytes
Range
c_id :: Cdna -> Bytes
c_pos :: Cdna -> Range
c_gene_id :: Cdna -> Bytes
c_gene_symbol :: Cdna -> Bytes
c_gene_biotype :: Cdna -> Bytes
c_biotype :: Cdna -> Bytes
c_description :: Cdna -> Bytes
c_exons :: Cdna -> [Range]
c_id :: Bytes
c_pos :: Range
c_gene_id :: Bytes
c_gene_symbol :: Bytes
c_gene_biotype :: Bytes
c_biotype :: Bytes
c_description :: Bytes
c_exons :: [Range]
..} = Builder
descr Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Int -> [Word8] -> Builder
buildFasta Int
60 [Word8]
getExons
where
(Bytes
_,Bytes
tbf_fn) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'/') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ TwoBitFile -> Bytes
tbf_path TwoBitFile
tbf
(Bytes
tbf_base,Bytes
_) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.breakEnd (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'.') Bytes
tbf_fn
genome_id :: Bytes
genome_id = if Bytes -> Bool
C.null Bytes
tbf_base then Bytes
tbf_fn else HasCallStack => Bytes -> Bytes
Bytes -> Bytes
C.init Bytes
tbf_base
descr :: Builder
descr = Char -> Builder
B.char7 Char
'>' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Bytes -> Builder
B.byteString Bytes
c_id Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
" cdna chromosome:" Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Bytes -> Builder
B.byteString Bytes
genome_id Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
':' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Range -> Builder
formatRange Range
c_pos Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Builder
" gene:" Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Bytes -> Builder
B.byteString Bytes
c_gene_id Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Builder -> Bytes -> Builder
maybeBS Builder
" gene_biotype:" Bytes
c_gene_biotype Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Builder -> Bytes -> Builder
maybeBS Builder
" transcript_biotype:" Bytes
c_biotype Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Builder -> Bytes -> Builder
maybeBS Builder
" gene_symbol:" Bytes
c_gene_symbol Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Builder -> Bytes -> Builder
maybeBS Builder
" description:" Bytes
c_description Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
'\n'
formatRange :: Range -> Builder
formatRange Range
r | Range -> Int
r_start Range
r Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0 = Range -> Builder
formatRange1 (Range -> Range
reverseRange Range
r) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
":-1"
| Bool
otherwise = Range -> Builder
formatRange1 Range
r Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
":1"
formatRange1 :: Range -> Builder
formatRange1 Range
r = Bytes -> Builder
B.byteString (Range -> Bytes
r_chrom Range
r) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
':' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Int -> Builder
B.intDec (Range -> Int
r_start Range
r) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
':' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Int -> Builder
B.intDec (Range -> Int
r_start Range
r Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Range -> Int
r_len Range
r Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
maybeBS :: Builder -> Bytes -> Builder
maybeBS Builder
p Bytes
s = if Bytes -> Bool
B.null Bytes
s then Builder
forall a. Monoid a => a
mempty else Builder
p Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Bytes -> Builder
B.byteString Bytes
s
getExons :: [Word8]
getExons | Range -> Int
r_start Range
c_pos Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0 = (Range -> [Word8]) -> [Range] -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Range -> [Word8]
getExon [Range]
c_exons
| Bool
otherwise = (Range -> [Word8]) -> [Range] -> [Word8]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Range -> [Word8]
getExon ([Range] -> [Range]
forall a. [a] -> [a]
reverse [Range]
c_exons)
getExon :: Range -> [Word8]
getExon :: Range -> [Word8]
getExon (Range Bytes
ch Int
start Int
len) =
case Bytes -> TwoBitFile -> Maybe TwoBitChromosome
findChrom Bytes
ch TwoBitFile
tbf of
Just TwoBitChromosome
tbs | Int
start Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 -> Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take Int
len ([Word8] -> [Word8]) -> [Word8] -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitSequence' Unidrectional -> [Word8]
forall dir. TwoBitSequence' dir -> [Word8]
unpackRS (TwoBitSequence' Unidrectional -> [Word8])
-> TwoBitSequence' Unidrectional -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitChromosome -> Int -> TwoBitSequence' Unidrectional
tbc_fwd_seq TwoBitChromosome
tbs Int
start
| Bool
otherwise -> Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take Int
len ([Word8] -> [Word8]) -> [Word8] -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitSequence' Bidirectional -> [Word8]
forall dir. TwoBitSequence' dir -> [Word8]
unpackRS (TwoBitSequence' Bidirectional -> [Word8])
-> TwoBitSequence' Bidirectional -> [Word8]
forall a b. (a -> b) -> a -> b
$ TwoBitChromosome -> Int -> TwoBitSequence' Bidirectional
tbc_rev_seq TwoBitChromosome
tbs (-Int
startInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
len)
Maybe TwoBitChromosome
Nothing -> [Char] -> [Word8]
forall a. HasCallStack => [Char] -> a
error ([Char] -> [Word8]) -> [Char] -> [Word8]
forall a b. (a -> b) -> a -> b
$ [Char]
"unknown reference " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Bytes -> [Char]
forall a. Show a => a -> [Char]
show Bytes
ch
buildFasta :: Int -> [Word8] -> B.Builder
buildFasta :: Int -> [Word8] -> Builder
buildFasta Int
n = [Word8] -> Builder
go
where
go :: [Word8] -> Builder
go [ ] = Builder
forall a. Monoid a => a
mempty
go [Word8]
s = let ([Word8]
u,[Word8]
v) = Int -> [Word8] -> ([Word8], [Word8])
forall a. Int -> [a] -> ([a], [a])
splitAt Int
n [Word8]
s
in (Word8 -> Builder) -> [Word8] -> Builder
forall m a. Monoid m => (a -> m) -> [a] -> m
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap Word8 -> Builder
B.word8 [Word8]
u Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Char -> Builder
B.char7 Char
'\n' Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> [Word8] -> Builder
go [Word8]
v
{-# INLINE buildFasta #-}
twoBitToFa :: Int -> TwoBitSequence' dir -> IO ()
twoBitToFa :: forall dir. Int -> TwoBitSequence' dir -> IO ()
twoBitToFa Int
ln = Handle -> Builder -> IO ()
B.hPutBuilder Handle
stdout (Builder -> IO ())
-> (TwoBitSequence' dir -> Builder) -> TwoBitSequence' dir -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Word8] -> Builder
buildFasta Int
60 ([Word8] -> Builder)
-> (TwoBitSequence' dir -> [Word8])
-> TwoBitSequence' dir
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take Int
ln ([Word8] -> [Word8])
-> (TwoBitSequence' dir -> [Word8])
-> TwoBitSequence' dir
-> [Word8]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TwoBitSequence' dir -> [Word8]
forall dir. TwoBitSequence' dir -> [Word8]
unpackRSMasked
data EncodeProgress
= EncodeProgress
{ EncodeProgress -> ShortByteString
ep_seqname :: !ShortByteString
, EncodeProgress -> Word32
ep_position :: !Word32
, EncodeProgress -> Word32
ep_hardmasked :: !Word32
, EncodeProgress -> Word32
ep_softmasked :: !Word32
, EncodeProgress -> Int64
ep_enclength :: !Int64
, EncodeProgress -> EncodeProgress
ep_tail :: EncodeProgress }
| Encoded B.Builder
faToTwoBit :: L.ByteString -> EncodeProgress
faToTwoBit :: LazyBytes -> EncodeProgress
faToTwoBit = [(ShortByteString, LazyBytes)] -> LazyBytes -> EncodeProgress
get_each []
where
get_each :: [(ShortByteString, LazyBytes)] -> LazyBytes -> EncodeProgress
get_each [(ShortByteString, LazyBytes)]
acc LazyBytes
inp = case LazyBytes -> Maybe (Char, LazyBytes)
L.uncons (LazyBytes -> Maybe (Char, LazyBytes))
-> LazyBytes -> Maybe (Char, LazyBytes)
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> LazyBytes -> LazyBytes
L.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'>') LazyBytes
inp of
Maybe (Char, LazyBytes)
Nothing -> Builder -> EncodeProgress
Encoded (Builder -> EncodeProgress) -> Builder -> EncodeProgress
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> Builder
seqs_to_twobit ([(ShortByteString, LazyBytes)] -> Builder)
-> [(ShortByteString, LazyBytes)] -> Builder
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. [a] -> [a]
reverse [(ShortByteString, LazyBytes)]
acc
Just (Char
_,LazyBytes
s2) ->
let (LazyBytes
nm, LazyBytes
s') = (Char -> Bool) -> LazyBytes -> (LazyBytes, LazyBytes)
L.break (Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
' ') LazyBytes
s2
in [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc (Bytes -> ShortByteString
toShort (LazyBytes -> Bytes
L.toStrict LazyBytes
nm)) Word32
0 (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil)
(Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil) (Int -> Word8 -> Accu -> BaseAccu
BaseAccu Int
0 Word8
0 Accu
emptyAccu)
((Char -> Bool) -> LazyBytes -> LazyBytes
L.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'\n') LazyBytes
s')
get_one :: [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc !ShortByteString
nm !Word32
pos !GapList
ns !GapList
ms !BaseAccu
bs LazyBytes
inp = case LazyBytes -> Maybe (Char, LazyBytes)
L.uncons LazyBytes
inp of
Maybe (Char, LazyBytes)
Nothing -> LazyBytes -> EncodeProgress
fin LazyBytes
L.empty
Just (Char
c,LazyBytes
s')
| Char
c Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
<= Char
' ' -> [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc ShortByteString
nm Word32
pos GapList
ns GapList
ms BaseAccu
bs LazyBytes
s'
| Char
c Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'>' -> LazyBytes -> EncodeProgress
fin (Char -> LazyBytes -> LazyBytes
L.cons Char
c LazyBytes
s')
| Bool
otherwise -> [(ShortByteString, LazyBytes)]
-> ShortByteString
-> Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
-> EncodeProgress
get_one [(ShortByteString, LazyBytes)]
acc ShortByteString
nm (Word32 -> Word32
forall a. Enum a => a -> a
succ Word32
pos)
(GapList -> Word32 -> Char -> GapList
collect_Ns GapList
ns Word32
pos Char
c)
(GapList -> Word32 -> Char -> GapList
collect_ms GapList
ms Word32
pos Char
c)
(BaseAccu -> Char -> BaseAccu
collect_bases BaseAccu
bs Char
c) LazyBytes
s'
where
fin :: LazyBytes -> EncodeProgress
fin LazyBytes
k = let !r :: LazyBytes
r = Word32 -> GapList -> GapList -> BaseAccu -> LazyBytes
encode_seq Word32
pos GapList
ns GapList
ms BaseAccu
bs
in ShortByteString
-> Word32
-> Word32
-> Word32
-> Int64
-> EncodeProgress
-> EncodeProgress
EncodeProgress ShortByteString
nm Word32
pos (Word32 -> GapList -> Word32
sum_L2i Word32
pos GapList
ns) (Word32 -> GapList -> Word32
sum_L2i Word32
pos GapList
ms) (LazyBytes -> Int64
L.length LazyBytes
r) (EncodeProgress -> EncodeProgress)
-> EncodeProgress -> EncodeProgress
forall a b. (a -> b) -> a -> b
$
[(ShortByteString, LazyBytes)] -> LazyBytes -> EncodeProgress
get_each ((ShortByteString
nm,LazyBytes
r)(ShortByteString, LazyBytes)
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. a -> [a] -> [a]
:[(ShortByteString, LazyBytes)]
acc) LazyBytes
k
vcfToTwoBit :: [B.ByteString] -> EncodeProgress
vcfToTwoBit :: [Bytes] -> EncodeProgress
vcfToTwoBit [Bytes]
s0 = let ([(ShortByteString, Word32)]
lns, [Bytes]
s1) = [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header [] [Bytes]
s0
in [(ShortByteString, Word32)]
-> [(ShortByteString, LazyBytes)] -> [Bytes] -> EncodeProgress
get_each [(ShortByteString, Word32)]
lns [] ([Bytes] -> EncodeProgress) -> [Bytes] -> EncodeProgress
forall a b. (a -> b) -> a -> b
$ (Bytes -> Bool) -> [Bytes] -> [Bytes]
forall a. (a -> Bool) -> [a] -> [a]
filter (\Bytes
s -> Bool -> Bool
not (Bytes -> Bool
B.null Bytes
s) Bool -> Bool -> Bool
&& Bytes -> Char
C.head Bytes
s Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#') [Bytes]
s1
where
read_header :: [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header [(ShortByteString, Word32)]
acc [ ] = ([(ShortByteString, Word32)] -> [(ShortByteString, Word32)]
forall a. [a] -> [a]
reverse [(ShortByteString, Word32)]
acc, [])
read_header [(ShortByteString, Word32)]
acc (Bytes
l:[Bytes]
ls) | Bytes
"##contig=" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
l
, (Just !ShortByteString
nm, Just !Word32
ln) <- Bytes -> (Maybe ShortByteString, Maybe Word32)
parse_cline Bytes
l = [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header ((ShortByteString
nm,Word32
ln)(ShortByteString, Word32)
-> [(ShortByteString, Word32)] -> [(ShortByteString, Word32)]
forall a. a -> [a] -> [a]
:[(ShortByteString, Word32)]
acc) [Bytes]
ls
| Bytes
"#" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
l = [(ShortByteString, Word32)]
-> [Bytes] -> ([(ShortByteString, Word32)], [Bytes])
read_header [(ShortByteString, Word32)]
acc [Bytes]
ls
| Bool
otherwise = ([(ShortByteString, Word32)] -> [(ShortByteString, Word32)]
forall a. [a] -> [a]
reverse [(ShortByteString, Word32)]
acc, Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
ls)
parse_cline :: Bytes -> (Maybe ShortByteString, Maybe Word32)
parse_cline = Bytes -> (Maybe ShortByteString, Maybe Word32)
forall {a}. Num a => Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe Word32))
-> (Bytes -> Bytes)
-> Bytes
-> (Maybe ShortByteString, Maybe Word32)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.filter (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace) (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'>') (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'<')
where
p1 :: Bytes -> (Maybe ShortByteString, Maybe a)
p1 Bytes
s | Bytes
"ID=" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
s = let (Bytes
nm,Bytes
t) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
',') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
3 Bytes
s
(Maybe ShortByteString
_,Maybe a
ln) = Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 Bytes
t
in (ShortByteString -> Maybe ShortByteString
forall a. a -> Maybe a
Just (Bytes -> ShortByteString
toShort Bytes
nm),Maybe a
ln)
| Bytes
"length=" Bytes -> Bytes -> Bool
`C.isPrefixOf` Bytes
s = case Bytes -> Maybe (Int, Bytes)
C.readInt (Bytes -> Maybe (Int, Bytes)) -> Bytes -> Maybe (Int, Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
7 Bytes
s of
Just (Int
ln,Bytes
u) -> let (Maybe ShortByteString
nm,Maybe a
_) = Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
',') Bytes
u in (Maybe ShortByteString
nm,a -> Maybe a
forall a. a -> Maybe a
Just (Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
ln))
Maybe (Int, Bytes)
Nothing -> Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
',') Bytes
s
| Bytes -> Bool
C.null Bytes
s = (Maybe ShortByteString
forall a. Maybe a
Nothing,Maybe a
forall a. Maybe a
Nothing)
| Bool
otherwise = Bytes -> (Maybe ShortByteString, Maybe a)
p1 (Bytes -> (Maybe ShortByteString, Maybe a))
-> Bytes -> (Maybe ShortByteString, Maybe a)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
',') Bytes
s
get_each :: [(ShortByteString,Word32)]
-> [(ShortByteString, LazyBytes)]
-> [B.ByteString]
-> EncodeProgress
get_each :: [(ShortByteString, Word32)]
-> [(ShortByteString, LazyBytes)] -> [Bytes] -> EncodeProgress
get_each [(ShortByteString, Word32)]
lns [(ShortByteString, LazyBytes)]
acc [ ] = Builder -> EncodeProgress
Encoded (Builder -> EncodeProgress) -> Builder -> EncodeProgress
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> Builder
seqs_to_twobit ([(ShortByteString, LazyBytes)] -> Builder)
-> [(ShortByteString, LazyBytes)] -> Builder
forall a b. (a -> b) -> a -> b
$ [ShortByteString]
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder (((ShortByteString, Word32) -> ShortByteString)
-> [(ShortByteString, Word32)] -> [ShortByteString]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString, Word32) -> ShortByteString
forall a b. (a, b) -> a
fst [(ShortByteString, Word32)]
lns) ([(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)])
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. [a] -> [a]
reverse [(ShortByteString, LazyBytes)]
acc
get_each [(ShortByteString, Word32)]
lns [(ShortByteString, LazyBytes)]
acc (Bytes
l:[Bytes]
s2) = ShortByteString
-> Word32
-> Word32
-> Word32
-> Int64
-> EncodeProgress
-> EncodeProgress
EncodeProgress ShortByteString
nm' Word32
ln' (Word32 -> GapList -> Word32
sum_L2i Word32
ln' GapList
ns') Word32
0 (LazyBytes -> Int64
L.length LazyBytes
r) (EncodeProgress -> EncodeProgress)
-> EncodeProgress -> EncodeProgress
forall a b. (a -> b) -> a -> b
$
[(ShortByteString, Word32)]
-> [(ShortByteString, LazyBytes)] -> [Bytes] -> EncodeProgress
get_each [(ShortByteString, Word32)]
lns ((ShortByteString
nm',LazyBytes
r)(ShortByteString, LazyBytes)
-> [(ShortByteString, LazyBytes)] -> [(ShortByteString, LazyBytes)]
forall a. a -> [a] -> [a]
:[(ShortByteString, LazyBytes)]
acc) [Bytes]
s3
where
nm :: Bytes
nm = (Word8 -> Bool) -> Bytes -> Bytes
B.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
9) Bytes
l
(Word32
pos,GapList
ns,BaseAccu
bs,[Bytes]
s3) = Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm Word32
0 (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil) (Int -> Word8 -> Accu -> BaseAccu
BaseAccu Int
0 Word8
0 Accu
emptyAccu) (Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
s2)
!nm' :: ShortByteString
nm' = Bytes -> ShortByteString
toShort Bytes
nm
(GapList
ns',BaseAccu
bs',Word32
ln') = case ((ShortByteString, Word32) -> Bool)
-> [(ShortByteString, Word32)] -> Maybe (ShortByteString, Word32)
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find (ShortByteString -> ShortByteString -> Bool
forall a. Eq a => a -> a -> Bool
(==) ShortByteString
nm' (ShortByteString -> Bool)
-> ((ShortByteString, Word32) -> ShortByteString)
-> (ShortByteString, Word32)
-> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShortByteString, Word32) -> ShortByteString
forall a b. (a, b) -> a
fst) [(ShortByteString, Word32)]
lns of
Just (ShortByteString
_,Word32
ln) | Word32
ln Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
> Word32
pos -> (GapList -> Word32 -> GapList
extend_gap GapList
ns Word32
ln, BaseAccu -> Int -> BaseAccu
pad_bases BaseAccu
bs (Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Word32 -> Int) -> Word32 -> Int
forall a b. (a -> b) -> a -> b
$ Word32
lnWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
pos), Word32
ln)
Maybe (ShortByteString, Word32)
_ -> (GapList
ns,BaseAccu
bs,Word32
pos)
!r :: LazyBytes
r = Word32 -> GapList -> GapList -> BaseAccu -> LazyBytes
encode_seq Word32
ln' GapList
ns' (Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
L2i_Nil) BaseAccu
bs'
get_one :: Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one !Bytes
_nm !Word32
pos !GapList
ns !BaseAccu
bs [ ] = (Word32
pos,GapList
ns,BaseAccu
bs,[])
get_one !Bytes
nm !Word32
pos !GapList
ns !BaseAccu
bs (Bytes
l:[Bytes]
s')
| (Word8 -> Bool) -> Bytes -> Bytes
B.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) Bytes
l Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
/= Bytes
nm = (Word32
pos,GapList
ns,BaseAccu
bs,Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
s')
| Just (Int
pos',Bytes
l3) <- Bytes -> Maybe (Int, Bytes)
C.readInt (Bytes -> Maybe (Int, Bytes))
-> (Bytes -> Bytes) -> Bytes -> Maybe (Int, Bytes)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Bytes -> Bytes
C.drop Int
1 (Bytes -> Maybe (Int, Bytes)) -> Bytes -> Maybe (Int, Bytes)
forall a b. (a -> b) -> a -> b
$ (Word8 -> Bool) -> Bytes -> Bytes
B.dropWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) Bytes
l
, Bytes
ref <- (Word8 -> Bool) -> Bytes -> Bytes
B.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Bytes -> Bytes
B.drop Int
1 (Bytes -> Bytes) -> (Bytes -> Bytes) -> Bytes -> Bytes
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Word8 -> Bool) -> Bytes -> Bytes
B.dropWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/=Word8
9) (Bytes -> Bytes) -> Bytes -> Bytes
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
B.drop Int
1 Bytes
l3
, Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pos' Word32 -> Word32 -> Bool
forall a. Ord a => a -> a -> Bool
>= Word32
pos Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ Word32
1
, Bool -> Bool
not (Bytes -> Bool
C.null Bytes
ref) =
if Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pos' Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
pos Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+ Word32
1
then Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm (Word32 -> Word32
forall a. Enum a => a -> a
succ Word32
pos) (GapList -> Word32 -> Char -> GapList
collect_Ns GapList
ns Word32
pos (Char -> GapList) -> Char -> GapList
forall a b. (a -> b) -> a -> b
$ Bytes -> Char
C.head Bytes
ref)
(BaseAccu -> Char -> BaseAccu
collect_bases BaseAccu
bs (Char -> BaseAccu) -> Char -> BaseAccu
forall a b. (a -> b) -> a -> b
$ Bytes -> Char
C.head Bytes
ref) [Bytes]
s'
else let gap_len :: Int
gap_len = Int
pos' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Word32 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
pos Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
in Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
pos' Word32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
- Word32
1) (GapList -> Word32 -> GapList
extend_gap GapList
ns Word32
pos)
(BaseAccu -> Int -> BaseAccu
pad_bases BaseAccu
bs Int
gap_len) (Bytes
lBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
s')
| Bool
otherwise = Bytes
-> Word32
-> GapList
-> BaseAccu
-> [Bytes]
-> (Word32, GapList, BaseAccu, [Bytes])
get_one Bytes
nm Word32
pos GapList
ns BaseAccu
bs [Bytes]
s'
pad_bases :: BaseAccu -> Int -> BaseAccu
pad_bases BaseAccu
bs Int
n = (BaseAccu -> Char -> BaseAccu) -> BaseAccu -> [Char] -> BaseAccu
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' BaseAccu -> Char -> BaseAccu
collect_bases BaseAccu
bs ([Char] -> BaseAccu) -> [Char] -> BaseAccu
forall a b. (a -> b) -> a -> b
$ Int -> Char -> [Char]
forall a. Int -> a -> [a]
replicate Int
n Char
'T'
reorder :: Eq a => [a] -> [(a,b)] -> [(a,b)]
reorder :: forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder [ ] [(a, b)]
vs = [(a, b)]
vs
reorder (a
k:[a]
ks) [(a, b)]
vs = [(a, b)] -> [(a, b)] -> [(a, b)]
forall {b}. [(a, b)] -> [(a, b)] -> [(a, b)]
go [] [(a, b)]
vs
where
go :: [(a, b)] -> [(a, b)] -> [(a, b)]
go [(a, b)]
xs ((a
k1,b
v1):[(a, b)]
ys) | a
k a -> a -> Bool
forall a. Eq a => a -> a -> Bool
== a
k1 = (a
k1,b
v1) (a, b) -> [(a, b)] -> [(a, b)]
forall a. a -> [a] -> [a]
: [a] -> [(a, b)] -> [(a, b)]
forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder [a]
ks ([(a, b)] -> [(a, b)]
forall a. [a] -> [a]
reverse [(a, b)]
xs [(a, b)] -> [(a, b)] -> [(a, b)]
forall a. [a] -> [a] -> [a]
++ [(a, b)]
ys)
| Bool
otherwise = [(a, b)] -> [(a, b)] -> [(a, b)]
go ((a
k1,b
v1)(a, b) -> [(a, b)] -> [(a, b)]
forall a. a -> [a] -> [a]
:[(a, b)]
xs) [(a, b)]
ys
go [(a, b)]
xs [ ] = [a] -> [(a, b)] -> [(a, b)]
forall a b. Eq a => [a] -> [(a, b)] -> [(a, b)]
reorder [a]
ks ([(a, b)] -> [(a, b)]
forall a. [a] -> [a]
reverse [(a, b)]
xs)
data L2i = L2i {-# UNPACK #-} !Word32 {-# UNPACK #-} !Word32 L2i | L2i_Nil
data GapList = GapList !Word32 !L2i
sum_L2i :: Word32 -> GapList -> Word32
sum_L2i :: Word32 -> GapList -> Word32
sum_L2i Word32
p (GapList Word32
q L2i
xs) = Word32 -> L2i -> Word32
go (if Word32
q Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound then Word32
0 else Word32
pWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
q) L2i
xs
where
go :: Word32 -> L2i -> Word32
go !Word32
a (L2i Word32
x Word32
y L2i
z) = Word32 -> L2i -> Word32
go (Word32
aWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
+Word32
yWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
x) L2i
z
go !Word32
a L2i
L2i_Nil = Word32
a
encodeL2i :: L2i -> B.Builder
encodeL2i :: L2i -> Builder
encodeL2i = Word32 -> Builder -> Builder -> L2i -> Builder
go Word32
0 Builder
forall a. Monoid a => a
mempty Builder
forall a. Monoid a => a
mempty
where
go :: Word32 -> Builder -> Builder -> L2i -> Builder
go !Word32
n Builder
ss Builder
ls L2i
L2i_Nil = Word32 -> Builder
B.word32LE Word32
n Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ss Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ls
go !Word32
n Builder
ss Builder
ls (L2i Word32
s Word32
e L2i
rs) = Word32 -> Builder -> Builder -> L2i -> Builder
go (Word32 -> Word32
forall a. Enum a => a -> a
succ Word32
n) (Word32 -> Builder
B.word32LE Word32
s Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ss) (Word32 -> Builder
B.word32LE (Word32
eWord32 -> Word32 -> Word32
forall a. Num a => a -> a -> a
-Word32
s) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Builder
ls) L2i
rs
seqs_to_twobit :: [(ShortByteString, LazyBytes)] -> B.Builder
seqs_to_twobit :: [(ShortByteString, LazyBytes)] -> Builder
seqs_to_twobit [(ShortByteString, LazyBytes)]
seqs = Word32 -> Builder
B.word32LE Word32
0x1A412743 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
B.word32LE Word32
0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Word32 -> Builder
B.word32LE (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word32) -> Int -> Word32
forall a b. (a -> b) -> a -> b
$ [(ShortByteString, LazyBytes)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(ShortByteString, LazyBytes)]
seqs) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<> Word32 -> Builder
B.word32LE Word32
0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
[Builder] -> Builder
forall a. Monoid a => [a] -> a
mconcat ((ShortByteString -> Int -> Builder)
-> [ShortByteString] -> [Int] -> [Builder]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (\ShortByteString
nm Int
off -> Word8 -> Builder
B.word8 (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (ShortByteString -> Int
H.length ShortByteString
nm)) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
ShortByteString -> Builder
B.shortByteString ShortByteString
nm Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Word32 -> Builder
B.word32LE (Int -> Word32
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
off))
(((ShortByteString, LazyBytes) -> ShortByteString)
-> [(ShortByteString, LazyBytes)] -> [ShortByteString]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString, LazyBytes) -> ShortByteString
forall a b. (a, b) -> a
fst [(ShortByteString, LazyBytes)]
seqs) [Int]
offsets) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
((ShortByteString, LazyBytes) -> Builder)
-> [(ShortByteString, LazyBytes)] -> Builder
forall m a. Monoid m => (a -> m) -> [a] -> m
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap (LazyBytes -> Builder
B.lazyByteString (LazyBytes -> Builder)
-> ((ShortByteString, LazyBytes) -> LazyBytes)
-> (ShortByteString, LazyBytes)
-> Builder
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShortByteString, LazyBytes) -> LazyBytes
forall a b. (a, b) -> b
snd) [(ShortByteString, LazyBytes)]
seqs
where
offset0 :: Int
offset0 = Int
16 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
5 Int -> Int -> Int
forall a. Num a => a -> a -> a
* [(ShortByteString, LazyBytes)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(ShortByteString, LazyBytes)]
seqs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ [Int] -> Int
forall a. Num a => [a] -> a
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum (((ShortByteString, LazyBytes) -> Int)
-> [(ShortByteString, LazyBytes)] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString -> Int
H.length (ShortByteString -> Int)
-> ((ShortByteString, LazyBytes) -> ShortByteString)
-> (ShortByteString, LazyBytes)
-> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShortByteString, LazyBytes) -> ShortByteString
forall a b. (a, b) -> a
fst) [(ShortByteString, LazyBytes)]
seqs)
offsets :: [Int]
offsets = (Int -> LazyBytes -> Int) -> Int -> [LazyBytes] -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl (\Int
a LazyBytes
b -> Int
a Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (LazyBytes -> Int64
L.length LazyBytes
b)) Int
offset0 ([LazyBytes] -> [Int]) -> [LazyBytes] -> [Int]
forall a b. (a -> b) -> a -> b
$ ((ShortByteString, LazyBytes) -> LazyBytes)
-> [(ShortByteString, LazyBytes)] -> [LazyBytes]
forall a b. (a -> b) -> [a] -> [b]
map (ShortByteString, LazyBytes) -> LazyBytes
forall a b. (a, b) -> b
snd [(ShortByteString, LazyBytes)]
seqs
newtype Accu = Accu [Bytes]
emptyAccu :: Accu
emptyAccu :: Accu
emptyAccu = [Bytes] -> Accu
Accu []
grow :: Word8 -> Accu -> Accu
grow :: Word8 -> Accu -> Accu
grow Word8
w = Int -> [Bytes] -> Accu -> Accu
go Int
1 [Word8 -> Bytes
B.singleton Word8
w]
where
go :: Int -> [Bytes] -> Accu -> Accu
go Int
l [Bytes]
acc (Accu (Bytes
s:[Bytes]
ss))
| Bytes -> Int
B.length Bytes
s Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
l = Int -> [Bytes] -> Accu -> Accu
go (Int
lInt -> Int -> Int
forall a. Num a => a -> a -> a
+Bytes -> Int
B.length Bytes
s) (Bytes
sBytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
:[Bytes]
acc) ([Bytes] -> Accu
Accu [Bytes]
ss)
| Bool
otherwise = let !s' :: Bytes
s' = [Bytes] -> Bytes
B.concat [Bytes]
acc in [Bytes] -> Accu
Accu (Bytes
s' Bytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
: Bytes
s Bytes -> [Bytes] -> [Bytes]
forall a. a -> [a] -> [a]
: [Bytes]
ss)
go Int
_ [Bytes]
acc (Accu [ ]) = let !s' :: Bytes
s' = [Bytes] -> Bytes
B.concat [Bytes]
acc in [Bytes] -> Accu
Accu [Item [Bytes]
Bytes
s']
buildAccu :: Accu -> B.Builder
buildAccu :: Accu -> Builder
buildAccu (Accu [Bytes]
ss) = (Bytes -> Builder) -> [Bytes] -> Builder
forall m a. Monoid m => (a -> m) -> [a] -> m
forall (t :: * -> *) m a.
(Foldable t, Monoid m) =>
(a -> m) -> t a -> m
foldMap Bytes -> Builder
B.byteString ([Bytes] -> Builder) -> [Bytes] -> Builder
forall a b. (a -> b) -> a -> b
$ [Bytes] -> [Bytes]
forall a. [a] -> [a]
reverse [Bytes]
ss
encode_seq :: Word32
-> GapList
-> GapList
-> BaseAccu
-> LazyBytes
encode_seq :: Word32 -> GapList -> GapList -> BaseAccu -> LazyBytes
encode_seq Word32
pos GapList
ns GapList
ms BaseAccu
bs = LazyBytes -> Int64
L.length LazyBytes
r Int64 -> LazyBytes -> LazyBytes
forall a b. a -> b -> b
`seq` LazyBytes
r
where
ss' :: Accu
ss' = case BaseAccu
bs of (BaseAccu Int
0 Word8
_ Accu
ss) -> Accu
ss
(BaseAccu Int
n Word8
w Accu
ss) -> Word8 -> Accu -> Accu
grow (Word8
w Word8 -> Int -> Word8
forall a. Bits a => a -> Int -> a
`shiftL` (Int
8Int -> Int -> Int
forall a. Num a => a -> a -> a
-Int
2Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
n)) Accu
ss
r :: LazyBytes
r = Builder -> LazyBytes
B.toLazyByteString (Builder -> LazyBytes) -> Builder -> LazyBytes
forall a b. (a -> b) -> a -> b
$
Word32 -> Builder
B.word32LE Word32
pos Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
L2i -> Builder
encodeL2i (case GapList
ns of GapList Word32
p L2i
rs | Word32
p Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound -> L2i
rs ; GapList Word32
p L2i
rs -> Word32 -> Word32 -> L2i -> L2i
L2i Word32
p Word32
pos L2i
rs) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
L2i -> Builder
encodeL2i (case GapList
ms of GapList Word32
p L2i
rs | Word32
p Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound -> L2i
rs ; GapList Word32
p L2i
rs -> Word32 -> Word32 -> L2i -> L2i
L2i Word32
p Word32
pos L2i
rs) Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Word32 -> Builder
B.word32LE Word32
0 Builder -> Builder -> Builder
forall a. Semigroup a => a -> a -> a
<>
Accu -> Builder
buildAccu Accu
ss'
collect_Ns :: GapList -> Word32 -> Char -> GapList
collect_Ns :: GapList -> Word32 -> Char -> GapList
collect_Ns (GapList Word32
spos L2i
rs) Word32
pos Char
c
| Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound Bool -> Bool -> Bool
&& Char
c Char -> Bytes -> Bool
`C.elem` Bytes
"ACGTacgt" = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
rs
| Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound = Word32 -> L2i -> GapList
GapList Word32
pos L2i
rs
| Char
c Char -> Bytes -> Bool
`C.elem` Bytes
"ACGTacgt" = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound (Word32 -> Word32 -> L2i -> L2i
L2i Word32
spos Word32
pos L2i
rs)
| Bool
otherwise = Word32 -> L2i -> GapList
GapList Word32
spos L2i
rs
collect_ms :: GapList -> Word32 -> Char -> GapList
collect_ms :: GapList -> Word32 -> Char -> GapList
collect_ms (GapList Word32
spos L2i
rs) Word32
pos Char
c
| Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound Bool -> Bool -> Bool
&& Char -> Bool
isUpper Char
c = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound L2i
rs
| Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound = Word32 -> L2i -> GapList
GapList Word32
pos L2i
rs
| Char -> Bool
isUpper Char
c = Word32 -> L2i -> GapList
GapList Word32
forall a. Bounded a => a
maxBound (Word32 -> Word32 -> L2i -> L2i
L2i Word32
spos Word32
pos L2i
rs)
| Bool
otherwise = Word32 -> L2i -> GapList
GapList Word32
spos L2i
rs
extend_gap :: GapList -> Word32 -> GapList
extend_gap :: GapList -> Word32 -> GapList
extend_gap (GapList Word32
spos L2i
rs) Word32
pos
| Word32
spos Word32 -> Word32 -> Bool
forall a. Eq a => a -> a -> Bool
== Word32
forall a. Bounded a => a
maxBound = Word32 -> L2i -> GapList
GapList Word32
pos L2i
rs
| Bool
otherwise = Word32 -> L2i -> GapList
GapList Word32
spos L2i
rs
data BaseAccu = BaseAccu !Int !Word8 !Accu
collect_bases :: BaseAccu -> Char -> BaseAccu
collect_bases :: BaseAccu -> Char -> BaseAccu
collect_bases (BaseAccu Int
n Word8
w Accu
ss) Char
c
= let code :: Word8
code = case Char
c of Char
'C'->Word8
1;Char
'c'->Word8
1;Char
'A'->Word8
2;Char
'a'->Word8
2;Char
'G'->Word8
3;Char
'g'->Word8
3;Char
_->Word8
0
w' :: Word8
w' = Word8 -> Int -> Word8
forall a. Bits a => a -> Int -> a
shiftL Word8
w Int
2 Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. Word8
code
in if Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
3 then Int -> Word8 -> Accu -> BaseAccu
BaseAccu Int
0 Word8
0 (Word8 -> Accu -> Accu
grow Word8
w' Accu
ss) else Int -> Word8 -> Accu -> BaseAccu
BaseAccu (Int -> Int
forall a. Enum a => a -> a
succ Int
n) Word8
w' Accu
ss
data Gene = Gene { Gene -> Bytes
g_id :: Bytes, Gene -> Bytes
g_symbol :: Bytes, Gene -> Bytes
g_biotype :: Bytes }
null_gene :: Gene
null_gene :: Gene
null_gene = Bytes -> Bytes -> Bytes -> Gene
Gene Bytes
"" Bytes
"" Bytes
""
data GffError = GffError String Int GffErrorDetail deriving Int -> GffError -> ShowS
[GffError] -> ShowS
GffError -> [Char]
(Int -> GffError -> ShowS)
-> (GffError -> [Char]) -> ([GffError] -> ShowS) -> Show GffError
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GffError -> ShowS
showsPrec :: Int -> GffError -> ShowS
$cshow :: GffError -> [Char]
show :: GffError -> [Char]
$cshowList :: [GffError] -> ShowS
showList :: [GffError] -> ShowS
Show
data GffErrorDetail = GffParseError | GffIdMismatch | GffUnknownRef Bytes deriving Int -> GffErrorDetail -> ShowS
[GffErrorDetail] -> ShowS
GffErrorDetail -> [Char]
(Int -> GffErrorDetail -> ShowS)
-> (GffErrorDetail -> [Char])
-> ([GffErrorDetail] -> ShowS)
-> Show GffErrorDetail
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GffErrorDetail -> ShowS
showsPrec :: Int -> GffErrorDetail -> ShowS
$cshow :: GffErrorDetail -> [Char]
show :: GffErrorDetail -> [Char]
$cshowList :: [GffErrorDetail] -> ShowS
showList :: [GffErrorDetail] -> ShowS
Show
instance Exception GffError where
displayException :: GffError -> [Char]
displayException (GffError [Char]
fp Int
ln GffErrorDetail
dt) = GffErrorDetail -> [Char]
displayDetail GffErrorDetail
dt [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" in line " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> [Char]
forall a. Show a => a -> [Char]
show Int
ln [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
" of gff file " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char]
fp
where
displayDetail :: GffErrorDetail -> [Char]
displayDetail GffErrorDetail
GffParseError = [Char]
"parse error"
displayDetail GffErrorDetail
GffIdMismatch = [Char]
"identifier does not match"
displayDetail (GffUnknownRef Bytes
ch) = [Char]
"unknown reference " [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ Bytes -> [Char]
forall a. Show a => a -> [Char]
show Bytes
ch
parseAnno :: String -> L.ByteString -> [Either GffError Cdna]
parseAnno :: [Char] -> LazyBytes -> [Either GffError Cdna]
parseAnno [Char]
fp = (Either GffError Cdna -> Bool)
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. (a -> Bool) -> [a] -> [a]
filter ((GffError -> Bool)
-> (Cdna -> Bool) -> Either GffError Cdna -> Bool
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Bool -> GffError -> Bool
forall a b. a -> b -> a
const Bool
True) (Bool -> Bool
not (Bool -> Bool) -> (Cdna -> Bool) -> Cdna -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Range] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null ([Range] -> Bool) -> (Cdna -> [Range]) -> Cdna -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Cdna -> [Range]
c_exons)) ([Either GffError Cdna] -> [Either GffError Cdna])
-> (LazyBytes -> [Either GffError Cdna])
-> LazyBytes
-> [Either GffError Cdna]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
null_gene Cdna
null_cdna ([(Int, [Bytes])] -> [Either GffError Cdna])
-> (LazyBytes -> [(Int, [Bytes])])
-> LazyBytes
-> [Either GffError Cdna]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
((Int, Bytes) -> (Int, [Bytes]))
-> [(Int, Bytes)] -> [(Int, [Bytes])]
forall a b. (a -> b) -> [a] -> [b]
map ((Bytes -> [Bytes]) -> (Int, Bytes) -> (Int, [Bytes])
forall a b. (a -> b) -> (Int, a) -> (Int, b)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Char -> Bytes -> [Bytes]
C.split Char
'\t')) ([(Int, Bytes)] -> [(Int, [Bytes])])
-> (LazyBytes -> [(Int, Bytes)]) -> LazyBytes -> [(Int, [Bytes])]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
((Int, Bytes) -> Bool) -> [(Int, Bytes)] -> [(Int, Bytes)]
forall a. (a -> Bool) -> [a] -> [a]
filter (\(Int
_,Bytes
s) -> Bool -> Bool
not (Bytes -> Bool
B.null Bytes
s) Bool -> Bool -> Bool
&& Bytes -> Char
C.head Bytes
s Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
'#') ([(Int, Bytes)] -> [(Int, Bytes)])
-> (LazyBytes -> [(Int, Bytes)]) -> LazyBytes -> [(Int, Bytes)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
[Int] -> [Bytes] -> [(Int, Bytes)]
forall a b. [a] -> [b] -> [(a, b)]
zip (Int -> [Int]
forall a. Enum a => a -> [a]
enumFrom Int
1) ([Bytes] -> [(Int, Bytes)])
-> (LazyBytes -> [Bytes]) -> LazyBytes -> [(Int, Bytes)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
(LazyBytes -> Bytes) -> [LazyBytes] -> [Bytes]
forall a b. (a -> b) -> [a] -> [b]
map LazyBytes -> Bytes
L.toStrict ([LazyBytes] -> [Bytes])
-> (LazyBytes -> [LazyBytes]) -> LazyBytes -> [Bytes]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
LazyBytes -> [LazyBytes]
L.lines
where
go :: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript ((Int
ln, Bytes
ch:Bytes
_:Bytes
tp:Bytes
fro_:Bytes
tho_:Bytes
_:Bytes
strand:Bytes
_:Bytes
stuff_:[Bytes]
_) : [(Int, [Bytes])]
strm)
| Just (Int
fro,Bytes
"") <- Bytes -> Maybe (Int, Bytes)
C.readInt Bytes
fro_
, Just (Int
tho,Bytes
"") <- Bytes -> Maybe (Int, Bytes)
C.readInt Bytes
tho_
, Just Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
stuff <- Bytes -> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
parseStuff Bytes
stuff_ =
let rng :: Range
rng = (Range -> Range) -> (Range -> Range) -> Bool -> Range -> Range
forall a. a -> a -> Bool -> a
bool Range -> Range
forall a. a -> a
id Range -> Range
reverseRange (Bytes
strand Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"-") (Range -> Range) -> Range -> Range
forall a b. (a -> b) -> a -> b
$ Bytes -> Int -> Int -> Range
Range Bytes
ch (Int
froInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1) (Int
thoInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
froInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
in Int
-> Gene
-> Cdna
-> [(Int, [Bytes])]
-> Range
-> Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
-> [Either GffError Cdna]
go2 Int
ln Gene
gene Cdna
xscript [(Int, [Bytes])]
strm Range
rng ((Word8 -> Word8) -> Bytes -> Bytes
B.map (Word8 -> Word8 -> Word8
forall a. Bits a => a -> a -> a
.|. Word8
32) Bytes
tp) Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
stuff
go Gene
gene Cdna
xscript ((Int
ln, [Bytes]
_) : [(Int, [Bytes])]
strm) = GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
go Gene
_gene Cdna
xscript [ ] = Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: []
go2 :: Int
-> Gene
-> Cdna
-> [(Int, [Bytes])]
-> Range
-> Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
-> [Either GffError Cdna]
go2 Int
ln Gene
gene Cdna
xscript [(Int, [Bytes])]
strm Range
rng Bytes
tp (Left HashMap Bytes Bytes
stuff)
| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"exon" = if Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"Parent" HashMap Bytes Bytes
stuff Maybe Bytes -> Maybe Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes -> Maybe Bytes
forall a. a -> Maybe a
Just (Cdna -> Bytes
c_id Cdna
xscript)
then Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript { c_exons = rng : c_exons xscript } [(Int, [Bytes])]
strm
else GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffIdMismatch) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"transcript" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"cdna" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"mrna" =
Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
:
case (Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"ID" HashMap Bytes Bytes
stuff, Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"Parent" HashMap Bytes Bytes
stuff) of
(Just Bytes
tid, Just Bytes
gid)
| Bytes
gid Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Gene -> Bytes
g_id Gene
gene -> let xscript' :: Cdna
xscript' = Cdna { c_id :: Bytes
c_id = Bytes
tid
, c_pos :: Range
c_pos = Range
rng
, c_gene_id :: Bytes
c_gene_id = Bytes
gid
, c_gene_symbol :: Bytes
c_gene_symbol = Gene -> Bytes
g_symbol Gene
gene
, c_gene_biotype :: Bytes
c_gene_biotype = Gene -> Bytes
g_biotype Gene
gene
, c_biotype :: Bytes
c_biotype = Bytes -> Bytes -> HashMap Bytes Bytes -> Bytes
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault Bytes
"" Bytes
"biotype" HashMap Bytes Bytes
stuff
, c_description :: Bytes
c_description = Bytes
""
, c_exons :: [Range]
c_exons = [] }
in Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript' [(Int, [Bytes])]
strm
| Bool
otherwise -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffIdMismatch) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
null_cdna [(Int, [Bytes])]
strm
(Maybe Bytes, Maybe Bytes)
_ -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
null_cdna [(Int, [Bytes])]
strm
| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"gene" =
Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
:
case Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"ID" HashMap Bytes Bytes
stuff of
Just Bytes
gid -> let gene' :: Gene
gene' = Gene { g_id :: Bytes
g_id = Bytes
gid
, g_symbol :: Bytes
g_symbol = Bytes
""
, g_biotype :: Bytes
g_biotype = Bytes -> Bytes -> HashMap Bytes Bytes -> Bytes
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault Bytes
"" Bytes
"biotype" HashMap Bytes Bytes
stuff }
in Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene' Cdna
null_cdna [(Int, [Bytes])]
strm
Maybe Bytes
Nothing -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
null_gene Cdna
null_cdna [(Int, [Bytes])]
strm
| Bool
otherwise = Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
go2 Int
ln Gene
gene Cdna
xscript [(Int, [Bytes])]
strm Range
rng Bytes
tp (Right HashMap Bytes Bytes
stuff)
| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"exon" =
case Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"transcript_id" HashMap Bytes Bytes
stuff of
Just Bytes
tid
| Bytes
tid Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Cdna -> Bytes
c_id Cdna
xscript -> Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript { c_exons = rng : c_exons xscript } [(Int, [Bytes])]
strm
| Bool
otherwise -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffIdMismatch) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
Maybe Bytes
Nothing -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"transcript" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"cdna" Bool -> Bool -> Bool
|| Bytes
tp Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
"mrna" =
Cdna -> Either GffError Cdna
forall a b. b -> Either a b
Right Cdna
xscript Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
:
case (Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"transcript_id" HashMap Bytes Bytes
stuff, Bytes -> HashMap Bytes Bytes -> Maybe Bytes
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
"gene_id" HashMap Bytes Bytes
stuff) of
(Just Bytes
tid, Just Bytes
gid) -> let xscript' :: Cdna
xscript' = Cdna { c_id :: Bytes
c_id = Bytes
tid
, c_pos :: Range
c_pos = Range
rng
, c_gene_id :: Bytes
c_gene_id = Bytes
gid
, c_gene_symbol :: Bytes
c_gene_symbol = Bytes -> Bytes -> HashMap Bytes Bytes -> Bytes
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault Bytes
"" Bytes
"gene_name" HashMap Bytes Bytes
stuff
, c_gene_biotype :: Bytes
c_gene_biotype = Bytes
""
, c_biotype :: Bytes
c_biotype = Bytes
""
, c_description :: Bytes
c_description = Bytes
""
, c_exons :: [Range]
c_exons = [] }
in Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript' [(Int, [Bytes])]
strm
(Maybe Bytes, Maybe Bytes)
_ -> GffError -> Either GffError Cdna
forall a b. a -> Either a b
Left ([Char] -> Int -> GffErrorDetail -> GffError
GffError [Char]
fp Int
ln GffErrorDetail
GffParseError) Either GffError Cdna
-> [Either GffError Cdna] -> [Either GffError Cdna]
forall a. a -> [a] -> [a]
: Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
null_cdna [(Int, [Bytes])]
strm
| Bool
otherwise = Gene -> Cdna -> [(Int, [Bytes])] -> [Either GffError Cdna]
go Gene
gene Cdna
xscript [(Int, [Bytes])]
strm
parseStuff :: Bytes -> Maybe (Either (M.HashMap Bytes Bytes) (M.HashMap Bytes Bytes))
parseStuff :: Bytes -> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
parseStuff Bytes
s = HashMap Bytes Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
forall a b. a -> Either a b
Left (HashMap Bytes Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (HashMap Bytes Bytes)
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_assignments HashMap Bytes Bytes
forall k v. HashMap k v
M.empty Bytes
s Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
forall a. Maybe a -> Maybe a -> Maybe a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
HashMap Bytes Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes)
forall a b. b -> Either a b
Right (HashMap Bytes Bytes
-> Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
-> Maybe (HashMap Bytes Bytes)
-> Maybe (Either (HashMap Bytes Bytes) (HashMap Bytes Bytes))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_quoted HashMap Bytes Bytes
forall k v. HashMap k v
M.empty Bytes
s
where
parse_assignments :: HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_assignments !HashMap Bytes Bytes
h Bytes
s0
| Bytes -> Bool
C.null Bytes
s0 = HashMap Bytes Bytes -> Maybe (HashMap Bytes Bytes)
forall a. a -> Maybe a
Just HashMap Bytes Bytes
h
| Bool
otherwise = do let (Bytes
k,Bytes
s1) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'=') Bytes
s0
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
k
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
s1
let (Bytes
v,Bytes
s2) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
';') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ HasCallStack => Bytes -> Bytes
Bytes -> Bytes
C.tail Bytes
s1
HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_assignments (Bytes -> Bytes -> HashMap Bytes Bytes -> HashMap Bytes Bytes
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert Bytes
k Bytes
v HashMap Bytes Bytes
h) (Int -> Bytes -> Bytes
C.drop Int
1 Bytes
s2)
parse_quoted :: HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_quoted !HashMap Bytes Bytes
h Bytes
s0
| Bytes -> Bool
C.null Bytes
s0 Bool -> Bool -> Bool
|| Bytes -> Char
C.head Bytes
s0 Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
'#' = HashMap Bytes Bytes -> Maybe (HashMap Bytes Bytes)
forall a. a -> Maybe a
Just HashMap Bytes Bytes
h
| Bool
otherwise = do let (Bytes
k,Bytes
s1) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
' ') Bytes
s0
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
k
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bytes -> Bool
C.isPrefixOf Bytes
" \"" Bytes
s1
let (Bytes
v,Bytes
s2) = (Char -> Bool) -> Bytes -> (Bytes, Bytes)
C.break (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'"') (Bytes -> (Bytes, Bytes)) -> Bytes -> (Bytes, Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
2 Bytes
s1
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> (Bool -> Bool) -> Bool -> Maybe ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Bool
not (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Bytes -> Bool
C.null Bytes
s2
let s3 :: Bytes
s3 = (Char -> Bool) -> Bytes -> Bytes
C.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
';') Bytes
s2
HashMap Bytes Bytes -> Bytes -> Maybe (HashMap Bytes Bytes)
parse_quoted (Bytes -> Bytes -> HashMap Bytes Bytes -> HashMap Bytes Bytes
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert Bytes
k Bytes
v HashMap Bytes Bytes
h) (Bytes -> Maybe (HashMap Bytes Bytes))
-> (Bytes -> Bytes) -> Bytes -> Maybe (HashMap Bytes Bytes)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> Bytes -> Bytes
C.dropWhile Char -> Bool
isSpace (Bytes -> Maybe (HashMap Bytes Bytes))
-> Bytes -> Maybe (HashMap Bytes Bytes)
forall a b. (a -> b) -> a -> b
$ Int -> Bytes -> Bytes
C.drop Int
1 Bytes
s3