{-# LANGUAGE RecordWildCards #-}
{-# OPTIONS_GHC -Wall #-}
module Anno where

import Data.List

-- What does this header mean?
-- >Feature ref|NC_012920.1|

-- | Render annotations as the lines of a tab-separated feature table.
--
-- The first element of the result is the header @">Feature " ++ nm@.  Each
-- annotation then contributes, in this order:
--
--   1. a location row @start, end, key@, where @key@ is @"STS"@ for an
--      'STS', the annotation's own name for 'Other', and @"gene"@ otherwise;
--   2. a @gene@ qualifier carrying the name (skipped for 'STS' and 'Other');
--   3. one @gene_syn@ qualifier per synonym;
--   4. for 'CDS', @CDS'@, 'TRNA' and 'RRNA', a second location row keyed
--      @"CDS"@, @"CDS"@, @"tRNA"@ and @"rRNA"@ respectively;
--   5. a @product@ qualifier when the product string is non-empty;
--   6. one @note@ qualifier per note;
--   7. a fixed stop-codon note for @CDS'@, or a @standard_name@ qualifier
--      for 'STS'.
--
-- Qualifier rows start with three empty columns; the columns of every row
-- are joined with tab characters.
to_tab :: String -> [Anno] -> [String]
to_tab nm ann =
    (">Feature " ++ nm) : [ intercalate "\t" row | a <- ann, row <- rowsFor a ]
  where
    -- All rows (still split into columns) produced by one annotation.
    rowsFor :: Anno -> [[String]]
    rowsFor Gene{ start = s, end = e, name = gname, syns = aliases
                , what = kind, prod = prd, notes = remarks } =
        concat
            [ [ location (featureKey kind gname) ]
            , [ qualifier "gene" gname | wantsGene kind ]
            , map (qualifier "gene_syn") aliases
            , map location (subFeature kind)
            , [ qualifier "product" prd | not (null prd) ]
            , map (qualifier "note") remarks
            , trailer kind
            ]
      where
        -- A row that carries the interval and a feature key.
        location key = [ show s, show e, key ]

    -- A row that carries only a qualifier key and its value.
    qualifier :: String -> String -> [String]
    qualifier key val = [ "", "", "", key, val ]

    -- Feature key of the leading location row.
    featureKey :: What -> String -> String
    featureKey kind gname = case kind of
        STS _ -> "STS"
        Other -> gname
        _     -> "gene"

    -- Whether the leading row is followed by a @gene@ qualifier.
    wantsGene :: What -> Bool
    wantsGene kind = case kind of
        STS _ -> False
        Other -> False
        _     -> True

    -- Feature keys of the additional location rows, if any.
    subFeature :: What -> [String]
    subFeature kind = case kind of
        CDS  -> ["CDS"]
        CDS' -> ["CDS"]
        TRNA -> ["tRNA"]
        RRNA -> ["rRNA"]
        _    -> []

    -- Kind-specific qualifier rows appended after the notes.
    trailer :: What -> [[String]]
    trailer kind = case kind of
        CDS'   -> [ qualifier "note" "TAA stop codon is completed by the addition of 3' A residues to the mRNA" ]
        STS sn -> [ qualifier "standard_name" sn ]
        _      -> []

-- | One annotated interval, as consumed by 'to_tab'.
--
-- In 'rCRS_anno' several rows have 'start' greater than 'end'; presumably
-- these denote features on the reverse strand -- TODO confirm.
data Anno = Gene
    { start :: Int
      -- ^ First coordinate; 'to_tab' prints it in the first column.
    , end   :: Int
      -- ^ Second coordinate; 'to_tab' prints it in the second column.
    , name  :: String
      -- ^ Value of the @gene@ qualifier; for 'Other' it is used as the
      --   feature key itself.
    , syns  :: [String]
      -- ^ Synonyms, one @gene_syn@ qualifier each.
    , what  :: What
      -- ^ Kind of feature; selects which rows 'to_tab' emits.
    , prod  :: String
      -- ^ Value of the @product@ qualifier; the empty string suppresses it.
    , notes :: [String]
      -- ^ Free-text notes, one @note@ qualifier each.
    }
    deriving Show

-- | Kind of an annotated feature; decides which rows 'to_tab' emits for it.
data What
    = CDS
      -- ^ Gene with an additional @CDS@ location row.
    | CDS'
      -- ^ Like 'CDS', plus a fixed note saying the TAA stop codon is
      --   completed by the addition of 3' A residues to the mRNA.
    | TRNA
      -- ^ Gene with an additional @tRNA@ location row.
    | RRNA
      -- ^ Gene with an additional @rRNA@ location row.
    | Other
      -- ^ Not a gene: the annotation's name is used as the feature key and
      --   no @gene@ qualifier is written.
    | STS String
      -- ^ Feature keyed @STS@; the argument is written as @standard_name@.
    deriving Show

-- | The annotation table, in the order in which 'to_tab' writes the rows.
--
-- The second row has an empty name and kind 'Other', so 'to_tab' renders it
-- as a bare interval with an empty feature key directly after the @D-loop@
-- row (presumably a second interval of that same feature -- TODO confirm).
--
-- Rows without synonyms and without notes are built with the local helper
-- @simple@; all other rows spell out every 'Gene' field.
rCRS_anno :: [Anno]
rCRS_anno =
    [ simple   576     1 "D-loop" Other ""
    , simple 16569 16024 ""       Other ""
    , simple   577   647 "TRNF"   TRNA  "tRNA-Phe"
    , Gene     648  1601 "RNR1"   ["MTRNR1"] RRNA "s-rRNA" ["12S rRNA; 12S ribosomal RNA"]
    , simple  1602  1670 "TRNV"   TRNA  "tRNA-Val"
    , Gene    1671  3229 "RNR2"   [] RRNA "l-rRNA" ["16S ribosomal RNA; 16S rRNA"]
    , Gene    3230  3304 "TRNL1"  ["MTTL1"] TRNA "tRNA-Leu" []
    , simple  3307  4262 "ND1"    CDS'  "NADH dehydrogenase subunit 1"
    , simple  4263  4331 "TRNI"   TRNA  "tRNA-Ile"
    , simple  4400  4329 "TRNQ"   TRNA  "tRNA-Gln"
    , simple  4402  4469 "TRNM"   TRNA  "tRNA-Met"
    , simple  4470  5511 "ND2"    CDS'  "NADH dehydrogenase subunit 2"
    , simple  5512  5579 "TRNW"   TRNA  "tRNA-Trp"
    , simple  5655  5587 "TRNA"   TRNA  "tRNA-Ala"
    , simple  5729  5657 "TRNN"   TRNA  "tRNA-Asn"
    , simple  5826  5761 "TRNC"   TRNA  "tRNA-Cys"
    , simple  5891  5826 "TRNY"   TRNA  "tRNA-Tyr"
    , Gene    5904  7445 "COX1"   ["COI"] CDS "cytochrome c oxidase subunit I" ["cytochrome c oxidase I"]
    , simple  7514  7446 "TRNS1"  TRNA  "tRNA-Ser"
    , simple  7518  7585 "TRND"   TRNA  "tRNA-Asp"
    , Gene    7586  8269 "COX2"   [] CDS "cytochrome c oxidase subunit II" ["cytochrome c oxidase II"]
    , simple  8295  8364 "TRNK"   TRNA  "tRNA-Lys"
    , Gene    8366  8572 "ATP8"   [] CDS "ATP synthase F0 subunit 8" ["ATP synthase 8; ATPase subunit 8"]
    , Gene    8527  9207 "ATP6"   [] CDS "ATP synthase F0 subunit 6" ["ATP synthase 6; ATPase subunit 6"]
    , simple  9207  9990 "COX3"   CDS'  "cytochrome c oxidase subunit III"
    , simple  9342  9416 ""       (STS "PMC55343P8") ""
    , simple  9991 10058 "TRNG"   TRNA  "tRNA-Gly"
    , simple 10059 10404 "ND3"    CDS'  "NADH dehydrogenase subunit 3"
    , simple 10405 10469 "TRNR"   TRNA  "tRNA-Arg"
    , simple 10470 10766 "ND4L"   CDS   "NADH dehydrogenase subunit 4L"
    , simple 10760 12137 "ND4"    CDS'  "NADH dehydrogenase subunit 4"
    , simple 12138 12206 "TRNH"   TRNA  "tRNA-His"
    , simple 12207 12265 "TRNS2"  TRNA  "tRNA-Ser"
    , simple 12266 12336 "TRNL2"  TRNA  "tRNA-Leu"
    , simple 12337 14148 "ND5"    CDS   "NADH dehydrogenase subunit 5"
    , simple 14673 14149 "ND6"    CDS   "NADH dehydrogenase subunit 6"
    , simple 14742 14674 "TRNE"   TRNA  "tRNA-Glu"
    , simple 14747 15887 "CYTB"   CDS'  "cytochrome b"
    , simple 15888 15953 "TRNT"   TRNA  "tRNA-Thr"
    , simple 16023 15956 "TRNP"   TRNA  "tRNA-Pro"
    ]
  where
    -- An annotation that has neither synonyms nor notes.
    simple :: Int -> Int -> String -> What -> String -> Anno
    simple s e n w p = Gene s e n [] w p []

-- | Association list from a name to a string of upper-case letters.
--
-- The keys coincide with the names of the 'CDS' and @CDS'@ rows of
-- 'rCRS_anno'; the values are presumably the one-letter amino-acid
-- translations of those coding sequences -- TODO confirm.
--
-- Each value is written with string gaps, so the line breaks and the
-- indentation inside a literal are not part of the string.
aas :: [(String, String)]
aas =
    [ ( "ND1"
      , "MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYG\
        \LLQPFADAMKLFTKEPLKPATSTITLYITAPTLALTIALLLWTPLPMPNPLVNLNLGL\
        \LFILATSSLAVYSILWSGWASNSNYALIGALRAVAQTISYEVTLAIILLSTLLMSGSF\
        \NLSTLITTQEHLWLLLPSWPLAMMWFISTLAETNRTPFDLAEGESELVSGFNIEYAAG\
        \PFALFFMAEYTNIIMMNTLTTTIFLGTTYDALSPELYTTYFVTKTLLLTSLFLWIRTA\
        \YPRFRYDQLMHLLWKNFLPLTLALLMWYVSMPITISSIPPQT"
      )
    , ( "ND2"
      , "MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVL\
        \TKKMNPRSTEAAIKYFLTQATASMILLMAILFNNMLSGQWTMTNTTNQYSSLMIMMAM\
        \AMKLGMAPFHFWVPEVTQGTPLTSGLLLLTWQKLAPISIMYQISPSLNVSLLLTLSIL\
        \SIMAGSWGGLNQTQLRKILAYSSITHMGWMMAVLPYNPNMTILNLTIYIILTTTAFLL\
        \LNLNSSTTTLLLSRTWNKLTWLTPLIPSTLLSLGGLPPLTGFLPKWAIIEEFTKNNSL\
        \IIPTIMATITLLNLYFYLRLIYSTSITLLPMSNNVKMKWQFEHTKPTPFLPTLIALTT\
        \LLLPISPFMLMIL"
      )
    , ( "COX1"
      , "MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQP\
        \GNLLGNDHIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSF\
        \WLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILG\
        \AINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTT\
        \FFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWA\
        \MMSIGFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKW\
        \SAAVLWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGG\
        \FIHWFPLFSGYTLDQTYAKIHFTIMFIGVNLTFFPQHFLGLSGMPRRYSDYPDAYTTW\
        \NILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSMNLEWLYGCPPPYHTFEEP\
        \VYMKS"
      )
    , ( "COX2"
      , "MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFL\
        \TLTTKLTNTNISDAQEMETVWTILPAIILVLIALPSLRILYMTDEVNDPSLTIKSIGH\
        \QWYWTYEYTDYGGLIFNSYMLPPLFLEPGDLRLLDVDNRVVLPIEAPIRMMITSQDVL\
        \HSWAVPTLGLKTDAIPGRLNQTTFTATRPGVYYGQCSEICGANHSFMPIVLELIPLKI\
        \FEMGPVFTL"
      )
    , ( "ATP8"
      , "MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKM\
        \KNYNKPWEPKWTKICSLHSLPPQS"
      )
    , ( "ATP6"
      , "MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLIT\
        \TQQWLIKLTSKQMMTMHNTKGRTWSLMLVSLIIFIATTNLLGLLPHSFTPTTQLSMNL\
        \AMAIPLWAGTVIMGFRSKIKNALAHFLPQGTPTPLIPMLVIIETISLLIQPMALAVRL\
        \TANITAGHLLMHLIGSATLAMSTINLPSTLIIFTILILLTILEIAVALIQAYVFTLLV\
        \SLYLHDNT"
      )
    , ( "COX3"
      , "MTHQSHAYHMVKPSPWPLTGALSALLMTSGLAMWFHFHSMTLLM\
        \LGLLTNTLTMYQWWRDVTRESTYQGHHTPPVQKGLRYGMILFITSEVFFFAGFFWAFY\
        \HSSLAPTPQLGGHWPPTGITPLNPLEVPLLNTSVLLASGVSITWAHHSLMENNRNQMI\
        \QALLITILLGLYFTLLQASEYFESPFTISDGIYGSTFFVATGFHGLHVIIGSTFLTIC\
        \FIRQLMFHFTSKHHFGFEAAAWYWHFVDVVWLFLYVSIYWWGS"
      )
    , ( "ND3"
      , "MNFALILMINTLLALLLMIITFWLPQLNGYMEKSTPYECGFDPM\
        \SPARVPFSMKFFLVAITFLLFDLEIALLLPLPWALQTTNLPLMVMSSLLLIIILALSL\
        \AYEWLQKGLDWTE"
      )
    , ( "ND4L"
      , "MPLIYMNIMLAFTISLLGMLVYRSHLMSSLLCLEGMMLSLFIMA\
        \TLMTLNTHSLLANIVPIAMLVFAACEAAVGLALLVSISNTYGLDYVHNLNLLQC"
      )
    , ( "ND4"
      , "MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQ\
        \INNNLFSCSPTFSSDPLTTPLLMLTTWLLPLTIMASQRHLSSEPLSRKKLYLSMLISL\
        \QISLIMTFTATELIMFYIFFETTLIPTLAIITRWGNQPERLNAGTYFLFYTLVGSLPL\
        \LIALIYTHNTLGSLNILLLTLTAQELSNSWANNLMWLAYTMAFMVKMPLYGLHLWLPK\
        \AHVEAPIAGSMVLAAVLLKLGGYGMMRLTLILNPLTKHMAYPFLVLSLWGMIMTSSIC\
        \LRQTDLKSLIAYSSISHMALVVTAILIQTPWSFTGAVILMIAHGLTSSLLFCLANSNY\
        \ERTHSRIMILSQGLQTLLPLMAFWWLLASLANLALPPTINLLGELSVLVTTFSWSNIT\
        \LLLTGLNMLVTALYSLYMFTTTQWGSLTHHINNMKPSFTRENTLMFMHLSPILLLSLN\
        \PDIITGFSS"
      )
    , ( "ND5"
      , "MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTF\
        \IISLFPTTMFMCLDQEVIISNWHWATTQTTQLSLSFKLDYFSMMFIPVALFVTWSIME\
        \FSLWYMNSDPNINQFFKYLLIFLITMLILVTANNLFQLFIGWEGVGIMSFLLISWWYA\
        \RADANTAAIQAILYNRIGDIGFILALAWFILHSNSWDPQQMALLNANPSLTPLLGLLL\
        \AAAGKSAQLGLHPWLPSAMEGPTPVSALLHSSTMVVAGIFLLIRFHPLAENSPLIQTL\
        \TLCLGAITTLFAAVCALTQNDIKKIVAFSTSSQLGLMMVTIGINQPHLAFLHICTHAF\
        \FKAMLFMCSGSIIHNLNNEQDIRKMGGLLKTMPLTSTSLTIGSLALAGMPFLTGFYSK\
        \DHIIETANMSYTNAWALSITLIATSLTSAYSTRMILLTLTGQPRFPTLTNINENNPTL\
        \LNPIKRLAAGSLFAGFLITNNISPASPFQTTIPLYLKLTALAVTFLGLLTALDLNYLT\
        \NKLKMKSPLCTFYFSNMLGFYPSITHRTIPYLGLLTSQNLPLLLLDLTWLEKLLPKTI\
        \SQHQISTSIITSTQKGMIKLYFLSFFFPLILTLLLIT"
      )
    , ( "ND6"
      , "MMYALFLLSVGLVMGFVGFSSKPSPIYGGLVLIVSGVVGCVIIL\
        \NFGGGYMGLMVFLIYLGGMMVVFGYTTAMAIEEYPEAWGSGVEVLVSVLVGLAMEVGL\
        \VLWVKEYDGVVVVVNFNSVGSWMIYEGEGSGLIREDPIGAGALYDYGRWLVVVTGWTL\
        \FVGVYIVIEIARGN"
      )
    , ( "CYTB"
      , "MTPMRKTNPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQ\
        \ITTGLFLAMHYSPDASTAFSSIAHITRDVNYGWIIRYLHANGASMFFICLFLHIGRGL\
        \YYGSFLYSETWNIGIILLLATMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTDL\
        \VQWIWGGYSVDSPTLTRFFTFHFILPFIIAALATLHLLFLHETGSNNPLGITSHSDKI\
        \TFHPYYTIKDALGLLLFLLSLMTLTLFSPDLLGDPDNYTLANPLNTPPHIKPEWYFLF\
        \AYTILRSVPNKLGGVLALLLSILILAMIPILHMSKQQSMMFRPLSQSLYWLLAADLLI\
        \LTWIGGQPVSYPFTIIGQVASVLYFTTILILMPTISLIENKMLKWA"
      )
    ]