| Copyright | © Herbert Valerio Riedel 2017 |
|---|---|
| License | BSD3 |
| Maintainer | hvr@gnu.org |
| Stability | stable |
| Safe Haskell | Trustworthy |
| Language | Haskell2010 |
Data.Text.Short
Description
Memory-efficient representation of Unicode text strings.
This module is intended to be imported qualified, to avoid name
clashes with Prelude functions, e.g.
import qualified Data.Text.Short as TS
import Data.Text.Short (ShortText)
This module deliberately omits (common) partial functions, which can be found in Data.Text.Short.Partial instead.
Since: 0.1
Synopsis
- data ShortText
- empty :: ShortText
- singleton :: Char -> ShortText
- pack :: [Char] -> ShortText
- append :: ShortText -> ShortText -> ShortText
- concat :: [ShortText] -> ShortText
- cons :: Char -> ShortText -> ShortText
- snoc :: ShortText -> Char -> ShortText
- replicate :: Int -> ShortText -> ShortText
- unpack :: ShortText -> [Char]
- uncons :: ShortText -> Maybe (Char, ShortText)
- unsnoc :: ShortText -> Maybe (ShortText, Char)
- null :: ShortText -> Bool
- length :: ShortText -> Int
- isAscii :: ShortText -> Bool
- all :: (Char -> Bool) -> ShortText -> Bool
- any :: (Char -> Bool) -> ShortText -> Bool
- find :: (Char -> Bool) -> ShortText -> Maybe Char
- isPrefixOf :: ShortText -> ShortText -> Bool
- isSuffixOf :: ShortText -> ShortText -> Bool
- (!?) :: ShortText -> Int -> Maybe Char
- indexMaybe :: ShortText -> Int -> Maybe Char
- indexEndMaybe :: ShortText -> Int -> Maybe Char
- findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
- take :: Int -> ShortText -> ShortText
- takeEnd :: Int -> ShortText -> ShortText
- drop :: Int -> ShortText -> ShortText
- dropEnd :: Int -> ShortText -> ShortText
- takeWhile :: (Char -> Bool) -> ShortText -> ShortText
- takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
- dropWhile :: (Char -> Bool) -> ShortText -> ShortText
- dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
- dropAround :: (Char -> Bool) -> ShortText -> ShortText
- splitAt :: Int -> ShortText -> (ShortText, ShortText)
- splitAtEnd :: Int -> ShortText -> (ShortText, ShortText)
- span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
- split :: (Char -> Bool) -> ShortText -> [ShortText]
- stripPrefix :: ShortText -> ShortText -> Maybe ShortText
- stripSuffix :: ShortText -> ShortText -> Maybe ShortText
- intersperse :: Char -> ShortText -> ShortText
- intercalate :: ShortText -> [ShortText] -> ShortText
- reverse :: ShortText -> ShortText
- filter :: (Char -> Bool) -> ShortText -> ShortText
- foldl :: (a -> Char -> a) -> a -> ShortText -> a
- foldl' :: (a -> Char -> a) -> a -> ShortText -> a
- foldr :: (Char -> a -> a) -> a -> ShortText -> a
- fromString :: String -> ShortText
- toString :: ShortText -> String
- fromText :: Text -> ShortText
- toText :: ShortText -> Text
- fromShortByteString :: ShortByteString -> Maybe ShortText
- toShortByteString :: ShortText -> ShortByteString
- fromByteString :: ByteString -> Maybe ShortText
- toByteString :: ShortText -> ByteString
- toBuilder :: ShortText -> Builder
The ShortText type
A compact representation of Unicode strings.
A ShortText value is a sequence of Unicode scalar values, as defined in
§3.9, definition D76 of the Unicode 5.2 standard;
This means that a ShortText is a list of (scalar) Unicode code-points (i.e. code-points in the range [U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]).
This type relates to Text as ShortByteString relates to ByteString by providing a more compact type. Please consult the documentation of Data.ByteString.Short for more information.
Currently, a boxed unshared Text has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each Text value which can share its payload with another Text requires only 4 words additionally. Unlike ByteString, Text uses unpinned memory.
In comparison, the footprint of a boxed ShortText is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
It can be shown that for realistic data UTF-16 has a space overhead of 50% over UTF-8.
NOTE: The Typeable instance isn't defined for GHC 7.8 (and older) prior to text-short-0.1.3
Since: 0.1
Instances
| IsList ShortText Source # | Note: Surrogate pairs ([U+D800 .. U+DFFF]) are replaced by U+FFFD. Since: 0.1.2 |
| Eq ShortText Source # | |
| Data ShortText Source # | It exposes a Data instance abstraction similar to that of Text. Since: 0.1.3 |
Defined in Data.Text.Short.Internal Methods gfoldl :: (forall d b. Data d => c (d -> b) -> d -> c b) -> (forall g. g -> c g) -> ShortText -> c ShortText # gunfold :: (forall b r. Data b => c (b -> r) -> c r) -> (forall r. r -> c r) -> Constr -> c ShortText # toConstr :: ShortText -> Constr # dataTypeOf :: ShortText -> DataType # dataCast1 :: Typeable t => (forall d. Data d => c (t d)) -> Maybe (c ShortText) # dataCast2 :: Typeable t => (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c ShortText) # gmapT :: (forall b. Data b => b -> b) -> ShortText -> ShortText # gmapQl :: (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> ShortText -> r # gmapQr :: forall r r'. (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> ShortText -> r # gmapQ :: (forall d. Data d => d -> u) -> ShortText -> [u] # gmapQi :: Int -> (forall d. Data d => d -> u) -> ShortText -> u # gmapM :: Monad m => (forall d. Data d => d -> m d) -> ShortText -> m ShortText # gmapMp :: MonadPlus m => (forall d. Data d => d -> m d) -> ShortText -> m ShortText # gmapMo :: MonadPlus m => (forall d. Data d => d -> m d) -> ShortText -> m ShortText # | |
| Ord ShortText Source # | |
| Read ShortText Source # | |
| Show ShortText Source # | |
| IsString ShortText Source # | Note: Surrogate pairs ([U+D800 .. U+DFFF]) in string literals are replaced by U+FFFD.
Defined in Data.Text.Short.Internal Methods fromString :: String -> ShortText # | |
| Semigroup ShortText Source # | |
| Monoid ShortText Source # | |
| PrintfArg ShortText Source # | Since: 0.1.2 |
Defined in Data.Text.Short.Internal | |
| Binary ShortText Source # | |
| NFData ShortText Source # | |
Defined in Data.Text.Short.Internal | |
| Hashable ShortText Source # | |
Defined in Data.Text.Short.Internal | |
| Lift ShortText Source # | Since: 0.1.3 |
| type Item ShortText Source # | |
Defined in Data.Text.Short.Internal | |
Basic operations
Construction
singleton :: Char -> ShortText Source #
\(\mathcal{O}(1)\) Construct ShortText from single codepoint.
singleton c == pack [c]
length (singleton c) == 1
>>>singleton 'A'"A"
>>> map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
["\55295","\65533","\65533","\57344"]
Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1.2
pack :: [Char] -> ShortText Source #
\(\mathcal{O}(n)\) Construct a ShortText from a list of Chars.
This is an alias for fromString.
Since: 0.1.2
snoc :: ShortText -> Char -> ShortText Source #
\(\mathcal{O}(n)\) Append a character to the end of a ShortText.
snoc t c == t <> singleton c
Since: 0.1.2
replicate :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n*m)\) Replicate a ShortText.
A repetition count smaller than 1 results in an empty string result.
>>> replicate 3 "jobs!"
"jobs!jobs!jobs!"
>>> replicate 10000 ""
""
>>> replicate 0 "nothing"
""
length (replicate n t) == max 0 n * length t
Since: 0.1.2
Deconstruction
Querying & predicates
null :: ShortText -> Bool Source #
\(\mathcal{O}(1)\) Test whether a ShortText is empty.
>>>null ""True
null (singleton c) == False
null t == (length t == 0)
Since: 0.1
length :: ShortText -> Int Source #
\(\mathcal{O}(n)\) Count the number of Unicode code-points in a ShortText.
>>>length "abcd€"5
>>>length ""0
length t >= 0
Since: 0.1
all :: (Char -> Bool) -> ShortText -> Bool Source #
\(\mathcal{O}(n)\) Test whether all code points in ShortText satisfy a predicate.
>>>all (const False) ""True
>>>all (> 'c') "abcdabcd"False
>>>all (/= 'c') "abdabd"True
Since: 0.1.2
any :: (Char -> Bool) -> ShortText -> Bool Source #
\(\mathcal{O}(n)\) Test whether any code points in ShortText satisfy a predicate.
>>>any (> 'c') "abcdabcd"True
>>>any (const True) ""False
>>>any (== 'c') "abdabd"False
any p t == not (all (not . p) t)
Since: 0.1.2
find :: (Char -> Bool) -> ShortText -> Maybe Char Source #
\(\mathcal{O}(n)\) Return the left-most codepoint in ShortText that satisfies the given predicate.
>>>find (> 'b') "abcdabcd"Just 'c'
>>>find (> 'b') "ababab"Nothing
Since: 0.1.2
Lookup & indexing
(!?) :: ShortText -> Int -> Maybe Char Source #
\(\mathcal{O}(n)\) Index i-th code-point in ShortText.
Infix operator alias of indexMaybe
>>> "abcdefg" !? 2
Just 'c'
Since: 0.1.2
findIndex :: (Char -> Bool) -> ShortText -> Maybe Int Source #
\(\mathcal{O}(n)\) Return the index of the left-most codepoint in ShortText that satisfies the given predicate.
>>>findIndex (> 'b') "abcdabcdef"Just 2
>>>findIndex (> 'b') "ababab"Nothing
(indexMaybe t =<< findIndex p t) == find p t
Since: 0.1.2
Splitting ShortTexts
Basic functions
take :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take prefix of given length or return whole ShortText if too short.
>>>take 3 "abcdef""abc"
>>>take 3 "ab""ab"
Since: 0.1.2
takeEnd :: Int -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take suffix of given length or return whole ShortText if too short.
>>>takeEnd 3 "abcdefg""efg"
>>>takeEnd 3 "ab""ab"
Since: 0.1.2
takeWhile :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take longest prefix satisfying given predicate.
takeWhile p t == fst (span p t)
>>>takeWhile (< 'c') "abcdabcd""ab"
Since: 0.1.2
takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Take longest suffix satisfying given predicate.
takeWhileEnd p t == snd (spanEnd p t)
>>>takeWhileEnd (>= 'c') "abcdabcd""cd"
Since: 0.1.2
dropWhile :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate.
dropWhile p t == snd (span p t)
>>>dropWhile (< 'c') "abcdabcd""cdabcd"
Since: 0.1.2
dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate.
dropWhileEnd p t == fst (spanEnd p t)
>>>dropWhileEnd (>= 'c') "abcdabcd""abcdab"
Since: 0.1.2
dropAround :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Strip characters from the beginning and end of ShortText which satisfy given predicate.
>>> dropAround (== ' ') " white space "
"white space"
>>> dropAround (> 'a') "bcdefghi"
""
Since: 0.1.2
Pair-valued functions
splitAt :: Int -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText into two halves.
splitAt n t returns a pair of ShortTexts with the following properties:
length (fst (splitAt n t)) == min (length t) (max 0 n)
fst (splitAt n t) <> snd (splitAt n t) == t
>>>splitAt 2 "abcdef"("ab","cdef")
>>>splitAt 10 "abcdef"("abcdef","")
>>>splitAt (-1) "abcdef"("","abcdef")
Since: 0.1.2
splitAtEnd :: Int -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText into two halves.
splitAtEnd n t returns a pair of ShortTexts with the following properties:
length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
splitAtEnd n t == splitAt (length t - n) t
>>>splitAtEnd 2 "abcdef"("abcd","ef")
>>>splitAtEnd 10 "abcdef"("","abcdef")
>>>splitAtEnd (-1) "abcdef"("abcdef","")
Since: 0.1.2
span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText into longest prefix satisfying the given predicate and the remaining suffix.
>>>span (< 'c') "abcdabcd"("ab","cdabcd")
fst (span p t) <> snd (span p t) == t
Since: 0.1.2
break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Variant of span with negated predicate.
>>>break (> 'c') "abcdabcd"("abc","dabcd")
break p t == span (not . p) t
fst (break p t) <> snd (break p t) == t
Since: 0.1.2
spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Split ShortText into longest suffix satisfying the given predicate and the preceding prefix.
>>>spanEnd (> 'c') "abcdabcd"("abcdabc","d")
fst (spanEnd p t) <> snd (spanEnd p t) == t
Since: 0.1.2
breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) Source #
\(\mathcal{O}(n)\) Variant of spanEnd with negated predicate.
>>>breakEnd (< 'c') "abcdabcd"("abcdab","cd")
breakEnd p t == spanEnd (not . p) t
fst (breakEnd p t) <> snd (breakEnd p t) == t
Since: 0.1.2
Breaking into many substrings
split :: (Char -> Bool) -> ShortText -> [ShortText] Source #
\(\mathcal{O}(n)\) Splits a string into components delimited by separators, where the predicate returns True for a separator element. The resulting components do not contain the separators. Two adjacent separators result in an empty component in the output, e.g.
>>> split (=='a') "aabbaca"
["","","bb","c",""]
>>> split (=='a') ""
[""]
intercalate (singleton c) (split (== c) t) == t
NOTE: split never returns an empty list to match the semantics of its counterpart from Data.Text.
Since: 0.1.3
Suffix & Prefix operations
Transformations
intersperse :: Char -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Insert character between characters of ShortText.
>>>intersperse '*' "_""_"
>>>intersperse '*' "MASH""M*A*S*H"
Since: 0.1.2
reverse :: ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Reverse characters in ShortText.
>>>reverse "star live desserts""stressed evil rats"
reverse (singleton c) == singleton c
reverse (reverse t) == t
Since: 0.1.2
filter :: (Char -> Bool) -> ShortText -> ShortText Source #
\(\mathcal{O}(n)\) Remove characters from ShortText which don't satisfy given predicate.
>>>filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!""Y dn't nd vwls t cnvy nfrmtn!"
filter (const False) t == ""
filter (const True) t == t
length (filter p t) <= length t
filter p t == pack [ c | c <- unpack t, p c ]
Since: 0.1.2
Folds
foldl :: (a -> Char -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\) Reduces the characters of the ShortText with
the binary operator and an initial value in forward direction (i.e. from
left to right).
>>>foldl (\_ _ -> True) False ""False
>>>foldl (\s c -> c : s) ['.'] "abcd""dcba."
Since: 0.1.2
foldl' :: (a -> Char -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\) Strict version of foldl.
Since: 0.1.2
foldr :: (Char -> a -> a) -> a -> ShortText -> a Source #
\(\mathcal{O}(n)\) Reduces the characters of the ShortText with
the binary operator and an initial value in reverse direction (i.e. from
right to left).
>>>foldr (\_ _ -> True) False ""False
>>>foldr (:) ['.'] "abcd""abcd."
Since: 0.1.2
Conversions
String
fromString :: String -> ShortText Source #
\(\mathcal{O}(n)\) Construct/pack from String
>>>fromString []""
>>>fromString ['a','b','c']"abc"
>>> fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
"\55295\65533\65533\57344"
Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
Since: 0.1
toString :: ShortText -> String Source #
\(\mathcal{O}(n)\) Convert to String
(fromString . toString) t == t
Note: See documentation of fromString for why (toString . fromString) is not an identity function.
Since: 0.1
Text
toText :: ShortText -> Text Source #
\(\mathcal{O}(n)\) Convert to Text
(fromText . toText) t == t
(toText . fromText) t == t
This is currently not \(\mathcal{O}(1)\) because currently Text uses UTF-16 as its internal representation.
In the event that Text will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\).
Since: 0.1
ByteString
fromShortByteString :: ShortByteString -> Maybe ShortText Source #
\(\mathcal{O}(n)\) Construct ShortText from UTF-8 encoded ShortByteString
This operation doesn't copy the input ShortByteString but it
cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding.
Returns Nothing in case of invalid UTF-8 encoding.
>>> fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
Just "\NUL8\66330"
>>> fromShortByteString "\xC0\x80" -- invalid denormalised U+00
Nothing
>>> fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
Nothing
>>> fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
Just "\1114111"
>>> fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
Nothing
fromShortByteString (toShortByteString t) == Just t
Since: 0.1
toShortByteString :: ShortText -> ShortByteString Source #
\(\mathcal{O}(0)\) Converts to UTF-8 encoded ShortByteString
This operation has effectively no overhead, as it's currently merely a newtype-cast.
Since: 0.1
fromByteString :: ByteString -> Maybe ShortText Source #
\(\mathcal{O}(n)\) Construct ShortText from UTF-8 encoded ByteString
fromByteString accepts (or rejects) the same input data as fromShortByteString.
Returns Nothing in case of invalid UTF-8 encoding.
Since: 0.1
toByteString :: ShortText -> ByteString Source #
\(\mathcal{O}(n)\) Converts to UTF-8 encoded ByteString
Since: 0.1