{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Langchain.TextSplitter.Character
(
CharacterSplitterOps (..)
, defaultCharacterSplitterOps
, splitText
) where
import Data.Text (Text)
import qualified Data.Text as T
-- | Options controlling how 'splitText' breaks a piece of text apart.
data CharacterSplitterOps = CharacterSplitterOps
  { chunkSize :: Int
  -- ^ Maximum number of characters per emitted chunk; passed straight to
  -- 'T.chunksOf' by 'splitText'.
  , separator :: Text
  -- ^ Text on which the input is split before chunking. An empty
  -- separator makes 'splitText' treat the whole input as one segment.
  }
  deriving (Show, Eq)
-- | Default splitter options: chunks of at most 100 characters, with the
-- input first split on a blank line (@"\\n\\n"@).
defaultCharacterSplitterOps :: CharacterSplitterOps
defaultCharacterSplitterOps =
  CharacterSplitterOps
    { chunkSize = 100
    , separator = "\n\n"
    }
-- | Split a text into chunks according to the given options.
--
-- The input is first divided on 'separator'; each resulting segment is
-- stripped of leading and trailing whitespace and then cut into pieces of
-- at most 'chunkSize' characters. The pieces of all segments are returned
-- in order as one flat list.
--
-- Edge cases (following the documented behaviour of "Data.Text"):
--
-- * An empty 'separator' leaves the input as a single segment, because
--   'T.splitOn' raises an error when given an empty pattern.
-- * Segments that are empty after stripping contribute no chunks, since
--   'T.chunksOf' returns @[]@ for empty text.
-- * A 'chunkSize' of zero or less yields an empty result, since
--   'T.chunksOf' returns @[]@ for non-positive sizes.
splitText :: CharacterSplitterOps -> Text -> [Text]
splitText CharacterSplitterOps {chunkSize = size, separator = sep} txt =
  concatMap (T.chunksOf size . T.strip) segments
  where
    -- Guard the empty separator: T.splitOn is partial on an empty pattern.
    segments
      | T.null sep = [txt]
      | otherwise = T.splitOn sep txt