{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

{- |
Module      : Langchain.TextSplitter.Character
Description : Character-based text splitting for LLM processing [[10]]
Copyright   : (c) 2025 Tushar Adhatrao
License     : MIT
Maintainer  : Tushar Adhatrao <tusharadhatrao@gmail.com>
Stability   : experimental

Character-based text splitting implementation following LangChain's text splitter concepts.
Splits text into chunks based on separators and maximum chunk sizes, useful for processing
large documents with LLMs.

For more information on text splitting concepts, see the Langchain documentation:
[Langchain TextSplitter](https://python.langchain.com/docs/concepts/text_splitters/).

Example usage:

@
-- Split text using default settings (100 char chunks, double newline separator)
splitText defaultCharacterSplitterOps "Long document text..."

-- Custom configuration for 500-char chunks with paragraph splitting
customSplit = splitText (CharacterSplitterOps 500 "\n\\s*\n")
@
-}
module Langchain.TextSplitter.Character
  ( -- * Configuration
    CharacterSplitterOps (..)
  , defaultCharacterSplitterOps

    -- * Splitting Function
  , splitText
  ) where

import Data.Text (Text)
import qualified Data.Text as T

{- | Configuration for character-based text splitting 
Contains:

- 'chunkSize' : Maximum characters per chunk
- 'separator' : Pattern to split text before chunking

Default values follow LangChain's recommended settings for LLM input preparation.
-}
data CharacterSplitterOps = CharacterSplitterOps
  { CharacterSplitterOps -> Int
chunkSize :: Int
  , CharacterSplitterOps -> Text
separator :: Text
  }
  deriving (Int -> CharacterSplitterOps -> ShowS
[CharacterSplitterOps] -> ShowS
CharacterSplitterOps -> String
(Int -> CharacterSplitterOps -> ShowS)
-> (CharacterSplitterOps -> String)
-> ([CharacterSplitterOps] -> ShowS)
-> Show CharacterSplitterOps
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CharacterSplitterOps -> ShowS
showsPrec :: Int -> CharacterSplitterOps -> ShowS
$cshow :: CharacterSplitterOps -> String
show :: CharacterSplitterOps -> String
$cshowList :: [CharacterSplitterOps] -> ShowS
showList :: [CharacterSplitterOps] -> ShowS
Show, CharacterSplitterOps -> CharacterSplitterOps -> Bool
(CharacterSplitterOps -> CharacterSplitterOps -> Bool)
-> (CharacterSplitterOps -> CharacterSplitterOps -> Bool)
-> Eq CharacterSplitterOps
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CharacterSplitterOps -> CharacterSplitterOps -> Bool
== :: CharacterSplitterOps -> CharacterSplitterOps -> Bool
$c/= :: CharacterSplitterOps -> CharacterSplitterOps -> Bool
/= :: CharacterSplitterOps -> CharacterSplitterOps -> Bool
Eq)

{- | Default splitter configuration 

- 100 character chunks
- Splits on double newlines ("\n\n")

>>> defaultCharacterSplitterOps
CharacterSplitterOps {chunkSize = 100, separator = "\n\n"}
-}
defaultCharacterSplitterOps :: CharacterSplitterOps
defaultCharacterSplitterOps :: CharacterSplitterOps
defaultCharacterSplitterOps =
  CharacterSplitterOps
    { chunkSize :: Int
chunkSize = Int
100
    , separator :: Text
separator = Text
"\n\n"
    }

{- | Split text into chunks following LangChain's splitting strategy:
 -
1. Split by separator first
2. Chunk each segment into specified size
3. Preserve semantic boundaries where possible

Examples:
>>> splitText defaultCharacterSplitterOps ""
[]

>>> splitText defaultCharacterSplitterOps "Short text"
["Short text"]

>>> splitText defaultCharacterSplitterOps "Part1\n\nPart2\n\nPart3"
["Part1", "Part2", "Part3"]

>>> splitText (CharacterSplitterOps 20 "\n\n") "Very long text exceeding chunk size..."
["Very long text ex", "ceeding chunk size..."]
-}
splitText :: CharacterSplitterOps -> Text -> [Text]
splitText :: CharacterSplitterOps -> Text -> [Text]
splitText CharacterSplitterOps {Int
Text
chunkSize :: CharacterSplitterOps -> Int
separator :: CharacterSplitterOps -> Text
chunkSize :: Int
separator :: Text
..} Text
txt =
  [[Text]] -> [Text]
forall a. Monoid a => [a] -> a
mconcat ([[Text]] -> [Text]) -> [[Text]] -> [Text]
forall a b. (a -> b) -> a -> b
$
    (Text -> [Text]) -> [Text] -> [[Text]]
forall a b. (a -> b) -> [a] -> [b]
map
      (Int -> Text -> [Text]
T.chunksOf Int
chunkSize (Text -> [Text]) -> (Text -> Text) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
T.strip)
      (if Text -> Bool
T.null Text
separator then [Text
txt] else HasCallStack => Text -> Text -> [Text]
Text -> Text -> [Text]
T.splitOn Text
separator Text
txt)

{- $examples
Test case patterns demonstrating key behaviors:

1. Empty input handling
   >>> splitText defaultCharacterSplitterOps ""
   []

2. Custom separator usage
   >>> splitText (CharacterSplitterOps 100 "|") "A|B|C"
   ["A", "B", "C"]

3. Combined splitting and chunking
   >>> splitText (CharacterSplitterOps 10 "\n") "1234567890\nABCDEFGHIJ"
   ["1234567890", "ABCDEFGHIJ"]
-}