{- Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. -} -- | There are several concepts of ranges to convert between. module Glean.Util.Range ( -- * source range get/set HasSrcRange(..) -- * byte range , ByteRange(..) , byteRangeExclusiveEnd , byteRangeContains , srcRangeToByteRange , packedByteSpansToRanges , fromToSpansAndExpansions , fromToSpansAndSpellings , rangesToPackedByteSpans , byteSpanToRange , rangeToByteSpan -- * comparisons , compareLineCol , rangeContains , compareRange , topLevelRanges -- * LineOffsets , LineOffsets(..), getLineOffsets, lengthsToLineOffsets, firstLineOffsetsDiff , sameUpToLastNewline, debugLineOffsets , srcRangeToFileLocation , srcRangeToSimpleByteRange , mkFileLines -- ** converting with LineOffsets , byteOffsetToLineCol , byteRangeToRange , byteRangesToLineNumbers ) where import Control.DeepSeq import Control.Exception.Extra (Partial) import Data.Bifunctor (bimap) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as B import Data.Function (on) import Data.List (groupBy, sortBy) import Data.List.Extra (chunksOf, merge) import Data.Maybe (fromMaybe) import qualified Data.String.UTF8 as UTF8 hiding (take) import qualified Data.Vector as BoxedVector import Data.Vector.Unboxed (Vector, (!), (!?)) import qualified Data.Vector.Unboxed as Vector import Data.Word import Glean (toNat, fromNat) import Glean.Schema.Pp1.Types as Pp import Glean.Schema.Src.Types as Src hiding (ByteRange(..)) import Glean.Schema.Cxx1.Types as Cxx class HasSrcRange a where srcRange :: a -> Src.Range setSrcRange :: Src.Range -> a -> a instance HasSrcRange Cxx.NamespaceDeclaration_key where srcRange = namespaceDeclaration_key_source setSrcRange r a = a{ namespaceDeclaration_key_source = r } instance HasSrcRange Cxx.RecordDeclaration_key where srcRange = recordDeclaration_key_source setSrcRange r a = a{ recordDeclaration_key_source = r } instance HasSrcRange Cxx.EnumDeclaration_key where srcRange = enumDeclaration_key_source setSrcRange r a = a{ enumDeclaration_key_source = r } instance HasSrcRange Cxx.FunctionDeclaration_key where srcRange = functionDeclaration_key_source setSrcRange r a = a{ functionDeclaration_key_source = r } instance HasSrcRange Cxx.ObjcContainerDeclaration_key where srcRange = objcContainerDeclaration_key_source setSrcRange r a = a{ objcContainerDeclaration_key_source = r } instance HasSrcRange Cxx.ObjcMethodDeclaration_key where srcRange = objcMethodDeclaration_key_source setSrcRange r a = a{ objcMethodDeclaration_key_source = r } instance HasSrcRange Cxx.ObjcPropertyDeclaration_key where srcRange = objcPropertyDeclaration_key_source setSrcRange r a = a{ objcPropertyDeclaration_key_source = r } instance HasSrcRange Cxx.UsingDeclaration_key where srcRange = usingDeclaration_key_source setSrcRange r a = a{ usingDeclaration_key_source = r } instance HasSrcRange Cxx.UsingDirective_key where srcRange = usingDirective_key_source setSrcRange r a = a{ usingDirective_key_source = r } instance HasSrcRange Cxx.TypeAliasDeclaration_key where srcRange = typeAliasDeclaration_key_source setSrcRange r a = a{ typeAliasDeclaration_key_source = r } instance HasSrcRange Cxx.NamespaceAliasDeclaration_key where srcRange = namespaceAliasDeclaration_key_source setSrcRange r a = a{ namespaceAliasDeclaration_key_source = r } instance HasSrcRange Cxx.VariableDeclaration_key where srcRange = variableDeclaration_key_source setSrcRange r a = a{ variableDeclaration_key_source = r } instance HasSrcRange Cxx.Enumerator_key where srcRange = enumerator_key_source setSrcRange r a = a{ enumerator_key_source = r } instance HasSrcRange Pp.Define_key where srcRange = define_key_source setSrcRange r a = a{ define_key_source = r } instance HasSrcRange Pp.Use_key where srcRange = use_key_source setSrcRange r a = a{ use_key_source = r } data ByteRange = ByteRange { byteRange_begin :: {-# UNPACK #-}!Word64 , byteRange_length :: {-# UNPACK #-}!Word64 } deriving (Show, Eq, Ord) instance NFData ByteRange where rnf = (`seq` ()) -- | Compute the (exclusive) end offset byteRangeExclusiveEnd :: ByteRange -> Word64 byteRangeExclusiveEnd ByteRange{..} = byteRange_begin + byteRange_length -- | byteRangeContains r1 r2 is True iff r1 contains the range r2 byteRangeContains :: ByteRange -> ByteRange -> Bool byteRangeContains r1 r2 = byteRange_begin r1 <= byteRange_begin r2 && byteRangeExclusiveEnd r1 >= byteRangeExclusiveEnd r2 -- | Positions of start of each line in bytes. -- -- Index `lineOffsets` with 0-based line number to get start of that line. -- -- Never empty, the first value is always zero, strictly monotonically -- ascending. data LineOffsets = LineOffsets { lineOffsets :: Vector Word64 , lineOffsets_endsInNewline :: Bool , lineOffsets_hasUnicodeOrTabs :: Bool } deriving ( Eq, Show, Ord ) -- | Help with debugging by showin a multi-line friendly view of 'LineOffsets' -- prefixed with the provided 'String' label. debugLineOffsets :: String -> LineOffsets -> String debugLineOffsets msg x = let prefix = msg <> ": " header = [ "Length of lineOffsets :: Vector Word64 = " <> show (Vector.length (lineOffsets x)) , "Ends in newline: " <> show (lineOffsets_endsInNewline x) , "Has unicode or tabs: " <> show (lineOffsets_hasUnicodeOrTabs x) ] lengths = map show (chunksOf 10 (Vector.toList (lineOffsets x))) in unlines (map (prefix<>) (header ++ lengths)) -- | This finds the newlines (each line includes terminating newline) as well -- as the final line. getLineOffsets :: ByteString -> LineOffsets getLineOffsets bs | B.null bs = LineOffsets (Vector.singleton 0) False False | otherwise = let newlines = map (succ . fromIntegral) (B.elemIndices '\n' bs) lineOffsets_hasUnicodeOrTabs = B.any (\c -> c == '\t' || c >= big) bs where big = toEnum 128 in if B.last bs == '\n' then LineOffsets { lineOffsets = Vector.fromList . (0:) $ newlines , lineOffsets_endsInNewline = True , lineOffsets_hasUnicodeOrTabs } else LineOffsets { lineOffsets = Vector.fromList . (0:) . (++ [fromIntegral (B.length bs)]) $ newlines , lineOffsets_endsInNewline = False , lineOffsets_hasUnicodeOrTabs } mkFileLines :: Src.File -> LineOffsets -> Src.FileLines_key mkFileLines fp offs = Src.FileLines_key { fileLines_key_file = fp , fileLines_key_lengths = let lineOffs = Vector.toList $ lineOffsets offs in map toNat $ zipWith (-) (drop 1 lineOffs) lineOffs , fileLines_key_endsInNewline = lineOffsets_endsInNewline offs , fileLines_key_hasUnicodeOrTabs = lineOffsets_hasUnicodeOrTabs offs } -- | This assumes that the fileLines_key_lengths are strictly positive lengthsToLineOffsets :: Src.FileLines_key -> LineOffsets lengthsToLineOffsets flk = LineOffsets { lineOffsets = Vector.fromList $ scanl (+) 0 (map fromNat (fileLines_key_lengths flk)) , lineOffsets_endsInNewline = fileLines_key_endsInNewline flk , lineOffsets_hasUnicodeOrTabs = fileLines_key_hasUnicodeOrTabs flk } -- | For 0-based lines, the index of the first line to disagree, if any. -- -- If one vector is an incomplete prefix of the other vector, then the -- index of first difference will be the (length-1) of the shorter vector. -- -- The returned Int is always greater than or equal to zero -- (a good 0-based line number). firstLineOffsetsDiff :: LineOffsets -> LineOffsets -> Maybe Int firstLineOffsetsDiff lo1 lo2 = case Vector.findIndex not (Vector.zipWith (==) v1 v2) of Nothing -> if Vector.length v1 == Vector.length v2 then Nothing -- assert than 0 < length v1 && 0 < length v2 else Just (pred (min (Vector.length v1) (Vector.length v2))) Just i -> Just (pred i) -- assert that 0 < i where v1 = lineOffsets lo1 v2 = lineOffsets lo2 -- | Diffusion is crazy and omits the trailing newline. So cope with -- only differing at the trailing newline. Returns 'True' when the two -- arguments have identical length lines, or if they merely disagree about the -- final newline and so thus one (with final newline) has the last line length -- of 1 greater than the other. sameUpToLastNewline :: LineOffsets -> LineOffsets -> Bool sameUpToLastNewline a b = case firstLineOffsetsDiff a b of Nothing -> True Just line0 -> lineOffsets_endsInNewline x && not (lineOffsets_endsInNewline y) && (Vector.length (lineOffsets x) == Vector.length (lineOffsets y)) && (2 + line0 == Vector.length (lineOffsets x)) && (Vector.last (lineOffsets x) == 1 + Vector.last (lineOffsets y)) where -- If at least one of a or b ends in newline then x ends in newline (x, y) | lineOffsets_endsInNewline a = (a, b) | otherwise = (b, a) -- | Using the information in 'LineOffsets' this converts a 'Word64' byte -- offset into a pair of (1-based line, 0-based column). -- -- This is a total function, but there are numerous unspecified conditions -- (which may result in the line number being 0). -- -- This requires 'lineOffsets' to be non-empty and ascending with no duplicates -- to get a sane return value. -- -- This does handle 'lineOffsets_endsInNewline' correctly. -- -- This does not handle 'lineOffsets_hasUnicodeOrTabs' yet, so column -- number is actually byte offset along that line and only correct for Ascii. -- ( To handle this would require the actual 'Text' or utf8 'ByteString' ) -- Tracked in T62313027 -- -- If you ask for a byte offset smaller than the first 'lineOffsets' value then -- then return is unspecified. -- If you ask for a byte offset larger than the last in 'lineOffsets' value then -- the return is unspecified. -- If 'lineOffsets' is empty then the line return is unspecified. -- If 'lineOffsets' has duplicates then the line return is unspecified. -- if 'lineOffsets' is ever descending then the line returned is unspecified. -- -- Unicode versus column number -- -- This needs to become utf-8 aware, and make column count codepoints. -- utf-8 bytes can be classified as -- * 0 .. 127 : Ascii -- * 128..191 : continue multibyte codepoint -- * 192 .. 193 : invalid -- * 194 .. 244 : Start of multibyte codepoint -- * 245 .. 255 : invalid -- Need to count Ascii and Start characters from the start of the line to -- the current position. -- -- For bonus points, ignore utf-8 BOM in the first line (0xEF, 0xBB, 0xBF). byteOffsetToLineCol :: LineOffsets -> Word64 -> (Word64, Word64) byteOffsetToLineCol LineOffsets{lineOffsets=a, ..} | Vector.length a == 0 = (0,) -- should not happen, unspecified answer | Vector.length a == 1 = \p -> if p < a!0 then (0, p) -- before first line, unspecified answer else done 0 p -- only one line, you are on it | otherwise = search where search p | p < a!0 = (0, p) -- before first line, unspecified answer | a!e <= p = done e p -- after final newline (not impossible) | otherwise = go 0 e -- know 0 < e due to above guards where -- know 0 <= e since 2 <= Vector.length a due to original guards e | lineOffsets_endsInNewline = pred (Vector.length a) -- final \n | otherwise = pred (pred (Vector.length a)) -- final \n is earlier go :: Int -> Int -> (Word64, Word64) go i j = -- know (a!i) <= p < (a!j) and i < j if succ i == j -- found precise line then done i p else -- know i < succ i < j let mid = i + ((j - i + 1) `quot` 2) -- know i < mid < j in if p < a!mid then go i mid -- know (a!i) <= p < (a!mid) and i < mid else go mid j -- know (a!mid) <= p < (a!j) and mid < j -- Does not handle tabs or unicode yet done :: Int -> Word64 -> (Word64, Word64) done line_zero_based bytes = let line_one_based = fromIntegral (succ line_zero_based) column_zero_based = bytes - a!line_zero_based in (line_one_based, column_zero_based ) -- | Convert a Src.Range to a ByteRange, given a Src.FileLines srcRangeToByteRange :: Src.FileLines_key -> ByteString -> (Src.Range -> ByteRange) srcRangeToByteRange flk contents = let LineOffsets{..} = lengthsToLineOffsets flk -- Takes a single line and computes vector of utf-8 byte offsets of each -- unicode character (codepoint) in the row. -- Index the vector with a 0-based character-col -- and the values is the 0-based byte-offset-col -- -- tabs characters are not expanded, so count as 1 byte wide. charOffsets :: ByteString -> Vector Word64 charOffsets b = Vector.fromList $ 0 : go 0 (UTF8.fromRep b) where go :: Int -> UTF8.UTF8 ByteString -> [Word64] go off rest = case UTF8.uncons rest of Nothing -> [] Just (_, more) -> fromIntegral newOffset : go newOffset more where newOffset = off + B.length (UTF8.toRep rest) - B.length (UTF8.toRep more) -- Take 0-based line and character-column, return 0-based byte-offset-column -- This is always identity when there are no unicode or tab characters charToByteCol :: Int -> Int -> Word64 charToByteCol | lineOffsets_hasUnicodeOrTabs = \line col -> let offsets = lineVec BoxedVector.! line size = Vector.length offsets in -- Don't return _|_ if the column is out of range, just clamp -- to the end of the line. if col < size then offsets ! col else offsets ! (size-1) | otherwise = \_ col -> fromIntegral col where lineVec = BoxedVector.fromList $ map charOffsets (B.lines contents) in srcRangeToAdjustedByteRange (Just charToByteCol) lineOffsets -- | Convert a 'Src.Range' to a 'ByteRange', given a 'LineOffsets{lineOffsets}' -- and an optional function to correct for unicode and tab characters -- (where bytes and characters are not the same width). -- -- If Nothing is passed then this assumes characters are one byte wide. -- In particular this assumes the 'columnBegin' and 'columnEnd' in the -- 'Src.Range' are the same as byte offsets. Thus Nothing is equivalent to -- passing @(\ _ col -> fromIntegral col)@ as the function. srcRangeToAdjustedByteRange :: Partial => Maybe (Int -> Int -> Word64) -- ^ converts column from character count to byte count. -- Takes line (0-based) and column (0-based character count) -- to 0-based byte-count column. -> Vector Word64 -> Src.Range -> ByteRange srcRangeToAdjustedByteRange mCharToByte lineOffsets Src.Range{..} = let -- Convert from 1-based line and col (from Src.Range) to 0-based, and -- be careful subtracting Word64 to avoid wrap-around prevNat n = let w = fromNat n in fromIntegral (pred (max 1 w)) charToByteCol = fromMaybe (\ _ col -> fromIntegral col) mCharToByte -- compute 0-based lines and column coordinates from Src.Range -- * lines need to be 0-based to be an index into lineOffsets -- and to be passed to charToByteCol -- * columns need to be 0-based to be passed to charToByteCol -- (and then add to the byte offset) lineBegin = prevNat range_lineBegin lineEnd = prevNat range_lineEnd colBegin = prevNat range_columnBegin colEnd = prevNat range_columnEnd -- Note: range_columnEnd is the column of the last char (inclusive) -- so colEnd is inclusive, and so is inclusiveEnd lBegin = fromMaybe (error err) (lineOffsets !? lineBegin) lEnd = fromMaybe (error err) (lineOffsets !? lineEnd) err = show (lineBegin, lineEnd, Vector.length lineOffsets) begin = lBegin + charToByteCol lineBegin colBegin inclusiveEnd = lEnd + charToByteCol lineEnd colEnd -- be careful subtracting Word64 to avoid wrap-around len | begin <= inclusiveEnd = (inclusiveEnd - begin) + 1 | otherwise = 0 in ByteRange{ byteRange_begin = begin, byteRange_length = len } -- | Convert a Src.Range to a ByteRange, given a LineOffsets, and assuming -- columns count bytes (assuming characters and bytes and the same width). srcRangeToSimpleByteRange :: Partial => LineOffsets -> Src.Range -> ByteRange srcRangeToSimpleByteRange = srcRangeToAdjustedByteRange Nothing . lineOffsets -- | Convert a Src.Range to a Src.FileLocation, given the length of each line -- of the file. CAN'T be used with files that have unicode or tab characters. srcRangeToFileLocation :: [Int] -> (Src.Range -> Src.FileLocation) srcRangeToFileLocation lengths = let lineOffsets :: Vector Word64 lineOffsets = Vector.fromList $ scanl (+) 0 (map fromIntegral lengths) in \ sr -> let br = srcRangeToAdjustedByteRange Nothing lineOffsets sr in Src.FileLocation { fileLocation_file = range_file sr , fileLocation_span = Src.ByteSpan { byteSpan_start = toNat (byteRange_begin br) , byteSpan_length = toNat (byteRange_length br) } } packedByteSpansToRanges :: Src.PackedByteSpans -> [ByteRange] packedByteSpansToRanges = go 0 where go !_ [] = [] go !off (Src.PackedByteSpansGroup{..} : rest) = go2 off (fromNat packedByteSpansGroup_length) packedByteSpansGroup_offsets rest where go2 !off !_ [] rest = go off rest go2 !prev !len (off : offs) rest = ByteRange (fromNat off + prev) len : go2 (fromNat off + prev) len offs rest fromToSpansAndExpansions :: Cxx.From -> [ByteRange] fromToSpansAndExpansions Cxx.From{..} = merge (packedByteSpansToRanges from_spans) (packedByteSpansToRanges from_expansions) fromToSpansAndSpellings :: Cxx.From -> [ByteRange] fromToSpansAndSpellings Cxx.From{..} = merge (packedByteSpansToRanges from_spans) (packedByteSpansToRanges from_spellings) rangesToPackedByteSpans :: [ByteRange] -> Src.PackedByteSpans rangesToPackedByteSpans [] = [] rangesToPackedByteSpans (ByteRange begin len : rest) = go begin len [toNat begin] rest where go !off !len acc (ByteRange begin len' : rest) | len == len' = go begin len (tok : acc) rest | otherwise = Src.PackedByteSpansGroup (toNat len) (reverse acc) : go begin len' [tok] rest where !tok = toNat (begin-off) go !_ !_ [] [] = [] go !_ !len acc [] = [Src.PackedByteSpansGroup (toNat len) (reverse acc)] -- | Convert Src schema bytespans to ranges byteSpanToRange :: Src.ByteSpan -> ByteRange byteSpanToRange Src.ByteSpan{..} = ByteRange { byteRange_begin = fromNat byteSpan_start , byteRange_length = fromNat byteSpan_length } -- | Convert byte ranges to Src schema bytespans rangeToByteSpan :: ByteRange -> Src.ByteSpan rangeToByteSpan ByteRange{..} = Src.ByteSpan { byteSpan_start = toNat byteRange_begin , byteSpan_length = toNat byteRange_length } -- | Convert from byte coordinates to (line, col) coordinates. -- -- This does not yet handle uncode or tabs, which would require access -- the contents of the file. byteRangeToRange :: Src.File -> LineOffsets -> ByteRange -> Src.Range byteRangeToRange range_file lineOffsets be@ByteRange{..} = let -- be careful subtracting Word64 to avoid wrap-around inclusiveEnd = pred (max 1 (byteRangeExclusiveEnd be)) -- Convert from 0-based col to 1-based col for Src.Range nextNat w = toNat (succ w) (range_lineBegin, range_columnBegin) = bimap toNat nextNat $ byteOffsetToLineCol lineOffsets byteRange_begin (range_lineEnd, range_columnEnd) = bimap toNat nextNat $ byteOffsetToLineCol lineOffsets inclusiveEnd in Src.Range{..} byteRangesToLineNumbers :: LineOffsets -> [ByteRange] -> [Word64] byteRangesToLineNumbers lineOffsets = map (fst . byteOffsetToLineCol lineOffsets . byteRange_begin) -- | @compareLineCol line col a b@ is a helper for comparing @(line, col)@ -- coordinates of @a@ and @b@ compareLineCol :: Ord x => (a -> x) -> (a -> x) -> a -> a -> Ordering compareLineCol line col a b = (compare `on` line) a b <> (compare `on` col) a b -- | @rangeContains x y@ is True iff the range of y is the same as x, or -- inside x. rangeContains :: Src.Range -> Src.Range -> Bool rangeContains big small = ((==) `on` Src.range_file) big small && checkLE Src.range_lineBegin Src.range_columnBegin big small && checkLE Src.range_lineEnd Src.range_columnEnd small big where checkLE line col a b = GT /= compareLineCol line col a b -- | Order is by file id, ascending begin position, descending end position. compareRange :: Src.Range -> Src.Range -> Ordering compareRange r1 r2 = (compare `on` (Src.file_id . Src.range_file)) r1 r2 <> compareLineCol Src.range_lineBegin Src.range_columnBegin r1 r2 <> compareLineCol Src.range_lineEnd Src.range_columnEnd r2 r1 -- | Sort and filter the @[a]@ removing contained items. Order is by file, -- ascending begin position, descending end position. topLevelRanges :: (a -> Src.Range) -> [a] -> [a] topLevelRanges f = map head . groupBy (rangeContains `on` f) . sortBy (compareRange `on` f)