{- Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. -} module Glean.Database.Schema ( DbSchema(..) , PredicateDetails(..) , predicateRef , TypeDetails(..) , schemaSize, schemaPredicates , fromStoredSchema, toStoredSchema , newDbSchema , newMergedDbSchema , mkDbSchemaFromSource , lookupPid , compareSchemaPredicates , validateNewSchema , validateNewSchemaInstance , DbContent , readOnlyContent , readWriteContent , getSchemaInfo , getSchemaInfoForSchema , renderSchemaSource , toStoredVersions , derivationEdges -- testing , newDbSchemaForTesting ) where import Control.Applicative ((<|>)) import Control.Concurrent import Control.Exception import Control.Monad import Control.Monad.Except import Control.Monad.State as State import Data.Bifoldable (bifoldMap) import Data.ByteString (ByteString) import Data.Foldable import Data.Graph import Data.Hashable import Data.HashMap.Strict (HashMap) import qualified Data.HashMap.Strict as HashMap import qualified Data.HashMap.Lazy as Lazy.HashMap import qualified Data.HashSet as HashSet import Data.HashSet (HashSet) import Data.List.Extra (firstJust, nubOrd, sort) import qualified Data.IntMap as IntMap import Data.IntMap (IntMap) import qualified Data.Map as Map import Data.Map (Map) import Data.Maybe import Data.Set (Set) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as Text import qualified Data.Text.Encoding as Text import Compat.Prettyprinter import Compat.Prettyprinter.Render.Text import Data.Tuple (swap) import Safe (maximumMay) import TextShow import ServiceData.GlobalStats import ServiceData.Types as ODS import Util.Log.Text import Glean.RTS.Foreign.Inventory as Inventory import Glean.Database.Config import Glean.Database.Schema.Types import Glean.Display import Glean.Query.Transform ( transformPattern , transformFact ) import Glean.Internal.Types as Internal (StoredSchema(..)) import Glean.RTS.Traverse import Glean.RTS.Typecheck import Glean.RTS.Types as RTS import Glean.Angle.Hash import Glean.Angle.Types as Schema import Glean.Schema.Evolve ( validateEvolutions , directSchemaEvolutions , calcEvolutions , visiblePredicates , VisiblePredicates , visibleDefined , mapVisible , canBeStored ) import Glean.Schema.Util (showRef, ShowRef) import qualified Glean.ServerConfig.Types as ServerConfig import Glean.Query.Codegen.Types (QueryWithInfo(..)) import Glean.Query.Typecheck import Glean.Types as Thrift import Glean.Schema.Types import Glean.Database.Schema.ComputeIds import Glean.Query.Prune (pruneDerivations) import Glean.Query.Typecheck.Types (TcQuery) -- | Used to decide whether to activate 'derive default' definitions and -- 'evolves' directives. -- 'derive default' takes effect for a read-only DB when there are no -- facts of a predicate. -- 'evolves' directives take effect for a read-only DB when there are no facts -- of any predicate in a schema evolved by another schema. data DbContent = DbWritable | DbReadOnly (HashMap Pid PredicateStats) readOnlyContent :: HashMap Pid PredicateStats -> DbContent readOnlyContent = DbReadOnly readWriteContent :: DbContent readWriteContent = DbWritable schemaSize :: DbSchema -> Int schemaSize = HashMap.size . predicatesById schemaPredicates :: DbSchema -> [PredicateDetails] schemaPredicates = HashMap.elems . predicatesById fromStoredSchema :: Maybe (MVar DbSchemaCache) -> StoredSchema -> DbContent -> DebugFlags -> IO DbSchema fromStoredSchema schemaCache info@StoredSchema{..} dbContent debug = mkDbSchemaFromSource schemaCache (Just (storedSchemaPids info)) dbContent debug storedSchema_schema storedSchemaPids :: StoredSchema -> HashMap PredicateRef Pid storedSchemaPids StoredSchema{..} = HashMap.fromList [ (ref, Pid pid) | (pid, ref) <- Map.toList storedSchema_predicateIds ] toStoredSchema :: DbSchema -> StoredSchema toStoredSchema DbSchema{ schemaSource = (source, ver, sourceId), ..} = StoredSchema { storedSchema_schema = renderSchemaSource source , storedSchema_predicateIds = Map.fromList $ [ (fromPid $ predicatePid p, predicateRef p) | p <- HashMap.elems predicatesById , predicateInStoredSchema p ] -- Note: only the stored predicates. There can be multiple -- predicates with the same PredicateRef (different Pid and -- PredicateId), but only one of them is from the stored schema -- and corresponds to the Pids of facts stored in the DB. , storedSchema_versions = toStoredVersions ver sourceId } newMergedDbSchema :: Maybe (MVar DbSchemaCache) -> StoredSchema -- ^ schema from a DB -> SchemaIndex -- ^ current schema index -> DbContent -> DebugFlags -> IO DbSchema newMergedDbSchema schemaCache storedSchema@StoredSchema{..} index dbContent debug = do let -- If we have an identical schema in the index, then we can use -- the cached ProcessedSchema. This saves a lot of time -- re-processing the schema from the DB when we open it, and -- should be a common case. -- -- Schema equality is by comparing the SchemaIds. This is safe -- since the only way a schema can be used is through a SchemaId, -- and the SchemaId captures the content of all the definitions -- that can be referenced by a query (those visible in the NameEnv). identicalSchemas = [ proc | proc <- schemaIndexCurrent index : schemaIndexOlder index , let HashedSchema{..} = procSchemaHashed proc , IntMap.toList (fromStoredVersions storedSchema_versions) == [(fromIntegral hashedSchemaAllVersion, hashedSchemaId)] ] processStoredSchema = case processSchema Nothing storedSchema_schema of Left msg -> throwIO $ ErrorCall msg Right resolved -> return resolved fromDB <- if Map.null storedSchema_versions then processStoredSchema else case identicalSchemas of (one : _) -> return one [] -> processStoredSchema -- For the "source", we'll use the current schema source. It doesn't -- reflect what's in the DB, but it reflects what you can query for. mkDbSchema HashMap.toList schemaCache (Just (storedSchemaPids storedSchema)) dbContent fromDB (Just index) debug -- | Build a DbSchema from parsed/resolved Schemas. Note that we still need -- the original source for the schema for 'toStoredSchema' (otherwise we would -- need to pretty-print the resolved Schemas back into source. newDbSchema :: Maybe (MVar DbSchemaCache) -> SchemaIndex -> SchemaSelector -> DbContent -> DebugFlags -> IO DbSchema newDbSchema = newDbSchemaForTesting HashMap.toList -- | Testing version of 'newDbSchema' that accepts a custom @HashMap.toList@ -- function to weed out ordering assumptions newDbSchemaForTesting :: (forall k v . HashMap k v -> [(k,v)]) -> Maybe (MVar DbSchemaCache) -> SchemaIndex -> SchemaSelector -> DbContent -> DebugFlags -> IO DbSchema newDbSchemaForTesting toList schemaCache index selector dbContent debug = do schema <- case selector of SpecificSchemaId id -> case schemaForSchemaId index id of Nothing -> throwIO $ ErrorCall $ "schema " <> show id <> " not found" Just schema -> return schema _otherwise -> return (schemaIndexCurrent index) mkDbSchema toList schemaCache Nothing dbContent schema Nothing debug inventory :: [PredicateDetails] -> Inventory inventory ps = Inventory.new [ Inventory.CompiledPredicate { compiledPid = predicatePid , compiledRef = predicateRef d , compiledTypecheck = predicateTypecheck , compiledTraversal = predicateTraversal } | d@PredicateDetails{..} <- ps ] -- | -- Make a DbSchema for an existing database, using the predicates -- stored in the database. Anything in the 'Schemas' but not -- in the database is dropped from the 'DbSchema' mkDbSchemaFromSource :: Maybe (MVar DbSchemaCache) -> Maybe (HashMap PredicateRef Pid) -> DbContent -> DebugFlags -> ByteString -> IO DbSchema mkDbSchemaFromSource schemaCache knownPids dbContent debug source = do case processSchema Nothing source of Left str -> throwIO $ ErrorCall str Right (ProcessedSchema source resolved hashed) -> mkDbSchema HashMap.toList schemaCache knownPids dbContent (ProcessedSchema source resolved hashed) Nothing debug dbSchemaKey :: HashMap PredicateRef Pid -> ProcessedSchema -> Maybe SchemaIndex -> DbSchemaCacheKey dbSchemaKey pidMap stored maybeIndex = DbSchemaCacheKey $ hashBinary ( [ (showRef ref, pid) | (ref, Pid pid) <- HashMap.toList pidMap ] , versions (procSchemaHashed stored) , flip (maybe []) maybeIndex $ \SchemaIndex{..} -> map (versions . procSchemaHashed) (schemaIndexCurrent : schemaIndexOlder) ) where versions HashedSchema{..} = (hashedSchemaAllVersion, unSchemaId hashedSchemaId) {- | Caching DbSchema DbSchema is quite expensive to build: - we have to typecheck all the derived predicates - we have to build RTS.Types for types and predicates - we have to compile byte-code for typechecking and traversing facts So we'd like to cache these to avoid repeating the work and to share as much as possible between open DBs. The DbSchema depends on - the schema in the DB and the Pids we assigned to predicates - the SchemaIndex - the content of the DB (which predicates have facts) Schemas don't change all that much, so the first two will often be the same (or at least, there will be a few combinations in use at any given time). For a given schema, the Pids we assign should be deterministic. The dependency on the DB contents looks problematic. However, most of the work of building the DbSchema does *not* depend on the contents, only transformations and pruning derived predicates use that. So we can cache the plain DbSchema before DbContent is taken into account, and then apply the DbContent changes for each DB. -} withDbSchemaCache :: Maybe (MVar DbSchemaCache) -> Maybe (HashMap PredicateRef Pid) -> ProcessedSchema -> Maybe SchemaIndex -> IO DbSchema -> IO DbSchema withDbSchemaCache maybeCache maybePids stored maybeIndex mk = case (maybeCache, maybePids) of (Just cacheVar, Just pidMap) -> do -- only use the cache when we have stored Pids cache <- readMVar cacheVar let key = dbSchemaKey pidMap stored maybeIndex case HashMap.lookup key cache of Just schema -> do addStatValueType "glean.db.schema.cache.hit" 1 ODS.Sum vlog 2 $ "DbSchema cache hit, stored SchemaId: " <> unSchemaId (hashedSchemaId (procSchemaHashed stored)) return schema Nothing -> do addStatValueType "glean.db.schema.cache.miss" 1 ODS.Sum schema <- mk modifyMVar_ cacheVar $ return . HashMap.insert key schema return schema _otherwise -> do addStatValueType "glean.db.schema.cache.fail" 1 ODS.Sum mk mkDbSchema :: (forall k v . HashMap k v -> [(k,v)]) -> Maybe (MVar DbSchemaCache) -> Maybe (HashMap PredicateRef Pid) -> DbContent -> ProcessedSchema -> Maybe SchemaIndex -> DebugFlags -> IO DbSchema mkDbSchema toList cacheVar knownPids dbContent procStored@(ProcessedSchema source _ stored) index debug = do -- either fetch a cached DbSchema or build a new one schema <- withDbSchemaCache cacheVar knownPids procStored index buildDbSchema -- the cached DbSchema cannot depend on the DbContent, so next we add in -- any changes to DbSchema that need to depend on DbContent. transformations <- failOnLeft $ mkTransformations dbContent (predicatesById schema) procStored addedSchemas let -- to avoid the need to have different prunings for each schemaId -- we pass the union of all transformations to the 'prune' function -- such that if a predicate can be mapped to something in the db -- for any schemaId, then it is kept in the derivation query. byId = enableSelectiveDerivations dbContent <$> prune dbContent allTrans (predicatesById schema) intPid = fromIntegral . fromPid . predicatePid byPid = IntMap.fromList [ (intPid p, p) | p <- HashMap.elems byId ] allTrans = fold $ HashMap.elems transformations return $ schema { predicatesById = byId, predicatesByPid = byPid, predicatesTransformations = mkQueryTransformations byPid <$> transformations } where addedSchemas = case index of Nothing -> [] Just SchemaIndex{..} -> schemaIndexCurrent : schemaIndexOlder buildDbSchema = do let -- Assign Pids to predicates. -- - predicates in the stored schema get Pids from the StoredSchema -- (unless this is a new DB, in which case we'll assign fresh Pids) -- - predicates in the merged schema get new fresh Pids. -- -- Sort the predicates in order to ensure a deterministic result since -- 'toList' does not give us any ordering guarantees storedPids = case knownPids of Nothing -> zip (sort $ fst <$> toList (hashedPreds stored)) [lowestPid..] Just pidMap -> assign first (sort $ fst <$> toList (hashedPreds stored)) where first = maybe lowestPid succ $ maximumMay (snd <$> toList pidMap) assign _next [] = [] assign next (id : ids) = case HashMap.lookup (predicateIdRef id) pidMap of Nothing -> (id, next) : assign (succ next) ids Just pid -> (id, pid) : assign next ids nextPid = maybe lowestPid succ $ maximumMay (map snd storedPids) addedPreds = HashMap.unions $ map (hashedPreds . procSchemaHashed) addedSchemas addedPids = zip ids [nextPid ..] where ids = filter (not . isStoredPred) (HashMap.keys addedPreds) isStoredPred id = HashMap.member id (hashedPreds stored) idToPid = HashMap.fromList (storedPids ++ addedPids) let typecheck env (stored, ProcessedSchema _ resolved hashed) = typecheckSchema idToPid stored tcOpts hashed env where tcOpts = defaultTcOpts debug angleVersion angleVersion = maybe latestAngleVersion resolvedSchemaAngleVersion (listToMaybe (schemasResolved resolved)) -- typecheck all the schemas, producing PredicateDetails / TypeDetails tcEnv <- foldM typecheck emptyTcEnv ((True, procStored) : map (False,) addedSchemas) -- Check the invariant that stored predicates do not make use of -- negation or refer to derived predicates that make use of it. -- See Note [Negation in stored predicates] let usingNegation = usesOfNegation (tcEnvPredicates tcEnv) isStored = \case NoDeriving -> True Derive when _ -> case when of DeriveOnDemand -> False DerivedAndStored -> True DeriveIfEmpty -> True useOfNegation ref = HashMap.lookup ref usingNegation forM_ (tcEnvPredicates tcEnv) $ \d@PredicateDetails{..} -> when (isStored predicateDeriving) $ case useOfNegation predicateId of Nothing -> return () Just use -> let feature = case use of PatternNegation -> "negation" IfStatement -> "if-statements" in throwIO $ Thrift.Exception $ "use of " <> feature <> " is not allowed in a stored predicate: " <> showRef (predicateRef d) let predicates = snd <$> toList (tcEnvPredicates tcEnv) intPid = fromIntegral . fromPid . predicatePid byPid = IntMap.fromList [ (intPid deets, deets) | deets <- predicates ] maxPid = maybe lowestPid (Pid . fromIntegral . fst) (IntMap.lookupMax byPid) schemaEnvMap = Map.fromList [ (hashedSchemaId h, hashedSchemaEnv h) | h <- map procSchemaHashed (procStored : addedSchemas) ] latestSchema = procSchemaHashed $ case index of Just SchemaIndex{..} -> schemaIndexCurrent Nothing -> procStored dbSchemaId = hashedSchemaId stored dbSchemaAllVersion = hashedSchemaAllVersion stored predicatesById = tcEnvPredicates tcEnv derivationDepends = HashMap.fromListWith (++) [ (p, pp) | (_, p, pp) <- derivationEdges predicatesById] vlog 2 $ "DB schema " <> unSchemaId dbSchemaId <> " has " <> showt (HashMap.size (hashedTypes stored)) <> " types/" <> showt (HashMap.size (hashedPreds stored)) <> " predicates, global schema has " <> showt (sum (map (HashMap.size . hashedTypes . procSchemaHashed) addedSchemas)) <> " types/" <> showt (sum (map (HashMap.size . hashedPreds . procSchemaHashed) addedSchemas)) <> " predicates, final schema has " <> showt (HashMap.size (tcEnvTypes tcEnv)) <> " types/" <> showt (HashMap.size (tcEnvPredicates tcEnv)) <> " predicates." vlog 2 $ "all." <> showt (hashedSchemaAllVersion latestSchema) <> " = " <> unSchemaId (hashedSchemaId latestSchema) return $ DbSchema { predicatesById = predicatesById , typesById = tcEnvTypes tcEnv , schemaEnvs = schemaEnvMap , predicatesByPid = byPid , predicatesTransformations = HashMap.empty , schemaInventory = inventory predicates , schemaSource = (source, dbSchemaAllVersion, dbSchemaId) , schemaMaxPid = maxPid , schemaAllVersion = hashedSchemaAllVersion latestSchema , schemaId = hashedSchemaId latestSchema , derivationDepends = derivationDepends } mkTransformations :: DbContent -> HashMap PredicateId PredicateDetails -> ProcessedSchema -- ^ stored schema -> [ProcessedSchema] -- ^ other schemas -> Either Text (HashMap SchemaId (IntMap PredicateTransformation)) mkTransformations content byId stored otherSchemas = case content of DbWritable -> Right $ HashMap.fromList [ (schemaId, mempty) | schema <- processed , schemaId <- versionsOfAll schema ] DbReadOnly stats -> do evolutions :: [HashMap PredicateId PredicateId] <- forM processed $ \schema -> mkEvolutions stats byId [stored, schema] let allPredicateTrans :: HashMap (PredicateId, PredicateId) PredicateTransformation allPredicateTrans = HashMap.fromList [ (pair, trans) | pair@(old, new) <- nubOrd $ concatMap HashMap.toList evolutions , Just trans <- [mkPredicateTransformation detailsById old new] ] transformations :: [IntMap PredicateTransformation] transformations = [ IntMap.fromList [ (intPid, trans) | pair@(old, _) <- HashMap.toList e , Just trans <- [HashMap.lookup pair allPredicateTrans] , let intPid = fromIntegral $ fromPid $ predicatePid $ detailsById old ] | e <- evolutions ] -- Transformations are per ProcessedSchema. Here we find the -- transformations applicable to the union of the stored and a foreign -- ProcessedSchema and then we associate them with all 'all' schemas of -- the foreign ProcessedSchema. return $ HashMap.fromList [ (schemaId, trans) | (schema, trans) <- zip processed transformations , schemaId <- versionsOfAll schema ] where processed :: [ProcessedSchema] processed = stored : otherSchemas versionsOfAll :: ProcessedSchema -> [SchemaId] versionsOfAll s = [hashedSchemaId (procSchemaHashed s)] detailsById id = HashMap.lookupDefault err id byId where err = error $ "mkEvolutions: " <> show (displayDefault id) -- Calculate predicate evolutions for a set of ProcessedSchema mkEvolutions :: HashMap Pid PredicateStats -> HashMap PredicateId PredicateDetails -> [ProcessedSchema] -> Either Text (HashMap PredicateId PredicateId) mkEvolutions stats byId schemas = do manual <- calcSchemaEvolutions hasFacts bySchemaRef <$> directEvolutions schemas auto <- calcAutoEvolutions hasFacts byPredRef evolutions <- calcEvolutions predicateIdRef byPredRef bySchemaRef manual auto validateEvolutions (Just hasFacts) types preds evolutions return evolutions where (types, preds) = definitions schemas hasFacts id = fromMaybe False $ do details <- HashMap.lookup id byId let pid = predicatePid details pstats <- HashMap.lookup pid stats return $ predicateStats_count pstats > 0 byPredRef :: Map PredicateRef [PredicateId] byPredRef = predicatesByRef schemas bySchemaRef :: Map SchemaRef (VisiblePredicates PredicateId) bySchemaRef = predicatesBySchemaRef schemas -- | Predicates (which can be evolved) by predicate ref predicatesByRef :: [ProcessedSchema] -> Map PredicateRef [PredicateId] predicatesByRef schemas = Set.toList <$> Map.fromListWith (<>) [ (predicateIdRef pred, Set.singleton pred) | processed <- schemas , (pred, def) <- HashMap.toList $ hashedPreds $ procSchemaHashed processed , canBeStored (predicateDefDeriving def) ] predicatesBySchemaRef :: [ProcessedSchema] -> Map SchemaRef (VisiblePredicates PredicateId) predicatesBySchemaRef schemas = Map.fromListWith (<>) [ ( schemaRef resolved, visible) | processed <- schemas , let env = predRefToId $ schemaRefToIdEnv $ procSchemaHashed processed refToId ref = HashMap.lookupDefault (error "predicatesBySchemaRef") ref env , resolved <- schemasResolved $ procSchemaResolved processed , let visible = mapVisible refToId (visiblePredicates resolved) ] -- | Evolves relationships as per source code annotations. -- -- We only consider evolution annotations in the current schema. This is so -- that we don't need to wait until schemas are garbage-collected to be able to -- remove an evolves relationship. directEvolutions :: [ProcessedSchema] -> Either Text (Map SchemaRef SchemaRef) directEvolutions schemas = directSchemaEvolutions $ concatMap (schemasResolved . procSchemaResolved) schemas -- | Calculate predicate evolutions caused by version-less schema migrations. calcAutoEvolutions :: (PredicateId -> Bool) -> Map PredicateRef [PredicateId] -- ^ all SchemaIndex predicates -> Either Text (Map PredicateRef PredicateId) calcAutoEvolutions hasFacts byRef = Map.traverseMaybeWithKey choose byRef where -- from a list of predicates with the same PredicateRef, -- choose the one that has facts in the DB. choose :: PredicateRef -> [PredicateId] -> Either Text (Maybe PredicateId) choose ref xs = case filter hasFacts xs of [] -> Right Nothing [y] -> Right (Just y) ys -> Left $ Text.unwords [ "db has facts for multiple instances of", showRef ref, ":" , Text.unwords (showRef <$> ys) ] -- | Calculate which schema should effectively evolve which by taking the DB's -- content into account. -- -- If A.3 evolves A.2 which evolves A.1, but we only have facts for predicates -- from A.3, then we end-up with both A.1 and A.2 being directly evolved by -- A.3. calcSchemaEvolutions :: (PredicateId -> Bool) -> Map SchemaRef (VisiblePredicates PredicateId) -> Map SchemaRef SchemaRef -> Map SchemaRef SchemaRef -- ^ value evolves key calcSchemaEvolutions hasFacts bySchemaRef direct = Map.fromList [ (ref, target) | ref <- Map.keys bySchemaRef , Just target <- [transformationTarget ref] ] where -- map each schema to the schema its queries should be transformed into transformationTarget :: SchemaRef -> Maybe SchemaRef transformationTarget src | hasFactsInDb src = Nothing -- Pick closest related schema with facts in the db. -- If a schema has only derived facts there is no need to -- transform it since the original version will still work. | otherwise = find hasFactsInDb newerSchemas <|> find hasFactsInDb olderSchemas where -- if we have 'b evolves a' and 'c evolves b' -- given 'a' produces [b, c] newerSchemas = close (`Map.lookup` direct) src -- given 'c' produces [b, a] olderSchemas = close (`Map.lookup` reverseDirect) src reverseDirect :: Map SchemaRef SchemaRef reverseDirect = Map.fromList $ map swap $ Map.toList direct hasFactsInDb :: SchemaRef -> Bool hasFactsInDb sref = sref `Set.member` withFacts withFacts :: Set SchemaRef withFacts = Map.keysSet $ Map.filter (any hasFacts . visibleDefined) bySchemaRef close :: (a -> Maybe a) -> a -> [a] close next x = case next x of Just y -> y : close next y Nothing -> [] -- | Optimise derivation queries for fetching facts given the particular db's -- content. prune :: DbContent -> IntMap PredicateTransformation -> HashMap PredicateId PredicateDetails -> HashMap PredicateId PredicateDetails prune DbWritable _ pmap = pmap -- don't change derivations while db is still writable prune (DbReadOnly stats) transformations pmap = pruneDerivations hasFacts pmap where -- A predicate has facts in the db if it or the predicate it would be -- transformed into has facts in the db. hasFacts predId = fromMaybe False $ do details <- HashMap.lookup predId pmap let pid = predicatePid details availablePid = fromMaybe pid $ transformationFor pid pstats <- HashMap.lookup availablePid stats return $ predicateStats_count pstats > 0 transformationFor :: Pid -> Maybe Pid transformationFor pid = do trans <- IntMap.lookup (fromIntegral $ fromPid pid) transformations PidRef pid _ <- return $ tAvailable trans return pid -- | Compare schemas for compatibility. That is, if schema A defines a -- predicate or type P, then schema B must have an identical -- definition of P. -- -- This is used to determine whether we can safely use schema B for a -- stacked DB when the base DB is using schema A. It's useful to be -- able to do this, because we might want to add a new derived -- predicate to B for testing purposes, but it's only safe if we -- didn't change any predicates defined by A, because those will be -- mapped to the same Pids in the stacked DB. -- compareSchemaPredicates :: [PredicateId] -> [PredicateId] -> [Text] compareSchemaPredicates predsA predsB = errors where (errors, _) = flip execState ([], HashMap.empty) $ forM_ (predsA ++ predsB) $ \id -> do (errs, pids) <- State.get let (err, pids') = compareInsert "predicate" (predicateIdRef id) id pids State.put (err <> errs, pids') compareInsert :: (Hashable a, Eq a, Eq b, ShowRef a) => Text -> a -> b -> HashMap a b -> ([Text], HashMap a b) compareInsert thing ref id tids | Just id' <- HashMap.lookup ref tids = if id == id' then ([], tids) else ([thing <> " " <> showRef ref <> " has changed"], tids) | otherwise = ([], HashMap.insert ref id tids) mkPredicateTransformation :: Eq id => (id -> PredicateDetails) -> id -> id -> Maybe PredicateTransformation mkPredicateTransformation detailsFor requestedId availableId | requestedId == availableId = Nothing | otherwise = Just PredicateTransformation { tRequested = pidRef requested , tAvailable = pidRef available , tTransformFactBack = transformFact available requested , transformKeyPattern = transformPattern (key requested) (key available) , transformValuePattern = transformPattern (val requested) (val available) } where key = predicateKeyType val = predicateValueType available = detailsFor availableId requested = detailsFor requestedId {- Note [Negation in stored predicates] Facts derived based on the absence of other facts could be invalidated when new facts are added to an incremental database. The work required to identify which fact to invalidate is close to that of re-deriving those predicates all over again. To keep the overhead on incremental databases as low as possible, stored derived predicates are not allowed to use negation. -} typecheckSchema :: HashMap PredicateId Pid -> Bool -- ^ stored? -> TcOpts -> HashedSchema -> TcEnv -> IO TcEnv typecheckSchema idToPid stored tcOpts HashedSchema{..} tcEnv = do let -- NOTE: We store Maybe Type rather than filtering out those typedefs -- for which rtsType returns Nothing here because we need to be sufficiently -- lazy to tie the knot in rtsType. We get Nothing for typedefs which refer -- to predicates that are not in the database but do exist in the 'Schemas' -- (cf. 'mkDbSchema') or, more specifically, in `refToPid`, and then also -- for typedefs which refer to those typedefs and so on. -- -- Also note we need a lazy HashMap here to avoid forcing the -- elements too early, which would also cause a loop. typedefs :: Lazy.HashMap.HashMap TypeId (Maybe RTS.Type) typedefs = Lazy.HashMap.fromList [ (id, rtsType (Schema.rmLocType $ typeDefType def)) | (id, def) <- HashMap.toList hashedTypes ] lookupType :: TypeId -> Maybe RTS.Type lookupType ref = case Lazy.HashMap.lookup ref typedefs of Nothing -> Nothing Just r -> r rtsType = mkRtsType lookupType (`HashMap.lookup` idToPid) let -- If we have already seen this predicate, no need to -- typecheck it again. new = HashMap.difference hashedPreds (tcEnvPredicates tcEnv) new' = HashMap.map rmTypeLocPredDef new mkPredicateDetails (id,def) = case ( HashMap.lookup (predicateDefRef def) idToPid , rtsType $ predicateDefKeyType def , rtsType $ predicateDefValueType def ) of (Just pid, Just keyType, Just valueType) -> do typecheck <- checkSignature keyType valueType traversal <- genTraversal keyType valueType return $ Just PredicateDetails { predicatePid = pid , predicateId = id , predicateSchema = def , predicateKeyType = keyType , predicateValueType = valueType , predicateTraversal = traversal , predicateTypecheck = typecheck , predicateDeriving = NoDeriving , predicateInStoredSchema = stored } _ -> return Nothing newDetails <- catMaybes <$> mapM mkPredicateDetails (HashMap.toList new') let types = HashMap.fromList [ (id, TypeDetails id ty) | (id, Just ty) <- HashMap.toList typedefs ] insertPred details m = HashMap.insert (predicateId details) details m -- Build the environment in which we typecheck predicates from -- this schema. env = TcEnv { tcEnvPredicates = foldr insertPred (tcEnvPredicates tcEnv) newDetails , tcEnvTypes = HashMap.union types (tcEnvTypes tcEnv) } -- Typecheck the derivation for a predicate tcDeriving details = do drv <- do r <- runExceptT $ typecheckDeriving env tcOpts rtsType details (predicateDefDeriving (predicateSchema details)) either (throwIO . ErrorCall . Text.unpack) return r return details { predicateDeriving = drv } -- typecheck the derivations for newly added predicates finalDetails <- mapM tcDeriving newDetails let finalPreds = foldr insertPred (tcEnvPredicates tcEnv) finalDetails -- Check the invariant that stored predicates do not refer to derived -- predicates. We have to defer this check until last, because -- derivations may have been attached to existing predicates now. forM_ finalDetails $ \PredicateDetails{..} -> do let check = do checkStoredType finalPreds types predicateId predicateKeyType checkStoredType finalPreds types predicateId predicateValueType case predicateDeriving of NoDeriving -> return () -- In principle we should really enforce that predicates without -- a derivation don't refer to non-stored derived predicates. -- However, this can crop up in benign ways with backwards -- compatibility. For example, -- -- schema S.2 : S.1 { -- predicate P : T default P.1 ... -- predicate Q : P -- } -- -- if we have no facts of P.2 then the derivation would be -- enabled, and Q doesn't make sense. But there won't be any -- facts of Q either, so it doesn't matter. Derive DerivedAndStored _ -> check _ -> return () return env { tcEnvPredicates = finalPreds } -- | Enable or disable a DeriveIfEmpty derivation based on the DB -- contents. enableSelectiveDerivations :: DbContent -> PredicateDetails -> PredicateDetails enableSelectiveDerivations dbContent details = let disable = details { predicateDeriving = NoDeriving } in if | Derive DeriveIfEmpty _ <- predicateDeriving details -> case dbContent of DbWritable -> disable DbReadOnly stats -> do case HashMap.lookup (predicatePid details) stats of Nothing -> details Just stats | predicateStats_count stats == 0 -> details | otherwise -> disable | otherwise -> details usesOfNegation :: HashMap PredicateId PredicateDetails -> HashMap PredicateId UseOfNegation usesOfNegation preds = foldl' recordUseOfNegation mempty derivations where recordUseOfNegation usesNegation (mUseOfNeg, ref, deps) | Just use <- mUseOfNeg = HashMap.insert ref use usesNegation | Just use <- firstJust (`HashMap.lookup` usesNegation) deps = HashMap.insert ref use usesNegation | otherwise -- remove from Map in case a derived predicate in this schema that does -- not use negation overrode a derived predicate that used negation. = HashMap.delete ref usesNegation -- derivations in dependency order with flag for use of negation derivations :: [(Maybe UseOfNegation, PredicateId, [PredicateId])] derivations = toPred . getNode <$> reverse (topSort graph) (graph, getNode, _) = graphFromEdges $ derivationEdges preds toPred (b, k, ks) = (tcQueryUsesNegation b, k, ks) derivationEdges :: HashMap PredicateId PredicateDetails -> [(TcQuery, PredicateId, [PredicateId])] derivationEdges preds = [ (query, ref, Set.toList (tcQueryDeps query)) | (ref, details) <- HashMap.toList preds , Derive _ (QueryWithInfo query _ _ _) <- [predicateDeriving details] ] -- | Check that the type of a stored predicate doesn't refer to any -- derived predicates. checkStoredType :: HashMap PredicateId PredicateDetails -> HashMap TypeId TypeDetails -> PredicateId -> RTS.Type -> IO () checkStoredType preds types def ty = go ty where go (ArrayTy ty) = go ty go (MaybeTy ty) = go ty go (RecordTy fields) = forM_ fields $ \(FieldDef _ ty) -> go ty go (SumTy fields) = forM_ fields $ \(FieldDef _ ty) -> go ty go (PredicateTy _ (PidRef _ ref)) = case HashMap.lookup ref preds of Just PredicateDetails{..} -> case predicateDeriving of Derive DerivedAndStored _ -> return () Derive _ _ -> throwIO $ ErrorCall $ show $ "stored predicate " <> displayDefault def <> " refers to non-stored derived predicate " <> displayDefault ref _ -> return () Nothing -> error $ "checkStoredType: " <> Text.unpack (showRef ref) go (NamedTy _ (ExpandedType ref _)) = case HashMap.lookup ref types of Just TypeDetails{..} -> go typeType Nothing -> error $ "checkStoredType: " <> Text.unpack (showRef ref) go _ = return () -- | Check that a new schema can be processed and merged with the -- current schema index. This is used to ensure that a running server -- will not choke on a new schema before we deploy it. validateNewSchema :: ServerConfig.Config -> ByteString -> SchemaIndex -> DebugFlags -> IO () validateNewSchema _config newSrc current debug = do schema <- case processSchema Nothing newSrc of Left msg -> throwIO $ Thrift.Exception $ Text.pack msg Right resolved -> return resolved let curDbSchema <- mkDbSchema HashMap.toList Nothing Nothing readWriteContent schema Nothing debug void $ newMergedDbSchema Nothing (toStoredSchema curDbSchema) current readWriteContent debug failOnLeft :: Either Text a -> IO a failOnLeft = \case Right x -> return x Left err -> throwIO $ ErrorCall $ Text.unpack err -- | Check that the current schema in the SchemaIndex is compatible -- with each of the older schema instances. This is the validity check -- when adding a new schema instance. validateNewSchemaInstance :: SchemaIndex -> IO () validateNewSchemaInstance (SchemaIndex curr older) = failOnLeft $ do -- Prevent recursive predicates. Because this is a constraint we will remove -- in the future, for now we only check this when a new schema instance is to -- be generated. mapM_ checkRecursiveDefinitions (schemasResolved $ procSchemaResolved curr) let auto = Map.mapMaybe listToMaybe $ predicatesByRef [curr] direct <- directEvolutions schemas evolutions <- calcEvolutions predicateIdRef (predicatesByRef schemas) (predicatesBySchemaRef schemas) direct auto if performExhaustiveCheck then validateEvolutions Nothing types preds (transitive evolutions) else validateEvolutions Nothing types preds evolutions where schemas = curr : older (types, preds) = definitions schemas -- I P evolves Q and Q evolves R, we expect that `P evolves R` to work. If Q -- removes a field from R and P adds it again with a different type then `P -- evolves R` will not hold. -- -- This flag controls whether we should check for compatibility with all -- transitively evolved predicates. For now we don't have an efficient way to -- perform the check. This is O(n^2), and therefore off by default. performExhaustiveCheck = False -- add transitive relations explicitly to the map -- HashMap.fromList [(1,2),(2,3)] -- becomes -- HashMap.fromList [(1,2),(2,3),(1,3)] transitive :: (Hashable a, Eq a) => HashMap a a -> HashMap a a transitive m = HashMap.fromList [ (key, val) | key <- HashMap.keys m , val <- close (`HashMap.lookup` m) key ] -- | Detect recursion and co-recursion in predicate derivations. -- We don't check default derived predicates because whether they will in fact -- recurse or not depends on the db's content. checkRecursiveDefinitions :: ResolvedSchemaRef -> Either Text () checkRecursiveDefinitions resolved = unless (null recursiveDefinitions) $ Left $ Text.unlines $ "found cycles in predicate derivations: " : [ Text.intercalate " -> " $ map showRef (x:xs ++ [x]) | x:xs <- recursiveDefinitions ] where recursiveDefinitions :: [[PredicateRef]] recursiveDefinitions = cycles (HashMap.keys deps) (\p -> HashMap.lookupDefault [] p deps) -- derived predicate to derived predicate dependency map deps :: HashMap PredicateRef [PredicateRef] deps = fmap (filter hasDerivation . preds) derivationQueries hasDerivation :: PredicateRef -> Bool hasDerivation ref = ref `HashMap.member` derivationQueries derivationQueries :: HashMap PredicateRef (Query_ PredicateRef TypeRef) derivationQueries = HashMap.mapMaybe (toQuery . derivingDefDeriveInfo ) (resolvedSchemaDeriving resolved) where toQuery = \case NoDeriving -> Nothing Derive when query -> case when of DeriveIfEmpty -> Nothing DeriveOnDemand -> Just query DerivedAndStored -> Just query -- predicates referenced in the query preds :: Query_ PredicateRef TypeRef -> [PredicateRef] preds query = HashSet.toList $ bifoldMap HashSet.singleton references query cycles :: Ord a => [a] -> (a -> [a]) -> [[a]] cycles roots children = [ cycle | CyclicSCC cycle <- scc] where scc = stronglyConnComp [ (node, node, children node) | node <- roots ] -- predicates referenced by type references :: TypeRef -> HashSet PredicateRef references tref = fromMaybe mempty $ do TypeDef _ ty _ <- HashMap.lookup tref (resolvedSchemaTypes resolved) return $ bifoldMap HashSet.singleton references ty definitions :: [ProcessedSchema] -> (HashMap TypeId TypeDef, HashMap PredicateId PredicateDef) definitions schemas = (types, preds) where preds :: HashMap PredicateId PredicateDef preds = HashMap.fromList [ (id, def) | processed <- schemas , (id, def) <- HashMap.toList $ hashedPreds $ procSchemaHashed processed ] types :: HashMap TypeId TypeDef types = HashMap.unions $ hashedTypes . procSchemaHashed <$> schemas -- | Interrogate the schema associated with a DB getSchemaInfo :: DbSchema -> SchemaIndex -> GetSchemaInfo -> IO SchemaInfo getSchemaInfo dbSchema index@SchemaIndex{..} GetSchemaInfo{..} = do let pids = Map.fromList $ [ (fromPid $ predicatePid p, predicateRef p) | p <- HashMap.elems (predicatesById dbSchema) ] !(source, sourceVer, sourceId) = schemaSource dbSchema storedSchema = renderSchemaSource source schemaIds = toStoredVersions (schemaAllVersion dbSchema) (schemaId dbSchema) dbSchemaIds = toStoredVersions sourceVer sourceId otherSchemaIds = [ toStoredVersions (hashedSchemaAllVersion procSchemaHashed) (hashedSchemaId procSchemaHashed) | ProcessedSchema{..} <- schemaIndexOlder , hashedSchemaId procSchemaHashed /= sourceId ] fromPredicateId predId = fromPid $ predicatePid(predicatesById dbSchema HashMap.! predId) derivationDependencies = Map.fromList [ (fromPredicateId p, fromPredicateId <$> pp) | (p, pp) <- HashMap.toList (derivationDepends dbSchema) ] source <- if getSchemaInfo_omit_source then return "" else case getSchemaInfo_select of SelectSchema_stored{} -> return storedSchema SelectSchema_current{} -> findSchemaSource index dbSchema (schemaId dbSchema) SelectSchema_schema_id sid -> findSchemaSource index dbSchema sid _ -> return storedSchema return SchemaInfo { schemaInfo_schema = source , schemaInfo_predicateIds = pids , schemaInfo_schemaIds = schemaIds , schemaInfo_dbSchemaIds = dbSchemaIds , schemaInfo_otherSchemaIds = otherSchemaIds , schemaInfo_derivationDependencies = derivationDependencies } getSchemaInfoForSchema :: SchemaIndex -> SchemaId -> DebugFlags -> IO SchemaInfo getSchemaInfoForSchema index sid debug = do processed <- maybe (throwIO $ userError "schema id not found") return $ findSchemaInIndex index sid dbSchema <- mkDbSchema HashMap.toList Nothing Nothing readWriteContent processed Nothing debug let schemaSource = renderSchemaSource (procSchemaSource processed) res <- getSchemaInfo dbSchema index (GetSchemaInfo (SelectSchema_schema_id sid) True) return res{schemaInfo_schema = schemaSource} -- The version map in the 'StoredSchema', the 'SchemaIndex' and the -- 'SchemaInfo' is a @Map Text Version@ whereas the one we use -- internally is the inverse of this: @IntMap SchemaId@. These two -- functions convert back and forth. -- -- TODO: maybe they should be the same. toStoredVersions :: Version -> SchemaId -> Map Text Version toStoredVersions ver id = Map.fromList [ (unSchemaId id, fromIntegral ver) ] fromStoredVersions :: Map Text Version -> IntMap SchemaId fromStoredVersions versions = IntMap.fromList [ (fromIntegral ver, SchemaId id) | (id, ver) <- Map.toList versions ] renderSchemaSource :: SourceSchemas -> ByteString renderSchemaSource parsed = Text.encodeUtf8 $ renderStrict $ layoutCompact $ displayDefault parsed -- Try to find the source for the desired schema. It could be in -- our SchemaIndex, or it could be the schema stored in the DB if -- we're looking at an old DB. findSchemaSource :: Glean.Database.Config.SchemaIndex -> DbSchema -> Thrift.SchemaId -> IO ByteString findSchemaSource index dbSchema sid = do let inDB = findSchemaInDB dbSchema sid inIndex = findSchemaInIndex index sid case inDB <|> fmap procSchemaSource inIndex of Just schema -> return $ renderSchemaSource schema Nothing -> throwIO $ Thrift.Exception $ "SchemaId not found: " <> Thrift.unSchemaId sid findSchemaInDB :: DbSchema -> Thrift.SchemaId -> Maybe SourceSchemas findSchemaInDB dbSchema sid = listToMaybe matches where matches = [ source | (source, _, sourceId) <- [ schemaSource dbSchema ] , sid == sourceId ] findSchemaInIndex :: Glean.Database.Config.SchemaIndex -> Thrift.SchemaId -> Maybe ProcessedSchema findSchemaInIndex Glean.Database.Config.SchemaIndex{..} sid = listToMaybe matches where matches = [ proc | proc <- schemaIndexCurrent : schemaIndexOlder , sid == hashedSchemaId (procSchemaHashed proc) ]