{-
  Copyright (c) Meta Platforms, Inc. and affiliates.
  All rights reserved.

  This source code is licensed under the BSD-style license found in the
  LICENSE file in the root directory of this source tree.
-}

{-# LANGUAGE DeriveFunctor #-}

module Glean.Query.Incremental
  ( makeIncremental
  )
  where

import Data.Maybe (mapMaybe, catMaybes)

import Glean.Query.Codegen.Types
import Glean.RTS.Types (Pid, PidRef(..))
import Glean.RTS.Term

-- | Make a predicate derivation query efficiently derive new facts on an
-- incremental database.
-- See Note [Incremental derivation - Strategy]
makeIncremental
  :: (SeekSection -> Pid -> Bool)
  -- ^ whether a predicate has facts in the section or not.
  -> CodegenQuery
  -> CodegenQuery
makeIncremental hasFacts (QueryWithInfo (CgQuery h body) v g t) =
 QueryWithInfo (CgQuery h (expandGenerators hasFacts body)) v g t

-- | Represents the output from expanding statements such that they will
-- perform a combination of querying the base and incremental database.
data Incremental a = Incremental
  { seekingStacked :: Maybe a
    -- ^ invariant: contains at least one generator with SeekOnStacked
  , seekingOnlyBase :: Maybe a
    -- ^ invariant: contains 0 or more generators and all with SeekOnBase
  }
  deriving (Functor)

falseStmt  :: CgStatement
falseStmt = CgDisjunction []

-- | See Note [Incremental derivation - Implementation]
expandGenerators
  :: (SeekSection -> Pid -> Bool)
  -> [CgStatement]
  -> [CgStatement]
expandGenerators hasFacts stmts =
  case seekingStacked $ expandSequence stmts of
    Nothing -> [falseStmt]
    Just xs -> xs
  where
  expandSequence :: [CgStatement] -> Incremental [CgStatement]
  expandSequence [] = Incremental Nothing (Just [])
  expandSequence (x:xs) = Incremental new old
    where
      Incremental mnew mold = expandStatement x
      Incremental mnews molds = expandSequence xs
      new = nonnull $ disjunctionToSequence $ catMaybes
        [ do new <- mnew
             return (new:xs)
        , do old <- mold
             news <- mnews
             return (old:news)
        ]
      old = do
        old <- mold
        olds <- molds
        return (old:olds)

  expandStatement :: CgStatement -> Incremental CgStatement
  expandStatement stmt = case stmt of
    CgStatement pat (FactGenerator pref key val SeekOnAllFacts)
      -- when pat is known, this is a lookup not a seek so we
      -- don't need to expand it.
      | not (known pat) ->
      let PidRef pid _ = pref
          generator where_ =
            if hasFacts where_ pid
            then Just $ CgStatement pat (FactGenerator pref key val where_)
            else Nothing
      in
      Incremental (generator SeekOnStacked) (generator SeekOnBase)
    CgStatement{} ->
      Incremental Nothing (Just stmt)
    CgAllStatement{} -> Incremental Nothing (Just stmt)
    CgDisjunction stmtss ->
      CgDisjunction <$> expandDisjunction stmtss
    CgNegation{} ->
      error "unexpected negation in stored derived predicate"
    CgConditional{} ->
      error "unexpected if statement in stored derived predicate"

  known (Ref MatchFid{}) = True
  known (Ref MatchVar{}) = True
  known Nat{} = True
  known (Ref (MatchAnd p q)) = known p || known q
  known _ = False

  expandDisjunction :: [[CgStatement]] -> Incremental [[CgStatement]]
  expandDisjunction branches = Incremental (nonnull news) (nonnull olds)
    where
      ts = map expandSequence branches
      news = mapMaybe seekingStacked ts
      olds = mapMaybe seekingOnlyBase ts

  disjunctionToSequence ::  [[CgStatement]] -> [CgStatement]
  disjunctionToSequence = \case
    [] -> []
    [x] -> x
    xs -> [CgDisjunction xs]

  nonnull :: [a] -> Maybe [a]
  nonnull [] = Nothing
  nonnull xs = Just xs

{- Note [Incremental derivation - Strategy]

When deriving stored predicates on stacked databases we want to make sure that
we only derive facts that use the newly available data and that we are not
re-deriving everything that was derived before.

We can do this by making sure that the derivation query will always look at
fact combinations involving new facts in the stacked database.

We could require that at least one field of new derived facts come from the
stacked database, but that would be too restrictive. Consider this case

  predicate Path2 { a: Node, b: Node }
    { A, B } where
      Edge { A, K };
      Edge { K, B }

New edges should entail new `Path2` facts despite `Edge` not being part of
`Path2`'s key or value.

Instead we require that at least one FactGenerator in every query path
be restricted only to the stacked db.

-}

{- Note [Incremental derivation - Implementation]

When running derivations in an incremental database we want to enforce that:

 1. Every derived predicate be generated using some data coming from the new
    increment fact set.

 2. We must not repeat the derivation process already performend on the base db.

This can be achieved by querying all possible combinations of seeking on base
and incremental db for every fact generator. Rule 2 can be enforced by not
running the case where all generators query the base db. This is key as we
expect that branch to take much longer than all others.

We may try to transform each fact generator into a disjunction. Given the query:

   P A; Q B; R C;

naively transform it into

   (Pbase A | Pincr A);
   (Pbase B | Pincr B);
   (Rbase C | Rincr C);

This is problematic becase we cannot remove the case where all generators query
the base db (rule 2).

We may try another approach. Generate all possible combinations and drop the
unwanted case.

   (Pbase A; Qbase B; Rbase C)  -- this one is dropped
   (Pbase A; Qbase B; Rincr C)
   (Pbase A; Qincr B; Rbase C)
   (Pbase A; Qincr B; Rincr C)
   (Pincr A; Qbase B; Rbase C)
   (Pincr A; Qincr B; Rbase C)
   (Pincr A; Qbase B; Rincr C)
   (Pincr A; Qincr B; Rincr C)

this has two problems: the bytecode size complexity is O(2^n) and it is
inefficient as we are not doing any sharing between different paths.

Finally, we opt for the approach of making a tree out of it by branching on
every fact generator.

   ( Pincr A;
     ( Qbase B;
         ( Rbase C
         | Rincr C
         )
     ) |
     ( Qincr B;
         ( Rbase C
         | Rincr C
         )
     )
   ) |
   ( Pbase A;
     ( Qbase B;
         ( Rbase C -- we remove this node to have no branch querying only base
         | Rincr C
         )
     ) |
     ( Qincr B;
         ( Rbase C
         | Rincr C
         )
     )
   )

The bytecode size is still O(2^n) but the query is more efficient as we now
have more sharing.
We optimise it by realising that (Pincr A | Pbase A) == P A.

   ( Pincr A; Q B; R C ) |
   ( Pbase A;
      ( Qincr B; R C) |
      ( Qbase B; Rincr C)
   )

This leads fact generator expansion to two cases
  * Seek on increment db and proceed with the original query
  * Seek on the base db and process the rest of the query further.

The resulting algorithm is O(n) in time and space, but the output tree will
generate bytecode that is O(n^2) where n is the number of fact generators in
the longest query branch.

## Handling disjunctions

To have all possible combinations of seeking base and stacked dbs for
disjunctions we will need to separate the case where all branches seek
only on base from the other cases.

  (P A | Q B); R C;

becomes

     ( ( Pincr A | Qincr B ); R C )
   | ( ( Pbase A | Qbase B ); Rincr C )

-}