distinct_on for BigQuery

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/3191 GitOrigin-RevId: e9f1c814b2caf09946389c4bcb30a0c42277abaf
2024-12-15 09:22:43 +03:00 · 2022-01-17 13:01:25 +03:00 · 2022-01-17 13:01:25 +03:00 · 7beb15b4a3
commit 7beb15b4a3
parent bd8766171f
18 changed files with 501 additions and 153 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@
 - server: in mssql transactions, rollback only if the transaction is active
 - server: add request and response bodies to OpenAPI specification of REST endpoints
 - server: implement upsert mutations for MS SQL Server (close #7864)
+- server: bigquery: implement `distinct_on`.

 ### Bug fixes and improvements
 (Add entries below in the order of server, console, cli, docs, others)
--- a/server/src-lib/Hasura/Backends/BigQuery/DDL/Source.hs
+++ b/server/src-lib/Hasura/Backends/BigQuery/DDL/Source.hs
@ -12,6 +12,7 @@ import Data.Aeson qualified as J
 import Data.ByteString.Lazy qualified as L
 import Data.Environment qualified as Env
 import Data.HashMap.Strict qualified as HM
+import Data.Int qualified as Int
 import Data.Text qualified as T
 import Data.Text.Encoding qualified as T
 import Data.Time.Clock.System
@ -28,7 +29,7 @@ import Hasura.RQL.Types.SourceCustomization
 import Hasura.RQL.Types.Table
 import Hasura.SQL.Backend

-defaultGlobalSelectLimit :: Int
+defaultGlobalSelectLimit :: Int.Int64
 defaultGlobalSelectLimit = 1000

 resolveSourceConfig ::
--- a/server/src-lib/Hasura/Backends/BigQuery/FromIr.hs
+++ b/server/src-lib/Hasura/Backends/BigQuery/FromIr.hs
@ -16,10 +16,10 @@ where

 import Control.Monad.Validate
 import Data.HashMap.Strict qualified as HM
+import Data.Int qualified as Int
 import Data.List.NonEmpty qualified as NE
 import Data.Map.Strict (Map)
 import Data.Map.Strict qualified as M
-import Data.Proxy
 import Data.Text qualified as T
 import Hasura.Backends.BigQuery.Instances.Types ()
 import Hasura.Backends.BigQuery.Source (BigQuerySourceConfig (..))
@ -49,7 +49,6 @@ data Error
  | UnsupportedSQLExp Expression
  | UnsupportedDistinctOn
  | InvalidIntegerishSql Expression
-  | DistinctIsn'tSupported
  | ConnectionsNotSupported
  | ActionsNotSupported

@ -69,7 +68,6 @@ instance Show Error where
      UnsupportedSQLExp {} -> "UnsupportedSQLExp"
      UnsupportedDistinctOn {} -> "UnsupportedDistinctOn"
      InvalidIntegerishSql {} -> "InvalidIntegerishSql"
-      DistinctIsn'tSupported {} -> "DistinctIsn'tSupported"
      ConnectionsNotSupported {} -> "ConnectionsNotSupported"
      ActionsNotSupported {} -> "ActionsNotSupported"

@ -140,7 +138,7 @@ mkSQLSelect ::
  Ir.AnnSelectG 'BigQuery Void (Ir.AnnFieldG 'BigQuery Void) Expression ->
  FromIr BigQuery.Select
 mkSQLSelect jsonAggSelect annSimpleSel = do
-  select <- fromSelectRows annSimpleSel
+  select <- noExtraPartitionFields <$> fromSelectRows annSimpleSel
  pure
    ( select
        { selectCardinality =
@ -177,8 +175,22 @@ fromUnnestedJSON json columns _fields = do
        )
    )

-fromSelectRows :: Ir.AnnSelectG 'BigQuery Void (Ir.AnnFieldG 'BigQuery Void) Expression -> FromIr BigQuery.Select
+fromSelectRows :: Ir.AnnSelectG 'BigQuery Void (Ir.AnnFieldG 'BigQuery Void) Expression -> FromIr BigQuery.PartitionableSelect
 fromSelectRows annSelectG = do
+  let Ir.AnnSelectG
+        { _asnFields = fields,
+          _asnFrom = from,
+          _asnPerm = perm,
+          _asnArgs = args,
+          _asnStrfyNum = num
+        } = annSelectG
+      Ir.TablePerm {_tpLimit = mPermLimit, _tpFilter = permFilter} = perm
+      permissionBasedTop =
+        maybe NoTop (Top . fromIntegral) mPermLimit
+      stringifyNumbers =
+        if num
+          then StringifyNumbers
+          else LeaveNumbersAlone
  selectFrom <-
    case from of
      Ir.FromTable qualifiedObject -> fromQualifiedTable qualifiedObject
@ -190,7 +202,7 @@ fromSelectRows annSelectG = do
      argsWhere,
      argsJoins,
      argsTop,
-      argsDistinct = Proxy,
+      argsDistinct,
      argsOffset,
      argsExistingJoins
    } <-
@ -205,35 +217,119 @@ fromSelectRows annSelectG = do
    NE.nonEmpty (concatMap (toList . fieldSourceProjections True) fieldSources)
      `onNothing` refute (pure NoProjectionFields)
  globalTop <- getGlobalTop
+  let select =
+        Select
+          { selectCardinality = Many,
+            selectFinalWantedFields = pure (fieldTextNames fields),
+            selectGroupBy = mempty,
+            selectOrderBy = argsOrderBy,
+            -- We DO APPLY the global top here, because this pulls down all rows.
+            selectTop = globalTop <> permissionBasedTop <> argsTop,
+            selectProjections,
+            selectFrom,
+            selectJoins = argsJoins <> concat (mapMaybe fieldSourceJoins fieldSources),
+            selectWhere = argsWhere <> Where [filterExpression],
+            selectOffset = int64Expr <$> argsOffset
+          }
+  case argsDistinct of
+    Nothing ->
+      pure $ simpleSelect select
+    Just distinct ->
+      simulateDistinctOn select distinct argsOrderBy
+
+-- | Simulates DISTINCT ON for BigQuery using ROW_NUMBER() partitioned over distinct fields
+--
+-- Example:
+--
+-- For a GraphQL query:
+-- @
+-- hasura_test_article(distinct_on: author_id, order_by: [{author_id: asc}, {created_at: asc}]) {
+--   id
+--   title
+-- }
+-- @
+--
+-- it should produce from a query without a `distinct_on` clause:
+--
+-- SELECT `id`, `title`
+-- FROM `hasura_test`.`article`
+-- ORDER BY `author_id` ASC, `created_at` ASC
+--
+-- a query of the following form:
+--
+-- SELECT `id`, `title`
+-- FROM (SELECT *,
+--              ROW_NUMBER() OVER (PARTITION BY `author_id` ORDER BY `created_at` ASC) as `idx1`
+--       FROM `hasura_test`.`article`) as `t_article1`
+-- WHERE (`t_article1`.`idx1` = 1)
+-- ORDER BY `t_article1`.`author_id` ASC
+--
+-- Note: this method returns PartitionableSelect as it could be joined using an array relation
+-- which requires extra fields added to the PARTITION BY clause to return proper results
+simulateDistinctOn :: Select -> NonEmpty ColumnName -> Maybe (NonEmpty OrderBy) -> FromIr PartitionableSelect
+simulateDistinctOn select distinctOnColumns orderByColumns = do
+  rowNumAlias <- generateEntityAlias IndexTemplate
  pure
-    Select
-      { selectCardinality = Many,
-        selectFinalWantedFields = pure (fieldTextNames fields),
-        selectGroupBy = mempty,
-        selectOrderBy = argsOrderBy,
-        -- We DO APPLY the global top here, because this pulls down all rows.
-        selectTop = globalTop <> permissionBasedTop <> argsTop,
-        selectProjections,
-        selectFrom,
-        selectJoins = argsJoins <> concat (mapMaybe fieldSourceJoins fieldSources),
-        selectWhere = argsWhere <> Where [filterExpression],
-        selectOffset = argsOffset
+    PartitionableSelect
+      { pselectFrom = selectFrom select,
+        pselectFinalize = \mExtraPartitionField ->
+          let -- we use the same alias both for outer and inner selects
+              alias = entityAliasText (fromAlias (selectFrom select))
+              distinctFields = fmap (\(ColumnName name) -> FieldName name alias) distinctOnColumns
+              finalDistinctFields = case mExtraPartitionField of
+                Just extraFields
+                  | Just neExtraFields <- nonEmpty extraFields ->
+                    neExtraFields <> distinctFields
+                _ -> distinctFields
+              (distinctOnOrderBy, innerOrderBy) =
+                case orderByColumns of
+                  Just orderBy ->
+                    let (distincts, others) = NE.partition (\OrderBy {..} -> orderByFieldName `elem` distinctFields) orderBy
+                     in (NE.nonEmpty distincts, NE.nonEmpty others)
+                  Nothing ->
+                    (Nothing, Nothing)
+              innerFrom =
+                FromSelect
+                  Aliased
+                    { aliasedAlias = alias,
+                      aliasedThing =
+                        select
+                          { selectProjections =
+                              StarProjection
+                                :| [ WindowProjection
+                                       ( Aliased
+                                           { aliasedAlias = unEntityAlias rowNumAlias,
+                                             aliasedThing =
+                                               RowNumberOverPartitionBy
+                                                 finalDistinctFields
+                                                 innerOrderBy
+                                                 -- Above: Having the order by
+                                                 -- in here ensures that we get the proper
+                                                 -- row as the first one we select
+                                                 -- in the outer select WHERE condition
+                                                 -- to simulate DISTINCT ON semantics
+                                           }
+                                       )
+                                   ],
+                            selectTop = mempty,
+                            selectJoins = mempty,
+                            selectOrderBy = mempty,
+                            selectOffset = Nothing,
+                            selectGroupBy = mempty,
+                            selectFinalWantedFields = mempty
+                          }
+                    }
+           in select
+                { selectFrom = innerFrom,
+                  selectWhere =
+                    Where
+                      [ EqualExpression
+                          (ColumnExpression FieldName {fieldNameEntity = alias, fieldName = unEntityAlias rowNumAlias})
+                          (int64Expr 1)
+                      ],
+                  selectOrderBy = distinctOnOrderBy
+                }
      }
-  where
-    Ir.AnnSelectG
-      { _asnFields = fields,
-        _asnFrom = from,
-        _asnPerm = perm,
-        _asnArgs = args,
-        _asnStrfyNum = num
-      } = annSelectG
-    Ir.TablePerm {_tpLimit = mPermLimit, _tpFilter = permFilter} = perm
-    permissionBasedTop =
-      maybe NoTop Top mPermLimit
-    stringifyNumbers =
-      if num
-        then StringifyNumbers
-        else LeaveNumbersAlone

 fromSelectAggregate ::
  Maybe (EntityAlias, HashMap ColumnName ColumnName) ->
@ -244,7 +340,7 @@ fromSelectAggregate minnerJoinFields annSelectG = do
    case from of
      Ir.FromTable qualifiedObject -> fromQualifiedTable qualifiedObject
      _ -> refute (pure (FromTypeUnsupported from))
-  args'@Args {argsWhere, argsOrderBy, argsJoins, argsTop, argsOffset, argsDistinct = Proxy} <-
+  args'@Args {argsWhere, argsOrderBy, argsJoins, argsTop, argsOffset, argsDistinct} <-
    runReaderT (fromSelectArgsG args) (fromAlias selectFrom)
  filterExpression <-
    runReaderT (fromAnnBoolExp permFilter) (fromAlias selectFrom)
@ -271,6 +367,45 @@ fromSelectAggregate minnerJoinFields annSelectG = do
      )
      (refute (pure NoProjectionFields))
  indexAlias <- generateEntityAlias IndexTemplate
+  let innerSelectAlias = entityAliasText (fromAlias selectFrom)
+      mDistinctFields = fmap (fmap (\(ColumnName name) -> FieldName name innerSelectAlias)) argsDistinct
+      mPartitionFields =
+        fmap (NE.fromList . map fst) mforeignKeyConditions <> mDistinctFields
+      innerProjections =
+        case mPartitionFields of
+          Nothing -> pure StarProjection
+          Just partitionFields ->
+            StarProjection
+              :|
+              -- We setup an index over every row in
+              -- the sub select.  Then if you look at
+              -- the outer Select, you can see we apply
+              -- a WHERE that uses this index for
+              -- LIMIT/OFFSET or DISTINCT ON.
+              [ WindowProjection
+                  ( Aliased
+                      { aliasedAlias = unEntityAlias indexAlias,
+                        aliasedThing =
+                          RowNumberOverPartitionBy
+                            -- The row numbers start from 1.
+                            partitionFields
+                            argsOrderBy
+                            -- Above: Having the order by
+                            -- in here ensures that the
+                            -- row numbers are ordered by
+                            -- this ordering. Below, we
+                            -- order again for the
+                            -- general row order. Both
+                            -- are needed!
+                      }
+                  )
+              ]
+      indexColumn =
+        ColumnExpression $
+          FieldName
+            { fieldNameEntity = innerSelectAlias,
+              fieldName = unEntityAlias indexAlias
+            }
  pure
    Select
      { selectCardinality = One,
@ -283,36 +418,7 @@ fromSelectAggregate minnerJoinFields annSelectG = do
            ( Aliased
                { aliasedThing =
                    Select
-                      { selectProjections =
-                          case mforeignKeyConditions of
-                            Nothing -> pure StarProjection
-                            Just innerJoinFields ->
-                              pure StarProjection
-                                <>
-                                -- We setup an index over every row in
-                                -- the sub select.  Then if you look at
-                                -- the outer Select, you can see we apply
-                                -- a WHERE that uses this index for
-                                -- LIMIT/OFFSET.
-                                pure
-                                  ( WindowProjection
-                                      ( Aliased
-                                          { aliasedAlias = unEntityAlias indexAlias,
-                                            aliasedThing =
-                                              RowNumberOverPartitionBy
-                                                -- The row numbers start from 1.
-                                                (NE.fromList (map fst innerJoinFields))
-                                                argsOrderBy
-                                                -- Above: Having the order by
-                                                -- in here ensures that the
-                                                -- row numbers are ordered by
-                                                -- this ordering. Below, we
-                                                -- order again for the
-                                                -- general row order. Both
-                                                -- are needed!
-                                          }
-                                      )
-                                  ),
+                      { selectProjections = innerProjections,
                        selectFrom,
                        selectJoins = argsJoins,
                        selectWhere = argsWhere <> (Where [filterExpression]),
@ -325,55 +431,51 @@ fromSelectAggregate minnerJoinFields annSelectG = do
                        selectFinalWantedFields = Nothing,
                        selectCardinality = Many,
                        selectTop = maybe argsTop (const NoTop) mforeignKeyConditions,
-                        selectOffset = maybe argsOffset (const Nothing) mforeignKeyConditions,
+                        -- we apply offset here only if we don't have partitions;
+                        -- with partitions, OFFSET/LIMIT is done based on ROW_NUMBER()
+                        selectOffset = maybe (int64Expr <$> argsOffset) (const Nothing) mPartitionFields,
                        selectGroupBy = mempty
                      },
-                  aliasedAlias = entityAliasText (fromAlias selectFrom)
+                  aliasedAlias = innerSelectAlias
                }
            ),
        selectJoins = concat (mapMaybe fieldSourceJoins fieldSources),
        selectWhere =
-          case mforeignKeyConditions of
+          case mPartitionFields of
            Nothing -> mempty
            Just {} ->
              let offset =
-                    case argsOffset of
-                      Nothing -> mempty
-                      Just offset' ->
-                        Where
-                          -- Apply an offset using the row_number from above.
-                          [ OpExpression
-                              MoreOp
-                              ( ColumnExpression
-                                  FieldName
-                                    { fieldNameEntity =
-                                        coerce (fromAlias selectFrom),
-                                      fieldName = unEntityAlias indexAlias
-                                    }
-                              )
-                              offset'
-                          ]
+                    case argsDistinct of
+                      Nothing ->
+                        case argsOffset of
+                          Nothing -> mempty
+                          Just offset' ->
+                            -- Apply an offset using the row_number from above.
+                            [ OpExpression
+                                MoreOp
+                                indexColumn
+                                (int64Expr offset')
+                            ]
+                      Just {} ->
+                        -- in case of distinct_on we need to select the row whose number is offset+1,
+                        -- effectively skipping a number of rows equal to offset
+                        [ EqualExpression
+                            indexColumn
+                            (int64Expr (fromMaybe 0 argsOffset + 1))
+                        ]
                  limit =
                    case argsTop of
                      NoTop -> mempty
                      Top limit' ->
-                        Where
-                          -- Apply a limit using the row_number from above.
-                          [ OpExpression
-                              LessOp
-                              ( ColumnExpression
-                                  FieldName
-                                    { fieldNameEntity =
-                                        coerce (fromAlias selectFrom),
-                                      fieldName = unEntityAlias indexAlias
-                                    }
-                              )
-                              ( ValueExpression . IntegerValue . Int64 . tshow $
-                                  limit' + 1 -- Because the row_number() indexing starts at 1.
-                                  -- So idx<l+1  means idx<2 where l = 1 i.e. "limit to 1 row".
-                              )
-                          ]
-               in offset <> limit,
+                        -- Apply a limit using the row_number from above.
+                        [ OpExpression
+                            LessOp
+                            indexColumn
+                            ( int64Expr (limit' + 1) -- Because the row_number() indexing starts at 1.
+                            -- So idx<l+1  means idx<2 where l = 1 i.e. "limit to 1 row".
+                            )
+                        ]
+               in Where (offset <> limit),
        selectOrderBy = Nothing,
        selectOffset = Nothing
      }
@ -387,7 +489,7 @@ fromSelectAggregate minnerJoinFields annSelectG = do
      } = annSelectG
    Ir.TablePerm {_tpLimit = mPermLimit, _tpFilter = permFilter} = perm
    permissionBasedTop =
-      maybe NoTop Top mPermLimit
+      maybe NoTop (Top . fromIntegral) mPermLimit
    stringifyNumbers =
      if num
        then StringifyNumbers
@ -401,8 +503,8 @@ data Args = Args
    argsOrderBy :: Maybe (NonEmpty OrderBy),
    argsJoins :: [Join],
    argsTop :: Top,
-    argsOffset :: Maybe Expression,
-    argsDistinct :: Proxy (Maybe (NonEmpty FieldName)),
+    argsOffset :: Maybe Int.Int64,
+    argsDistinct :: Maybe (NonEmpty ColumnName),
    argsExistingJoins :: Map TableName EntityAlias
  }
  deriving (Show)
@ -416,20 +518,9 @@ data UnfurledJoin = UnfurledJoin

 fromSelectArgsG :: Ir.SelectArgsG 'BigQuery Expression -> ReaderT EntityAlias FromIr Args
 fromSelectArgsG selectArgsG = do
-  let argsOffset = ValueExpression . IntegerValue . Int64 . tshow <$> moffset
  argsWhere <-
    maybe (pure mempty) (fmap (Where . pure) . fromAnnBoolExp) mannBoolExp
-  argsTop <- maybe (pure mempty) (pure . Top) mlimit
-  -- Not supported presently, per Vamshi:
-  --
-  -- > It is hardly used and we don't have to go to great lengths to support it.
-  --
-  -- But placeholdering the code so that when it's ready to be used,
-  -- you can just drop the Proxy wrapper.
-  argsDistinct <-
-    case mdistinct of
-      Nothing -> pure Proxy
-      Just {} -> refute (pure DistinctIsn'tSupported)
+  let argsTop = maybe mempty (Top . fromIntegral) mlimit
  (argsOrderBy, joins) <-
    runWriterT (traverse fromAnnotatedOrderByItemG (maybe [] toList orders))
  -- Any object-relation joins that we generated, we record their
@ -440,13 +531,14 @@ fromSelectArgsG selectArgsG = do
    Args
      { argsJoins = toList (fmap unfurledJoin joins),
        argsOrderBy = NE.nonEmpty argsOrderBy,
+        argsDistinct = mdistinct,
        ..
      }
  where
    Ir.SelectArgs
      { _saWhere = mannBoolExp,
        _saLimit = mlimit,
-        _saOffset = moffset,
+        _saOffset = argsOffset,
        _saDistinct = mdistinct,
        _saOrderBy = orders
      } = selectArgsG
@ -1213,17 +1305,22 @@ fromArrayAggregateSelectG annRelationSelectG = do
 --
 --     ORDER BY artist_other_id;
 --     ^ Ordering for the artist table should appear here.
+--
+-- Note: if the original select already uses a PARTITION BY internally (for distinct_on),
+-- the join fields are added to the partition expressions to give proper semantics of
+-- distinct_on combined with an array relation
 fromArrayRelationSelectG ::
  Ir.ArrayRelationSelectG 'BigQuery Void Expression ->
  ReaderT EntityAlias FromIr Join
 fromArrayRelationSelectG annRelationSelectG = do
-  select <- lift (fromSelectRows annSelectG) -- Take the original select.
+  pselect <- lift (fromSelectRows annSelectG) -- Take the original select.
  joinFieldName <- lift (fromRelName aarRelationshipName)
  alias <- lift (generateEntityAlias (ArrayRelationTemplate joinFieldName))
  indexAlias <- lift (generateEntityAlias IndexTemplate)
  joinOn <- fromMappingFieldNames alias mapping
  innerJoinFields <-
-    fromMappingFieldNames (fromAlias (selectFrom select)) mapping
+    fromMappingFieldNames (fromAlias (pselectFrom pselect)) mapping
+  let select = withExtraPartitionFields pselect $ map fst innerJoinFields
  let joinFieldProjections =
        map
          ( \(fieldName', _) ->
--- a/server/src-lib/Hasura/Backends/BigQuery/Instances/Schema.hs
+++ b/server/src-lib/Hasura/Backends/BigQuery/Instances/Schema.hs
@ -20,7 +20,6 @@ import Hasura.GraphQL.Schema.Build qualified as GSB
 import Hasura.GraphQL.Schema.Common
 import Hasura.GraphQL.Schema.Select
 import Hasura.Prelude
-import Hasura.RQL.IR
 import Hasura.RQL.IR.Select qualified as IR
 import Hasura.RQL.Types
 import Language.GraphQL.Draft.Syntax qualified as G
@ -44,7 +43,7 @@ instance BackendSchema 'BigQuery where
  nodesAggExtension = Just ()

  -- table arguments
-  tableArguments = bqTableArgs
+  tableArguments = defaultTableArgs

  -- indivdual components
  columnParser = bqColumnParser
@ -145,34 +144,6 @@ bqBuildFunctionMutationFields ::
 bqBuildFunctionMutationFields _ _ _ _ _ =
  pure []

----------------------------------------------------------------
-- Table arguments
-
-bqTableArgs ::
-  forall r m n.
-  MonadBuildSchema 'BigQuery r m n =>
-  SourceName ->
-  TableInfo 'BigQuery ->
-  SelPermInfo 'BigQuery ->
-  m (InputFieldsParser n (IR.SelectArgsG 'BigQuery (UnpreparedValue 'BigQuery)))
-bqTableArgs sourceName tableInfo selectPermissions = do
-  whereParser <- tableWhereArg sourceName tableInfo selectPermissions
-  orderByParser <- tableOrderByArg sourceName tableInfo selectPermissions
-  pure do
-    whereArg <- whereParser
-    orderByArg <- orderByParser
-    limitArg <- tableLimitArg
-    offsetArg <- tableOffsetArg
-    pure $
-      IR.SelectArgs
-        { IR._saWhere = whereArg,
-          IR._saOrderBy = orderByArg,
-          IR._saLimit = limitArg,
-          IR._saOffset = offsetArg,
-          -- not supported on BigQuery for now
-          IR._saDistinct = Nothing
-        }
-
 ----------------------------------------------------------------
 -- Individual components

--- a/server/src-lib/Hasura/Backends/BigQuery/Source.hs
+++ b/server/src-lib/Hasura/Backends/BigQuery/Source.hs
@ -22,6 +22,7 @@ import Data.Aeson.Casing qualified as J
 import Data.Aeson.TH qualified as J
 import Data.ByteString.Lazy qualified as BL
 import Data.HashMap.Strict qualified as HM
+import Data.Int qualified as Int
 import Data.Text.Encoding qualified as TE
 import Data.X509 qualified as X509
 import Data.X509.Memory qualified as X509
@ -160,7 +161,7 @@ data BigQuerySourceConfig = BigQuerySourceConfig
    _scDatasets :: ![Text],
    _scProjectId :: !Text, -- this is part of service-account.json, but we put it here on purpose
    _scAccessTokenMVar :: !(MVar (Maybe TokenResp)),
-    _scGlobalSelectLimit :: !Int
+    _scGlobalSelectLimit :: !Int.Int64
  }
  deriving (Eq)

--- a/server/src-lib/Hasura/Backends/BigQuery/Types.hs
+++ b/server/src-lib/Hasura/Backends/BigQuery/Types.hs
@ -34,6 +34,10 @@ module Hasura.Backends.BigQuery.Types
    Reselect (..),
    ScalarType (..),
    Select (..),
+    PartitionableSelect (..),
+    noExtraPartitionFields,
+    withExtraPartitionFields,
+    simpleSelect,
    SelectJson (..),
    TableName (..),
    Time (..),
@ -48,6 +52,7 @@ module Hasura.Backends.BigQuery.Types
    doubleToFloat64,
    getGQLTableName,
    intToInt64,
+    int64Expr,
    isComparableType,
    isNumType,
    parseScalarValue,
@ -65,6 +70,7 @@ import Data.ByteString (ByteString)
 import Data.ByteString.Base64 qualified as Base64
 import Data.ByteString.Lazy qualified as L
 import Data.Coerce
+import Data.Int qualified as Int
 import Data.Scientific
 import Data.Text qualified as T
 import Data.Text.Encoding qualified as T
@ -105,6 +111,29 @@ instance Cacheable Select

 instance NFData Select

+-- | Helper type allowing addition of extra fields used
+-- in PARTITION BY.
+--
+-- The main purpose of this type is the simulation of DISTINCT ON
+-- implemented in Hasura.Backends.BigQuery.FromIr.simulateDistinctOn
+data PartitionableSelect = PartitionableSelect
+  { pselectFinalize :: Maybe [FieldName] -> Select,
+    pselectFrom :: !From
+  }
+
+simpleSelect :: Select -> PartitionableSelect
+simpleSelect select =
+  PartitionableSelect
+    { pselectFinalize = const select,
+      pselectFrom = selectFrom select
+    }
+
+noExtraPartitionFields :: PartitionableSelect -> Select
+noExtraPartitionFields PartitionableSelect {..} = pselectFinalize Nothing
+
+withExtraPartitionFields :: PartitionableSelect -> [FieldName] -> Select
+withExtraPartitionFields PartitionableSelect {..} = pselectFinalize . Just
+
 data ArrayAgg = ArrayAgg
  { arrayAggProjections :: !(NonEmpty Projection),
    arrayAggOrderBy :: !(Maybe (NonEmpty OrderBy)),
@ -327,7 +356,7 @@ instance NFData AsStruct

 data Top
  = NoTop
-  | Top Int
+  | Top Int.Int64
  deriving (Eq, Ord, Show, Generic, Data, Lift)

 instance FromJSON Top
@ -678,9 +707,12 @@ instance FromJSON Int64 where parseJSON = liberalInt64Parser Int64

 instance ToJSON Int64 where toJSON = liberalIntegralPrinter

-intToInt64 :: Int -> Int64
+intToInt64 :: Int.Int64 -> Int64
 intToInt64 = Int64 . tshow

+int64Expr :: Int.Int64 -> Expression
+int64Expr = ValueExpression . IntegerValue . intToInt64
+
 -- | BigQuery's conception of a fixed precision decimal.
 newtype Decimal = Decimal Text
  deriving (Show, Eq, Ord, Generic, Data, Cacheable, NFData, Hashable, Lift)
--- a/server/src-lib/Hasura/Incremental/Internal/Dependency.hs
+++ b/server/src-lib/Hasura/Incremental/Internal/Dependency.hs
@ -184,6 +184,8 @@ instance Cacheable Int where unchanged _ = (==)

 instance Cacheable Int32 where unchanged _ = (==)

+instance Cacheable Int64 where unchanged _ = (==)
+
 instance Cacheable Integer where unchanged _ = (==)

 instance Cacheable Scientific where unchanged _ = (==)
--- a/server/tests-py/queries/graphql_query/bigquery/agg_nodes.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/agg_nodes.yaml
@ -50,6 +50,28 @@
              author:
                id: '2'
                name: Author 2
+            - id: '4'
+              title: Title 4
+              author:
+                id: '2'
+                name: Author 2
+      - id: '4'
+        title: Title 4
+        author:
+          id: '2'
+          name: Author 2
+          articles_aggregate:
+            nodes:
+            - id: '3'
+              title: Title 3
+              author:
+                id: '2'
+                name: Author 2
+            - id: '4'
+              title: Title 4
+              author:
+                id: '2'
+                name: Author 2
  query:
    query: |
      query {
@ -96,6 +118,8 @@
          nodes:
          - title: Title 3
            article_citations: []
+          - title: Title 4
+            article_citations: []
  query:
    query: |
      query {
--- a/server/tests-py/queries/graphql_query/bigquery/basic_remote_joins.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/basic_remote_joins.yaml
@ -22,6 +22,12 @@
        author:
          id: '2'
          name: Author 2
+      - id: '4'
+        title: Title 4
+        content: Content 4
+        author:
+          id: '2'
+          name: Author 2
  query:
    query: |
      query {
@ -57,6 +63,9 @@
        - id: '3'
          title: Title 3
          content: Content 3
+        - id: '4'
+          title: Title 4
+          content: Content 4
  query:
    query: |
      query {
--- a/server/tests-py/queries/graphql_query/bigquery/distinct_on.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/distinct_on.yaml
@ -0,0 +1,175 @@
+- description: oldest article per author and published status (distinct on author_id and is_published), with author and count of citations
+  url: /v1/graphql
+  status: 200
+  response:
+    data:
+      hasura_test_article:
+      - id: '1'
+        title: Title 1
+        created_at: '2008-12-25T07:30:01Z'
+        is_published: false
+        author:
+          name: Author 1
+        article_citations_aggregate:
+          aggregate:
+            count: '2'
+      - id: '2'
+        title: Title 2
+        created_at: '2008-12-25T07:30:02Z'
+        is_published: true
+        author:
+          name: Author 1
+        article_citations_aggregate:
+          aggregate:
+            count: '1'
+      - id: '3'
+        title: Title 3
+        created_at: '2008-12-25T07:30:03Z'
+        is_published: false
+        author:
+          name: Author 2
+        article_citations_aggregate:
+          aggregate:
+            count: '0'
+  query:
+    query: |
+      query {
+        hasura_test_article(distinct_on: [author_id, is_published], order_by: [{author_id:asc}, {is_published: asc}, {created_at: asc}]) {
+          id
+          title
+          created_at
+          is_published
+          author {
+            name
+          }
+          article_citations_aggregate {
+            aggregate {
+              count
+            }
+          }
+        }
+      }
+
+- description: author with oldest articles, distinct on is_published, with sums of ids of those articles
+  url: /v1/graphql
+  status: 200
+  response:
+    data:
+      hasura_test_author:
+      - id: '1'
+        name: Author 1
+        articles:
+        - title: Title 1
+          id: '1'
+        - title: Title 2
+          id: '2'
+        articles_aggregate:
+          aggregate:
+            sum:
+              id: '3'
+      - id: '2'
+        name: Author 2
+        articles:
+        - title: Title 3
+          id: '3'
+        articles_aggregate:
+          aggregate:
+            sum:
+              id: '3'
+  query:
+    query: |
+      query {
+        hasura_test_author(order_by: [{id:asc}]) {
+          id
+          name
+          articles(distinct_on: is_published, order_by: [{is_published: asc}, {created_at: asc}]) {
+            title
+            id
+          }
+          articles_aggregate(distinct_on: is_published, order_by: [{is_published: asc}, {created_at: asc}]) {
+            aggregate {
+              sum {
+                id
+              }
+            }
+          }
+        }
+      }
+
+- description: author with newest articles, distinct on is_published, with sums of ids of those articles
+  url: /v1/graphql
+  status: 200
+  response:
+    data:
+      hasura_test_author:
+      - id: '1'
+        name: Author 1
+        articles:
+        - title: Title 1
+          id: '1'
+        - title: Title 2
+          id: '2'
+        articles_aggregate:
+          aggregate:
+            sum:
+              id: '3'
+      - id: '2'
+        name: Author 2
+        articles:
+        - title: Title 4
+          id: '4'
+        articles_aggregate:
+          aggregate:
+            sum:
+              id: '4'
+  query:
+    query: |
+      query {
+        hasura_test_author(order_by: [{id:asc}]) {
+          id
+          name
+          articles(distinct_on: is_published, order_by: [{is_published: asc}, {created_at: desc}]) {
+            title
+            id
+          }
+          articles_aggregate(distinct_on: is_published, order_by: [{is_published: asc}, {created_at: desc}]) {
+            aggregate {
+              sum {
+                id
+              }
+            }
+          }
+        }
+      }
+
+- description: first and last articles distinct by author and published status
+  url: /v1/graphql
+  status: 200
+  response:
+    data:
+      first:
+      - id: '1'
+        title: Title 1
+      - id: '2'
+        title: Title 2
+      - id: '3'
+        title: Title 3
+      last:
+      - id: '1'
+        title: Title 1
+      - id: '2'
+        title: Title 2
+      - id: '4'
+        title: Title 4
+  query:
+    query: |
+      query {
+        first: hasura_test_article(distinct_on: [author_id, is_published], order_by: [{author_id: asc}, {is_published: asc}, {created_at: asc}]) {
+          id
+          title
+        }
+        last: hasura_test_article(distinct_on: [author_id, is_published], order_by: [{author_id: asc}, {is_published: asc}, {created_at: desc}]) {
+          id
+          title
+        }
+      }
--- a/server/tests-py/queries/graphql_query/bigquery/nested_array_relationships.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/nested_array_relationships.yaml
@ -37,6 +37,10 @@
          title: Title 3
          content: Content 3
          article_citations: []
+        - id: '4'
+          title: Title 4
+          content: Content 4
+          article_citations: []
  query:
    query: |
      query {
@ -89,6 +93,12 @@
          article_citations_aggregate:
            aggregate:
              count: '0'
+        - id: '4'
+          title: Title 4
+          content: Content 4
+          article_citations_aggregate:
+            aggregate:
+              count: '0'
  query:
    query: |
      query {
@ -135,6 +145,10 @@
          title: Title 3
          content: Content 3
          article_citations2: []
+        - id: '4'
+          title: Title 4
+          content: Content 4
+          article_citations2: []
  query:
    query: |
      query {
@ -145,7 +159,7 @@
            id
            title
            content
-            article_citations2 {
+            article_citations2 (order_by: {cited_article_id: asc}) {
              description
            }
          }
--- a/server/tests-py/queries/graphql_query/bigquery/nested_select_query_article_author.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/nested_select_query_article_author.yaml
@ -22,6 +22,12 @@ response:
      author:
        id: '2'
        name: Author 2
+    - id: '4'
+      title: Title 4
+      content: Content 4
+      author:
+        id: '2'
+        name: Author 2
 query:
  query: |
    query {
--- a/server/tests-py/queries/graphql_query/bigquery/schema_setup_bigquery.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/schema_setup_bigquery.yaml
@ -66,6 +66,9 @@ args:
      INSERT INTO `hasura_test.article` VALUES (
        3, "Title 3", "Content 3", 2, FALSE, NULL, PARSE_TIMESTAMP("%c", "Thu Dec 25 07:30:03 2008")
      );
+      -- A second unpublished article for author 2, created one day after article 3,
+      -- so that distinct_on tests can tell the oldest row from the newest one.
+      -- 26 Dec 2008 was a Friday; keep the weekday token consistent with the date.
+      INSERT INTO `hasura_test.article` VALUES (
+        4, "Title 4", "Content 4", 2, FALSE, NULL, PARSE_TIMESTAMP("%c", "Fri Dec 26 07:31:04 2008")
+      );

      -- a copy for remote joins as we can't have the same table tracked in two sources
      CREATE TABLE `hasura_test.article2`
--- a/server/tests-py/queries/graphql_query/bigquery/select_query_batching.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/select_query_batching.yaml
@ -7,6 +7,7 @@ response:
      - id: '1'
      - id: '2'
      - id: '3'
+      - id: '4'
  - data:
      hasura_test_author:
      - id: '1'
--- a/server/tests-py/queries/graphql_query/bigquery/select_query_nested_fragment.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/select_query_nested_fragment.yaml
@ -37,3 +37,5 @@ response:
      articles:
      - title: Title 3
        id: '3'
+      - title: Title 4
+        id: '4'
--- a/server/tests-py/queries/graphql_query/bigquery/select_query_top_level_fragment.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/select_query_top_level_fragment.yaml
@ -46,3 +46,6 @@ response:
    - title: Title 3
      author:
        name: Author 2
+    - title: Title 4
+      author:
+        name: Author 2
--- a/server/tests-py/queries/graphql_query/bigquery/user_perms.yaml
+++ b/server/tests-py/queries/graphql_query/bigquery/user_perms.yaml
@ -15,6 +15,9 @@
      - id: '3'
        title: Title 3
        content: Content 3
+      - id: '4'
+        title: Title 4
+        content: Content 4
  query:
    query: |
      query {
--- a/server/tests-py/test_graphql_queries.py
+++ b/server/tests-py/test_graphql_queries.py
@ -216,6 +216,9 @@ class TestGraphQLQueryBasicBigquery:
    def test_agg_nodes(self, hge_ctx, transport):
        check_query_f(hge_ctx, self.dir() + "/agg_nodes.yaml", transport)

+    def test_distinct_on(self, hge_ctx, transport):
+        check_query_f(hge_ctx, self.dir() + "/distinct_on.yaml", transport)
+
    @classmethod
    def dir(cls):
        return 'queries/graphql_query/bigquery'