BigQuery: fix limit/offset for array aggregates

Blocked on https://github.com/hasura/graphql-engine-mono/pull/1640.

While fiddling with BigQuery I noticed a severe issue with offset/limit for array-aggregates. I've fixed it now.

The basic problem was that I was using a query like this:

```graphql
query MyQuery {
  hasura_Artist(order_by: {artist_self_id: asc}) {
    artist_self_id
    albums_aggregate(order_by: {album_self_id: asc}, limit: 2) {
      nodes {
        album_self_id
      }
      aggregate {
        count
      }
    }
  }
}
```

Producing this SQL:

```sql
SELECT `t_Artist1`.`artist_self_id` AS `artist_self_id`,
       STRUCT(IFNULL(`aa_albums1`.`nodes`, NULL) AS `nodes`, IFNULL(`aa_albums1`.`aggregate`, STRUCT(0 AS `count`)) AS `aggregate`) AS `albums_aggregate`
FROM `hasura`.`Artist` AS `t_Artist1`
LEFT OUTER JOIN (SELECT ARRAY_AGG(STRUCT(`t_Album1`.`album_self_id` AS `album_self_id`) ORDER BY (`t_Album1`.`album_self_id`) ASC) AS `nodes`,
                        STRUCT(COUNT(*) AS `count`) AS `aggregate`,
                        `t_Album1`.`artist_other_id` AS `artist_other_id`
                 FROM (SELECT *
                       FROM `hasura`.`Album` AS `t_Album1`
                       ORDER BY (`t_Album1`.`album_self_id`) ASC NULLS FIRST
                       -- PROBLEM HERE
                       LIMIT @param0) AS `t_Album1`
                 GROUP BY `t_Album1`.`artist_other_id`)
AS `aa_albums1`
ON (`aa_albums1`.`artist_other_id` = `t_Artist1`.`artist_self_id`)
ORDER BY (`t_Artist1`.`artist_self_id`) ASC NULLS FIRST
```

Note the `LIMIT @param0` -- that is incorrect because we want to limit
per artist. Instead, we want:

```sql
SELECT `t_Artist1`.`artist_self_id` AS `artist_self_id`,
       STRUCT(IFNULL(`aa_albums1`.`nodes`, NULL) AS `nodes`, IFNULL(`aa_albums1`.`aggregate`, STRUCT(0 AS `count`)) AS `aggregate`) AS `albums_aggregate`
FROM `hasura`.`Artist` AS `t_Artist1`
LEFT OUTER JOIN (SELECT ARRAY_AGG(STRUCT(`t_Album1`.`album_self_id` AS `album_self_id`) ORDER BY (`t_Album1`.`album_self_id`) ASC) AS `nodes`,
                        STRUCT(COUNT(*) AS `count`) AS `aggregate`,
                        `t_Album1`.`artist_other_id` AS `artist_other_id`
                 FROM (SELECT *,
                            -- ADDED: the ORDER BY inside the window ensures the
                            -- row numbering is deterministic per partition
                            ROW_NUMBER() OVER(PARTITION BY artist_other_id ORDER BY album_self_id ASC) artist_album_index
                       FROM `hasura`.`Album` AS `t_Album1`
                       ORDER BY (`t_Album1`.`album_self_id`) ASC NULLS FIRST
                       ) AS `t_Album1`
                 -- CHANGED
                 WHERE artist_album_index <= @param0
                 GROUP BY `t_Album1`.`artist_other_id`)
AS `aa_albums1`
ON (`aa_albums1`.`artist_other_id` = `t_Artist1`.`artist_self_id`)
ORDER BY (`t_Artist1`.`artist_self_id`) ASC NULLS FIRST
```

This serves both the LIMIT and the OFFSET function via the WHERE clause on
the per-partition row index. With that in place, both the ARRAY_AGG and the
COUNT are correct per artist.

I've updated my Haskell test suite to add regression tests for this. I'll push a commit for Python tests shortly. The existing Python tests still pass.

This just fixes a case that we hadn't noticed.

https://github.com/hasura/graphql-engine-mono/pull/1641

GitOrigin-RevId: 49933fa5e09a9306c89565743ecccf2cb54eaa80
This commit is contained in:
Chris Done 2021-07-06 09:28:42 +01:00 committed by hasura-bot
parent 6ed800abaa
commit 614c0dab80
4 changed files with 279 additions and 9 deletions

View File

@ -14,7 +14,7 @@ module Hasura.Backends.BigQuery.FromIr
, Top(..) -- Re-export for FromIrConfig.
) where
import Hasura.Backends.BigQuery.Source (BigQuerySourceConfig(..))
import Hasura.Backends.BigQuery.Source (BigQuerySourceConfig (..))
import Hasura.Prelude
import qualified Data.HashMap.Strict as HM
@ -159,7 +159,7 @@ fromRootField =
\case
(Ir.QDBSingleRow s) -> mkSQLSelect Rql.JASSingleObject s
(Ir.QDBMultipleRows s) -> mkSQLSelect Rql.JASMultipleRows s
(Ir.QDBAggregation s) -> fromSelectAggregate s
(Ir.QDBAggregation s) -> fromSelectAggregate Nothing s
(Ir.QDBConnection _) -> refute $ pure ConnectionsNotSupported
--------------------------------------------------------------------------------
@ -219,9 +219,10 @@ fromSelectRows annSelectG = do
else LeaveNumbersAlone
fromSelectAggregate ::
Ir.AnnSelectG 'BigQuery (Const Void) [(Rql.FieldName, Ir.TableAggregateFieldG 'BigQuery (Const Void) Expression)] Expression
Maybe (EntityAlias, HashMap ColumnName ColumnName)
-> Ir.AnnSelectG 'BigQuery (Const Void) [( Rql.FieldName , Ir.TableAggregateFieldG 'BigQuery (Const Void) Expression)] Expression
-> FromIr BigQuery.Select
fromSelectAggregate annSelectG = do
fromSelectAggregate minnerJoinFields annSelectG = do
selectFrom <-
case from of
Ir.FromTable qualifiedObject -> fromQualifiedTable qualifiedObject
@ -230,6 +231,11 @@ fromSelectAggregate annSelectG = do
runReaderT (fromSelectArgsG args) (fromAlias selectFrom)
filterExpression <-
runReaderT (fromAnnBoolExp permFilter) (fromAlias selectFrom)
mforeignKeyConditions <-
for minnerJoinFields $ \(entityAlias, mapping) ->
runReaderT
(fromMappingFieldNames (fromAlias selectFrom) mapping)
entityAlias
fieldSources <-
runReaderT
(traverse
@ -244,6 +250,7 @@ fromSelectAggregate annSelectG = do
case NE.nonEmpty (concatMap (toList . fieldSourceProjections) fieldSources) of
Nothing -> refute (pure NoProjectionFields)
Just ne -> pure ne
indexAlias <- generateEntityAlias IndexTemplate
pure
Select
{ selectCardinality = One
@ -256,7 +263,36 @@ fromSelectAggregate annSelectG = do
(Aliased
{ aliasedThing =
Select
{ selectProjections = pure StarProjection
{ selectProjections =
case mforeignKeyConditions of
Nothing -> pure StarProjection
Just innerJoinFields ->
pure StarProjection <>
-- We setup an index over every row in
-- the sub select. Then if you look at
-- the outer Select, you can see we apply
-- a WHERE that uses this index for
-- LIMIT/OFFSET.
pure
(WindowProjection
(Aliased
{ aliasedAlias = unEntityAlias indexAlias
, aliasedThing =
RowNumberOverPartitionBy
-- The row numbers start from 1.
(NE.fromList
(map
(\(fieldName', _) -> fieldName')
(innerJoinFields)))
argsOrderBy
-- Above: Having the order by
-- in here ensures that the
-- row numbers are ordered by
-- this ordering. Below, we
-- order again for the
-- general row order. Both
-- are needed!
}))
, selectFrom
, selectJoins =
argsJoins <> mapMaybe fieldSourceJoin fieldSources
@ -269,14 +305,53 @@ fromSelectAggregate annSelectG = do
-- putting this elsewhere.
, selectFinalWantedFields = Nothing
, selectCardinality = Many
, selectTop = argsTop
, selectOffset = argsOffset
, selectTop = maybe argsTop (const NoTop) mforeignKeyConditions
, selectOffset = maybe argsOffset (const Nothing) mforeignKeyConditions
, selectGroupBy = mempty
}
, aliasedAlias = entityAliasText (fromAlias selectFrom)
})
, selectJoins = mempty
, selectWhere = mempty
, selectWhere =
case mforeignKeyConditions of
Nothing -> mempty
Just {} ->
let offset =
case argsOffset of
Nothing -> mempty
Just offset' ->
Where
-- Apply an offset using the row_number from above.
[ OpExpression
MoreOp
(ColumnExpression
FieldName
{ fieldNameEntity =
coerce (fromAlias selectFrom)
, fieldName = unEntityAlias indexAlias
})
offset'
]
limit =
case argsTop of
NoTop -> mempty
Top limit' ->
Where
-- Apply a limit using the row_number from above.
[ OpExpression
LessOp
(ColumnExpression
FieldName
{ fieldNameEntity =
coerce (fromAlias selectFrom)
, fieldName = unEntityAlias indexAlias
})
(ValueExpression . IntegerValue . Int64 . tshow $
limit' + 1 -- Because the row_number() indexing starts at 1.
-- So idx<l+1 means idx<2 where l = 1 i.e. "limit to 1 row".
)
]
in offset <> limit
, selectOrderBy = Nothing
, selectOffset = Nothing
}
@ -976,7 +1051,9 @@ fromArrayAggregateSelectG ::
-> ReaderT EntityAlias FromIr Join
fromArrayAggregateSelectG annRelationSelectG = do
joinFieldName <- lift (fromRelName aarRelationshipName)
select <- lift (fromSelectAggregate annSelectG)
select <- do
lhsEntityAlias <- ask
lift (fromSelectAggregate (pure (lhsEntityAlias, mapping)) annSelectG)
alias <- lift (generateEntityAlias (ArrayAggregateTemplate joinFieldName))
joinOn <- fromMappingFieldNames alias mapping
innerJoinFields <-

View File

@ -20,6 +20,7 @@ import qualified Hasura.Base.Error as E
import qualified Hasura.SQL.AnyBackend as AB
import qualified Hasura.Tracing as Tracing
import Hasura.Backends.BigQuery.Plan
import Hasura.Base.Error
import Hasura.EncJSON

View File

@ -0,0 +1,189 @@
- description: Aggregate with a limit using nodes, count
url: /v1/graphql
status: 200
response:
data:
hasura_Artist:
- artist_self_id: '1000'
albums_aggregate:
nodes:
- album_self_id: '2002'
aggregate:
count: '1'
- artist_self_id: '1001'
albums_aggregate:
nodes:
- album_self_id: '2000'
aggregate:
count: '1'
- artist_self_id: '1002'
albums_aggregate:
nodes:
- album_self_id: '2003'
aggregate:
count: '1'
query:
query: |
query MyQuery {
hasura_Artist(order_by: {artist_self_id: asc}) {
artist_self_id
albums_aggregate(order_by: {album_self_id: asc}, limit: 1) {
nodes {
album_self_id
}
aggregate {
count
}
}
}
}
- description: Aggregate with order by using nodes, count, sum
url: /v1/graphql
status: 200
response:
data:
hasura_Artist:
- artist_self_id: '1001'
albums_aggregate:
nodes:
- album_self_id: '2000'
- album_self_id: '2001'
aggregate:
count: '2'
sum:
album_self_id: '4001'
- artist_self_id: '1002'
albums_aggregate:
nodes:
- album_self_id: '2003'
- album_self_id: '2004'
aggregate:
count: '2'
sum:
album_self_id: '4007'
- artist_self_id: '1000'
albums_aggregate:
nodes:
- album_self_id: '2002'
- album_self_id: '2005'
aggregate:
count: '2'
sum:
album_self_id: '4007'
query:
query: |
query MyQuery {
hasura_Artist {
artist_self_id
albums_aggregate(order_by: {album_self_id: asc}) {
nodes {
album_self_id
}
aggregate {
count
sum {
album_self_id
}
}
}
}
}
- description: Aggregate with limit using sum, nodes
url: /v1/graphql
status: 200
response:
data:
hasura_Artist:
- artist_self_id: '1001'
albums_aggregate:
nodes:
- album_self_id: '2000'
aggregate:
count: '1'
sum:
album_self_id: '2000'
- artist_self_id: '1002'
albums_aggregate:
nodes:
- album_self_id: '2003'
aggregate:
count: '1'
sum:
album_self_id: '2003'
- artist_self_id: '1000'
albums_aggregate:
nodes:
- album_self_id: '2002'
aggregate:
count: '1'
sum:
album_self_id: '2002'
query:
query: |
query MyQuery {
hasura_Artist {
artist_self_id
albums_aggregate(order_by: {album_self_id: asc}, limit: 1) {
nodes {
album_self_id
}
aggregate {
count
sum {
album_self_id
}
}
}
}
}
- description: Aggregate with offset using sum, nodes
url: /v1/graphql
status: 200
response:
data:
hasura_Artist:
- artist_self_id: '1001'
albums_aggregate:
nodes:
- album_self_id: '2001'
aggregate:
count: '1'
sum:
album_self_id: '2001'
- artist_self_id: '1002'
albums_aggregate:
nodes:
- album_self_id: '2004'
aggregate:
count: '1'
sum:
album_self_id: '2004'
- artist_self_id: '1000'
albums_aggregate:
nodes:
- album_self_id: '2005'
aggregate:
count: '1'
sum:
album_self_id: '2005'
query:
query: |
query MyQuery {
hasura_Artist {
artist_self_id
albums_aggregate(order_by: {album_self_id: asc}, offset: 1) {
nodes {
album_self_id
}
aggregate {
count
sum {
album_self_id
}
}
}
}
}

View File

@ -21,6 +21,9 @@ class TestGraphQLQueryBasicBigquery:
def test_global_limit(self, hge_ctx, transport):
check_query_f(hge_ctx, self.dir() + "/global_limit.yaml", transport)
def test_offset_regression(self, hge_ctx, transport):
check_query_f(hge_ctx, self.dir() + "/offset_regression.yaml", transport)
def test_user_perms(self, hge_ctx, transport):
check_query_f(hge_ctx, self.dir() + "/user_perms.yaml", transport)