graphql-engine/server/src-lib/Hasura/Backends/BigQuery/DDL/Source.hs

{-# LANGUAGE DuplicateRecordFields #-}

module Hasura.Backends.BigQuery.DDL.Source
  ( resolveSource,
    postDropSourceHook,
    resolveSourceConfig,
    restTypeToScalarType,
  )
where

import Data.Aeson qualified as J
import Data.ByteString.Lazy qualified as L
import Data.Environment qualified as Env
import Data.HashMap.Strict.Extended qualified as HashMap
import Data.Int qualified as Int
import Data.Text qualified as T
import Data.Text.Encoding qualified as T
import Data.Time.Clock.System
import Hasura.Backends.BigQuery.Connection
import Hasura.Backends.BigQuery.Meta
import Hasura.Backends.BigQuery.Source
import Hasura.Backends.BigQuery.Types
import Hasura.Base.Error
import Hasura.Function.Cache (FunctionOverloads (..))
import Hasura.Prelude
import Hasura.RQL.Types.Backend (BackendConfig)
import Hasura.RQL.Types.BackendType
import Hasura.RQL.Types.Column
import Hasura.RQL.Types.Common
import Hasura.RQL.Types.Source
import Hasura.Table.Cache
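
-- Defaults applied when the corresponding options are not supplied in the
-- source connection configuration.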
defaultGlobalSelectLimit :: Int.Int64
defaultGlobalSelectLimit = 1000

defaultRetryLimit :: Int
defaultRetryLimit = 5

defaultRetryBaseDelay :: Microseconds
defaultRetryBaseDelay = 500000
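
-- | Resolve a 'BigQueryConnSourceConfig' (whose fields may reference
-- environment variables) into a concrete 'BigQuerySourceConfig', initialising
-- the connection from the resolved service account, project id, datasets,
-- retry options and global select limit. The manager argument is unused.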
resolveSourceConfig ::
  (MonadIO m) =>
  SourceName ->
  BigQueryConnSourceConfig ->
  BackendSourceKind 'BigQuery ->
  BackendConfig 'BigQuery ->
  Env.Environment ->
  manager ->
  m (Either QErr BigQuerySourceConfig)
resolveSourceConfig _name BigQueryConnSourceConfig {..} _backendKind _backendConfig env _manager = runExceptT $ do
  eSA <- resolveConfigurationJson env _cscServiceAccount
  case eSA of
    Left e -> throw400 Unexpected $ T.pack e
    Right serviceAccount -> do
      projectId <- BigQueryProjectId <$> resolveConfigurationInput env _cscProjectId
      retryOptions <- do
        numRetries <-
          resolveConfigurationInput env `mapM` _cscRetryLimit >>= \case
            Nothing -> pure defaultRetryLimit
            Just v -> readNonNegative v "retry limit"
        if numRetries == 0
          then pure Nothing
          else do
            let _retryNumRetries = numRetries
            _retryBaseDelay <-
              resolveConfigurationInput env `mapM` _cscRetryBaseDelay >>= \case
                Nothing -> pure defaultRetryBaseDelay
                Just v -> fromInteger <$> readNonNegative v "retry base delay"
            pure $ Just RetryOptions {..}
      _scConnection <- initConnection serviceAccount projectId retryOptions
      _scDatasets <- fmap BigQueryDataset <$> resolveConfigurationInputs env _cscDatasets
      _scGlobalSelectLimit <-
        resolveConfigurationInput env `mapM` _cscGlobalSelectLimit >>= \case
          Nothing -> pure defaultGlobalSelectLimit
          Just v -> readNonNegative v "global select limit"
      pure BigQuerySourceConfig {..}
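
-- | Parse a non-negative number from a configuration value that may be plain
-- text (e.g. from an environment variable) or JSON-encoded, throwing a 400
-- error if the value does not parse or is negative. For instance,
-- @readNonNegative "1000" "global select limit"@ yields @1000@, while
-- @readNonNegative "-1" "retry limit"@ throws.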
readNonNegative :: (MonadError QErr m, Num a, Ord a, J.FromJSON a, Read a) => Text -> Text -> m a
readNonNegative i paramName =
  -- This works around the inconsistency between JSON and
  -- environment variables. The config handling module should be
  -- reworked to handle non-text values better.
  case readMaybe (T.unpack i) <|> J.decode (L.fromStrict (T.encodeUtf8 i)) of
    Nothing -> throw400 Unexpected $ "Need a non-negative integer for " <> paramName
    Just i' -> do
      when (i' < 0) $ throw400 Unexpected $ "Need the integer for the " <> paramName <> " to be non-negative"
      pure i'
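
-- | Introspect the source: fetch table and routine metadata from the BigQuery
-- REST API and translate it into Hasura's 'DBObjectsIntrospection'.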
resolveSource ::
  (MonadIO m) =>
  BigQuerySourceConfig ->
  m (Either QErr (DBObjectsIntrospection 'BigQuery))
resolveSource sourceConfig =
  runExceptT $ do
    tables <- getTables sourceConfig
    routines <- getRoutines sourceConfig
    let result = (,) <$> tables <*> routines
    case result of
      Left err ->
        throw400 Unexpected
          $ "unexpected exception while connecting to database: "
          <> tshow err
      Right (restTables, restRoutines) -> do
        seconds <- liftIO $ fmap systemSeconds getSystemTime
        let functions = FunctionOverloads <$> HashMap.groupOnNE (routineReferenceToFunctionName . routineReference) restRoutines
        pure
          ( DBObjectsIntrospection
              { _rsTables =
                  HashMap.fromList
                    [ ( restTableReferenceToTableName tableReference,
                        DBTableMetadata
                          { _ptmiOid = OID (fromIntegral seconds + index :: Int), -- TODO: The seconds are used for uniqueness. BigQuery doesn't support a "stable" ID for a table.
                            _ptmiColumns =
                              [ RawColumnInfo
                                  { rciName = ColumnName name,
                                    rciPosition = position,
                                    rciType = RawColumnTypeScalar $ restTypeToScalarType type',
                                    rciIsNullable =
                                      case mode of
                                        Nullable -> True
                                        _ -> False,
                                    rciDescription = Nothing,
                                    rciMutability = ColumnMutability {_cmIsInsertable = True, _cmIsUpdatable = True}
                                  }
                                | (position, RestFieldSchema {name, type', mode}) <-
                                    zip [1 ..] fields -- TODO: Same trouble as Oid above.
                              ],
                            _ptmiPrimaryKey = Nothing,
                            _ptmiUniqueConstraints = mempty,
                            _ptmiForeignKeys = mempty,
                            _ptmiViewInfo = Just $ ViewInfo False False False,
                            _ptmiDescription = Nothing,
                            _ptmiExtraTableMetadata = ()
                          }
                      )
                      | (index, RestTable {tableReference, schema}) <-
                          zip [0 ..] restTables,
                        let RestTableSchema fields = schema
                    ],
                _rsFunctions = functions,
                _rsScalars = mempty,
                _rsLogicalModels = mempty
              }
          )
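
-- | Map a BigQuery REST API column type to the corresponding Hasura scalar type.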
restTypeToScalarType :: RestType -> ScalarType
restTypeToScalarType =
  \case
    STRING -> StringScalarType
    BYTES -> BytesScalarType
    INTEGER -> IntegerScalarType
    FLOAT -> FloatScalarType
    BOOL -> BoolScalarType
    TIMESTAMP -> TimestampScalarType
    DATE -> DateScalarType
    TIME -> TimeScalarType
    DATETIME -> DatetimeScalarType
    GEOGRAPHY -> GeographyScalarType
    STRUCT -> StructScalarType
    BIGDECIMAL -> BigDecimalScalarType
    DECIMAL -> DecimalScalarType
    JSON -> JsonScalarType

-- Hierarchy: Project / Dataset / Table
-- see <https://cloud.google.com/bigquery/docs/datasets-intro>
restTableReferenceToTableName :: RestTableReference -> TableName
restTableReferenceToTableName RestTableReference {..} =
  TableName {tableName = tableId, tableNameSchema = datasetId}

-- We ignore project id and push that requirement up to the top to
-- the data source level.
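
-- | Nothing to clean up when a BigQuery source is dropped: connections are not
-- kept open, and the event trigger argument is unused.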
postDropSourceHook ::
  (MonadIO m) =>
  BigQuerySourceConfig ->
  TableEventTriggers 'BigQuery ->
  m ()
postDropSourceHook _ _ =
  -- On BigQuery we don't keep connections open.
  pure ()