ENG-1287 remove metadata copy from SQL artifact (#1348)

<!-- The PR description should answer 2 important questions: -->

### What

Remove copies of metadata from SQL artifacts:

Resulting in a **19% smaller artifact** for chinook and a **17% faster
runtime**.
Customer metadata with multiple subgraphs will show a more dramatic
effect.

There was a separate copy of the metadata stored for each subgraph in
the SQL "catalog". From the point of view of MBS, we were serializing 2+N
copies of the same metadata, which bloats artifacts and is very slow.

Additionally the usage sites became confusing because you had multiple
identical (we assume) copies of metadata in scope at the same time.

Instead, we reconstruct the original Catalog type before use, because
it's required for the DataFusion impls.

### How

Convert to a different type for serialization, then re-hydrate it in the
body of the SQL code.

V3_GIT_ORIGIN_REV_ID: 0e7e35255cfe8fe01ea328a1d7cb96db0e2dd726
This commit is contained in:
Brandon Simmons 2024-11-18 13:05:31 -05:00 committed by hasura-bot
parent 5dd7edcb7f
commit 426a834412
7 changed files with 111 additions and 49 deletions

View File

@ -12,6 +12,7 @@ use metadata_resolve::{data_connectors::NdcVersion, LifecyclePluginConfigs};
use open_dds::session_variables::{SessionVariableName, SESSION_VARIABLE_ROLE}; use open_dds::session_variables::{SessionVariableName, SESSION_VARIABLE_ROLE};
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
use serde_json as json; use serde_json as json;
use sql::catalog::CatalogSerializable;
use sql::execute::SqlRequest; use sql::execute::SqlRequest;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::iter; use std::iter;
@ -261,9 +262,12 @@ pub fn test_execution_expectation_for_multiple_ndc_versions(
// Ensure sql_context can be serialized and deserialized // Ensure sql_context can be serialized and deserialized
let sql_context = sql::catalog::Catalog::from_metadata(&gds.metadata); let sql_context = sql::catalog::Catalog::from_metadata(&gds.metadata);
let sql_context_str = serde_json::to_string(&sql_context)?; let sql_context_str = serde_json::to_string(&sql_context.clone().to_serializable())?;
let sql_context_parsed = serde_json::from_str(&sql_context_str)?; let sql_context_parsed: CatalogSerializable = serde_json::from_str(&sql_context_str)?;
assert_eq!(sql_context, sql_context_parsed); assert_eq!(
sql_context,
sql_context_parsed.from_serializable(&gds.metadata)
);
assert_eq!( assert_eq!(
schema, deserialized_metadata, schema, deserialized_metadata,
"initial built metadata does not match deserialized metadata" "initial built metadata does not match deserialized metadata"
@ -577,9 +581,12 @@ pub(crate) fn test_sql(test_path_string: &str) -> anyhow::Result<()> {
// Ensure sql_context can be serialized and deserialized // Ensure sql_context can be serialized and deserialized
let sql_context = sql::catalog::Catalog::from_metadata(&gds.metadata); let sql_context = sql::catalog::Catalog::from_metadata(&gds.metadata);
let sql_context_str = serde_json::to_string(&sql_context)?; let sql_context_str = serde_json::to_string(&sql_context.clone().to_serializable())?;
let sql_context_parsed = serde_json::from_str(&sql_context_str)?; let sql_context_parsed: CatalogSerializable = serde_json::from_str(&sql_context_str)?;
assert_eq!(sql_context, sql_context_parsed); assert_eq!(
sql_context,
sql_context_parsed.from_serializable(&gds.metadata)
);
let request = if let Ok(content) = read_to_string(&request_path) { let request = if let Ok(content) = read_to_string(&request_path) {
SqlRequest::new(content) SqlRequest::new(content)

View File

@ -14,7 +14,7 @@ use open_dds::{
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, BTreeSet}; use std::collections::{BTreeMap, BTreeSet};
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Catalog { pub struct Catalog {
pub state_per_role: BTreeMap<Role, State>, pub state_per_role: BTreeMap<Role, State>,
} }
@ -40,7 +40,7 @@ impl Catalog {
} }
} }
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct State { pub struct State {
pub routes: BTreeMap<String, Model>, pub routes: BTreeMap<String, Model>,
#[serde( #[serde(
@ -50,7 +50,7 @@ pub struct State {
pub object_types: BTreeMap<Qualified<CustomTypeName>, ObjectType>, pub object_types: BTreeMap<Qualified<CustomTypeName>, ObjectType>,
} }
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct ObjectType(pub IndexMap<FieldName, Type>); pub struct ObjectType(pub IndexMap<FieldName, Type>);
impl State { impl State {
@ -125,7 +125,7 @@ impl State {
// //
// for now we'll try d) but we should check we're happy with this before general release // for now we'll try d) but we should check we're happy with this before general release
// of the feature // of the feature
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub enum Type { pub enum Type {
Scalar(ndc_models::TypeRepresentation), Scalar(ndc_models::TypeRepresentation),
ScalarForDataConnector(ScalarTypeForDataConnector), ScalarForDataConnector(ScalarTypeForDataConnector),
@ -133,14 +133,14 @@ pub enum Type {
Object(Qualified<CustomTypeName>), Object(Qualified<CustomTypeName>),
} }
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct ScalarTypeForDataConnector { pub struct ScalarTypeForDataConnector {
pub type_representations: BTreeSet<ndc_models::TypeRepresentation>, pub type_representations: BTreeSet<ndc_models::TypeRepresentation>,
} }
// only the parts of a Model we need to construct a JSONAPI // only the parts of a Model we need to construct a JSONAPI
// we'll filter out fields a given role can't see // we'll filter out fields a given role can't see
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Model { pub struct Model {
pub name: Qualified<ModelName>, pub name: Qualified<ModelName>,
pub description: Option<String>, pub description: Option<String>,

View File

@ -1,5 +1,6 @@
use std::{any::Any, sync::Arc}; use std::{any::Any, sync::Arc};
use crate::catalog::subgraph::Subgraph;
use hasura_authn_core::Session; use hasura_authn_core::Session;
use indexmap::IndexMap; use indexmap::IndexMap;
use metadata_resolve::{self as resolved}; use metadata_resolve::{self as resolved};
@ -25,15 +26,46 @@ pub mod model;
pub mod subgraph; pub mod subgraph;
pub mod types; pub mod types;
/// The context in which to compile and execute SQL queries. /// [`Catalog`] but parameterized, so that `subgraph` can be `SubgraphSerializable` for
/// serialization.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] #[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct Catalog { pub struct CatalogSer<SG> {
pub(crate) subgraphs: IndexMap<String, Arc<subgraph::Subgraph>>, pub(crate) subgraphs: IndexMap<String, SG>,
pub(crate) table_valued_functions: IndexMap<String, Arc<command::Command>>, pub(crate) table_valued_functions: IndexMap<String, Arc<command::Command>>,
pub(crate) introspection: Arc<introspection::IntrospectionSchemaProvider>, pub(crate) introspection: Arc<introspection::IntrospectionSchemaProvider>,
pub(crate) default_schema: Option<String>, pub(crate) default_schema: Option<String>,
} }
/// The context in which to compile and execute SQL queries.
pub type Catalog = CatalogSer<subgraph::Subgraph>;
pub type CatalogSerializable = CatalogSer<subgraph::SubgraphSerializable>;
impl CatalogSerializable {
pub fn from_serializable(self, metadata: &Arc<resolved::Metadata>) -> Catalog {
let subgraphs = self
.subgraphs
.into_iter()
.map(|(name, tables)| {
(
name,
Subgraph {
metadata: metadata.clone(),
tables,
},
)
})
.collect();
CatalogSer {
subgraphs,
table_valued_functions: self.table_valued_functions,
introspection: self.introspection,
default_schema: self.default_schema,
}
}
}
impl Catalog { impl Catalog {
/// Create a no-op Catalog, used when `sql` layer is disabled /// Create a no-op Catalog, used when `sql` layer is disabled
pub fn empty() -> Self { pub fn empty() -> Self {
@ -46,6 +78,23 @@ impl Catalog {
default_schema: None, default_schema: None,
} }
} }
/// prepare a Catalog for serializing by stripping redundant artifacts
pub fn to_serializable(self) -> CatalogSer<subgraph::SubgraphSerializable> {
let serialized_subgraphs = self
.subgraphs
.into_iter()
.map(|(name, subgraph)| (name, subgraph.tables))
.collect();
CatalogSer {
subgraphs: serialized_subgraphs,
table_valued_functions: self.table_valued_functions,
introspection: self.introspection,
default_schema: self.default_schema,
}
}
/// Derive a SQL Context from resolved Open DDS metadata. /// Derive a SQL Context from resolved Open DDS metadata.
pub fn from_metadata(metadata: &Arc<resolved::Metadata>) -> Self { pub fn from_metadata(metadata: &Arc<resolved::Metadata>) -> Self {
let type_registry = TypeRegistry::build_type_registry(metadata); let type_registry = TypeRegistry::build_type_registry(metadata);
@ -109,10 +158,7 @@ impl Catalog {
); );
Catalog { Catalog {
subgraphs: subgraphs subgraphs,
.into_iter()
.map(|(k, v)| (k, Arc::new(v)))
.collect(),
table_valued_functions, table_valued_functions,
introspection: Arc::new(introspection), introspection: Arc::new(introspection),
default_schema, default_schema,
@ -134,7 +180,7 @@ impl datafusion::CatalogProvider for model::WithSession<Catalog> {
fn schema(&self, name: &str) -> Option<Arc<dyn datafusion::SchemaProvider>> { fn schema(&self, name: &str) -> Option<Arc<dyn datafusion::SchemaProvider>> {
let subgraph_provider = self.value.subgraphs.get(name).cloned().map(|schema| { let subgraph_provider = self.value.subgraphs.get(name).cloned().map(|schema| {
Arc::new(model::WithSession { Arc::new(model::WithSession {
value: schema, value: schema.into(),
session: self.session.clone(), session: self.session.clone(),
}) as Arc<dyn datafusion::SchemaProvider> }) as Arc<dyn datafusion::SchemaProvider>
}); });

View File

@ -55,23 +55,23 @@ pub enum UnsupportedModel {
} }
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] #[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub(crate) struct Model { pub struct Model {
pub subgraph: SubgraphName, pub(crate) subgraph: SubgraphName,
pub name: ModelName, pub(crate) name: ModelName,
pub description: Option<String>, pub(crate) description: Option<String>,
pub arguments: IndexMap<ArgumentName, ArgumentInfo>, pub(crate) arguments: IndexMap<ArgumentName, ArgumentInfo>,
// The struct type of the model's object type // The struct type of the model's object type
pub struct_type: StructTypeName, pub(crate) struct_type: StructTypeName,
// Datafusion table schema // Datafusion table schema
pub schema: datafusion::SchemaRef, pub(crate) schema: datafusion::SchemaRef,
// This is the entry point for the type mappings stored // This is the entry point for the type mappings stored
// in ModelSource // in ModelSource
pub data_type: Qualified<CustomTypeName>, pub(crate) data_type: Qualified<CustomTypeName>,
} }
impl Model { impl Model {

View File

@ -3,7 +3,6 @@ use metadata_resolve::Metadata;
use std::{any::Any, sync::Arc}; use std::{any::Any, sync::Arc};
use indexmap::IndexMap; use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
mod datafusion { mod datafusion {
pub(super) use datafusion::error::Result; pub(super) use datafusion::error::Result;
@ -14,12 +13,22 @@ use crate::catalog;
use super::model; use super::model;
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] /// TODO document
pub(crate) struct Subgraph { ///
pub metadata: Arc<Metadata>, /// This is intentionally not `Serialize`/`Deserialize`, and constructed from a
pub tables: IndexMap<String, Arc<catalog::model::Model>>, /// [`SubgraphSerializable`].
#[derive(Clone, Debug, PartialEq)]
pub struct Subgraph {
pub(crate) metadata: Arc<Metadata>,
pub(crate) tables: IndexMap<String, Arc<catalog::model::Model>>,
} }
/// This is [`Subgraph`] but with `metadata` removed (to avoid redundancy in artifact creation, and
/// to avoid the confusion of multiple copies of the same thing expected to be identical but
/// perhaps not, or perhaps no one knows...). It is reconstituted as `Subgraph` at some point after
/// deserializing.
pub type SubgraphSerializable = IndexMap<String, Arc<catalog::model::Model>>;
#[async_trait] #[async_trait]
impl datafusion::SchemaProvider for catalog::model::WithSession<Subgraph> { impl datafusion::SchemaProvider for catalog::model::WithSession<Subgraph> {
fn as_any(&self) -> &dyn Any { fn as_any(&self) -> &dyn Any {

View File

@ -1,5 +1,5 @@
use std::{ use std::{
collections::{BTreeSet, HashMap, HashSet}, collections::{BTreeMap, BTreeSet},
fmt::{Debug, Display}, fmt::{Debug, Display},
sync::Arc, sync::Arc,
}; };
@ -29,7 +29,7 @@ type SupportedScalar = (NormalizedType, datafusion::DataType);
#[derive(Error, Serialize, Deserialize, Clone, Debug, PartialEq)] #[derive(Error, Serialize, Deserialize, Clone, Debug, PartialEq)]
pub enum UnsupportedScalar { pub enum UnsupportedScalar {
#[error("Multiple NDC type representations found for scalar type '{0}'")] #[error("Multiple NDC type representations found for scalar type '{0}'")]
MultipleNdcTypeRepresentations(CustomTypeName, HashSet<ndc_models::TypeRepresentation>), MultipleNdcTypeRepresentations(CustomTypeName, BTreeSet<ndc_models::TypeRepresentation>),
#[error("No NDC representation found for scalar type '{0}'")] #[error("No NDC representation found for scalar type '{0}'")]
NoNdcRepresentation(CustomTypeName), NoNdcRepresentation(CustomTypeName),
#[error("Unsupported NDC type representation for scalar type '{0}': {1:?}")] #[error("Unsupported NDC type representation for scalar type '{0}': {1:?}")]
@ -42,7 +42,7 @@ pub fn resolve_scalar_type(
scalar_type_name: &Qualified<CustomTypeName>, scalar_type_name: &Qualified<CustomTypeName>,
representation: &ScalarTypeRepresentation, representation: &ScalarTypeRepresentation,
) -> Scalar { ) -> Scalar {
let representations: HashSet<ndc_models::TypeRepresentation> = let representations: BTreeSet<ndc_models::TypeRepresentation> =
representation.representations.values().cloned().collect(); representation.representations.values().cloned().collect();
let mut iter = representations.into_iter(); let mut iter = representations.into_iter();
let ndc_representation = match iter.next() { let ndc_representation = match iter.next() {
@ -83,8 +83,8 @@ pub fn resolve_scalar_type(
pub fn resolve_scalar_types( pub fn resolve_scalar_types(
metadata: &resolved::Metadata, metadata: &resolved::Metadata,
) -> HashMap<Qualified<CustomTypeName>, Scalar> { ) -> BTreeMap<Qualified<CustomTypeName>, Scalar> {
let mut custom_scalars: HashMap<Qualified<CustomTypeName>, Scalar> = HashMap::new(); let mut custom_scalars: BTreeMap<Qualified<CustomTypeName>, Scalar> = BTreeMap::new();
for (scalar_type_name, representation) in &metadata.scalar_types { for (scalar_type_name, representation) in &metadata.scalar_types {
let scalar = resolve_scalar_type(scalar_type_name, representation); let scalar = resolve_scalar_type(scalar_type_name, representation);
custom_scalars.insert(scalar_type_name.clone(), scalar); custom_scalars.insert(scalar_type_name.clone(), scalar);
@ -93,8 +93,8 @@ pub fn resolve_scalar_types(
} }
pub struct TypeRegistry { pub struct TypeRegistry {
custom_scalars: HashMap<Qualified<CustomTypeName>, Scalar>, custom_scalars: BTreeMap<Qualified<CustomTypeName>, Scalar>,
struct_types: HashMap<Qualified<CustomTypeName>, Struct>, struct_types: BTreeMap<Qualified<CustomTypeName>, Struct>,
default_schema: Option<SubgraphName>, default_schema: Option<SubgraphName>,
} }
@ -113,7 +113,7 @@ impl TypeRegistry {
// build a default schema by checking if the object types are spread across more than one // build a default schema by checking if the object types are spread across more than one
// subgraph // subgraph
let default_schema = { let default_schema = {
let subgraphs: HashSet<_> = metadata let subgraphs: BTreeSet<_> = metadata
.object_types .object_types
.keys() .keys()
.map(|object_type_name| &object_type_name.subgraph) .map(|object_type_name| &object_type_name.subgraph)
@ -127,7 +127,7 @@ impl TypeRegistry {
let custom_scalars = resolve_scalar_types(metadata); let custom_scalars = resolve_scalar_types(metadata);
let mut struct_types: HashMap<Qualified<CustomTypeName>, Struct> = HashMap::new(); let mut struct_types: BTreeMap<Qualified<CustomTypeName>, Struct> = BTreeMap::new();
for (object_type_name, object_type) in &metadata.object_types { for (object_type_name, object_type) in &metadata.object_types {
let struct_type = struct_type( let struct_type = struct_type(
@ -206,7 +206,7 @@ impl TypeRegistry {
// } // }
} }
pub(crate) fn struct_types(&self) -> &HashMap<Qualified<CustomTypeName>, Struct> { pub(crate) fn struct_types(&self) -> &BTreeMap<Qualified<CustomTypeName>, Struct> {
&self.struct_types &self.struct_types
} }
@ -214,7 +214,7 @@ impl TypeRegistry {
self.default_schema.as_ref() self.default_schema.as_ref()
} }
pub fn custom_scalars(&self) -> &HashMap<Qualified<CustomTypeName>, Scalar> { pub fn custom_scalars(&self) -> &BTreeMap<Qualified<CustomTypeName>, Scalar> {
&self.custom_scalars &self.custom_scalars
} }
} }
@ -338,7 +338,7 @@ type Struct = Result<StructType, UnsupportedObject>;
fn struct_type( fn struct_type(
default_schema: &Option<SubgraphName>, default_schema: &Option<SubgraphName>,
metadata: &resolved::Metadata, metadata: &resolved::Metadata,
custom_scalars: &HashMap<Qualified<CustomTypeName>, Scalar>, custom_scalars: &BTreeMap<Qualified<CustomTypeName>, Scalar>,
object_type_name: &Qualified<CustomTypeName>, object_type_name: &Qualified<CustomTypeName>,
object_type: &resolved::ObjectTypeWithRelationships, object_type: &resolved::ObjectTypeWithRelationships,
disallowed_object_types: &BTreeSet<Qualified<CustomTypeName>>, disallowed_object_types: &BTreeSet<Qualified<CustomTypeName>>,
@ -491,7 +491,7 @@ fn ndc_representation_to_datatype(
fn to_arrow_type( fn to_arrow_type(
default_schema: &Option<SubgraphName>, default_schema: &Option<SubgraphName>,
metadata: &resolved::Metadata, metadata: &resolved::Metadata,
custom_scalars: &HashMap<Qualified<CustomTypeName>, Scalar>, custom_scalars: &BTreeMap<Qualified<CustomTypeName>, Scalar>,
ty: &resolved::QualifiedBaseType, ty: &resolved::QualifiedBaseType,
disallowed_object_types: &BTreeSet<Qualified<CustomTypeName>>, disallowed_object_types: &BTreeSet<Qualified<CustomTypeName>>,
// ) -> GeneratedArrowType { // ) -> GeneratedArrowType {

View File

@ -17,7 +17,7 @@ use datafusion::{
error::DataFusionError, error::DataFusionError,
scalar::ScalarValue, scalar::ScalarValue,
}; };
use std::{collections::HashSet, sync::Arc}; use std::{collections::BTreeSet, sync::Arc};
/// Parses a datafusion literal into a json value, supports most types (including structs and arrays). /// Parses a datafusion literal into a json value, supports most types (including structs and arrays).
pub(crate) fn parse_datafusion_literal( pub(crate) fn parse_datafusion_literal(
@ -197,7 +197,7 @@ pub(crate) fn parse_struct_literal(
} }
} else { } else {
// Create a map of expected fields // Create a map of expected fields
let expected_field_set: HashSet<_> = let expected_field_set: BTreeSet<_> =
expected_fields.iter().map(|field| field.name()).collect(); expected_fields.iter().map(|field| field.name()).collect();
// Check for extra fields in the struct that aren't expected // Check for extra fields in the struct that aren't expected
@ -216,7 +216,7 @@ pub(crate) fn parse_struct_literal(
} }
} }
let field_name_to_index: std::collections::HashMap<_, _> = struct_array let field_name_to_index: std::collections::BTreeMap<_, _> = struct_array
.fields() .fields()
.iter() .iter()
.enumerate() .enumerate()