ENG-1287 remove metadata copy from SQL artifact (#1348)

<!-- The PR description should answer 2 important questions: -->

### What

Remove copies of metadata from SQL artifacts:

Resulting in **19% smaller artifact** for chinook, **17% faster
runtime**.
Customer metadata with multiple subgraphs will show a more dramatic
effect.

There was a separate copy of the metadata stored for each subgraph in
the sql "catalog". From the point of view of MBS we were serializing 2+N
copies of the same metadata which bloats artifacts and is very slow.

Additionally the usage sites became confusing because you had multiple
identical (we assume) copies of metadata in scope at the same time.

Instead we reconstruct the original Catalog type before use, because
it's required for the DataFusion impls.

### How

Convert to a different type for serialization, then re-hydrate it in the
body of the SQL code.

V3_GIT_ORIGIN_REV_ID: 0e7e35255cfe8fe01ea328a1d7cb96db0e2dd726
This commit is contained in:
Brandon Simmons 2024-11-18 13:05:31 -05:00 committed by hasura-bot
parent 5dd7edcb7f
commit 426a834412
7 changed files with 111 additions and 49 deletions

View File

@ -12,6 +12,7 @@ use metadata_resolve::{data_connectors::NdcVersion, LifecyclePluginConfigs};
use open_dds::session_variables::{SessionVariableName, SESSION_VARIABLE_ROLE};
use pretty_assertions::assert_eq;
use serde_json as json;
use sql::catalog::CatalogSerializable;
use sql::execute::SqlRequest;
use std::collections::BTreeMap;
use std::iter;
@ -261,9 +262,12 @@ pub fn test_execution_expectation_for_multiple_ndc_versions(
// Ensure sql_context can be serialized and deserialized
let sql_context = sql::catalog::Catalog::from_metadata(&gds.metadata);
let sql_context_str = serde_json::to_string(&sql_context)?;
let sql_context_parsed = serde_json::from_str(&sql_context_str)?;
assert_eq!(sql_context, sql_context_parsed);
let sql_context_str = serde_json::to_string(&sql_context.clone().to_serializable())?;
let sql_context_parsed: CatalogSerializable = serde_json::from_str(&sql_context_str)?;
assert_eq!(
sql_context,
sql_context_parsed.from_serializable(&gds.metadata)
);
assert_eq!(
schema, deserialized_metadata,
"initial built metadata does not match deserialized metadata"
@ -577,9 +581,12 @@ pub(crate) fn test_sql(test_path_string: &str) -> anyhow::Result<()> {
// Ensure sql_context can be serialized and deserialized
let sql_context = sql::catalog::Catalog::from_metadata(&gds.metadata);
let sql_context_str = serde_json::to_string(&sql_context)?;
let sql_context_parsed = serde_json::from_str(&sql_context_str)?;
assert_eq!(sql_context, sql_context_parsed);
let sql_context_str = serde_json::to_string(&sql_context.clone().to_serializable())?;
let sql_context_parsed: CatalogSerializable = serde_json::from_str(&sql_context_str)?;
assert_eq!(
sql_context,
sql_context_parsed.from_serializable(&gds.metadata)
);
let request = if let Ok(content) = read_to_string(&request_path) {
SqlRequest::new(content)

View File

@ -14,7 +14,7 @@ use open_dds::{
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, BTreeSet};
#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Catalog {
pub state_per_role: BTreeMap<Role, State>,
}
@ -40,7 +40,7 @@ impl Catalog {
}
}
#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct State {
pub routes: BTreeMap<String, Model>,
#[serde(
@ -50,7 +50,7 @@ pub struct State {
pub object_types: BTreeMap<Qualified<CustomTypeName>, ObjectType>,
}
#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct ObjectType(pub IndexMap<FieldName, Type>);
impl State {
@ -125,7 +125,7 @@ impl State {
//
// for now we'll try d) but we should check we're happy with this before general release
// of the feature
#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub enum Type {
Scalar(ndc_models::TypeRepresentation),
ScalarForDataConnector(ScalarTypeForDataConnector),
@ -133,14 +133,14 @@ pub enum Type {
Object(Qualified<CustomTypeName>),
}
#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct ScalarTypeForDataConnector {
pub type_representations: BTreeSet<ndc_models::TypeRepresentation>,
}
// only the parts of a Model we need to construct a JSONAPI
// we'll filter out fields a given role can't see
#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct Model {
pub name: Qualified<ModelName>,
pub description: Option<String>,

View File

@ -1,5 +1,6 @@
use std::{any::Any, sync::Arc};
use crate::catalog::subgraph::Subgraph;
use hasura_authn_core::Session;
use indexmap::IndexMap;
use metadata_resolve::{self as resolved};
@ -25,15 +26,46 @@ pub mod model;
pub mod subgraph;
pub mod types;
/// The context in which to compile and execute SQL queries.
/// [`Catalog`] but parameterized, so that `subgraph` can be `SubgraphSerializable` for
/// serialization.
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub struct Catalog {
pub(crate) subgraphs: IndexMap<String, Arc<subgraph::Subgraph>>,
pub struct CatalogSer<SG> {
pub(crate) subgraphs: IndexMap<String, SG>,
pub(crate) table_valued_functions: IndexMap<String, Arc<command::Command>>,
pub(crate) introspection: Arc<introspection::IntrospectionSchemaProvider>,
pub(crate) default_schema: Option<String>,
}
/// The context in which to compile and execute SQL queries.
pub type Catalog = CatalogSer<subgraph::Subgraph>;
pub type CatalogSerializable = CatalogSer<subgraph::SubgraphSerializable>;
impl CatalogSerializable {
    /// Rebuild a full [`Catalog`] from its serialized form by re-attaching the
    /// resolved `metadata` to every subgraph.
    ///
    /// The serialized artifact deliberately omits the metadata (it would
    /// otherwise be duplicated once per subgraph), so deserialization is a
    /// two-step process: parse a [`CatalogSerializable`], then call this with
    /// the single shared `metadata`.
    pub fn from_serializable(self, metadata: &Arc<resolved::Metadata>) -> Catalog {
        let CatalogSer {
            subgraphs,
            table_valued_functions,
            introspection,
            default_schema,
        } = self;

        // Re-hydrate each subgraph with a cheap Arc clone of the shared metadata.
        let subgraphs = subgraphs
            .into_iter()
            .map(|(name, tables)| {
                let subgraph = Subgraph {
                    metadata: Arc::clone(metadata),
                    tables,
                };
                (name, subgraph)
            })
            .collect();

        CatalogSer {
            subgraphs,
            table_valued_functions,
            introspection,
            default_schema,
        }
    }
}
impl Catalog {
/// Create a no-op Catalog, used when `sql` layer is disabled
pub fn empty() -> Self {
@ -46,6 +78,23 @@ impl Catalog {
default_schema: None,
}
}
/// prepare a Catalog for serializing by stripping redundant artifacts
pub fn to_serializable(self) -> CatalogSer<subgraph::SubgraphSerializable> {
let serialized_subgraphs = self
.subgraphs
.into_iter()
.map(|(name, subgraph)| (name, subgraph.tables))
.collect();
CatalogSer {
subgraphs: serialized_subgraphs,
table_valued_functions: self.table_valued_functions,
introspection: self.introspection,
default_schema: self.default_schema,
}
}
/// Derive a SQL Context from resolved Open DDS metadata.
pub fn from_metadata(metadata: &Arc<resolved::Metadata>) -> Self {
let type_registry = TypeRegistry::build_type_registry(metadata);
@ -109,10 +158,7 @@ impl Catalog {
);
Catalog {
subgraphs: subgraphs
.into_iter()
.map(|(k, v)| (k, Arc::new(v)))
.collect(),
subgraphs,
table_valued_functions,
introspection: Arc::new(introspection),
default_schema,
@ -134,7 +180,7 @@ impl datafusion::CatalogProvider for model::WithSession<Catalog> {
fn schema(&self, name: &str) -> Option<Arc<dyn datafusion::SchemaProvider>> {
let subgraph_provider = self.value.subgraphs.get(name).cloned().map(|schema| {
Arc::new(model::WithSession {
value: schema,
value: schema.into(),
session: self.session.clone(),
}) as Arc<dyn datafusion::SchemaProvider>
});

View File

@ -55,23 +55,23 @@ pub enum UnsupportedModel {
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub(crate) struct Model {
pub subgraph: SubgraphName,
pub name: ModelName,
pub struct Model {
pub(crate) subgraph: SubgraphName,
pub(crate) name: ModelName,
pub description: Option<String>,
pub(crate) description: Option<String>,
pub arguments: IndexMap<ArgumentName, ArgumentInfo>,
pub(crate) arguments: IndexMap<ArgumentName, ArgumentInfo>,
// The struct type of the model's object type
pub struct_type: StructTypeName,
pub(crate) struct_type: StructTypeName,
// Datafusion table schema
pub schema: datafusion::SchemaRef,
pub(crate) schema: datafusion::SchemaRef,
// This is the entry point for the type mappings stored
// in ModelSource
pub data_type: Qualified<CustomTypeName>,
pub(crate) data_type: Qualified<CustomTypeName>,
}
impl Model {

View File

@ -3,7 +3,6 @@ use metadata_resolve::Metadata;
use std::{any::Any, sync::Arc};
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
mod datafusion {
pub(super) use datafusion::error::Result;
@ -14,12 +13,22 @@ use crate::catalog;
use super::model;
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
pub(crate) struct Subgraph {
pub metadata: Arc<Metadata>,
pub tables: IndexMap<String, Arc<catalog::model::Model>>,
/// TODO document
///
/// This is intentionally not `Serialize`/`Deserialize`, and constructed from a
/// [`SubgraphSerializable`].
#[derive(Clone, Debug, PartialEq)]
pub struct Subgraph {
pub(crate) metadata: Arc<Metadata>,
pub(crate) tables: IndexMap<String, Arc<catalog::model::Model>>,
}
/// This is [`Subgraph`] but with `metadata` removed (to avoid redundancy in artifact creation, and
/// to avoid the confusion of multiple copies of the same thing expected to be identical but
/// perhaps not, or perhaps no one knows...). It is reconstituted as `Subgraph` at some point after
/// deserializing.
pub type SubgraphSerializable = IndexMap<String, Arc<catalog::model::Model>>;
#[async_trait]
impl datafusion::SchemaProvider for catalog::model::WithSession<Subgraph> {
fn as_any(&self) -> &dyn Any {

View File

@ -1,5 +1,5 @@
use std::{
collections::{BTreeSet, HashMap, HashSet},
collections::{BTreeMap, BTreeSet},
fmt::{Debug, Display},
sync::Arc,
};
@ -29,7 +29,7 @@ type SupportedScalar = (NormalizedType, datafusion::DataType);
#[derive(Error, Serialize, Deserialize, Clone, Debug, PartialEq)]
pub enum UnsupportedScalar {
#[error("Multiple NDC type representations found for scalar type '{0}'")]
MultipleNdcTypeRepresentations(CustomTypeName, HashSet<ndc_models::TypeRepresentation>),
MultipleNdcTypeRepresentations(CustomTypeName, BTreeSet<ndc_models::TypeRepresentation>),
#[error("No NDC representation found for scalar type '{0}'")]
NoNdcRepresentation(CustomTypeName),
#[error("Unsupported NDC type representation for scalar type '{0}': {1:?}")]
@ -42,7 +42,7 @@ pub fn resolve_scalar_type(
scalar_type_name: &Qualified<CustomTypeName>,
representation: &ScalarTypeRepresentation,
) -> Scalar {
let representations: HashSet<ndc_models::TypeRepresentation> =
let representations: BTreeSet<ndc_models::TypeRepresentation> =
representation.representations.values().cloned().collect();
let mut iter = representations.into_iter();
let ndc_representation = match iter.next() {
@ -83,8 +83,8 @@ pub fn resolve_scalar_type(
pub fn resolve_scalar_types(
metadata: &resolved::Metadata,
) -> HashMap<Qualified<CustomTypeName>, Scalar> {
let mut custom_scalars: HashMap<Qualified<CustomTypeName>, Scalar> = HashMap::new();
) -> BTreeMap<Qualified<CustomTypeName>, Scalar> {
let mut custom_scalars: BTreeMap<Qualified<CustomTypeName>, Scalar> = BTreeMap::new();
for (scalar_type_name, representation) in &metadata.scalar_types {
let scalar = resolve_scalar_type(scalar_type_name, representation);
custom_scalars.insert(scalar_type_name.clone(), scalar);
@ -93,8 +93,8 @@ pub fn resolve_scalar_types(
}
pub struct TypeRegistry {
custom_scalars: HashMap<Qualified<CustomTypeName>, Scalar>,
struct_types: HashMap<Qualified<CustomTypeName>, Struct>,
custom_scalars: BTreeMap<Qualified<CustomTypeName>, Scalar>,
struct_types: BTreeMap<Qualified<CustomTypeName>, Struct>,
default_schema: Option<SubgraphName>,
}
@ -113,7 +113,7 @@ impl TypeRegistry {
// build a default schema by checking if the object types are spread across more than one
// subgraph
let default_schema = {
let subgraphs: HashSet<_> = metadata
let subgraphs: BTreeSet<_> = metadata
.object_types
.keys()
.map(|object_type_name| &object_type_name.subgraph)
@ -127,7 +127,7 @@ impl TypeRegistry {
let custom_scalars = resolve_scalar_types(metadata);
let mut struct_types: HashMap<Qualified<CustomTypeName>, Struct> = HashMap::new();
let mut struct_types: BTreeMap<Qualified<CustomTypeName>, Struct> = BTreeMap::new();
for (object_type_name, object_type) in &metadata.object_types {
let struct_type = struct_type(
@ -206,7 +206,7 @@ impl TypeRegistry {
// }
}
pub(crate) fn struct_types(&self) -> &HashMap<Qualified<CustomTypeName>, Struct> {
pub(crate) fn struct_types(&self) -> &BTreeMap<Qualified<CustomTypeName>, Struct> {
&self.struct_types
}
@ -214,7 +214,7 @@ impl TypeRegistry {
self.default_schema.as_ref()
}
pub fn custom_scalars(&self) -> &HashMap<Qualified<CustomTypeName>, Scalar> {
pub fn custom_scalars(&self) -> &BTreeMap<Qualified<CustomTypeName>, Scalar> {
&self.custom_scalars
}
}
@ -338,7 +338,7 @@ type Struct = Result<StructType, UnsupportedObject>;
fn struct_type(
default_schema: &Option<SubgraphName>,
metadata: &resolved::Metadata,
custom_scalars: &HashMap<Qualified<CustomTypeName>, Scalar>,
custom_scalars: &BTreeMap<Qualified<CustomTypeName>, Scalar>,
object_type_name: &Qualified<CustomTypeName>,
object_type: &resolved::ObjectTypeWithRelationships,
disallowed_object_types: &BTreeSet<Qualified<CustomTypeName>>,
@ -491,7 +491,7 @@ fn ndc_representation_to_datatype(
fn to_arrow_type(
default_schema: &Option<SubgraphName>,
metadata: &resolved::Metadata,
custom_scalars: &HashMap<Qualified<CustomTypeName>, Scalar>,
custom_scalars: &BTreeMap<Qualified<CustomTypeName>, Scalar>,
ty: &resolved::QualifiedBaseType,
disallowed_object_types: &BTreeSet<Qualified<CustomTypeName>>,
// ) -> GeneratedArrowType {

View File

@ -17,7 +17,7 @@ use datafusion::{
error::DataFusionError,
scalar::ScalarValue,
};
use std::{collections::HashSet, sync::Arc};
use std::{collections::BTreeSet, sync::Arc};
/// Parses a datafusion literal into a json value, supports most types (including structs and arrays).
pub(crate) fn parse_datafusion_literal(
@ -197,7 +197,7 @@ pub(crate) fn parse_struct_literal(
}
} else {
// Create a map of expected fields
let expected_field_set: HashSet<_> =
let expected_field_set: BTreeSet<_> =
expected_fields.iter().map(|field| field.name()).collect();
// Check for extra fields in the struct that aren't expected
@ -216,7 +216,7 @@ pub(crate) fn parse_struct_literal(
}
}
let field_name_to_index: std::collections::HashMap<_, _> = struct_array
let field_name_to_index: std::collections::BTreeMap<_, _> = struct_array
.fields()
.iter()
.enumerate()