Export Chinook dataset to JSON lines for Athena consumption

PR-URL: https://github.com/hasura/graphql-engine-mono/pull/6489
GitOrigin-RevId: 36a068179d595f3d838995fb057b8ee590d1fd02
Daniel Chambers 2022-10-24 13:09:16 +11:00 committed by hasura-bot
parent ef386ccee1
commit 792ef3117d
6 changed files with 168 additions and 2 deletions

View File

@@ -102,7 +102,10 @@ test-suite tests-dc-api
data-fix,
deepseq,
dc-api,
directory,
file-embed,
filepath,
graphql-parser,
hashable,
hspec,
hspec-core,
@@ -119,6 +122,7 @@ test-suite tests-dc-api
servant-client-core,
servant-openapi3,
text,
time,
unordered-containers,
vector,
xml-conduit,
@@ -134,6 +138,7 @@ test-suite tests-dc-api
Command
Paths_dc_api
Test.Data
Test.DataExport
Test.CapabilitiesSpec
Test.Expectations
Test.HealthSpec

View File

@@ -3,6 +3,8 @@ module Command
TestConfig (..),
NameCasing (..),
TestOptions (..),
ExportDataConfig (..),
ExportFormat (..),
parseCommandLine,
)
where
@@ -22,6 +24,7 @@ import Prelude
data Command
= Test TestOptions
| ExportOpenAPISpec
| ExportData ExportDataConfig
data TestConfig = TestConfig
{ _tcTableNamePrefix :: [Text],
@@ -45,6 +48,17 @@ data TestOptions = TestOptions
_toExportMatchStrings :: Bool
}
data ExportDataConfig = ExportDataConfig
{ _edcDirectory :: FilePath,
_edcFormat :: ExportFormat,
_edcDateTimeFormat :: Maybe String
}
data ExportFormat
= JSON
| JSONLines
deriving (Eq, Show, Read)
parseCommandLine :: IO Command
parseCommandLine =
execParser $
@@ -69,7 +83,7 @@ version =
commandParser :: Parser Command
commandParser =
subparser
(testCommand <> exportOpenApiSpecCommand)
(testCommand <> exportOpenApiSpecCommand <> exportData)
where
testCommand =
command
@@ -85,6 +99,13 @@ commandParser =
(helper <*> pure ExportOpenAPISpec)
(progDesc "Exports the OpenAPI specification of the Data Connector API that agents must implement")
)
exportData =
command
"export-data"
( info
(helper <*> (ExportData <$> exportDataConfigParser))
(progDesc "Exports the Chinook dataset to files in the specified directory")
)
testConfigParser :: Parser TestConfig
testConfigParser =
@@ -167,6 +188,30 @@ testOptionsParser =
testCommandParser :: Parser Command
testCommandParser = Test <$> testOptionsParser
exportDataConfigParser :: Parser ExportDataConfig
exportDataConfigParser =
ExportDataConfig
<$> strOption
( long "directory"
<> short 'd'
<> metavar "DIR"
<> help "The directory to export the data files into"
)
<*> option
auto
( long "format"
<> short 'f'
<> metavar "FORMAT"
<> help "The format to export (JSON or JSONLines)"
)
<*> optional
( strOption
( long "datetime-format"
<> metavar "FORMAT"
<> help "Format string to use when formatting DateTime columns (use format syntax from https://hackage.haskell.org/package/time-1.12.2/docs/Data-Time-Format.html#v:formatTime)"
)
)
baseUrl :: ReadM BaseUrl
baseUrl = eitherReader $ left show . parseBaseUrl

View File

@@ -18,6 +18,7 @@ import Servant.API (NamedRoutes)
import Servant.Client (Client, ClientError, hoistClient, mkClientEnv, runClientM)
import Test.CapabilitiesSpec qualified
import Test.Data (TestData, guardedCapabilities, mkTestData)
import Test.DataExport (exportData)
import Test.ErrorSpec qualified
import Test.ExplainSpec qualified
import Test.HealthSpec qualified
@@ -62,6 +63,8 @@ main = do
traverse_ ((traverse_ putStrLn) . (foldPaths . extractLabels)) tree
ExportOpenAPISpec ->
Text.putStrLn $ encodeToLazyText openApiSchema
ExportData config ->
exportData config
pure ()

View File

@@ -3,7 +3,7 @@ This test suite provides a set of tests that is able to test any Data Connector
Not all tests will be appropriate for all agents. Agents self-describe their capabilities and only the tests appropriate for those capabilities will be run.
The executable also has the ability to export the OpenAPI spec of the Data Connector agent API so that customers can use that to ensure their agent complies with the API format.
The executable also has the ability to export the OpenAPI spec of the Data Connector agent API so that customers can use that to ensure their agent complies with the API format. In addition, the Chinook data set can be exported to files on disk in various formats.
## How to Use
First, start your Data Connector agent and ensure it is populated with the [Chinook data set](https://github.com/lerocha/chinook-database/). For example, you could start the Reference Agent by following the instructions in [its README](../../dc-agents/reference/README.md).
@@ -29,3 +29,14 @@ To export the OpenAPI spec, you can run this command, and the spec will be written
```
> cabal run test:tests-dc-api -- export-openapi-spec
```
To export the Chinook data set, you can run this command:
```
> cabal run test:tests-dc-api -- export-data -d /tmp/chinook-data -f JSONLines
```
This will export the data into the directory specified by `-d`, in the `JSONLines` format (`-f`): one JSON object per row, separated by newlines. Each table's data is exported into a separate file.
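For illustration, a hypothetical two-row excerpt of the exported `Artist.json` file in this format might look like the following (column names come from the Chinook schema; exact values depend on the dataset):
```
{"ArtistId":1,"Name":"AC/DC"}
{"ArtistId":2,"Name":"Accept"}
```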
If you need to customize how DateTime columns are formatted, use the `--datetime-format` option with a format string using the syntax described [here](https://hackage.haskell.org/package/time-1.12.2/docs/Data-Time-Format.html#v:formatTime). By default, DateTime columns are exported in ISO 8601 format.
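For example, to render DateTime columns as `2003-09-14 16:30:00`-style values (the format string here is purely illustrative):
```
> cabal run test:tests-dc-api -- export-data -d /tmp/chinook-data -f JSONLines --datetime-format "%Y-%m-%d %H:%M:%S"
```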
The other format supported by `-f` is `JSON`, which writes each file as a single JSON array of row objects.
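With `-f JSON`, the same hypothetical rows would instead be written as a single array:
```
[{"ArtistId":1,"Name":"AC/DC"},{"ArtistId":2,"Name":"Accept"}]
```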

View File

@@ -5,6 +5,8 @@ module Test.Data
( -- = Test Data
TestData (..),
mkTestData,
schemaTables,
allTableRows,
-- = Utilities
emptyQuery,
sortBy,
@@ -274,6 +276,34 @@ genresTableRelationships =
]
)
playlistsTableName :: API.TableName
playlistsTableName = mkTableName "Playlist"
playlistsRows :: [HashMap API.FieldName API.FieldValue]
playlistsRows = sortBy (API.FieldName "PlaylistId") $ readTableFromXmlIntoRows playlistsTableName
playlistTracksTableName :: API.TableName
playlistTracksTableName = mkTableName "PlaylistTrack"
playlistTracksRows :: [HashMap API.FieldName API.FieldValue]
playlistTracksRows = sortOn (\r -> (r ^? ix (API.FieldName "PlaylistId"), r ^? ix (API.FieldName "TrackId"))) $ readTableFromXmlIntoRows playlistTracksTableName
allTableRows :: HashMap API.TableName ([HashMap API.FieldName API.FieldValue])
allTableRows =
HashMap.fromList
[ (artistsTableName, artistsRows),
(albumsTableName, albumsRows),
(customersTableName, customersRows),
(employeesTableName, employeesRows),
(genresTableName, genresRows),
(invoicesTableName, invoicesRows),
(invoiceLinesTableName, invoiceLinesRows),
(mediaTypesTableName, mediaTypesRows),
(playlistsTableName, playlistsRows),
(playlistTracksTableName, playlistTracksRows),
(tracksTableName, tracksRows)
]
data TestData = TestData
{ -- = Schema
_tdSchemaTables :: [API.TableInfo],

View File

@@ -0,0 +1,72 @@
module Test.DataExport
( exportData,
)
where
import Command (ExportDataConfig (..), ExportFormat (..))
import Control.Lens ((&))
import Data.Aeson qualified as J
import Data.ByteString.Lazy qualified as BSL
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.List.NonEmpty qualified as NonEmpty
import Data.Maybe (fromMaybe)
import Data.Text qualified as Text
import Data.Time (ZonedTime, defaultTimeLocale)
import Data.Time.Format (formatTime)
import Data.Time.Format.ISO8601 (iso8601ParseM)
import Hasura.Backends.DataConnector.API
import System.Directory qualified as Directory
import System.FilePath ((<.>), (</>))
import Test.Data qualified as Data
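-- | Exports the Chinook dataset to the directory given in the 'ExportDataConfig', writing one file per table in either JSON or JSON Lines format.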
exportData :: ExportDataConfig -> IO ()
exportData ExportDataConfig {..} = do
absDirectory <- Directory.makeAbsolute _edcDirectory
Directory.createDirectoryIfMissing True absDirectory
putStrLn $ "Exporting to " <> absDirectory
let convertTable' = case _edcFormat of
JSON -> convertTable ".json" convertTableToJSON
JSONLines -> convertTable ".json" convertTableToJSONLines
Data.schemaTables
& fmap (\tableInfo@TableInfo {..} -> (tableInfo,) <$> fromMaybe [] $ HashMap.lookup _tiName Data.allTableRows)
& mapM_ \(tableInfo@TableInfo {..}, rows) -> do
let rows' = maybe rows (\formatString -> formatDateColumnsInRow formatString tableInfo <$> rows) _edcDateTimeFormat
let (filename, contents) = convertTable' _tiName rows'
let destFile = absDirectory </> filename
BSL.writeFile destFile contents
putStrLn $ "Exported " <> filename
convertTable :: String -> ([HashMap FieldName FieldValue] -> BSL.ByteString) -> TableName -> [HashMap FieldName FieldValue] -> (FilePath, BSL.ByteString)
convertTable extension convertRows tableName rows =
(filename, fileContents)
where
filename = (Text.unpack . NonEmpty.last $ unTableName tableName) <.> extension
fileContents = convertRows rows
convertTableToJSON :: [HashMap FieldName FieldValue] -> BSL.ByteString
convertTableToJSON rows =
J.encode $ J.toJSON rows
convertTableToJSONLines :: [HashMap FieldName FieldValue] -> BSL.ByteString
convertTableToJSONLines rows =
BSL.intercalate "\n" $ J.encode . J.toJSON <$> rows
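-- | Re-renders the values of columns whose scalar type is the custom "DateTime" type, parsing each value as an ISO8601 'ZonedTime' and formatting it with the given format string. Values that fail to parse are left unchanged.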
formatDateColumnsInRow :: String -> TableInfo -> HashMap FieldName FieldValue -> HashMap FieldName FieldValue
formatDateColumnsInRow dateTimeFormatString TableInfo {..} row =
row
& HashMap.mapWithKey
( \fieldName fieldValue ->
if fieldName `elem` dateFields
then fromMaybe fieldValue $ tryFormatDate fieldValue
else fieldValue
)
where
dateFields = fmap (\ColumnInfo {..} -> FieldName $ unColumnName _ciName) $ filter (\ColumnInfo {..} -> _ciType == dateTimeScalarType) _tiColumns
dateTimeScalarType = CustomTy "DateTime"
tryFormatDate fieldValue = case deserializeAsColumnFieldValue fieldValue of
J.String value -> do
(zonedTime :: ZonedTime) <- iso8601ParseM $ Text.unpack value
let formattedString = formatTime defaultTimeLocale dateTimeFormatString zonedTime
Just . mkColumnFieldValue . J.String . Text.pack $ formattedString
_ -> Nothing