Generate .shard_metadata file in cron job shard (#814)

Co-authored-by: Azeem Shaikh <azeems@google.com>
This commit is contained in:
Azeem Shaikh 2021-08-06 11:07:42 -07:00 committed by GitHub
parent d58fd2d927
commit 7f71928daa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 254 additions and 7 deletions

View File

@ -104,9 +104,11 @@ build: ## Build all binaries and images in the repo.
build: $(build-targets)
build-proto: ## Compiles and generates all required protobufs
build-proto: cron/data/request.pb.go
build-proto: cron/data/request.pb.go cron/data/metadata.pb.go
cron/data/request.pb.go: cron/data/request.proto | $(PROTOC)
protoc --go_out=../../../ cron/data/request.proto
cron/data/metadata.pb.go: cron/data/metadata.proto | $(PROTOC)
protoc --go_out=../../../ cron/data/metadata.proto
generate-docs: ## Generates docs
generate-docs: docs/checks.md
@ -152,8 +154,9 @@ dockerbuild: ## Runs docker build
# Build all Docker images in the Repo
$(call ndef, GITHUB_AUTH_TOKEN)
DOCKER_BUILDKIT=1 docker build . --file Dockerfile --tag $(IMAGE_NAME)
DOCKER_BUILDKIT=1 docker build . --file cron/controller/Dockerfile --tag $(IMAGE_NAME)-batch-controller
DOCKER_BUILDKIT=1 docker build . --file cron/worker/Dockerfile --tag $(IMAGE_NAME)-batch-worker
DOCKER_BUILDKIT=1 docker build . --file cron/controller/Dockerfile \
--build-arg=COMMIT_SHA=$(GIT_HASH) --tag $(IMAGE_NAME)-batch-controller
DOCKER_BUILDKIT=1 docker build . --file cron/worker/Dockerfile --tag $(IMAGE_NAME)-batch-worker
DOCKER_BUILDKIT=1 docker build . --file cron/bq/Dockerfile --tag $(IMAGE_NAME)-bq-transfer
###############################################################################

View File

@ -73,6 +73,9 @@ func getBucketSummary(ctx context.Context, bucketURL string) (*bucketSummary, er
summary.getOrCreate(creationTime).shardsCreated++
case filename == config.TransferStatusFilename:
summary.getOrCreate(creationTime).isTransferred = true
case filename == config.ShardMetadataFilename:
// TODO(azeems): Handle shard_metadata file.
continue
default:
// nolint: goerr113
return nil, fmt.Errorf("found unrecognized file: %s", key)

View File

@ -15,6 +15,7 @@
steps:
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '.',
'--build-arg', 'COMMIT_SHA=$COMMIT_SHA',
'-t', 'gcr.io/openssf/scorecard-batch-controller:$COMMIT_SHA',
'-t', 'gcr.io/openssf/scorecard-batch-controller:latest',
'-f', 'cron/controller/Dockerfile']

View File

@ -29,6 +29,8 @@ import (
)
const (
// ShardMetadataFilename file contains metadata for the created shard.
ShardMetadataFilename string = ".shard_metadata"
// ShardNumFilename is the name of the file that stores the number of shards.
ShardNumFilename string = ".shard_num"
// TransferStatusFilename file identifies if shard transfer to BigQuery is completed.

View File

@ -26,6 +26,8 @@ ARG TARGETARCH
RUN CGO_ENABLED=0 make build-pubsub
FROM gcr.io/distroless/base:nonroot@sha256:bc84925113289d139a9ef2f309f0dd7ac46ea7b786f172ba9084ffdb4cbd9490
ARG COMMIT_SHA
ENV SCORECARD_COMMIT_SHA=${COMMIT_SHA}
COPY ./cron/data/projects*csv cron/data/
COPY --from=pubsub /src/cron/controller/controller cron/controller/controller
ENTRYPOINT ["cron/controller/controller"]

View File

@ -22,6 +22,7 @@ import (
"strconv"
"time"
"google.golang.org/protobuf/encoding/protojson"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/ossf/scorecard/v2/cron/config"
@ -29,6 +30,8 @@ import (
"github.com/ossf/scorecard/v2/cron/pubsub"
)
// commitSHA is the name of the environment variable whose value main records
// as the commit SHA in the `.shard_metadata` file.
const commitSHA = "SCORECARD_COMMIT_SHA"
func publishToRepoRequestTopic(ctx context.Context, iter data.Iterator, datetime time.Time) (int32, error) {
var shardNum int32
request := data.ScorecardBatchRequest{
@ -99,18 +102,37 @@ func main() {
panic(err)
}
shardNum, err := publishToRepoRequestTopic(ctx, reader, t)
if err != nil {
panic(err)
}
bucket, err := config.GetResultDataBucketURL()
if err != nil {
panic(err)
}
shardNum, err := publishToRepoRequestTopic(ctx, reader, t)
if err != nil {
panic(err)
}
// TODO(azeems): Stop populating `.shard_num` file.
err = data.WriteToBlobStore(ctx, bucket,
data.GetShardNumFilename(t),
[]byte(strconv.Itoa(int(shardNum+1))))
if err != nil {
panic(err)
}
// Populate `.shard_metadata` file.
metadata := data.ShardMetadata{
NumShard: new(int32),
ShardLoc: new(string),
CommitSha: new(string),
}
*metadata.NumShard = (shardNum + 1)
*metadata.ShardLoc = bucket + "/" + data.GetBlobFilename("", t)
*metadata.CommitSha = os.Getenv(commitSHA)
metadataJSON, err := protojson.Marshal(&metadata)
if err != nil {
panic(fmt.Errorf("error during protojson.Marshal: %w", err))
}
err = data.WriteToBlobStore(ctx, bucket, data.GetShardMetadataFilename(t), metadataJSON)
if err != nil {
panic(fmt.Errorf("error writing to BlobStore: %w", err))
}
}

View File

@ -130,6 +130,11 @@ func GetTransferStatusFilename(datetime time.Time) string {
return GetBlobFilename(config.TransferStatusFilename, datetime)
}
// GetShardMetadataFilename returns shard_metadata filename for a shard.
func GetShardMetadataFilename(datetime time.Time) string {
return GetBlobFilename(config.ShardMetadataFilename, datetime)
}
// ParseBlobFilename parses a blob key into a Time object.
func ParseBlobFilename(key string) (time.Time, string, error) {
if len(key) < len(filePrefixFormat) {

184
cron/data/metadata.pb.go Normal file
View File

@ -0,0 +1,184 @@
// Copyright 2021 Security Scorecard Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Code generated by protoc-gen-go. DO NOT EDIT.
// versions:
// protoc-gen-go v1.27.1
// protoc v3.15.8
// source: cron/data/metadata.proto
package data
import (
protoreflect "google.golang.org/protobuf/reflect/protoreflect"
protoimpl "google.golang.org/protobuf/runtime/protoimpl"
reflect "reflect"
sync "sync"
)
const (
// Verify that this generated code is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion)
// Verify that runtime/protoimpl is sufficiently up-to-date.
_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
)
// ShardMetadata is the generated Go type for the
// ossf.scorecard.cron.data.ShardMetadata message declared in
// cron/data/metadata.proto.
type ShardMetadata struct {
// state, sizeCache and unknownFields are internal fields emitted by
// protoc-gen-go; they are not part of the message's declared fields.
state protoimpl.MessageState
sizeCache protoimpl.SizeCache
unknownFields protoimpl.UnknownFields
// ShardLoc is proto field 1 (shard_loc). It is a pointer; GetShardLoc
// returns "" when it is nil.
ShardLoc *string `protobuf:"bytes,1,opt,name=shard_loc,json=shardLoc,proto3,oneof" json:"shard_loc,omitempty"`
// NumShard is proto field 2 (num_shard). It is a pointer; GetNumShard
// returns 0 when it is nil.
NumShard *int32 `protobuf:"varint,2,opt,name=num_shard,json=numShard,proto3,oneof" json:"num_shard,omitempty"`
// CommitSha is proto field 3 (commit_sha). It is a pointer; GetCommitSha
// returns "" when it is nil.
CommitSha *string `protobuf:"bytes,3,opt,name=commit_sha,json=commitSha,proto3,oneof" json:"commit_sha,omitempty"`
}
// Reset overwrites *x with an empty ShardMetadata. When
// protoimpl.UnsafeEnabled is true it then stores this type's message info
// back into the message state of x.
func (x *ShardMetadata) Reset() {
*x = ShardMetadata{}
if protoimpl.UnsafeEnabled {
mi := &file_cron_data_metadata_proto_msgTypes[0]
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
ms.StoreMessageInfo(mi)
}
}
// String returns the string form of x as produced by
// protoimpl.X.MessageStringOf.
func (x *ShardMetadata) String() string {
return protoimpl.X.MessageStringOf(x)
}
// ProtoMessage has an empty body.
// NOTE(review): presumably a marker method identifying the type as a
// protobuf message — confirm against the protobuf-go documentation.
func (*ShardMetadata) ProtoMessage() {}
// ProtoReflect returns a protoreflect.Message for x.
//
// When protoimpl.UnsafeEnabled is true and x is non-nil, the message state
// of x is returned, with this type's message info stored into it first if
// none was loaded yet. Otherwise the result is built from the message info
// via MessageOf.
func (x *ShardMetadata) ProtoReflect() protoreflect.Message {
mi := &file_cron_data_metadata_proto_msgTypes[0]
if protoimpl.UnsafeEnabled && x != nil {
ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
if ms.LoadMessageInfo() == nil {
ms.StoreMessageInfo(mi)
}
return ms
}
return mi.MessageOf(x)
}
// Descriptor returns the gzip-compressed raw descriptor of
// cron/data/metadata.proto together with the index path []int{0}.
//
// Deprecated: Use ShardMetadata.ProtoReflect.Descriptor instead.
func (*ShardMetadata) Descriptor() ([]byte, []int) {
return file_cron_data_metadata_proto_rawDescGZIP(), []int{0}
}
// GetShardLoc returns the value pointed to by ShardLoc, or "" when x or
// ShardLoc is nil.
func (x *ShardMetadata) GetShardLoc() string {
if x != nil && x.ShardLoc != nil {
return *x.ShardLoc
}
return ""
}
// GetNumShard returns the value pointed to by NumShard, or 0 when x or
// NumShard is nil.
func (x *ShardMetadata) GetNumShard() int32 {
if x != nil && x.NumShard != nil {
return *x.NumShard
}
return 0
}
// GetCommitSha returns the value pointed to by CommitSha, or "" when x or
// CommitSha is nil.
func (x *ShardMetadata) GetCommitSha() string {
if x != nil && x.CommitSha != nil {
return *x.CommitSha
}
return ""
}
var File_cron_data_metadata_proto protoreflect.FileDescriptor
var file_cron_data_metadata_proto_rawDesc = []byte{
0x0a, 0x18, 0x63, 0x72, 0x6f, 0x6e, 0x2f, 0x64, 0x61, 0x74, 0x61, 0x2f, 0x6d, 0x65, 0x74, 0x61,
0x64, 0x61, 0x74, 0x61, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x18, 0x6f, 0x73, 0x73, 0x66,
0x2e, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x63, 0x61, 0x72, 0x64, 0x2e, 0x63, 0x72, 0x6f, 0x6e, 0x2e,
0x64, 0x61, 0x74, 0x61, 0x22, 0xa2, 0x01, 0x0a, 0x0d, 0x53, 0x68, 0x61, 0x72, 0x64, 0x4d, 0x65,
0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x20, 0x0a, 0x09, 0x73, 0x68, 0x61, 0x72, 0x64, 0x5f,
0x6c, 0x6f, 0x63, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x48, 0x00, 0x52, 0x08, 0x73, 0x68, 0x61,
0x72, 0x64, 0x4c, 0x6f, 0x63, 0x88, 0x01, 0x01, 0x12, 0x20, 0x0a, 0x09, 0x6e, 0x75, 0x6d, 0x5f,
0x73, 0x68, 0x61, 0x72, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x48, 0x01, 0x52, 0x08, 0x6e,
0x75, 0x6d, 0x53, 0x68, 0x61, 0x72, 0x64, 0x88, 0x01, 0x01, 0x12, 0x22, 0x0a, 0x0a, 0x63, 0x6f,
0x6d, 0x6d, 0x69, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x48, 0x02,
0x52, 0x09, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x53, 0x68, 0x61, 0x88, 0x01, 0x01, 0x42, 0x0c,
0x0a, 0x0a, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x64, 0x5f, 0x6c, 0x6f, 0x63, 0x42, 0x0c, 0x0a, 0x0a,
0x5f, 0x6e, 0x75, 0x6d, 0x5f, 0x73, 0x68, 0x61, 0x72, 0x64, 0x42, 0x0d, 0x0a, 0x0b, 0x5f, 0x63,
0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x5f, 0x73, 0x68, 0x61, 0x42, 0x25, 0x5a, 0x23, 0x67, 0x69, 0x74,
0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x6f, 0x73, 0x73, 0x66, 0x2f, 0x73, 0x63, 0x6f,
0x72, 0x65, 0x63, 0x61, 0x72, 0x64, 0x2f, 0x63, 0x72, 0x6f, 0x6e, 0x2f, 0x64, 0x61, 0x74, 0x61,
0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
}
var (
file_cron_data_metadata_proto_rawDescOnce sync.Once
file_cron_data_metadata_proto_rawDescData = file_cron_data_metadata_proto_rawDesc
)
// file_cron_data_metadata_proto_rawDescGZIP returns the raw descriptor data
// after passing it through protoimpl.X.CompressGZIP. The compression is
// wrapped in a sync.Once, so it runs a single time and later calls return the
// stored result.
func file_cron_data_metadata_proto_rawDescGZIP() []byte {
file_cron_data_metadata_proto_rawDescOnce.Do(func() {
file_cron_data_metadata_proto_rawDescData = protoimpl.X.CompressGZIP(file_cron_data_metadata_proto_rawDescData)
})
return file_cron_data_metadata_proto_rawDescData
}
var file_cron_data_metadata_proto_msgTypes = make([]protoimpl.MessageInfo, 1)
var file_cron_data_metadata_proto_goTypes = []interface{}{
(*ShardMetadata)(nil), // 0: ossf.scorecard.cron.data.ShardMetadata
}
var file_cron_data_metadata_proto_depIdxs = []int32{
0, // [0:0] is the sub-list for method output_type
0, // [0:0] is the sub-list for method input_type
0, // [0:0] is the sub-list for extension type_name
0, // [0:0] is the sub-list for extension extendee
0, // [0:0] is the sub-list for field type_name
}
func init() { file_cron_data_metadata_proto_init() }
// file_cron_data_metadata_proto_init builds the file descriptor and message
// type information for cron/data/metadata.proto and stores the resulting file
// descriptor in File_cron_data_metadata_proto.
func file_cron_data_metadata_proto_init() {
// Already initialized: nothing to do.
if File_cron_data_metadata_proto != nil {
return
}
// Without unsafe support, install an exporter that hands out pointers to
// the three internal struct fields by index.
if !protoimpl.UnsafeEnabled {
file_cron_data_metadata_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} {
switch v := v.(*ShardMetadata); i {
case 0:
return &v.state
case 1:
return &v.sizeCache
case 2:
return &v.unknownFields
default:
return nil
}
}
}
file_cron_data_metadata_proto_msgTypes[0].OneofWrappers = []interface{}{}
// x exists only so reflect can report this package's import path.
type x struct{}
out := protoimpl.TypeBuilder{
File: protoimpl.DescBuilder{
GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
RawDescriptor: file_cron_data_metadata_proto_rawDesc,
NumEnums: 0,
NumMessages: 1,
NumExtensions: 0,
NumServices: 0,
},
GoTypes: file_cron_data_metadata_proto_goTypes,
DependencyIndexes: file_cron_data_metadata_proto_depIdxs,
MessageInfos: file_cron_data_metadata_proto_msgTypes,
}.Build()
File_cron_data_metadata_proto = out.File
// The build inputs are cleared once the descriptor has been built.
// NOTE(review): presumably so they can be garbage-collected — confirm.
file_cron_data_metadata_proto_rawDesc = nil
file_cron_data_metadata_proto_goTypes = nil
file_cron_data_metadata_proto_depIdxs = nil
}

25
cron/data/metadata.proto Normal file
View File

@ -0,0 +1,25 @@
// Copyright 2021 Security Scorecard Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package ossf.scorecard.cron.data;
option go_package = "github.com/ossf/scorecard/cron/data";
// ShardMetadata describes a shard created by the cron job. The cron job
// writes it, JSON-encoded, to the shard's `.shard_metadata` file.
message ShardMetadata {
// Location of the shard. The writer sets this to the result data bucket URL,
// a "/", and the blob filename generated for the shard's datetime with an
// empty file name.
optional string shard_loc = 1;
// Number of shards. The writer sets this to the value returned by
// publishToRepoRequestTopic plus one.
// NOTE(review): presumably that value is the zero-based index of the last
// shard — confirm.
optional int32 num_shard = 2;
// Value of the SCORECARD_COMMIT_SHA environment variable at the time the
// metadata was written.
optional string commit_sha = 3;
}