Implement filter arguments support in Mononoke Git server

Summary: This diff implements the capability to parse and accept filter arguments from the git client. A follow-up diff will add actual support for filtering at the server.
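
The parser accepts the standard git partial-clone filter specs, for example (forms exercised by the tests in this diff):

  filter blob:none
  filter blob:limit=5k
  filter tree:3
  filter object:type=blob
  filter combine:blob:none+tree:5+object:type=blob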

Reviewed By: markbt

Differential Revision: D57052433

fbshipit-source-id: 7bedaa398d938f33f18570fe96e36ac3c83df0ed
Rajiv Sharma 2024-05-09 03:37:15 -07:00 committed by Facebook GitHub Bot
parent b457df85e1
commit 7952fc3222
8 changed files with 190 additions and 21 deletions

View File

@@ -23,6 +23,7 @@ git_symbolic_refs = { version = "0.1.0", path = "../../git_symbolic_refs" }
git_types = { version = "0.1.0", path = "../git_types" }
gix-date = "0.7"
gix-hash = "0.11"
gix-object = "0.33"
metaconfig_types = { version = "0.1.0", path = "../../metaconfig/types" }
mononoke_types = { version = "0.1.0", path = "../../mononoke_types" }
packetline = { version = "0.1.0", path = "../packetline" }

View File

@@ -13,6 +13,7 @@ rust_library(
"fbsource//third-party/rust:futures",
"fbsource//third-party/rust:gix-date",
"fbsource//third-party/rust:gix-hash",
"fbsource//third-party/rust:gix-object",
"fbsource//third-party/rust:rustc-hash",
"fbsource//third-party/rust:tokio",
"//common/rust/shed/buffered_weighted:buffered_weighted",

View File

@@ -340,12 +340,20 @@ pub struct FetchRequest {
pub deepen_not: Option<ObjectId>,
/// Request that various objects from the packfile be omitted using
/// one of several filtering techniques
pub filter: Option<String>,
pub filter: Option<FetchFilter>,
/// The concurrency setting to be used for generating the packfile items for the
/// fetch request
pub concurrency: PackfileConcurrency,
}
/// Struct representing the filtering options that can be used during fetch / clone
#[derive(Debug, Clone)]
pub struct FetchFilter {
/// Maximum size in bytes for blobs to be included in the packfile
pub max_blob_size: u64,
/// Maximum depth of trees to be included in the packfile
pub max_tree_depth: u64,
/// Object types to be included in the packfile
pub allowed_object_types: Vec<gix_object::Kind>,
}
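// Illustration (a sketch based on the parsing rules added later in this
// diff): the spec "combine:blob:none+tree:5" would produce
// max_blob_size: 0 ("blob:none" omits all blobs),
// max_tree_depth: 5 ("tree:5" caps tree depth at 5), and
// allowed_object_types: all four kinds (no "object:type=" filter given).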
/// Struct representing the packfile item response generated for the
/// given range of commits
pub struct PackItemStreamResponse<'a> {

View File

@@ -33,6 +33,7 @@ futures_stats = { version = "0.1.0", git = "https://github.com/facebookexperimen
git_symbolic_refs = { version = "0.1.0", path = "../git_symbolic_refs" }
gix-date = "0.7"
gix-hash = "0.11"
gix-object = "0.33"
gix-packetline = { version = "0.17.2", features = ["blocking-io"] }
gix-transport = "0.41"
gotham = "0.7.1"

View File

@@ -16,6 +16,7 @@ rust_binary(
"fbsource//third-party/rust:futures",
"fbsource//third-party/rust:gix-date",
"fbsource//third-party/rust:gix-hash",
"fbsource//third-party/rust:gix-object",
"fbsource//third-party/rust:gix-packetline",
"fbsource//third-party/rust:gix-transport",
"fbsource//third-party/rust:gotham",

View File

@@ -5,11 +5,15 @@
* GNU General Public License version 2.
*/
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use gix_hash::ObjectId;
use gix_object::Kind;
use gix_packetline::PacketLineRef;
use gix_packetline::StreamingPeekableIter;
use gix_transport::bstr::ByteSlice;
use protocol::types::FetchFilter;
use protocol::types::FetchRequest;
use protocol::types::PackfileConcurrency;
@@ -81,7 +85,7 @@ pub struct FetchArgs {
pub deepen_not: Option<ObjectId>,
/// Request that various objects from the packfile be omitted using
/// one of several filtering techniques
pub filter: Option<String>,
pub filter: Option<FilterArgs>,
/// Indicates to the server that the client wants to retrieve a particular set of
/// refs by providing the full name of the ref on the server
pub want_refs: Vec<String>,
@@ -97,7 +101,115 @@ pub struct FetchArgs {
pub wait_for_done: bool,
}
fn parse_oid(data: &[u8], oid_type: &[u8]) -> anyhow::Result<ObjectId> {
/// Argument for filtering objects during clone/fetch
#[derive(Clone, Debug)]
pub struct FilterArgs {
/// Maximum size in bytes for blobs to be included
max_blob_size: u64,
/// Maximum depth for trees to be included
max_tree_depth: u64,
/// Object types to be included
allowed_object_types: Vec<Kind>,
}
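// The defaults amount to "no filtering": unlimited blob size and tree
// depth, with every object type allowed.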
impl Default for FilterArgs {
fn default() -> Self {
Self {
max_blob_size: u64::MAX,
max_tree_depth: u64::MAX,
allowed_object_types: vec![Kind::Blob, Kind::Tree, Kind::Commit, Kind::Tag],
}
}
}
impl FilterArgs {
const COMBINE_PREFIX: &'static str = "combine:";
const OBJECT_TYPE_PREFIX: &'static str = "object:type=";
const BLOB_PREFIX: &'static str = "blob:";
const TREE_PREFIX: &'static str = "tree:";
const NO_BLOBS: &'static str = "none";
const SIZE_LIMIT: &'static str = "limit=";
const FILTER_SPLITTER: &'static str = "+";
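/// Parse a human-readable size such as "100", "5k", "10m" or "1g" into a
/// byte count, e.g. "5k" -> 5 * 1024.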
fn parse_size(size: &str) -> Result<u64> {
const KB_SUFFIX: &str = "k";
const MB_SUFFIX: &str = "m";
const GB_SUFFIX: &str = "g";
let mut multiplier = 1;
let size = if let Some(size_num) = size.strip_suffix(KB_SUFFIX) {
multiplier = 1024;
size_num
} else if let Some(size_num) = size.strip_suffix(MB_SUFFIX) {
multiplier = 1024 * 1024;
size_num
} else if let Some(size_num) = size.strip_suffix(GB_SUFFIX) {
multiplier = 1024 * 1024 * 1024;
size_num
} else {
size
};
size.parse::<u64>()
.map(|size| size * multiplier)
.with_context(|| format!("Invalid blob size {:?}", size))
}
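/// Parse a raw filter spec received from the client: either a single
/// filter like "blob:none" or multiple filters combined as
/// "combine:<filter1>+<filter2>+...".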
fn parse_from_spec(data: String) -> Result<Self> {
let filter_set = if let Some(combined_filters) = data.strip_prefix(Self::COMBINE_PREFIX) {
// There are multiple filters combined together
combined_filters
.split(Self::FILTER_SPLITTER)
.map(String::from)
.collect()
} else {
// There is only one filter
vec![data]
};
let mut filter_args = FilterArgs::default();
let mut allowed_type = None;
for filter in filter_set {
if let Some(object_type) = filter.strip_prefix(Self::OBJECT_TYPE_PREFIX) {
let object_kind = Kind::from_bytes(object_type.as_bytes())
.with_context(|| format!("Invalid object type {:?}", object_type))?;
// Git has this weird behavior: if you specify multiple allowed object types,
// it just honors the first one it comes across. And no, there is no mention
// of it in the docs. Found this out through code reading and trial-and-error :)
if allowed_type.is_none() {
allowed_type = Some(object_kind);
}
}
if let Some(blob_size) = filter.strip_prefix(Self::BLOB_PREFIX) {
if blob_size == Self::NO_BLOBS {
filter_args.max_blob_size = 0;
} else if let Some(blob_limit) = blob_size.strip_prefix(Self::SIZE_LIMIT) {
filter_args.max_blob_size = Self::parse_size(blob_limit)?;
} else {
bail!(
"Invalid blob size {:?} in filter spec {}",
blob_size,
filter
);
}
}
if let Some(tree_depth) = filter.strip_prefix(Self::TREE_PREFIX) {
let max_depth = tree_depth
.parse::<u64>()
.with_context(|| format!("Invalid tree depth {:?}", tree_depth))?;
filter_args.max_tree_depth = max_depth;
}
}
if let Some(allowed_type) = allowed_type {
filter_args.allowed_object_types = vec![allowed_type];
}
Ok(filter_args)
}
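/// Convert the wire-level filter arguments into the protocol-level
/// FetchFilter used when generating the packfile.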
fn into_fetch_filter(self) -> FetchFilter {
FetchFilter {
max_blob_size: self.max_blob_size,
max_tree_depth: self.max_tree_depth,
allowed_object_types: self.allowed_object_types,
}
}
}
fn parse_oid(data: &[u8], oid_type: &[u8]) -> Result<ObjectId> {
ObjectId::from_hex(data).with_context(|| {
format!(
"Invalid {:?}object id {:?} received during fetch request",
@@ -110,11 +222,10 @@ fn bytes_to_str<'a, 'b, 'c>(
bytes: &'a [u8],
bytes_type: &'b str,
arg_type: &'c str,
) -> anyhow::Result<&'a str> {
) -> Result<&'a str> {
std::str::from_utf8(bytes).with_context(|| {
format!(
"Invalid {} bytes {:?} received for {:?} during fetch command args parsing",
bytes_type, arg_type, bytes
"Invalid {bytes_type} bytes {bytes:?} received for {arg_type} during fetch command args parsing",
)
})
}
@@ -133,13 +244,13 @@ impl FetchArgs {
self.filter.is_some()
}
fn validate(&self) -> anyhow::Result<()> {
fn validate(&self) -> Result<()> {
if self.deepen.is_some() && self.deepen_since.is_some() {
anyhow::bail!(
bail!(
"deepen and deepen-since arguments cannot be provided at the same time for fetch command"
)
} else if self.deepen.is_some() && self.deepen_not.is_some() {
anyhow::bail!(
bail!(
"deepen and deepen-not arguments cannot be provided at the same time for fetch command"
)
} else {
@@ -147,7 +258,7 @@ impl FetchArgs {
}
}
pub fn parse_from_packetline(args: &[u8]) -> anyhow::Result<Self> {
pub fn parse_from_packetline(args: &[u8]) -> Result<Self> {
let mut tokens = StreamingPeekableIter::new(args, &[PacketLineRef::Flush], true);
let mut fetch_args = Self::default();
while let Some(token) = tokens.read_line() {
@@ -179,7 +290,7 @@ impl FetchArgs {
fetch_args.deepen_not = Some(parse_oid(oid_depth, DEEPEN_NOT_PREFIX)?);
} else if let Some(filter) = data.strip_prefix(FILTER_PREFIX) {
let filter_spec = bytes_to_str(filter, "filter_spec", "filter")?.to_owned();
fetch_args.filter = Some(filter_spec);
fetch_args.filter = Some(FilterArgs::parse_from_spec(filter_spec)?);
} else if let Some(want_ref) = data.strip_prefix(WANT_REF_PREFIX) {
let want_ref = bytes_to_str(want_ref, "want_ref", "want-ref")?.to_owned();
fetch_args.want_refs.push(want_ref);
@@ -201,14 +312,14 @@ impl FetchArgs {
WAIT_FOR_DONE => fetch_args.wait_for_done = true,
SIDEBAND_ALL => fetch_args.sideband_all = true,
DEEPEN_RELATIVE => fetch_args.deepen_relative = true,
arg => anyhow::bail!(
arg => bail!(
"Unexpected arg {} in fetch command args",
String::from_utf8_lossy(arg)
),
};
}
} else {
anyhow::bail!(
bail!(
"Unexpected token {:?} in packetline during fetch command args parsing",
token
);
@@ -231,7 +342,7 @@ impl FetchArgs {
deepen_since: self.deepen_since,
deepen_not: self.deepen_not,
deepen_relative: self.deepen_relative,
filter: self.filter,
filter: self.filter.map(FilterArgs::into_fetch_filter),
concurrency,
}
}
@@ -241,13 +352,14 @@ impl FetchArgs {
mod tests {
use std::io::Write;
use anyhow::Result;
use gix_packetline::encode::flush_to_write;
use gix_packetline::Writer;
use super::*;
#[test]
fn test_fetch_command_args_parsing() -> anyhow::Result<()> {
fn test_fetch_command_args_parsing() -> Result<()> {
let inner_writer = Vec::new();
let mut packetline_writer = Writer::new(inner_writer);
packetline_writer.write_all(b"thin-pack\n")?;
@@ -266,6 +378,7 @@ mod tests {
packetline_writer.write_all(b"want 1000000000000000000000000000000000000001\n")?;
packetline_writer.write_all(b"have 2000000000000000000000000000000000000002\n")?;
packetline_writer.write_all(b"shallow 1000000000000000000000000000000000000001\n")?;
packetline_writer.write_all(b"filter combine:blob:none+tree:5+object:type=blob+object:type=tree+object:type=commit\n")?;
packetline_writer.write_all(b"done\n")?;
packetline_writer.flush()?;
let mut inner_writer = packetline_writer.into_inner();
@@ -283,11 +396,12 @@ mod tests {
assert_eq!(parsed_args.shallow.len(), 2);
assert_eq!(parsed_args.haves.len(), 3);
assert_eq!(parsed_args.wants.len(), 2);
assert!(parsed_args.filter.is_some());
Ok(())
}
#[test]
fn test_fetch_command_args_validation() -> anyhow::Result<()> {
fn test_fetch_command_args_validation() -> Result<()> {
let inner_writer = Vec::new();
let mut packetline_writer = Writer::new(inner_writer);
packetline_writer.write_all(b"deepen 1\n")?;
@@ -309,7 +423,7 @@ }
}
#[test]
fn test_fetch_command_args_time_parsing() -> anyhow::Result<()> {
fn test_fetch_command_args_time_parsing() -> Result<()> {
let inner_writer = Vec::new();
let mut packetline_writer = Writer::new(inner_writer);
packetline_writer.write_all(b"deepen-since 1979-02-26 18:30:00\n")?;
@@ -327,4 +441,45 @@ mod tests {
assert!(FetchArgs::parse_from_packetline(&inner_writer).is_ok());
Ok(())
}
#[test]
fn test_filter_args_parsing() -> Result<()> {
let raw_input =
"combine:blob:none+tree:5+object:type=blob+object:type=tree+object:type=commit"
.to_string();
let filter_args = FilterArgs::parse_from_spec(raw_input)?;
assert_eq!(
filter_args.allowed_object_types,
vec![Kind::Blob] // Since blob was the first type in the spec, the rest are ignored
);
assert_eq!(filter_args.max_tree_depth, 5);
assert_eq!(filter_args.max_blob_size, 0);
let raw_input = "tree:5".to_string();
let filter_args = FilterArgs::parse_from_spec(raw_input)?;
assert_eq!(filter_args.max_tree_depth, 5);
assert_eq!(filter_args.max_blob_size, u64::MAX);
let raw_input = "object:type=commit".to_string();
let filter_args = FilterArgs::parse_from_spec(raw_input)?;
assert_eq!(filter_args.max_tree_depth, u64::MAX);
assert_eq!(filter_args.max_blob_size, u64::MAX);
assert_eq!(filter_args.allowed_object_types, vec![Kind::Commit]);
let raw_input = "blob:limit=5m".to_string();
let filter_args = FilterArgs::parse_from_spec(raw_input)?;
assert_eq!(filter_args.max_tree_depth, u64::MAX);
assert_eq!(filter_args.max_blob_size, 5 * 1024 * 1024);
let raw_input = "blob:limit=49999".to_string();
let filter_args = FilterArgs::parse_from_spec(raw_input)?;
assert_eq!(filter_args.max_blob_size, 49999);
assert_eq!(filter_args.max_tree_depth, u64::MAX);
assert_eq!(
filter_args.allowed_object_types,
vec![Kind::Blob, Kind::Tree, Kind::Commit, Kind::Tag]
);
Ok(())
}
}
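
Taken together, a minimal usage sketch of the new parsing path (these helpers are private, so this only works from inside the module; error handling elided):

fn example() -> Result<()> {
    // Parse the client-supplied spec into wire-level filter args...
    let args = FilterArgs::parse_from_spec("blob:limit=5k".to_string())?;
    // ...then convert them into the protocol-level FetchFilter.
    let filter = args.into_fetch_filter();
    assert_eq!(filter.max_blob_size, 5 * 1024); // "5k" = 5 * 1024 bytes
    Ok(())
}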

View File

@@ -22,9 +22,7 @@ use crate::model::ServiceType;
const CORE_CAPABILITIES: &[&str] = &[
"ls-refs=unborn",
"fetch=shallow",
"wait-for-done",
"filter",
"fetch=shallow wait-for-done filter",
"ref-in-want",
"object-format=sha1",
];
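// With this change a protocol v2 client sees the fetch features advertised
// on a single capability line, e.g. (illustrative; packetline framing omitted):
// ls-refs=unborn
// fetch=shallow wait-for-done filter
// ref-in-want
// object-format=sha1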

View File

@@ -50,4 +50,8 @@
# Clone the Git repo from Mononoke
$ git_client clone $MONONOKE_GIT_SERVICE_BASE_URL/$REPONAME.git --filter=blob:limit=5k --filter=tree:3 --filter=object:type=blob --filter=object:type=tree --filter=object:type=commit
Cloning into 'repo'...
warning: filtering not recognized by server, ignoring
# Verify that we get the same Git repo back that we started with
$ cd $REPONAME
$ git rev-list --objects --all | git cat-file --batch-check='%(objectname) %(objecttype) %(rest)' | sort > $TESTTMP/new_object_list
$ diff -w $TESTTMP/new_object_list $TESTTMP/object_list