rustshlex: add external source code for posix-style parsing

Summary: Very small library (one file) that allows for POSIX-style splitting. The library was not vendored in third-party, so its source was added directly to unblock as fast as possible. Reviewed By: quark-zju Differential Revision: D15911319 fbshipit-source-id: 2820d5beb5b3493a507f00f4b94e93b0405cf991
2024-10-07 15:27:13 +03:00 · 2019-07-20 01:01:33 -07:00 · 2019-07-20 01:01:33 -07:00 · 137edc1814
commit 137edc1814
parent 5f02e5cd5c
3 changed files with 290 additions and 0 deletions
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@ -27,6 +27,7 @@ members = [
    "procinfo",
    "radixbuf",
    "revisionstore",
+    "shlex",
    "treestate",
    "types",
    "url-ext",
--- a/lib/shlex/Cargo.toml
+++ b/lib/shlex/Cargo.toml
@ -0,0 +1,9 @@
+[package]
+name = "shlex"
+version = "0.1.1"
+authors = ["comex <comexk@gmail.com>"]
+license = "MIT/Apache-2.0"
+repository = "https://github.com/comex/rust-shlex"
+description = """
+Split a string into shell words, like Python's shlex.
+"""
--- a/lib/shlex/src/lib.rs
+++ b/lib/shlex/src/lib.rs
@ -0,0 +1,280 @@
+// Copyright 2015 Nicholas Allegra (comex).
+// Licensed under the Apache License, Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0> or
+// the MIT license <http://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+//! Same idea as (but implementation not directly based on) the Python shlex module.  However, this
+//! implementation does not support any of the Python module's customization because it makes
+//! parsing slower and is fairly useless.  You only get the default settings of shlex.split, which
+//! mimic the POSIX shell:
+//! http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
+//!
+//! This implementation also deviates from the Python version in not treating \r specially, which I
+//! believe is more compliant.
+//!
+//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
+//! directly as a micro-optimization.
+
+use std::borrow::Cow;
+
+/// An iterator that takes an input string and splits it into the words using the same syntax as
+/// the POSIX shell.
+///
+/// Each item is one word as an owned `String`, with quoting and backslash escapes already
+/// resolved.
+pub struct Shlex<'a> {
+    /// Byte-wise cursor over the part of the input that has not been consumed yet.
+    in_iter: std::str::Bytes<'a>,
+    /// The number of newlines read so far, plus one.
+    pub line_no: usize,
+    /// An input string is erroneous if it ends while inside a quotation or right after an
+    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
+    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
+    /// true; best to check it after you're done iterating.
+    pub had_error: bool,
+}
+
+impl<'a> Shlex<'a> {
+    /// Creates a splitter positioned at the start of `in_str`, with `line_no` at 1 and no error
+    /// recorded.
+    pub fn new(in_str: &'a str) -> Self {
+        Shlex {
+            had_error: false,
+            line_no: 1,
+            in_iter: in_str.bytes(),
+        }
+    }
+
+    /// Reads the rest of a word whose first byte, `ch`, the caller has already consumed.
+    ///
+    /// Returns the word with quoting and escapes resolved.  If the input ends inside a quotation
+    /// or right after a backslash, sets `had_error` and returns `None`.
+    fn parse_word(&mut self, mut ch: u8) -> Option<String> {
+        let mut word: Vec<u8> = Vec::new();
+        loop {
+            let outcome = match ch {
+                b'"' => self.parse_double(&mut word),
+                b'\'' => self.parse_single(&mut word),
+                b'\\' => match self.next_char() {
+                    // \<newline> => nothing (line continuation)
+                    Some(b'\n') => Ok(()),
+                    // \x => x
+                    Some(escaped) => {
+                        word.push(escaped);
+                        Ok(())
+                    }
+                    // the input ended right after the backslash
+                    None => Err(()),
+                },
+                // unquoted whitespace terminates the word
+                b' ' | b'\t' | b'\n' => break,
+                plain => {
+                    word.push(plain);
+                    Ok(())
+                }
+            };
+            if outcome.is_err() {
+                self.had_error = true;
+                return None;
+            }
+            match self.next_char() {
+                Some(following) => ch = following,
+                None => break,
+            }
+        }
+        // SAFETY: `word` holds the bytes of a `&str` from which only ASCII bytes (quotes,
+        // backslashes, newlines) were dropped or into which only ASCII backslashes were
+        // inserted, and words are cut only at ASCII whitespace.  Multi-byte UTF-8 sequences
+        // consist solely of non-ASCII bytes, so each one is carried over intact.
+        Some(unsafe { String::from_utf8_unchecked(word) })
+    }
+
+    /// Consumes a double-quoted section up to and including its closing `"`, appending the
+    /// decoded bytes to `result`.  The opening quote must already have been consumed.
+    ///
+    /// Returns `Err(())` if the input ends before the closing quote is found.
+    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+        loop {
+            match self.next_char().ok_or(())? {
+                b'"' => return Ok(()),
+                b'\\' => {
+                    let escaped = self.next_char().ok_or(())?;
+                    match escaped {
+                        // \<newline> => nothing
+                        b'\n' => {}
+                        // \$ => $
+                        b'$' | b'`' | b'"' | b'\\' => result.push(escaped),
+                        // \x => \x
+                        _ => {
+                            result.push(b'\\');
+                            result.push(escaped);
+                        }
+                    }
+                }
+                literal => result.push(literal),
+            }
+        }
+    }
+
+    /// Consumes a single-quoted section up to and including its closing `'`, appending the
+    /// decoded bytes to `result`.  The opening quote must already have been consumed.
+    ///
+    /// Note that `\'` and `\\` are treated as escapes here; any other backslash is kept
+    /// literally together with the byte that follows it.
+    ///
+    /// Returns `Err(())` if the input ends before the closing quote is found.
+    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+        loop {
+            match self.next_char().ok_or(())? {
+                b'\'' => return Ok(()),
+                b'\\' => {
+                    let escaped = self.next_char().ok_or(())?;
+                    match escaped {
+                        // for single quotes, only these can be escaped
+                        b'\'' | b'\\' => {}
+                        // anything else keeps its backslash
+                        _ => result.push(b'\\'),
+                    }
+                    result.push(escaped);
+                }
+                literal => result.push(literal),
+            }
+        }
+    }
+
+    /// Pulls the next input byte, bumping `line_no` whenever that byte is a newline.
+    fn next_char(&mut self) -> Option<u8> {
+        let byte = self.in_iter.next()?;
+        if byte == b'\n' {
+            self.line_no += 1;
+        }
+        Some(byte)
+    }
+}
+
+impl<'a> Iterator for Shlex<'a> {
+    type Item = String;
+
+    /// Yields the next word, or `None` once the input is exhausted or an error was hit (in
+    /// which case `had_error` is set by the word parser).
+    ///
+    /// Whitespace before a word is skipped.  A `#` at the position where a word would start
+    /// begins a comment that runs through the end of the line.
+    fn next(&mut self) -> Option<String> {
+        loop {
+            // running out of input while looking for a word simply ends the iteration
+            let first = self.next_char()?;
+            match first {
+                b' ' | b'\t' | b'\n' => continue,
+                b'#' => {
+                    // throw away the comment, including the newline that ends it
+                    while let Some(skipped) = self.next_char() {
+                        if skipped == b'\n' {
+                            break;
+                        }
+                    }
+                }
+                _ => return self.parse_word(first),
+            }
+        }
+    }
+}
+
+/// Splits the whole of `in_str` into words in one call.
+///
+/// Returns `None` if the input was erroneous, i.e. it ended inside a quotation or right after an
+/// unescaped backslash.
+pub fn split(in_str: &str) -> Option<Vec<String>> {
+    let mut lexer = Shlex::new(in_str);
+    let mut words: Vec<String> = Vec::new();
+    // Borrow the lexer mutably so its error flag can still be inspected afterwards.
+    words.extend(&mut lexer);
+    if !lexer.had_error {
+        Some(words)
+    } else {
+        None
+    }
+}
+
+/// Encodes a single word so that it can be used as one shell argument.
+///
+/// The empty string becomes `""`.  A word containing none of the characters listed below is
+/// returned unchanged (borrowed).  Any other word is wrapped in double quotes, with `$`, `` ` ``,
+/// `"` and `\` each preceded by a backslash.
+pub fn quote(in_str: &str) -> Cow<str> {
+    if in_str.is_empty() {
+        return "\"\"".into();
+    }
+
+    let is_plain = in_str.bytes().all(|b| match b {
+        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\''
+        | b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%' => false,
+        _ => true,
+    });
+    if is_plain {
+        return in_str.into();
+    }
+
+    // Two extra bytes for the surrounding quotes; escapes may still grow the buffer.
+    let mut quoted = String::with_capacity(in_str.len() + 2);
+    quoted.push('"');
+    for c in in_str.chars() {
+        match c {
+            // these keep their special meaning inside double quotes, so escape them
+            '$' | '`' | '"' | '\\' => quoted.push('\\'),
+            _ => {}
+        }
+        quoted.push(c);
+    }
+    quoted.push('"');
+    quoted.into()
+}
+
+// Table of `(input, expected)` pairs for `split`; `None` marks input that must be rejected.
+#[cfg(test)]
+static SPLIT_TEST_ITEMS: &[(&str, Option<&[&str]>)] = &[
+    // plain words and whitespace
+    ("foo$baz", Some(&["foo$baz"])),
+    ("foo baz", Some(&["foo", "baz"])),
+    // quotes glue onto the surrounding word
+    ("foo\"bar\"baz", Some(&["foobarbaz"])),
+    ("foo \"bar\"baz", Some(&["foo", "barbaz"])),
+    ("   foo \nbar", Some(&["foo", "bar"])),
+    // backslash-newline disappears, both bare and inside double quotes
+    ("foo\\\nbar", Some(&["foobar"])),
+    ("\"foo\\\nbar\"", Some(&["foobar"])),
+    // backslashes inside single quotes
+    ("'baz\\$b'", Some(&["baz\\$b"])),
+    ("'baz\\\''", Some(&["baz\'"])),
+    // input ending after a backslash or inside a quotation is an error
+    ("\\", None),
+    ("\"\\", None),
+    ("'\\", None),
+    ("\"", None),
+    ("'", None),
+    // `#` starts a comment only where a word would start
+    ("foo #bar\nbaz", Some(&["foo", "baz"])),
+    ("foo #bar", Some(&["foo"])),
+    ("foo#bar", Some(&["foo#bar"])),
+    ("foo\"#bar", None),
+];
+
+// Runs every entry of SPLIT_TEST_ITEMS through `split` and compares against the expectation.
+#[test]
+fn test_split() {
+    for &(input, expected) in SPLIT_TEST_ITEMS {
+        // Convert the borrowed expectation into the owned shape `split` returns.
+        let want: Option<Vec<String>> = match expected {
+            Some(words) => Some(words.iter().map(|word| word.to_string()).collect()),
+            None => None,
+        };
+        assert_eq!(split(input), want);
+    }
+}
+
+// Checks that `line_no` counts the newlines consumed so far: both newlines of the input have
+// been read by the time "bar" is yielded, so the counter must read 3.
+#[test]
+fn test_lineno() {
+    let mut sh = Shlex::new("\nfoo\nbar");
+    // Guards against the check below being skipped silently if "bar" is never produced.
+    let mut saw_bar = false;
+    while let Some(word) = sh.next() {
+        if word == "bar" {
+            saw_bar = true;
+            assert_eq!(sh.line_no, 3);
+        }
+    }
+    assert!(saw_bar, "expected the word \"bar\" to be yielded");
+    assert!(!sh.had_error);
+}
+
+// Spot-checks `quote`: a plain word, a word needing quotes, an embedded quote, the empty string.
+#[test]
+fn test_quote() {
+    let cases = [
+        ("foobar", "foobar"),
+        ("foo bar", "\"foo bar\""),
+        ("\"", "\"\\\"\""),
+        ("", "\"\""),
+    ];
+    for &(input, expected) in cases.iter() {
+        assert_eq!(quote(input), expected);
+    }
+}