Conserve BOM and properly support UTF16 (#6497)

This commit is contained in:
Alexis-Lapierre 2023-05-01 00:40:06 +02:00 committed by GitHub
parent efd09b6c7c
commit b0b3f45b80
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 186 additions and 37 deletions

View File

@ -5111,7 +5111,8 @@ async fn shell_impl_async(
let output = if let Some(mut stdin) = process.stdin.take() {
let input_task = tokio::spawn(async move {
if let Some(input) = input {
helix_view::document::to_writer(&mut stdin, encoding::UTF_8, &input).await?;
helix_view::document::to_writer(&mut stdin, (encoding::UTF_8, false), &input)
.await?;
}
Ok::<_, anyhow::Error>(())
});

View File

@ -407,3 +407,41 @@ async fn test_write_fail_new_path() -> anyhow::Result<()> {
Ok(())
}
#[tokio::test(flavor = "multi_thread")]
async fn test_write_utf_bom_file() -> anyhow::Result<()> {
// "ABC" with utf8 bom
const UTF8_FILE: [u8; 6] = [0xef, 0xbb, 0xbf, b'A', b'B', b'C'];
// "ABC" in UTF16 with bom
const UTF16LE_FILE: [u8; 8] = [0xff, 0xfe, b'A', 0x00, b'B', 0x00, b'C', 0x00];
const UTF16BE_FILE: [u8; 8] = [0xfe, 0xff, 0x00, b'A', 0x00, b'B', 0x00, b'C'];
edit_file_with_content(&UTF8_FILE).await?;
edit_file_with_content(&UTF16LE_FILE).await?;
edit_file_with_content(&UTF16BE_FILE).await?;
Ok(())
}
async fn edit_file_with_content(file_content: &[u8]) -> anyhow::Result<()> {
let mut file = tempfile::NamedTempFile::new()?;
file.as_file_mut().write_all(&file_content)?;
helpers::test_key_sequence(
&mut helpers::AppBuilder::new().build()?,
Some(&format!(":o {}<ret>:x<ret>", file.path().to_string_lossy())),
None,
true,
)
.await?;
file.rewind()?;
let mut new_file_content: Vec<u8> = Vec::new();
file.read_to_end(&mut new_file_content)?;
assert_eq!(file_content, new_file_content);
Ok(())
}

View File

@ -5,6 +5,7 @@ use futures_util::future::BoxFuture;
use futures_util::FutureExt;
use helix_core::auto_pairs::AutoPairs;
use helix_core::doc_formatter::TextFormat;
use helix_core::encoding::Encoding;
use helix_core::syntax::Highlight;
use helix_core::text_annotations::{InlineAnnotation, TextAnnotations};
use helix_core::Range;
@ -130,6 +131,7 @@ pub struct Document {
path: Option<PathBuf>,
encoding: &'static encoding::Encoding,
has_bom: bool,
pub restore_cursor: bool,
@ -277,16 +279,104 @@ impl fmt::Debug for DocumentInlayHintsId {
}
}
enum Encoder {
Utf16Be,
Utf16Le,
EncodingRs(encoding::Encoder),
}
impl Encoder {
fn from_encoding(encoding: &'static encoding::Encoding) -> Self {
if encoding == encoding::UTF_16BE {
Self::Utf16Be
} else if encoding == encoding::UTF_16LE {
Self::Utf16Le
} else {
Self::EncodingRs(encoding.new_encoder())
}
}
fn encode_from_utf8(
&mut self,
src: &str,
dst: &mut [u8],
is_empty: bool,
) -> (encoding::CoderResult, usize, usize) {
if src.is_empty() {
return (encoding::CoderResult::InputEmpty, 0, 0);
}
let mut write_to_buf = |convert: fn(u16) -> [u8; 2]| {
let to_write = src.char_indices().map(|(indice, char)| {
let mut encoded: [u16; 2] = [0, 0];
(
indice,
char.encode_utf16(&mut encoded)
.iter_mut()
.flat_map(|char| convert(*char))
.collect::<Vec<u8>>(),
)
});
let mut total_written = 0usize;
for (indice, utf16_bytes) in to_write {
let character_size = utf16_bytes.len();
if dst.len() <= (total_written + character_size) {
return (encoding::CoderResult::OutputFull, indice, total_written);
}
for character in utf16_bytes {
dst[total_written] = character;
total_written += 1;
}
}
(encoding::CoderResult::InputEmpty, src.len(), total_written)
};
match self {
Self::Utf16Be => write_to_buf(u16::to_be_bytes),
Self::Utf16Le => write_to_buf(u16::to_le_bytes),
Self::EncodingRs(encoder) => {
let (code_result, read, written, ..) = encoder.encode_from_utf8(src, dst, is_empty);
(code_result, read, written)
}
}
}
}
// Apply BOM if encoding permit it, return the number of bytes written at the start of buf
fn apply_bom(encoding: &'static encoding::Encoding, buf: &mut [u8; BUF_SIZE]) -> usize {
if encoding == encoding::UTF_8 {
buf[0] = 0xef;
buf[1] = 0xbb;
buf[2] = 0xbf;
3
} else if encoding == encoding::UTF_16BE {
buf[0] = 0xfe;
buf[1] = 0xff;
2
} else if encoding == encoding::UTF_16LE {
buf[0] = 0xff;
buf[1] = 0xfe;
2
} else {
0
}
}
// The documentation and implementation of this function should be up-to-date with
// its sibling function, `to_writer()`.
//
/// Decodes a stream of bytes into UTF-8, returning a `Rope` and the
/// encoding it was decoded as. The optional `encoding` parameter can
/// be used to override encoding auto-detection.
/// encoding it was decoded as with BOM information. The optional `encoding`
/// parameter can be used to override encoding auto-detection.
pub fn from_reader<R: std::io::Read + ?Sized>(
reader: &mut R,
encoding: Option<&'static encoding::Encoding>,
) -> Result<(Rope, &'static encoding::Encoding), Error> {
encoding: Option<&'static Encoding>,
) -> Result<(Rope, &'static Encoding, bool), Error> {
// These two buffers are 8192 bytes in size each and are used as
// intermediaries during the decoding process. Text read into `buf`
// from `reader` is decoded into `buf_out` as UTF-8. Once either
@ -296,25 +386,32 @@ pub fn from_reader<R: std::io::Read + ?Sized>(
let mut buf_out = [0u8; BUF_SIZE];
let mut builder = RopeBuilder::new();
// By default, the encoding of the text is auto-detected via the
// `chardetng` crate which requires sample data from the reader.
// By default, the encoding of the text is auto-detected by
// `encoding_rs` for_bom, and if it fails, from `chardetng`
// crate which requires sample data from the reader.
// As a manual override to this auto-detection is possible, the
// same data is read into `buf` to ensure symmetry in the upcoming
// loop.
let (encoding, mut decoder, mut slice, mut is_empty) = {
let (encoding, has_bom, mut decoder, mut slice, mut is_empty) = {
let read = reader.read(&mut buf)?;
let is_empty = read == 0;
let encoding = encoding.unwrap_or_else(|| {
let mut encoding_detector = chardetng::EncodingDetector::new();
encoding_detector.feed(&buf, is_empty);
encoding_detector.guess(None, true)
});
let (encoding, has_bom) = encoding
.map(|encoding| (encoding, false))
.or_else(|| {
encoding::Encoding::for_bom(&buf).map(|(encoding, _bom_size)| (encoding, true))
})
.unwrap_or_else(|| {
let mut encoding_detector = chardetng::EncodingDetector::new();
encoding_detector.feed(&buf, is_empty);
(encoding_detector.guess(None, true), false)
});
let decoder = encoding.new_decoder();
// If the amount of bytes read from the reader is less than
// `buf.len()`, it is undesirable to read the bytes afterwards.
let slice = &buf[..read];
(encoding, decoder, slice, is_empty)
(encoding, has_bom, decoder, slice, is_empty)
};
// `RopeBuilder::append()` expects a `&str`, so this is the "real"
@ -382,7 +479,7 @@ pub fn from_reader<R: std::io::Read + ?Sized>(
is_empty = read == 0;
}
let rope = builder.finish();
Ok((rope, encoding))
Ok((rope, encoding, has_bom))
}
// The documentation and implementation of this function should be up-to-date with
@ -393,7 +490,7 @@ pub fn from_reader<R: std::io::Read + ?Sized>(
/// replacement characters may appear in the encoded text.
pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
writer: &'a mut W,
encoding: &'static encoding::Encoding,
encoding_with_bom_info: (&'static Encoding, bool),
rope: &'a Rope,
) -> Result<(), Error> {
// Text inside a `Rope` is stored as non-contiguous blocks of data called
@ -402,13 +499,22 @@ pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
// determined by filtering the iterator to remove all empty chunks and then
// appending an empty chunk to it. This is valuable for detecting when all
// chunks in the `Rope` have been iterated over in the subsequent loop.
let (encoding, has_bom) = encoding_with_bom_info;
let iter = rope
.chunks()
.filter(|c| !c.is_empty())
.chain(std::iter::once(""));
let mut buf = [0u8; BUF_SIZE];
let mut encoder = encoding.new_encoder();
let mut total_written = 0usize;
let mut total_written = if has_bom {
apply_bom(encoding, &mut buf)
} else {
0
};
let mut encoder = Encoder::from_encoding(encoding);
for chunk in iter {
let is_empty = chunk.is_empty();
let mut total_read = 0usize;
@ -449,6 +555,7 @@ pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
break;
}
}
Ok(())
}
@ -466,10 +573,10 @@ use url::Url;
impl Document {
pub fn from(
text: Rope,
encoding: Option<&'static encoding::Encoding>,
encoding_with_bom_info: Option<(&'static Encoding, bool)>,
config: Arc<dyn DynAccess<Config>>,
) -> Self {
let encoding = encoding.unwrap_or(encoding::UTF_8);
let (encoding, has_bom) = encoding_with_bom_info.unwrap_or((encoding::UTF_8, false));
let changes = ChangeSet::new(&text);
let old_state = None;
@ -477,6 +584,7 @@ impl Document {
id: DocumentId::default(),
path: None,
encoding,
has_bom,
text,
selections: HashMap::default(),
inlay_hints: HashMap::default(),
@ -511,21 +619,21 @@ impl Document {
/// overwritten with the `encoding` parameter.
pub fn open(
path: &Path,
encoding: Option<&'static encoding::Encoding>,
encoding: Option<&'static Encoding>,
config_loader: Option<Arc<syntax::Loader>>,
config: Arc<dyn DynAccess<Config>>,
) -> Result<Self, Error> {
// Open the file if it exists, otherwise assume it is a new file (and thus empty).
let (rope, encoding) = if path.exists() {
let (rope, encoding, has_bom) = if path.exists() {
let mut file =
std::fs::File::open(path).context(format!("unable to open {:?}", path))?;
from_reader(&mut file, encoding)?
} else {
let encoding = encoding.unwrap_or(encoding::UTF_8);
(Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding)
(Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding, false)
};
let mut doc = Self::from(rope, Some(encoding), config);
let mut doc = Self::from(rope, Some((encoding, has_bom)), config);
// set the path and try detecting the language
doc.set_path(Some(path))?;
@ -576,7 +684,7 @@ impl Document {
})?;
{
let mut stdin = process.stdin.take().ok_or(FormatterError::BrokenStdin)?;
to_writer(&mut stdin, encoding::UTF_8, &text)
to_writer(&mut stdin, (encoding::UTF_8, false), &text)
.await
.map_err(|_| FormatterError::BrokenStdin)?;
}
@ -688,8 +796,7 @@ impl Document {
let current_rev = self.get_current_revision();
let doc_id = self.id();
let encoding = self.encoding;
let encoding_with_bom_info = (self.encoding, self.has_bom);
let last_saved_time = self.last_saved_time;
// We encode the file according to the `Document`'s encoding.
@ -718,7 +825,7 @@ impl Document {
}
let mut file = File::create(&path).await?;
to_writer(&mut file, encoding, &text).await?;
to_writer(&mut file, encoding_with_bom_info, &text).await?;
let event = DocumentSavedEvent {
revision: current_rev,
@ -776,7 +883,7 @@ impl Document {
provider_registry: &DiffProviderRegistry,
redraw_handle: RedrawHandle,
) -> Result<(), Error> {
let encoding = &self.encoding;
let encoding = self.encoding;
let path = self
.path()
.filter(|path| path.exists())
@ -810,13 +917,16 @@ impl Document {
/// Sets the [`Document`]'s encoding with the encoding correspondent to `label`.
pub fn set_encoding(&mut self, label: &str) -> Result<(), Error> {
self.encoding = encoding::Encoding::for_label(label.as_bytes())
.ok_or_else(|| anyhow!("unknown encoding"))?;
let encoding =
Encoding::for_label(label.as_bytes()).ok_or_else(|| anyhow!("unknown encoding"))?;
self.encoding = encoding;
Ok(())
}
/// Returns the [`Document`]'s current encoding.
pub fn encoding(&self) -> &'static encoding::Encoding {
pub fn encoding(&self) -> &'static Encoding {
self.encoding
}
@ -1280,7 +1390,7 @@ impl Document {
/// Intialize/updates the differ for this document with a new base.
pub fn set_diff_base(&mut self, diff_base: Vec<u8>, redraw_handle: RedrawHandle) {
if let Ok((diff_base, _)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) {
if let Ok((diff_base, ..)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) {
if let Some(differ) = &self.diff_handle {
differ.update_diff_base(diff_base);
return;
@ -1724,7 +1834,7 @@ mod test {
assert!(ref_path.exists());
let mut file = std::fs::File::open(path).unwrap();
let text = from_reader(&mut file, Some(encoding))
let text = from_reader(&mut file, Some(encoding.into()))
.unwrap()
.0
.to_string();
@ -1750,7 +1860,7 @@ mod test {
let text = Rope::from_str(&std::fs::read_to_string(path).unwrap());
let mut buf: Vec<u8> = Vec::new();
helix_lsp::block_on(to_writer(&mut buf, encoding, &text)).unwrap();
helix_lsp::block_on(to_writer(&mut buf, (encoding, false), &text)).unwrap();
let expectation = std::fs::read(ref_path).unwrap();
assert_eq!(buf, expectation);

View File

@ -1305,10 +1305,10 @@ impl Editor {
}
pub fn new_file_from_stdin(&mut self, action: Action) -> Result<DocumentId, Error> {
let (rope, encoding) = crate::document::from_reader(&mut stdin(), None)?;
let (rope, encoding, has_bom) = crate::document::from_reader(&mut stdin(), None)?;
Ok(self.new_file_from_document(
action,
Document::from(rope, Some(encoding), self.config.clone()),
Document::from(rope, Some((encoding, has_bom)), self.config.clone()),
))
}