/* * Copyright (c) 2023, Tim Schumacher * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include namespace Compress { ErrorOr> Lzma2Decompressor::create_from_raw_stream(MaybeOwned stream, u32 dictionary_size) { auto dictionary = TRY(CircularBuffer::create_empty(dictionary_size)); auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) Lzma2Decompressor(move(stream), move(dictionary)))); return decompressor; } Lzma2Decompressor::Lzma2Decompressor(MaybeOwned stream, CircularBuffer dictionary) : m_stream(move(stream)) , m_dictionary(move(dictionary)) { } ErrorOr Lzma2Decompressor::read_some(Bytes bytes) { if (!m_current_chunk_stream.has_value() || (*m_current_chunk_stream)->is_eof()) { // "LZMA2 data consists of packets starting with a control byte, with the following values:" auto const control_byte = TRY(m_stream->read_value()); if (control_byte == 0) { // " - 0 denotes the end of the file" m_found_end_of_stream = true; return bytes.trim(0); } if (control_byte == 1) { // " - 1 denotes a dictionary reset followed by an uncompressed chunk" m_dictionary.clear(); m_dictionary_initialized = true; // The XZ utils test files (bad-1-lzma2-8.xz) check that the decompressor // requires a new set of properties after a dictionary reset. m_last_lzma_options = {}; } if (control_byte == 1 || control_byte == 2) { // " - 2 denotes an uncompressed chunk without a dictionary reset" if (!m_dictionary_initialized) return Error::from_string_literal("LZMA2 stream uses dictionary without ever resetting it"); // "Uncompressed chunks consist of: // - A 16-bit big-endian value encoding the data size minus one // - The data to be copied verbatim into the dictionary and the output" u32 data_size = TRY(m_stream->read_value>()) + 1; m_in_uncompressed_chunk = true; m_current_chunk_stream = TRY(try_make(MaybeOwned { *m_stream }, data_size)); } if (3 <= control_byte && control_byte <= 0x7f) { // " - 3-0x7f are invalid values" return Error::from_string_literal("Invalid control byte in LZMA2 stream"); } if (0x80 <= control_byte) { // " - 0x80-0xff denotes an LZMA chunk, where the lowest 5 bits are used as bit 16-20 // of the uncompressed size minus one, and bit 5-6 indicates what should be reset." auto encoded_uncompressed_size_high = control_byte & 0b11111; auto reset_indicator = (control_byte & 0b1100000) >> 5; // "LZMA chunks consist of: // - A 16-bit big-endian value encoding the low 16-bits of the uncompressed size minus one // - A 16-bit big-endian value encoding the compressed size minus one // - A properties/lclppb byte if bit 6 in the control byte is set // - The LZMA compressed data, starting with the 5 bytes (of which the first is ignored) // used to initialize the range coder (which are included in the compressed size)" u16 encoded_uncompressed_size_low = TRY(m_stream->read_value>()); u16 encoded_compressed_size = TRY(m_stream->read_value>()); u64 uncompressed_size = ((encoded_uncompressed_size_high << 16) | encoded_uncompressed_size_low) + 1; u32 compressed_size = encoded_compressed_size + 1; m_current_chunk_stream = TRY(try_make(MaybeOwned { *m_stream }, compressed_size)); // "Bits 5-6 for LZMA chunks can be:" switch (reset_indicator) { case 3: { // " - 3: state reset, properties reset using properties byte, dictionary reset" m_dictionary.clear(); m_dictionary_initialized = true; [[fallthrough]]; } case 2: { // " - 2: state reset, properties reset using properties byte" // Update the stored LZMA options with the new settings, the stream will be recreated later. auto encoded_properties = TRY(m_stream->read_value()); auto properties = TRY(LzmaHeader::decode_model_properties(encoded_properties)); auto dictionary_size = m_dictionary.capacity(); VERIFY(dictionary_size <= NumericLimits::max()); m_last_lzma_options = LzmaDecompressorOptions { .literal_context_bits = properties.literal_context_bits, .literal_position_bits = properties.literal_position_bits, .position_bits = properties.position_bits, .dictionary_size = static_cast(dictionary_size), .uncompressed_size = uncompressed_size, // Note: This is not specified anywhere. However, it is apparently tested by bad-1-lzma2-7.xz from the XZ utils test files. .reject_end_of_stream_marker = true, }; [[fallthrough]]; } case 1: { // " - 1: state reset" if (!m_last_lzma_options.has_value()) return Error::from_string_literal("LZMA2 stream contains LZMA chunk without settings"); if (!m_dictionary_initialized) return Error::from_string_literal("LZMA2 stream uses dictionary without ever resetting it"); m_last_lzma_options->uncompressed_size = uncompressed_size; m_last_lzma_stream = TRY(LzmaDecompressor::create_from_raw_stream(m_current_chunk_stream.release_value(), *m_last_lzma_options, MaybeOwned { m_dictionary })); break; } case 0: { // " - 0: nothing reset" if (!m_last_lzma_stream.has_value()) return Error::from_string_literal("LZMA2 stream contains no-reset LZMA chunk without previous state"); if (!m_dictionary_initialized) return Error::from_string_literal("LZMA2 stream uses dictionary without ever resetting it"); TRY((*m_last_lzma_stream)->append_input_stream(m_current_chunk_stream.release_value(), uncompressed_size)); break; } } m_in_uncompressed_chunk = false; m_current_chunk_stream = MaybeOwned { **m_last_lzma_stream }; } } auto result = TRY((*m_current_chunk_stream)->read_some(bytes)); // For an uncompressed block we are reading directly from the input stream, // so we need to capture the 'uncompressed' data into the dictionary manually. // Since we only care about having the correct value in the seekback buffer, // we can also immediately discard the written data and only ever have to write // the last bytes into it. if (m_in_uncompressed_chunk) { VERIFY(m_dictionary.used_space() == 0); auto relevant_data = result; if (relevant_data.size() > m_dictionary.capacity()) relevant_data = relevant_data.slice(relevant_data.size() - m_dictionary.capacity(), relevant_data.size()); auto written_bytes = m_dictionary.write(relevant_data); VERIFY(written_bytes == relevant_data.size()); MUST(m_dictionary.discard(written_bytes)); } return result; } ErrorOr Lzma2Decompressor::write_some(ReadonlyBytes) { return Error::from_errno(EBADF); } bool Lzma2Decompressor::is_eof() const { return m_found_end_of_stream; } bool Lzma2Decompressor::is_open() const { return true; } void Lzma2Decompressor::close() { } }