From 750ae371cf83cf07a5b608f219c5657b11bee99d Mon Sep 17 00:00:00 2001 From: 1024jp <1024jp@wolfrosch.com> Date: Tue, 9 Aug 2022 00:17:15 +0900 Subject: [PATCH] Remove ISO-2022-JP detection --- CHANGELOG.md | 1 + CotEditor/Sources/String+Encoding.swift | 37 ++++---------------- CotEditor/en-GB.lproj/Acknowledgments.html | 2 +- CotEditor/en.lproj/Acknowledgments.html | 2 +- CotEditor/ja.lproj/Acknowledgments.html | 2 +- CotEditor/pt.lproj/Acknowledgments.html | 2 +- CotEditor/tr.lproj/Acknowledgments.html | 2 +- CotEditor/zh-Hans.lproj/Acknowledgments.html | 2 +- CotEditor/zh-Hant.lproj/Acknowledgments.html | 2 +- Tests/EncodingDetectionTests.swift | 10 ++++-- 10 files changed, 23 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8e50dbdb..e6f7acd2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ Change Log - Display the error message in the pattern sort dialog if the regular expression pattern is invalid. - Improve the algorithm to parse numbers in the Sort by Pattern command. - Improve the algorithm of uncommenting. +- Improve the algorithm of encoding detection. - Deprecate the “Ignore line endings when counting characters” option. - [trivial] Organize the editor's contextual menu. - [trivial] Improve the basic regular expression syntax reference. diff --git a/CotEditor/Sources/String+Encoding.swift b/CotEditor/Sources/String+Encoding.swift index 1c9a86a24..82dcc14d6 100644 --- a/CotEditor/Sources/String+Encoding.swift +++ b/CotEditor/Sources/String+Encoding.swift @@ -66,18 +66,6 @@ extension Unicode { } -private let ISO2022JPEscapeSequences: [Data] = [ - [0x1B, 0x28, 0x42], // ASCII - [0x1B, 0x28, 0x49], // kana - [0x1B, 0x24, 0x40], // 1978 - [0x1B, 0x24, 0x42], // 1983 - [0x1B, 0x24, 0x28, 0x44], // JISX0212 - ].map { Data($0) } - - -private let maxDetectionLength = 1024 * 8 - - // MARK: - @@ -167,38 +155,27 @@ extension String { init(data: Data, suggestedCFEncodings: [CFStringEncoding], usedEncoding: inout String.Encoding?) throws { // detect encoding from so-called "magic numbers" - // check Unicode's BOM for bom in Unicode.BOM.allCases { guard data.starts(with: bom.sequence), let string = String(bomCapableData: data, encoding: bom.encoding) - else { continue } + else { continue } usedEncoding = bom.encoding self = string return } - // try ISO-2022-JP by checking the existence of typical escape sequences - // -> It's not perfect yet works in most cases. (2016-01) - if data.prefix(maxDetectionLength).contains(0x1B), - ISO2022JPEscapeSequences.contains(where: { data.range(of: $0) != nil }), - let string = String(data: data, encoding: .iso2022JP) - { - usedEncoding = .iso2022JP - self = string - return - } - // try encodings in order from the top of the encoding list for cfEncoding in suggestedCFEncodings { let encoding = String.Encoding(cfEncoding: cfEncoding) + guard + let string = String(data: data, encoding: encoding) + else { continue } - if let string = String(data: data, encoding: encoding) { - usedEncoding = encoding - self = string - return - } + usedEncoding = encoding + self = string + return } throw CocoaError(.fileReadUnknownStringEncoding) diff --git a/CotEditor/en-GB.lproj/Acknowledgments.html b/CotEditor/en-GB.lproj/Acknowledgments.html index 010e08391..9f8238d77 100644 --- a/CotEditor/en-GB.lproj/Acknowledgments.html +++ b/CotEditor/en-GB.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES

Other Information Sources

diff --git a/CotEditor/en.lproj/Acknowledgments.html b/CotEditor/en.lproj/Acknowledgments.html index bbead4288..70be5a597 100644 --- a/CotEditor/en.lproj/Acknowledgments.html +++ b/CotEditor/en.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES

Other Information Sources

diff --git a/CotEditor/ja.lproj/Acknowledgments.html b/CotEditor/ja.lproj/Acknowledgments.html index dac373e16..e8dd3e5a4 100644 --- a/CotEditor/ja.lproj/Acknowledgments.html +++ b/CotEditor/ja.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES

その他の情報ソース

diff --git a/CotEditor/pt.lproj/Acknowledgments.html b/CotEditor/pt.lproj/Acknowledgments.html index 9396ee72a..2a017547a 100644 --- a/CotEditor/pt.lproj/Acknowledgments.html +++ b/CotEditor/pt.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI

Outras Fontes de Informações

diff --git a/CotEditor/tr.lproj/Acknowledgments.html b/CotEditor/tr.lproj/Acknowledgments.html index 91eec528b..2e3f14cd2 100644 --- a/CotEditor/tr.lproj/Acknowledgments.html +++ b/CotEditor/tr.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES

Diğer Bilgi Kaynakları

diff --git a/CotEditor/zh-Hans.lproj/Acknowledgments.html b/CotEditor/zh-Hans.lproj/Acknowledgments.html index e57b5b39a..9dee96b25 100644 --- a/CotEditor/zh-Hans.lproj/Acknowledgments.html +++ b/CotEditor/zh-Hans.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES

其他信息

diff --git a/CotEditor/zh-Hant.lproj/Acknowledgments.html b/CotEditor/zh-Hant.lproj/Acknowledgments.html index 7d0ef5641..f47687174 100644 --- a/CotEditor/zh-Hant.lproj/Acknowledgments.html +++ b/CotEditor/zh-Hant.lproj/Acknowledgments.html @@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES

其他資訊

diff --git a/Tests/EncodingDetectionTests.swift b/Tests/EncodingDetectionTests.swift index 89d25f678..6c3761427 100644 --- a/Tests/EncodingDetectionTests.swift +++ b/Tests/EncodingDetectionTests.swift @@ -9,7 +9,7 @@ // // --------------------------------------------------------------------------- // -// © 2016-2021 1024jp +// © 2016-2022 1024jp // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -70,8 +70,14 @@ final class EncodingDetectionTests: XCTestCase { func testISO2022() throws { + let data = try self.dataForFileName("ISO 2022-JP") + let encodings: [String.Encoding] = [.utf8, .iso2022JP, .utf16] + let cfEncodings = encodings + .map(\.rawValue) + .map(CFStringConvertNSStringEncodingToEncoding) + var encoding: String.Encoding? - let string = try self.encodedStringForFileName("ISO 2022-JP", usedEncoding: &encoding) + let string = try String(data: data, suggestedCFEncodings: cfEncodings, usedEncoding: &encoding) XCTAssertEqual(string, "dog犬") XCTAssertEqual(encoding, .iso2022JP)