From 750ae371cf83cf07a5b608f219c5657b11bee99d Mon Sep 17 00:00:00 2001
From: 1024jp <1024jp@wolfrosch.com>
Date: Tue, 9 Aug 2022 00:17:15 +0900
Subject: [PATCH] Remove ISO-2022-JP detection
---
CHANGELOG.md | 1 +
CotEditor/Sources/String+Encoding.swift | 37 ++++----------------
CotEditor/en-GB.lproj/Acknowledgments.html | 2 +-
CotEditor/en.lproj/Acknowledgments.html | 2 +-
CotEditor/ja.lproj/Acknowledgments.html | 2 +-
CotEditor/pt.lproj/Acknowledgments.html | 2 +-
CotEditor/tr.lproj/Acknowledgments.html | 2 +-
CotEditor/zh-Hans.lproj/Acknowledgments.html | 2 +-
CotEditor/zh-Hant.lproj/Acknowledgments.html | 2 +-
Tests/EncodingDetectionTests.swift | 10 ++++--
10 files changed, 23 insertions(+), 39 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d8e50dbdb..e6f7acd2b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,7 @@ Change Log
- Display the error message in the pattern sort dialog if the regular expression pattern is invalid.
- Improve the algorithm to parse numbers in the Sort by Pattern command.
- Improve the algorithm of uncommenting.
+- Improve the algorithm of encoding detection.
- Deprecate the “Ignore line endings when counting characters” option.
- [trivial] Organize the editor's contextual menu.
- [trivial] Improve the basic regular expression syntax reference.
diff --git a/CotEditor/Sources/String+Encoding.swift b/CotEditor/Sources/String+Encoding.swift
index 1c9a86a24..82dcc14d6 100644
--- a/CotEditor/Sources/String+Encoding.swift
+++ b/CotEditor/Sources/String+Encoding.swift
@@ -66,18 +66,6 @@ extension Unicode {
}
-private let ISO2022JPEscapeSequences: [Data] = [
- [0x1B, 0x28, 0x42], // ASCII
- [0x1B, 0x28, 0x49], // kana
- [0x1B, 0x24, 0x40], // 1978
- [0x1B, 0x24, 0x42], // 1983
- [0x1B, 0x24, 0x28, 0x44], // JISX0212
- ].map { Data($0) }
-
-
-private let maxDetectionLength = 1024 * 8
-
-
// MARK: -
@@ -167,38 +155,27 @@ extension String {
init(data: Data, suggestedCFEncodings: [CFStringEncoding], usedEncoding: inout String.Encoding?) throws {
// detect encoding from so-called "magic numbers"
- // check Unicode's BOM
for bom in Unicode.BOM.allCases {
guard
data.starts(with: bom.sequence),
let string = String(bomCapableData: data, encoding: bom.encoding)
- else { continue }
+ else { continue }
usedEncoding = bom.encoding
self = string
return
}
- // try ISO-2022-JP by checking the existence of typical escape sequences
- // -> It's not perfect yet works in most cases. (2016-01)
- if data.prefix(maxDetectionLength).contains(0x1B),
- ISO2022JPEscapeSequences.contains(where: { data.range(of: $0) != nil }),
- let string = String(data: data, encoding: .iso2022JP)
- {
- usedEncoding = .iso2022JP
- self = string
- return
- }
-
// try encodings in order from the top of the encoding list
for cfEncoding in suggestedCFEncodings {
let encoding = String.Encoding(cfEncoding: cfEncoding)
+ guard
+ let string = String(data: data, encoding: encoding)
+ else { continue }
- if let string = String(data: data, encoding: encoding) {
- usedEncoding = encoding
- self = string
- return
- }
+ usedEncoding = encoding
+ self = string
+ return
}
throw CocoaError(.fileReadUnknownStringEncoding)
diff --git a/CotEditor/en-GB.lproj/Acknowledgments.html b/CotEditor/en-GB.lproj/Acknowledgments.html
index 010e08391..9f8238d77 100644
--- a/CotEditor/en-GB.lproj/Acknowledgments.html
+++ b/CotEditor/en-GB.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES
Other Information Sources
diff --git a/CotEditor/en.lproj/Acknowledgments.html b/CotEditor/en.lproj/Acknowledgments.html
index bbead4288..70be5a597 100644
--- a/CotEditor/en.lproj/Acknowledgments.html
+++ b/CotEditor/en.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES
Other Information Sources
diff --git a/CotEditor/ja.lproj/Acknowledgments.html b/CotEditor/ja.lproj/Acknowledgments.html
index dac373e16..e8dd3e5a4 100644
--- a/CotEditor/ja.lproj/Acknowledgments.html
+++ b/CotEditor/ja.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES
diff --git a/CotEditor/pt.lproj/Acknowledgments.html b/CotEditor/pt.lproj/Acknowledgments.html
index 9396ee72a..2a017547a 100644
--- a/CotEditor/pt.lproj/Acknowledgments.html
+++ b/CotEditor/pt.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
Outras Fontes de Informações
diff --git a/CotEditor/tr.lproj/Acknowledgments.html b/CotEditor/tr.lproj/Acknowledgments.html
index 91eec528b..2e3f14cd2 100644
--- a/CotEditor/tr.lproj/Acknowledgments.html
+++ b/CotEditor/tr.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES
Diğer Bilgi Kaynakları
- - FUJIDANA tarafından sağlanan FJDDetectEncoding'e teşekkürlerimizi sunarız. <http://blogs.dion.ne.jp/fujidana/archives/4169016.html> CotEditor'un kodlama otomatik algılamasının (ISO 2022-JP, UTF-8, UTF-16) temel aldığı. FJDDetectEncoding, BSD lisansı altında yayınlandı.
+ - FUJIDANA tarafından sağlanan FJDDetectEncoding'e teşekkürlerimizi sunarız. <http://blogs.dion.ne.jp/fujidana/archives/4169016.html> CotEditor'un kodlama otomatik algılamasının (UTF-8, UTF-16) temel aldığı. FJDDetectEncoding, BSD lisansı altında yayınlandı.
- CotEditor'un Dosya Bırakma işlevinin değişkenlerinin dayandığı Daisuke Kamiyama tarafından yazılan Mi <https://www.mimikaki.net> .
diff --git a/CotEditor/zh-Hans.lproj/Acknowledgments.html b/CotEditor/zh-Hans.lproj/Acknowledgments.html
index e57b5b39a..9dee96b25 100644
--- a/CotEditor/zh-Hans.lproj/Acknowledgments.html
+++ b/CotEditor/zh-Hans.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES
diff --git a/CotEditor/zh-Hant.lproj/Acknowledgments.html b/CotEditor/zh-Hant.lproj/Acknowledgments.html
index 7d0ef5641..f47687174 100644
--- a/CotEditor/zh-Hant.lproj/Acknowledgments.html
+++ b/CotEditor/zh-Hant.lproj/Acknowledgments.html
@@ -125,7 +125,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRES
diff --git a/Tests/EncodingDetectionTests.swift b/Tests/EncodingDetectionTests.swift
index 89d25f678..6c3761427 100644
--- a/Tests/EncodingDetectionTests.swift
+++ b/Tests/EncodingDetectionTests.swift
@@ -9,7 +9,7 @@
//
// ---------------------------------------------------------------------------
//
-// © 2016-2021 1024jp
+// © 2016-2022 1024jp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -70,8 +70,14 @@ final class EncodingDetectionTests: XCTestCase {
func testISO2022() throws {
+ let data = try self.dataForFileName("ISO 2022-JP")
+ let encodings: [String.Encoding] = [.utf8, .iso2022JP, .utf16]
+ let cfEncodings = encodings
+ .map(\.rawValue)
+ .map(CFStringConvertNSStringEncodingToEncoding)
+
var encoding: String.Encoding?
- let string = try self.encodedStringForFileName("ISO 2022-JP", usedEncoding: &encoding)
+ let string = try String(data: data, suggestedCFEncodings: cfEncodings, usedEncoding: &encoding)
XCTAssertEqual(string, "dog犬")
XCTAssertEqual(encoding, .iso2022JP)