Deprecate feature to distinguish Shift JIS and Shift JIS X0213

This commit is contained in:
1024jp 2024-01-19 10:39:42 +09:00
parent b71afa3f18
commit e5948f587d
9 changed files with 32 additions and 70 deletions

View File

@ -8,6 +8,7 @@
- When the file encoding is incompatible with the document content, display a notification instead of silently overwriting the document file with lossy data in autosaving until the user explicitly allows it.
- Suppress displaying the dialog for saving in an incompatible encoding when the user once allowed it.
- Update the C syntax to improve highlighting numbers.
- Deprecate the feature to interpret `Shift_JIS` as Shift JIS or Shift JIS X0213 according to the user's priority setting when referring to encoding declarations in documents during encoding detection.
- Improve stability.
- [trivial] Improve alert messages for incompatible text encodings.

View File

@ -21,8 +21,6 @@
<li>To make the detection of declarations work, you have to put an equal sign (“=”, no other characters) right after “<code>charset</code>” or “<code>encoding</code>.”</li>
<li>The encoding name that comes after “<code>charset=</code>” or “<code>encoding=</code>” does not need to be enclosed in double quotes (&quot;).</li>
<li>In the case there are multiple encoding declarations in your document, CotEditor uses the first one to determine the text encoding.</li>
<li>When the encoding is declared as Shift JIS (written like <code>charset=&quot;Shift_JIS&quot;</code>), CotEditor chooses the prioritized one (the upper one in the encoding list) between “Japanese (Shift JIS)” and “Japanese (Shift JIS X0213).”<br />
<a href="howto_customize_encoding_order.html">Change priorities of text encodings</a></li>
</ul>

View File

@ -30,6 +30,7 @@
<li>When the file encoding is incompatible with the document content, display a notification instead of silently overwriting the document file with lossy data in autosaving until the user explicitly allows it.</li>
<li>Suppress displaying the dialog for saving in an incompatible encoding when the user once allowed it.</li>
<li>Update the C syntax to improve highlighting numbers.</li>
<li>Deprecate the feature to interpret <code>Shift_JIS</code> as Shift JIS or Shift JIS X0213 according to the user's priority setting when referring to encoding declarations in documents during encoding detection.</li>
<li>Improve stability.</li>
<li><span class="trivial">trivial</span>: Improve alert messages for incompatible text encodings.</li>

View File

@ -21,8 +21,6 @@
<li><code>charset</code>, <code>encoding</code>の直後にスペースなど<code>=</code>以外の文字があってはいけません。</li>
<li><code>charset=</code>, <code>encoding=</code>の後ろは、<code>&quot;</code>で囲っていなくても認識します。</li>
<li>複数の記述があるときは、一番最初のものを読み取ります。</li>
<li><code>charset=&quot;Shift_JIS&quot;</code>の場合は、「日本語 (Shift JIS)」と「日本語 (Shift JIS X0213)」のうち、優先されている方(メニューで上にある方)として認識されます。<br />
<a href="howto_customize_encoding_order.html">テキストエンコーディングの優先順位を変更する</a></li>
</ul>

View File

@ -30,6 +30,7 @@
<li>エンコーディングが書類内容と互換性のない場合、ユーザが明示的に許可をするまでオートセーブはせずにダイアログでその旨を表示</li>
<li>書類内容と互換性のないエンコーディングでの保存をユーザが一度許可したのちは、保存毎にダイアログを表示しないように</li>
<li>Cシンタックスを更新し数値のハイライトを改良</li>
<li>エンコーディングの自動認識で書類内のエンコーディング宣言を参照するときに、<code>Shift_JIS</code>をユーザの優先順位設定によってShift JISとShift JIS X0213に解釈し分ける機能を廃止</li>
<li>安定性を向上</li>
<li><span class="trivial">trivial</span>: 書類内容と互換性のないエンコーディングに関するアラートメッセージを改良</li>

View File

@ -320,13 +320,13 @@ final class Document: NSDocument, AdditionalDocumentPreparing, EncodingChanging
return .specific(encoding)
}
var encodingList = UserDefaults.standard[.encodingList]
var encodingPriority = EncodingManager.shared.encodings.compactMap { $0 }
let isInitialOpen = (self.fileData == nil) && (self.textStorage.length == 0)
if !isInitialOpen { // prioritize the current encoding
encodingList.insert(self.fileEncoding.encoding.cfEncoding, at: 0)
encodingPriority.insert(self.fileEncoding.encoding, at: 0)
}
return .automatic(priority: encodingList, refersToTag: UserDefaults.standard[.referToEncodingTag])
return .automatic(priority: encodingPriority, refersToTag: UserDefaults.standard[.referToEncodingTag])
}()
// .readingEncoding is only valid once

View File

@ -8,7 +8,7 @@
//
// ---------------------------------------------------------------------------
//
// © 2018-2023 1024jp
// © 2018-2024 1024jp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -44,7 +44,7 @@ struct DocumentFile {
enum EncodingStrategy {
case automatic(priority: [CFStringEncoding], refersToTag: Bool)
case automatic(priority: [String.Encoding], refersToTag: Bool)
case specific(String.Encoding)
}
@ -92,7 +92,7 @@ struct DocumentFile {
switch encodingStrategy {
case .automatic(let priority, let refersToTag):
(content, encoding) = try Self.string(data: data, xattrEncoding: self.xattrEncoding,
suggestedCFEncodings: priority,
suggestedEncodings: priority,
refersToEncodingTag: refersToTag)
case .specific(let readingEncoding):
encoding = readingEncoding
@ -124,10 +124,10 @@ struct DocumentFile {
/// - Parameters:
/// - data: The data to encode.
/// - xattrEncoding: The text encoding read from the file's extended attributes.
/// - suggestedCFEncodings: The list of CSStringEncodings to test the encoding.
/// - suggestedEncodings: The list of encodings to test the encoding.
/// - refersToEncodingTag: The boolean whether to refer encoding tag in the file content.
/// - Returns: The decoded string and used encoding.
private static func string(data: Data, xattrEncoding: String.Encoding?, suggestedCFEncodings: [CFStringEncoding], refersToEncodingTag: Bool) throws -> (String, String.Encoding) {
private static func string(data: Data, xattrEncoding: String.Encoding?, suggestedEncodings: [String.Encoding], refersToEncodingTag: Bool) throws -> (String, String.Encoding) {
// try interpreting with xattr encoding
if let xattrEncoding {
@ -139,11 +139,12 @@ struct DocumentFile {
// detect encoding from data
var usedEncoding: String.Encoding?
let string = try String(data: data, suggestedCFEncodings: suggestedCFEncodings, usedEncoding: &usedEncoding)
let string = try String(data: data, suggestedEncodings: suggestedEncodings, usedEncoding: &usedEncoding)
// try reading encoding declaration and take priority of it if it seems well
if refersToEncodingTag,
let scannedEncoding = string.scanEncodingDeclaration(upTo: self.maxEncodingScanLength, suggestedCFEncodings: suggestedCFEncodings),
let scannedEncoding = string.scanEncodingDeclaration(upTo: self.maxEncodingScanLength),
suggestedEncodings.contains(scannedEncoding),
scannedEncoding != usedEncoding,
let string = String(bomCapableData: data, encoding: scannedEncoding)
{

View File

@ -65,14 +65,6 @@ extension Unicode {
// MARK: -
private extension CFStringEncoding {
static let shiftJIS = CFStringEncoding(CFStringEncodings.shiftJIS.rawValue)
static let shiftJIS_X0213 = CFStringEncoding(CFStringEncodings.shiftJIS_X0213.rawValue)
}
extension String.Encoding {
init(cfEncodings: CFStringEncodings) {
@ -141,10 +133,10 @@ extension String {
///
/// - Parameters:
/// - data: The data object containing the string data.
/// - suggestedCFEncodings: The prioritized list of encoding candidates.
/// - suggestedEncodings: The prioritized list of encoding candidates.
/// - usedEncoding: The encoding used to interpret the data.
/// - Throws: `CocoaError(.fileReadUnknownStringEncoding)`
init(data: Data, suggestedCFEncodings: [CFStringEncoding], usedEncoding: inout String.Encoding?) throws {
init(data: Data, suggestedEncodings: [String.Encoding], usedEncoding: inout String.Encoding?) throws {
// detect encoding from so-called "magic numbers"
for bom in Unicode.BOM.allCases {
@ -159,11 +151,8 @@ extension String {
}
// try encodings in order from the top of the encoding list
for cfEncoding in suggestedCFEncodings {
let encoding = String.Encoding(cfEncoding: cfEncoding)
guard
let string = String(data: data, encoding: encoding)
else { continue }
for encoding in suggestedEncodings {
guard let string = String(data: data, encoding: encoding) else { continue }
usedEncoding = encoding
self = string
@ -181,32 +170,19 @@ extension String {
///
/// - Parameters:
/// - maxLength: The number of forward characters to be scanned.
/// - suggestedCFEncodings: The priority of encodings to determine the user-preferred Shift JIS encoding.
/// - Returns: A string encoding, or `nil` if not found.
func scanEncodingDeclaration(upTo maxLength: Int, suggestedCFEncodings: [CFStringEncoding]) -> String.Encoding? {
func scanEncodingDeclaration(upTo maxLength: Int) -> String.Encoding? {
assert(maxLength > 0)
guard !self.isEmpty else { return nil }
let regex = /\b(charset=|encoding=|@charset|encoding:|coding:) *["']? *(?<encoding>[-_a-zA-Z0-9]+)/.wordBoundaryKind(.simple)
let regex = /\b(charset=|encoding=|@charset|encoding:|coding:) *["']? *(?<encoding>[-_a-zA-Z0-9]+)/
.wordBoundaryKind(.simple)
guard let ianaCharSetName = try? regex.firstMatch(in: self.prefix(maxLength))?.encoding else { return nil }
// convert IANA CharSet name to CFStringEncoding
let cfEncoding: CFStringEncoding = if ianaCharSetName.uppercased() == "SHIFT_JIS",
let cfEncoding = suggestedCFEncodings.first(where: { $0 == .shiftJIS || $0 == .shiftJIS_X0213 })
{
// pick user's preferred one for "Shift_JIS"
// -> CFStringConvertIANACharSetNameToEncoding() converts "SHIFT_JIS" to .shiftJIS regardless of the letter case.
// Although this behavior is theoretically correct since the IANA charset name is case insensitive,
// we treat them with care by respecting the user's priority.
// FYI: CFStringConvertEncodingToIANACharSetName() converts .shiftJIS and .shiftJIS_X0213
// to "shift_jis" and "Shift_JIS" respectively.
cfEncoding
} else {
CFStringConvertIANACharSetNameToEncoding(ianaCharSetName as CFString)
}
let cfEncoding = CFStringConvertIANACharSetNameToEncoding(ianaCharSetName as CFString)
guard cfEncoding != kCFStringEncodingInvalidId else { return nil }

View File

@ -75,12 +75,9 @@ final class EncodingDetectionTests: XCTestCase {
let data = try self.dataForFileName("ISO 2022-JP")
let encodings: [String.Encoding] = [.iso2022JP, .utf16]
let cfEncodings = encodings
.map(\.rawValue)
.map(CFStringConvertNSStringEncodingToEncoding)
var encoding: String.Encoding?
let string = try String(data: data, suggestedCFEncodings: cfEncodings, usedEncoding: &encoding)
let string = try String(data: data, suggestedEncodings: encodings, usedEncoding: &encoding)
XCTAssertEqual(string, "dog犬")
XCTAssertEqual(encoding, .iso2022JP)
@ -92,7 +89,7 @@ final class EncodingDetectionTests: XCTestCase {
let data = try self.dataForFileName("UTF-8")
var encoding: String.Encoding?
XCTAssertThrowsError(try String(data: data, suggestedCFEncodings: [], usedEncoding: &encoding)) { error in
XCTAssertThrowsError(try String(data: data, suggestedEncodings: [], usedEncoding: &encoding)) { error in
XCTAssertEqual(error as? CocoaError, CocoaError(.fileReadUnknownStringEncoding))
}
XCTAssertNil(encoding)
@ -104,9 +101,8 @@ final class EncodingDetectionTests: XCTestCase {
let data = try self.dataForFileName("UTF-8")
var encoding: String.Encoding?
let invalidInt = kCFStringEncodingInvalidId
let utf8Int = CFStringBuiltInEncodings.UTF8.rawValue
let string = try String(data: data, suggestedCFEncodings: [invalidInt, utf8Int], usedEncoding: &encoding)
let invalidEncoding = String.Encoding(cfEncoding: kCFStringEncodingInvalidId)
let string = try String(data: data, suggestedEncodings: [invalidEncoding, .utf8], usedEncoding: &encoding)
XCTAssertEqual(string, "0")
XCTAssertEqual(encoding, .utf8)
@ -120,7 +116,7 @@ final class EncodingDetectionTests: XCTestCase {
var encoding: String.Encoding?
var string: String?
XCTAssertThrowsError(string = try String(data: data, suggestedCFEncodings: [], usedEncoding: &encoding)) { error in
XCTAssertThrowsError(string = try String(data: data, suggestedEncodings: [], usedEncoding: &encoding)) { error in
XCTAssertEqual(error as? CocoaError, CocoaError(.fileReadUnknownStringEncoding))
}
@ -143,23 +139,13 @@ final class EncodingDetectionTests: XCTestCase {
func testEncodingDeclarationScan() {
let string = "<meta charset=\"Shift_JIS\"/>"
let utf8 = CFStringBuiltInEncodings.UTF8.rawValue
let shiftJIS = CFStringEncoding(CFStringEncodings.shiftJIS.rawValue)
let shiftJISX0213 = CFStringEncoding(CFStringEncodings.shiftJIS_X0213.rawValue)
XCTAssertNil(string.scanEncodingDeclaration(upTo: 16))
XCTAssertEqual(string.scanEncodingDeclaration(upTo: 128), String.Encoding(cfEncodings: .shiftJIS))
XCTAssertNil(string.scanEncodingDeclaration(upTo: 16, suggestedCFEncodings: [utf8, shiftJIS, shiftJISX0213]))
XCTAssertEqual(string.scanEncodingDeclaration(upTo: 128, suggestedCFEncodings: [utf8, shiftJIS, shiftJISX0213]),
String.Encoding(cfEncodings: CFStringEncodings.shiftJIS))
XCTAssertEqual(string.scanEncodingDeclaration(upTo: 128, suggestedCFEncodings: [utf8, shiftJISX0213, shiftJIS]),
String.Encoding(cfEncodings: CFStringEncodings.shiftJIS_X0213))
XCTAssertEqual("<meta charset=\"utf-8\"/>".scanEncodingDeclaration(upTo: 128, suggestedCFEncodings: [utf8, shiftJISX0213, shiftJIS]),
.utf8)
XCTAssertEqual("<meta charset=\"utf-8\"/>".scanEncodingDeclaration(upTo: 128), .utf8)
// Swift.Regex with non-simple word boundaries never returns when the given string contains a specific pattern of letters (2023-12 on Swift 5.9).
XCTAssertNil("タマゴ,1,".scanEncodingDeclaration(upTo: 128, suggestedCFEncodings: []))
XCTAssertNil("タマゴ,1,".scanEncodingDeclaration(upTo: 128))
XCTAssertNil(try /\ba/.wordBoundaryKind(.simple).firstMatch(in: "タマゴ,1,"))
}
@ -305,7 +291,7 @@ private extension EncodingDetectionTests {
let data = try self.dataForFileName(fileName)
return try String(data: data, suggestedCFEncodings: [], usedEncoding: &usedEncoding)
return try String(data: data, suggestedEncodings: [], usedEncoding: &usedEncoding)
}