add RFC 2047 encoded word decoder for MIME filenames
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
import Foundation
|
||||
|
||||
/// Decoder for RFC 2047 "encoded words" (`=?charset?encoding?text?=`), as used
/// for non-ASCII MIME header values such as attachment filenames.
public enum RFC2047Decoder {
    /// Decodes every RFC 2047 encoded word found in `input`.
    ///
    /// Text outside encoded words is copied through unchanged. Whitespace
    /// (including the CR/LF of a folded header line) that separates two
    /// adjacent encoded words is dropped, as RFC 2047 section 6.2 requires.
    /// An encoded word that cannot be decoded (bad base64, unknown charset,
    /// bytes invalid for the charset) is left in the output verbatim.
    ///
    /// - Parameter input: A header value that may contain encoded words.
    /// - Returns: The decoded string, or `input` itself when it contains no
    ///   encoded words.
    public static func decode(_ input: String) -> String {
        // Pattern: =?charset?encoding?encoded_text?=
        let pattern = #"=\?([^?]+)\?([BbQq])\?([^?]*)\?="#
        guard let regex = try? NSRegularExpression(pattern: pattern) else {
            return input
        }

        let nsInput = input as NSString
        let matches = regex.matches(in: input, range: NSRange(location: 0, length: nsInput.length))
        guard !matches.isEmpty else { return input }

        var result = ""
        var lastEnd = 0
        var isFirstMatch = true

        for match in matches {
            let matchRange = match.range

            // Text before the first encoded word is always kept. Between two
            // encoded words a gap is kept only if it holds something other
            // than whitespace/newlines (folded headers put CRLF + space there).
            let gap = nsInput.substring(
                with: NSRange(location: lastEnd, length: matchRange.location - lastEnd)
            )
            let gapIsBlank = gap.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
            if isFirstMatch || !gapIsBlank {
                result += gap
            }
            isFirstMatch = false

            let decoded = decodeWord(
                charset: nsInput.substring(with: match.range(at: 1)),
                encoding: nsInput.substring(with: match.range(at: 2)),
                text: nsInput.substring(with: match.range(at: 3))
            )
            // Fall back to the raw encoded word rather than losing data.
            result += decoded ?? nsInput.substring(with: matchRange)
            lastEnd = matchRange.location + matchRange.length
        }

        // Append any trailing non-encoded text.
        if lastEnd < nsInput.length {
            result += nsInput.substring(from: lastEnd)
        }

        return result
    }

    /// Decodes the payload of a single encoded word.
    ///
    /// - Parameters:
    ///   - charset: The charset token, optionally followed by an RFC 2231
    ///     language suffix (`utf-8*en`), which is ignored.
    ///   - encoding: `B` (base64) or `Q` (quoted-printable variant), any case.
    ///   - text: The encoded payload.
    /// - Returns: The decoded text, or `nil` when the payload or charset
    ///   cannot be interpreted.
    private static func decodeWord(charset: String, encoding: String, text: String) -> String? {
        // RFC 2231 section 5 allows "charset*language"; only the charset
        // part is meaningful for the IANA lookup.
        let charsetName = String(charset.prefix(while: { $0 != "*" }))

        let cfEncoding = CFStringConvertIANACharSetNameToEncoding(charsetName as CFString)
        let stringEncoding = String.Encoding(
            rawValue: CFStringConvertEncodingToNSStringEncoding(cfEncoding)
        )

        let bytes: Data
        if encoding.uppercased() == "B" {
            guard let base64Bytes = Data(base64Encoded: text) else { return nil }
            bytes = base64Bytes
        } else {
            bytes = decodeQEncodedBytes(text)
        }

        // Yields nil when the bytes are not valid in the requested charset.
        return String(data: bytes, encoding: stringEncoding)
    }

    /// Decodes "Q" encoded text into raw bytes.
    ///
    /// `_` becomes a space and `=XX` (two hex digits) becomes the byte 0xXX.
    /// A `=` that is not followed by two hex digits is kept literally.
    /// Works on UTF-8 bytes so that sign characters or combining marks can
    /// never be mistaken for part of a hex escape.
    private static func decodeQEncodedBytes(_ text: String) -> Data {
        let source = Array(text.utf8)
        var output = Data()
        output.reserveCapacity(source.count)

        var index = 0
        while index < source.count {
            let byte = source[index]
            if byte == UInt8(ascii: "_") {
                output.append(UInt8(ascii: " "))
                index += 1
            } else if byte == UInt8(ascii: "="),
                      index + 2 < source.count,
                      let high = hexValue(source[index + 1]),
                      let low = hexValue(source[index + 2]) {
                output.append((high << 4) | low)
                index += 3
            } else {
                output.append(byte)
                index += 1
            }
        }
        return output
    }

    /// Returns the numeric value of an ASCII hex digit, or `nil` for any
    /// other byte.
    private static func hexValue(_ byte: UInt8) -> UInt8? {
        switch byte {
        case UInt8(ascii: "0")...UInt8(ascii: "9"):
            return byte - UInt8(ascii: "0")
        case UInt8(ascii: "A")...UInt8(ascii: "F"):
            return byte - UInt8(ascii: "A") + 10
        case UInt8(ascii: "a")...UInt8(ascii: "f"):
            return byte - UInt8(ascii: "a") + 10
        default:
            return nil
        }
    }
}
|
||||
@@ -0,0 +1,51 @@
|
||||
import Testing
|
||||
import Foundation
|
||||
@testable import MIMEParser
|
||||
|
||||
@Suite("RFC2047Decoder")
struct RFC2047DecoderTests {

    /// Input without any encoded word must come back untouched.
    @Test("plain ASCII filename passes through unchanged")
    func plainAscii() {
        #expect(RFC2047Decoder.decode("report.pdf") == "report.pdf")
    }

    /// "B" encoding with a UTF-8 charset; the payload is base64 for "Bericht.pdf".
    @Test("base64 encoded UTF-8 filename decoded correctly")
    func base64Utf8() {
        let decoded = RFC2047Decoder.decode("=?utf-8?B?QmVyaWNodC5wZGY=?=")
        #expect(decoded == "Bericht.pdf")
    }

    /// "Q" encoding with multi-byte UTF-8 escapes: ü is =C3=BC, ß is =C3=9F.
    @Test("quoted-printable encoded UTF-8 filename decoded correctly")
    func quotedPrintableUtf8() {
        let decoded = RFC2047Decoder.decode("=?utf-8?Q?Gr=C3=BC=C3=9Fe.txt?=")
        #expect(decoded == "Grüße.txt")
    }

    /// The single space separating two encoded words is not part of the result.
    @Test("multiple encoded words concatenated")
    func multipleEncodedWords() {
        let decoded = RFC2047Decoder.decode("=?utf-8?B?SGVsbG8=?= =?utf-8?B?V29ybGQ=?=")
        #expect(decoded == "HelloWorld")
    }

    /// Non-UTF-8 charset: "café" in ISO-8859-1 (é = 0xE9) is "Y2Fm6Q==" in base64.
    @Test("ISO-8859-1 encoded filename decoded correctly")
    func iso88591() {
        let decoded = RFC2047Decoder.decode("=?iso-8859-1?B?Y2Fm6Q==?=")
        #expect(decoded == "café")
    }

    /// In "Q" encoding an underscore stands for a space.
    @Test("underscores in Q-encoding replaced with spaces")
    func qEncodingUnderscores() {
        let decoded = RFC2047Decoder.decode("=?utf-8?Q?my_file_name.pdf?=")
        #expect(decoded == "my file name.pdf")
    }
}
|
||||
Reference in New Issue
Block a user