add RFC 2047 encoded word decoder for MIME filenames

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-14 12:07:17 +01:00
parent a3a99e668f
commit cd7d39de9e
2 changed files with 150 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
import Foundation
/// Decodes RFC 2047 "encoded words" (`=?charset?encoding?encoded_text?=`) as they
/// appear in MIME header values such as attachment filenames.
public enum RFC2047Decoder {
    /// Decodes every RFC 2047 encoded word found in `input`.
    ///
    /// Text outside encoded words is copied through unchanged. Whitespace (including
    /// the CRLF + space of a folded header) that separates two adjacent encoded words
    /// is dropped, so consecutive words concatenate directly. An encoded word that
    /// cannot be decoded (unknown charset, malformed base64, bytes that are invalid in
    /// the declared charset) is left in the output exactly as it was written.
    ///
    /// - Parameter input: A header value that may contain encoded words.
    /// - Returns: `input` with all decodable encoded words replaced by their text.
    public static func decode(_ input: String) -> String {
        // Groups: 1 = charset (optionally "charset*language"), 2 = B/Q, 3 = encoded text.
        let pattern = #"=\?([^?]+)\?([BbQq])\?([^?]*)\?="#
        guard let regex = try? NSRegularExpression(pattern: pattern) else {
            return input
        }
        // NSRegularExpression reports UTF-16 ranges, so all slicing goes through NSString.
        let nsInput = input as NSString
        let matches = regex.matches(in: input, range: NSRange(location: 0, length: nsInput.length))
        guard !matches.isEmpty else { return input }

        var result = ""
        var lastEnd = 0
        for (index, match) in matches.enumerated() {
            let matchRange = match.range

            // Text between the previous encoded word (or the start) and this one.
            let gap = nsInput.substring(with: NSRange(location: lastEnd, length: matchRange.location - lastEnd))
            let gapIsBlank = gap.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
            // Leading text before the first word is always kept; between two encoded
            // words a whitespace-only gap is a separator and is not part of the text.
            if index == 0 || !gapIsBlank {
                result += gap
            }

            let decoded = decodeWord(
                charsetLabel: nsInput.substring(with: match.range(at: 1)),
                encoding: nsInput.substring(with: match.range(at: 2)),
                encodedText: nsInput.substring(with: match.range(at: 3))
            )
            // Fall back to the raw encoded word when it cannot be decoded.
            result += decoded ?? nsInput.substring(with: matchRange)
            lastEnd = matchRange.location + matchRange.length
        }

        // Append any trailing non-encoded text.
        if lastEnd < nsInput.length {
            result += nsInput.substring(from: lastEnd)
        }
        return result
    }

    /// Decodes the three components of a single encoded word.
    ///
    /// - Returns: The decoded text, or `nil` if the charset is unknown, the base64
    ///   payload is malformed, or the bytes are not valid in the declared charset.
    private static func decodeWord(charsetLabel: String, encoding: String, encodedText: String) -> String? {
        guard let textEncoding = stringEncoding(forCharsetLabel: charsetLabel) else { return nil }

        let payload: Data
        if encoding.uppercased() == "B" {
            guard let bytes = decodeBase64(encodedText) else { return nil }
            payload = bytes
        } else {
            // Q encoding: like quoted-printable, but "_" stands for a space. The
            // substitution happens before hex decoding so that "=5F" still yields "_".
            let withSpaces = encodedText.replacingOccurrences(of: "_", with: " ")
            payload = decodeQuotedPrintableBytes(withSpaces)
        }
        return String(data: payload, encoding: textEncoding)
    }

    /// Maps the charset field of an encoded word to a `String.Encoding`.
    ///
    /// An RFC 2231 language suffix (`utf-8*en`) is ignored. Returns `nil` for charset
    /// names CoreFoundation does not recognize, rather than converting the invalid
    /// encoding identifier.
    private static func stringEncoding(forCharsetLabel label: String) -> String.Encoding? {
        let charsetName = String(label.prefix { $0 != "*" })
        let cfEncoding = CFStringConvertIANACharSetNameToEncoding(charsetName as CFString)
        guard cfEncoding != kCFStringEncodingInvalidId else { return nil }
        return String.Encoding(rawValue: CFStringConvertEncodingToNSStringEncoding(cfEncoding))
    }

    /// Decodes a base64 payload, tolerating missing trailing `=` padding.
    private static func decodeBase64(_ text: String) -> Data? {
        let remainder = text.utf8.count % 4
        let padded = remainder == 0 ? text : text + String(repeating: "=", count: 4 - remainder)
        return Data(base64Encoded: padded)
    }

    /// Decodes `=XX` hex escapes in `input` into raw bytes.
    ///
    /// Works on the UTF-8 bytes of `input`. An `=` that is not followed by exactly two
    /// ASCII hex digits is emitted literally, and decoding continues with the next byte.
    private static func decodeQuotedPrintableBytes(_ input: String) -> Data {
        let bytes = Array(input.utf8)
        let escape = UInt8(ascii: "=")
        var data = Data()
        data.reserveCapacity(bytes.count)

        var i = 0
        while i < bytes.count {
            if bytes[i] == escape,
               i + 2 < bytes.count,
               let high = hexValue(bytes[i + 1]),
               let low = hexValue(bytes[i + 2]) {
                data.append(high << 4 | low)
                i += 3
            } else {
                data.append(bytes[i])
                i += 1
            }
        }
        return data
    }

    /// Returns the numeric value of an ASCII hex digit, or `nil` for any other byte.
    private static func hexValue(_ byte: UInt8) -> UInt8? {
        switch byte {
        case UInt8(ascii: "0")...UInt8(ascii: "9"):
            return byte - UInt8(ascii: "0")
        case UInt8(ascii: "A")...UInt8(ascii: "F"):
            return byte - UInt8(ascii: "A") + 10
        case UInt8(ascii: "a")...UInt8(ascii: "f"):
            return byte - UInt8(ascii: "a") + 10
        default:
            return nil
        }
    }
}

View File

@@ -0,0 +1,51 @@
import Testing
import Foundation
@testable import MIMEParser
@Suite("RFC2047Decoder")
struct RFC2047DecoderTests {
    /// Input without any encoded word must come back untouched.
    @Test("plain ASCII filename passes through unchanged")
    func plainAscii() {
        #expect(RFC2047Decoder.decode("report.pdf") == "report.pdf")
    }

    /// B encoding with a UTF-8 charset; the payload is base64 for "Bericht.pdf".
    @Test("base64 encoded UTF-8 filename decoded correctly")
    func base64Utf8() {
        let decoded = RFC2047Decoder.decode("=?utf-8?B?QmVyaWNodC5wZGY=?=")
        #expect(decoded == "Bericht.pdf")
    }

    /// Q encoding with multi-byte UTF-8 escapes: "ü" is =C3=BC and "ß" is =C3=9F.
    @Test("quoted-printable encoded UTF-8 filename decoded correctly")
    func quotedPrintableUtf8() {
        let decoded = RFC2047Decoder.decode("=?utf-8?Q?Gr=C3=BC=C3=9Fe.txt?=")
        #expect(decoded == "Grüße.txt")
    }

    /// The single space separating two encoded words must not appear in the output.
    @Test("multiple encoded words concatenated")
    func multipleEncodedWords() {
        let decoded = RFC2047Decoder.decode("=?utf-8?B?SGVsbG8=?= =?utf-8?B?V29ybGQ=?=")
        #expect(decoded == "HelloWorld")
    }

    /// Non-UTF-8 charset: in ISO-8859-1 "é" is the single byte 0xE9, so "café"
    /// encodes to the base64 text "Y2Fm6Q==".
    @Test("ISO-8859-1 encoded filename decoded correctly")
    func iso88591() {
        let decoded = RFC2047Decoder.decode("=?iso-8859-1?B?Y2Fm6Q==?=")
        #expect(decoded == "café")
    }

    /// In Q encoding an underscore stands for a space.
    @Test("underscores in Q-encoding replaced with spaces")
    func qEncodingUnderscores() {
        let decoded = RFC2047Decoder.decode("=?utf-8?Q?my_file_name.pdf?=")
        #expect(decoded == "my file name.pdf")
    }
}