add RFC 2047 encoded word decoder for MIME filenames

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-14 12:07:17 +01:00
parent a3a99e668f
commit cd7d39de9e
2 changed files with 150 additions and 0 deletions
--- a/Packages/MagnumOpusCore/Sources/MIMEParser/RFC2047Decoder.swift
+++ b/Packages/MagnumOpusCore/Sources/MIMEParser/RFC2047Decoder.swift
@@ -0,0 +1,99 @@
+import Foundation
+
+public enum RFC2047Decoder {
+	/// Decodes RFC 2047 encoded words (`=?charset?encoding?encoded_text?=`) in `input`.
+	///
+	/// Text outside encoded words is copied through unchanged, except that whitespace
+	/// between two adjacent encoded words is dropped. A word that cannot be decoded
+	/// (unknown charset, invalid base64, bytes invalid for the charset) is left in its
+	/// raw encoded form.
+	///
+	/// - Parameter input: Text that may contain encoded words, such as a MIME filename.
+	/// - Returns: `input` with every decodable encoded word replaced by its decoded text.
+	public static func decode(_ input: String) -> String {
+		let pattern = #"=\?([^?]+)\?([BbQq])\?([^?]*)\?="#
+		guard let regex = try? NSRegularExpression(pattern: pattern) else { return input }
+
+		let nsInput = input as NSString
+		let matches = regex.matches(in: input, range: NSRange(location: 0, length: nsInput.length))
+		guard !matches.isEmpty else { return input }
+
+		var result = ""
+		var lastEnd = 0
+		for match in matches {
+			let matchRange = match.range
+			let gap = nsInput.substring(with: NSRange(location: lastEnd, length: matchRange.location - lastEnd))
+			// Text before the first word is kept verbatim. Between words, a gap made only of
+			// folding whitespace (which can include CR/LF in a folded header) is dropped.
+			if lastEnd == 0 || !gap.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+				result += gap
+			}
+			lastEnd = matchRange.location + matchRange.length
+
+			let decoded = decodeWord(
+				charset: nsInput.substring(with: match.range(at: 1)),
+				encoding: nsInput.substring(with: match.range(at: 2)),
+				text: nsInput.substring(with: match.range(at: 3))
+			)
+			result += decoded ?? nsInput.substring(with: matchRange)
+		}
+
+		// Append any trailing non-encoded text.
+		if lastEnd < nsInput.length {
+			result += nsInput.substring(from: lastEnd)
+		}
+		return result
+	}
+
+	/// Decodes the payload of a single encoded word, or returns `nil` when the charset is
+	/// unknown, the base64 is malformed, or the bytes are not valid in the charset.
+	private static func decodeWord(charset: String, encoding: String, text: String) -> String? {
+		// RFC 2231 permits a language tag after the charset ("utf-8*en"); drop it for lookup.
+		let charsetName = String(charset.prefix { $0 != "*" })
+		let cfEncoding = CFStringConvertIANACharSetNameToEncoding(charsetName as CFString)
+		guard cfEncoding != kCFStringEncodingInvalidId else { return nil }
+		let stringEncoding = String.Encoding(rawValue: CFStringConvertEncodingToNSStringEncoding(cfEncoding))
+
+		let bytes: Data
+		if encoding.uppercased() == "B" {
+			// Some mailers omit the trailing "=" padding that Data(base64Encoded:) requires.
+			let padding = String(repeating: "=", count: (4 - text.utf8.count % 4) % 4)
+			guard let data = Data(base64Encoded: text + padding) else { return nil }
+			bytes = data
+		} else {
+			bytes = decodeQEncodedBytes(text)
+		}
+		return String(data: bytes, encoding: stringEncoding)
+	}
+
+	/// Decodes Q-encoded text to raw bytes: `_` becomes a space and `=XX` becomes the byte
+	/// with hex value `XX`. A `=` that is not followed by two hex digits is kept literally.
+	private static func decodeQEncodedBytes(_ input: String) -> Data {
+		// Work on UTF-8 bytes rather than Characters so escapes are matched byte-for-byte.
+		let bytes = Array(input.utf8)
+		var data = Data(capacity: bytes.count)
+		var i = 0
+		while i < bytes.count {
+			let byte = bytes[i]
+			if byte == UInt8(ascii: "="), i + 2 < bytes.count,
+				let high = hexValue(bytes[i + 1]), let low = hexValue(bytes[i + 2]) {
+				data.append((high << 4) | low)
+				i += 3
+			} else {
+				data.append(byte == UInt8(ascii: "_") ? UInt8(ascii: " ") : byte)
+				i += 1
+			}
+		}
+		return data
+	}
+
+	/// Returns the numeric value of an ASCII hex digit, or `nil` for any other byte.
+	private static func hexValue(_ byte: UInt8) -> UInt8? {
+		switch byte {
+		case UInt8(ascii: "0")...UInt8(ascii: "9"): return byte - UInt8(ascii: "0")
+		case UInt8(ascii: "A")...UInt8(ascii: "F"): return byte - UInt8(ascii: "A") + 10
+		case UInt8(ascii: "a")...UInt8(ascii: "f"): return byte - UInt8(ascii: "a") + 10
+		default: return nil
+		}
+	}
+}
--- a/Packages/MagnumOpusCore/Tests/MIMEParserTests/RFC2047DecoderTests.swift
+++ b/Packages/MagnumOpusCore/Tests/MIMEParserTests/RFC2047DecoderTests.swift
@@ -0,0 +1,51 @@
+import Testing
+import Foundation
+@testable import MIMEParser
+
+@Suite("RFC2047Decoder")
+struct RFC2047DecoderTests {
+
+	@Test("plain ASCII filename passes through unchanged")
+	func plainAscii() {
+		// No encoded word is present, so the input must come back untouched.
+		#expect(RFC2047Decoder.decode("report.pdf") == "report.pdf")
+	}
+
+	@Test("base64 encoded UTF-8 filename decoded correctly")
+	func base64Utf8() {
+		let expected = "Bericht.pdf"
+		// The payload is the base64 form of the expected name.
+		let decoded = RFC2047Decoder.decode("=?utf-8?B?QmVyaWNodC5wZGY=?=")
+		#expect(decoded == expected)
+	}
+
+	@Test("quoted-printable encoded UTF-8 filename decoded correctly")
+	func quotedPrintableUtf8() {
+		let expected = "Grüße.txt"
+		// In UTF-8, "ü" is the escape pair =C3=BC and "ß" is =C3=9F.
+		let decoded = RFC2047Decoder.decode("=?utf-8?Q?Gr=C3=BC=C3=9Fe.txt?=")
+		#expect(decoded == expected)
+	}
+
+	@Test("multiple encoded words concatenated")
+	func multipleEncodedWords() {
+		// The space separating the two encoded words is not part of the output.
+		let decoded = RFC2047Decoder.decode("=?utf-8?B?SGVsbG8=?= =?utf-8?B?V29ybGQ=?=")
+		#expect(decoded == "HelloWorld")
+	}
+
+	@Test("ISO-8859-1 encoded filename decoded correctly")
+	func iso88591() {
+		let expected = "café"
+		// "Y2Fm6Q==" is base64 for the ISO-8859-1 bytes of "café" ("é" is the single byte 0xE9).
+		let decoded = RFC2047Decoder.decode("=?iso-8859-1?B?Y2Fm6Q==?=")
+		#expect(decoded == expected)
+	}
+
+	@Test("underscores in Q-encoding replaced with spaces")
+	func qEncodingUnderscores() {
+		// Each "_" in a Q-encoded payload stands for a space.
+		let decoded = RFC2047Decoder.decode("=?utf-8?Q?my_file_name.pdf?=")
+		#expect(decoded == "my file name.pdf")
+	}
+}