diff --git a/VorleserKit/Sources/BookParser/EPUBParser.swift b/VorleserKit/Sources/BookParser/EPUBParser.swift
new file mode 100644
index 0000000..9f0f6b2
--- /dev/null
+++ b/VorleserKit/Sources/BookParser/EPUBParser.swift
@@ -0,0 +1,289 @@
+import Foundation
+import ZIPFoundation
+import SwiftSoup
+
+/// Errors thrown by `EPUBParser.parse(url:)`.
+public enum EPUBParserError: Error, CustomStringConvertible {
+    /// The file at the given URL could not be opened as a ZIP archive.
+    case cannotOpenArchive(URL)
+    /// `META-INF/container.xml` is absent or names no package document.
+    case missingContainerXML
+    /// The archive has no entry at the given path.
+    case missingOPF(String)
+    /// The package document could not be parsed as XML.
+    case malformedOPF
+    /// The package document's spine lists no items.
+    case noSpineItems
+
+    public var description: String {
+        switch self {
+        case .cannotOpenArchive(let url): "cannot open EPUB archive at \(url.path)"
+        case .missingContainerXML: "missing META-INF/container.xml"
+        case .missingOPF(let path): "missing OPF file at \(path)"
+        case .malformedOPF: "malformed OPF (package document)"
+        case .noSpineItems: "EPUB has no spine items"
+        }
+    }
+}
+
+/// Reads an EPUB archive into a `Book` of plain-text chapters.
+public struct EPUBParser {
+    private static let containerPath = "META-INF/container.xml"
+
+    /// Parses the EPUB at `url` into a book with one chapter per spine item the manifest resolves.
+    ///
+    /// A spine item that cannot be extracted or parsed becomes an empty chapter
+    /// titled "Chapter N (parse error)" instead of failing the whole book.
+    ///
+    /// - Parameter url: Location of the EPUB file.
+    /// - Returns: The parsed book; its title falls back to the file name without extension.
+    /// - Throws: `EPUBParserError` when the archive, container, package document, or
+    ///   spine is unusable. Errors raised while extracting the package document propagate unchanged.
+    public static func parse(url: URL) throws -> Book {
+        let archive: Archive
+        do {
+            archive = try Archive(url: url, accessMode: .read)
+        } catch {
+            throw EPUBParserError.cannotOpenArchive(url)
+        }
+
+        let opfPath = try findOPFPath(in: archive)
+        let opfDir = (opfPath as NSString).deletingLastPathComponent
+        let opfData = try extractData(from: archive, path: opfPath)
+        let (title, author, manifest, spine) = try parseOPF(data: opfData)
+
+        guard !spine.isEmpty else { throw EPUBParserError.noSpineItems }
+
+        var chapters: [Chapter] = []
+        for itemRef in spine {
+            guard let href = manifest[itemRef] else { continue }
+            // Number by output position rather than spine position, so a skipped
+            // spine item leaves no gap in the chapter indices.
+            let index = chapters.count
+            chapters.append(makeChapter(index: index, href: href, opfDir: opfDir, archive: archive))
+        }
+
+        return Book(
+            title: title ?? url.deletingPathExtension().lastPathComponent,
+            author: author,
+            chapters: chapters
+        )
+    }
+
+    /// Loads one spine item as a chapter, degrading to an empty "(parse error)" chapter on failure.
+    private static func makeChapter(index: Int, href: String, opfDir: String, archive: Archive) -> Chapter {
+        let fallbackTitle = "Chapter \(index + 1)"
+        do {
+            let path = entryPath(forHref: href, relativeTo: opfDir, in: archive)
+            let data = try extractData(from: archive, path: path)
+            let doc = try SwiftSoup.parse(decodeText(data))
+            let title = try chapterTitle(in: doc) ?? fallbackTitle
+            let body = try doc.body()?.text() ?? ""
+            return Chapter(index: index, title: title, text: normalizeWhitespace(body))
+        } catch {
+            return Chapter(index: index, title: "\(fallbackTitle) (parse error)", text: "")
+        }
+    }
+
+    /// Returns the first non-empty `h1` text, else `h2`, else `h3`, else `title`, else `nil`.
+    ///
+    /// Each selector is queried separately: a grouped selector yields matches in
+    /// document order, which lets `<title>` in the head win over every body heading.
+    private static func chapterTitle(in doc: Document) throws -> String? {
+        for selector in ["h1", "h2", "h3", "title"] {
+            for element in try doc.select(selector).array() {
+                let text = try element.text().trimmingCharacters(in: .whitespacesAndNewlines)
+                if !text.isEmpty { return text }
+            }
+        }
+        return nil
+    }
+
+    /// Maps a manifest `href` to the name of the ZIP entry it refers to.
+    ///
+    /// Manifest hrefs are relative URL references while ZIP entry names are plain
+    /// paths, so the fragment is dropped, percent-escapes are decoded, and `.` / `..`
+    /// segments are resolved against the package document's directory.
+    private static func entryPath(forHref href: String, relativeTo opfDir: String, in archive: Archive) -> String {
+        let literal = opfDir.isEmpty ? href : "\(opfDir)/\(href)"
+        // An entry stored under the href exactly as written always wins.
+        if archive[literal] != nil { return literal }
+
+        let resource = String(href.prefix(while: { $0 != "#" }))
+        let decoded = resource.removingPercentEncoding ?? resource
+        let joined = opfDir.isEmpty ? decoded : "\(opfDir)/\(decoded)"
+
+        var segments: [Substring] = []
+        for segment in joined.split(separator: "/") {
+            switch segment {
+            case ".": break
+            case "..": _ = segments.popLast()
+            default: segments.append(segment)
+            }
+        }
+        return segments.joined(separator: "/")
+    }
+
+    /// Decodes markup as UTF-16 when it starts with a UTF-16 byte-order mark, otherwise as UTF-8.
+    ///
+    /// UTF-8 decoding repairs invalid sequences with U+FFFD, so a stray bad byte
+    /// no longer discards the whole chapter.
+    private static func decodeText(_ data: Data) -> String {
+        let hasUTF16BOM = data.starts(with: [0xFE, 0xFF]) || data.starts(with: [0xFF, 0xFE])
+        if hasUTF16BOM, let text = String(data: data, encoding: .utf16) {
+            return text
+        }
+        return String(decoding: data, as: UTF8.self)
+    }
+
+    /// Returns the package document path declared in `META-INF/container.xml`.
+    ///
+    /// - Throws: `EPUBParserError.missingContainerXML` when the container file is
+    ///   absent or declares no rootfile.
+    private static func findOPFPath(in archive: Archive) throws -> String {
+        // Check presence here: `extractData` reports any missing entry as `.missingOPF`.
+        guard archive[containerPath] != nil else {
+            throw EPUBParserError.missingContainerXML
+        }
+        let containerData = try extractData(from: archive, path: containerPath)
+        guard let opfPath = ContainerXMLParser(data: containerData).parse() else {
+            throw EPUBParserError.missingContainerXML
+        }
+        return opfPath
+    }
+
+    /// Parses the package document into title, author, manifest (id to href), and spine (idrefs).
+    ///
+    /// - Throws: `EPUBParserError.malformedOPF` when the XML parser reports failure.
+    private static func parseOPF(data: Data) throws -> (title: String?, author: String?, manifest: [String: String], spine: [String]) {
+        guard let result = OPFParser(data: data).parse() else {
+            throw EPUBParserError.malformedOPF
+        }
+        return result
+    }
+
+    /// Reads the full contents of the archive entry at `path`.
+    ///
+    /// - Throws: `EPUBParserError.missingOPF` when the archive has no such entry.
+    private static func extractData(from archive: Archive, path: String) throws -> Data {
+        guard let entry = archive[path] else {
+            throw EPUBParserError.missingOPF(path)
+        }
+        var data = Data()
+        _ = try archive.extract(entry) { chunk in
+            data.append(chunk)
+        }
+        return data
+    }
+
+    /// Collapses each run of whitespace and newlines into one space and drops leading and trailing runs.
+    private static func normalizeWhitespace(_ text: String) -> String {
+        text.components(separatedBy: .whitespacesAndNewlines)
+            .filter { !$0.isEmpty }
+            .joined(separator: " ")
+    }
+}
+
+// MARK: - XML Parsers
+
+/// Extracts the package document path from `META-INF/container.xml`.
+private final class ContainerXMLParser: NSObject, XMLParserDelegate {
+    private let data: Data
+    private var opfPath: String?
+
+    init(data: Data) {
+        self.data = data
+    }
+
+    /// Returns the `full-path` of the first `rootfile` element, or `nil` when there is none.
+    func parse() -> String? {
+        let parser = XMLParser(data: data)
+        parser.delegate = self
+        parser.parse()
+        return opfPath
+    }
+
+    func parser(_ parser: XMLParser, didStartElement element: String, namespaceURI: String?,
+                qualifiedName: String?, attributes: [String: String]) {
+        // The first rootfile names the default rendition; later ones must not replace it.
+        guard opfPath == nil else { return }
+        // Tolerate a namespace prefix, as `OPFParser` does for its elements.
+        let localName = element.components(separatedBy: ":").last ?? element
+        if localName == "rootfile", let path = attributes["full-path"], !path.isEmpty {
+            opfPath = path
+        }
+    }
+}
+
+/// Collects title, author, manifest, and spine from an OPF package document.
+private final class OPFParser: NSObject, XMLParserDelegate {
+    private let data: Data
+    private var title: String?
+    private var author: String?
+    private var manifest: [String: String] = [:]
+    private var spine: [String] = []
+    private var currentText = ""
+    private var inMetadata = false
+
+    init(data: Data) {
+        self.data = data
+    }
+
+    /// Returns `(title, author, manifest, spine)`, or `nil` when the XML parser reports failure.
+    func parse() -> (String?, String?, [String: String], [String])? {
+        let parser = XMLParser(data: data)
+        parser.delegate = self
+        // A failed parse may have stopped before the manifest or spine was complete,
+        // so its partial result is discarded rather than returned as a shortened book.
+        guard parser.parse() else { return nil }
+        return (title, author, manifest, spine)
+    }
+
+    func parser(_ parser: XMLParser, didStartElement element: String, namespaceURI: String?,
+                qualifiedName: String?, attributes: [String: String]) {
+        // Text is accumulated per element, so each element starts empty.
+        currentText = ""
+
+        switch Self.localName(of: element) {
+        case "metadata":
+            inMetadata = true
+        case "item":
+            if let id = attributes["id"], let href = attributes["href"] {
+                manifest[id] = href
+            }
+        case "itemref":
+            if let idref = attributes["idref"] {
+                spine.append(idref)
+            }
+        default:
+            break
+        }
+    }
+
+    func parser(_ parser: XMLParser, foundCharacters string: String) {
+        currentText += string
+    }
+
+    func parser(_ parser: XMLParser, didEndElement element: String, namespaceURI: String?,
+                qualifiedName: String?) {
+        guard inMetadata else { return }
+        let text = currentText.trimmingCharacters(in: .whitespacesAndNewlines)
+
+        // Only the first non-empty title and creator are kept.
+        switch Self.localName(of: element) {
+        case "title" where title == nil && !text.isEmpty:
+            title = text
+        case "creator" where author == nil && !text.isEmpty:
+            author = text
+        case "metadata":
+            inMetadata = false
+        default:
+            break
+        }
+    }
+
+    /// Strips a namespace prefix such as `dc:` from a qualified element name.
+    private static func localName(of element: String) -> String {
+        element.components(separatedBy: ":").last ?? element
+    }
+}
diff --git a/VorleserKit/Tests/BookParserTests/EPUBParserTests.swift b/VorleserKit/Tests/BookParserTests/EPUBParserTests.swift
new file mode 100644
index 0000000..a9fc216
--- /dev/null
+++ b/VorleserKit/Tests/BookParserTests/EPUBParserTests.swift
@@ -0,0 +1,37 @@
+import Testing
+import Foundation
+@testable import BookParser
+
+@Suite("EPUBParser")
+struct EPUBParserTests {
+    let fixtureURL: URL = {
+        Bundle.module.url(forResource: "test", withExtension: "epub", subdirectory: "Fixtures")!
+    }()
+
+    @Test func parsesTestEPUB() throws {
+        let book = try EPUBParser.parse(url: fixtureURL)
+        #expect(book.chapters.count == 2)
+        #expect(book.chapters[0].title == "Chapter One")
+        #expect(book.chapters[0].text.contains("first chapter"))
+        #expect(book.chapters[1].title == "Chapter Two")
+    }
+
+    @Test func extractsTitle() throws {
+        let book = try EPUBParser.parse(url: fixtureURL)
+        #expect(!book.title.isEmpty)
+    }
+
+    @Test func throwsOnInvalidFile() {
+        let badURL = URL(fileURLWithPath: "/tmp/nonexistent.epub")
+        #expect(throws: EPUBParserError.self) {
+            try EPUBParser.parse(url: badURL)
+        }
+    }
+
+    @Test func chaptersHaveSequentialIndices() throws {
+        let book = try EPUBParser.parse(url: fixtureURL)
+        for (i, chapter) in book.chapters.enumerated() {
+            #expect(chapter.index == i)
+        }
+    }
+}
diff --git a/VorleserKit/Tests/BookParserTests/Fixtures/test.epub b/VorleserKit/Tests/BookParserTests/Fixtures/test.epub
new file mode 100644
index 0000000..c5704d2
Binary files /dev/null and b/VorleserKit/Tests/BookParserTests/Fixtures/test.epub differ