add EPUB parser with ZIP extraction, OPF/spine parsing, HTML-to-text

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-13 21:51:34 +01:00
parent 53b91ee4ed
commit 2e647d7fe3
3 changed files with 226 additions and 0 deletions

View File

@@ -0,0 +1,189 @@
import Foundation
import ZIPFoundation
import SwiftSoup
/// Failures that can occur while reading an EPUB archive.
public enum EPUBParserError: Error, CustomStringConvertible {
    /// The file at the associated URL could not be opened as an archive.
    case cannotOpenArchive(URL)
    /// The container document is missing or names no package document.
    case missingContainerXML
    /// No archive entry exists at the associated path.
    case missingOPF(String)
    /// The package (OPF) document could not be parsed.
    case malformedOPF
    /// The package document lists no spine items.
    case noSpineItems

    /// Human-readable explanation of the failure.
    public var description: String {
        switch self {
        case let .cannotOpenArchive(location):
            return "cannot open EPUB archive at \(location.path)"
        case .missingContainerXML:
            return "missing META-INF/container.xml"
        case let .missingOPF(entryPath):
            return "missing OPF file at \(entryPath)"
        case .malformedOPF:
            return "malformed OPF (package document)"
        case .noSpineItems:
            return "EPUB has no spine items"
        }
    }
}
/// Reads an EPUB archive and converts it into a `Book` of plain-text chapters.
///
/// Parsing proceeds in three steps: locate the package (OPF) document through
/// `META-INF/container.xml`, read metadata, manifest and spine from the OPF,
/// then extract each spine item and reduce its HTML to whitespace-normalized text.
public struct EPUBParser {
    /// Archive path of the container document that points at the OPF file.
    private static let containerPath = "META-INF/container.xml"

    /// Parses the EPUB at `url`.
    ///
    /// - Parameter url: File URL of the EPUB archive.
    /// - Returns: A `Book`; its title falls back to the file name without
    ///   extension when the OPF declares none.
    /// - Throws: `EPUBParserError` when the archive cannot be opened, the
    ///   container or OPF is missing or malformed, or the spine is empty.
    public static func parse(url: URL) throws -> Book {
        let archive: Archive
        do {
            archive = try Archive(url: url, accessMode: .read)
        } catch {
            throw EPUBParserError.cannotOpenArchive(url)
        }

        let opfPath = try findOPFPath(in: archive)
        // Manifest hrefs are relative to the directory that holds the OPF.
        let opfDir = (opfPath as NSString).deletingLastPathComponent
        let opfData = try extractData(from: archive, path: opfPath)
        let (title, author, manifest, spine) = try parseOPF(data: opfData)
        guard !spine.isEmpty else { throw EPUBParserError.noSpineItems }

        var chapters: [Chapter] = []
        for itemRef in spine {
            // Spine entries that reference no manifest item are skipped.
            guard let href = manifest[itemRef] else { continue }
            // Number chapters by output position so a skipped spine entry
            // does not leave a gap in the chapter indices.
            let index = chapters.count
            let fullPath = resolve(href: href, relativeTo: opfDir)

            let content: (title: String, text: String)
            do {
                content = try loadChapter(
                    from: archive,
                    path: fullPath,
                    fallbackTitle: "Chapter \(index + 1)"
                )
            } catch {
                // Best effort: one unreadable chapter must not fail the whole book.
                content = ("Chapter \(index + 1) (parse error)", "")
            }
            chapters.append(Chapter(index: index, title: content.title, text: content.text))
        }

        return Book(
            title: title ?? url.deletingPathExtension().lastPathComponent,
            author: author,
            chapters: chapters
        )
    }

    /// Extracts one content document and reduces it to a title and plain text.
    private static func loadChapter(
        from archive: Archive,
        path: String,
        fallbackTitle: String
    ) throws -> (title: String, text: String) {
        let htmlData = try extractData(from: archive, path: path)
        let doc = try SwiftSoup.parse(decodeText(htmlData))
        let title = try extractTitle(from: doc) ?? fallbackTitle
        let body = try doc.body()?.text() ?? ""
        return (title, normalizeWhitespace(body))
    }

    /// Returns the first non-empty heading text, falling back to `<title>`.
    ///
    /// A single combined selector yields matches in document order, so the
    /// `<title>` in `<head>` would always win over the headings; querying in
    /// two passes gives the headings priority.
    private static func extractTitle(from doc: Document) throws -> String? {
        for selector in ["h1, h2, h3", "title"] {
            if let text = try doc.select(selector).first()?.text(), !text.isEmpty {
                return text
            }
        }
        return nil
    }

    /// Decodes content-document bytes as UTF-8, or UTF-16 when a BOM is present.
    ///
    /// Invalid UTF-8 is decoded lossily instead of producing an empty chapter.
    private static func decodeText(_ data: Data) -> String {
        if let utf8 = String(data: data, encoding: .utf8) {
            return utf8
        }
        let hasUTF16BOM = data.starts(with: [0xFF, 0xFE]) || data.starts(with: [0xFE, 0xFF])
        if hasUTF16BOM, let utf16 = String(data: data, encoding: .utf16) {
            return utf16
        }
        return String(decoding: data, as: UTF8.self)
    }

    /// Resolves a manifest `href` against the OPF directory into an archive path.
    ///
    /// Percent-escapes are decoded per path component and `.` / `..` segments
    /// are collapsed; `..` never climbs above the archive root.
    private static func resolve(href: String, relativeTo baseDir: String) -> String {
        var components = baseDir.split(separator: "/").map(String.init)
        for part in href.split(separator: "/") {
            switch part {
            case ".":
                continue
            case "..":
                if !components.isEmpty { components.removeLast() }
            default:
                let raw = String(part)
                components.append(raw.removingPercentEncoding ?? raw)
            }
        }
        return components.joined(separator: "/")
    }

    /// Reads `META-INF/container.xml` and returns the path of the OPF file.
    ///
    /// - Throws: `EPUBParserError.missingContainerXML` when the container is
    ///   absent or names no rootfile.
    private static func findOPFPath(in archive: Archive) throws -> String {
        // Check explicitly: `extractData` reports every missing entry as
        // `.missingOPF`, which would mislabel a missing container.
        guard archive[containerPath] != nil else {
            throw EPUBParserError.missingContainerXML
        }
        let containerData = try extractData(from: archive, path: containerPath)
        guard let opfPath = ContainerXMLParser(data: containerData).parse() else {
            throw EPUBParserError.missingContainerXML
        }
        return opfPath
    }

    /// Parses the OPF bytes into title, author, manifest (id → href) and spine (idrefs).
    ///
    /// - Throws: `EPUBParserError.malformedOPF` when the parser yields no result.
    private static func parseOPF(data: Data) throws -> (title: String?, author: String?, manifest: [String: String], spine: [String]) {
        guard let result = OPFParser(data: data).parse() else {
            throw EPUBParserError.malformedOPF
        }
        return result
    }

    /// Returns the uncompressed bytes of the archive entry at `path`.
    ///
    /// - Throws: `EPUBParserError.missingOPF(path)` when no such entry exists,
    ///   or whatever error the archive raises during extraction.
    private static func extractData(from archive: Archive, path: String) throws -> Data {
        guard let entry = archive[path] else {
            throw EPUBParserError.missingOPF(path)
        }
        var data = Data()
        _ = try archive.extract(entry) { chunk in
            data.append(chunk)
        }
        return data
    }

    /// Collapses every run of whitespace and newlines into a single space.
    private static func normalizeWhitespace(_ text: String) -> String {
        text.components(separatedBy: .whitespacesAndNewlines)
            .filter { !$0.isEmpty }
            .joined(separator: " ")
    }
}
// MARK: - XML Parsers
/// Extracts the package-document path from `META-INF/container.xml`.
private final class ContainerXMLParser: NSObject, XMLParserDelegate {
    private let data: Data
    private var opfPath: String?

    /// - Parameter data: Raw bytes of the container document.
    init(data: Data) {
        self.data = data
    }

    /// Runs the XML parser over the container document.
    ///
    /// - Returns: The `full-path` of the first `<rootfile>` that has a
    ///   non-empty path, or `nil` when none was found.
    func parse() -> String? {
        let parser = XMLParser(data: data)
        parser.delegate = self
        // The Bool result is intentionally ignored: a rootfile found before
        // a later syntax error is still usable.
        parser.parse()
        return opfPath
    }

    func parser(_ parser: XMLParser, didStartElement element: String, namespaceURI: String?,
                qualifiedName: String?, attributes: [String: String]) {
        // Keep only the first rootfile; later ones are alternate renditions
        // and must not overwrite it.
        guard opfPath == nil else { return }
        // Tolerate a namespace prefix on the element name (e.g. "ocf:rootfile").
        let localName = element.components(separatedBy: ":").last ?? element
        if localName == "rootfile", let path = attributes["full-path"], !path.isEmpty {
            opfPath = path
        }
    }
}
/// Reads title, author, manifest and spine from an OPF package document.
private final class OPFParser: NSObject, XMLParserDelegate {
    private let data: Data
    private var title: String?
    private var author: String?
    /// Manifest item `id` → `href`.
    private var manifest: [String: String] = [:]
    /// Spine `idref` values in reading order.
    private var spine: [String] = []
    /// Character data collected since the most recent start tag inside `<metadata>`.
    private var currentText = ""
    private var inMetadata = false

    /// - Parameter data: Raw bytes of the OPF document.
    init(data: Data) {
        self.data = data
    }

    /// Parses the document.
    ///
    /// - Returns: `(title, author, manifest, spine)`, or `nil` when the XML
    ///   parser reports a failure.
    func parse() -> (String?, String?, [String: String], [String])? {
        let parser = XMLParser(data: data)
        parser.delegate = self
        // Honor the parser's verdict; ignoring it would hand callers the
        // partial data of a broken document as if it were complete.
        guard parser.parse() else { return nil }
        return (title, author, manifest, spine)
    }

    /// Strips a namespace prefix, e.g. "dc:title" → "title".
    private func localName(of element: String) -> String {
        element.components(separatedBy: ":").last ?? element
    }

    func parser(_ parser: XMLParser, didStartElement element: String, namespaceURI: String?,
                qualifiedName: String?, attributes: [String: String]) {
        // Each start tag opens a fresh text buffer.
        currentText = ""
        switch localName(of: element) {
        case "metadata":
            inMetadata = true
        case "item":
            if let id = attributes["id"], let href = attributes["href"] {
                manifest[id] = href
            }
        case "itemref":
            if let idref = attributes["idref"] {
                spine.append(idref)
            }
        default:
            break
        }
    }

    func parser(_ parser: XMLParser, foundCharacters string: String) {
        // Text is only ever read while inside <metadata>.
        guard inMetadata else { return }
        currentText += string
    }

    func parser(_ parser: XMLParser, didEndElement element: String, namespaceURI: String?,
                qualifiedName: String?) {
        guard inMetadata else { return }
        let trimmed = currentText.trimmingCharacters(in: .whitespacesAndNewlines)
        switch localName(of: element) {
        case "title":
            // The first non-empty title wins.
            if title == nil && !trimmed.isEmpty { title = trimmed }
        case "creator":
            // The first non-empty creator wins.
            if author == nil && !trimmed.isEmpty { author = trimmed }
        case "metadata":
            inMetadata = false
        default:
            break
        }
    }
}

View File

@@ -0,0 +1,37 @@
import Testing
import Foundation
@testable import BookParser
/// End-to-end tests for `EPUBParser` against the bundled `Fixtures/test.epub`.
@Suite("EPUBParser")
struct EPUBParserTests {
    // Force-unwrap is deliberate: a missing bundled fixture is a broken test
    // setup and should fail loudly.
    let fixtureURL: URL = {
        Bundle.module.url(forResource: "test", withExtension: "epub", subdirectory: "Fixtures")!
    }()

    @Test func parsesTestEPUB() throws {
        let book = try EPUBParser.parse(url: fixtureURL)
        // `#require` stops the test here; `#expect` would continue and the
        // subscripts below would trap on a short chapter list.
        try #require(book.chapters.count == 2)
        #expect(book.chapters[0].title == "Chapter One")
        #expect(book.chapters[0].text.contains("first chapter"))
        #expect(book.chapters[1].title == "Chapter Two")
    }

    @Test func extractsTitle() throws {
        let book = try EPUBParser.parse(url: fixtureURL)
        #expect(!book.title.isEmpty)
    }

    @Test func throwsOnInvalidFile() {
        // A unique name guarantees the file does not exist on any machine.
        let badURL = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString)
            .appendingPathExtension("epub")
        #expect(throws: EPUBParserError.self) {
            try EPUBParser.parse(url: badURL)
        }
    }

    @Test func chaptersHaveSequentialIndices() throws {
        let book = try EPUBParser.parse(url: fixtureURL)
        for (i, chapter) in book.chapters.enumerated() {
            #expect(chapter.index == i)
        }
    }
}

Binary file not shown.