add EPUB parser with ZIP extraction, OPF/spine parsing, HTML-to-text
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
189
VorleserKit/Sources/BookParser/EPUBParser.swift
Normal file
189
VorleserKit/Sources/BookParser/EPUBParser.swift
Normal file
@@ -0,0 +1,189 @@
|
||||
import Foundation
|
||||
import ZIPFoundation
|
||||
import SwiftSoup
|
||||
|
||||
/// Failures reported while reading an EPUB file.
///
/// Each case renders a short human-readable message through `description`.
public enum EPUBParserError: Error, CustomStringConvertible {
    /// The file at the given URL could not be opened as an archive.
    case cannotOpenArchive(URL)
    /// Reported as a missing `META-INF/container.xml`.
    case missingContainerXML
    /// Nothing was found at the given path inside the archive.
    case missingOPF(String)
    /// The OPF package document was rejected as malformed.
    case malformedOPF
    /// The package document lists no spine items.
    case noSpineItems

    /// Human-readable message for this error.
    public var description: String {
        switch self {
        case let .cannotOpenArchive(location):
            return "cannot open EPUB archive at \(location.path)"
        case .missingContainerXML:
            return "missing META-INF/container.xml"
        case let .missingOPF(entryPath):
            return "missing OPF file at \(entryPath)"
        case .malformedOPF:
            return "malformed OPF (package document)"
        case .noSpineItems:
            return "EPUB has no spine items"
        }
    }
}
|
||||
|
||||
/// Converts an EPUB file into a `Book` made of plain-text chapters.
///
/// The EPUB is opened as a ZIP archive, the OPF package document is located through
/// `META-INF/container.xml`, and every spine item that the manifest can resolve is
/// turned into a `Chapter` whose text is the whitespace-normalized body text.
public struct EPUBParser {
    /// Parses the EPUB at `url`.
    ///
    /// - Parameter url: Location of the EPUB archive.
    /// - Returns: A `Book` whose title falls back to the file name (without extension)
    ///   when the package document provides none.
    /// - Throws: `EPUBParserError.cannotOpenArchive` if the archive cannot be opened,
    ///   `.missingContainerXML` if `META-INF/container.xml` is absent or names no
    ///   package document, `.missingOPF` if the package document is not in the archive,
    ///   `.malformedOPF` if the OPF parser yields no result, and `.noSpineItems` if
    ///   the spine is empty.
    public static func parse(url: URL) throws -> Book {
        let archive: Archive
        do {
            archive = try Archive(url: url, accessMode: .read)
        } catch {
            throw EPUBParserError.cannotOpenArchive(url)
        }

        let opfPath = try findOPFPath(in: archive)
        // Manifest hrefs are resolved relative to the directory holding the OPF file.
        let opfDir = (opfPath as NSString).deletingLastPathComponent
        let opfData = try extractData(from: archive, path: opfPath)
        let (title, author, manifest, spine) = try parseOPF(data: opfData)

        guard !spine.isEmpty else { throw EPUBParserError.noSpineItems }

        var chapters: [Chapter] = []
        for itemRef in spine {
            // Spine entries the manifest cannot resolve are skipped.
            guard let href = manifest[itemRef] else { continue }

            // Number chapters by their position in the output, not in the spine, so
            // the indices stay contiguous when a spine entry was skipped above.
            let index = chapters.count

            // Prefer the literal path; fall back to a normalized one for hrefs that
            // are percent-encoded, carry a fragment, or contain "." / ".." segments.
            let rawPath = opfDir.isEmpty ? href : "\(opfDir)/\(href)"
            let entryPath = archive[rawPath] != nil ? rawPath : normalizedArchivePath(rawPath)

            let content: (title: String, text: String)
            do {
                content = try chapterContent(
                    in: archive,
                    path: entryPath,
                    fallbackTitle: "Chapter \(index + 1)"
                )
            } catch {
                // Best effort: a chapter that cannot be read or parsed is kept as an
                // empty placeholder instead of failing the whole book.
                content = (title: "Chapter \(index + 1) (parse error)", text: "")
            }

            chapters.append(Chapter(index: index, title: content.title, text: content.text))
        }

        return Book(
            title: title ?? url.deletingPathExtension().lastPathComponent,
            author: author,
            chapters: chapters
        )
    }

    /// Reads one chapter document and reduces it to a title and plain text.
    ///
    /// The title is the first non-empty text among the elements matched by
    /// `h1, h2, h3, title` (in the order SwiftSoup returns them); `fallbackTitle`
    /// is used when none has text. Data that is not valid UTF-8 is treated as an
    /// empty document.
    private static func chapterContent(
        in archive: Archive,
        path: String,
        fallbackTitle: String
    ) throws -> (title: String, text: String) {
        let htmlData = try extractData(from: archive, path: path)
        let html = String(data: htmlData, encoding: .utf8) ?? ""
        let doc = try SwiftSoup.parse(html)

        let heading = try doc.select("h1, h2, h3, title").array()
            .map { try $0.text().trimmingCharacters(in: .whitespacesAndNewlines) }
            .first(where: { !$0.isEmpty })
        let body = try doc.body()?.text() ?? ""

        return (title: heading ?? fallbackTitle, text: normalizeWhitespace(body))
    }

    /// Turns an href-derived path into a plain archive entry path: drops any
    /// `#fragment`, decodes percent-escapes, and collapses `.` and `..` segments.
    private static func normalizedArchivePath(_ path: String) -> String {
        let withoutFragment = path
            .split(separator: "#", maxSplits: 1, omittingEmptySubsequences: false)
            .first
            .map(String.init) ?? path
        // Keep the text as-is when it contains an invalid percent sequence.
        let decoded = withoutFragment.removingPercentEncoding ?? withoutFragment

        var segments: [Substring] = []
        for segment in decoded.split(separator: "/") {
            switch segment {
            case ".":
                continue
            case "..":
                _ = segments.popLast()
            default:
                segments.append(segment)
            }
        }
        return segments.joined(separator: "/")
    }

    /// Returns the package document path declared in `META-INF/container.xml`.
    ///
    /// - Throws: `EPUBParserError.missingContainerXML` when the container file is
    ///   absent from the archive or declares no package document path.
    private static func findOPFPath(in archive: Archive) throws -> String {
        let containerPath = "META-INF/container.xml"
        // Check up front so a missing container is not reported as a missing OPF.
        guard archive[containerPath] != nil else {
            throw EPUBParserError.missingContainerXML
        }
        let containerData = try extractData(from: archive, path: containerPath)
        let parser = ContainerXMLParser(data: containerData)
        guard let opfPath = parser.parse() else {
            throw EPUBParserError.missingContainerXML
        }
        return opfPath
    }

    /// Parses the package document into title, author, manifest (id → href) and spine (idrefs).
    ///
    /// - Throws: `EPUBParserError.malformedOPF` when the OPF parser returns no result.
    private static func parseOPF(data: Data) throws -> (title: String?, author: String?, manifest: [String: String], spine: [String]) {
        let parser = OPFParser(data: data)
        guard let result = parser.parse() else {
            throw EPUBParserError.malformedOPF
        }
        return result
    }

    /// Extracts the full contents of the archive entry at `path`.
    ///
    /// - Throws: `EPUBParserError.missingOPF(path)` when no such entry exists, or any
    ///   error raised by the archive while extracting.
    private static func extractData(from archive: Archive, path: String) throws -> Data {
        guard let entry = archive[path] else {
            throw EPUBParserError.missingOPF(path)
        }
        var data = Data()
        // The archive delivers the entry in chunks; concatenate them.
        _ = try archive.extract(entry) { chunk in
            data.append(chunk)
        }
        return data
    }

    /// Collapses every run of whitespace and newlines into a single space and
    /// removes leading and trailing whitespace.
    private static func normalizeWhitespace(_ text: String) -> String {
        text.components(separatedBy: .whitespacesAndNewlines)
            .filter { !$0.isEmpty }
            .joined(separator: " ")
    }
}
|
||||
|
||||
// MARK: - XML Parsers
|
||||
|
||||
/// Reads `META-INF/container.xml` and extracts the path of the package document.
///
/// Only the first `rootfile` element with a non-empty `full-path` attribute is
/// used; later `rootfile` elements are ignored.
private final class ContainerXMLParser: NSObject, XMLParserDelegate {
    private let data: Data
    private var opfPath: String?

    /// - Parameter data: Raw bytes of the container XML document.
    init(data: Data) {
        self.data = data
    }

    /// Parses the document.
    ///
    /// - Returns: The `full-path` of the first usable `rootfile`, or `nil` when none
    ///   was found. The XML parser's own success flag is not consulted, so a path
    ///   found before a later syntax error is still returned.
    func parse() -> String? {
        opfPath = nil
        let parser = XMLParser(data: data)
        parser.delegate = self
        parser.parse()
        return opfPath
    }

    func parser(_ parser: XMLParser, didStartElement element: String, namespaceURI: String?,
                qualifiedName: String?, attributes: [String: String]) {
        // Keep the first match instead of letting later rootfiles overwrite it.
        guard opfPath == nil else { return }

        // Accept a namespace-prefixed element name such as "ns:rootfile".
        let localName = element.components(separatedBy: ":").last ?? element
        if localName == "rootfile", let path = attributes["full-path"], !path.isEmpty {
            opfPath = path
        }
    }
}
|
||||
|
||||
/// Reads an OPF package document and collects its title, author, manifest and spine.
///
/// Element names are matched on the part after any namespace prefix, so
/// `dc:title` and `title` are treated alike.
private final class OPFParser: NSObject, XMLParserDelegate {
    private let data: Data
    private var title: String?
    private var author: String?
    /// Manifest item `id` → `href`.
    private var manifest: [String: String] = [:]
    /// `idref` values of the spine, in document order.
    private var spine: [String] = []
    /// Character data collected since the most recent start tag.
    private var currentText = ""
    private var inMetadata = false

    /// - Parameter data: Raw bytes of the OPF document.
    init(data: Data) {
        self.data = data
    }

    /// Parses the document.
    ///
    /// - Returns: `(title, author, manifest, spine)`, or `nil` when the XML parser
    ///   reports a failure and no spine entry could be recovered. A document that
    ///   fails only after spine entries were read still returns what was collected.
    func parse() -> (String?, String?, [String: String], [String])? {
        // Start from a clean slate so repeated calls do not accumulate results.
        title = nil
        author = nil
        manifest = [:]
        spine = []
        currentText = ""
        inMetadata = false

        let parser = XMLParser(data: data)
        parser.delegate = self
        let succeeded = parser.parse()

        if !succeeded && spine.isEmpty {
            return nil
        }
        return (title, author, manifest, spine)
    }

    func parser(_ parser: XMLParser, didStartElement element: String, namespaceURI: String?,
                qualifiedName: String?, attributes: [String: String]) {
        let localName = element.components(separatedBy: ":").last ?? element
        // Each start tag begins a fresh text buffer.
        currentText = ""

        switch localName {
        case "metadata":
            inMetadata = true
        case "item":
            if let id = attributes["id"], let href = attributes["href"] {
                manifest[id] = href
            }
        case "itemref":
            if let idref = attributes["idref"] {
                spine.append(idref)
            }
        default:
            break
        }
    }

    func parser(_ parser: XMLParser, foundCharacters string: String) {
        currentText += string
    }

    func parser(_ parser: XMLParser, didEndElement element: String, namespaceURI: String?,
                qualifiedName: String?) {
        // Title and creator only count while inside <metadata>.
        guard inMetadata else { return }

        let localName = element.components(separatedBy: ":").last ?? element
        let trimmed = currentText.trimmingCharacters(in: .whitespacesAndNewlines)

        switch localName {
        case "title" where title == nil && !trimmed.isEmpty:
            // The first non-empty title wins.
            title = trimmed
        case "creator" where author == nil && !trimmed.isEmpty:
            // The first non-empty creator wins.
            author = trimmed
        case "metadata":
            inMetadata = false
        default:
            break
        }
    }
}
|
||||
37
VorleserKit/Tests/BookParserTests/EPUBParserTests.swift
Normal file
37
VorleserKit/Tests/BookParserTests/EPUBParserTests.swift
Normal file
@@ -0,0 +1,37 @@
|
||||
import Testing
|
||||
import Foundation
|
||||
@testable import BookParser
|
||||
|
||||
@Suite("EPUBParser")
struct EPUBParserTests {
    /// Location of the bundled `Fixtures/test.epub`.
    /// Force-unwrapped on purpose: a missing fixture is a test-setup error.
    let fixtureURL: URL = {
        Bundle.module.url(forResource: "test", withExtension: "epub", subdirectory: "Fixtures")!
    }()

    /// The fixture is expected to yield two chapters with known titles.
    @Test func parsesTestEPUB() throws {
        let book = try EPUBParser.parse(url: fixtureURL)
        // Stop here on a wrong count; indexing below would otherwise crash the
        // test process instead of recording a failure.
        try #require(book.chapters.count == 2)
        #expect(book.chapters[0].title == "Chapter One")
        #expect(book.chapters[0].text.contains("first chapter"))
        #expect(book.chapters[1].title == "Chapter Two")
    }

    /// The parsed book always carries a non-empty title.
    @Test func extractsTitle() throws {
        let book = try EPUBParser.parse(url: fixtureURL)
        #expect(!book.title.isEmpty)
    }

    /// Parsing a path with no file behind it throws an `EPUBParserError`.
    @Test func throwsOnInvalidFile() {
        // A unique name ensures a stray file cannot make the path exist.
        let badURL = FileManager.default.temporaryDirectory
            .appendingPathComponent("nonexistent-\(UUID().uuidString).epub")
        #expect(throws: EPUBParserError.self) {
            try EPUBParser.parse(url: badURL)
        }
    }

    /// Each chapter's `index` equals its position in `book.chapters`.
    @Test func chaptersHaveSequentialIndices() throws {
        let book = try EPUBParser.parse(url: fixtureURL)
        for (i, chapter) in book.chapters.enumerated() {
            #expect(chapter.index == i)
        }
    }
}
|
||||
BIN
VorleserKit/Tests/BookParserTests/Fixtures/test.epub
Normal file
BIN
VorleserKit/Tests/BookParserTests/Fixtures/test.epub
Normal file
Binary file not shown.
Reference in New Issue
Block a user