Files
vorleser/VorleserMac/Services/EPUBService.swift

191 lines
6.7 KiB
Swift

import Foundation
import ZIPFoundation
import SwiftSoup
/// A single readable chapter extracted from an EPUB.
///
/// Every instance receives a freshly generated `id`, so two chapters with
/// identical `title` and `rawText` still compare (and hash) as different values.
struct EPUBChapter: Identifiable, Hashable {
    /// Unique identity generated when the chapter value is created.
    let id = UUID()
    /// Display title of the chapter.
    let title: String
    /// Plain text content of the chapter.
    let rawText: String

    /// Creates a chapter with a new unique `id`.
    /// - Parameters:
    ///   - title: Display title of the chapter.
    ///   - rawText: Plain text content of the chapter.
    init(title: String, rawText: String) {
        self.title = title
        self.rawText = rawText
    }
}
/// Extracts chapter text from EPUB archives.
///
/// The archive is unpacked into a private temporary directory,
/// `META-INF/container.xml` is read to locate the OPF package document, and
/// every spine item is converted to whitespace-normalised plain text.
final class EPUBService {
    /// Structural problems that prevent an EPUB from being read.
    enum EPUBError: Error, LocalizedError {
        case missingContainer
        case missingRootfile
        case missingOPF
        case invalidOPF
        case missingSpine

        var errorDescription: String? {
            switch self {
            case .missingContainer:
                return "Missing META-INF/container.xml"
            case .missingRootfile:
                return "container.xml did not include a rootfile"
            case .missingOPF:
                return "OPF file missing"
            case .invalidOPF:
                return "OPF parsing failed"
            case .missingSpine:
                return "OPF spine is empty"
            }
        }
    }

    /// Reads the EPUB at `epubURL` and returns its non-empty chapters in spine order.
    ///
    /// Spine entries that have no manifest item, whose file is missing, or whose
    /// text is empty after cleaning are skipped.
    ///
    /// - Parameter epubURL: Location of the `.epub` archive on disk.
    /// - Returns: One `EPUBChapter` per readable spine item.
    /// - Throws: `EPUBError` for structural problems, plus any error raised while
    ///   unzipping, reading files, or parsing HTML.
    func extractChapters(from epubURL: URL) throws -> [EPUBChapter] {
        let tmpDir = try createTempDirectory()
        // All chapter text is copied into memory before returning, so the
        // extracted files can be removed on every exit path.
        defer { try? FileManager.default.removeItem(at: tmpDir) }

        try unzip(epubURL, to: tmpDir)

        let containerURL = tmpDir.appendingPathComponent("META-INF/container.xml")
        guard FileManager.default.fileExists(atPath: containerURL.path) else {
            throw EPUBError.missingContainer
        }
        let containerData = try Data(contentsOf: containerURL)
        guard let rootfilePath = try parseContainer(data: containerData) else {
            throw EPUBError.missingRootfile
        }

        let opfURL = tmpDir.appendingPathComponent(rootfilePath)
        // The rootfile path comes from the archive; never follow it outside the
        // extracted tree.
        guard isContained(opfURL, in: tmpDir),
              FileManager.default.fileExists(atPath: opfURL.path) else {
            throw EPUBError.missingOPF
        }
        let opfData = try Data(contentsOf: opfURL)
        let opfResult = try parseOPF(data: opfData)
        guard !opfResult.spine.isEmpty else {
            throw EPUBError.missingSpine
        }

        let baseURL = opfURL.deletingLastPathComponent()
        var chapters: [EPUBChapter] = []
        for idref in opfResult.spine {
            guard let href = opfResult.manifest[idref],
                  let contentURL = existingContentURL(for: href, relativeTo: baseURL, within: tmpDir) else {
                continue
            }
            let html = try String(contentsOf: contentURL, encoding: .utf8)
            let parsed = try parseChapter(html: html)
            let title = parsed.title ?? contentURL.deletingPathExtension().lastPathComponent
            let cleaned = cleanText(parsed.text)
            if cleaned.isEmpty { continue }
            chapters.append(EPUBChapter(title: title, rawText: cleaned))
        }
        return chapters
    }

    /// Extracts every entry of the archive at `url` below `destination`.
    ///
    /// Entries whose path would resolve outside `destination` (for example via
    /// `../` components) are skipped rather than written.
    private func unzip(_ url: URL, to destination: URL) throws {
        let archive = try Archive(url: url, accessMode: .read)
        for entry in archive {
            let entryURL = destination.appendingPathComponent(entry.path).standardizedFileURL
            // Guard against "zip slip": archive contents are untrusted input.
            guard isContained(entryURL, in: destination) else { continue }
            let parent = entryURL.deletingLastPathComponent()
            try FileManager.default.createDirectory(at: parent, withIntermediateDirectories: true)
            _ = try archive.extract(entry, to: entryURL)
        }
    }

    /// Creates a uniquely named directory inside the system temporary directory.
    private func createTempDirectory() throws -> URL {
        let base = URL(fileURLWithPath: NSTemporaryDirectory())
        let dir = base.appendingPathComponent("epub_\(UUID().uuidString)", isDirectory: true)
        try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
        return dir
    }

    /// Returns `true` when `url`, after resolving `.` and `..`, lies strictly
    /// below `directory`.
    private func isContained(_ url: URL, in directory: URL) -> Bool {
        let directoryPath = directory.standardizedFileURL.path
        let prefix = directoryPath.hasSuffix("/") ? directoryPath : directoryPath + "/"
        return url.standardizedFileURL.path.hasPrefix(prefix)
    }

    /// Resolves a manifest `href` to an existing file inside `root`.
    ///
    /// The href is tried verbatim first and then percent-decoded, because
    /// manifest hrefs are URL references and may encode characters such as spaces.
    private func existingContentURL(for href: String, relativeTo baseURL: URL, within root: URL) -> URL? {
        var candidates = [href]
        if let decoded = href.removingPercentEncoding, decoded != href {
            candidates.append(decoded)
        }
        for candidate in candidates {
            let url = baseURL.appendingPathComponent(candidate)
            if isContained(url, in: root), FileManager.default.fileExists(atPath: url.path) {
                return url
            }
        }
        return nil
    }

    /// Parses `container.xml` and returns the rootfile path it declares, if any.
    private func parseContainer(data: Data) throws -> String? {
        let parser = XMLParser(data: data)
        let delegate = ContainerParser()
        parser.delegate = delegate
        parser.parse()
        return delegate.rootfilePath
    }

    /// Parses the OPF package document into its title, manifest and spine.
    ///
    /// - Throws: `EPUBError.invalidOPF` when the XML parser reports a failure and
    ///   no spine entry was recovered. Partially parsed documents that still
    ///   yielded a spine are accepted.
    private func parseOPF(data: Data) throws -> OPFParseResult {
        let parser = XMLParser(data: data)
        let delegate = OPFParser()
        parser.delegate = delegate
        let parsedCleanly = parser.parse()
        if !parsedCleanly && delegate.spine.isEmpty {
            throw EPUBError.invalidOPF
        }
        return OPFParseResult(
            title: delegate.title?.trimmingCharacters(in: .whitespacesAndNewlines),
            manifest: delegate.manifest,
            spine: delegate.spine
        )
    }

    /// Parses chapter HTML once and returns its text together with a title.
    ///
    /// The title is the text of the first `h1`, or else of the first `title`
    /// element; it is `nil` when neither element exists.
    private func parseChapter(html: String) throws -> (text: String, title: String?) {
        let document = try SwiftSoup.parse(html)
        let text = try document.text()
        let heading = try document.select("h1").first() ?? document.select("title").first()
        return (text, try heading?.text())
    }

    /// Collapses every run of whitespace into a single space and trims the ends.
    private func cleanText(_ text: String) -> String {
        let collapsed = text.replacingOccurrences(of: "\\s+", with: " ", options: .regularExpression)
        return collapsed.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
/// Values collected from an OPF package document.
private struct OPFParseResult {
    /// Title of the publication, if one was found.
    let title: String?
    /// Manifest item `id` mapped to the item's `href`.
    let manifest: [String: String]
    /// Manifest item ids referenced by the spine, in document order.
    let spine: [String]

    /// Creates a result from already-parsed values.
    init(title: String?, manifest: [String: String], spine: [String]) {
        self.title = title
        self.manifest = manifest
        self.spine = spine
    }
}
/// `XMLParserDelegate` that pulls the package document path out of
/// `META-INF/container.xml`.
private final class ContainerParser: NSObject, XMLParserDelegate {
    /// The `full-path` attribute of the first rootfile element encountered, or
    /// `nil` when none carried a non-empty path.
    var rootfilePath: String?

    func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String : String] = [:]) {
        // A container may list several rootfiles; the first one is the default
        // rendition, so later entries must not overwrite it.
        guard rootfilePath == nil else { return }
        guard elementName.lowercased().contains("rootfile") else { return }
        // An empty path cannot name a package document; keep looking.
        guard let path = attributeDict["full-path"], !path.isEmpty else { return }
        rootfilePath = path
    }
}
/// `XMLParserDelegate` that collects the manifest, spine and title from an OPF
/// package document.
///
/// Element names are compared case-insensitively by their local part, so both
/// `item` and a prefixed form such as `opf:item` are recognised.
private final class OPFParser: NSObject, XMLParserDelegate {
    /// Manifest item `id` mapped to its `href`.
    var manifest: [String: String] = [:]
    /// `idref` values of the spine's `itemref` elements, in document order.
    var spine: [String] = []
    /// Text of the first non-empty title element, trimmed of surrounding whitespace.
    var title: String?

    private var isCollectingTitle = false
    private var titleBuffer = ""

    /// Returns the lowercased element name with any `prefix:` removed.
    private func localName(of elementName: String) -> String {
        let lowered = elementName.lowercased()
        guard let colon = lowered.lastIndex(of: ":") else { return lowered }
        return String(lowered[lowered.index(after: colon)...])
    }

    func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String : String] = [:]) {
        switch localName(of: elementName) {
        case "item":
            if let id = attributeDict["id"], let href = attributeDict["href"] {
                manifest[id] = href
            }
        case "itemref":
            if let idref = attributeDict["idref"] {
                spine.append(idref)
            }
        case "title":
            // Only the first non-empty title is kept; later ones are ignored.
            if title == nil {
                isCollectingTitle = true
                titleBuffer = ""
            }
        default:
            break
        }
    }

    func parser(_ parser: XMLParser, foundCharacters string: String) {
        // Character data may arrive in several pieces, so accumulate it.
        if isCollectingTitle {
            titleBuffer.append(string)
        }
    }

    func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        guard isCollectingTitle, localName(of: elementName) == "title" else { return }
        let trimmed = titleBuffer.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            title = trimmed
        }
        isCollectingTitle = false
    }
}