191 lines
6.7 KiB
Swift
191 lines
6.7 KiB
Swift
import Foundation
|
|
import ZIPFoundation
|
|
import SwiftSoup
|
|
|
|
/// One chapter of text extracted from an EPUB.
///
/// Every instance receives a freshly generated `id`, and the synthesized
/// `Hashable`/`Equatable` conformance covers all stored properties, so two
/// chapters built from identical text still compare as different values.
struct EPUBChapter: Identifiable, Hashable {
    /// Identity of this chapter value, generated at creation time.
    let id = UUID()
    /// Display title of the chapter.
    let title: String
    /// Chapter body text as supplied by the creator of the value.
    let rawText: String

    /// Creates a chapter with a new random identifier.
    /// - Parameters:
    ///   - title: Display title of the chapter.
    ///   - rawText: Body text of the chapter.
    init(title: String, rawText: String) {
        self.title = title
        self.rawText = rawText
    }
}
|
|
|
|
/// Unpacks an EPUB archive and converts the documents listed in its spine
/// into plain-text chapters.
final class EPUBService {
    /// Structural problems found while reading the EPUB package.
    enum EPUBError: Error, LocalizedError {
        case missingContainer
        case missingRootfile
        case missingOPF
        case invalidOPF
        case missingSpine

        var errorDescription: String? {
            switch self {
            case .missingContainer:
                return "Missing META-INF/container.xml"
            case .missingRootfile:
                return "container.xml did not include a rootfile"
            case .missingOPF:
                return "OPF file missing"
            case .invalidOPF:
                return "OPF parsing failed"
            case .missingSpine:
                return "OPF spine is empty"
            }
        }
    }

    /// Extracts the chapters of the EPUB at `epubURL`, in spine order.
    ///
    /// The archive is unpacked into a temporary directory that is removed again
    /// before this method returns, whether it succeeds or throws. Spine entries
    /// that have no manifest item, point at a missing file, or contain no text
    /// are skipped.
    ///
    /// - Parameter epubURL: File URL of the `.epub` archive.
    /// - Returns: One `EPUBChapter` per non-empty spine document.
    /// - Throws: `EPUBError` for structural problems, or any error raised while
    ///   unzipping, reading files, or parsing HTML.
    func extractChapters(from epubURL: URL) throws -> [EPUBChapter] {
        let fileManager = FileManager.default
        let tmpDir = try createTempDirectory()
        // Only the extracted text is returned, so the unpacked files are not
        // needed after this call; remove them on every exit path.
        defer { try? fileManager.removeItem(at: tmpDir) }
        try unzip(epubURL, to: tmpDir)

        let containerURL = tmpDir.appendingPathComponent("META-INF/container.xml")
        guard fileManager.fileExists(atPath: containerURL.path) else {
            throw EPUBError.missingContainer
        }

        let containerData = try Data(contentsOf: containerURL)
        guard let rootfilePath = try parseContainer(data: containerData), !rootfilePath.isEmpty else {
            throw EPUBError.missingRootfile
        }

        let opfURL = tmpDir.appendingPathComponent(rootfilePath)
        // A rootfile path that climbs out of the extraction directory is
        // treated the same as a missing package document.
        guard isContained(opfURL, in: tmpDir), fileManager.fileExists(atPath: opfURL.path) else {
            throw EPUBError.missingOPF
        }

        let opfData = try Data(contentsOf: opfURL)
        let opfResult = try parseOPF(data: opfData)
        guard !opfResult.spine.isEmpty else {
            throw EPUBError.missingSpine
        }

        // Manifest hrefs are relative to the directory that holds the OPF file.
        let baseURL = opfURL.deletingLastPathComponent()
        var chapters: [EPUBChapter] = []

        for idref in opfResult.spine {
            guard let href = opfResult.manifest[idref],
                  let contentURL = resolveContentURL(forHref: href, relativeTo: baseURL, within: tmpDir) else {
                continue
            }

            let html = try String(contentsOf: contentURL, encoding: .utf8)
            let fallbackTitle = contentURL.deletingPathExtension().lastPathComponent
            if let chapter = try makeChapter(fromHTML: html, fallbackTitle: fallbackTitle) {
                chapters.append(chapter)
            }
        }

        return chapters
    }

    /// Builds a chapter from one HTML document, parsing it a single time.
    ///
    /// The title is the first non-empty text among the first `h1` and the first
    /// `title` element, falling back to `fallbackTitle`.
    /// - Returns: `nil` when the document contains no text after cleaning.
    private func makeChapter(fromHTML html: String, fallbackTitle: String) throws -> EPUBChapter? {
        let document = try SwiftSoup.parse(html)
        let text = try document.text()
        let cleaned = cleanText(text)
        guard !cleaned.isEmpty else { return nil }

        var title = fallbackTitle
        for selector in ["h1", "title"] {
            guard let element = try document.select(selector).first() else { continue }
            let candidate = try element.text().trimmingCharacters(in: .whitespacesAndNewlines)
            if !candidate.isEmpty {
                title = candidate
                break
            }
        }

        return EPUBChapter(title: title, rawText: cleaned)
    }

    /// Maps a manifest `href` to an existing file inside `root`.
    ///
    /// Any `#fragment` is dropped. The percent-decoded path is tried first and
    /// the path as written second, so archives whose file names literally
    /// contain `%` sequences still resolve.
    /// - Returns: `nil` when no candidate exists inside `root`.
    private func resolveContentURL(forHref href: String, relativeTo baseURL: URL, within root: URL) -> URL? {
        let path = String(href.prefix { $0 != "#" })
        guard !path.isEmpty else { return nil }

        var candidates = [path]
        if let decoded = path.removingPercentEncoding, decoded != path {
            candidates.insert(decoded, at: 0)
        }

        for candidate in candidates {
            let url = baseURL.appendingPathComponent(candidate)
            if isContained(url, in: root), FileManager.default.fileExists(atPath: url.path) {
                return url
            }
        }
        return nil
    }

    /// Returns whether `url`, after `.` and `..` components are resolved, is
    /// `directory` itself or lies beneath it.
    private func isContained(_ url: URL, in directory: URL) -> Bool {
        let directoryPath = directory.standardizedFileURL.path
        let candidatePath = url.standardizedFileURL.path
        let prefix = directoryPath.hasSuffix("/") ? directoryPath : directoryPath + "/"
        return candidatePath == directoryPath || candidatePath.hasPrefix(prefix)
    }

    /// Extracts every entry of the archive at `url` into `destination`.
    /// - Throws: `CocoaError(.fileReadInvalidFileName)` when an entry's path
    ///   would land outside `destination`, or any error raised while opening
    ///   the archive, creating directories, or extracting an entry.
    private func unzip(_ url: URL, to destination: URL) throws {
        let archive = try Archive(url: url, accessMode: .read)
        let fileManager = FileManager.default

        for entry in archive {
            let entryURL = destination.appendingPathComponent(entry.path)
            // Entry paths come from the archive and may contain "..";
            // refuse to write anywhere outside the destination directory.
            guard isContained(entryURL, in: destination) else {
                throw CocoaError(.fileReadInvalidFileName, userInfo: [NSFilePathErrorKey: entryURL.path])
            }

            let parent = entryURL.deletingLastPathComponent()
            try fileManager.createDirectory(at: parent, withIntermediateDirectories: true)
            _ = try archive.extract(entry, to: entryURL)
        }
    }

    /// Creates a uniquely named `epub_<UUID>` directory under the temporary directory.
    private func createTempDirectory() throws -> URL {
        let base = URL(fileURLWithPath: NSTemporaryDirectory())
        let dir = base.appendingPathComponent("epub_\(UUID().uuidString)", isDirectory: true)
        try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true)
        return dir
    }

    /// Reads the rootfile path out of `META-INF/container.xml` data.
    /// - Returns: The path reported by `ContainerParser`, or `nil` if none was found.
    private func parseContainer(data: Data) throws -> String? {
        let delegate = ContainerParser()
        let parser = XMLParser(data: data)
        parser.delegate = delegate
        parser.parse()
        return delegate.rootfilePath
    }

    /// Parses OPF package data into its title, manifest and spine.
    /// - Throws: `EPUBError.invalidOPF` when the XML parser reports failure and
    ///   no spine entries were recovered. A partially parsed document that
    ///   still yielded spine entries is used as-is.
    private func parseOPF(data: Data) throws -> OPFParseResult {
        let delegate = OPFParser()
        let parser = XMLParser(data: data)
        parser.delegate = delegate

        let parsedCleanly = parser.parse()
        if !parsedCleanly && delegate.spine.isEmpty {
            throw EPUBError.invalidOPF
        }

        return OPFParseResult(
            title: delegate.title?.trimmingCharacters(in: .whitespacesAndNewlines),
            manifest: delegate.manifest,
            spine: delegate.spine
        )
    }

    /// Collapses every run of whitespace into a single space and trims both ends.
    private func cleanText(_ text: String) -> String {
        let collapsed = text.replacingOccurrences(of: "\\s+", with: " ", options: .regularExpression)
        return collapsed.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}
|
|
|
|
/// Values collected from an OPF package document.
private struct OPFParseResult {
    /// Package title, if one was found.
    let title: String?
    /// Manifest lookup from item `id` to item `href`.
    let manifest: [String: String]
    /// Spine `idref` values in document order.
    let spine: [String]

    /// Creates a result from the three collected values.
    init(title: String?, manifest: [String: String], spine: [String]) {
        self.title = title
        self.manifest = manifest
        self.spine = spine
    }
}
|
|
|
|
/// `XMLParser` delegate that pulls the package document path out of
/// `META-INF/container.xml`.
private final class ContainerParser: NSObject, XMLParserDelegate {
    /// `full-path` of the first `rootfile` element seen, or `nil` if none was found.
    var rootfilePath: String?

    /// Records the `full-path` attribute of the first `rootfile` element.
    ///
    /// Later `rootfile` elements are ignored so that, when a container lists
    /// several renditions, the first one is the one that gets used. The element
    /// name is compared by its local part, case-insensitively, so a namespace
    /// prefix does not prevent a match and `rootfiles` is not mistaken for it.
    func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String: String] = [:]) {
        guard rootfilePath == nil else { return }

        let localName = elementName.split(separator: ":").last.map { $0.lowercased() }
        guard localName == "rootfile",
              let path = attributeDict["full-path"],
              !path.isEmpty else {
            return
        }

        rootfilePath = path
    }
}
|
|
|
|
/// `XMLParser` delegate that collects the manifest, spine and title from an
/// OPF package document.
///
/// Element names are compared by their local part, case-insensitively, so
/// documents that prefix package elements (`opf:item`, `dc:title`) are handled
/// the same way as unprefixed ones.
private final class OPFParser: NSObject, XMLParserDelegate {
    /// Manifest lookup from item `id` to item `href`.
    var manifest: [String: String] = [:]
    /// Spine `idref` values in document order.
    var spine: [String] = []
    /// First non-empty title found, trimmed of surrounding whitespace.
    var title: String?

    /// True while character data belongs to a title element being read.
    private var isCollectingTitle = false
    /// Accumulates the character data of the current title element.
    private var titleBuffer = ""

    /// Returns the lowercased element name with any namespace prefix removed.
    private func localName(of elementName: String) -> String {
        let lowered = elementName.lowercased()
        guard let colon = lowered.lastIndex(of: ":") else { return lowered }
        return String(lowered[lowered.index(after: colon)...])
    }

    /// Records manifest items and spine references, and starts title capture.
    func parser(_ parser: XMLParser, didStartElement elementName: String, namespaceURI: String?, qualifiedName qName: String?, attributes attributeDict: [String: String] = [:]) {
        switch localName(of: elementName) {
        case "item":
            guard let id = attributeDict["id"], let href = attributeDict["href"] else { return }
            manifest[id] = href
        case "itemref":
            guard let idref = attributeDict["idref"] else { return }
            spine.append(idref)
        case "title" where title == nil:
            // Only capture while no title has been stored yet; an empty title
            // element leaves `title` nil, so a later one can still be used.
            isCollectingTitle = true
            titleBuffer = ""
        default:
            break
        }
    }

    /// Appends character data to the title buffer while a title is being read.
    func parser(_ parser: XMLParser, foundCharacters string: String) {
        guard isCollectingTitle else { return }
        titleBuffer.append(string)
    }

    /// Finishes title capture and stores the trimmed text if it is non-empty.
    func parser(_ parser: XMLParser, didEndElement elementName: String, namespaceURI: String?, qualifiedName qName: String?) {
        guard isCollectingTitle, localName(of: elementName) == "title" else { return }

        let trimmed = titleBuffer.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            title = trimmed
        }
        isCollectingTitle = false
    }
}
|