scaffold VorleserKit package with shared types, sentence segmenter, tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -18,6 +18,8 @@ autoaudiobook/samples/
|
|||||||
|
|
||||||
# Build outputs
|
# Build outputs
|
||||||
build/
|
build/
|
||||||
|
.build/
|
||||||
|
**/.build/
|
||||||
DerivedData/
|
DerivedData/
|
||||||
*.dSYM/
|
*.dSYM/
|
||||||
*.log
|
*.log
|
||||||
|
|||||||
24
VorleserKit/Package.resolved
Normal file
24
VorleserKit/Package.resolved
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"originHash" : "1bf1d418d8d58ea936176af8e96313605ea72a6fbf437f877b8e5d9a5b0d822c",
|
||||||
|
"pins" : [
|
||||||
|
{
|
||||||
|
"identity" : "swiftsoup",
|
||||||
|
"kind" : "remoteSourceControl",
|
||||||
|
"location" : "https://github.com/scinfu/SwiftSoup.git",
|
||||||
|
"state" : {
|
||||||
|
"revision" : "dba183c96b2da4e4b80bb31b1e2e59cb9542b8fc",
|
||||||
|
"version" : "2.13.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"identity" : "zipfoundation",
|
||||||
|
"kind" : "remoteSourceControl",
|
||||||
|
"location" : "https://github.com/weichsel/ZIPFoundation.git",
|
||||||
|
"state" : {
|
||||||
|
"revision" : "22787ffb59de99e5dc1fbfe80b19c97a904ad48d",
|
||||||
|
"version" : "0.9.20"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"version" : 3
|
||||||
|
}
|
||||||
37
VorleserKit/Package.swift
Normal file
37
VorleserKit/Package.swift
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
// swift-tools-version: 6.2
|
||||||
|
import PackageDescription
|
||||||
|
|
||||||
|
// Package manifest: declares the two libraries (VorleserKit, BookParser), their
// external dependencies, and one test target per library.
let package = Package(
    name: "VorleserKit",
    platforms: [.iOS(.v18), .macOS(.v15)],
    products: [
        .library(name: "VorleserKit", targets: ["VorleserKit"]),
        .library(name: "BookParser", targets: ["BookParser"]),
    ],
    dependencies: [
        .package(url: "https://github.com/weichsel/ZIPFoundation.git", from: "0.9.0"),
        .package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.7.0"),
    ],
    targets: [
        // Shared types and the sentence segmenter; no dependencies of its own.
        .target(name: "VorleserKit"),
        // Book model; depends on the shared types plus the two external packages.
        .target(
            name: "BookParser",
            dependencies: ["VorleserKit", "ZIPFoundation", "SwiftSoup"]
        ),
        // The Fixtures directory is copied verbatim into the test bundle.
        .testTarget(
            name: "BookParserTests",
            dependencies: ["BookParser"],
            resources: [.copy("Fixtures")]
        ),
        .testTarget(name: "VorleserKitTests", dependencies: ["VorleserKit"]),
    ]
)
|
||||||
52
VorleserKit/Sources/BookParser/Book.swift
Normal file
52
VorleserKit/Sources/BookParser/Book.swift
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import Foundation
|
||||||
|
import VorleserKit
|
||||||
|
|
||||||
|
/// A book: identifying metadata plus an ordered list of chapters.
///
/// Positions inside the book are expressed as global character offsets, counted in
/// `Character`s from the start of the first chapter's text. Chapter texts are treated
/// as laid end to end, with nothing in between.
public struct Book: Sendable {
    public let id: UUID
    public let title: String
    public let author: String?
    public let chapters: [Chapter]

    /// Creates a book.
    /// - Parameters:
    ///   - id: Identifier of the book; a fresh `UUID` is generated when omitted.
    ///   - title: Title of the book.
    ///   - author: Author of the book, if any.
    ///   - chapters: Chapters in reading order.
    public init(id: UUID = UUID(), title: String, author: String?, chapters: [Chapter]) {
        self.id = id
        self.title = title
        self.author = author
        self.chapters = chapters
    }

    /// All sentences across all chapters, with global character offsets.
    ///
    /// The chapters are re-segmented on every access, so keep the result in a local
    /// when it is needed more than once.
    public var sentences: [Sentence] {
        var result: [Sentence] = []
        var offset: CharacterOffset = 0
        for chapter in chapters {
            result += SentenceSegmenter.segment(chapter.text, globalOffset: offset)
            // The next chapter starts where this one's text ends.
            offset += chapter.text.count
        }
        return result
    }

    /// Returns the sentence index containing the given global character offset.
    /// - Parameter offset: Global character offset to look up.
    /// - Returns: The position in `sentences` of the first sentence whose range contains
    ///   `offset`, or `nil` when no sentence range contains it.
    public func sentenceIndex(containing offset: CharacterOffset) -> Int? {
        sentences.firstIndex { $0.range.contains(offset) }
    }

    /// Maps a global character offset to (chapter index, local offset within chapter).
    /// - Parameter globalOffset: Global character offset to resolve.
    /// - Returns: The `index` property of the chapter whose text contains the offset,
    ///   together with the offset relative to the start of that chapter's text; `nil`
    ///   when `globalOffset` is negative or at/past the end of the last chapter.
    public func chapterAndLocalOffset(for globalOffset: CharacterOffset) -> (chapterIndex: Int, localOffset: Int)? {
        // A negative offset lies before the first chapter; without this guard it would
        // match the first non-empty chapter and yield a negative local offset.
        guard globalOffset >= 0 else { return nil }

        var chapterStart = 0
        for chapter in chapters {
            let chapterEnd = chapterStart + chapter.text.count
            if globalOffset < chapterEnd {
                return (chapter.index, globalOffset - chapterStart)
            }
            chapterStart = chapterEnd
        }
        return nil
    }

    /// Total character count across all chapters.
    public var totalCharacters: Int {
        chapters.reduce(0) { $0 + $1.text.count }
    }
}
|
||||||
11
VorleserKit/Sources/BookParser/Chapter.swift
Normal file
11
VorleserKit/Sources/BookParser/Chapter.swift
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
/// One chapter of a book: an index, a title, and the chapter's text.
public struct Chapter: Sendable {
    /// Index of the chapter, as supplied by whoever created the value.
    public let index: Int
    /// Title of the chapter.
    public let title: String
    /// Text of the chapter.
    public let text: String

    /// Creates a chapter from its three stored values.
    /// - Parameters:
    ///   - index: Index of the chapter.
    ///   - title: Title of the chapter.
    ///   - text: Text of the chapter.
    public init(
        index: Int,
        title: String,
        text: String
    ) {
        self.text = text
        self.title = title
        self.index = index
    }
}
|
||||||
9
VorleserKit/Sources/VorleserKit/Sentence.swift
Normal file
9
VorleserKit/Sources/VorleserKit/Sentence.swift
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
/// A sentence together with the character range it was taken from.
public struct Sentence: Sendable {
    /// Text of the sentence.
    public let text: String
    /// Half-open range of character offsets associated with the sentence.
    public let range: Range<CharacterOffset>

    /// Creates a sentence.
    /// - Parameters:
    ///   - text: Text of the sentence.
    ///   - range: Half-open range of character offsets for the sentence.
    public init(
        text: String,
        range: Range<CharacterOffset>
    ) {
        self.range = range
        self.text = text
    }
}
|
||||||
18
VorleserKit/Sources/VorleserKit/SentenceSegmenter.swift
Normal file
18
VorleserKit/Sources/VorleserKit/SentenceSegmenter.swift
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import NaturalLanguage
|
||||||
|
|
||||||
|
/// Splits text into sentences using `NLTokenizer`.
public struct SentenceSegmenter: Sendable {
    /// Segments `text` into sentences.
    ///
    /// Each returned sentence carries its text with leading and trailing whitespace and
    /// newlines removed. Its `range` spans the token exactly as reported by the
    /// tokenizer, so it may cover whitespace that was trimmed from `text`. Tokens that
    /// are empty after trimming are skipped.
    ///
    /// - Parameters:
    ///   - text: The text to segment.
    ///   - globalOffset: Value added to both bounds of every range, so that offsets can
    ///     be expressed relative to a larger text. Defaults to `0`.
    /// - Returns: The sentences in the order the tokenizer enumerates them.
    public static func segment(_ text: String, globalOffset: CharacterOffset = 0) -> [Sentence] {
        let tokenizer = NLTokenizer(unit: .sentence)
        tokenizer.string = text

        var sentences: [Sentence] = []

        // Running cursor: the last index measured and its character offset. Measuring
        // each token from the cursor rather than from `startIndex` avoids re-walking the
        // whole prefix of the string for every sentence.
        var cursorIndex = text.startIndex
        var cursorOffset = globalOffset

        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
            let start = cursorOffset + text.distance(from: cursorIndex, to: range.lowerBound)
            let end = start + text.distance(from: range.lowerBound, to: range.upperBound)
            cursorIndex = range.upperBound
            cursorOffset = end

            let sentenceText = String(text[range]).trimmingCharacters(in: .whitespacesAndNewlines)
            guard !sentenceText.isEmpty else { return true }

            sentences.append(Sentence(text: sentenceText, range: start..<end))
            return true
        }
        return sentences
    }
}
|
||||||
2
VorleserKit/Sources/VorleserKit/VorleserKit.swift
Normal file
2
VorleserKit/Sources/VorleserKit/VorleserKit.swift
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
/// A position in a book, measured in Swift `Character`s (not UTF-8 or UTF-16 code units)
/// from the start of the first chapter.
|
||||||
|
public typealias CharacterOffset = Int
|
||||||
0
VorleserKit/Tests/BookParserTests/Fixtures/.gitkeep
Normal file
0
VorleserKit/Tests/BookParserTests/Fixtures/.gitkeep
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
import Testing
|
||||||
|
@testable import VorleserKit
|
||||||
|
|
||||||
|
/// Tests for `SentenceSegmenter.segment(_:globalOffset:)`.
///
/// Expectations compare whole arrays instead of subscripting the result, so an
/// unexpected sentence count is reported as a failure rather than trapping on an
/// out-of-range index.
@Suite("SentenceSegmenter")
struct SentenceSegmenterTests {
    @Test func segmentsSimpleSentences() {
        let text = "Hello world. How are you? I am fine."
        let sentences = SentenceSegmenter.segment(text)
        #expect(sentences.map(\.text) == ["Hello world.", "How are you?", "I am fine."])
    }

    @Test func handlesAbbreviations() {
        let text = "Dr. Smith went to Washington. He arrived at 3 p.m."
        let sentences = SentenceSegmenter.segment(text)
        // NLTokenizer should handle "Dr." without splitting
        #expect(sentences.count == 2)
    }

    @Test func appliesGlobalOffset() {
        let text = "First sentence. Second sentence."
        let base = SentenceSegmenter.segment(text)
        let shifted = SentenceSegmenter.segment(text, globalOffset: 100)

        #expect(!shifted.isEmpty)
        // The offset must not change which sentences are found...
        #expect(shifted.map(\.text) == base.map(\.text))
        // ...and must move both bounds of every range by exactly the offset.
        let expectedRanges = base.map { ($0.range.lowerBound + 100)..<($0.range.upperBound + 100) }
        #expect(shifted.map(\.range) == expectedRanges)
    }

    @Test func handlesEmptyText() {
        let sentences = SentenceSegmenter.segment("")
        #expect(sentences.isEmpty)
    }

    @Test func handlesSingleSentence() {
        let text = "Just one sentence."
        let sentences = SentenceSegmenter.segment(text)
        #expect(sentences.map(\.text) == ["Just one sentence."])
    }
}
|
||||||
Reference in New Issue
Block a user