scaffold VorleserKit package with shared types, sentence segmenter, tests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-13 21:48:39 +01:00
parent 6c19f89287
commit 53b91ee4ed
10 changed files with 194 additions and 0 deletions

2
.gitignore vendored
View File

@@ -18,6 +18,8 @@ autoaudiobook/samples/
# Build outputs
build/
.build/
**/.build/
DerivedData/
*.dSYM/
*.log

View File

@@ -0,0 +1,24 @@
{
"originHash" : "1bf1d418d8d58ea936176af8e96313605ea72a6fbf437f877b8e5d9a5b0d822c",
"pins" : [
{
"identity" : "swiftsoup",
"kind" : "remoteSourceControl",
"location" : "https://github.com/scinfu/SwiftSoup.git",
"state" : {
"revision" : "dba183c96b2da4e4b80bb31b1e2e59cb9542b8fc",
"version" : "2.13.0"
}
},
{
"identity" : "zipfoundation",
"kind" : "remoteSourceControl",
"location" : "https://github.com/weichsel/ZIPFoundation.git",
"state" : {
"revision" : "22787ffb59de99e5dc1fbfe80b19c97a904ad48d",
"version" : "0.9.20"
}
}
],
"version" : 3
}

37
VorleserKit/Package.swift Normal file
View File

@@ -0,0 +1,37 @@
// swift-tools-version: 6.2
import PackageDescription
// Package manifest for VorleserKit: a core library of shared types plus a
// BookParser library that pulls in ZIP and HTML-parsing dependencies.
let package = Package(
    name: "VorleserKit",
    // Minimum deployment targets for consumers of this package.
    platforms: [
        .iOS(.v18),
        .macOS(.v15),
    ],
    products: [
        // Core shared types (Book, Chapter, Sentence, SentenceSegmenter).
        .library(name: "VorleserKit", targets: ["VorleserKit"]),
        // EPUB/archive parsing layer; depends on VorleserKit.
        .library(name: "BookParser", targets: ["BookParser"]),
    ],
    dependencies: [
        // Archive extraction (e.g. EPUB containers).
        .package(url: "https://github.com/weichsel/ZIPFoundation.git", from: "0.9.0"),
        // HTML/XHTML parsing.
        .package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.7.0"),
    ],
    targets: [
        // Dependency-free core so it stays cheap to import everywhere.
        .target(
            name: "VorleserKit",
            dependencies: []
        ),
        .target(
            name: "BookParser",
            dependencies: ["VorleserKit", "ZIPFoundation", "SwiftSoup"]
        ),
        .testTarget(
            name: "BookParserTests",
            dependencies: ["BookParser"],
            // Fixture files (sample books) copied verbatim into the test bundle.
            resources: [.copy("Fixtures")]
        ),
        .testTarget(
            name: "VorleserKitTests",
            dependencies: ["VorleserKit"]
        ),
    ]
)

View File

@@ -0,0 +1,52 @@
import Foundation
import VorleserKit
/// An in-memory book: identity, metadata, and an ordered list of chapters,
/// plus helpers that treat all chapter texts as one contiguous character
/// stream (chapters laid back-to-back, no separator characters).
public struct Book: Sendable {
    public let id: UUID
    public let title: String
    public let author: String?
    public let chapters: [Chapter]
    public init(id: UUID = UUID(), title: String, author: String?, chapters: [Chapter]) {
        self.id = id
        self.title = title
        self.author = author
        self.chapters = chapters
    }
    /// All sentences across all chapters, with global character offsets.
    ///
    /// - Note: This re-segments every chapter on each access; cache the
    ///   result if you need it more than once.
    public var sentences: [Sentence] {
        var result: [Sentence] = []
        var offset: CharacterOffset = 0
        for chapter in chapters {
            let chapterSentences = SentenceSegmenter.segment(chapter.text, globalOffset: offset)
            result.append(contentsOf: chapterSentences)
            offset += chapter.text.count
        }
        return result
    }
    /// Returns the index (into `sentences`) of the sentence whose range
    /// contains the given global character offset, or `nil` if no sentence
    /// covers it (e.g. the offset is negative or past the end of the book).
    public func sentenceIndex(containing offset: CharacterOffset) -> Int? {
        let allSentences = sentences
        return allSentences.firstIndex { $0.range.contains(offset) }
    }
    /// Maps a global character offset to (chapter index, local offset within chapter).
    ///
    /// - Returns: `nil` for negative offsets and for offsets at or beyond
    ///   `totalCharacters`. The returned `chapterIndex` is the chapter's own
    ///   `index` property — presumably equal to its position in `chapters`;
    ///   TODO confirm against the parser that builds chapters.
    public func chapterAndLocalOffset(for globalOffset: CharacterOffset) -> (chapterIndex: Int, localOffset: Int)? {
        // Fix: previously a negative offset fell through the first `<` test and
        // produced a nonsensical (firstChapter, negative localOffset) result.
        guard globalOffset >= 0 else { return nil }
        var offset = 0
        for chapter in chapters {
            let chapterEnd = offset + chapter.text.count
            if globalOffset < chapterEnd {
                return (chapter.index, globalOffset - offset)
            }
            offset = chapterEnd
        }
        return nil
    }
    /// Total character count across all chapters.
    public var totalCharacters: Int {
        chapters.reduce(0) { $0 + $1.text.count }
    }
}

View File

@@ -0,0 +1,11 @@
/// One chapter of a book: its position, display title, and plain-text body.
public struct Chapter: Sendable {
    // Position of this chapter — presumably its zero-based position within
    // `Book.chapters`; TODO confirm with the code that constructs chapters.
    public let index: Int
    public let title: String
    // Plain text of the chapter; `Book` measures offsets in `text.count` characters.
    public let text: String
    public init(index: Int, title: String, text: String) {
        self.index = index
        self.title = title
        self.text = text
    }
}

View File

@@ -0,0 +1,9 @@
/// A single sentence extracted from a book's text.
public struct Sentence: Sendable {
    // Sentence text with surrounding whitespace trimmed (see SentenceSegmenter).
    public let text: String
    // Global character range of the *untrimmed* token in the book's combined
    // text, so `range` may be wider than `text.count`.
    public let range: Range<CharacterOffset>
    public init(text: String, range: Range<CharacterOffset>) {
        self.text = text
        self.range = range
    }
}

View File

@@ -0,0 +1,18 @@
import NaturalLanguage
/// Splits text into sentences using NaturalLanguage's sentence tokenizer.
public struct SentenceSegmenter: Sendable {
    /// Segments `text` into sentences, shifting every range by `globalOffset`.
    ///
    /// Each `Sentence.range` is measured in `Character` counts over the
    /// *untrimmed* tokenizer range, so it can include whitespace that is
    /// stripped from `Sentence.text`. Whitespace-only tokens are dropped.
    public static func segment(_ text: String, globalOffset: CharacterOffset = 0) -> [Sentence] {
        let tokenizer = NLTokenizer(unit: .sentence)
        tokenizer.string = text
        var collected: [Sentence] = []
        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { tokenRange, _ in
            let trimmed = String(text[tokenRange]).trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty {
                let lower = globalOffset + text.distance(from: text.startIndex, to: tokenRange.lowerBound)
                let upper = globalOffset + text.distance(from: text.startIndex, to: tokenRange.upperBound)
                collected.append(Sentence(text: trimmed, range: lower..<upper))
            }
            return true  // keep enumerating all tokens
        }
        return collected
    }
}

View File

@@ -0,0 +1,2 @@
/// A position in a book, measured in `Character` counts from the start of the
/// first chapter's text, with chapters laid out back-to-back (no separators).
public typealias CharacterOffset = Int

View File

@@ -0,0 +1,39 @@
import Testing
@testable import VorleserKit
@Suite("SentenceSegmenter")
struct SentenceSegmenterTests {
    @Test func segmentsSimpleSentences() {
        let text = "Hello world. How are you? I am fine."
        let sentences = SentenceSegmenter.segment(text)
        #expect(sentences.count == 3)
        #expect(sentences[0].text == "Hello world.")
        #expect(sentences[1].text == "How are you?")
        #expect(sentences[2].text == "I am fine.")
        // The first token starts at the very beginning of the text.
        #expect(sentences[0].range.lowerBound == 0)
    }
    @Test func handlesAbbreviations() {
        let text = "Dr. Smith went to Washington. He arrived at 3 p.m."
        let sentences = SentenceSegmenter.segment(text)
        // NLTokenizer should handle "Dr." without splitting
        #expect(sentences.count == 2)
    }
    @Test func appliesGlobalOffset() {
        let text = "First sentence. Second sentence."
        let sentences = SentenceSegmenter.segment(text, globalOffset: 100)
        #expect(sentences.count == 2)
        // Strengthened from `>= 100`: the first token starts at character 0,
        // so its lower bound must be exactly the supplied offset — a `>=`
        // check would pass for almost any broken offset arithmetic.
        #expect(sentences[0].range.lowerBound == 100)
        // Ranges must be ordered within a single segmentation pass.
        #expect(sentences[1].range.lowerBound > sentences[0].range.lowerBound)
    }
    @Test func handlesEmptyText() {
        let sentences = SentenceSegmenter.segment("")
        #expect(sentences.isEmpty)
    }
    @Test func handlesSingleSentence() {
        let text = "Just one sentence."
        let sentences = SentenceSegmenter.segment(text)
        #expect(sentences.count == 1)
        #expect(sentences[0].text == "Just one sentence.")
    }
}