scaffold VorleserKit package with shared types, sentence segmenter, tests
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -18,6 +18,8 @@ autoaudiobook/samples/
|
||||
|
||||
# Build outputs
|
||||
build/
|
||||
.build/
|
||||
**/.build/
|
||||
DerivedData/
|
||||
*.dSYM/
|
||||
*.log
|
||||
|
||||
24
VorleserKit/Package.resolved
Normal file
24
VorleserKit/Package.resolved
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"originHash" : "1bf1d418d8d58ea936176af8e96313605ea72a6fbf437f877b8e5d9a5b0d822c",
|
||||
"pins" : [
|
||||
{
|
||||
"identity" : "swiftsoup",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/scinfu/SwiftSoup.git",
|
||||
"state" : {
|
||||
"revision" : "dba183c96b2da4e4b80bb31b1e2e59cb9542b8fc",
|
||||
"version" : "2.13.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "zipfoundation",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/weichsel/ZIPFoundation.git",
|
||||
"state" : {
|
||||
"revision" : "22787ffb59de99e5dc1fbfe80b19c97a904ad48d",
|
||||
"version" : "0.9.20"
|
||||
}
|
||||
}
|
||||
],
|
||||
"version" : 3
|
||||
}
|
||||
37
VorleserKit/Package.swift
Normal file
37
VorleserKit/Package.swift
Normal file
@@ -0,0 +1,37 @@
|
||||
// swift-tools-version: 6.2
|
||||
import PackageDescription
|
||||
|
||||
// Package manifest for VorleserKit: shared read-aloud types plus a book parser.
let package = Package(
    name: "VorleserKit",
    platforms: [
        .iOS(.v18),
        .macOS(.v15),
    ],
    products: [
        // Core shared types: Sentence, CharacterOffset, SentenceSegmenter.
        .library(name: "VorleserKit", targets: ["VorleserKit"]),
        // Book parsing layer, built on top of VorleserKit.
        .library(name: "BookParser", targets: ["BookParser"]),
    ],
    dependencies: [
        // ZIP archive access (EPUBs are ZIP containers).
        .package(url: "https://github.com/weichsel/ZIPFoundation.git", from: "0.9.0"),
        // HTML parsing.
        .package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.7.0"),
    ],
    targets: [
        // Dependency-free core so it can be imported anywhere.
        .target(
            name: "VorleserKit",
            dependencies: []
        ),
        .target(
            name: "BookParser",
            dependencies: ["VorleserKit", "ZIPFoundation", "SwiftSoup"]
        ),
        .testTarget(
            name: "BookParserTests",
            dependencies: ["BookParser"],
            // Sample book files used by the parser tests.
            resources: [.copy("Fixtures")]
        ),
        .testTarget(
            name: "VorleserKitTests",
            dependencies: ["VorleserKit"]
        ),
    ]
)
|
||||
52
VorleserKit/Sources/BookParser/Book.swift
Normal file
52
VorleserKit/Sources/BookParser/Book.swift
Normal file
@@ -0,0 +1,52 @@
|
||||
import Foundation
|
||||
import VorleserKit
|
||||
|
||||
/// An in-memory book: metadata plus ordered chapters of plain text.
public struct Book: Sendable {
    public let id: UUID
    public let title: String
    /// Author display name, if known.
    public let author: String?
    public let chapters: [Chapter]

    public init(id: UUID = UUID(), title: String, author: String?, chapters: [Chapter]) {
        self.id = id
        self.title = title
        self.author = author
        self.chapters = chapters
    }

    /// All sentences across all chapters, with global character offsets.
    ///
    /// Recomputed from scratch on every access (segments every chapter), so
    /// cache the result when performing repeated lookups.
    public var sentences: [Sentence] {
        var result: [Sentence] = []
        var offset: CharacterOffset = 0
        for chapter in chapters {
            let chapterSentences = SentenceSegmenter.segment(chapter.text, globalOffset: offset)
            result.append(contentsOf: chapterSentences)
            offset += chapter.text.count
        }
        return result
    }

    /// Returns the index (into `sentences`) of the sentence containing the given
    /// global character offset, or `nil` when no sentence covers it — e.g. the
    /// offset falls in inter-sentence whitespace or beyond the end of the book.
    ///
    /// Note: re-segments the whole book on each call; see `sentences`.
    public func sentenceIndex(containing offset: CharacterOffset) -> Int? {
        sentences.firstIndex { $0.range.contains(offset) }
    }

    /// Maps a global character offset to (chapter index, local offset within chapter).
    ///
    /// The returned `chapterIndex` is the chapter's stored `Chapter.index`, not its
    /// position in the `chapters` array; the two agree only when chapters are stored
    /// in index order. Returns `nil` for a negative offset or one at or past
    /// `totalCharacters`.
    public func chapterAndLocalOffset(for globalOffset: CharacterOffset) -> (chapterIndex: Int, localOffset: Int)? {
        // A negative offset would otherwise map to a nonsensical negative
        // local offset in the first chapter.
        guard globalOffset >= 0 else { return nil }
        var offset = 0
        for chapter in chapters {
            let chapterEnd = offset + chapter.text.count
            if globalOffset < chapterEnd {
                return (chapter.index, globalOffset - offset)
            }
            offset = chapterEnd
        }
        return nil
    }

    /// Total character count across all chapters.
    public var totalCharacters: Int {
        chapters.reduce(0) { $0 + $1.text.count }
    }
}
|
||||
11
VorleserKit/Sources/BookParser/Chapter.swift
Normal file
11
VorleserKit/Sources/BookParser/Chapter.swift
Normal file
@@ -0,0 +1,11 @@
|
||||
/// One chapter of a book: its position, display title, and plain-text content.
public struct Chapter: Sendable {
    /// Position of this chapter within its book.
    public let index: Int
    /// Human-readable chapter title.
    public let title: String
    /// Full plain-text content of the chapter.
    public let text: String

    public init(index: Int, title: String, text: String) {
        self.text = text
        self.title = title
        self.index = index
    }
}
|
||||
9
VorleserKit/Sources/VorleserKit/Sentence.swift
Normal file
9
VorleserKit/Sources/VorleserKit/Sentence.swift
Normal file
@@ -0,0 +1,9 @@
|
||||
/// A single sentence together with the character range it occupies.
public struct Sentence: Sendable {
    /// The sentence text.
    public let text: String
    /// Half-open character range of the sentence within the source text.
    public let range: Range<CharacterOffset>

    public init(text: String, range: Range<CharacterOffset>) {
        self.range = range
        self.text = text
    }
}
|
||||
18
VorleserKit/Sources/VorleserKit/SentenceSegmenter.swift
Normal file
18
VorleserKit/Sources/VorleserKit/SentenceSegmenter.swift
Normal file
@@ -0,0 +1,18 @@
|
||||
import NaturalLanguage
|
||||
|
||||
/// Splits text into sentences using `NLTokenizer`.
public struct SentenceSegmenter: Sendable {
    /// Segments `text` into sentences, shifting each sentence's character range
    /// by `globalOffset`.
    ///
    /// Sentence text is trimmed of surrounding whitespace, but the reported range
    /// covers the untrimmed token span; tokens that are whitespace-only after
    /// trimming are dropped. Offsets are measured in `Character`s.
    public static func segment(_ text: String, globalOffset: CharacterOffset = 0) -> [Sentence] {
        let tokenizer = NLTokenizer(unit: .sentence)
        tokenizer.string = text
        var sentences: [Sentence] = []
        // Advance a cursor through the string so each token's offset is measured
        // from the previous token's end. Measuring every range from `startIndex`
        // (as String.distance must walk the string) made segmentation O(n^2).
        var cursorIndex = text.startIndex
        var cursorOffset = 0
        tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
            let start = cursorOffset + text.distance(from: cursorIndex, to: range.lowerBound)
            let end = start + text.distance(from: range.lowerBound, to: range.upperBound)
            cursorIndex = range.upperBound
            cursorOffset = end
            let sentenceText = String(text[range]).trimmingCharacters(in: .whitespacesAndNewlines)
            if !sentenceText.isEmpty {
                sentences.append(Sentence(text: sentenceText, range: (globalOffset + start)..<(globalOffset + end)))
            }
            return true
        }
        return sentences
    }
}
|
||||
2
VorleserKit/Sources/VorleserKit/VorleserKit.swift
Normal file
2
VorleserKit/Sources/VorleserKit/VorleserKit.swift
Normal file
@@ -0,0 +1,2 @@
|
||||
/// A position in a book, measured in `Character`s (Swift grapheme clusters)
/// from the start of the first chapter.
public typealias CharacterOffset = Int
|
||||
0
VorleserKit/Tests/BookParserTests/Fixtures/.gitkeep
Normal file
39
VorleserKit/Tests/VorleserKitTests/SentenceSegmenterTests.swift
Normal file
@@ -0,0 +1,39 @@
|
||||
import Testing
|
||||
@testable import VorleserKit
|
||||
|
||||
/// Unit tests for `SentenceSegmenter`.
@Suite("SentenceSegmenter")
struct SentenceSegmenterTests {
    @Test func segmentsSimpleSentences() {
        let text = "Hello world. How are you? I am fine."
        let sentences = SentenceSegmenter.segment(text)
        #expect(sentences.count == 3)
        #expect(sentences[0].text == "Hello world.")
        #expect(sentences[1].text == "How are you?")
        #expect(sentences[2].text == "I am fine.")
    }

    @Test func handlesAbbreviations() {
        let text = "Dr. Smith went to Washington. He arrived at 3 p.m."
        let sentences = SentenceSegmenter.segment(text)
        // NLTokenizer should handle "Dr." without splitting
        #expect(sentences.count == 2)
    }

    @Test func appliesGlobalOffset() {
        let text = "First sentence. Second sentence."
        let sentences = SentenceSegmenter.segment(text, globalOffset: 100)
        // The first sentence starts at local offset 0, so its global lower
        // bound must be exactly the supplied offset — not merely >= it.
        #expect(sentences[0].range.lowerBound == 100)
        // All ranges must lie within the shifted span of the text.
        for sentence in sentences {
            #expect(sentence.range.lowerBound >= 100)
            #expect(sentence.range.upperBound <= 100 + text.count)
        }
    }

    @Test func handlesEmptyText() {
        #expect(SentenceSegmenter.segment("").isEmpty)
    }

    @Test func handlesWhitespaceOnlyText() {
        // Tokens that trim to nothing are dropped, so no sentences result.
        #expect(SentenceSegmenter.segment("   \n\t  ").isEmpty)
    }

    @Test func handlesSingleSentence() {
        let text = "Just one sentence."
        let sentences = SentenceSegmenter.segment(text)
        #expect(sentences.count == 1)
        #expect(sentences[0].text == "Just one sentence.")
    }
}
|
||||
Reference in New Issue
Block a user