//
//  CharacterReader.swift
//  SwiftSoup
//
//  Created by Nabil Chatbi on 10/10/16.
//  Copyright © 2016 Nabil Chatbi.. All rights reserved.
//

import Foundation

/// CharacterReader consumes tokens off a string. To replace the old TokenQueue.
///
/// The input is held as an array of Unicode scalars; every offset and length in this
/// API counts scalars. The read position may move past the end of the input
/// (`consume()` and `advance()` always increment it); readers treat that state as
/// "empty" instead of trapping.
///
/// NOTE(review): the copy of this file that was reviewed had been damaged by a
/// tag-stripper (every `<…>` span was removed). The bodies of `nextIndexOf(_:)`
/// (both overloads), the loops of `matches(_: String)` and `matchesIgnoreCase(_:)`,
/// the signatures of `consumeTo(_: UnicodeScalar)` and `matchesAny`, and the slice
/// expressions in `toString()` and `cacheString` were reconstructed from the
/// surviving fragments and call sites — confirm against version control.
public final class CharacterReader {
    private static let empty = ""
    public static let EOF: UnicodeScalar = "\u{FFFF}" // 65535
    /// Single locale used for both case conversions in `containsIgnoreCase(_:)`.
    private static let caseLocale = Locale(identifier: "en")

    private let input: [UnicodeScalar]
    private let length: Int
    private var pos: Int = 0
    private var mark: Int = 0

    /// Creates a reader positioned at the first scalar of `input`.
    public init(_ input: String) {
        self.input = Array(input.unicodeScalars)
        self.length = self.input.count
    }

    // MARK: - Position

    /// The current read offset, in scalars from the start of the input.
    public func getPos() -> Int {
        return pos
    }

    /// `true` once the read position is at or beyond the end of the input.
    public func isEmpty() -> Bool {
        return pos >= length
    }

    /// The scalar at the read position, or `EOF` when the reader is empty. Does not advance.
    public func current() -> UnicodeScalar {
        return isEmpty() ? CharacterReader.EOF : input[pos]
    }

    /// Returns the scalar at the read position (or `EOF` when empty) and advances by one.
    /// The position is incremented even when `EOF` is returned.
    @discardableResult
    public func consume() -> UnicodeScalar {
        let scalar = current()
        pos += 1
        return scalar
    }

    /// Moves the read position back by one scalar. No lower bound is enforced.
    public func unconsume() {
        pos -= 1
    }

    /// Moves the read position forward by one scalar. No upper bound is enforced.
    public func advance() {
        pos += 1
    }

    /// Remembers the current read position for a later `rewindToMark()`.
    public func markPos() {
        mark = pos
    }

    /// Restores the read position saved by the last `markPos()`.
    public func rewindToMark() {
        pos = mark
    }

    /// Consumes exactly one scalar and returns it as a string.
    /// - Precondition: the reader is not empty; unlike `consume()` there is no `EOF` fallback.
    public func consumeAsString() -> String {
        let index = pos
        pos += 1
        return String(input[index])
    }

    // MARK: - Searching

    /// Returns the number of scalars between the current position and the next instance of `c`.
    /// - Parameter c: scan target.
    /// - Returns: offset between the current position and the next instance of the target;
    ///   -1 if not found.
    public func nextIndexOf(_ c: UnicodeScalar) -> Int {
        // `pos..<length` traps when pos has been advanced past the end, so check first.
        guard pos < length else { return -1 }
        for index in pos..<length where input[index] == c {
            return index - pos
        }
        return -1
    }

    /// Returns the number of scalars between the current position and the next instance of `seq`.
    /// - Parameter seq: scan target.
    /// - Returns: offset between the current position and the next instance of the target;
    ///   -1 if not found or if `seq` is empty.
    public func nextIndexOf(_ seq: String) -> Int {
        // Materialise the target once so every comparison is an O(1) array read.
        let target = Array(seq.unicodeScalars)
        guard let first = target.first else { return -1 }
        // Last index at which the whole target still fits inside the input.
        let lastStart = length - target.count
        guard pos <= lastStart else { return -1 }
        for start in pos...lastStart where input[start] == first {
            var matched = 1
            while matched < target.count && input[start + matched] == target[matched] {
                matched += 1
            }
            if matched == target.count {
                return start - pos
            }
        }
        return -1
    }

    // MARK: - Consuming

    /// Consumes up to, but not including, the next instance of `c`.
    /// Consumes to the end of the input when `c` does not occur.
    public func consumeTo(_ c: UnicodeScalar) -> String {
        let offset = nextIndexOf(c)
        guard offset != -1 else { return consumeToEnd() }
        let consumed = cacheString(pos, offset)
        pos += offset
        return consumed
    }

    /// Consumes up to, but not including, the next instance of `seq`.
    /// Consumes to the end of the input when `seq` does not occur (or is empty).
    public func consumeTo(_ seq: String) -> String {
        let offset = nextIndexOf(seq)
        guard offset != -1 else { return consumeToEnd() }
        let consumed = cacheString(pos, offset)
        pos += offset
        return consumed
    }

    /// Variadic convenience for `consumeToAny(_: [UnicodeScalar])`.
    public func consumeToAny(_ chars: UnicodeScalar...) -> String {
        return consumeToAny(chars)
    }

    /// Consumes scalars until one of `chars` (or the end of the input) is reached.
    /// The terminating scalar is not consumed.
    public func consumeToAny(_ chars: [UnicodeScalar]) -> String {
        return consumeRun { !chars.contains($0) }
    }

    /// Variadic convenience for `consumeToAnySorted(_: [UnicodeScalar])`.
    public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String {
        return consumeToAnySorted(chars)
    }

    /// Same contract as `consumeToAny(_:)`. Membership is tested with a linear `contains`,
    /// so the ordering of `chars` is not relied upon.
    public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String {
        return consumeRun { !chars.contains($0) }
    }

    /// Consumes scalars until `&`, `<`, the null scalar, or the end of the input.
    public func consumeData() -> String {
        return consumeRun { scalar in
            scalar != UnicodeScalar.Ampersand
                && scalar != UnicodeScalar.LessThan
                && scalar != TokeniserStateVars.nullScalr
        }
    }

    /// Consumes scalars until '\t', '\n', '\r', '\f', ' ', '/', '>', the null scalar,
    /// or the end of the input.
    public func consumeTagName() -> String {
        return consumeRun { scalar in
            scalar != UnicodeScalar.BackslashT
                && scalar != UnicodeScalar.BackslashN
                && scalar != UnicodeScalar.BackslashR
                && scalar != UnicodeScalar.BackslashF
                && scalar != UnicodeScalar.Space
                && scalar != UnicodeScalar.Slash
                && scalar != UnicodeScalar.GreaterThan
                && scalar != TokeniserStateVars.nullScalr
        }
    }

    /// Consumes and returns everything from the current position to the end of the input.
    public func consumeToEnd() -> String {
        let data = cacheString(pos, length - pos)
        // Never move backwards: pos may already sit past the end after an EOF `consume()`,
        // and callers may still pair that with `unconsume()`.
        pos = max(pos, length)
        return data
    }

    /// Consumes the longest run of letters at the current position (possibly empty).
    public func consumeLetterSequence() -> String {
        return consumeRun(while: isLetter)
    }

    /// Consumes a run of letters followed by a run of ASCII digits; either run may be empty.
    public func consumeLetterThenDigitSequence() -> String {
        let start = pos
        skipScalars(while: isLetter)
        skipScalars(while: isDigit)
        return cacheString(start, pos - start)
    }

    /// Consumes the longest run of ASCII hexadecimal digits (0-9, A-F, a-f).
    public func consumeHexSequence() -> String {
        return consumeRun { scalar in
            (scalar >= "0" && scalar <= "9")
                || (scalar >= "A" && scalar <= "F")
                || (scalar >= "a" && scalar <= "f")
        }
    }

    /// Consumes the longest run of ASCII digits (0-9).
    public func consumeDigitSequence() -> String {
        return consumeRun(while: isDigit)
    }

    // MARK: - Matching

    /// `true` when the scalar at the current position is `c`.
    public func matches(_ c: UnicodeScalar) -> Bool {
        return !isEmpty() && input[pos] == c
    }

    /// `true` when the input at the current position starts with `seq` (exact scalar match).
    public func matches(_ seq: String) -> Bool {
        let target = Array(seq.unicodeScalars)
        guard target.count <= length - pos else { return false }
        for (offset, scalar) in target.enumerated() where input[pos + offset] != scalar {
            return false
        }
        return true
    }

    /// `true` when the input at the current position starts with `seq`, ignoring case.
    /// An empty `seq` never matches.
    ///
    /// NOTE(review): the comparison rule was lost in the damaged source; scalars are
    /// compared by their standard-library uppercase form here — confirm against upstream.
    public func matchesIgnoreCase(_ seq: String ) -> Bool {
        let target = Array(seq.unicodeScalars)
        guard !target.isEmpty, target.count <= length - pos else { return false }
        for (offset, scalar) in target.enumerated() {
            let candidate = input[pos + offset]
            // Identical scalars need no case folding; skip the String allocations.
            if candidate == scalar { continue }
            if String(candidate).uppercased() != String(scalar).uppercased() {
                return false
            }
        }
        return true
    }

    /// Variadic convenience for `matchesAny(_: [UnicodeScalar])`.
    public func matchesAny(_ seq: UnicodeScalar...) -> Bool {
        return matchesAny(seq)
    }

    /// `true` when the scalar at the current position is one of `seq`.
    public func matchesAny(_ seq: [UnicodeScalar]) -> Bool {
        guard !isEmpty() else { return false }
        return seq.contains(input[pos])
    }

    /// Same contract as `matchesAny(_:)`; the ordering of `seq` is not relied upon.
    public func matchesAnySorted(_ seq: [UnicodeScalar]) -> Bool {
        return !isEmpty() && seq.contains(input[pos])
    }

    /// `true` when the scalar at the current position is a letter.
    public func matchesLetter() -> Bool {
        return !isEmpty() && isLetter(input[pos])
    }

    /// `true` when the scalar at the current position is an ASCII digit.
    public func matchesDigit() -> Bool {
        return !isEmpty() && isDigit(input[pos])
    }

    /// Consumes `seq` when the input starts with it at the current position.
    /// - Returns: `true` when `seq` was matched and consumed.
    @discardableResult
    public func matchConsume(_ seq: String) -> Bool {
        guard matches(seq) else { return false }
        pos += seq.unicodeScalars.count
        return true
    }

    /// Case-insensitive variant of `matchConsume(_:)`.
    /// - Returns: `true` when `seq` was matched and consumed.
    @discardableResult
    public func matchConsumeIgnoreCase(_ seq: String) -> Bool {
        guard matchesIgnoreCase(seq) else { return false }
        pos += seq.unicodeScalars.count
        return true
    }

    /// Reports whether `seq` occurs at or after the current position in consistent case:
    /// only the all-lowercase and the all-uppercase forms of `seq` are searched for.
    public func containsIgnoreCase(_ seq: String ) -> Bool {
        let loScan = seq.lowercased(with: CharacterReader.caseLocale)
        let hiScan = seq.uppercased(with: CharacterReader.caseLocale)
        return nextIndexOf(loScan) > -1 || nextIndexOf(hiScan) > -1
    }

    /// The unconsumed remainder of the input; empty when nothing is left. Does not advance.
    public func toString() -> String {
        guard pos < length else { return CharacterReader.empty }
        return String(String.UnicodeScalarView(input[pos..<length]))
    }

    // MARK: - Private helpers

    /// Builds a string from `count` scalars starting at `start`.
    /// Returns the empty string when `count` is zero or negative, so callers whose
    /// position has run past the end of the input do not trap.
    /// No string cache is kept; a fresh string is created on every call.
    private func cacheString(_ start: Int, _ count: Int) -> String {
        guard count > 0 else { return CharacterReader.empty }
        return String(String.UnicodeScalarView(input[start..<start + count]))
    }

    /// Advances the read position while in bounds and `accepts` holds for the current scalar.
    private func skipScalars(while accepts: (UnicodeScalar) -> Bool) {
        while pos < length && accepts(input[pos]) {
            pos += 1
        }
    }

    /// Consumes and returns the longest run of scalars, from the current position, satisfying `accepts`.
    private func consumeRun(while accepts: (UnicodeScalar) -> Bool) -> String {
        let start = pos
        skipScalars(while: accepts)
        return cacheString(start, pos - start)
    }

    /// ASCII letters are checked first, then membership in `CharacterSet.letters`.
    private func isLetter(_ scalar: UnicodeScalar) -> Bool {
        return (scalar >= "A" && scalar <= "Z")
            || (scalar >= "a" && scalar <= "z")
            || scalar.isMemberOfCharacterSet(CharacterSet.letters)
    }

    /// ASCII digits 0-9 only.
    private func isDigit(_ scalar: UnicodeScalar) -> Bool {
        return scalar >= "0" && scalar <= "9"
    }
}

// MARK: - CustomDebugStringConvertible

extension CharacterReader: CustomDebugStringConvertible {
    /// The unconsumed remainder of the input, as returned by `toString()`.
    public var debugDescription: String {
        return toString()
    }
}